Adding upstream version 4.19.249.upstream/4.19.249

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-06 01:02:30 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-06 01:02:30 +0000
commit: 76cb841cb886eef6b3bee341a2266c76578724ad (patch)
tree: f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /drivers/xen
parent: Initial commit. (diff)
download: linux-76cb841cb886eef6b3bee341a2266c76578724ad.tar.xz
linux-76cb841cb886eef6b3bee341a2266c76578724ad.zip
77 files changed, 33192 insertions, 0 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
new file mode 100644
index 000000000..0505eeb59
--- /dev/null
+++ b/drivers/xen/Kconfig
@@ -0,0 +1,353 @@
+menu "Xen driver support"
+	depends on XEN
+
+config XEN_BALLOON
+	bool "Xen memory balloon driver"
+	default y
+	help
+	  The balloon driver allows the Xen domain to request more memory from
+	  the system to expand the domain's memory allocation, or alternatively
+	  return unneeded memory to the system.
+
+config XEN_SELFBALLOONING
+	bool "Dynamically self-balloon kernel memory to target"
+	depends on XEN && XEN_BALLOON && CLEANCACHE && SWAP && XEN_TMEM
+	default n
+	help
+	  Self-ballooning dynamically balloons available kernel memory driven
+	  by the current usage of anonymous memory ("committed AS") and
+	  controlled by various sysfs-settable parameters.  Configuring
+	  FRONTSWAP is highly recommended; if it is not configured, self-
+	  ballooning is disabled by default. If FRONTSWAP is configured,
+	  frontswap-selfshrinking is enabled by default but can be disabled
+	  with the 'tmem.selfshrink=0' kernel boot parameter; and self-ballooning
+	  is enabled by default but can be disabled with the 'tmem.selfballooning=0'
+	  kernel boot parameter.  Note that systems without a sufficiently
+	  large swap device should not enable self-ballooning.
+
+config XEN_BALLOON_MEMORY_HOTPLUG
+	bool "Memory hotplug support for Xen balloon driver"
+	default n
+	depends on XEN_BALLOON && MEMORY_HOTPLUG
+	help
+	  Memory hotplug support for Xen balloon driver allows expanding memory
+	  available for the system above limit declared at system startup.
+	  It is very useful on critical systems which require long
+	  run without rebooting.
+
+	  Memory could be hotplugged in following steps:
+
+	    1) target domain: ensure that memory auto online policy is in
+	       effect by checking /sys/devices/system/memory/auto_online_blocks
+	       file (should be 'online').
+
+	    2) control domain: xl mem-max <target-domain> <maxmem>
+	       where <maxmem> is >= requested memory size,
+
+	    3) control domain: xl mem-set <target-domain> <memory>
+	       where <memory> is requested memory size; alternatively memory
+	       could be added by writing proper value to
+	       /sys/devices/system/xen_memory/xen_memory0/target or
+	       /sys/devices/system/xen_memory/xen_memory0/target_kb on the
+	       target domain.
+
+	  Alternatively, if memory auto onlining was not requested at step 1
+	  the newly added memory can be manually onlined in the target domain
+	  by doing the following:
+
+		for i in /sys/devices/system/memory/memory*/state; do \
+		  [ "`cat "$i"`" = offline ] && echo online > "$i"; done
+
+	  or by adding the following line to udev rules:
+
+	  SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'"
+
+config XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
+	int "Hotplugged memory limit (in GiB) for a PV guest"
+	default 512 if X86_64
+	default 4 if X86_32
+	range 0 64 if X86_32
+	depends on XEN_HAVE_PVMMU
+	depends on XEN_BALLOON_MEMORY_HOTPLUG
+	help
+	  Maximum amount of memory (in GiB) that a PV guest can be
+	  expanded to when using memory hotplug.
+
+	  A PV guest can have more memory than this limit if it is
+	  started with a larger maximum.
+
+	  This value is used to allocate enough space in internal
+	  tables needed for physical memory administration.
+
+config XEN_SCRUB_PAGES_DEFAULT
+	bool "Scrub pages before returning them to system by default"
+	depends on XEN_BALLOON
+	default y
+	help
+	  Scrub pages before returning them to the system for reuse by
+	  other domains.  This makes sure that any confidential data
+	  is not accidentally visible to other domains.  It is more
+	  secure, but slightly less efficient. This can be controlled with
+	  xen_scrub_pages=0 parameter and
+	  /sys/devices/system/xen_memory/xen_memory0/scrub_pages.
+	  This option only sets the default value.
+
+	  If in doubt, say yes.
+
+config XEN_DEV_EVTCHN
+	tristate "Xen /dev/xen/evtchn device"
+	default y
+	help
+	  The evtchn driver allows a userspace process to trigger event
+	  channels and to receive notification of an event channel
+	  firing.
+	  If in doubt, say yes.
+
+config XEN_BACKEND
+	bool "Backend driver support"
+	depends on XEN_DOM0
+	default y
+	help
+	  Support for backend device drivers that provide I/O services
+	  to other virtual machines.
+
+config XENFS
+	tristate "Xen filesystem"
+	select XEN_PRIVCMD
+	default y
+	help
+	  The xen filesystem provides a way for domains to share
+	  information with each other and with the hypervisor.
+	  For example, by reading and writing the "xenbus" file, guests
+	  may pass arbitrary information to the initial domain.
+	  If in doubt, say yes.
+
+config XEN_COMPAT_XENFS
+       bool "Create compatibility mount point /proc/xen"
+       depends on XENFS
+       default y
+       help
+         The old xenstore userspace tools expect to find "xenbus"
+         under /proc/xen, but "xenbus" is now found at the root of the
+         xenfs filesystem.  Selecting this causes the kernel to create
+         the compatibility mount point /proc/xen if it is running on
+         a xen platform.
+         If in doubt, say yes.
+
+config XEN_SYS_HYPERVISOR
+       bool "Create xen entries under /sys/hypervisor"
+       depends on SYSFS
+       select SYS_HYPERVISOR
+       default y
+       help
+         Create entries under /sys/hypervisor describing the Xen
+	 hypervisor environment.  When running native or in another
+	 virtual environment, /sys/hypervisor will still be present,
+	 but will have no xen contents.
+
+config XEN_XENBUS_FRONTEND
+	tristate
+
+config XEN_GNTDEV
+	tristate "userspace grant access device driver"
+	depends on XEN
+	default m
+	select MMU_NOTIFIER
+	help
+	  Allows userspace processes to use grants.
+
+config XEN_GNTDEV_DMABUF
+	bool "Add support for dma-buf grant access device driver extension"
+	depends on XEN_GNTDEV && XEN_GRANT_DMA_ALLOC
+	select DMA_SHARED_BUFFER
+	help
+	  Allows userspace processes and kernel modules to use Xen backed
+	  dma-buf implementation. With this extension grant references to
+	  the pages of an imported dma-buf can be exported for other domain
+	  use and grant references coming from a foreign domain can be
+	  converted into a local dma-buf for local export.
+
+config XEN_GRANT_DEV_ALLOC
+	tristate "User-space grant reference allocator driver"
+	depends on XEN
+	default m
+	help
+	  Allows userspace processes to create pages with access granted
+	  to other domains. This can be used to implement frontend drivers
+	  or as part of an inter-domain shared memory channel.
+
+config XEN_GRANT_DMA_ALLOC
+	bool "Allow allocating DMA capable buffers with grant reference module"
+	depends on XEN && HAS_DMA
+	help
+	  Extends grant table module API to allow allocating DMA capable
+	  buffers and mapping foreign grant references on top of it.
+	  The resulting buffer is similar to one allocated by the balloon
+	  driver in that proper memory reservation is made by
+	  ({increase|decrease}_reservation and VA mappings are updated if
+	  needed).
+	  This is useful for sharing foreign buffers with HW drivers which
+	  cannot work with scattered buffers provided by the balloon driver,
+	  but require DMAable memory instead.
+
+config SWIOTLB_XEN
+	def_bool y
+	select SWIOTLB
+
+config XEN_TMEM
+	tristate
+	depends on !ARM && !ARM64
+	default m if (CLEANCACHE || FRONTSWAP)
+	help
+	  Shim to interface in-kernel Transcendent Memory hooks
+	  (e.g. cleancache and frontswap) to Xen tmem hypercalls.
+
+config XEN_PCIDEV_BACKEND
+	tristate "Xen PCI-device backend driver"
+	depends on PCI && X86 && XEN
+	depends on XEN_BACKEND
+	default m
+	help
+	  The PCI device backend driver allows the kernel to export arbitrary
+	  PCI devices to other guests. If you select this to be a module, you
+	  will need to make sure no other driver has bound to the device(s)
+	  you want to make visible to other guests.
+
+	  The parameter "passthrough" allows you to specify how you want the
+	  PCI devices to appear in the guest. You can choose the default (0)
+	  where PCI topology starts at 00.00.0, or (1) for passthrough if you
+	  want the PCI device topology to appear the same as in the host.
+
+	  The "hide" parameter (only applicable if backend driver is compiled
+	  into the kernel) allows you to bind the PCI devices to this module
+	  from the default device drivers. The argument is the list of PCI BDFs:
+	  xen-pciback.hide=(03:00.0)(04:00.0)
+
+	  If in doubt, say m.
+
+config XEN_PVCALLS_FRONTEND
+	tristate "XEN PV Calls frontend driver"
+	depends on INET && XEN
+	default n
+	select XEN_XENBUS_FRONTEND
+	help
+	  Experimental frontend for the Xen PV Calls protocol
+	  (https://xenbits.xen.org/docs/unstable/misc/pvcalls.html). It
+	  sends a small set of POSIX calls to the backend, which
+	  implements them.
+
+config XEN_PVCALLS_BACKEND
+	bool "XEN PV Calls backend driver"
+	depends on INET && XEN && XEN_BACKEND
+	default n
+	help
+	  Experimental backend for the Xen PV Calls protocol
+	  (https://xenbits.xen.org/docs/unstable/misc/pvcalls.html). It
+	  allows PV Calls frontends to send POSIX calls to the backend,
+	  which implements them.
+
+	  If in doubt, say n.
+
+config XEN_SCSI_BACKEND
+	tristate "XEN SCSI backend driver"
+	depends on XEN && XEN_BACKEND && TARGET_CORE
+	help
+	  The SCSI backend driver allows the kernel to export its SCSI Devices
+	  to other guests via a high-performance shared-memory interface.
+	  Only needed for systems running as XEN driver domains (e.g. Dom0) and
+	  if guests need generic access to SCSI devices.
+
+config XEN_PRIVCMD
+	tristate
+	depends on XEN
+	default m
+
+config XEN_STUB
+	bool "Xen stub drivers"
+	depends on XEN && X86_64 && BROKEN
+	default n
+	help
+	  Allow kernel to install stub drivers, to reserve space for Xen drivers,
+	  i.e. memory hotplug and cpu hotplug, and to block native drivers loaded,
+	  so that real Xen drivers can be modular.
+
+	  To enable Xen features like cpu and memory hotplug, select Y here.
+
+config XEN_ACPI_HOTPLUG_MEMORY
+	tristate "Xen ACPI memory hotplug"
+	depends on XEN_DOM0 && XEN_STUB && ACPI
+	default n
+	help
+	  This is Xen ACPI memory hotplug.
+
+	  Currently Xen only supports ACPI memory hot-add. If you want
+	  to hot-add memory at runtime (the hot-added memory cannot be
+	  removed until machine stop), select Y/M here, otherwise select N.
+
+config XEN_ACPI_HOTPLUG_CPU
+	tristate "Xen ACPI cpu hotplug"
+	depends on XEN_DOM0 && XEN_STUB && ACPI
+	select ACPI_CONTAINER
+	default n
+	help
+	  Xen ACPI cpu enumerating and hotplugging
+
+	  For hotplugging, currently Xen only supports ACPI cpu hotadd.
+	  If you want to hotadd cpu at runtime (the hotadded cpu cannot
+	  be removed until machine stop), select Y/M here.
+
+config XEN_ACPI_PROCESSOR
+	tristate "Xen ACPI processor"
+	depends on XEN && XEN_DOM0 && X86 && ACPI_PROCESSOR && CPU_FREQ
+	default m
+	help
+          This ACPI processor uploads Power Management information to the Xen
+	  hypervisor.
+
+	  To do that the driver parses the Power Management data and uploads
+	  said information to the Xen hypervisor. Then the Xen hypervisor can
+	  select the proper Cx and Pxx states. It also registers itself as the
+	  SMM so that other drivers (such as ACPI cpufreq scaling driver) will
+	  not load.
+
+          To compile this driver as a module, choose M here: the module will be
+	  called xen_acpi_processor.  If you do not know what to choose, select
+	  M here. If the CPUFREQ drivers are built in, select Y here.
+
+config XEN_MCE_LOG
+	bool "Xen platform mcelog"
+	depends on XEN_DOM0 && X86_64 && X86_MCE
+	default n
+	help
+	  Allow the kernel to fetch MCE errors from the Xen platform and
+	  convert them into the Linux mcelog format for mcelog tools.
+
+config XEN_HAVE_PVMMU
+       bool
+
+config XEN_EFI
+	def_bool y
+	depends on (ARM || ARM64 || X86_64) && EFI
+
+config XEN_AUTO_XLATE
+	def_bool y
+	depends on ARM || ARM64 || XEN_PVHVM
+	help
+	  Support for auto-translated physmap guests.
+
+config XEN_ACPI
+	def_bool y
+	depends on X86 && ACPI
+
+config XEN_SYMS
+       bool "Xen symbols"
+       depends on X86 && XEN_DOM0 && XENFS
+       default y if KALLSYMS
+       help
+          Exports hypervisor symbols (along with their types and addresses) via
+          /proc/xen/xensyms file, similar to /proc/kallsyms
+
+config XEN_HAVE_VPMU
+       bool
+
+endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
new file mode 100644
index 000000000..3e542f60f
--- /dev/null
+++ b/drivers/xen/Makefile
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_HOTPLUG_CPU)		+= cpu_hotplug.o
+obj-$(CONFIG_X86)			+= fallback.o
+obj-y	+= grant-table.o features.o balloon.o manage.o preempt.o time.o
+obj-y	+= mem-reservation.o
+obj-y	+= events/
+obj-y	+= xenbus/
+
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_features.o			:= $(nostackp)
+
+dom0-$(CONFIG_ARM64) += arm-device.o
+dom0-$(CONFIG_PCI) += pci.o
+dom0-$(CONFIG_USB_SUPPORT) += dbgp.o
+dom0-$(CONFIG_XEN_ACPI) += acpi.o $(xen-pad-y)
+xen-pad-$(CONFIG_X86) += xen-acpi-pad.o
+dom0-$(CONFIG_X86) += pcpu.o
+obj-$(CONFIG_XEN_DOM0)			+= $(dom0-y)
+obj-$(CONFIG_BLOCK)			+= biomerge.o
+obj-$(CONFIG_XEN_BALLOON)		+= xen-balloon.o
+obj-$(CONFIG_XEN_SELFBALLOONING)	+= xen-selfballoon.o
+obj-$(CONFIG_XEN_DEV_EVTCHN)		+= xen-evtchn.o
+obj-$(CONFIG_XEN_GNTDEV)		+= xen-gntdev.o
+obj-$(CONFIG_XEN_GRANT_DEV_ALLOC)	+= xen-gntalloc.o
+obj-$(CONFIG_XENFS)			+= xenfs/
+obj-$(CONFIG_XEN_SYS_HYPERVISOR)	+= sys-hypervisor.o
+obj-$(CONFIG_XEN_PVHVM)			+= platform-pci.o
+obj-$(CONFIG_XEN_TMEM)			+= tmem.o
+obj-$(CONFIG_SWIOTLB_XEN)		+= swiotlb-xen.o
+obj-$(CONFIG_XEN_MCE_LOG)		+= mcelog.o
+obj-$(CONFIG_XEN_PCIDEV_BACKEND)	+= xen-pciback/
+obj-$(CONFIG_XEN_PRIVCMD)		+= xen-privcmd.o
+obj-$(CONFIG_XEN_STUB)			+= xen-stub.o
+obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY)	+= xen-acpi-memhotplug.o
+obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU)	+= xen-acpi-cpuhotplug.o
+obj-$(CONFIG_XEN_ACPI_PROCESSOR)	+= xen-acpi-processor.o
+obj-$(CONFIG_XEN_EFI)			+= efi.o
+obj-$(CONFIG_XEN_SCSI_BACKEND)		+= xen-scsiback.o
+obj-$(CONFIG_XEN_AUTO_XLATE)		+= xlate_mmu.o
+obj-$(CONFIG_XEN_PVCALLS_BACKEND)	+= pvcalls-back.o
+obj-$(CONFIG_XEN_PVCALLS_FRONTEND)	+= pvcalls-front.o
+xen-evtchn-y				:= evtchn.o
+xen-gntdev-y				:= gntdev.o
+xen-gntdev-$(CONFIG_XEN_GNTDEV_DMABUF)	+= gntdev-dmabuf.o
+xen-gntalloc-y				:= gntalloc.o
+xen-privcmd-y				:= privcmd.o privcmd-buf.o
diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c
new file mode 100644
index 000000000..6893c79fd
--- /dev/null
+++ b/drivers/xen/acpi.c
@@ -0,0 +1,77 @@
+/******************************************************************************
+ * acpi.c
+ * acpi file for domain 0 kernel
+ *
+ * Copyright (c) 2011 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ * Copyright (c) 2011 Yu Ke ke.yu@intel.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <xen/acpi.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+static int xen_acpi_notify_hypervisor_state(u8 sleep_state,
+					    u32 val_a, u32 val_b,
+					    bool extended)
+{
+	unsigned int bits = extended ? 8 : 16; /* max significant bits per value */
+	struct xen_platform_op op = {
+		.cmd = XENPF_enter_acpi_sleep,
+		.interface_version = XENPF_INTERFACE_VERSION,
+		.u.enter_acpi_sleep = {
+			.val_a = (u16)val_a,
+			.val_b = (u16)val_b,
+			.sleep_state = sleep_state,
+			.flags = extended ? XENPF_ACPI_SLEEP_EXTENDED : 0,
+		},
+	};
+
+	/* Build the mask from ~0U: left-shifting the negative int ~0 is undefined. */
+	if (WARN((val_a & (~0U << bits)) || (val_b & (~0U << bits)),
+		 "Using more than %u bits of sleep control values %#x/%#x! "
+		 "Email xen-devel@lists.xen.org - Thank you.\n",
+		 bits, val_a, val_b))
+		return -1; /* values do not fit; no hypercall is issued */
+
+	HYPERVISOR_platform_op(&op);
+	return 1;
+}
+
+/* Notify Xen of an ACPI sleep request using the non-extended interface. */
+int xen_acpi_notify_hypervisor_sleep(u8 sleep_state, u32 pm1a_cnt, u32 pm1b_cnt)
+{
+	return xen_acpi_notify_hypervisor_state(sleep_state, pm1a_cnt,
+						pm1b_cnt, false);
+}
+
+int xen_acpi_notify_hypervisor_extended_sleep(u8 sleep_state,
+				     u32 val_a, u32 val_b)
+{
+	return xen_acpi_notify_hypervisor_state(sleep_state, val_a,
+						val_b, true); /* extended */
+}
diff --git a/drivers/xen/arm-device.c b/drivers/xen/arm-device.c
new file mode 100644
index 000000000..3e789c77f
--- /dev/null
+++ b/drivers/xen/arm-device.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2015, Linaro Limited, Shannon Zhao
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/platform_device.h>
+#include <linux/acpi.h>
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/interface/memory.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+/* Remove each Xen page frame of the given memory resources from the physmap. */
+static int xen_unmap_device_mmio(const struct resource *resources,
+				 unsigned int count)
+{
+	unsigned int idx, page, nr_pages;
+	int ret = 0;
+	const struct resource *res;
+	struct xen_remove_from_physmap op;
+
+	for (idx = 0; idx < count; idx++) {
+		res = &resources[idx];
+		nr_pages = DIV_ROUND_UP(resource_size(res), XEN_PAGE_SIZE);
+		if (resource_type(res) != IORESOURCE_MEM || !nr_pages)
+			continue; /* only non-empty memory resources */
+
+		for (page = 0; page < nr_pages; page++) {
+			op.domid = DOMID_SELF;
+			op.gpfn = XEN_PFN_DOWN(res->start) + page;
+			ret = HYPERVISOR_memory_op(XENMEM_remove_from_physmap,
+						   &op);
+			if (ret)
+				return ret; /* stop at the first failure */
+		}
+	}
+	return ret;
+}
+
+static int xen_map_device_mmio(const struct resource *resources,
+			       unsigned int count)
+{
+	unsigned int i, j, nr;
+	int rc = 0;
+	const struct resource *r;
+	xen_pfn_t *gpfns;
+	xen_ulong_t *idxs;
+	int *errs; /* NOTE(review): passed to Xen but never read back here */
+
+	for (i = 0; i < count; i++) {
+		struct xen_add_to_physmap_range xatp = {
+			.domid = DOMID_SELF,
+			.space = XENMAPSPACE_dev_mmio
+		};
+
+		r = &resources[i];
+		nr = DIV_ROUND_UP(resource_size(r), XEN_PAGE_SIZE);
+		if ((resource_type(r) != IORESOURCE_MEM) || (nr == 0))
+			continue; /* skip non-memory and empty resources */
+
+		gpfns = kcalloc(nr, sizeof(xen_pfn_t), GFP_KERNEL);
+		idxs = kcalloc(nr, sizeof(xen_ulong_t), GFP_KERNEL);
+		errs = kcalloc(nr, sizeof(int), GFP_KERNEL);
+		if (!gpfns || !idxs || !errs) {
+			kfree(gpfns);
+			kfree(idxs);
+			kfree(errs);
+			rc = -ENOMEM;
+			goto unmap; /* undo the resources mapped so far */
+		}
+
+		for (j = 0; j < nr; j++) {
+			/*
+			 * The regions are always mapped 1:1 to DOM0 and this is
+			 * fine because the memory map for DOM0 is the same as
+			 * the host (except for the RAM).
+			 */
+			gpfns[j] = XEN_PFN_DOWN(r->start) + j;
+			idxs[j] = XEN_PFN_DOWN(r->start) + j;
+		}
+
+		xatp.size = nr; /* one entry per XEN_PAGE_SIZE frame */
+
+		set_xen_guest_handle(xatp.gpfns, gpfns);
+		set_xen_guest_handle(xatp.idxs, idxs);
+		set_xen_guest_handle(xatp.errs, errs);
+
+		rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
+		kfree(gpfns); /* arrays freed on success and failure */
+		kfree(idxs);
+		kfree(errs);
+		if (rc)
+			goto unmap;
+	}
+
+	return rc;
+
+unmap:
+	xen_unmap_device_mmio(resources, i); /* resources [0, i) only */
+	return rc;
+}
+
+/* Map or unmap a platform device's MMIO resources on bus add/del events. */
+static int xen_platform_notifier(struct notifier_block *nb,
+				 unsigned long action, void *data)
+{
+	struct platform_device *pdev = to_platform_device(data);
+	int r = 0;
+
+	if (pdev->num_resources == 0 || pdev->resource == NULL)
+		return NOTIFY_OK; /* nothing to map or unmap */
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		r = xen_map_device_mmio(pdev->resource, pdev->num_resources);
+		break;
+	case BUS_NOTIFY_DEL_DEVICE:
+		r = xen_unmap_device_mmio(pdev->resource, pdev->num_resources);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+	/* Only ADD/DEL get here, so the verb is either "map" or "unmap". */
+	if (r)
+		dev_err(&pdev->dev, "Platform: Failed to %s device %s MMIO!\n",
+			action == BUS_NOTIFY_ADD_DEVICE ? "map" : "unmap",
+			pdev->name);
+	return NOTIFY_OK; /* a mapping failure is logged, not propagated */
+}
+
+static struct notifier_block platform_device_nb = {
+	.notifier_call = xen_platform_notifier, /* platform bus events */
+};
+
+static int __init register_xen_platform_notifier(void)
+{
+	/* Register only when xen_initial_domain() holds and ACPI is enabled. */
+	if (xen_initial_domain() && !acpi_disabled)
+		return bus_register_notifier(&platform_bus_type,
+					     &platform_device_nb);
+	return 0;
+}
+arch_initcall(register_xen_platform_notifier);
+
+#ifdef CONFIG_ARM_AMBA
+#include <linux/amba/bus.h>
+
+/* Map or unmap an AMBA device's MMIO region on bus add/del events. */
+static int xen_amba_notifier(struct notifier_block *nb,
+			     unsigned long action, void *data)
+{
+	struct amba_device *adev = to_amba_device(data);
+	int r = 0;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		r = xen_map_device_mmio(&adev->res, 1);
+		break;
+	case BUS_NOTIFY_DEL_DEVICE:
+		r = xen_unmap_device_mmio(&adev->res, 1);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+	/* device_add() clears dev->init_name, so report dev_name() instead. */
+	if (r)
+		dev_err(&adev->dev, "AMBA: Failed to %s device %s MMIO!\n",
+			action == BUS_NOTIFY_ADD_DEVICE ? "map" : "unmap",
+			dev_name(&adev->dev));
+	return NOTIFY_OK; /* a mapping failure is logged, not propagated */
+}
+
+static struct notifier_block amba_device_nb = {
+	.notifier_call = xen_amba_notifier, /* AMBA bus events */
+};
+
+static int __init register_xen_amba_notifier(void)
+{
+	/* Register only when xen_initial_domain() holds and ACPI is enabled. */
+	if (xen_initial_domain() && !acpi_disabled)
+		return bus_register_notifier(&amba_bustype,
+					     &amba_device_nb);
+	return 0;
+}
+arch_initcall(register_xen_amba_notifier);
+#endif
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
new file mode 100644
index 000000000..4f4a47ae7
--- /dev/null
+++ b/drivers/xen/balloon.c
@@ -0,0 +1,826 @@
+/******************************************************************************
+ * Xen balloon driver - enables returning/claiming memory to/from Xen.
+ *
+ * Copyright (c) 2003, B Dragovic
+ * Copyright (c) 2003-2004, M Williamson, K Fraser
+ * Copyright (c) 2005 Dan M. Smith, IBM Corporation
+ * Copyright (c) 2010 Daniel Kiper
+ *
+ * Memory hotplug support was written by Daniel Kiper. Work on
+ * it was sponsored by Google under Google Summer of Code 2010
+ * program. Jeremy Fitzhardinge from Citrix was the mentor for
+ * this project.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/cred.h>
+#include <linux/errno.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/gfp.h>
+#include <linux/notifier.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/percpu-defs.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/moduleparam.h>
+
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/memory.h>
+#include <xen/balloon.h>
+#include <xen/features.h>
+#include <xen/page.h>
+#include <xen/mem-reservation.h>
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "xen."
+
+static uint __read_mostly balloon_boot_timeout = 180;
+module_param(balloon_boot_timeout, uint, 0444);
+
+static int xen_hotplug_unpopulated;
+
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+
+static int zero; /* referenced as .extra1 of hotplug_unpopulated */
+static int one = 1; /* referenced as .extra2 of hotplug_unpopulated */
+
+static struct ctl_table balloon_table[] = {
+	{
+		.procname	= "hotplug_unpopulated",
+		.data		= &xen_hotplug_unpopulated,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1         = &zero, /* presumably the minimum (0) */
+		.extra2         = &one, /* presumably the maximum (1) */
+	},
+	{ } /* empty terminating entry */
+};
+
+static struct ctl_table balloon_root[] = {
+	{
+		.procname	= "balloon",
+		.mode		= 0555,
+		.child		= balloon_table, /* entries below "balloon" */
+	},
+	{ } /* empty terminating entry */
+};
+
+static struct ctl_table xen_root[] = {
+	{
+		.procname	= "xen",
+		.mode		= 0555,
+		.child		= balloon_root, /* "xen" contains "balloon" */
+	},
+	{ } /* empty terminating entry */
+};
+
+#endif
+
+/*
+ * Use one extent per PAGE_SIZE to avoid breaking the page down into
+ * multiple frames.
+ */
+#define EXTENT_ORDER (fls(XEN_PFN_PER_PAGE) - 1)
+
+/*
+ * balloon_thread() state:
+ *
+ * BP_DONE: done or nothing to do,
+ * BP_WAIT: wait to be rescheduled,
+ * BP_EAGAIN: error, go to sleep,
+ * BP_ECANCELED: error, balloon operation canceled.
+ */
+
+static enum bp_state {
+	BP_DONE,
+	BP_WAIT,
+	BP_EAGAIN,
+	BP_ECANCELED
+} balloon_state = BP_DONE;
+
+/* Main waiting point for xen-balloon thread. */
+static DECLARE_WAIT_QUEUE_HEAD(balloon_thread_wq);
+
+static DEFINE_MUTEX(balloon_mutex);
+
+struct balloon_stats balloon_stats;
+EXPORT_SYMBOL_GPL(balloon_stats);
+
+/* We increase/decrease in batches which fit in a page */
+static xen_pfn_t frame_list[PAGE_SIZE / sizeof(xen_pfn_t)];
+
+
+/* List of ballooned pages, threaded through the mem_map array. */
+static LIST_HEAD(ballooned_pages);
+static DECLARE_WAIT_QUEUE_HEAD(balloon_wq);
+
+/* When ballooning out (allocating memory to return to Xen) we don't really
+   want the kernel to try too hard since that can trigger the oom killer. */
+#define GFP_BALLOON \
+	(GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)
+
+/* __balloon_append: queue @page on ballooned_pages and wake balloon_wq. */
+static void __balloon_append(struct page *page)
+{
+	/* Lowmem goes to the head so it is re-populated before highmem. */
+	if (!PageHighMem(page)) {
+		list_add(&page->lru, &ballooned_pages);
+		balloon_stats.balloon_low++;
+	} else {
+		list_add_tail(&page->lru, &ballooned_pages);
+		balloon_stats.balloon_high++;
+	}
+	wake_up(&balloon_wq);
+}
+
+static void balloon_append(struct page *page)
+{
+	__balloon_append(page); /* plain forwarder; no extra work here */
+}
+
+/* balloon_retrieve: pop the head page of the balloon; NULL if none fits. */
+static struct page *balloon_retrieve(bool require_lowmem)
+{
+	struct page *head;
+	bool highmem;
+
+	if (list_empty(&ballooned_pages))
+		return NULL;
+
+	head = list_first_entry(&ballooned_pages, struct page, lru);
+	highmem = PageHighMem(head);
+	if (highmem && require_lowmem)
+		return NULL; /* head is highmem but lowmem was requested */
+	list_del(&head->lru);
+	if (highmem)
+		balloon_stats.balloon_high--;
+	else
+		balloon_stats.balloon_low--;
+	return head;
+}
+
+/* Return the page after @page on ballooned_pages, or NULL at list end. */
+static struct page *balloon_next_page(struct page *page)
+{
+	if (list_is_last(&page->lru, &ballooned_pages))
+		return NULL;
+	return list_next_entry(page, lru);
+}
+
+/* Update the retry counter and back-off delay according to balloon_state. */
+static void update_schedule(void)
+{
+	switch (balloon_state) {
+	case BP_WAIT:
+	case BP_ECANCELED:
+		return;
+	case BP_DONE:
+		balloon_stats.schedule_delay = 1;
+		balloon_stats.retry_count = 1;
+		return;
+	default:
+		break;
+	}
+	balloon_stats.retry_count++;
+	if (balloon_stats.max_retry_count != RETRY_UNLIMITED &&
+	    balloon_stats.retry_count > balloon_stats.max_retry_count) {
+		balloon_stats.schedule_delay = 1;
+		balloon_stats.retry_count = 1;
+		balloon_state = BP_ECANCELED;
+		return;
+	}
+
+	balloon_stats.schedule_delay <<= 1;
+	if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay)
+		balloon_stats.schedule_delay = balloon_stats.max_schedule_delay;
+	balloon_state = BP_EAGAIN;
+}
+
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+/* Unregister and free @resource; a NULL @resource is ignored. */
+static void release_memory_resource(struct resource *resource)
+{
+	/*
+	 * The region is not reset to identity mapped: it is now known
+	 * that no I/O can be in it.
+	 */
+	if (resource) {
+		release_resource(resource);
+		kfree(resource);
+	}
+}
+
+/* Allocate a busy, section-aligned "System RAM" resource of @size bytes. */
+static struct resource *additional_memory_resource(phys_addr_t size)
+{
+	struct resource *res;
+	int ret;
+
+	res = kzalloc(sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return NULL;
+
+	res->name = "System RAM";
+	res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+	ret = allocate_resource(&iomem_resource, res, size, 0, -1,
+				PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL);
+	if (ret < 0) {
+		pr_err("Cannot allocate new System RAM resource\n");
+		kfree(res);
+		return NULL;
+	}
+
+#ifdef CONFIG_SPARSEMEM
+	{
+		/* Last PFN sparsemem can address; the region's end must fit. */
+		unsigned long limit = (1UL << (MAX_PHYSMEM_BITS - PAGE_SHIFT)) - 1;
+		unsigned long pfn = res->end >> PAGE_SHIFT;
+
+		if (pfn > limit) {
+			pr_err("New System RAM resource outside addressable RAM (%lu > %lu)\n",
+			       pfn, limit);
+			release_memory_resource(res);
+			return NULL;
+		}
+	}
+#endif
+	return res; /* caller owns it; free with release_memory_resource() */
+}
+
+static enum bp_state reserve_additional_memory(void)
+{
+	long credit; /* pages still missing to reach the target */
+	struct resource *resource;
+	int nid, rc;
+	unsigned long balloon_hotplug; /* credit rounded up to whole sections */
+
+	credit = balloon_stats.target_pages + balloon_stats.target_unpopulated
+		- balloon_stats.total_pages;
+
+	/*
+	 * Already hotplugged enough pages?  Wait for them to be
+	 * onlined.
+	 */
+	if (credit <= 0)
+		return BP_WAIT;
+
+	balloon_hotplug = round_up(credit, PAGES_PER_SECTION);
+
+	resource = additional_memory_resource(balloon_hotplug * PAGE_SIZE);
+	if (!resource)
+		goto err; /* the err path accepts a NULL resource */
+
+	nid = memory_add_physaddr_to_nid(resource->start);
+
+#ifdef CONFIG_XEN_HAVE_PVMMU
+	/*
+	 * We don't support PV MMU when Linux and Xen is using
+	 * different page granularity.
+	 */
+	BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE);
+
+        /*
+         * add_memory() will build page tables for the new memory so
+         * the p2m must contain invalid entries so the correct
+         * non-present PTEs will be written.
+         *
+         * If a failure occurs, the original (identity) p2m entries
+         * are not restored since this region is now known not to
+         * conflict with any devices.
+         */ 
+	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+		unsigned long pfn, i;
+
+		pfn = PFN_DOWN(resource->start);
+		for (i = 0; i < balloon_hotplug; i++) {
+			if (!set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY)) {
+				pr_warn("set_phys_to_machine() failed, no memory added\n");
+				goto err;
+			}
+                }
+	}
+#endif
+
+	/*
+	 * add_memory_resource() will call online_pages() which in its turn
+	 * will call xen_online_page() callback causing deadlock if we don't
+	 * release balloon_mutex here. Unlocking here is safe because the
+	 * callers drop the mutex before trying again.
+	 */
+	mutex_unlock(&balloon_mutex);
+	/* add_memory_resource() requires the device_hotplug lock */
+	lock_device_hotplug();
+	rc = add_memory_resource(nid, resource, memhp_auto_online);
+	unlock_device_hotplug();
+	mutex_lock(&balloon_mutex); /* re-taken before touching balloon_stats */
+
+	if (rc) {
+		pr_warn("Cannot add additional memory (%i)\n", rc);
+		goto err;
+	}
+
+	balloon_stats.total_pages += balloon_hotplug; /* account the new pages */
+
+	return BP_WAIT;
+  err:
+	release_memory_resource(resource); /* unregisters and frees it */
+	return BP_ECANCELED;
+}
+
+static void xen_online_page(struct page *page)
+{
+	__online_page_set_limits(page);
+	/* Add the page to the balloon while holding balloon_mutex. */
+	mutex_lock(&balloon_mutex);
+
+	__balloon_append(page);
+
+	mutex_unlock(&balloon_mutex);
+}
+
+static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v)
+{
+	if (val == MEM_ONLINE)
+		wake_up(&balloon_thread_wq);	/* balloon_thread() waits here */
+	/* Every event, MEM_ONLINE or not, is answered with NOTIFY_OK. */
+	return NOTIFY_OK;
+}
+
+static struct notifier_block xen_memory_nb = {
+	.notifier_call = xen_memory_notifier,
+	.priority = 0
+};
+#else
+static enum bp_state reserve_additional_memory(void)
+{
+	balloon_stats.target_pages = balloon_stats.current_pages +
+				     balloon_stats.target_unpopulated;
+	return BP_ECANCELED;	/* built without memory hotplug support */
+}
+#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
+
+static long current_credit(void) /* > 0: below target, < 0: above target */
+{
+	return balloon_stats.target_pages - balloon_stats.current_pages;
+}
+
+static bool balloon_is_inflated(void) /* balloon_low or balloon_high != 0 */
+{
+	return balloon_stats.balloon_low || balloon_stats.balloon_high;
+}
+
+/* Repopulate up to @nr_pages ballooned pages and free them to the allocator. */
+static enum bp_state increase_reservation(unsigned long nr_pages)
+{
+	unsigned long n = 0;
+	struct page *pg;
+	int got;
+
+	/* frame_list bounds how many frames a single call can carry. */
+	if (nr_pages > ARRAY_SIZE(frame_list))
+		nr_pages = ARRAY_SIZE(frame_list);
+
+	/* Gather the frame numbers of the leading ballooned pages. */
+	pg = list_first_entry_or_null(&ballooned_pages, struct page, lru);
+	while (n < nr_pages && pg) {
+		frame_list[n++] = page_to_xen_pfn(pg);
+		pg = balloon_next_page(pg);
+	}
+	nr_pages = n;	/* the balloon may hold fewer pages than requested */
+
+	got = xenmem_reservation_increase(nr_pages, frame_list);
+	if (got <= 0)
+		return BP_EAGAIN;
+
+	/* Handle the 'got' frames reported by the call, one page each. */
+	for (n = 0; n < got; n++) {
+		pg = balloon_retrieve(false);
+		BUG_ON(pg == NULL);
+
+		xenmem_reservation_va_mapping_update(1, &pg, &frame_list[n]);
+
+		/* Relinquish the page back to the allocator. */
+		free_reserved_page(pg);
+	}
+
+	balloon_stats.current_pages += got;
+
+	return BP_DONE;
+}
+
+static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
+{
+	enum bp_state state = BP_DONE;
+	unsigned long i;
+	struct page *page, *tmp;
+	int ret;
+	LIST_HEAD(pages);
+
+	if (nr_pages > ARRAY_SIZE(frame_list))
+		nr_pages = ARRAY_SIZE(frame_list);
+	/* Allocate the pages to balloon out; stop early if allocation fails. */
+	for (i = 0; i < nr_pages; i++) {
+		page = alloc_page(gfp);
+		if (page == NULL) {
+			nr_pages = i;	/* continue with what was obtained */
+			state = BP_EAGAIN;
+			break;
+		}
+		adjust_managed_page_count(page, -1);
+		xenmem_reservation_scrub_page(page);
+		list_add(&page->lru, &pages);
+	}
+
+	/*
+	 * Ensure that ballooned highmem pages don't have kmaps.
+	 *
+	 * Do this before changing the p2m as kmap_flush_unused()
+	 * reads PTEs to obtain pages (and hence needs the original
+	 * p2m entry).
+	 */
+	kmap_flush_unused();
+
+	/*
+	 * Setup the frame, update direct mapping, invalidate P2M,
+	 * and add to balloon.
+	 */
+	i = 0;
+	list_for_each_entry_safe(page, tmp, &pages, lru) {
+		frame_list[i++] = xen_page_to_gfn(page);
+
+		xenmem_reservation_va_mapping_reset(1, &page);
+
+		list_del(&page->lru);
+
+		balloon_append(page);
+	}
+
+	flush_tlb_all();
+
+	ret = xenmem_reservation_decrease(nr_pages, frame_list);
+	BUG_ON(ret != nr_pages);	/* a partial decrease is not handled */
+
+	balloon_stats.current_pages -= nr_pages;
+
+	return state;
+}
+
+/*
+ * Wait condition of the balloon thread. While the state is BP_DONE it is
+ * true as soon as any credit exists; otherwise it is true once the credit
+ * differs from @credit. A pending kthread stop request also makes it true.
+ */
+static bool balloon_thread_cond(long credit)
+{
+	long baseline = (balloon_state == BP_DONE) ? 0 : credit;
+
+	return current_credit() != baseline || kthread_should_stop();
+}
+
+/*
+ * As this is a kthread it is guaranteed to run as a single instance only.
+ * We may of course race updates of the target counts (which are protected
+ * by the balloon lock), or with changes to the Xen hard limit, but we will
+ * recover from these in time.
+ */
+static int balloon_thread(void *unused)
+{
+	long credit;
+	unsigned long timeout;
+
+	set_freezable();
+	for (;;) {
+		switch (balloon_state) {
+		case BP_DONE:
+		case BP_ECANCELED:
+			timeout = 3600 * HZ;	/* 3600 seconds */
+			break;
+		case BP_EAGAIN:
+			timeout = balloon_stats.schedule_delay * HZ;
+			break;
+		case BP_WAIT:
+			timeout = HZ;	/* one second */
+			break;
+		}
+
+		credit = current_credit();
+
+		wait_event_freezable_timeout(balloon_thread_wq,
+			balloon_thread_cond(credit), timeout);
+
+		if (kthread_should_stop())
+			return 0;
+
+		mutex_lock(&balloon_mutex);
+
+		credit = current_credit();	/* re-read under balloon_mutex */
+
+		if (credit > 0) {
+			if (balloon_is_inflated())
+				balloon_state = increase_reservation(credit);
+			else
+				balloon_state = reserve_additional_memory();
+		}
+
+		if (credit < 0) {
+			long n_pages;
+
+			n_pages = min(-credit, si_mem_available());
+			balloon_state = decrease_reservation(n_pages,
+							     GFP_BALLOON);
+			if (balloon_state == BP_DONE && n_pages != -credit &&
+			    n_pages < totalreserve_pages)
+				balloon_state = BP_EAGAIN;
+		}
+
+		update_schedule();
+
+		mutex_unlock(&balloon_mutex);
+
+		cond_resched();
+	}
+}
+
+/* Store @target as the new page target and wake up the balloon thread. */
+void balloon_set_new_target(unsigned long target)
+{
+	/* No lock needed: a plain store, not a read-modify-write update. */
+	balloon_stats.target_pages = target;
+	wake_up(&balloon_thread_wq);
+}
+EXPORT_SYMBOL_GPL(balloon_set_new_target);
+
+/*
+ * Put more pages into the balloon: via memory hotplug when
+ * xen_hotplug_unpopulated is set and that is not cancelled, else by
+ * ballooning out @nr_pages. Runs with balloon_mutex held; 0 or -ENOMEM.
+ */
+static int add_ballooned_pages(int nr_pages)
+{
+	int rc;
+
+	if (xen_hotplug_unpopulated &&
+	    reserve_additional_memory() != BP_ECANCELED) {
+		/* Sleep unlocked until some page shows up in the balloon. */
+		mutex_unlock(&balloon_mutex);
+		rc = wait_event_interruptible(balloon_wq,
+			   !list_empty(&ballooned_pages));
+		mutex_lock(&balloon_mutex);
+		return rc ? -ENOMEM : 0;
+	}
+
+	if (si_mem_available() < nr_pages)
+		return -ENOMEM;
+	if (decrease_reservation(nr_pages, GFP_USER) != BP_DONE)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * alloc_xenballooned_pages - get pages that have been ballooned out
+ * @nr_pages: Number of pages to get
+ * @pages: array that receives the @nr_pages pages
+ * @return 0 on success, error otherwise (nothing is left allocated then)
+ */
+int alloc_xenballooned_pages(int nr_pages, struct page **pages)
+{
+	int pgno = 0;
+	struct page *page;
+	int ret;
+
+	mutex_lock(&balloon_mutex);
+
+	balloon_stats.target_unpopulated += nr_pages;
+
+	while (pgno < nr_pages) {
+		page = balloon_retrieve(true);
+		if (page) {
+			pages[pgno++] = page;	/* counted, so out_undo frees it */
+#ifdef CONFIG_XEN_HAVE_PVMMU
+			/*
+			 * We don't support PV MMU when Linux and Xen are using
+			 * different page granularity.
+			 */
+			BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE);
+
+			if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+				ret = xen_alloc_p2m_entry(page_to_pfn(page));
+				if (ret < 0)
+					goto out_undo;
+			}
+#endif
+		} else {
+			ret = add_ballooned_pages(nr_pages - pgno);
+			if (ret < 0)
+				goto out_undo;
+		}
+	}
+	mutex_unlock(&balloon_mutex);
+	return 0;
+ out_undo:
+	/*
+	 * NB: free_xenballooned_pages will only subtract pgno pages, but since
+	 * target_unpopulated was incremented by nr_pages at the start, remove
+	 * the rest here, while balloon_mutex still guards the counter.
+	 */
+	balloon_stats.target_unpopulated -= nr_pages - pgno;
+	mutex_unlock(&balloon_mutex);
+	free_xenballooned_pages(pgno, pages);
+	return ret;
+}
+EXPORT_SYMBOL(alloc_xenballooned_pages);
+
+/**
+ * free_xenballooned_pages - return pages got from alloc_xenballooned_pages
+ * @nr_pages: Number of pages
+ * @pages: pages to return
+ */
+void free_xenballooned_pages(int nr_pages, struct page **pages)
+{
+	int i;
+
+	mutex_lock(&balloon_mutex);
+
+	for (i = 0; i < nr_pages; i++) {
+		if (pages[i])	/* NULL slots are skipped */
+			balloon_append(pages[i]);
+	}
+
+	balloon_stats.target_unpopulated -= nr_pages;
+
+	/* The balloon may be too large now. Shrink it if needed. */
+	if (current_credit())
+		wake_up(&balloon_thread_wq);
+
+	mutex_unlock(&balloon_mutex);
+}
+EXPORT_SYMBOL(free_xenballooned_pages);
+
+#ifdef CONFIG_XEN_PV
+/* Put the extra-memory PFNs [start_pfn, start_pfn + pages) in the balloon. */
+static void __init balloon_add_region(unsigned long start_pfn,
+				      unsigned long pages)
+{
+	unsigned long pfn, extra_pfn_end;
+
+	/*
+	 * If the amount of usable memory has been limited (e.g., with
+	 * the 'mem' command line parameter), don't add pages beyond
+	 * this limit. A region lying wholly above the limit adds
+	 * nothing; bail out early so that the unsigned subtraction at
+	 * the end of this function cannot wrap around.
+	 */
+	extra_pfn_end = min(max_pfn, start_pfn + pages);
+	if (extra_pfn_end <= start_pfn)
+		return;
+
+	/* totalram_pages/totalhigh_pages exclude this boot-time extension. */
+	for (pfn = start_pfn; pfn < extra_pfn_end; pfn++)
+		__balloon_append(pfn_to_page(pfn));
+
+	balloon_stats.total_pages += extra_pfn_end - start_pfn;
+}
+#endif
+
+static int __init balloon_init(void)
+{
+	struct task_struct *task;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	pr_info("Initialising balloon driver\n");
+
+#ifdef CONFIG_XEN_PV
+	balloon_stats.current_pages = xen_pv_domain()
+		? min(xen_start_info->nr_pages - xen_released_pages, max_pfn)
+		: get_num_physpages();
+#else
+	balloon_stats.current_pages = get_num_physpages();
+#endif
+	balloon_stats.target_pages  = balloon_stats.current_pages;
+	balloon_stats.balloon_low   = 0;
+	balloon_stats.balloon_high  = 0;
+	balloon_stats.total_pages   = balloon_stats.current_pages;
+
+	balloon_stats.schedule_delay = 1;
+	balloon_stats.max_schedule_delay = 32;	/* cap for schedule_delay */
+	balloon_stats.retry_count = 1;
+	balloon_stats.max_retry_count = 4;
+
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+	set_online_page_callback(&xen_online_page);
+	register_memory_notifier(&xen_memory_nb);
+	register_sysctl_table(xen_root);
+#endif
+
+#ifdef CONFIG_XEN_PV
+	{
+		int i;
+
+		/*
+		 * Initialize the balloon with pages from the extra memory
+		 * regions (see arch/x86/xen/setup.c).
+		 */
+		for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++)
+			if (xen_extra_mem[i].n_pfns)
+				balloon_add_region(xen_extra_mem[i].start_pfn,
+						   xen_extra_mem[i].n_pfns);
+	}
+#endif
+
+	task = kthread_run(balloon_thread, NULL, "xen-balloon");
+	if (IS_ERR(task)) {
+		pr_err("xen-balloon thread could not be started, ballooning will not work!\n");
+		return PTR_ERR(task);	/* registrations above are not undone */
+	}
+
+	/* Init the xen-balloon driver. */
+	xen_balloon_init();
+
+	return 0;
+}
+subsys_initcall(balloon_init);
+
+static int __init balloon_wait_finish(void)
+{
+	long credit, last_credit = 0;
+	unsigned long last_changed = 0;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	/* PV guests don't need to wait. */
+	if (xen_pv_domain() || !current_credit())
+		return 0;
+
+	pr_notice("Waiting for initial ballooning down having finished.\n");
+
+	while ((credit = current_credit()) < 0) {
+		if (credit != last_credit) {
+			last_changed = jiffies;	/* progress: restart the timer */
+			last_credit = credit;
+		}
+		if (balloon_state == BP_ECANCELED) {
+			pr_warn_once("Initial ballooning failed, %ld pages need to be freed.\n",
+				     -credit);
+			if (jiffies - last_changed >= HZ * balloon_boot_timeout)
+				panic("Initial ballooning failed!\n");
+		}
+
+		schedule_timeout_interruptible(HZ / 10);	/* poll interval */
+	}
+
+	pr_notice("Initial ballooning down finished.\n");
+
+	return 0;
+}
+late_initcall_sync(balloon_wait_finish);
diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c
new file mode 100644
index 000000000..55ed80c3a
--- /dev/null
+++ b/drivers/xen/biomerge.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bio.h>
+#include <linux/io.h>
+#include <linux/export.h>
+#include <xen/page.h>
+
+/*
+ * True when @vec2's bus frame equals @vec1's advanced by PFN_DOWN(off + len).
+ */
+bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
+			       const struct bio_vec *vec2)
+{
+#if XEN_PAGE_SIZE == PAGE_SIZE
+	unsigned long next_bfn = pfn_to_bfn(page_to_pfn(vec1->bv_page)) +
+				 PFN_DOWN(vec1->bv_offset + vec1->bv_len);
+
+	return next_bfn == pfn_to_bfn(page_to_pfn(vec2->bv_page));
+#else
+	/* XXX: Add bio_vec merging for differing Xen and Linux page sizes. */
+	return false;
+#endif
+}
+EXPORT_SYMBOL(xen_biovec_phys_mergeable);
diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
new file mode 100644
index 000000000..f192b6f42
--- /dev/null
+++ b/drivers/xen/cpu_hotplug.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/notifier.h>
+
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/cpu.h>
+
+static void enable_hotplug_cpu(int cpu)
+{
+	if (!cpu_present(cpu))
+		xen_arch_register_cpu(cpu);
+	/* Marked present whether or not it had to be registered first. */
+	set_cpu_present(cpu, true);
+}
+
+static void disable_hotplug_cpu(int cpu)
+{
+	if (!cpu_is_hotpluggable(cpu))
+		return;
+	lock_device_hotplug();
+	if (cpu_online(cpu))
+		device_offline(get_cpu_device(cpu));	/* result not checked */
+	if (!cpu_online(cpu) && cpu_present(cpu)) {	/* re-check online state */
+		xen_arch_unregister_cpu(cpu);
+		set_cpu_present(cpu, false);
+	}
+	unlock_device_hotplug();
+}
+
+/* Read cpu/<cpu>/availability: 1 if "online", 0 if "offline", else an error. */
+static int vcpu_online(unsigned int cpu)
+{
+	int err;
+	char dir[16], state[16];
+
+	snprintf(dir, sizeof(dir), "cpu/%u", cpu);
+	err = xenbus_scanf(XBT_NIL, dir, "availability", "%15s", state);
+	if (err != 1) {
+		if (!xen_initial_domain())
+			pr_err("Unable to read cpu state\n");
+		return err;
+	}
+
+	if (strcmp(state, "online") == 0)
+		return 1;
+	if (strcmp(state, "offline") == 0)
+		return 0;
+	pr_err("unknown state(%s) on CPU%u\n", state, cpu);
+	return -EINVAL;
+}
+/*
+ * Apply the state vcpu_online() reports for @cpu: 1 enables it, 0 disables
+ * it, anything else is ignored, as are out-of-range or impossible CPUs.
+ */
+static void vcpu_hotplug(unsigned int cpu)
+{
+	int state;
+
+	if (cpu >= nr_cpu_ids || !cpu_possible(cpu))
+		return;
+	state = vcpu_online(cpu);
+	if (state == 1)
+		enable_hotplug_cpu(cpu);
+	else if (state == 0)
+		disable_hotplug_cpu(cpu);
+}
+
+/* Xenbus watch callback: hotplug the vCPU named by "cpu/<N>" within @path. */
+static void handle_vcpu_hotplug_event(struct xenbus_watch *watch,
+				      const char *path, const char *token)
+{
+	unsigned int cpu;
+	char *cpustr;
+
+	cpustr = strstr(path, "cpu/");
+	/* Without a parsed number 'cpu' would be used uninitialized. */
+	if (cpustr != NULL && sscanf(cpustr, "cpu/%u", &cpu) == 1)
+		vcpu_hotplug(cpu);
+}
+
+static int setup_cpu_watcher(struct notifier_block *notifier,
+			      unsigned long event, void *data)
+{
+	int cpu;
+	static struct xenbus_watch cpu_watch = {
+		.node = "cpu",
+		.callback = handle_vcpu_hotplug_event};
+
+	(void)register_xenbus_watch(&cpu_watch);	/* result ignored */
+	/* Take down every possible CPU whose availability reads "offline". */
+	for_each_possible_cpu(cpu) {
+		if (vcpu_online(cpu) == 0) {
+			(void)cpu_down(cpu);
+			set_cpu_present(cpu, false);
+		}
+	}
+
+	return NOTIFY_DONE;
+}
+
+static int __init setup_vcpu_hotplug_event(void)
+{
+	static struct notifier_block xsn_cpu = {
+		.notifier_call = setup_cpu_watcher };
+	/* x86: PV and PVH domains only; other architectures: any Xen domain. */
+#ifdef CONFIG_X86
+	if (!xen_pv_domain() && !xen_pvh_domain())
+#else
+	if (!xen_domain())
+#endif
+		return -ENODEV;
+
+	register_xenstore_notifier(&xsn_cpu);
+
+	return 0;
+}
+
+arch_initcall(setup_vcpu_hotplug_event);
+
diff --git a/drivers/xen/dbgp.c b/drivers/xen/dbgp.c
new file mode 100644
index 000000000..8145a59fd
--- /dev/null
+++ b/drivers/xen/dbgp.c
@@ -0,0 +1,50 @@
+#include <linux/pci.h>
+#include <linux/usb.h>
+#include <linux/usb/ehci_def.h>
+#include <linux/usb/hcd.h>
+#include <asm/xen/hypercall.h>
+#include <xen/interface/physdev.h>
+#include <xen/xen.h>
+
+static int xen_dbgp_op(struct usb_hcd *hcd, int op)
+{
+#ifdef CONFIG_PCI
+	const struct device *ctrlr = hcd_to_bus(hcd)->controller;
+#endif
+	struct physdev_dbgp_op dbgp;
+
+	if (!xen_initial_domain())
+		return 0;	/* only the initial domain issues the hypercall */
+
+	dbgp.op = op;
+	/* Identify the controller by PCI address when it is a PCI device. */
+#ifdef CONFIG_PCI
+	if (dev_is_pci(ctrlr)) {
+		const struct pci_dev *pdev = to_pci_dev(ctrlr);
+
+		dbgp.u.pci.seg = pci_domain_nr(pdev->bus);
+		dbgp.u.pci.bus = pdev->bus->number;
+		dbgp.u.pci.devfn = pdev->devfn;
+		dbgp.bus = PHYSDEVOP_DBGP_BUS_PCI;
+	} else	/* without CONFIG_PCI only the assignment below remains */
+#endif
+		dbgp.bus = PHYSDEVOP_DBGP_BUS_UNKNOWN;
+
+	return HYPERVISOR_physdev_op(PHYSDEVOP_dbgp_op, &dbgp);
+}
+
+int xen_dbgp_reset_prep(struct usb_hcd *hcd)	/* returns xen_dbgp_op() result */
+{
+	return xen_dbgp_op(hcd, PHYSDEVOP_DBGP_RESET_PREPARE);
+}
+
+int xen_dbgp_external_startup(struct usb_hcd *hcd)	/* same, RESET_DONE op */
+{
+	return xen_dbgp_op(hcd, PHYSDEVOP_DBGP_RESET_DONE);
+}
+
+#ifndef CONFIG_EARLY_PRINTK_DBGP	/* exports are skipped when it is set */
+#include <linux/export.h>
+EXPORT_SYMBOL_GPL(xen_dbgp_reset_prep);
+EXPORT_SYMBOL_GPL(xen_dbgp_external_startup);
+#endif
diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c
new file mode 100644
index 000000000..9243a9051
--- /dev/null
+++ b/drivers/xen/efi.c
@@ -0,0 +1,283 @@
+/*
+ * EFI support for Xen.
+ *
+ * Copyright (C) 1999 VA Linux Systems
+ * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
+ * Copyright (C) 1999-2002 Hewlett-Packard Co.
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ * Copyright (C) 2005-2008 Intel Co.
+ *	Fenghua Yu <fenghua.yu@intel.com>
+ *	Bibo Mao <bibo.mao@intel.com>
+ *	Chandramouli Narayanan <mouli@linux.intel.com>
+ *	Huang Ying <ying.huang@intel.com>
+ * Copyright (C) 2011 Novell Co.
+ *	Jan Beulich <JBeulich@suse.com>
+ * Copyright (C) 2011-2012 Oracle Co.
+ *	Liang Tang <liang.tang@oracle.com>
+ * Copyright (c) 2014 Oracle Co., Daniel Kiper
+ */
+
+#include <linux/bug.h>
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/string.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/platform.h>
+#include <xen/xen.h>
+#include <xen/xen-ops.h>
+
+#include <asm/page.h>
+
+#include <asm/xen/hypercall.h>
+
+#define INIT_EFI_OP(name) \
+	{.cmd = XENPF_efi_runtime_call, \
+	 .u.efi_runtime_call.function = XEN_EFI_##name, \
+	 .u.efi_runtime_call.misc = 0}
+/* efi_runtime_call member of a xen_platform_op; argument parenthesized. */
+#define efi_data(op)	((op).u.efi_runtime_call)
+
+efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
+{
+	struct xen_platform_op op = INIT_EFI_OP(get_time);
+
+	if (HYPERVISOR_platform_op(&op) < 0)
+		return EFI_UNSUPPORTED;
+	/* @tm and @tc are optional; each is filled in only when non-NULL. */
+	if (tm) {
+		BUILD_BUG_ON(sizeof(*tm) != sizeof(efi_data(op).u.get_time.time));
+		memcpy(tm, &efi_data(op).u.get_time.time, sizeof(*tm));
+	}
+
+	if (tc) {
+		tc->resolution = efi_data(op).u.get_time.resolution;
+		tc->accuracy = efi_data(op).u.get_time.accuracy;
+		tc->sets_to_zero = !!(efi_data(op).misc &
+				      XEN_EFI_GET_TIME_SET_CLEARS_NS);
+	}
+
+	return efi_data(op).status;
+}
+EXPORT_SYMBOL_GPL(xen_efi_get_time);
+
+efi_status_t xen_efi_set_time(efi_time_t *tm)
+{
+	struct xen_platform_op op = INIT_EFI_OP(set_time);
+
+	BUILD_BUG_ON(sizeof(*tm) != sizeof(efi_data(op).u.set_time));
+	memcpy(&efi_data(op).u.set_time, tm, sizeof(*tm));
+	/* @tm is copied unconditionally above, so it must not be NULL. */
+	if (HYPERVISOR_platform_op(&op) < 0)
+		return EFI_UNSUPPORTED;
+
+	return efi_data(op).status;
+}
+EXPORT_SYMBOL_GPL(xen_efi_set_time);
+
+efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending,
+				     efi_time_t *tm)
+{
+	struct xen_platform_op op = INIT_EFI_OP(get_wakeup_time);
+
+	if (HYPERVISOR_platform_op(&op) < 0)
+		return EFI_UNSUPPORTED;
+	/* All three output pointers are optional. */
+	if (tm) {
+		BUILD_BUG_ON(sizeof(*tm) != sizeof(efi_data(op).u.get_wakeup_time));
+		memcpy(tm, &efi_data(op).u.get_wakeup_time, sizeof(*tm));
+	}
+
+	if (enabled)
+		*enabled = !!(efi_data(op).misc & XEN_EFI_GET_WAKEUP_TIME_ENABLED);
+
+	if (pending)
+		*pending = !!(efi_data(op).misc & XEN_EFI_GET_WAKEUP_TIME_PENDING);
+
+	return efi_data(op).status;
+}
+EXPORT_SYMBOL_GPL(xen_efi_get_wakeup_time);
+
+efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
+{
+	struct xen_platform_op op = INIT_EFI_OP(set_wakeup_time);
+
+	BUILD_BUG_ON(sizeof(*tm) != sizeof(efi_data(op).u.set_wakeup_time));
+	if (enabled)
+		efi_data(op).misc = XEN_EFI_SET_WAKEUP_TIME_ENABLE;
+	if (tm)
+		memcpy(&efi_data(op).u.set_wakeup_time, tm, sizeof(*tm));
+	else
+		efi_data(op).misc |= XEN_EFI_SET_WAKEUP_TIME_ENABLE_ONLY;
+	/* misc holds ENABLE if @enabled, plus ENABLE_ONLY when @tm is NULL. */
+	if (HYPERVISOR_platform_op(&op) < 0)
+		return EFI_UNSUPPORTED;
+
+	return efi_data(op).status;
+}
+EXPORT_SYMBOL_GPL(xen_efi_set_wakeup_time);
+
+efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_guid_t *vendor,
+				  u32 *attr, unsigned long *data_size,
+				  void *data)
+{
+	struct xen_platform_op op = INIT_EFI_OP(get_variable);
+
+	set_xen_guest_handle(efi_data(op).u.get_variable.name, name);
+	BUILD_BUG_ON(sizeof(*vendor) !=
+		     sizeof(efi_data(op).u.get_variable.vendor_guid));
+	memcpy(&efi_data(op).u.get_variable.vendor_guid, vendor, sizeof(*vendor));
+	efi_data(op).u.get_variable.size = *data_size;
+	set_xen_guest_handle(efi_data(op).u.get_variable.data, data);
+
+	if (HYPERVISOR_platform_op(&op) < 0)
+		return EFI_UNSUPPORTED;
+	/* Hand back the size reported by the call; @attr is optional. */
+	*data_size = efi_data(op).u.get_variable.size;
+	if (attr)
+		*attr = efi_data(op).misc;
+
+	return efi_data(op).status;
+}
+EXPORT_SYMBOL_GPL(xen_efi_get_variable);
+
+efi_status_t xen_efi_get_next_variable(unsigned long *name_size,
+				       efi_char16_t *name,
+				       efi_guid_t *vendor)
+{
+	struct xen_platform_op op = INIT_EFI_OP(get_next_variable_name);
+
+	efi_data(op).u.get_next_variable_name.size = *name_size;
+	set_xen_guest_handle(efi_data(op).u.get_next_variable_name.name, name);
+	BUILD_BUG_ON(sizeof(*vendor) !=
+		     sizeof(efi_data(op).u.get_next_variable_name.vendor_guid));
+	memcpy(&efi_data(op).u.get_next_variable_name.vendor_guid, vendor,
+	       sizeof(*vendor));
+
+	if (HYPERVISOR_platform_op(&op) < 0)
+		return EFI_UNSUPPORTED;
+	/* Return the reported name size and vendor GUID to the caller. */
+	*name_size = efi_data(op).u.get_next_variable_name.size;
+	memcpy(vendor, &efi_data(op).u.get_next_variable_name.vendor_guid,
+	       sizeof(*vendor));
+
+	return efi_data(op).status;
+}
+EXPORT_SYMBOL_GPL(xen_efi_get_next_variable);
+
+efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor,
+				 u32 attr, unsigned long data_size,
+				 void *data)
+{
+	struct xen_platform_op op = INIT_EFI_OP(set_variable);
+
+	set_xen_guest_handle(efi_data(op).u.set_variable.name, name);
+	efi_data(op).misc = attr;
+	BUILD_BUG_ON(sizeof(*vendor) !=
+		     sizeof(efi_data(op).u.set_variable.vendor_guid));
+	memcpy(&efi_data(op).u.set_variable.vendor_guid, vendor, sizeof(*vendor));
+	efi_data(op).u.set_variable.size = data_size;
+	set_xen_guest_handle(efi_data(op).u.set_variable.data, data);
+	/* @attr travels in the 'misc' field of the runtime call. */
+	if (HYPERVISOR_platform_op(&op) < 0)
+		return EFI_UNSUPPORTED;
+
+	return efi_data(op).status;
+}
+EXPORT_SYMBOL_GPL(xen_efi_set_variable);
+
+efi_status_t xen_efi_query_variable_info(u32 attr, u64 *storage_space,
+					 u64 *remaining_space,
+					 u64 *max_variable_size)
+{
+	struct xen_platform_op op = INIT_EFI_OP(query_variable_info);
+
+	if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+		return EFI_UNSUPPORTED;
+
+	efi_data(op).u.query_variable_info.attr = attr;
+
+	if (HYPERVISOR_platform_op(&op) < 0)
+		return EFI_UNSUPPORTED;
+	/* All three output pointers are written unconditionally. */
+	*storage_space = efi_data(op).u.query_variable_info.max_store_size;
+	*remaining_space = efi_data(op).u.query_variable_info.remain_store_size;
+	*max_variable_size = efi_data(op).u.query_variable_info.max_size;
+
+	return efi_data(op).status;
+}
+EXPORT_SYMBOL_GPL(xen_efi_query_variable_info);
+
+/* Fetch the next high monotonic count through Xen and store it in @count. */
+efi_status_t xen_efi_get_next_high_mono_count(u32 *count)
+{
+	struct xen_platform_op op = INIT_EFI_OP(get_next_high_monotonic_count);
+	int rc = HYPERVISOR_platform_op(&op);
+
+	if (rc < 0)
+		return EFI_UNSUPPORTED;
+	*count = efi_data(op).misc;
+	return efi_data(op).status;
+}
+EXPORT_SYMBOL_GPL(xen_efi_get_next_high_mono_count);
+
+efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules,
+				    unsigned long count, unsigned long sg_list)
+{
+	struct xen_platform_op op = INIT_EFI_OP(update_capsule);
+
+	if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+		return EFI_UNSUPPORTED;
+	/* Marshal the capsule array, its length and the SG list for the call. */
+	set_xen_guest_handle(efi_data(op).u.update_capsule.capsule_header_array,
+			     capsules);
+	efi_data(op).u.update_capsule.capsule_count = count;
+	efi_data(op).u.update_capsule.sg_list = sg_list;
+
+	if (HYPERVISOR_platform_op(&op) < 0)
+		return EFI_UNSUPPORTED;
+
+	return efi_data(op).status;
+}
+EXPORT_SYMBOL_GPL(xen_efi_update_capsule);
+
+efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules,
+					unsigned long count, u64 *max_size,
+					int *reset_type)
+{
+	struct xen_platform_op op = INIT_EFI_OP(query_capsule_capabilities);
+
+	if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+		return EFI_UNSUPPORTED;
+	/* Marshal the capsule array and its length for the call. */
+	set_xen_guest_handle(efi_data(op).u.query_capsule_capabilities.capsule_header_array,
+					capsules);
+	efi_data(op).u.query_capsule_capabilities.capsule_count = count;
+
+	if (HYPERVISOR_platform_op(&op) < 0)
+		return EFI_UNSUPPORTED;
+	/* Both output pointers are written unconditionally. */
+	*max_size = efi_data(op).u.query_capsule_capabilities.max_capsule_size;
+	*reset_type = efi_data(op).u.query_capsule_capabilities.reset_type;
+
+	return efi_data(op).status;
+}
+EXPORT_SYMBOL_GPL(xen_efi_query_capsule_caps);
+
+/*
+ * Reset handler: cold and warm resets become xen_reboot(SHUTDOWN_reboot),
+ * a shutdown becomes xen_reboot(SHUTDOWN_poweroff), and any other
+ * @reset_type hits BUG(). @status, @data_size and @data are not used.
+ */
+void xen_efi_reset_system(int reset_type, efi_status_t status,
+			  unsigned long data_size, efi_char16_t *data)
+{
+	if (reset_type == EFI_RESET_COLD || reset_type == EFI_RESET_WARM)
+		xen_reboot(SHUTDOWN_reboot);
+	else if (reset_type == EFI_RESET_SHUTDOWN)
+		xen_reboot(SHUTDOWN_poweroff);
+	else
+		BUG();
+}
+EXPORT_SYMBOL_GPL(xen_efi_reset_system);
diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile
new file mode 100644
index 000000000..62be55cd9
--- /dev/null
+++ b/drivers/xen/events/Makefile
@@ -0,0 +1,5 @@
+obj-y += events.o
+
+events-y += events_base.o
+events-y += events_2l.o
+events-y += events_fifo.o
diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c
new file mode 100644
index 000000000..77cc80bcb
--- /dev/null
+++ b/drivers/xen/events/events_2l.c
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Xen event channels (2-level ABI)
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/linkage.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+
+#include <asm/sync_bitops.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
+#include <xen/xen-ops.h>
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+
+#include "events_internal.h"
+
+/*
+ * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be
+ * careful to only use bitops which allow for this (e.g.
+ * test_bit/find_first_bit and friends but not __ffs) and to pass
+ * BITS_PER_EVTCHN_WORD as the bitmask length.
+ */
+#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8)
+/*
+ * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t
+ * array. Primarily to avoid long lines (hence the terse name).
+ * The expansion is fully parenthesized so that it remains a single
+ * operand wherever it is used (e.g. BM(x)[i]).
+ */
+#define BM(x) ((unsigned long *)(x))
+/* Find the first set bit in an evtchn mask */
+#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD)
+
+/* Number of xen_ulong_t words holding one bit per 2-level event channel. */
+#define EVTCHN_MASK_SIZE (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)
+
+/* Per-CPU bitmap with one bit per event channel bound to that CPU. */
+static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_MASK_SIZE], cpu_evtchn_mask);
+
+/* Report the fixed channel limit of the 2-level ABI. */
+static unsigned evtchn_2l_max_channels(void)
+{
+	return EVTCHN_2L_NR_CHANNELS;
+}
+
+/* Drop @evtchn from the per-CPU event channel bitmap of @cpu. */
+static void evtchn_2l_remove(evtchn_port_t evtchn, unsigned int cpu)
+{
+	xen_ulong_t *cpu_mask = per_cpu(cpu_evtchn_mask, cpu);
+
+	clear_bit(evtchn, BM(cpu_mask));
+}
+
+/*
+ * Move info->evtchn from the bitmap of the CPU recorded in info->cpu to
+ * the bitmap of @cpu.  info->cpu itself is not updated here.
+ */
+static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu)
+{
+	unsigned long *prev_mask = BM(per_cpu(cpu_evtchn_mask, info->cpu));
+	unsigned long *next_mask = BM(per_cpu(cpu_evtchn_mask, cpu));
+
+	clear_bit(info->evtchn, prev_mask);
+	set_bit(info->evtchn, next_mask);
+}
+
+/* Clear the bit for @port in the shared-info pending bitmap. */
+static void evtchn_2l_clear_pending(unsigned port)
+{
+	sync_clear_bit(port, BM(HYPERVISOR_shared_info->evtchn_pending));
+}
+
+/* Set the bit for @port in the shared-info pending bitmap. */
+static void evtchn_2l_set_pending(unsigned port)
+{
+	sync_set_bit(port, BM(HYPERVISOR_shared_info->evtchn_pending));
+}
+
+/* Test the bit for @port in the shared-info pending bitmap. */
+static bool evtchn_2l_is_pending(unsigned port)
+{
+	return sync_test_bit(port, BM(HYPERVISOR_shared_info->evtchn_pending));
+}
+
+/* Set the bit for @port in the shared-info mask bitmap. */
+static void evtchn_2l_mask(unsigned port)
+{
+	sync_set_bit(port, BM(HYPERVISOR_shared_info->evtchn_mask));
+}
+
+/*
+ * Unmask @port.  Must be called with interrupts disabled (enforced by the
+ * BUG_ON below).
+ *
+ * When the port is bound to the current CPU the mask bit is cleared
+ * directly; a pending event is then re-raised by hand through the
+ * selector word and evtchn_upcall_pending.  For a port bound to another
+ * CPU, or for an HVM domain with the event already pending, the unmask is
+ * handed to Xen through EVTCHNOP_unmask instead.
+ */
+static void evtchn_2l_unmask(unsigned port)
+{
+	struct shared_info *s = HYPERVISOR_shared_info;
+	unsigned int cpu = get_cpu();
+	int do_hypercall = 0, evtchn_pending = 0;
+
+	BUG_ON(!irqs_disabled());
+
+	smp_wmb();	/* All writes before unmask must be visible. */
+
+	if (unlikely((cpu != cpu_from_evtchn(port))))
+		do_hypercall = 1;
+	else {
+		/*
+		 * Need to clear the mask before checking pending to
+		 * avoid a race with an event becoming pending.
+		 *
+		 * EVTCHNOP_unmask will only trigger an upcall if the
+		 * mask bit was set, so if a hypercall is needed
+		 * remask the event.
+		 */
+		sync_clear_bit(port, BM(&s->evtchn_mask[0]));
+		evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0]));
+
+		if (unlikely(evtchn_pending && xen_hvm_domain())) {
+			sync_set_bit(port, BM(&s->evtchn_mask[0]));
+			do_hypercall = 1;
+		}
+	}
+
+	/* Slow path (hypercall) if this is a non-local port or if this is
+	 * an hvm domain and an event is pending (hvm domains don't have
+	 * their own implementation of irq_enable). */
+	if (do_hypercall) {
+		struct evtchn_unmask unmask = { .port = port };
+		/* The hypercall result is deliberately ignored. */
+		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+	} else {
+		struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+
+		/*
+		 * The following is basically the equivalent of
+		 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
+		 * the interrupt edge' if the channel is masked.
+		 */
+		if (evtchn_pending &&
+		    !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD,
+					   BM(&vcpu_info->evtchn_pending_sel)))
+			vcpu_info->evtchn_upcall_pending = 1;
+	}
+
+	/* Pairs with the get_cpu() at the top of the function. */
+	put_cpu();
+}
+
+/*
+ * Per-CPU scan cursor (word index and bit index within that word) from
+ * which the next evtchn_2l_handle_events() call on that CPU starts.
+ */
+static DEFINE_PER_CPU(unsigned int, current_word_idx);
+static DEFINE_PER_CPU(unsigned int, current_bit_idx);
+
+/*
+ * Mask out the i least significant bits of w.  Both arguments are
+ * parenthesized so that compound expressions can be passed safely.
+ */
+#define MASK_LSBS(w, i) ((w) & ((~((xen_ulong_t)0UL)) << (i)))
+
+/*
+ * Return the bits of word @idx that are pending, bound to @cpu and not
+ * globally masked.
+ */
+static inline xen_ulong_t active_evtchns(unsigned int cpu,
+					 struct shared_info *sh,
+					 unsigned int idx)
+{
+	xen_ulong_t pending = sh->evtchn_pending[idx];
+	xen_ulong_t bound = per_cpu(cpu_evtchn_mask, cpu)[idx];
+	xen_ulong_t masked = sh->evtchn_mask[idx];
+
+	return pending & bound & ~masked;
+}
+
+/*
+ * Search the CPU's pending events bitmasks.  For each one found, map
+ * the event number to an irq, and feed it into do_IRQ() for handling.
+ *
+ * Xen uses a two-level bitmap to speed searching.  The first level is
+ * a bitset of words which contain pending event bits.  The second
+ * level is a bitset of pending events themselves.
+ *
+ * @cpu:  CPU whose bound event channels are scanned.
+ * @ctrl: loop control state, passed through to handle_irq_for_port().
+ *
+ * All word/bit arithmetic uses BITS_PER_EVTCHN_WORD and xen_ulong_t so
+ * that it stays correct when xen_ulong_t is wider than unsigned long.
+ */
+static void evtchn_2l_handle_events(unsigned cpu, struct evtchn_loop_ctrl *ctrl)
+{
+	int irq;
+	xen_ulong_t pending_words;
+	xen_ulong_t pending_bits;
+	int start_word_idx, start_bit_idx;
+	int word_idx, bit_idx;
+	int i;
+	struct shared_info *s = HYPERVISOR_shared_info;
+	struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+
+	/* Timer interrupt has highest priority. */
+	irq = irq_from_virq(cpu, VIRQ_TIMER);
+	if (irq != -1) {
+		unsigned int evtchn = evtchn_from_irq(irq);
+		/*
+		 * active_evtchns() indexes xen_ulong_t words, so the port
+		 * must be split using the event word size, not the size
+		 * of unsigned long.
+		 */
+		word_idx = evtchn / BITS_PER_EVTCHN_WORD;
+		bit_idx = evtchn % BITS_PER_EVTCHN_WORD;
+		if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx))
+			generic_handle_irq(irq);
+	}
+
+	/*
+	 * Master flag must be cleared /before/ clearing
+	 * selector flag. xchg_xen_ulong must contain an
+	 * appropriate barrier.
+	 */
+	pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0);
+
+	start_word_idx = __this_cpu_read(current_word_idx);
+	start_bit_idx = __this_cpu_read(current_bit_idx);
+
+	word_idx = start_word_idx;
+
+	for (i = 0; pending_words != 0; i++) {
+		xen_ulong_t words;
+
+		words = MASK_LSBS(pending_words, word_idx);
+
+		/*
+		 * If we masked out all events, wrap to beginning.
+		 */
+		if (words == 0) {
+			word_idx = 0;
+			bit_idx = 0;
+			continue;
+		}
+		word_idx = EVTCHN_FIRST_BIT(words);
+
+		pending_bits = active_evtchns(cpu, s, word_idx);
+		bit_idx = 0; /* usually scan entire word from start */
+		/*
+		 * We scan the starting word in two parts.
+		 *
+		 * 1st time: start in the middle, scanning the
+		 * upper bits.
+		 *
+		 * 2nd time: scan the whole word (not just the
+		 * parts skipped in the first pass) -- if an
+		 * event in the previously scanned bits is
+		 * pending again it would just be scanned on
+		 * the next loop anyway.
+		 */
+		if (word_idx == start_word_idx) {
+			if (i == 0)
+				bit_idx = start_bit_idx;
+		}
+
+		do {
+			xen_ulong_t bits;
+			int port;
+
+			bits = MASK_LSBS(pending_bits, bit_idx);
+
+			/* If we masked out all events, move on. */
+			if (bits == 0)
+				break;
+
+			bit_idx = EVTCHN_FIRST_BIT(bits);
+
+			/* Process port. */
+			port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx;
+			handle_irq_for_port(port, ctrl);
+
+			bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD;
+
+			/* Next caller starts at last processed + 1 */
+			__this_cpu_write(current_word_idx,
+					 bit_idx ? word_idx :
+					 (word_idx+1) % BITS_PER_EVTCHN_WORD);
+			__this_cpu_write(current_bit_idx, bit_idx);
+		} while (bit_idx != 0);
+
+		/*
+		 * Scan start_l1i twice; all others once.  The shifted
+		 * constant has the width of the selector word so the shift
+		 * stays in range for every valid word_idx.
+		 */
+		if ((word_idx != start_word_idx) || (i != 0))
+			pending_words &= ~((xen_ulong_t)1 << word_idx);
+
+		word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD;
+	}
+}
+
+/*
+ * Debug interrupt handler: dump the 2-level event channel state with
+ * printk() -- per-vCPU upcall flags, the global pending and mask bitmaps,
+ * this CPU's binding bitmap, and a list of every pending port with its
+ * irq and masking state.  The dump is serialized by a function-local
+ * spinlock.  @irq and @dev_id are not used.  Always returns IRQ_HANDLED.
+ */
+irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
+{
+	struct shared_info *sh = HYPERVISOR_shared_info;
+	int cpu = smp_processor_id();
+	xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu);
+	int i;
+	unsigned long flags;
+	static DEFINE_SPINLOCK(debug_lock);
+	struct vcpu_info *v;
+
+	spin_lock_irqsave(&debug_lock, flags);
+
+	printk("\nvcpu %d\n  ", cpu);
+
+	for_each_online_cpu(i) {
+		/*
+		 * Despite its name this holds the value printed as
+		 * "masked=": for the current CPU (when irq regs are
+		 * available) the interrupted context's irq-disabled state,
+		 * otherwise the vCPU's evtchn_upcall_mask.
+		 */
+		int pending;
+		v = per_cpu(xen_vcpu, i);
+		pending = (get_irq_regs() && i == cpu)
+			? xen_irqs_disabled(get_irq_regs())
+			: v->evtchn_upcall_mask;
+		printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n  ", i,
+		       pending, v->evtchn_upcall_pending,
+		       (int)(sizeof(v->evtchn_pending_sel)*2),
+		       v->evtchn_pending_sel);
+	}
+	/* From here on v refers to the current CPU's vcpu_info. */
+	v = per_cpu(xen_vcpu, cpu);
+
+	/* The bitmaps are printed highest word first, eight words per line. */
+	printk("\npending:\n   ");
+	for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
+		printk("%0*"PRI_xen_ulong"%s",
+		       (int)sizeof(sh->evtchn_pending[0])*2,
+		       sh->evtchn_pending[i],
+		       i % 8 == 0 ? "\n   " : " ");
+	printk("\nglobal mask:\n   ");
+	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+		printk("%0*"PRI_xen_ulong"%s",
+		       (int)(sizeof(sh->evtchn_mask[0])*2),
+		       sh->evtchn_mask[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nglobally unmasked:\n   ");
+	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+		printk("%0*"PRI_xen_ulong"%s",
+		       (int)(sizeof(sh->evtchn_mask[0])*2),
+		       sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nlocal cpu%d mask:\n   ", cpu);
+	for (i = (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--)
+		printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2),
+		       cpu_evtchn[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nlocally unmasked:\n   ");
+	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) {
+		xen_ulong_t pending = sh->evtchn_pending[i]
+			& ~sh->evtchn_mask[i]
+			& cpu_evtchn[i];
+		printk("%0*"PRI_xen_ulong"%s",
+		       (int)(sizeof(sh->evtchn_mask[0])*2),
+		       pending, i % 8 == 0 ? "\n   " : " ");
+	}
+
+	/* One line per pending port, annotated with why it may not fire. */
+	printk("\npending list:\n");
+	for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) {
+		if (sync_test_bit(i, BM(sh->evtchn_pending))) {
+			int word_idx = i / BITS_PER_EVTCHN_WORD;
+			printk("  %d: event %d -> irq %d%s%s%s\n",
+			       cpu_from_evtchn(i), i,
+			       get_evtchn_to_irq(i),
+			       sync_test_bit(word_idx, BM(&v->evtchn_pending_sel))
+			       ? "" : " l2-clear",
+			       !sync_test_bit(i, BM(sh->evtchn_mask))
+			       ? "" : " globally-masked",
+			       sync_test_bit(i, BM(cpu_evtchn))
+			       ? "" : " locally-masked");
+		}
+	}
+
+	spin_unlock_irqrestore(&debug_lock, flags);
+
+	return IRQ_HANDLED;
+}
+
+/* Resume hook: zero the per-CPU binding bitmap of every online CPU. */
+static void evtchn_2l_resume(void)
+{
+	const size_t mask_bytes = sizeof(xen_ulong_t) * EVTCHN_MASK_SIZE;
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		memset(per_cpu(cpu_evtchn_mask, cpu), 0, mask_bytes);
+}
+
+/* Zero the per-CPU binding bitmap of @cpu.  Always returns 0. */
+static int evtchn_2l_percpu_deinit(unsigned int cpu)
+{
+	xen_ulong_t *cpu_mask = per_cpu(cpu_evtchn_mask, cpu);
+
+	memset(cpu_mask, 0, sizeof(xen_ulong_t) * EVTCHN_MASK_SIZE);
+
+	return 0;
+}
+
+/*
+ * Operations table for the 2-level ABI.  Both .max_channels and
+ * .nr_channels use evtchn_2l_max_channels().
+ */
+static const struct evtchn_ops evtchn_ops_2l = {
+	.max_channels      = evtchn_2l_max_channels,
+	.nr_channels       = evtchn_2l_max_channels,
+	.remove            = evtchn_2l_remove,
+	.bind_to_cpu       = evtchn_2l_bind_to_cpu,
+	.clear_pending     = evtchn_2l_clear_pending,
+	.set_pending       = evtchn_2l_set_pending,
+	.is_pending        = evtchn_2l_is_pending,
+	.mask              = evtchn_2l_mask,
+	.unmask            = evtchn_2l_unmask,
+	.handle_events     = evtchn_2l_handle_events,
+	.resume	           = evtchn_2l_resume,
+	.percpu_deinit     = evtchn_2l_percpu_deinit,
+};
+
+/* Select the 2-level ABI by installing its operations table. */
+void __init xen_evtchn_2l_init(void)
+{
+	evtchn_ops = &evtchn_ops_2l;
+	pr_info("Using 2-level ABI\n");
+}
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
new file mode 100644
index 000000000..d13802703
--- /dev/null
+++ b/drivers/xen/events/events_base.c
@@ -0,0 +1,2182 @@
+/*
+ * Xen event channels
+ *
+ * Xen models interrupts with abstract event channels.  Because each
+ * domain gets 1024 event channels, but NR_IRQ is not that large, we
+ * must dynamically map irqs<->event channels.  The event channels
+ * interface with the rest of the kernel by defining a xen interrupt
+ * chip.  When an event is received, it is mapped to an irq and sent
+ * through the normal interrupt processing path.
+ *
+ * There are four kinds of events which can be mapped to an event
+ * channel:
+ *
+ * 1. Inter-domain notifications.  This includes all the virtual
+ *    device events, since they're driven by front-ends in another domain
+ *    (typically dom0).
+ * 2. VIRQs, typically used for timers.  These are per-cpu events.
+ * 3. IPIs.
+ * 4. PIRQs - Hardware interrupts.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/linkage.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/moduleparam.h>
+#include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/slab.h>
+#include <linux/irqnr.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <linux/cpuhotplug.h>
+#include <linux/atomic.h>
+#include <linux/ktime.h>
+
+#ifdef CONFIG_X86
+#include <asm/desc.h>
+#include <asm/ptrace.h>
+#include <asm/irq.h>
+#include <asm/io_apic.h>
+#include <asm/i8259.h>
+#include <asm/xen/pci.h>
+#endif
+#include <asm/sync_bitops.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/page.h>
+
+#include <xen/xen.h>
+#include <xen/hvm.h>
+#include <xen/xen-ops.h>
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+#include <xen/interface/hvm/hvm_op.h>
+#include <xen/interface/hvm/params.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/vcpu.h>
+#include <asm/hw_irq.h>
+
+#include "events_internal.h"
+
+/* Parameters declared below are named with a "xen." prefix. */
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "xen."
+
+/*
+ * NOTE(review): the code that consumes these two tunables is outside this
+ * part of the file; confirm their units before relying on the defaults.
+ */
+static uint __read_mostly event_loop_timeout = 2;
+module_param(event_loop_timeout, uint, 0644);
+
+static uint __read_mostly event_eoi_delay = 10;
+module_param(event_eoi_delay, uint, 0644);
+
+/* Operations of the event channel ABI in use (e.g. set by xen_evtchn_2l_init()). */
+const struct evtchn_ops *evtchn_ops;
+
+/*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+ */
+static DEFINE_MUTEX(irq_mapping_update_lock);
+
+/*
+ * Lock protecting event handling loop against removing event channels.
+ * Adding of event channels is no issue as the associated IRQ becomes active
+ * only after everything is setup (before request_[threaded_]irq() the handler
+ * can't be entered for an event, as the event channel will be unmasked only
+ * then).
+ */
+static DEFINE_RWLOCK(evtchn_rwlock);
+
+/*
+ * Lock hierarchy:
+ *
+ * irq_mapping_update_lock
+ *   evtchn_rwlock
+ *     IRQ-desc lock
+ *       percpu eoi_list_lock
+ *         irq_info->lock
+ */
+
+/* All irq_info structures created by xen_irq_init(). */
+static LIST_HEAD(xen_irq_list_head);
+
+/* IRQ <-> VIRQ mapping. */
+static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
+
+/* IRQ <-> IPI mapping */
+static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
+
+/*
+ * Two-level event channel -> irq table: an array of page-sized rows that
+ * are allocated on demand by set_evtchn_to_irq().  Unused slots hold -1.
+ */
+int **evtchn_to_irq;
+#ifdef CONFIG_X86
+static unsigned long *pirq_eoi_map;
+#endif
+static bool (*pirq_needs_eoi)(unsigned irq);
+
+/*
+ * Row/column of event channel e in evtchn_to_irq.  The argument is
+ * parenthesized so that compound expressions can be passed safely.
+ */
+#define EVTCHN_PER_ROW (PAGE_SIZE / sizeof(**evtchn_to_irq))
+#define EVTCHN_ROW(e)  ((e) / EVTCHN_PER_ROW)
+#define EVTCHN_COL(e)  ((e) % EVTCHN_PER_ROW)
+
+/* Xen will never allocate port zero for any purpose. */
+#define VALID_EVTCHN(chn)	((chn) != 0)
+
+/* irq_info pointers for legacy irqs; others are stored as irq chip data. */
+static struct irq_info *legacy_info_ptrs[NR_IRQS_LEGACY];
+
+static struct irq_chip xen_dynamic_chip;
+static struct irq_chip xen_lateeoi_chip;
+static struct irq_chip xen_percpu_chip;
+static struct irq_chip xen_pirq_chip;
+static void enable_dynirq(struct irq_data *data);
+static void disable_dynirq(struct irq_data *data);
+
+/* Per-CPU epoch compared against irq_info->irq_epoch in the late-EOI path. */
+static DEFINE_PER_CPU(unsigned int, irq_epoch);
+
+/* Mark every slot of one evtchn_to_irq row as unmapped (-1). */
+static void clear_evtchn_to_irq_row(int *evtchn_row)
+{
+	int *slot = evtchn_row;
+	int *end = evtchn_row + EVTCHN_PER_ROW;
+
+	while (slot < end) {
+		WRITE_ONCE(*slot, -1);
+		slot++;
+	}
+}
+
+/* Reset every allocated row of evtchn_to_irq; unallocated rows are skipped. */
+static void clear_evtchn_to_irq_all(void)
+{
+	unsigned row;
+
+	for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) {
+		int *evtchn_row = evtchn_to_irq[row];
+
+		if (evtchn_row != NULL)
+			clear_evtchn_to_irq_row(evtchn_row);
+	}
+}
+
+/*
+ * Record that @evtchn maps to @irq, allocating the page-sized row holding
+ * the entry on first use.
+ *
+ * Callers pass -1 as @irq to clear a mapping; since @irq is unsigned the
+ * comparison below matches the converted value of -1.
+ *
+ * Returns 0 on success, -EINVAL if @evtchn is not below
+ * xen_evtchn_max_channels(), or -ENOMEM if a new row cannot be allocated.
+ */
+static int set_evtchn_to_irq(unsigned evtchn, unsigned irq)
+{
+	unsigned row;
+	unsigned col;
+	int *evtchn_row;
+
+	if (evtchn >= xen_evtchn_max_channels())
+		return -EINVAL;
+
+	row = EVTCHN_ROW(evtchn);
+	col = EVTCHN_COL(evtchn);
+
+	if (evtchn_to_irq[row] == NULL) {
+		/* Unallocated irq entries return -1 anyway */
+		if (irq == -1)
+			return 0;
+
+		evtchn_row = (int *) __get_free_pages(GFP_KERNEL, 0);
+		if (evtchn_row == NULL)
+			return -ENOMEM;
+
+		clear_evtchn_to_irq_row(evtchn_row);
+
+		/*
+		 * We've prepared an empty row for the mapping. If a different
+		 * thread was faster inserting it, we can drop ours.
+		 */
+		if (cmpxchg(&evtchn_to_irq[row], NULL, evtchn_row) != NULL)
+			free_page((unsigned long) evtchn_row);
+	}
+
+	/* Whichever row won the race above is the one that gets written. */
+	WRITE_ONCE(evtchn_to_irq[row][col], irq);
+	return 0;
+}
+
+/*
+ * Look up the irq mapped to @evtchn.  Returns -1 for an out-of-range
+ * channel or a channel whose row has not been allocated; otherwise the
+ * stored slot value.
+ */
+int get_evtchn_to_irq(unsigned evtchn)
+{
+	int *evtchn_row;
+
+	if (evtchn >= xen_evtchn_max_channels())
+		return -1;
+
+	evtchn_row = evtchn_to_irq[EVTCHN_ROW(evtchn)];
+	if (evtchn_row == NULL)
+		return -1;
+
+	return READ_ONCE(evtchn_row[EVTCHN_COL(evtchn)]);
+}
+
+/*
+ * Get info for IRQ: legacy irqs are looked up in legacy_info_ptrs, all
+ * others through the irq's chip data.
+ */
+struct irq_info *info_for_irq(unsigned irq)
+{
+	if (irq >= nr_legacy_irqs())
+		return irq_get_chip_data(irq);
+
+	return legacy_info_ptrs[irq];
+}
+
+/* Store @info for @irq in the place info_for_irq() reads it from. */
+static void set_info_for_irq(unsigned int irq, struct irq_info *info)
+{
+	if (irq >= nr_legacy_irqs()) {
+		irq_set_chip_data(irq, info);
+		return;
+	}
+
+	legacy_info_ptrs[irq] = info;
+}
+
+/* Constructors for packed IRQ information. */
+/*
+ * Fill in the fields shared by all irq types, publish the evtchn -> irq
+ * mapping and set up the port.
+ *
+ * It is a bug to call this for an info that is already bound to a
+ * different type.  The mask reason starts out as EVT_MASK_REASON_EXPLICIT.
+ *
+ * Returns a negative error from set_evtchn_to_irq(), otherwise the result
+ * of xen_evtchn_port_setup().
+ */
+static int xen_irq_info_common_setup(struct irq_info *info,
+				     unsigned irq,
+				     enum xen_irq_type type,
+				     unsigned evtchn,
+				     unsigned short cpu)
+{
+	int ret;
+
+	BUG_ON(info->type != IRQT_UNBOUND && info->type != type);
+
+	info->type = type;
+	info->irq = irq;
+	info->evtchn = evtchn;
+	info->cpu = cpu;
+	info->mask_reason = EVT_MASK_REASON_EXPLICIT;
+	raw_spin_lock_init(&info->lock);
+
+	ret = set_evtchn_to_irq(evtchn, irq);
+	if (ret < 0)
+		return ret;
+
+	irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN);
+
+	return xen_evtchn_port_setup(info);
+}
+
+/* Bind @irq to @evtchn as a plain IRQT_EVTCHN, initially on CPU 0. */
+static int xen_irq_info_evtchn_setup(unsigned irq,
+				     unsigned evtchn)
+{
+	return xen_irq_info_common_setup(info_for_irq(irq), irq, IRQT_EVTCHN,
+					 evtchn, 0);
+}
+
+/*
+ * Bind @irq to @evtchn as the IPI @ipi of @cpu and record the irq in
+ * that CPU's ipi_to_irq table.  The common setup is called with CPU 0.
+ */
+static int xen_irq_info_ipi_setup(unsigned cpu,
+				  unsigned irq,
+				  unsigned evtchn,
+				  enum ipi_vector ipi)
+{
+	struct irq_info *info = info_for_irq(irq);
+
+	per_cpu(ipi_to_irq, cpu)[ipi] = irq;
+	info->u.ipi = ipi;
+
+	return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0);
+}
+
+/*
+ * Bind @irq to @evtchn as the VIRQ @virq of @cpu and record the irq in
+ * that CPU's virq_to_irq table.  The common setup is called with CPU 0.
+ */
+static int xen_irq_info_virq_setup(unsigned cpu,
+				   unsigned irq,
+				   unsigned evtchn,
+				   unsigned virq)
+{
+	struct irq_info *info = info_for_irq(irq);
+
+	per_cpu(virq_to_irq, cpu)[virq] = irq;
+	info->u.virq = virq;
+
+	return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0);
+}
+
+/*
+ * Bind @irq to @evtchn as a PIRQ, storing the pirq number, GSI, owning
+ * domain and flags in the info.  The common setup is called with CPU 0.
+ */
+static int xen_irq_info_pirq_setup(unsigned irq,
+				   unsigned evtchn,
+				   unsigned pirq,
+				   unsigned gsi,
+				   uint16_t domid,
+				   unsigned char flags)
+{
+	struct irq_info *info = info_for_irq(irq);
+
+	info->u.pirq.flags = flags;
+	info->u.pirq.domid = domid;
+	info->u.pirq.gsi = gsi;
+	info->u.pirq.pirq = pirq;
+
+	return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0);
+}
+
+/*
+ * Undo the channel binding of @info: drop the evtchn -> irq mapping, let
+ * the ABI backend remove the port from info->cpu, and mark the info as
+ * having no event channel (port 0 is never valid).
+ */
+static void xen_irq_info_cleanup(struct irq_info *info)
+{
+	set_evtchn_to_irq(info->evtchn, -1);
+	xen_evtchn_port_remove(info->evtchn, info->cpu);
+	info->evtchn = 0;
+}
+
+/*
+ * Accessors for packed IRQ information.
+ */
+/*
+ * Return the event channel bound to @irq, or 0 (never a valid port) when
+ * @irq is out of range or has no info attached.
+ */
+unsigned int evtchn_from_irq(unsigned irq)
+{
+	const struct irq_info *info;
+
+	if (unlikely(irq >= nr_irqs))
+		return 0;
+
+	info = info_for_irq(irq);
+
+	return info ? info->evtchn : 0;
+}
+
+/*
+ * Exported wrapper around get_evtchn_to_irq().  Note that the return type
+ * is unsigned, so the helper's -1 ("no mapping") is returned converted to
+ * unsigned.
+ */
+unsigned irq_from_evtchn(unsigned int evtchn)
+{
+	return get_evtchn_to_irq(evtchn);
+}
+EXPORT_SYMBOL_GPL(irq_from_evtchn);
+
+/*
+ * Return the irq recorded for @virq on @cpu; entries start out as -1.
+ * @virq is used as an array index without a range check.
+ */
+int irq_from_virq(unsigned int cpu, unsigned int virq)
+{
+	return per_cpu(virq_to_irq, cpu)[virq];
+}
+
+/* Return the IPI vector of @irq; BUGs if @irq has no info or is not an IPI. */
+static enum ipi_vector ipi_from_irq(unsigned irq)
+{
+	const struct irq_info *info = info_for_irq(irq);
+
+	BUG_ON(!info);
+	BUG_ON(info->type != IRQT_IPI);
+
+	return info->u.ipi;
+}
+
+/* Return the VIRQ number of @irq; BUGs if @irq has no info or is not a VIRQ. */
+static unsigned virq_from_irq(unsigned irq)
+{
+	const struct irq_info *info = info_for_irq(irq);
+
+	BUG_ON(!info);
+	BUG_ON(info->type != IRQT_VIRQ);
+
+	return info->u.virq;
+}
+
+/* Return the pirq number of @irq; BUGs if @irq has no info or is not a PIRQ. */
+static unsigned pirq_from_irq(unsigned irq)
+{
+	const struct irq_info *info = info_for_irq(irq);
+
+	BUG_ON(!info);
+	BUG_ON(info->type != IRQT_PIRQ);
+
+	return info->u.pirq.pirq;
+}
+
+/* Return the Xen irq type of @irq.  The info is dereferenced unchecked. */
+static enum xen_irq_type type_from_irq(unsigned irq)
+{
+	const struct irq_info *info = info_for_irq(irq);
+
+	return info->type;
+}
+
+/* Return the CPU recorded for @irq.  The info is dereferenced unchecked. */
+unsigned cpu_from_irq(unsigned irq)
+{
+	const struct irq_info *info = info_for_irq(irq);
+
+	return info->cpu;
+}
+
+/* Return the CPU @evtchn is bound to, or 0 when it has no irq mapping. */
+unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+	int irq = get_evtchn_to_irq(evtchn);
+
+	if (irq == -1)
+		return 0;
+
+	return cpu_from_irq(irq);
+}
+
+/*
+ * Add @reason to the set of reasons for which @info's channel is masked.
+ * The channel is actually masked only on the transition from "no reason"
+ * to "some reason".  Serialized by info->lock.
+ */
+static void do_mask(struct irq_info *info, u8 reason)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&info->lock, flags);
+
+	if (!info->mask_reason)
+		mask_evtchn(info->evtchn);
+
+	info->mask_reason |= reason;
+
+	raw_spin_unlock_irqrestore(&info->lock, flags);
+}
+
+/*
+ * Remove @reason from the set of reasons for which @info's channel is
+ * masked, and unmask the channel once no reason remains.  Serialized by
+ * info->lock.
+ */
+static void do_unmask(struct irq_info *info, u8 reason)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&info->lock, flags);
+
+	info->mask_reason &= ~reason;
+
+	if (!info->mask_reason)
+		unmask_evtchn(info->evtchn);
+
+	raw_spin_unlock_irqrestore(&info->lock, flags);
+}
+
+#ifdef CONFIG_X86
+/* Test the bit of @irq's pirq in pirq_eoi_map. */
+static bool pirq_check_eoi_map(unsigned irq)
+{
+	unsigned pirq = pirq_from_irq(irq);
+
+	return test_bit(pirq, pirq_eoi_map);
+}
+#endif
+
+/* Report whether PIRQ_NEEDS_EOI is set for @irq; BUGs if it is not a PIRQ. */
+static bool pirq_needs_eoi_flag(unsigned irq)
+{
+	const struct irq_info *info = info_for_irq(irq);
+
+	BUG_ON(info->type != IRQT_PIRQ);
+
+	return (info->u.pirq.flags & PIRQ_NEEDS_EOI) != 0;
+}
+
+/*
+ * Rebind event channel @chn to @cpu: update the irq's affinity mask (SMP
+ * only), let the ABI backend move the port, and record the new CPU in the
+ * irq info.  It is a bug to call this for a channel without an irq
+ * mapping.
+ */
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+	int irq = get_evtchn_to_irq(chn);
+	struct irq_info *info;
+
+	/* Validate the irq before it is used for any lookup. */
+	BUG_ON(irq == -1);
+
+	info = info_for_irq(irq);
+#ifdef CONFIG_SMP
+	cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(cpu));
+#endif
+	/* The backend still needs the old info->cpu, so update it last. */
+	xen_evtchn_port_bind_to_cpu(info, cpu);
+
+	info->cpu = cpu;
+}
+
+/**
+ * notify_remote_via_irq - send event to remote end of event channel via irq
+ * @irq: irq of event channel to send event to
+ *
+ * Unlike notify_remote_via_evtchn(), this is safe to use across
+ * save/restore. Notifications on a broken connection are silently
+ * dropped: nothing is sent when @irq has no valid event channel.
+ */
+void notify_remote_via_irq(int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	if (!VALID_EVTCHN(evtchn))
+		return;
+
+	notify_remote_via_evtchn(evtchn);
+}
+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+
+/*
+ * Per-CPU state for delayed ("late") EOI handling.
+ * @delayed:       delayed work running xen_irq_lateeoi_worker()
+ * @eoi_list_lock: protects @eoi_list
+ * @eoi_list:      irq_infos awaiting their EOI, kept ordered by eoi_time
+ */
+struct lateeoi_work {
+	struct delayed_work delayed;
+	spinlock_t eoi_list_lock;
+	struct list_head eoi_list;
+};
+
+static DEFINE_PER_CPU(struct lateeoi_work, lateeoi);
+
+/*
+ * Take @info off the late-EOI list of the CPU recorded in info->eoi_cpu,
+ * under that CPU's eoi_list_lock.  The list node is re-initialized so
+ * that list_empty(&info->eoi_list) is true afterwards.
+ */
+static void lateeoi_list_del(struct irq_info *info)
+{
+	struct lateeoi_work *eoi = &per_cpu(lateeoi, info->eoi_cpu);
+	unsigned long flags;
+
+	spin_lock_irqsave(&eoi->eoi_list_lock, flags);
+	list_del_init(&info->eoi_list);
+	spin_unlock_irqrestore(&eoi->eoi_list_lock, flags);
+}
+
+/*
+ * Queue @info on the late-EOI list of the CPU recorded in info->eoi_cpu,
+ * keeping the list sorted by ascending eoi_time.
+ *
+ * The delayed work is (re)armed only when the list was empty; the delay
+ * is the time remaining until info->eoi_time, or 1 if that time has
+ * already passed.
+ */
+static void lateeoi_list_add(struct irq_info *info)
+{
+	struct lateeoi_work *eoi = &per_cpu(lateeoi, info->eoi_cpu);
+	struct irq_info *elem;
+	u64 now = get_jiffies_64();
+	unsigned long delay;
+	unsigned long flags;
+
+	if (now < info->eoi_time)
+		delay = info->eoi_time - now;
+	else
+		delay = 1;
+
+	spin_lock_irqsave(&eoi->eoi_list_lock, flags);
+
+	if (list_empty(&eoi->eoi_list)) {
+		list_add(&info->eoi_list, &eoi->eoi_list);
+		mod_delayed_work_on(info->eoi_cpu, system_wq,
+				    &eoi->delayed, delay);
+	} else {
+		/*
+		 * Walk backwards to the last entry that is due no later
+		 * than @info and insert after it.  If no such entry exists
+		 * the loop ends on the list head, so @info becomes the
+		 * first entry.
+		 */
+		list_for_each_entry_reverse(elem, &eoi->eoi_list, eoi_list) {
+			if (elem->eoi_time <= info->eoi_time)
+				break;
+		}
+		list_add(&info->eoi_list, &elem->eoi_list);
+	}
+
+	spin_unlock_irqrestore(&eoi->eoi_list_lock, flags);
+}
+
+/*
+ * Signal (or defer) the EOI for @info.  The callers in this file hold
+ * evtchn_rwlock for reading.
+ *
+ * Nothing is done if the info has no valid event channel or is already
+ * queued on a late-EOI list.  A @spurious event bumps spurious_cnt and,
+ * from the second one on, schedules the EOI with an exponentially growing
+ * delay capped at HZ jiffies; a non-spurious event resets the counter.
+ * If an EOI time is set and either a delay was just computed or the irq's
+ * epoch still matches the eoi_cpu's epoch, the info is queued for the
+ * worker.  Otherwise the EOI happens immediately: is_active is cleared
+ * and the EOI_PENDING mask reason is dropped.
+ */
+static void xen_irq_lateeoi_locked(struct irq_info *info, bool spurious)
+{
+	evtchn_port_t evtchn;
+	unsigned int cpu;
+	unsigned int delay = 0;
+
+	evtchn = info->evtchn;
+	if (!VALID_EVTCHN(evtchn) || !list_empty(&info->eoi_list))
+		return;
+
+	if (spurious) {
+		/* Stop counting once 1 << spurious_cnt reaches HZ << 2. */
+		if ((1 << info->spurious_cnt) < (HZ << 2))
+			info->spurious_cnt++;
+		if (info->spurious_cnt > 1) {
+			delay = 1 << (info->spurious_cnt - 2);
+			if (delay > HZ)
+				delay = HZ;
+			/* No EOI time set yet: defer on the current CPU. */
+			if (!info->eoi_time)
+				info->eoi_cpu = smp_processor_id();
+			info->eoi_time = get_jiffies_64() + delay;
+		}
+	} else {
+		info->spurious_cnt = 0;
+	}
+
+	cpu = info->eoi_cpu;
+	if (info->eoi_time &&
+	    (info->irq_epoch == per_cpu(irq_epoch, cpu) || delay)) {
+		lateeoi_list_add(info);
+		return;
+	}
+
+	info->eoi_time = 0;
+
+	/* is_active hasn't been reset yet, do it now. */
+	smp_store_release(&info->is_active, 0);
+	do_unmask(info, EVT_MASK_REASON_EOI_PENDING);
+}
+
+/*
+ * Delayed-work handler for one CPU's late-EOI list.
+ *
+ * Under evtchn_rwlock (read side), repeatedly takes the first list entry
+ * whose eoi_time has been reached, unlinks it and performs its EOI.  The
+ * loop stops at the first entry that is not yet due (or when the list is
+ * empty); in the former case the work is re-armed for the time remaining
+ * until that entry is due.
+ */
+static void xen_irq_lateeoi_worker(struct work_struct *work)
+{
+	struct lateeoi_work *eoi;
+	struct irq_info *info;
+	u64 now = get_jiffies_64();
+	unsigned long flags;
+
+	eoi = container_of(to_delayed_work(work), struct lateeoi_work, delayed);
+
+	read_lock_irqsave(&evtchn_rwlock, flags);
+
+	while (true) {
+		/* Interrupts are already off due to read_lock_irqsave(). */
+		spin_lock(&eoi->eoi_list_lock);
+
+		info = list_first_entry_or_null(&eoi->eoi_list, struct irq_info,
+						eoi_list);
+
+		if (info == NULL || now < info->eoi_time) {
+			spin_unlock(&eoi->eoi_list_lock);
+			break;
+		}
+
+		list_del_init(&info->eoi_list);
+
+		spin_unlock(&eoi->eoi_list_lock);
+
+		/* With eoi_time cleared the call below cannot requeue it. */
+		info->eoi_time = 0;
+
+		xen_irq_lateeoi_locked(info, false);
+	}
+
+	/* Non-NULL here means the loop stopped on an entry not yet due. */
+	if (info)
+		mod_delayed_work_on(info->eoi_cpu, system_wq,
+				    &eoi->delayed, info->eoi_time - now);
+
+	read_unlock_irqrestore(&evtchn_rwlock, flags);
+}
+
+/* Initialize the late-EOI list, its lock and the delayed work of @cpu. */
+static void xen_cpu_init_eoi(unsigned int cpu)
+{
+	struct lateeoi_work *work = &per_cpu(lateeoi, cpu);
+
+	INIT_LIST_HEAD(&work->eoi_list);
+	spin_lock_init(&work->eoi_list_lock);
+	INIT_DELAYED_WORK(&work->delayed, xen_irq_lateeoi_worker);
+}
+
+/*
+ * Exported entry point for signalling the EOI of @irq.  The event is
+ * treated as spurious when XEN_EOI_FLAG_SPURIOUS is set in @eoi_flags.
+ * Runs under the read side of evtchn_rwlock and silently does nothing if
+ * @irq has no info attached.
+ */
+void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags)
+{
+	struct irq_info *info;
+	unsigned long flags;
+
+	read_lock_irqsave(&evtchn_rwlock, flags);
+
+	info = info_for_irq(irq);
+
+	if (info)
+		xen_irq_lateeoi_locked(info, eoi_flags & XEN_EOI_FLAG_SPURIOUS);
+
+	read_unlock_irqrestore(&evtchn_rwlock, flags);
+}
+EXPORT_SYMBOL_GPL(xen_irq_lateeoi);
+
+/*
+ * Allocate and attach the Xen metadata for @irq and add it to
+ * xen_irq_list_head.  The new info starts out as IRQT_UNBOUND with a
+ * refcnt of -1 (__unbind_from_irq() only counts down refcnt values above
+ * zero).  Panics if the allocation fails.
+ */
+static void xen_irq_init(unsigned irq)
+{
+	struct irq_info *info;
+
+#ifdef CONFIG_SMP
+	/* By default all event channels notify CPU#0. */
+	cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(0));
+#endif
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (info == NULL)
+		panic("Unable to allocate metadata for IRQ%d\n", irq);
+
+	info->type = IRQT_UNBOUND;
+	info->refcnt = -1;
+
+	set_info_for_irq(irq, info);
+
+	INIT_LIST_HEAD(&info->eoi_list);
+	list_add_tail(&info->list, &xen_irq_list_head);
+}
+
+/*
+ * Allocate @nvec consecutive irq descriptors and initialize the Xen
+ * metadata of each.  Returns the first irq, or the negative value from
+ * irq_alloc_descs() on failure.
+ */
+static int __must_check xen_allocate_irqs_dynamic(int nvec)
+{
+	int first = irq_alloc_descs(-1, 0, nvec, -1);
+	int n;
+
+	if (first < 0)
+		return first;
+
+	for (n = 0; n < nvec; n++)
+		xen_irq_init(first + n);
+
+	return first;
+}
+
+/* Allocate a single dynamic irq; see xen_allocate_irqs_dynamic(). */
+static inline int __must_check xen_allocate_irq_dynamic(void)
+{
+
+	return xen_allocate_irqs_dynamic(1);
+}
+
+/*
+ * Allocate the irq to be used for @gsi and initialize its Xen metadata.
+ *
+ * Returns the irq number, or a negative value if no descriptor could be
+ * allocated.
+ */
+static int __must_check xen_allocate_irq_gsi(unsigned gsi)
+{
+	int irq;
+
+	/*
+	 * A PV guest has no concept of a GSI (since it has no ACPI
+	 * nor access to/knowledge of the physical APICs). Therefore
+	 * all IRQs are dynamically allocated from the entire IRQ
+	 * space.
+	 */
+	if (xen_pv_domain() && !xen_initial_domain())
+		return xen_allocate_irq_dynamic();
+
+	/* Legacy IRQ descriptors are already allocated by the arch. */
+	if (gsi < nr_legacy_irqs())
+		irq = gsi;
+	else
+		irq = irq_alloc_desc_at(gsi, -1);
+
+	/*
+	 * Descriptor allocation can fail; there is then no irq to attach
+	 * metadata to, so hand the error to the caller untouched.
+	 */
+	if (irq < 0)
+		return irq;
+
+	xen_irq_init(irq);
+
+	return irq;
+}
+
+/*
+ * Tear down the Xen metadata of @irq and, for non-legacy irqs, free the
+ * irq descriptor as well.
+ *
+ * The info is unlinked (late-EOI list, global list, irq -> info pointer)
+ * under the write side of evtchn_rwlock and freed only after the lock
+ * has been dropped.  Warns and returns if @irq has no info, and warns if
+ * the info still has a positive reference count.
+ */
+static void xen_free_irq(unsigned irq)
+{
+	struct irq_info *info = info_for_irq(irq);
+	unsigned long flags;
+
+	if (WARN_ON(!info))
+		return;
+
+	write_lock_irqsave(&evtchn_rwlock, flags);
+
+	if (!list_empty(&info->eoi_list))
+		lateeoi_list_del(info);
+
+	list_del(&info->list);
+
+	set_info_for_irq(irq, NULL);
+
+	WARN_ON(info->refcnt > 0);
+
+	write_unlock_irqrestore(&evtchn_rwlock, flags);
+
+	kfree(info);
+
+	/* Legacy IRQ descriptors are managed by the arch. */
+	if (irq < nr_legacy_irqs())
+		return;
+
+	irq_free_desc(irq);
+}
+
+/* Close event channel @port via EVTCHNOP_close; any failure is a BUG. */
+static void xen_evtchn_close(unsigned int port)
+{
+	struct evtchn_close close = { .port = port };
+	int rc = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
+
+	if (rc != 0)
+		BUG();
+}
+
+/*
+ * Mark the handler of @info as no longer running (store with release
+ * semantics) and then clear the channel's pending state.
+ */
+static void event_handler_exit(struct irq_info *info)
+{
+	smp_store_release(&info->is_active, 0);
+	clear_evtchn(info->evtchn);
+}
+
+/*
+ * Ask the hypervisor whether the pirq behind @irq needs an explicit EOI
+ * and mirror the answer in the PIRQ_NEEDS_EOI flag.  If the query fails
+ * the flag ends up cleared.  BUGs if @irq is not a PIRQ.
+ */
+static void pirq_query_unmask(int irq)
+{
+	struct physdev_irq_status_query irq_status;
+	struct irq_info *info = info_for_irq(irq);
+
+	BUG_ON(info->type != IRQT_PIRQ);
+
+	irq_status.irq = pirq_from_irq(irq);
+	/* Treat a failed query as "no flags set". */
+	if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
+		irq_status.flags = 0;
+
+	info->u.pirq.flags &= ~PIRQ_NEEDS_EOI;
+	if (irq_status.flags & XENIRQSTAT_needs_eoi)
+		info->u.pirq.flags |= PIRQ_NEEDS_EOI;
+}
+
+/*
+ * End-of-interrupt handling for a PIRQ.
+ *
+ * Leaves the handler-active state and clears the pending bit; if an
+ * affinity change is pending on an enabled irq, this is done with the
+ * channel temporarily masked so that the irq can be moved.  Finally the
+ * hypervisor is sent PHYSDEVOP_eoi when pirq_needs_eoi() says so.
+ *
+ * NOTE(review): the initializer of 'eoi' calls pirq_from_irq(), which
+ * BUGs on a NULL info, so the NULL tolerance in the 'evtchn' initializer
+ * is never what decides the outcome.
+ */
+static void eoi_pirq(struct irq_data *data)
+{
+	struct irq_info *info = info_for_irq(data->irq);
+	int evtchn = info ? info->evtchn : 0;
+	struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) };
+	int rc = 0;
+
+	if (!VALID_EVTCHN(evtchn))
+		return;
+
+	if (unlikely(irqd_is_setaffinity_pending(data)) &&
+	    likely(!irqd_irq_disabled(data))) {
+		do_mask(info, EVT_MASK_REASON_TEMPORARY);
+
+		event_handler_exit(info);
+
+		irq_move_masked_irq(data);
+
+		do_unmask(info, EVT_MASK_REASON_TEMPORARY);
+	} else
+		event_handler_exit(info);
+
+	if (pirq_needs_eoi(data->irq)) {
+		rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
+		WARN_ON(rc);
+	}
+}
+
+/* Mask-and-ack for a PIRQ: disable the irq first, then run the EOI path. */
+static void mask_ack_pirq(struct irq_data *data)
+{
+	disable_dynirq(data);
+	eoi_pirq(data);
+}
+
+/*
+ * Start up the PIRQ behind @irq.
+ *
+ * If no event channel is bound yet, one is obtained with
+ * EVTCHNOP_bind_pirq, mapped to @irq, bound to CPU 0 and set up.  The
+ * channel is then unmasked (EXPLICIT reason) and an EOI is issued.
+ *
+ * Returns 0 on every path; failures are only logged.  BUGs if @irq is not
+ * a PIRQ.
+ */
+static unsigned int __startup_pirq(unsigned int irq)
+{
+	struct evtchn_bind_pirq bind_pirq;
+	struct irq_info *info = info_for_irq(irq);
+	int evtchn = evtchn_from_irq(irq);
+	int rc;
+
+	BUG_ON(info->type != IRQT_PIRQ);
+
+	/* Already bound: only the unmask/EOI step is needed. */
+	if (VALID_EVTCHN(evtchn))
+		goto out;
+
+	bind_pirq.pirq = pirq_from_irq(irq);
+	/* NB. We are happy to share unless we are probing. */
+	bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
+					BIND_PIRQ__WILL_SHARE : 0;
+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
+	if (rc != 0) {
+		pr_warn("Failed to obtain physical IRQ %d\n", irq);
+		return 0;
+	}
+	evtchn = bind_pirq.port;
+
+	pirq_query_unmask(irq);
+
+	rc = set_evtchn_to_irq(evtchn, irq);
+	if (rc)
+		goto err;
+
+	info->evtchn = evtchn;
+	bind_evtchn_to_cpu(evtchn, 0);
+
+	rc = xen_evtchn_port_setup(info);
+	if (rc)
+		goto err;
+
+out:
+	do_unmask(info, EVT_MASK_REASON_EXPLICIT);
+
+	eoi_pirq(irq_get_irq_data(irq));
+
+	return 0;
+
+err:
+	/*
+	 * NOTE(review): only the channel is closed here; a mapping or
+	 * info->evtchn value stored above is not rolled back.
+	 */
+	pr_err("irq%d: Failed to set port to irq mapping (%d)\n", irq, rc);
+	xen_evtchn_close(evtchn);
+	return 0;
+}
+
+/* irq_data-based wrapper around __startup_pirq(). */
+static unsigned int startup_pirq(struct irq_data *data)
+{
+	return __startup_pirq(data->irq);
+}
+
+/*
+ * Shut down a PIRQ: mask its channel (EXPLICIT reason), close the channel
+ * and drop the channel binding from the info.  Does nothing if no valid
+ * channel is bound.  BUGs if the irq is not a PIRQ.
+ */
+static void shutdown_pirq(struct irq_data *data)
+{
+	unsigned int irq = data->irq;
+	struct irq_info *info = info_for_irq(irq);
+	unsigned evtchn = evtchn_from_irq(irq);
+
+	BUG_ON(info->type != IRQT_PIRQ);
+
+	if (!VALID_EVTCHN(evtchn))
+		return;
+
+	do_mask(info, EVT_MASK_REASON_EXPLICIT);
+	xen_evtchn_close(evtchn);
+	xen_irq_info_cleanup(info);
+}
+
+/* Enabling a PIRQ is the same as enabling a dynamic irq. */
+static void enable_pirq(struct irq_data *data)
+{
+	enable_dynirq(data);
+}
+
+/* Disabling a PIRQ is the same as disabling a dynamic irq. */
+static void disable_pirq(struct irq_data *data)
+{
+	disable_dynirq(data);
+}
+
+/*
+ * Find the irq of the first PIRQ on xen_irq_list_head whose GSI is @gsi.
+ * Returns -1 if there is none.
+ */
+int xen_irq_from_gsi(unsigned gsi)
+{
+	struct irq_info *cur;
+
+	list_for_each_entry(cur, &xen_irq_list_head, list) {
+		if (cur->type == IRQT_PIRQ && cur->u.pirq.gsi == gsi)
+			return cur->irq;
+	}
+
+	return -1;
+}
+EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
+
+/*
+ * Drop one reference to @irq and tear it down when none remains.
+ *
+ * A positive refcnt is decremented and the function returns while it is
+ * still non-zero; a refcnt that is not positive goes straight to
+ * teardown.  Teardown closes a valid event channel, resets the per-CPU
+ * VIRQ/IPI table entry that pointed at @irq, cleans up the channel
+ * binding and finally frees the irq.
+ */
+static void __unbind_from_irq(unsigned int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+	struct irq_info *info = info_for_irq(irq);
+
+	if (info->refcnt > 0) {
+		info->refcnt--;
+		if (info->refcnt != 0)
+			return;
+	}
+
+	if (VALID_EVTCHN(evtchn)) {
+		/* Read the CPU before the cleanup below. */
+		unsigned int cpu = cpu_from_irq(irq);
+
+		xen_evtchn_close(evtchn);
+
+		switch (type_from_irq(irq)) {
+		case IRQT_VIRQ:
+			per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1;
+			break;
+		case IRQT_IPI:
+			per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1;
+			break;
+		default:
+			break;
+		}
+
+		xen_irq_info_cleanup(info);
+	}
+
+	xen_free_irq(irq);
+}
+
+/*
+ * Bind a GSI-backed pirq to a Linux irq.
+ *
+ * @gsi:       GSI to bind; also used to look up an already bound irq.
+ * @pirq:      Xen pirq number recorded for the irq.
+ * @shareable: non-zero marks the pirq PIRQ_SHAREABLE and selects
+ *             handle_fasteoi_irq; zero selects handle_edge_irq.
+ * @name:      handler name passed to irq_set_chip_and_handler_name().
+ *
+ * Returns the irq number, or a negative value on failure.
+ *
+ * Do not make any assumptions regarding the relationship between the
+ * IRQ number returned here and the Xen pirq argument.
+ *
+ * Note: We don't assign an event channel until the irq actually started
+ * up.  Return an existing irq if we've already got one for the gsi.
+ *
+ * Shareable implies level triggered, not shareable implies edge
+ * triggered here.
+ */
+int xen_bind_pirq_gsi_to_irq(unsigned gsi,
+			     unsigned pirq, int shareable, char *name)
+{
+	int irq = -1;
+	struct physdev_irq irq_op;
+	int ret;
+
+	mutex_lock(&irq_mapping_update_lock);
+
+	/* Reuse the irq if this gsi has been bound before. */
+	irq = xen_irq_from_gsi(gsi);
+	if (irq != -1) {
+		pr_info("%s: returning irq %d for gsi %u\n",
+			__func__, irq, gsi);
+		goto out;
+	}
+
+	irq = xen_allocate_irq_gsi(gsi);
+	if (irq < 0)
+		goto out;
+
+	irq_op.irq = irq;
+	irq_op.vector = 0;
+
+	/* Only the privileged domain can do this. For non-priv, the pcifront
+	 * driver provides a PCI bus that does the call to do exactly
+	 * this in the priv domain. */
+	if (xen_initial_domain() &&
+	    HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
+		xen_free_irq(irq);
+		irq = -ENOSPC;
+		goto out;
+	}
+
+	ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF,
+			       shareable ? PIRQ_SHAREABLE : 0);
+	if (ret < 0) {
+		__unbind_from_irq(irq);
+		irq = ret;
+		goto out;
+	}
+
+	pirq_query_unmask(irq);
+	/* We try to use the handler with the appropriate semantic for the
+	 * type of interrupt: if the interrupt is an edge triggered
+	 * interrupt we use handle_edge_irq.
+	 *
+	 * On the other hand if the interrupt is level triggered we use
+	 * handle_fasteoi_irq like the native code does for this kind of
+	 * interrupts.
+	 *
+	 * Depending on the Xen version, pirq_needs_eoi might return true
+	 * not only for level triggered interrupts but for edge triggered
+	 * interrupts too. In any case Xen always honors the eoi mechanism,
+	 * not injecting any more pirqs of the same kind if the first one
+	 * hasn't received an eoi yet. Therefore using the fasteoi handler
+	 * is the right choice either way.
+	 */
+	if (shareable)
+		irq_set_chip_and_handler_name(irq, &xen_pirq_chip,
+				handle_fasteoi_irq, name);
+	else
+		irq_set_chip_and_handler_name(irq, &xen_pirq_chip,
+				handle_edge_irq, name);
+
+out:
+	mutex_unlock(&irq_mapping_update_lock);
+
+	return irq;
+}
+
+#ifdef CONFIG_PCI_MSI
+/*
+ * Ask the hypervisor for a free pirq suitable for an MSI.
+ *
+ * @dev and @msidesc are not used here.  Returns the pirq number on
+ * success and -1 if the PHYSDEVOP_get_free_pirq hypercall fails; a
+ * one-time warning is printed when the failure is -ENOSYS.
+ */
+int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc)
+{
+	struct physdev_get_free_pirq op_get_free_pirq;
+	int rc;
+
+	op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI;
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
+
+	WARN_ONCE(rc == -ENOSYS,
+		  "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n");
+
+	if (rc)
+		return -1;
+
+	return op_get_free_pirq.pirq;
+}
+
+/*
+ * Bind @nvec consecutive MSI pirqs, starting at @pirq, to a freshly
+ * allocated run of @nvec consecutive irqs.
+ *
+ * Every irq gets xen_pirq_chip with handle_edge_irq; all but the first are
+ * flagged PIRQ_MSI_GROUP.  @msidesc is attached to the first irq only.
+ *
+ * Returns the first irq number on success or a negative value on failure.
+ * If a setup step fails after the irqs were allocated, all @nvec irqs are
+ * unbound again.
+ */
+int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+			     int pirq, int nvec, const char *name, domid_t domid)
+{
+	int i, irq, ret;
+
+	mutex_lock(&irq_mapping_update_lock);
+
+	irq = xen_allocate_irqs_dynamic(nvec);
+	if (irq < 0)
+		goto out;
+
+	for (i = 0; i < nvec; i++) {
+		irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name);
+
+		ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid,
+					      i == 0 ? 0 : PIRQ_MSI_GROUP);
+		if (ret < 0)
+			goto error_irq;
+	}
+
+	ret = irq_set_msi_desc(irq, msidesc);
+	if (ret < 0)
+		goto error_irq;
+out:
+	mutex_unlock(&irq_mapping_update_lock);
+	return irq;
+error_irq:
+	/* Undo the whole allocation, last vector first. */
+	while (nvec--)
+		__unbind_from_irq(irq + nvec);
+	mutex_unlock(&irq_mapping_update_lock);
+	return ret;
+}
+#endif
+
+/*
+ * Unmap the pirq behind @irq from the hypervisor and free the irq.
+ *
+ * The unmap hypercall is only issued in the initial domain and only for
+ * irqs that are not flagged PIRQ_MSI_GROUP.
+ *
+ * Returns the result of the unmap hypercall, or -ENOENT when no unmap was
+ * attempted (the irq is still freed in that case).  If the unmap fails
+ * for any reason other than -ESRCH on a foreign domain, the irq is left
+ * allocated.
+ */
+int xen_destroy_irq(int irq)
+{
+	struct physdev_unmap_pirq unmap_irq;
+	struct irq_info *info = info_for_irq(irq);
+	int rc = -ENOENT;
+
+	mutex_lock(&irq_mapping_update_lock);
+
+	/*
+	 * Within an MSI group only the first vector carries the PIRQ
+	 * mapping: vectors flagged PIRQ_MSI_GROUP (every vector but the
+	 * first one) skip the PIRQ unmap.
+	 */
+	if (xen_initial_domain() && !(info->u.pirq.flags & PIRQ_MSI_GROUP)) {
+		unmap_irq.pirq = info->u.pirq.pirq;
+		unmap_irq.domid = info->u.pirq.domid;
+		rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
+		/* If another domain quits without making the pci_disable_msix
+		 * call, the Xen hypervisor takes care of freeing the PIRQs
+		 * (free_domain_pirqs).
+		 */
+		if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF))
+			pr_info("domain %d does not have %d anymore\n",
+				info->u.pirq.domid, info->u.pirq.pirq);
+		else if (rc) {
+			pr_warn("unmap irq failed %d\n", rc);
+			goto out;
+		}
+	}
+
+	xen_free_irq(irq);
+
+out:
+	mutex_unlock(&irq_mapping_update_lock);
+	return rc;
+}
+
+/*
+ * Look up the Linux irq bound to the Xen pirq @pirq.
+ *
+ * Walks xen_irq_list_head under irq_mapping_update_lock and returns the
+ * irq of the first pirq-type entry that matches, or -1 if none does.
+ */
+int xen_irq_from_pirq(unsigned pirq)
+{
+	struct irq_info *entry;
+	int found = -1;
+
+	mutex_lock(&irq_mapping_update_lock);
+
+	list_for_each_entry(entry, &xen_irq_list_head, list) {
+		if (entry->type == IRQT_PIRQ && entry->u.pirq.pirq == pirq) {
+			found = entry->irq;
+			break;
+		}
+	}
+
+	mutex_unlock(&irq_mapping_update_lock);
+
+	return found;
+}
+
+
+/* Exported wrapper around pirq_from_irq(): the pirq recorded for @irq. */
+int xen_pirq_from_irq(unsigned irq)
+{
+	return pirq_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(xen_pirq_from_irq);
+
+/*
+ * Bind event channel @evtchn to an irq driven by @chip.
+ *
+ * If the event channel already maps to an irq, that irq is returned (with
+ * a warning if it is not of type IRQT_EVTCHN).  Otherwise a dynamic irq is
+ * allocated, set up with handle_edge_irq and bound to VCPU 0.
+ *
+ * Returns the irq number, -ENOMEM if @evtchn is out of range, or another
+ * negative value if allocation or setup fails.
+ */
+static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip)
+{
+	int irq;
+	int ret;
+
+	if (evtchn >= xen_evtchn_max_channels())
+		return -ENOMEM;
+
+	mutex_lock(&irq_mapping_update_lock);
+
+	irq = get_evtchn_to_irq(evtchn);
+
+	if (irq == -1) {
+		irq = xen_allocate_irq_dynamic();
+		if (irq < 0)
+			goto out;
+
+		irq_set_chip_and_handler_name(irq, chip,
+					      handle_edge_irq, "event");
+
+		ret = xen_irq_info_evtchn_setup(irq, evtchn);
+		if (ret < 0) {
+			__unbind_from_irq(irq);
+			irq = ret;
+			goto out;
+		}
+		/* New interdomain events are bound to VCPU 0. */
+		bind_evtchn_to_cpu(evtchn, 0);
+	} else {
+		struct irq_info *info = info_for_irq(irq);
+		WARN_ON(info == NULL || info->type != IRQT_EVTCHN);
+	}
+
+out:
+	mutex_unlock(&irq_mapping_update_lock);
+
+	return irq;
+}
+
+/* Bind @evtchn to an irq using xen_dynamic_chip; see bind_evtchn_to_irq_chip(). */
+int bind_evtchn_to_irq(evtchn_port_t evtchn)
+{
+	return bind_evtchn_to_irq_chip(evtchn, &xen_dynamic_chip);
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
+
+/* Bind @evtchn to an irq using xen_lateeoi_chip; see bind_evtchn_to_irq_chip(). */
+int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn)
+{
+	return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip);
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi);
+
+/*
+ * Bind IPI vector @ipi on @cpu to an irq.
+ *
+ * Returns the irq already recorded in the per-cpu ipi_to_irq table if
+ * there is one.  Otherwise a dynamic irq is allocated with
+ * xen_percpu_chip/handle_percpu_irq, an IPI event channel is requested
+ * from the hypervisor (failure is fatal) and bound to @cpu.
+ *
+ * Returns the irq number or a negative value on failure.
+ */
+static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+{
+	struct evtchn_bind_ipi bind_ipi;
+	int evtchn, irq;
+	int ret;
+
+	mutex_lock(&irq_mapping_update_lock);
+
+	irq = per_cpu(ipi_to_irq, cpu)[ipi];
+
+	if (irq == -1) {
+		irq = xen_allocate_irq_dynamic();
+		if (irq < 0)
+			goto out;
+
+		irq_set_chip_and_handler_name(irq, &xen_percpu_chip,
+					      handle_percpu_irq, "ipi");
+
+		bind_ipi.vcpu = xen_vcpu_nr(cpu);
+		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+						&bind_ipi) != 0)
+			BUG();
+		evtchn = bind_ipi.port;
+
+		ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi);
+		if (ret < 0) {
+			__unbind_from_irq(irq);
+			irq = ret;
+			goto out;
+		}
+		bind_evtchn_to_cpu(evtchn, cpu);
+	} else {
+		struct irq_info *info = info_for_irq(irq);
+		WARN_ON(info == NULL || info->type != IRQT_IPI);
+	}
+
+ out:
+	mutex_unlock(&irq_mapping_update_lock);
+	return irq;
+}
+
+/*
+ * Create a local event channel connected to @remote_port of
+ * @remote_domain and bind it to an irq driven by @chip.
+ *
+ * Returns the non-zero hypercall result if the interdomain bind fails,
+ * otherwise whatever bind_evtchn_to_irq_chip() returns for the new local
+ * port.
+ */
+static int bind_interdomain_evtchn_to_irq_chip(unsigned int remote_domain,
+					       evtchn_port_t remote_port,
+					       struct irq_chip *chip)
+{
+	struct evtchn_bind_interdomain bind_interdomain;
+	int rc;
+
+	bind_interdomain.remote_dom  = remote_domain;
+	bind_interdomain.remote_port = remote_port;
+
+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+					 &bind_interdomain);
+	if (rc)
+		return rc;
+
+	return bind_evtchn_to_irq_chip(bind_interdomain.local_port, chip);
+}
+
+/*
+ * Interdomain bind using xen_dynamic_chip; see
+ * bind_interdomain_evtchn_to_irq_chip() for the return value.
+ */
+int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
+				   evtchn_port_t remote_port)
+{
+	return bind_interdomain_evtchn_to_irq_chip(remote_domain, remote_port,
+						   &xen_dynamic_chip);
+}
+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq);
+
+/*
+ * Interdomain bind using xen_lateeoi_chip; see
+ * bind_interdomain_evtchn_to_irq_chip() for the return value.
+ */
+int bind_interdomain_evtchn_to_irq_lateeoi(unsigned int remote_domain,
+					   evtchn_port_t remote_port)
+{
+	return bind_interdomain_evtchn_to_irq_chip(remote_domain, remote_port,
+						   &xen_lateeoi_chip);
+}
+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi);
+
+/*
+ * Find the event channel port that is already bound to @virq on @cpu.
+ *
+ * Queries the status of every port of this domain and looks for one in
+ * state EVTCHNSTAT_virq with a matching virq and vcpu.  Ports whose
+ * status query fails are skipped.
+ *
+ * Returns the port number on a match and -ENOENT when no port matches.
+ * The result of the individual status hypercalls is deliberately not
+ * propagated: a successful query on the last, non-matching port must not
+ * be mistaken for "found port 0" by the caller.
+ */
+static int find_virq(unsigned int virq, unsigned int cpu)
+{
+	struct evtchn_status status;
+	int port;
+
+	memset(&status, 0, sizeof(status));
+	for (port = 0; port < xen_evtchn_max_channels(); port++) {
+		status.dom = DOMID_SELF;
+		status.port = port;
+		if (HYPERVISOR_event_channel_op(EVTCHNOP_status, &status) < 0)
+			continue;
+		if (status.status != EVTCHNSTAT_virq)
+			continue;
+		if (status.u.virq == virq && status.vcpu == xen_vcpu_nr(cpu))
+			return port;
+	}
+
+	return -ENOENT;
+}
+
+/**
+ * xen_evtchn_nr_channels - number of usable event channel ports
+ *
+ * This may be less than the maximum supported by the current
+ * hypervisor ABI. Use xen_evtchn_max_channels() for the maximum
+ * supported.
+ *
+ * Return: the value reported by the active evtchn_ops->nr_channels().
+ */
+unsigned xen_evtchn_nr_channels(void)
+{
+        return evtchn_ops->nr_channels();
+}
+EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels);
+
+/*
+ * Bind virtual interrupt @virq on @cpu to an irq.
+ *
+ * Returns the irq already recorded in the per-cpu virq_to_irq table if
+ * there is one.  Otherwise a dynamic irq is allocated; @percpu selects
+ * xen_percpu_chip/handle_percpu_irq, else xen_dynamic_chip/handle_edge_irq
+ * is used.  If the hypervisor reports the virq as already bound (-EEXIST)
+ * the existing port is looked up with find_virq(); any other bind failure
+ * is fatal.
+ *
+ * Returns the irq number or a negative value on failure.
+ */
+int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu)
+{
+	struct evtchn_bind_virq bind_virq;
+	int evtchn, irq, ret;
+
+	mutex_lock(&irq_mapping_update_lock);
+
+	irq = per_cpu(virq_to_irq, cpu)[virq];
+
+	if (irq == -1) {
+		irq = xen_allocate_irq_dynamic();
+		if (irq < 0)
+			goto out;
+
+		if (percpu)
+			irq_set_chip_and_handler_name(irq, &xen_percpu_chip,
+						      handle_percpu_irq, "virq");
+		else
+			irq_set_chip_and_handler_name(irq, &xen_dynamic_chip,
+						      handle_edge_irq, "virq");
+
+		bind_virq.virq = virq;
+		bind_virq.vcpu = xen_vcpu_nr(cpu);
+		ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+						&bind_virq);
+		if (ret == 0)
+			evtchn = bind_virq.port;
+		else {
+			/* Already bound: recover the port it is bound to. */
+			if (ret == -EEXIST)
+				ret = find_virq(virq, cpu);
+			BUG_ON(ret < 0);
+			evtchn = ret;
+		}
+
+		ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq);
+		if (ret < 0) {
+			__unbind_from_irq(irq);
+			irq = ret;
+			goto out;
+		}
+
+		bind_evtchn_to_cpu(evtchn, cpu);
+	} else {
+		struct irq_info *info = info_for_irq(irq);
+		WARN_ON(info == NULL || info->type != IRQT_VIRQ);
+	}
+
+out:
+	mutex_unlock(&irq_mapping_update_lock);
+
+	return irq;
+}
+
+/* Run __unbind_from_irq() on @irq with irq_mapping_update_lock held. */
+static void unbind_from_irq(unsigned int irq)
+{
+	mutex_lock(&irq_mapping_update_lock);
+	__unbind_from_irq(irq);
+	mutex_unlock(&irq_mapping_update_lock);
+}
+
+/*
+ * Bind @evtchn to an irq driven by @chip and install @handler on it.
+ *
+ * @irqflags, @devname and @dev_id are passed through to request_irq().
+ * If request_irq() fails the irq is unbound again.
+ *
+ * Returns the irq number on success, otherwise the negative value from
+ * the bind step or the non-zero result of request_irq().
+ */
+static int bind_evtchn_to_irqhandler_chip(evtchn_port_t evtchn,
+					  irq_handler_t handler,
+					  unsigned long irqflags,
+					  const char *devname, void *dev_id,
+					  struct irq_chip *chip)
+{
+	int irq, retval;
+
+	irq = bind_evtchn_to_irq_chip(evtchn, chip);
+	if (irq < 0)
+		return irq;
+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+	if (retval != 0) {
+		unbind_from_irq(irq);
+		return retval;
+	}
+
+	return irq;
+}
+
+/*
+ * Bind @evtchn and install @handler using xen_dynamic_chip; see
+ * bind_evtchn_to_irqhandler_chip() for the return value.
+ */
+int bind_evtchn_to_irqhandler(evtchn_port_t evtchn,
+			      irq_handler_t handler,
+			      unsigned long irqflags,
+			      const char *devname, void *dev_id)
+{
+	return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags,
+					      devname, dev_id,
+					      &xen_dynamic_chip);
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
+/*
+ * Bind @evtchn and install @handler using xen_lateeoi_chip; see
+ * bind_evtchn_to_irqhandler_chip() for the return value.
+ */
+int bind_evtchn_to_irqhandler_lateeoi(evtchn_port_t evtchn,
+				      irq_handler_t handler,
+				      unsigned long irqflags,
+				      const char *devname, void *dev_id)
+{
+	return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags,
+					      devname, dev_id,
+					      &xen_lateeoi_chip);
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler_lateeoi);
+
+/*
+ * Bind an interdomain event channel (to @remote_port of @remote_domain)
+ * to an irq driven by @chip and install @handler on it.
+ *
+ * @irqflags, @devname and @dev_id are passed through to request_irq().
+ * If request_irq() fails the irq is unbound again.
+ *
+ * Returns the irq number on success, otherwise the negative value from
+ * the bind step or the non-zero result of request_irq().
+ */
+static int bind_interdomain_evtchn_to_irqhandler_chip(
+		unsigned int remote_domain, evtchn_port_t remote_port,
+		irq_handler_t handler, unsigned long irqflags,
+		const char *devname, void *dev_id, struct irq_chip *chip)
+{
+	int irq, retval;
+
+	irq = bind_interdomain_evtchn_to_irq_chip(remote_domain, remote_port,
+						  chip);
+	if (irq < 0)
+		return irq;
+
+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+	if (retval != 0) {
+		unbind_from_irq(irq);
+		return retval;
+	}
+
+	return irq;
+}
+
+/*
+ * Interdomain bind plus handler installation using xen_dynamic_chip; see
+ * bind_interdomain_evtchn_to_irqhandler_chip() for the return value.
+ */
+int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
+					  evtchn_port_t remote_port,
+					  irq_handler_t handler,
+					  unsigned long irqflags,
+					  const char *devname,
+					  void *dev_id)
+{
+	return bind_interdomain_evtchn_to_irqhandler_chip(remote_domain,
+				remote_port, handler, irqflags, devname,
+				dev_id, &xen_dynamic_chip);
+}
+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
+
+/*
+ * Interdomain bind plus handler installation using xen_lateeoi_chip; see
+ * bind_interdomain_evtchn_to_irqhandler_chip() for the return value.
+ */
+int bind_interdomain_evtchn_to_irqhandler_lateeoi(unsigned int remote_domain,
+						  evtchn_port_t remote_port,
+						  irq_handler_t handler,
+						  unsigned long irqflags,
+						  const char *devname,
+						  void *dev_id)
+{
+	return bind_interdomain_evtchn_to_irqhandler_chip(remote_domain,
+				remote_port, handler, irqflags, devname,
+				dev_id, &xen_lateeoi_chip);
+}
+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler_lateeoi);
+
+/*
+ * Bind @virq on @cpu to an irq and install @handler on it.
+ *
+ * The irq is set up as per-cpu when @irqflags contains IRQF_PERCPU.  If
+ * request_irq() fails the irq is unbound again.
+ *
+ * Returns the irq number on success, otherwise the negative value from
+ * bind_virq_to_irq() or the non-zero result of request_irq().
+ */
+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+			    irq_handler_t handler,
+			    unsigned long irqflags, const char *devname, void *dev_id)
+{
+	int irq, retval;
+
+	irq = bind_virq_to_irq(virq, cpu, irqflags & IRQF_PERCPU);
+	if (irq < 0)
+		return irq;
+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+	if (retval != 0) {
+		unbind_from_irq(irq);
+		return retval;
+	}
+
+	return irq;
+}
+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
+
+/*
+ * Bind IPI vector @ipi on @cpu to an irq and install @handler on it.
+ *
+ * IRQF_NO_SUSPEND, IRQF_FORCE_RESUME and IRQF_EARLY_RESUME are always
+ * added to @irqflags.  If request_irq() fails the irq is unbound again.
+ *
+ * Returns the irq number on success, otherwise the negative value from
+ * bind_ipi_to_irq() or the non-zero result of request_irq().
+ */
+int bind_ipi_to_irqhandler(enum ipi_vector ipi,
+			   unsigned int cpu,
+			   irq_handler_t handler,
+			   unsigned long irqflags,
+			   const char *devname,
+			   void *dev_id)
+{
+	int irq, retval;
+
+	irq = bind_ipi_to_irq(ipi, cpu);
+	if (irq < 0)
+		return irq;
+
+	irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME | IRQF_EARLY_RESUME;
+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+	if (retval != 0) {
+		unbind_from_irq(irq);
+		return retval;
+	}
+
+	return irq;
+}
+
+/*
+ * Remove the handler identified by @dev_id from @irq and drop the irq
+ * binding.  Warns and does nothing if @irq has no irq_info.
+ */
+void unbind_from_irqhandler(unsigned int irq, void *dev_id)
+{
+	struct irq_info *info = info_for_irq(irq);
+
+	if (WARN_ON(!info))
+		return;
+	free_irq(irq, dev_id);
+	unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
+
+/**
+ * xen_set_irq_priority() - set an event channel priority.
+ * @irq: irq bound to an event channel.
+ * @priority: priority between XEN_IRQ_PRIORITY_MAX and XEN_IRQ_PRIORITY_MIN.
+ *
+ * Return: the result of the EVTCHNOP_set_priority hypercall.
+ */
+int xen_set_irq_priority(unsigned irq, unsigned priority)
+{
+	struct evtchn_set_priority set_priority;
+
+	set_priority.port = evtchn_from_irq(irq);
+	set_priority.priority = priority;
+
+	return HYPERVISOR_event_channel_op(EVTCHNOP_set_priority,
+					   &set_priority);
+}
+EXPORT_SYMBOL_GPL(xen_set_irq_priority);
+
+/*
+ * Switch the irq bound to @evtchn to reference counting by setting its
+ * refcnt to 1.  Warns if refcnt was not -1 beforehand.
+ *
+ * Returns 0 on success and -ENOENT if no irq or irq_info is bound to
+ * @evtchn.
+ */
+int evtchn_make_refcounted(unsigned int evtchn)
+{
+	int irq = get_evtchn_to_irq(evtchn);
+	struct irq_info *info;
+
+	if (irq == -1)
+		return -ENOENT;
+
+	info = info_for_irq(irq);
+
+	if (!info)
+		return -ENOENT;
+
+	WARN_ON(info->refcnt != -1);
+
+	info->refcnt = 1;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(evtchn_make_refcounted);
+
+/*
+ * Take an additional reference on the irq bound to @evtchn.
+ *
+ * Returns 0 on success, -ENOENT if no irq or irq_info is bound to
+ * @evtchn, and -EINVAL if @evtchn is out of range, the irq is not
+ * reference counted (refcnt <= 0) or the count is already SHRT_MAX.
+ */
+int evtchn_get(unsigned int evtchn)
+{
+	int irq;
+	struct irq_info *info;
+	int err = -ENOENT;
+
+	if (evtchn >= xen_evtchn_max_channels())
+		return -EINVAL;
+
+	mutex_lock(&irq_mapping_update_lock);
+
+	irq = get_evtchn_to_irq(evtchn);
+	if (irq == -1)
+		goto done;
+
+	info = info_for_irq(irq);
+
+	if (!info)
+		goto done;
+
+	err = -EINVAL;
+	if (info->refcnt <= 0 || info->refcnt == SHRT_MAX)
+		goto done;
+
+	info->refcnt++;
+	err = 0;
+ done:
+	mutex_unlock(&irq_mapping_update_lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(evtchn_get);
+
+/*
+ * Drop a reference on the irq bound to @evtchn via unbind_from_irq().
+ * Warns and does nothing if no irq is bound.
+ */
+void evtchn_put(unsigned int evtchn)
+{
+	int irq = get_evtchn_to_irq(evtchn);
+	if (WARN_ON(irq == -1))
+		return;
+	unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(evtchn_put);
+
+/*
+ * Send IPI @vector to @cpu.
+ *
+ * On x86, XEN_NMI_VECTOR is delivered through the VCPUOP_send_nmi
+ * hypercall (a failure is only logged).  Every other vector is sent by
+ * notifying the irq recorded in the per-cpu ipi_to_irq table; a missing
+ * binding is fatal.
+ */
+void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
+{
+	int irq;
+
+#ifdef CONFIG_X86
+	if (unlikely(vector == XEN_NMI_VECTOR)) {
+		int rc =  HYPERVISOR_vcpu_op(VCPUOP_send_nmi, xen_vcpu_nr(cpu),
+					     NULL);
+		if (rc < 0)
+			printk(KERN_WARNING "Sending nmi to CPU%d failed (rc:%d)\n", cpu, rc);
+		return;
+	}
+#endif
+	irq = per_cpu(ipi_to_irq, cpu)[vector];
+	BUG_ON(irq < 0);
+	notify_remote_via_irq(irq);
+}
+
+/* Per-upcall bookkeeping shared by all handle_irq_for_port() calls. */
+struct evtchn_loop_ctrl {
+	ktime_t timeout;	/* deadline; zero until first armed */
+	unsigned count;		/* events seen, used to check every 256th */
+	bool defer_eoi;		/* set once the deadline has passed */
+};
+
+/*
+ * Dispatch the irq bound to event channel @port.
+ *
+ * @ctrl carries the timeout state of the current event loop.  Ports
+ * without an irq are ignored, as are irqs that are already marked active.
+ */
+void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl)
+{
+	int irq;
+	struct irq_info *info;
+
+	irq = get_evtchn_to_irq(port);
+	if (irq == -1)
+		return;
+
+	/*
+	 * Check for timeout every 256 events.
+	 * We are setting the timeout value only after the first 256
+	 * events in order to not hurt the common case of few loop
+	 * iterations. The 256 is basically an arbitrary value.
+	 *
+	 * In case we are hitting the timeout we need to defer all further
+	 * EOIs in order to ensure to leave the event handling loop rather
+	 * sooner than later.
+	 */
+	if (!ctrl->defer_eoi && !(++ctrl->count & 0xff)) {
+		ktime_t kt = ktime_get();
+
+		if (!ctrl->timeout) {
+			kt = ktime_add_ms(kt,
+					  jiffies_to_msecs(event_loop_timeout));
+			ctrl->timeout = kt;
+		} else if (kt > ctrl->timeout) {
+			ctrl->defer_eoi = true;
+		}
+	}
+
+	info = info_for_irq(irq);
+	/* Mark the event active; bail out if it already was. */
+	if (xchg_acquire(&info->is_active, 1))
+		return;
+
+	/* Record where and when the deferred EOI is due. */
+	if (ctrl->defer_eoi) {
+		info->eoi_cpu = smp_processor_id();
+		info->irq_epoch = __this_cpu_read(irq_epoch);
+		info->eoi_time = get_jiffies_64() + event_eoi_delay;
+	}
+
+	generic_handle_irq(irq);
+}
+
+/* Per-cpu count of entries into the upcall loop below. */
+static DEFINE_PER_CPU(unsigned, xed_nesting_count);
+
+/*
+ * Core of the event channel upcall: process pending events on this cpu.
+ *
+ * Runs with evtchn_rwlock read-held.  A nested entry (nesting count
+ * already non-zero) only bumps the count and leaves; the outermost
+ * invocation repeats the loop while nested entries occurred or the
+ * upcall-pending flag was set again.
+ */
+static void __xen_evtchn_do_upcall(void)
+{
+	struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+	int cpu = get_cpu();
+	unsigned count;
+	struct evtchn_loop_ctrl ctrl = { 0 };
+
+	read_lock(&evtchn_rwlock);
+
+	do {
+		vcpu_info->evtchn_upcall_pending = 0;
+
+		/* Nested entry: leave the work to the outer invocation. */
+		if (__this_cpu_inc_return(xed_nesting_count) - 1)
+			goto out;
+
+		xen_evtchn_handle_events(cpu, &ctrl);
+
+		BUG_ON(!irqs_disabled());
+
+		count = __this_cpu_read(xed_nesting_count);
+		__this_cpu_write(xed_nesting_count, 0);
+	} while (count != 1 || vcpu_info->evtchn_upcall_pending);
+
+out:
+	read_unlock(&evtchn_rwlock);
+
+	/*
+	 * Increment irq_epoch only now to defer EOIs only for
+	 * xen_irq_lateeoi() invocations occurring from inside the loop
+	 * above.
+	 */
+	__this_cpu_inc(irq_epoch);
+
+	put_cpu();
+}
+
+/*
+ * Upcall entry point taking the interrupted register state in @regs.
+ *
+ * Installs @regs as the current irq regs, brackets the event processing
+ * with irq_enter()/irq_exit() and restores the previous irq regs
+ * afterwards.  On x86 the hypervisor callback statistic is bumped too.
+ */
+void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	irq_enter();
+#ifdef CONFIG_X86
+	inc_irq_stat(irq_hv_callback_count);
+#endif
+
+	__xen_evtchn_do_upcall();
+
+	irq_exit();
+	set_irq_regs(old_regs);
+}
+
+/*
+ * Exported upcall entry point that runs the event processing directly,
+ * without the irq_enter()/irq_exit() and irq-regs handling done by
+ * xen_evtchn_do_upcall().
+ */
+void xen_hvm_evtchn_do_upcall(void)
+{
+	__xen_evtchn_do_upcall();
+}
+EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall);
+
+/*
+ * Rebind a new event channel to an existing irq.
+ *
+ * @evtchn must not be mapped to any irq yet and @irq must have been bound
+ * before; both conditions are enforced with BUG_ON().  The irq is kept
+ * disabled while the mapping is replaced.
+ */
+void rebind_evtchn_irq(int evtchn, int irq)
+{
+	struct irq_info *info = info_for_irq(irq);
+
+	if (WARN_ON(!info))
+		return;
+
+	/* Make sure the irq is masked, since the new event channel
+	   will also be masked. */
+	disable_irq(irq);
+
+	mutex_lock(&irq_mapping_update_lock);
+
+	/* After resume the irq<->evtchn mappings are all cleared out */
+	BUG_ON(get_evtchn_to_irq(evtchn) != -1);
+	/* Expect irq to have been bound before,
+	   so there should be a proper type */
+	BUG_ON(info->type == IRQT_UNBOUND);
+
+	(void)xen_irq_info_evtchn_setup(irq, evtchn);
+
+	mutex_unlock(&irq_mapping_update_lock);
+
+        bind_evtchn_to_cpu(evtchn, info->cpu);
+	/* This will be deferred until interrupt is processed */
+	irq_set_affinity(irq, cpumask_of(info->cpu));
+
+	/* Unmask the event channel. */
+	enable_irq(irq);
+}
+
+/*
+ * Rebind an evtchn so that it gets delivered to a specific cpu.
+ *
+ * Returns -1 if @info has no valid event channel or rebinding is not
+ * supported, and 0 otherwise - including when the bind hypercall itself
+ * fails, which is deliberately ignored (see below).
+ */
+static int xen_rebind_evtchn_to_cpu(struct irq_info *info, unsigned int tcpu)
+{
+	struct evtchn_bind_vcpu bind_vcpu;
+	evtchn_port_t evtchn = info ? info->evtchn : 0;
+
+	if (!VALID_EVTCHN(evtchn))
+		return -1;
+
+	if (!xen_support_evtchn_rebind())
+		return -1;
+
+	/* Send future instances of this interrupt to other vcpu. */
+	bind_vcpu.port = evtchn;
+	bind_vcpu.vcpu = xen_vcpu_nr(tcpu);
+
+	/*
+	 * Mask the event while changing the VCPU binding to prevent
+	 * it being delivered on an unexpected VCPU.
+	 */
+	do_mask(info, EVT_MASK_REASON_TEMPORARY);
+
+	/*
+	 * If this fails, it usually just indicates that we're dealing with a
+	 * virq or IPI channel, which don't actually need to be rebound. Ignore
+	 * it, but don't do the xenlinux-level rebind in that case.
+	 */
+	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
+		bind_evtchn_to_cpu(evtchn, tcpu);
+
+	do_unmask(info, EVT_MASK_REASON_TEMPORARY);
+
+	return 0;
+}
+
+/*
+ * irq_set_affinity callback: move the irq to the first online cpu in
+ * @dest.  @force is not used.
+ *
+ * Returns the result of xen_rebind_evtchn_to_cpu(); the effective
+ * affinity is only updated when that result is 0.
+ */
+static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
+			    bool force)
+{
+	unsigned tcpu = cpumask_first_and(dest, cpu_online_mask);
+	int ret = xen_rebind_evtchn_to_cpu(info_for_irq(data->irq), tcpu);
+
+	if (!ret)
+		irq_data_update_effective_affinity(data, cpumask_of(tcpu));
+
+	return ret;
+}
+
+/*
+ * Set the affinity of the irq described by @desc to the single cpu @tcpu.
+ * Returns the result of set_affinity_irq().
+ *
+ * To be called with desc->lock held.
+ */
+int xen_set_affinity_evtchn(struct irq_desc *desc, unsigned int tcpu)
+{
+	struct irq_data *d = irq_desc_get_irq_data(desc);
+
+	return set_affinity_irq(d, cpumask_of(tcpu), false);
+}
+EXPORT_SYMBOL_GPL(xen_set_affinity_evtchn);
+
+/*
+ * Drop the EVT_MASK_REASON_EXPLICIT mask reason of the event channel
+ * bound to @data's irq.  No-op if there is no valid event channel.
+ */
+static void enable_dynirq(struct irq_data *data)
+{
+	struct irq_info *info = info_for_irq(data->irq);
+	evtchn_port_t evtchn = info ? info->evtchn : 0;
+
+	if (VALID_EVTCHN(evtchn))
+		do_unmask(info, EVT_MASK_REASON_EXPLICIT);
+}
+
+/*
+ * Mask the event channel bound to @data's irq with reason
+ * EVT_MASK_REASON_EXPLICIT.  No-op if there is no valid event channel.
+ */
+static void disable_dynirq(struct irq_data *data)
+{
+	struct irq_info *info = info_for_irq(data->irq);
+	evtchn_port_t evtchn = info ? info->evtchn : 0;
+
+	if (VALID_EVTCHN(evtchn))
+		do_mask(info, EVT_MASK_REASON_EXPLICIT);
+}
+
+/*
+ * irq_ack callback: finish handling of the event via
+ * event_handler_exit().
+ *
+ * If an affinity change is pending and the irq is not disabled, the
+ * event channel is temporarily masked around event_handler_exit() and
+ * irq_move_masked_irq().
+ */
+static void ack_dynirq(struct irq_data *data)
+{
+	struct irq_info *info = info_for_irq(data->irq);
+	evtchn_port_t evtchn = info ? info->evtchn : 0;
+
+	if (!VALID_EVTCHN(evtchn))
+		return;
+
+	if (unlikely(irqd_is_setaffinity_pending(data)) &&
+	    likely(!irqd_irq_disabled(data))) {
+		do_mask(info, EVT_MASK_REASON_TEMPORARY);
+
+		event_handler_exit(info);
+
+		irq_move_masked_irq(data);
+
+		do_unmask(info, EVT_MASK_REASON_TEMPORARY);
+	} else
+		event_handler_exit(info);
+}
+
+/* irq_mask_ack callback: disable_dynirq() followed by ack_dynirq(). */
+static void mask_ack_dynirq(struct irq_data *data)
+{
+	disable_dynirq(data);
+	ack_dynirq(data);
+}
+
+/*
+ * irq_ack callback of xen_lateeoi_chip.
+ *
+ * Masks the event channel with reason EVT_MASK_REASON_EOI_PENDING and
+ * clears its pending state.  If an affinity change is pending and the
+ * irq is not disabled, the clear and the irq move are additionally
+ * wrapped in a temporary mask.
+ */
+static void lateeoi_ack_dynirq(struct irq_data *data)
+{
+	struct irq_info *info = info_for_irq(data->irq);
+	evtchn_port_t evtchn = info ? info->evtchn : 0;
+
+	if (!VALID_EVTCHN(evtchn))
+		return;
+
+	do_mask(info, EVT_MASK_REASON_EOI_PENDING);
+
+	if (unlikely(irqd_is_setaffinity_pending(data)) &&
+	    likely(!irqd_irq_disabled(data))) {
+		do_mask(info, EVT_MASK_REASON_TEMPORARY);
+
+		clear_evtchn(evtchn);
+
+		irq_move_masked_irq(data);
+
+		do_unmask(info, EVT_MASK_REASON_TEMPORARY);
+	} else
+		clear_evtchn(evtchn);
+}
+
+/*
+ * irq_mask_ack callback of xen_lateeoi_chip: mask the event channel with
+ * reason EVT_MASK_REASON_EXPLICIT, then run ack_dynirq().  No-op if there
+ * is no valid event channel.
+ */
+static void lateeoi_mask_ack_dynirq(struct irq_data *data)
+{
+	struct irq_info *info = info_for_irq(data->irq);
+	evtchn_port_t evtchn = info ? info->evtchn : 0;
+
+	if (VALID_EVTCHN(evtchn)) {
+		do_mask(info, EVT_MASK_REASON_EXPLICIT);
+		ack_dynirq(data);
+	}
+}
+
+/*
+ * irq_retrigger callback: mark the event channel pending again while it
+ * is temporarily masked.
+ *
+ * Returns 1 if the event was re-raised and 0 if no valid event channel is
+ * bound.
+ */
+static int retrigger_dynirq(struct irq_data *data)
+{
+	struct irq_info *info = info_for_irq(data->irq);
+	evtchn_port_t evtchn = info ? info->evtchn : 0;
+
+	if (!VALID_EVTCHN(evtchn))
+		return 0;
+
+	do_mask(info, EVT_MASK_REASON_TEMPORARY);
+	set_evtchn(evtchn);
+	do_unmask(info, EVT_MASK_REASON_TEMPORARY);
+
+	return 1;
+}
+
+/*
+ * Re-establish the GSI based pirqs recorded in xen_irq_list_head.
+ *
+ * For every pirq-type entry with a non-zero gsi the pirq is mapped again
+ * in the hypervisor and then started.  Entries whose mapping fails are
+ * reported and their irq is freed.
+ */
+static void restore_pirqs(void)
+{
+	int pirq, rc, irq, gsi;
+	struct physdev_map_pirq map_irq;
+	struct irq_info *info, *next;
+
+	/*
+	 * xen_free_irq() in the failure path below is expected to take the
+	 * entry off xen_irq_list_head and release it, so the successor has
+	 * to be fetched before the loop body runs; hence the _safe iterator.
+	 */
+	list_for_each_entry_safe(info, next, &xen_irq_list_head, list) {
+		if (info->type != IRQT_PIRQ)
+			continue;
+
+		pirq = info->u.pirq.pirq;
+		gsi = info->u.pirq.gsi;
+		irq = info->irq;
+
+		/* save/restore of PT devices doesn't work, so at this point the
+		 * only devices present are GSI based emulated devices */
+		if (!gsi)
+			continue;
+
+		map_irq.domid = DOMID_SELF;
+		map_irq.type = MAP_PIRQ_TYPE_GSI;
+		map_irq.index = gsi;
+		map_irq.pirq = pirq;
+
+		rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+		if (rc) {
+			pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n",
+				gsi, irq, pirq, rc);
+			/* info must not be touched after this call. */
+			xen_free_irq(irq);
+			continue;
+		}
+
+		printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
+
+		__startup_pirq(irq);
+	}
+}
+
+/*
+ * Rebind every virq recorded for @cpu in the per-cpu virq_to_irq table
+ * to a fresh event channel.  A failing bind hypercall is fatal.
+ */
+static void restore_cpu_virqs(unsigned int cpu)
+{
+	struct evtchn_bind_virq bind_virq;
+	int virq, irq, evtchn;
+
+	for (virq = 0; virq < NR_VIRQS; virq++) {
+		if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1)
+			continue;
+
+		BUG_ON(virq_from_irq(irq) != virq);
+
+		/* Get a new binding from Xen. */
+		bind_virq.virq = virq;
+		bind_virq.vcpu = xen_vcpu_nr(cpu);
+		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+						&bind_virq) != 0)
+			BUG();
+		evtchn = bind_virq.port;
+
+		/* Record the new mapping. */
+		(void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq);
+		bind_evtchn_to_cpu(evtchn, cpu);
+	}
+}
+
+/*
+ * Rebind every IPI recorded for @cpu in the per-cpu ipi_to_irq table to
+ * a fresh event channel.  A failing bind hypercall is fatal.
+ */
+static void restore_cpu_ipis(unsigned int cpu)
+{
+	struct evtchn_bind_ipi bind_ipi;
+	int ipi, irq, evtchn;
+
+	for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) {
+		if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1)
+			continue;
+
+		BUG_ON(ipi_from_irq(irq) != ipi);
+
+		/* Get a new binding from Xen. */
+		bind_ipi.vcpu = xen_vcpu_nr(cpu);
+		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+						&bind_ipi) != 0)
+			BUG();
+		evtchn = bind_ipi.port;
+
+		/* Record the new mapping. */
+		(void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi);
+		bind_evtchn_to_cpu(evtchn, cpu);
+	}
+}
+
+/*
+ * Clear an irq's pending state, in preparation for polling on it.
+ * Done through event_handler_exit(); no-op if @irq has no valid event
+ * channel.
+ */
+void xen_clear_irq_pending(int irq)
+{
+	struct irq_info *info = info_for_irq(irq);
+	evtchn_port_t evtchn = info ? info->evtchn : 0;
+
+	if (VALID_EVTCHN(evtchn))
+		event_handler_exit(info);
+}
+EXPORT_SYMBOL(xen_clear_irq_pending);
+/*
+ * Mark the event channel bound to @irq as pending.  No-op if @irq has no
+ * valid event channel.
+ */
+void xen_set_irq_pending(int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	if (VALID_EVTCHN(evtchn))
+		set_evtchn(evtchn);
+}
+
+/*
+ * Report whether the event channel bound to @irq is pending.  Returns
+ * false if @irq has no valid event channel.
+ */
+bool xen_test_irq_pending(int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	if (!VALID_EVTCHN(evtchn))
+		return false;
+
+	return test_evtchn(evtchn);
+}
+
+/* Poll waiting for an irq to become pending with timeout.  In the usual case,
+ * the irq will be disabled so it won't deliver an interrupt.
+ *
+ * @timeout is handed to the SCHEDOP_poll hypercall unchanged.  Nothing is
+ * done if @irq has no valid event channel; a failing hypercall is fatal. */
+void xen_poll_irq_timeout(int irq, u64 timeout)
+{
+	evtchn_port_t evtchn = evtchn_from_irq(irq);
+
+	if (VALID_EVTCHN(evtchn)) {
+		struct sched_poll poll;
+
+		poll.nr_ports = 1;
+		poll.timeout = timeout;
+		set_xen_guest_handle(poll.ports, &evtchn);
+
+		if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
+			BUG();
+	}
+}
+EXPORT_SYMBOL(xen_poll_irq_timeout);
+/* Poll waiting for an irq to become pending.  In the usual case, the
+ * irq will be disabled so it won't deliver an interrupt.
+ * Same as xen_poll_irq_timeout() with a timeout argument of 0. */
+void xen_poll_irq(int irq)
+{
+	xen_poll_irq_timeout(irq, 0 /* no timeout */);
+}
+
+/*
+ * Check whether the IRQ line is shared with other guests.
+ *
+ * Returns -ENOENT if @irq has no irq_info, 0 if the status query fails or
+ * the hypervisor reports XENIRQSTAT_shared, and 1 when that flag is clear.
+ * NOTE(review): the non-zero result therefore means "not flagged shared";
+ * confirm against the callers before relying on the function name.
+ */
+int xen_test_irq_shared(int irq)
+{
+	struct irq_info *info = info_for_irq(irq);
+	struct physdev_irq_status_query irq_status;
+
+	if (WARN_ON(!info))
+		return -ENOENT;
+
+	irq_status.irq = info->u.pirq.pirq;
+
+	if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
+		return 0;
+	return !(irq_status.flags & XENIRQSTAT_shared);
+}
+EXPORT_SYMBOL_GPL(xen_test_irq_shared);
+
+/*
+ * Rebuild all irq <-> event channel bindings after a resume.
+ *
+ * Resets the event channel backend, drops every recorded binding and
+ * then restores virqs and IPIs for each possible cpu followed by the
+ * pirqs.
+ */
+void xen_irq_resume(void)
+{
+	unsigned int cpu;
+	struct irq_info *info;
+
+	/* New event-channel space is not 'live' yet. */
+	xen_evtchn_resume();
+
+	/* No IRQ <-> event-channel mappings. */
+	list_for_each_entry(info, &xen_irq_list_head, list)
+		info->evtchn = 0; /* zap event-channel binding */
+
+	clear_evtchn_to_irq_all();
+
+	for_each_possible_cpu(cpu) {
+		restore_cpu_virqs(cpu);
+		restore_cpu_ipis(cpu);
+	}
+
+	restore_pirqs();
+}
+
+/*
+ * irq_chip for ordinary event channel irqs; installed by
+ * bind_evtchn_to_irq() and by bind_virq_to_irq() for non-percpu virqs.
+ */
+static struct irq_chip xen_dynamic_chip __read_mostly = {
+	.name			= "xen-dyn",
+
+	.irq_disable		= disable_dynirq,
+	.irq_mask		= disable_dynirq,
+	.irq_unmask		= enable_dynirq,
+
+	.irq_ack		= ack_dynirq,
+	.irq_mask_ack		= mask_ack_dynirq,
+
+	.irq_set_affinity	= set_affinity_irq,
+	.irq_retrigger		= retrigger_dynirq,
+};
+
+/*
+ * irq_chip for event channel irqs bound through the *_lateeoi() helpers.
+ * Differs from xen_dynamic_chip only in its ack and mask_ack callbacks.
+ */
+static struct irq_chip xen_lateeoi_chip __read_mostly = {
+	/* The chip name needs to contain "xen-dyn" for irqbalance to work. */
+	.name			= "xen-dyn-lateeoi",
+
+	.irq_disable		= disable_dynirq,
+	.irq_mask		= disable_dynirq,
+	.irq_unmask		= enable_dynirq,
+
+	.irq_ack		= lateeoi_ack_dynirq,
+	.irq_mask_ack		= lateeoi_mask_ack_dynirq,
+
+	.irq_set_affinity	= set_affinity_irq,
+	.irq_retrigger		= retrigger_dynirq,
+};
+
+/*
+ * irq_chip for pirq backed irqs (GSI and MSI); installed by
+ * xen_bind_pirq_gsi_to_irq() and xen_bind_pirq_msi_to_irq().
+ */
+static struct irq_chip xen_pirq_chip __read_mostly = {
+	.name			= "xen-pirq",
+
+	.irq_startup		= startup_pirq,
+	.irq_shutdown		= shutdown_pirq,
+	.irq_enable		= enable_pirq,
+	.irq_disable		= disable_pirq,
+
+	.irq_mask		= disable_dynirq,
+	.irq_unmask		= enable_dynirq,
+
+	.irq_ack		= eoi_pirq,
+	.irq_eoi		= eoi_pirq,
+	.irq_mask_ack		= mask_ack_pirq,
+
+	.irq_set_affinity	= set_affinity_irq,
+
+	.irq_retrigger		= retrigger_dynirq,
+};
+
+/*
+ * irq_chip for per-cpu irqs; installed by bind_ipi_to_irq() and by
+ * bind_virq_to_irq() for percpu virqs.  It provides no affinity or
+ * retrigger callbacks.
+ */
+static struct irq_chip xen_percpu_chip __read_mostly = {
+	.name			= "xen-percpu",
+
+	.irq_disable		= disable_dynirq,
+	.irq_mask		= disable_dynirq,
+	.irq_unmask		= enable_dynirq,
+
+	.irq_ack		= ack_dynirq,
+};
+
+#ifdef CONFIG_XEN_PVHVM
+/* Vector callbacks are better than PCI interrupts to receive event
+ * channel notifications because we can receive vector callbacks on any
+ * vcpu and we don't need PCI support or APIC interactions.
+ *
+ * Does nothing unless xen_have_vector_callback is set.  If registering
+ * the callback fails, xen_have_vector_callback is cleared and no
+ * interrupt gate is installed. */
+void xen_callback_vector(void)
+{
+	int rc;
+	uint64_t callback_via;
+
+	if (xen_have_vector_callback) {
+		callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR);
+		rc = xen_set_callback_via(callback_via);
+		if (rc) {
+			pr_err("Request for Xen HVM callback vector failed\n");
+			xen_have_vector_callback = 0;
+			return;
+		}
+		pr_info_once("Xen HVM callback vector for event delivery is enabled\n");
+		alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
+				xen_hvm_callback_vector);
+	}
+}
+#else
+/* Without CONFIG_XEN_PVHVM there is no callback vector to set up. */
+void xen_callback_vector(void) {}
+#endif
+
+/*
+ * Module parameter: when true (the default) xen_init_IRQ() tries the
+ * FIFO-based event channel ABI first and only falls back to the 2-level
+ * ABI if that fails.
+ */
+static bool fifo_events = true;
+module_param(fifo_events, bool, 0);
+
+/*
+ * CPU hotplug "prepare" callback registered in xen_init_IRQ().
+ *
+ * Initialises the EOI state of @cpu and runs the backend's optional
+ * percpu_init hook.  Returns the hook's result, or 0 without a hook.
+ */
+static int xen_evtchn_cpu_prepare(unsigned int cpu)
+{
+	int ret = 0;
+
+	xen_cpu_init_eoi(cpu);
+
+	if (evtchn_ops->percpu_init)
+		ret = evtchn_ops->percpu_init(cpu);
+
+	return ret;
+}
+
+/*
+ * CPU hotplug teardown callback registered in xen_init_IRQ().
+ *
+ * Runs the backend's optional percpu_deinit hook for @cpu.  Returns the
+ * hook's result, or 0 without a hook.
+ */
+static int xen_evtchn_cpu_dead(unsigned int cpu)
+{
+	int ret = 0;
+
+	if (evtchn_ops->percpu_deinit)
+		ret = evtchn_ops->percpu_deinit(cpu);
+
+	return ret;
+}
+
+/*
+ * Boot-time initialisation of the Xen event channel irq layer.
+ *
+ * Selects the event channel ABI (FIFO if enabled and available, 2-level
+ * otherwise), registers the CPU hotplug callbacks, allocates the
+ * evtchn_to_irq table and masks every event channel.  On x86 it then
+ * performs the PV/HVM specific setup, including registration of the pirq
+ * EOI map for non-HVM domains.
+ */
+void __init xen_init_IRQ(void)
+{
+	int ret = -EINVAL;
+	unsigned int evtchn;
+
+	if (fifo_events)
+		ret = xen_evtchn_fifo_init();
+	if (ret < 0)
+		xen_evtchn_2l_init();
+
+	xen_cpu_init_eoi(smp_processor_id());
+
+	cpuhp_setup_state_nocalls(CPUHP_XEN_EVTCHN_PREPARE,
+				  "xen/evtchn:prepare",
+				  xen_evtchn_cpu_prepare, xen_evtchn_cpu_dead);
+
+	evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()),
+				sizeof(*evtchn_to_irq), GFP_KERNEL);
+	BUG_ON(!evtchn_to_irq);
+
+	/* No event channels are 'live' right now. */
+	for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++)
+		mask_evtchn(evtchn);
+
+	/* Default EOI check; replaced below if the EOI map gets registered. */
+	pirq_needs_eoi = pirq_needs_eoi_flag;
+
+#ifdef CONFIG_X86
+	if (xen_pv_domain()) {
+		irq_ctx_init(smp_processor_id());
+		if (xen_initial_domain())
+			pci_xen_initial_domain();
+	}
+	if (xen_feature(XENFEAT_hvm_callback_vector))
+		xen_callback_vector();
+
+	if (xen_hvm_domain()) {
+		native_init_IRQ();
+		/* pci_xen_hvm_init must be called after native_init_IRQ so that
+		 * __acpi_register_gsi can point at the right function */
+		pci_xen_hvm_init();
+	} else {
+		int rc;
+		struct physdev_pirq_eoi_gmfn eoi_gmfn;
+
+		pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
+		/*
+		 * Without a page there is nothing to register: never hand a
+		 * frame number derived from NULL to the hypervisor and keep
+		 * the flag based EOI check selected above.
+		 */
+		if (pirq_eoi_map) {
+			eoi_gmfn.gmfn = virt_to_gfn(pirq_eoi_map);
+			rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2,
+						   &eoi_gmfn);
+			if (rc != 0) {
+				free_page((unsigned long) pirq_eoi_map);
+				pirq_eoi_map = NULL;
+			} else
+				pirq_needs_eoi = pirq_check_eoi_map;
+		}
+	}
+#endif
+}
diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c
new file mode 100644
index 000000000..360a7f8cd
--- /dev/null
+++ b/drivers/xen/events/events_fifo.c
@@ -0,0 +1,439 @@
+/*
+ * Xen event channels (FIFO-based ABI)
+ *
+ * Copyright (C) 2013 Citrix Systems R&D ltd.
+ *
+ * This source code is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * Or, when distributed separately from the Linux kernel or
+ * incorporated into other software packages, subject to the following
+ * license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/linkage.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+
+#include <asm/barrier.h>
+#include <asm/sync_bitops.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
+#include <xen/xen-ops.h>
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+#include <xen/page.h>
+
+#include "events_internal.h"
+
+#define EVENT_WORDS_PER_PAGE (XEN_PAGE_SIZE / sizeof(event_word_t))
+#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE)
+
+/*
+ * Locally cached HEAD port of each priority queue (one slot per
+ * EVTCHN_FIFO_MAX_QUEUES priority).  A value of 0 makes the consumer
+ * re-read the HEAD from the control block.
+ */
+struct evtchn_fifo_queue {
+	uint32_t head[EVTCHN_FIFO_MAX_QUEUES];
+};
+
+static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block);
+static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue);
+static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly;
+static unsigned event_array_pages __read_mostly;
+
+/*
+ * sync_set_bit() and friends must be unsigned long aligned.
+ *
+ * When unsigned long is wider than 32 bits, BM() rounds the event word
+ * address down to an 8-byte boundary and EVTCHN_FIFO_BIT() adds 32 to
+ * the bit number if the word's address has bit 2 set, i.e. if the word
+ * is the second 4-byte half of that aligned long.  Otherwise the word
+ * address and the bit number are used unchanged.
+ */
+#if BITS_PER_LONG > 32
+
+#define BM(w) ((unsigned long *)((unsigned long)(w) & ~0x7UL))
+#define EVTCHN_FIFO_BIT(b, w) \
+    (((unsigned long)(w) & 0x4UL) ? (EVTCHN_FIFO_ ##b + 32) : EVTCHN_FIFO_ ##b)
+
+#else
+
+#define BM(w) ((unsigned long *)(w))
+#define EVTCHN_FIFO_BIT(b, w) EVTCHN_FIFO_ ##b
+
+#endif
+
+/*
+ * Map an event channel port to its event word: the array page is
+ * selected by port / EVENT_WORDS_PER_PAGE and the word inside the page
+ * by the remainder.
+ */
+static inline event_word_t *event_word_from_port(unsigned port)
+{
+	event_word_t *page = event_array[port / EVENT_WORDS_PER_PAGE];
+
+	return &page[port % EVENT_WORDS_PER_PAGE];
+}
+
+/* .max_channels hook: the FIFO ABI limit, EVTCHN_FIFO_NR_CHANNELS. */
+static unsigned evtchn_fifo_max_channels(void)
+{
+	return EVTCHN_FIFO_NR_CHANNELS;
+}
+
+/*
+ * .nr_channels hook: number of event words covered by the event array
+ * pages that have been added so far.
+ */
+static unsigned evtchn_fifo_nr_channels(void)
+{
+	unsigned pages = event_array_pages;
+
+	return pages * EVENT_WORDS_PER_PAGE;
+}
+
+/*
+ * Zero @control_block and the cached queue heads of @cpu, then register
+ * the page as that vCPU's control block via EVTCHNOP_init_control.
+ *
+ * Returns the hypercall result.
+ */
+static int init_control_block(int cpu,
+                              struct evtchn_fifo_control_block *control_block)
+{
+	struct evtchn_init_control init_control;
+	struct evtchn_fifo_queue *q;
+	unsigned int prio;
+
+	/* Start from an empty control block and empty local HEADs. */
+	clear_page(control_block);
+
+	q = &per_cpu(cpu_queue, cpu);
+	for (prio = 0; prio < EVTCHN_FIFO_MAX_QUEUES; prio++)
+		q->head[prio] = 0;
+
+	init_control.control_gfn = virt_to_gfn(control_block);
+	init_control.offset      = 0;
+	init_control.vcpu        = xen_vcpu_nr(cpu);
+
+	return HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control);
+}
+
+/*
+ * Release event array pages that were allocated but are not in use,
+ * i.e. the slots from index event_array_pages up to the first empty
+ * slot.
+ */
+static void free_unused_array_pages(void)
+{
+	unsigned i = event_array_pages;
+
+	while (i < MAX_EVENT_ARRAY_PAGES && event_array[i]) {
+		free_page((unsigned long)event_array[i]);
+		event_array[i] = NULL;
+		i++;
+	}
+}
+
+/* Set every event word in @array_page to "masked" with no other bit set. */
+static void init_array_page(event_word_t *array_page)
+{
+	event_word_t *word = array_page;
+	event_word_t *end = array_page + EVENT_WORDS_PER_PAGE;
+
+	while (word < end)
+		*word++ = 1 << EVTCHN_FIFO_MASKED;
+}
+
+/*
+ * .setup hook for a newly bound event channel.
+ *
+ * Grows the event array, one page at a time, until it covers
+ * info->evtchn.  Each page is fully masked before it is handed to the
+ * hypervisor with EVTCHNOP_expand_array.
+ *
+ * Returns 0 on success, -EINVAL if the port lies beyond
+ * MAX_EVENT_ARRAY_PAGES, -ENOMEM if a page cannot be allocated, or the
+ * negative hypercall result.  Failing to add the very first page
+ * panics.
+ */
+static int evtchn_fifo_setup(struct irq_info *info)
+{
+	unsigned port = info->evtchn;
+	unsigned new_array_pages;
+	int ret;
+
+	new_array_pages = port / EVENT_WORDS_PER_PAGE + 1;
+
+	if (new_array_pages > MAX_EVENT_ARRAY_PAGES)
+		return -EINVAL;
+
+	while (event_array_pages < new_array_pages) {
+		void *array_page;
+		struct evtchn_expand_array expand_array;
+
+		/* Might already have a page if we've resumed. */
+		array_page = event_array[event_array_pages];
+		if (!array_page) {
+			array_page = (void *)__get_free_page(GFP_KERNEL);
+			if (array_page == NULL) {
+				ret = -ENOMEM;
+				goto error;
+			}
+			event_array[event_array_pages] = array_page;
+		}
+
+		/* Mask all events in this page before adding it. */
+		init_array_page(array_page);
+
+		expand_array.array_gfn = virt_to_gfn(array_page);
+
+		ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array);
+		if (ret < 0)
+			goto error;
+
+		/* The page only counts once the hypervisor has accepted it. */
+		event_array_pages++;
+	}
+	return 0;
+
+  error:
+	if (event_array_pages == 0)
+		panic("xen: unable to expand event array with initial page (%d)\n", ret);
+	else
+		pr_err("unable to expand event array (%d)\n", ret);
+	/* Drop pages that were allocated but never registered. */
+	free_unused_array_pages();
+	return ret;
+}
+
+/* .bind_to_cpu hook: the FIFO ABI does nothing here. */
+static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu)
+{
+	/* no-op */
+}
+
+/* .clear_pending hook: clear the PENDING bit in @port's event word. */
+static void evtchn_fifo_clear_pending(unsigned port)
+{
+	event_word_t *word = event_word_from_port(port);
+	sync_clear_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word));
+}
+
+/* .set_pending hook: set the PENDING bit in @port's event word. */
+static void evtchn_fifo_set_pending(unsigned port)
+{
+	event_word_t *word = event_word_from_port(port);
+	sync_set_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word));
+}
+
+/* .is_pending hook: test the PENDING bit in @port's event word. */
+static bool evtchn_fifo_is_pending(unsigned port)
+{
+	event_word_t *word = event_word_from_port(port);
+	return sync_test_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word));
+}
+
+/* .mask hook: set the MASKED bit in @port's event word. */
+static void evtchn_fifo_mask(unsigned port)
+{
+	event_word_t *word = event_word_from_port(port);
+	sync_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word));
+}
+
+/* Test the MASKED bit in @port's event word. */
+static bool evtchn_fifo_is_masked(unsigned port)
+{
+	event_word_t *word = event_word_from_port(port);
+	return sync_test_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word));
+}
+/*
+ * Clear MASKED if not PENDING, spinning if BUSY is set.
+ * Return true if mask was cleared.
+ *
+ * The compare value always has BUSY cleared, so while the word has BUSY
+ * set the cmpxchg cannot succeed and the loop retries with the value it
+ * just observed.  Returns false, leaving the word untouched, as soon as
+ * PENDING is seen.
+ */
+static bool clear_masked_cond(volatile event_word_t *word)
+{
+	event_word_t new, old, w;
+
+	w = *word;
+
+	do {
+		if (w & (1 << EVTCHN_FIFO_PENDING))
+			return false;
+
+		old = w & ~(1 << EVTCHN_FIFO_BUSY);
+		new = old & ~(1 << EVTCHN_FIFO_MASKED);
+		w = sync_cmpxchg(word, old, new);
+	} while (w != old);
+
+	return true;
+}
+
+/*
+ * .unmask hook.  Must be called with interrupts disabled.
+ *
+ * Tries to clear MASKED locally; if that is refused because the event
+ * is PENDING, the unmask is delegated to the hypervisor with
+ * EVTCHNOP_unmask.
+ */
+static void evtchn_fifo_unmask(unsigned port)
+{
+	struct evtchn_unmask unmask = { .port = port };
+	event_word_t *word = event_word_from_port(port);
+
+	BUG_ON(!irqs_disabled());
+
+	if (clear_masked_cond(word))
+		return;
+
+	(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+}
+
+/*
+ * Atomically clear the LINKED bit and the LINK field of @word.
+ *
+ * Returns the LINK field the word held immediately before it was
+ * cleared.
+ */
+static uint32_t clear_linked(volatile event_word_t *word)
+{
+	event_word_t expected = *word;
+	event_word_t seen;
+
+	for (;;) {
+		event_word_t cleared;
+
+		cleared = expected & ~((1 << EVTCHN_FIFO_LINKED)
+				       | EVTCHN_FIFO_LINK_MASK);
+		seen = sync_cmpxchg(word, expected, cleared);
+		if (seen == expected)
+			break;
+		/* Somebody changed the word under us; retry with the new value. */
+		expected = seen;
+	}
+
+	return seen & EVTCHN_FIFO_LINK_MASK;
+}
+
+/*
+ * Take one event off the @priority queue of @cpu.
+ *
+ * The head port comes from the locally cached HEAD or, when that is 0,
+ * from the control block.  The event is unlinked, the cached HEAD is
+ * advanced to the next link, and @priority is cleared in @ready when
+ * the queue becomes empty.  If the event is pending and not masked it
+ * is passed to handle_irq_for_port(); with a NULL @ctrl it is dropped
+ * with a warning instead.
+ */
+static void consume_one_event(unsigned cpu, struct evtchn_loop_ctrl *ctrl,
+			      struct evtchn_fifo_control_block *control_block,
+			      unsigned priority, unsigned long *ready)
+{
+	struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu);
+	uint32_t head;
+	unsigned port;
+	event_word_t *word;
+
+	head = q->head[priority];
+
+	/*
+	 * Reached the tail last time?  Read the new HEAD from the
+	 * control block.
+	 */
+	if (head == 0) {
+		virt_rmb(); /* Ensure word is up-to-date before reading head. */
+		head = control_block->head[priority];
+	}
+
+	port = head;
+	word = event_word_from_port(port);
+	head = clear_linked(word);
+
+	/*
+	 * If the link is non-zero, there are more events in the
+	 * queue, otherwise the queue is empty.
+	 *
+	 * If the queue is empty, clear this priority from our local
+	 * copy of the ready word.
+	 */
+	if (head == 0)
+		clear_bit(priority, ready);
+
+	if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) {
+		if (unlikely(!ctrl))
+			pr_warn("Dropping pending event for port %u\n", port);
+		else
+			handle_irq_for_port(port, ctrl);
+	}
+
+	/* Remember where to continue on the next call. */
+	q->head[priority] = head;
+}
+
+/*
+ * Drain all ready queues of @cpu.
+ *
+ * The control block's ready word is fetched and reset with xchg().
+ * While any priority is marked ready, one event is consumed from the
+ * queue with the lowest set bit number, and priorities that became
+ * ready in the meantime are merged in.
+ */
+static void __evtchn_fifo_handle_events(unsigned cpu,
+					struct evtchn_loop_ctrl *ctrl)
+{
+	struct evtchn_fifo_control_block *control_block;
+	unsigned long ready;
+	unsigned q;
+
+	control_block = per_cpu(cpu_control_block, cpu);
+
+	ready = xchg(&control_block->ready, 0);
+
+	while (ready) {
+		q = find_first_bit(&ready, EVTCHN_FIFO_MAX_QUEUES);
+		consume_one_event(cpu, ctrl, control_block, q, &ready);
+		ready |= xchg(&control_block->ready, 0);
+	}
+}
+
+/* .handle_events hook: thin wrapper around __evtchn_fifo_handle_events(). */
+static void evtchn_fifo_handle_events(unsigned cpu,
+				      struct evtchn_loop_ctrl *ctrl)
+{
+	__evtchn_fifo_handle_events(cpu, ctrl);
+}
+
+/*
+ * .resume hook.
+ *
+ * Re-registers the control block of every online CPU that has one
+ * (a failure here is fatal), frees the control blocks of offline CPUs,
+ * and resets the event array page count so the array is re-expanded as
+ * events are bound again.
+ */
+static void evtchn_fifo_resume(void)
+{
+	unsigned cpu;
+
+	for_each_possible_cpu(cpu) {
+		void *control_block = per_cpu(cpu_control_block, cpu);
+		int ret;
+
+		if (!control_block)
+			continue;
+
+		/*
+		 * If this CPU is offline, take the opportunity to
+		 * free the control block while it is not being
+		 * used.
+		 */
+		if (!cpu_online(cpu)) {
+			free_page((unsigned long)control_block);
+			per_cpu(cpu_control_block, cpu) = NULL;
+			continue;
+		}
+
+		ret = init_control_block(cpu, control_block);
+		BUG_ON(ret < 0);
+	}
+
+	/*
+	 * The event array starts out as empty again and is extended
+	 * as normal when events are bound.  The existing pages will
+	 * be reused.
+	 */
+	event_array_pages = 0;
+}
+
+/*
+ * Allocate a page for @cpu's control block, register it with the
+ * hypervisor and record it in cpu_control_block.
+ *
+ * Returns 0 on success, -ENOMEM if no page is available, or the
+ * negative result of init_control_block() (the page is freed again).
+ */
+static int evtchn_fifo_alloc_control_block(unsigned cpu)
+{
+	void *control_block = (void *)__get_free_page(GFP_KERNEL);
+	int ret;
+
+	if (control_block == NULL)
+		return -ENOMEM;
+
+	ret = init_control_block(cpu, control_block);
+	if (ret < 0) {
+		free_page((unsigned long)control_block);
+		return ret;
+	}
+
+	per_cpu(cpu_control_block, cpu) = control_block;
+
+	return 0;
+}
+
+/*
+ * .percpu_init hook: make sure @cpu has a control block, allocating
+ * one only if none is recorded yet.
+ */
+static int evtchn_fifo_percpu_init(unsigned int cpu)
+{
+	if (per_cpu(cpu_control_block, cpu))
+		return 0;
+
+	return evtchn_fifo_alloc_control_block(cpu);
+}
+
+/*
+ * .percpu_deinit hook: drain @cpu's queues with a NULL loop control,
+ * which makes consume_one_event() drop (and warn about) any event that
+ * is still pending and unmasked.  Always returns 0.
+ */
+static int evtchn_fifo_percpu_deinit(unsigned int cpu)
+{
+	__evtchn_fifo_handle_events(cpu, NULL);
+	return 0;
+}
+
+/*
+ * Operations table of the FIFO-based ABI.  The .remove hook is left
+ * unset.
+ */
+static const struct evtchn_ops evtchn_ops_fifo = {
+	.max_channels      = evtchn_fifo_max_channels,
+	.nr_channels       = evtchn_fifo_nr_channels,
+	.setup             = evtchn_fifo_setup,
+	.bind_to_cpu       = evtchn_fifo_bind_to_cpu,
+	.clear_pending     = evtchn_fifo_clear_pending,
+	.set_pending       = evtchn_fifo_set_pending,
+	.is_pending        = evtchn_fifo_is_pending,
+	.mask              = evtchn_fifo_mask,
+	.unmask            = evtchn_fifo_unmask,
+	.handle_events     = evtchn_fifo_handle_events,
+	.resume            = evtchn_fifo_resume,
+	.percpu_init       = evtchn_fifo_percpu_init,
+	.percpu_deinit     = evtchn_fifo_percpu_deinit,
+};
+
+/*
+ * Try to switch to the FIFO-based ABI.
+ *
+ * Sets up the control block of the current CPU; only if that works is
+ * evtchn_ops pointed at the FIFO operations.  Returns the result of
+ * evtchn_fifo_alloc_control_block().
+ */
+int __init xen_evtchn_fifo_init(void)
+{
+	int ret = evtchn_fifo_alloc_control_block(smp_processor_id());
+
+	if (ret >= 0) {
+		pr_info("Using FIFO-based ABI\n");
+		evtchn_ops = &evtchn_ops_fifo;
+	}
+
+	return ret;
+}
diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h
new file mode 100644
index 000000000..cc37b7114
--- /dev/null
+++ b/drivers/xen/events/events_internal.h
@@ -0,0 +1,171 @@
+/*
+ * Xen Event Channels (internal header)
+ *
+ * Copyright (C) 2013 Citrix Systems R&D Ltd.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2 or later.  See the file COPYING for more details.
+ */
+#ifndef __EVENTS_INTERNAL_H__
+#define __EVENTS_INTERNAL_H__
+
+/*
+ * Interrupt types (the "type" of struct irq_info).
+ * IRQT_UNBOUND has the value 0.
+ */
+enum xen_irq_type {
+	IRQT_UNBOUND = 0,
+	IRQT_PIRQ,
+	IRQT_VIRQ,
+	IRQT_IPI,
+	IRQT_EVTCHN
+};
+
+/*
+ * Packed IRQ information:
+ * type - enum xen_irq_type
+ * event channel - irq->event channel mapping
+ * cpu - cpu this event channel is bound to
+ * index - type-specific information:
+ *    PIRQ - vector, with MSB being "needs EOI", or physical IRQ of the HVM
+ *           guest, or GSI (real passthrough IRQ) of the device.
+ *    VIRQ - virq number
+ *    IPI - IPI vector
+ *    EVTCHN -
+ */
+struct irq_info {
+	struct list_head list;
+	struct list_head eoi_list;
+	short refcnt;
+	short spurious_cnt;
+	short type;		/* type */
+	u8 mask_reason;		/* Why is event channel masked */
+#define EVT_MASK_REASON_EXPLICIT	0x01
+#define EVT_MASK_REASON_TEMPORARY	0x02
+#define EVT_MASK_REASON_EOI_PENDING	0x04
+	u8 is_active;		/* Is event just being handled? */
+	unsigned irq;
+	unsigned int evtchn;	/* event channel */
+	unsigned short cpu;	/* cpu bound */
+	unsigned short eoi_cpu;	/* EOI must happen on this cpu */
+	unsigned int irq_epoch;	/* If eoi_cpu valid: irq_epoch of event */
+	u64 eoi_time;		/* Time in jiffies when to EOI. */
+	raw_spinlock_t lock;
+
+	/* Type-specific data; which member is valid depends on "type". */
+	union {
+		unsigned short virq;
+		enum ipi_vector ipi;
+		struct {
+			unsigned short pirq;
+			unsigned short gsi;
+			unsigned char vector;
+			unsigned char flags;
+			uint16_t domid;
+		} pirq;
+	} u;
+};
+
+/* PIRQ bit flags -- presumably for irq_info.u.pirq.flags; TODO confirm. */
+#define PIRQ_NEEDS_EOI	(1 << 0)
+#define PIRQ_SHAREABLE	(1 << 1)
+#define PIRQ_MSI_GROUP	(1 << 2)
+
+struct evtchn_loop_ctrl;
+
+/*
+ * Operations an event channel ABI provides.
+ *
+ * The inline wrappers in this header test setup, remove and resume for
+ * NULL before calling them; max_channels, bind_to_cpu, clear_pending,
+ * set_pending, is_pending, mask, unmask and handle_events are called
+ * without such a check.
+ */
+struct evtchn_ops {
+	unsigned (*max_channels)(void);
+	unsigned (*nr_channels)(void);
+
+	int (*setup)(struct irq_info *info);
+	void (*remove)(evtchn_port_t port, unsigned int cpu);
+	void (*bind_to_cpu)(struct irq_info *info, unsigned cpu);
+
+	void (*clear_pending)(unsigned port);
+	void (*set_pending)(unsigned port);
+	bool (*is_pending)(unsigned port);
+	void (*mask)(unsigned port);
+	void (*unmask)(unsigned port);
+
+	void (*handle_events)(unsigned cpu, struct evtchn_loop_ctrl *ctrl);
+	void (*resume)(void);
+
+	int (*percpu_init)(unsigned int cpu);
+	int (*percpu_deinit)(unsigned int cpu);
+};
+
+extern const struct evtchn_ops *evtchn_ops;
+
+extern int **evtchn_to_irq;
+int get_evtchn_to_irq(unsigned int evtchn);
+void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl);
+
+struct irq_info *info_for_irq(unsigned irq);
+unsigned cpu_from_irq(unsigned irq);
+unsigned cpu_from_evtchn(unsigned int evtchn);
+
+/* Maximum number of event channels reported by the active ABI. */
+static inline unsigned xen_evtchn_max_channels(void)
+{
+	return evtchn_ops->max_channels();
+}
+
+/*
+ * Run the ABI specific setup a bound event channel needs before it can
+ * be unmasked and used.  Returns 0 when the ABI has no setup hook.
+ */
+static inline int xen_evtchn_port_setup(struct irq_info *info)
+{
+	return evtchn_ops->setup ? evtchn_ops->setup(info) : 0;
+}
+
+/* Call the ABI's optional remove hook for @evtchn on @cpu. */
+static inline void xen_evtchn_port_remove(evtchn_port_t evtchn,
+					  unsigned int cpu)
+{
+	if (!evtchn_ops->remove)
+		return;
+
+	evtchn_ops->remove(evtchn, cpu);
+}
+
+/* Forward to the ABI's bind_to_cpu hook (called unconditionally). */
+static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info,
+					       unsigned cpu)
+{
+	evtchn_ops->bind_to_cpu(info, cpu);
+}
+
+/* Clear the pending state of @port via the active ABI. */
+static inline void clear_evtchn(unsigned port)
+{
+	evtchn_ops->clear_pending(port);
+}
+
+/* Mark @port pending via the active ABI. */
+static inline void set_evtchn(unsigned port)
+{
+	evtchn_ops->set_pending(port);
+}
+
+/* Return whether @port is pending according to the active ABI. */
+static inline bool test_evtchn(unsigned port)
+{
+	return evtchn_ops->is_pending(port);
+}
+
+/* Mask @port via the active ABI. */
+static inline void mask_evtchn(unsigned port)
+{
+	evtchn_ops->mask(port);
+}
+
+/* Unmask @port via the active ABI. */
+static inline void unmask_evtchn(unsigned port)
+{
+	evtchn_ops->unmask(port);
+}
+
+/* Let the active ABI process the events queued for @cpu. */
+static inline void xen_evtchn_handle_events(unsigned cpu,
+					    struct evtchn_loop_ctrl *ctrl)
+{
+	evtchn_ops->handle_events(cpu, ctrl);
+}
+
+/* Call the ABI's optional resume hook. */
+static inline void xen_evtchn_resume(void)
+{
+	if (!evtchn_ops->resume)
+		return;
+
+	evtchn_ops->resume();
+}
+
+void xen_evtchn_2l_init(void);
+int xen_evtchn_fifo_init(void);
+
+#endif /* #ifndef __EVENTS_INTERNAL_H__ */
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
new file mode 100644
index 000000000..4b11e60e3
--- /dev/null
+++ b/drivers/xen/evtchn.c
@@ -0,0 +1,733 @@
+/******************************************************************************
+ * evtchn.c
+ *
+ * Driver for receiving and demuxing event-channel signals.
+ *
+ * Copyright (c) 2004-2005, K A Fraser
+ * Multi-process extensions Copyright (c) 2004, Steven Smith
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/major.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/poll.h>
+#include <linux/irq.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+#include <xen/xen.h>
+#include <xen/events.h>
+#include <xen/evtchn.h>
+#include <xen/xen-ops.h>
+#include <asm/xen/hypervisor.h>
+
+/* State attached to one open instance of the event channel device. */
+struct per_user_data {
+	struct mutex bind_mutex; /* serialize bind/unbind operations */
+	struct rb_root evtchns; /* user_evtchn entries, keyed by port */
+	unsigned int nr_evtchns;
+
+	/* Notification ring, accessed via /dev/xen/evtchn. */
+	unsigned int ring_size;
+	evtchn_port_t *ring;
+	unsigned int ring_cons, ring_prod, ring_overflow;
+	struct mutex ring_cons_mutex; /* protect against concurrent readers */
+	spinlock_t ring_prod_lock; /* protect against concurrent interrupts */
+
+	/* Processes wait on this queue when ring is empty. */
+	wait_queue_head_t evtchn_wait;
+	struct fasync_struct *evtchn_async_queue;
+	const char *name;
+
+	/* UNRESTRICTED_DOMID until IOCTL_EVTCHN_RESTRICT_DOMID sets a domain. */
+	domid_t restrict_domid;
+};
+
+#define UNRESTRICTED_DOMID ((domid_t)-1)
+
+/* One event channel bound by a user; lives in per_user_data.evtchns. */
+struct user_evtchn {
+	struct rb_node node;
+	struct per_user_data *user;
+	unsigned port;
+	bool enabled; /* cleared by the interrupt handler, set again by write() */
+};
+
+/* Free a notification ring (allocated with kvmalloc_array()). */
+static void evtchn_free_ring(evtchn_port_t *ring)
+{
+	kvfree(ring);
+}
+
+/*
+ * Reduce a free-running ring index to a slot number.  The masking
+ * relies on ring_size being a power of two (it starts at 64 and is
+ * only ever doubled).
+ */
+static unsigned int evtchn_ring_offset(struct per_user_data *u,
+				       unsigned int idx)
+{
+	unsigned int mask = u->ring_size - 1;
+
+	return idx & mask;
+}
+
+/* Address of the ring slot that free-running index @idx maps to. */
+static evtchn_port_t *evtchn_ring_entry(struct per_user_data *u,
+					unsigned int idx)
+{
+	return &u->ring[evtchn_ring_offset(u, idx)];
+}
+
+/*
+ * Insert @evtchn into @u's tree of bound ports.
+ *
+ * The tree is ordered with larger ports to the left, matching
+ * find_evtchn().  Returns 0 on success or -EEXIST if the port is
+ * already present.
+ *
+ * Note that nr_evtchns is incremented before the duplicate check, so
+ * it is bumped even when -EEXIST is returned and nothing is linked.
+ */
+static int add_evtchn(struct per_user_data *u, struct user_evtchn *evtchn)
+{
+	struct rb_node **new = &(u->evtchns.rb_node), *parent = NULL;
+
+	u->nr_evtchns++;
+
+	while (*new) {
+		struct user_evtchn *this;
+
+		this = rb_entry(*new, struct user_evtchn, node);
+
+		parent = *new;
+		if (this->port < evtchn->port)
+			new = &((*new)->rb_left);
+		else if (this->port > evtchn->port)
+			new = &((*new)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&evtchn->node, parent, new);
+	rb_insert_color(&evtchn->node, &u->evtchns);
+
+	return 0;
+}
+
+/*
+ * Unlink @evtchn from @u's tree, decrement nr_evtchns and free the
+ * entry.  The entry must currently be linked in the tree.
+ */
+static void del_evtchn(struct per_user_data *u, struct user_evtchn *evtchn)
+{
+	u->nr_evtchns--;
+	rb_erase(&evtchn->node, &u->evtchns);
+	kfree(evtchn);
+}
+
+/*
+ * Look up the entry for @port in @u's tree (larger ports are to the
+ * left).  Returns NULL if the port is not bound by this user.
+ */
+static struct user_evtchn *find_evtchn(struct per_user_data *u, unsigned port)
+{
+	struct rb_node *node;
+
+	for (node = u->evtchns.rb_node; node; ) {
+		struct user_evtchn *cur;
+
+		cur = rb_entry(node, struct user_evtchn, node);
+		if (cur->port == port)
+			return cur;
+
+		node = (cur->port < port) ? node->rb_left : node->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Interrupt handler for a user-bound event channel; @data is the
+ * struct user_evtchn.
+ *
+ * Marks the channel as not enabled (evtchn_write() enables it again)
+ * and appends its port to the user's notification ring.  A full ring
+ * sets ring_overflow instead.  Waiters are only woken when the ring was
+ * empty before this entry was added.
+ */
+static irqreturn_t evtchn_interrupt(int irq, void *data)
+{
+	struct user_evtchn *evtchn = data;
+	struct per_user_data *u = evtchn->user;
+
+	WARN(!evtchn->enabled,
+	     "Interrupt for port %d, but apparently not enabled; per-user %p\n",
+	     evtchn->port, u);
+
+	evtchn->enabled = false;
+
+	spin_lock(&u->ring_prod_lock);
+
+	if ((u->ring_prod - u->ring_cons) < u->ring_size) {
+		*evtchn_ring_entry(u, u->ring_prod) = evtchn->port;
+		wmb(); /* Ensure ring contents visible */
+		if (u->ring_cons == u->ring_prod++) {
+			wake_up_interruptible(&u->evtchn_wait);
+			kill_fasync(&u->evtchn_async_queue,
+				    SIGIO, POLL_IN);
+		}
+	} else
+		u->ring_overflow = 1;
+
+	spin_unlock(&u->ring_prod_lock);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * read(): copy pending port numbers from the notification ring to user
+ * space.
+ *
+ * @count is rounded down to a whole number of ports and capped at
+ * PAGE_SIZE.  Blocks until the ring is non-empty unless O_NONBLOCK is
+ * set.
+ *
+ * Returns the number of bytes copied, 0 for a zero-length request,
+ * -EFBIG if the ring has overflowed, -EAGAIN for a non-blocking read
+ * of an empty ring, -EFAULT if the copy to user space fails, or the
+ * error from an interrupted wait.
+ */
+static ssize_t evtchn_read(struct file *file, char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	int rc;
+	unsigned int c, p, bytes1 = 0, bytes2 = 0;
+	struct per_user_data *u = file->private_data;
+
+	/* Whole number of ports. */
+	count &= ~(sizeof(evtchn_port_t)-1);
+
+	if (count == 0)
+		return 0;
+
+	if (count > PAGE_SIZE)
+		count = PAGE_SIZE;
+
+	/* Leaves the loop with ring_cons_mutex held and c != p. */
+	for (;;) {
+		mutex_lock(&u->ring_cons_mutex);
+
+		rc = -EFBIG;
+		if (u->ring_overflow)
+			goto unlock_out;
+
+		c = u->ring_cons;
+		p = u->ring_prod;
+		if (c != p)
+			break;
+
+		mutex_unlock(&u->ring_cons_mutex);
+
+		if (file->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		rc = wait_event_interruptible(u->evtchn_wait,
+					      u->ring_cons != u->ring_prod);
+		if (rc)
+			return rc;
+	}
+
+	/* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */
+	if (((c ^ p) & u->ring_size) != 0) {
+		bytes1 = (u->ring_size - evtchn_ring_offset(u, c)) *
+			sizeof(evtchn_port_t);
+		bytes2 = evtchn_ring_offset(u, p) * sizeof(evtchn_port_t);
+	} else {
+		bytes1 = (p - c) * sizeof(evtchn_port_t);
+		bytes2 = 0;
+	}
+
+	/* Truncate chunks according to caller's maximum byte count. */
+	if (bytes1 > count) {
+		bytes1 = count;
+		bytes2 = 0;
+	} else if ((bytes1 + bytes2) > count) {
+		bytes2 = count - bytes1;
+	}
+
+	rc = -EFAULT;
+	rmb(); /* Ensure that we see the port before we copy it. */
+	if (copy_to_user(buf, evtchn_ring_entry(u, c), bytes1) ||
+	    ((bytes2 != 0) &&
+	     copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
+		goto unlock_out;
+
+	/* Only consume the entries once they have reached user space. */
+	u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
+	rc = bytes1 + bytes2;
+
+ unlock_out:
+	mutex_unlock(&u->ring_cons_mutex);
+	return rc;
+}
+
+/*
+ * write(): re-enable event channels.
+ *
+ * The buffer is an array of port numbers (rounded down to whole ports,
+ * at most PAGE_SIZE bytes).  Every listed port that is bound by this
+ * user and currently not enabled is marked enabled and gets its late
+ * EOI issued; other ports are silently ignored.
+ *
+ * Returns the number of bytes consumed, -ENOMEM if the bounce page
+ * cannot be allocated, or -EFAULT if copying from user space fails.
+ */
+static ssize_t evtchn_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	int rc, i;
+	evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
+	struct per_user_data *u = file->private_data;
+
+	if (kbuf == NULL)
+		return -ENOMEM;
+
+	/* Whole number of ports. */
+	count &= ~(sizeof(evtchn_port_t)-1);
+
+	rc = 0;
+	if (count == 0)
+		goto out;
+
+	if (count > PAGE_SIZE)
+		count = PAGE_SIZE;
+
+	rc = -EFAULT;
+	if (copy_from_user(kbuf, buf, count) != 0)
+		goto out;
+
+	mutex_lock(&u->bind_mutex);
+
+	for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) {
+		unsigned port = kbuf[i];
+		struct user_evtchn *evtchn;
+
+		evtchn = find_evtchn(u, port);
+		if (evtchn && !evtchn->enabled) {
+			evtchn->enabled = true;
+			xen_irq_lateeoi(irq_from_evtchn(port), 0);
+		}
+	}
+
+	mutex_unlock(&u->bind_mutex);
+
+	rc = count;
+
+ out:
+	free_page((unsigned long)kbuf);
+	return rc;
+}
+
+/*
+ * Grow the notification ring if it has fewer slots than bound event
+ * channels.  The ring starts at 64 entries and doubles on each resize.
+ *
+ * Returns 0 if the ring is (now) large enough, or -ENOMEM if the new
+ * ring cannot be allocated; the old ring is left in place in that case.
+ */
+static int evtchn_resize_ring(struct per_user_data *u)
+{
+	unsigned int new_size;
+	evtchn_port_t *new_ring, *old_ring;
+
+	/*
+	 * Ensure the ring is large enough to capture all possible
+	 * events. i.e., one free slot for each bound event.
+	 */
+	if (u->nr_evtchns <= u->ring_size)
+		return 0;
+
+	if (u->ring_size == 0)
+		new_size = 64;
+	else
+		new_size = 2 * u->ring_size;
+
+	/* Allocate before taking the locks below. */
+	new_ring = kvmalloc_array(new_size, sizeof(*new_ring), GFP_KERNEL);
+	if (!new_ring)
+		return -ENOMEM;
+
+	old_ring = u->ring;
+
+	/*
+	 * Access to the ring contents is serialized by either the
+	 * prod /or/ cons lock so take both when resizing.
+	 */
+	mutex_lock(&u->ring_cons_mutex);
+	spin_lock_irq(&u->ring_prod_lock);
+
+	/*
+	 * Copy the old ring contents to the new ring.
+	 *
+	 * To take care of wrapping, a full ring, and the new index
+	 * pointing into the second half, simply copy the old contents
+	 * twice.
+	 *
+	 * +---------+    +------------------+
+	 * |34567  12| -> |34567  1234567  12|
+	 * +-----p-c-+    +-------c------p---+
+	 */
+	memcpy(new_ring, old_ring, u->ring_size * sizeof(*u->ring));
+	memcpy(new_ring + u->ring_size, old_ring,
+	       u->ring_size * sizeof(*u->ring));
+
+	u->ring = new_ring;
+	u->ring_size = new_size;
+
+	spin_unlock_irq(&u->ring_prod_lock);
+	mutex_unlock(&u->ring_cons_mutex);
+
+	evtchn_free_ring(old_ring);
+
+	return 0;
+}
+
+/*
+ * Attach the already allocated event channel @port to user context @u.
+ *
+ * Creates the tracking entry, makes sure the notification ring is large
+ * enough, installs the late-EOI interrupt handler and makes the event
+ * channel refcounted.
+ *
+ * Returns 0 (or the result of evtchn_make_refcounted()) on success.
+ * Returns -ENOMEM if the tracking entry cannot be allocated.  If a
+ * later step fails the port is closed, the tracking entry is released
+ * and the negative error of the failing step is returned.
+ */
+static int evtchn_bind_to_user(struct per_user_data *u, int port)
+{
+	struct user_evtchn *evtchn;
+	struct evtchn_close close;
+	int rc = 0;
+
+	/*
+	 * Ports are never reused, so every caller should pass in a
+	 * unique port.
+	 *
+	 * (Locking not necessary because we haven't registered the
+	 * interrupt handler yet, and our caller has already
+	 * serialized bind operations.)
+	 */
+
+	evtchn = kzalloc(sizeof(*evtchn), GFP_KERNEL);
+	if (!evtchn)
+		return -ENOMEM;
+
+	evtchn->user = u;
+	evtchn->port = port;
+	evtchn->enabled = true; /* start enabled */
+
+	rc = add_evtchn(u, evtchn);
+	if (rc < 0) {
+		/*
+		 * The entry was never linked into the tree, so it must not
+		 * go through del_evtchn(): rb_erase() on an unlinked,
+		 * zeroed node would reset the tree root.  add_evtchn() has
+		 * already incremented nr_evtchns, so undo that by hand.
+		 */
+		u->nr_evtchns--;
+		kfree(evtchn);
+		evtchn = NULL;
+		goto err;
+	}
+
+	rc = evtchn_resize_ring(u);
+	if (rc < 0)
+		goto err;
+
+	rc = bind_evtchn_to_irqhandler_lateeoi(port, evtchn_interrupt, 0,
+					       u->name, evtchn);
+	if (rc < 0)
+		goto err;
+
+	rc = evtchn_make_refcounted(port);
+	return rc;
+
+err:
+	/* bind failed, should close the port now */
+	close.port = port;
+	if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+		BUG();
+	/* Only entries that made it into the tree need unlinking. */
+	if (evtchn)
+		del_evtchn(u, evtchn);
+	return rc;
+}
+
+/*
+ * Detach @evtchn from @u: remove the interrupt handler from the IRQ
+ * that belongs to the port, then unlink and free the tracking entry.
+ * It is a BUG if the port has no IRQ.
+ */
+static void evtchn_unbind_from_user(struct per_user_data *u,
+				    struct user_evtchn *evtchn)
+{
+	int irq = irq_from_evtchn(evtchn->port);
+
+	BUG_ON(irq < 0);
+
+	unbind_from_irqhandler(irq, evtchn);
+
+	del_evtchn(u, evtchn);
+}
+
+static DEFINE_PER_CPU(int, bind_last_selected_cpu);
+
+/*
+ * Pick the CPU that handles the interdomain event channel @evtchn.
+ *
+ * Starting after the CPU remembered in this CPU's
+ * bind_last_selected_cpu, the next CPU that is both online and in the
+ * IRQ's affinity mask is chosen, wrapping to the first such CPU when
+ * the end of the mask is reached.  The choice is remembered and applied
+ * with xen_set_affinity_evtchn().  Does nothing if the port has no IRQ
+ * descriptor.
+ */
+static void evtchn_bind_interdom_next_vcpu(int evtchn)
+{
+	unsigned int selected_cpu, irq;
+	struct irq_desc *desc;
+	unsigned long flags;
+
+	irq = irq_from_evtchn(evtchn);
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return;
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+	selected_cpu = this_cpu_read(bind_last_selected_cpu);
+	selected_cpu = cpumask_next_and(selected_cpu,
+			desc->irq_common_data.affinity, cpu_online_mask);
+
+	if (unlikely(selected_cpu >= nr_cpu_ids))
+		selected_cpu = cpumask_first_and(desc->irq_common_data.affinity,
+				cpu_online_mask);
+
+	this_cpu_write(bind_last_selected_cpu, selected_cpu);
+
+	/* unmask expects irqs to be disabled */
+	xen_set_affinity_evtchn(desc, selected_cpu);
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+/*
+ * ioctl() handler.  All commands run under the per-user bind_mutex.
+ *
+ * The three BIND commands return the newly bound local port on success;
+ * UNBIND, NOTIFY, RESET and RESTRICT_DOMID return 0.  Failures are
+ * reported as negative errno values: -EFAULT for a bad user pointer,
+ * -EACCES when the file descriptor's domain restriction forbids the
+ * operation, -ENOTCONN when the port is not bound by this user,
+ * -EINVAL for out-of-range arguments, the hypercall's error, or
+ * -ENOSYS for an unknown command.
+ */
+static long evtchn_ioctl(struct file *file,
+			 unsigned int cmd, unsigned long arg)
+{
+	int rc;
+	struct per_user_data *u = file->private_data;
+	void __user *uarg = (void __user *) arg;
+
+	/* Prevent bind from racing with unbind */
+	mutex_lock(&u->bind_mutex);
+
+	switch (cmd) {
+	case IOCTL_EVTCHN_BIND_VIRQ: {
+		struct ioctl_evtchn_bind_virq bind;
+		struct evtchn_bind_virq bind_virq;
+
+		/* Not available once the fd has been restricted to a domain. */
+		rc = -EACCES;
+		if (u->restrict_domid != UNRESTRICTED_DOMID)
+			break;
+
+		rc = -EFAULT;
+		if (copy_from_user(&bind, uarg, sizeof(bind)))
+			break;
+
+		bind_virq.virq = bind.virq;
+		bind_virq.vcpu = xen_vcpu_nr(0);
+		rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+						 &bind_virq);
+		if (rc != 0)
+			break;
+
+		rc = evtchn_bind_to_user(u, bind_virq.port);
+		if (rc == 0)
+			rc = bind_virq.port;
+		break;
+	}
+
+	case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
+		struct ioctl_evtchn_bind_interdomain bind;
+		struct evtchn_bind_interdomain bind_interdomain;
+
+		rc = -EFAULT;
+		if (copy_from_user(&bind, uarg, sizeof(bind)))
+			break;
+
+		/* A restricted fd may only bind to its own remote domain. */
+		rc = -EACCES;
+		if (u->restrict_domid != UNRESTRICTED_DOMID &&
+		    u->restrict_domid != bind.remote_domain)
+			break;
+
+		bind_interdomain.remote_dom  = bind.remote_domain;
+		bind_interdomain.remote_port = bind.remote_port;
+		rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+						 &bind_interdomain);
+		if (rc != 0)
+			break;
+
+		rc = evtchn_bind_to_user(u, bind_interdomain.local_port);
+		if (rc == 0) {
+			rc = bind_interdomain.local_port;
+			evtchn_bind_interdom_next_vcpu(rc);
+		}
+		break;
+	}
+
+	case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
+		struct ioctl_evtchn_bind_unbound_port bind;
+		struct evtchn_alloc_unbound alloc_unbound;
+
+		rc = -EACCES;
+		if (u->restrict_domid != UNRESTRICTED_DOMID)
+			break;
+
+		rc = -EFAULT;
+		if (copy_from_user(&bind, uarg, sizeof(bind)))
+			break;
+
+		alloc_unbound.dom        = DOMID_SELF;
+		alloc_unbound.remote_dom = bind.remote_domain;
+		rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+						 &alloc_unbound);
+		if (rc != 0)
+			break;
+
+		rc = evtchn_bind_to_user(u, alloc_unbound.port);
+		if (rc == 0)
+			rc = alloc_unbound.port;
+		break;
+	}
+
+	case IOCTL_EVTCHN_UNBIND: {
+		struct ioctl_evtchn_unbind unbind;
+		struct user_evtchn *evtchn;
+
+		rc = -EFAULT;
+		if (copy_from_user(&unbind, uarg, sizeof(unbind)))
+			break;
+
+		rc = -EINVAL;
+		if (unbind.port >= xen_evtchn_nr_channels())
+			break;
+
+		rc = -ENOTCONN;
+		evtchn = find_evtchn(u, unbind.port);
+		if (!evtchn)
+			break;
+
+		disable_irq(irq_from_evtchn(unbind.port));
+		evtchn_unbind_from_user(u, evtchn);
+		rc = 0;
+		break;
+	}
+
+	case IOCTL_EVTCHN_NOTIFY: {
+		struct ioctl_evtchn_notify notify;
+		struct user_evtchn *evtchn;
+
+		rc = -EFAULT;
+		if (copy_from_user(&notify, uarg, sizeof(notify)))
+			break;
+
+		/* Only ports bound through this fd may be signalled. */
+		rc = -ENOTCONN;
+		evtchn = find_evtchn(u, notify.port);
+		if (evtchn) {
+			notify_remote_via_evtchn(notify.port);
+			rc = 0;
+		}
+		break;
+	}
+
+	case IOCTL_EVTCHN_RESET: {
+		/* Initialise the ring to empty. Clear errors. */
+		mutex_lock(&u->ring_cons_mutex);
+		spin_lock_irq(&u->ring_prod_lock);
+		u->ring_cons = u->ring_prod = u->ring_overflow = 0;
+		spin_unlock_irq(&u->ring_prod_lock);
+		mutex_unlock(&u->ring_cons_mutex);
+		rc = 0;
+		break;
+	}
+
+	case IOCTL_EVTCHN_RESTRICT_DOMID: {
+		struct ioctl_evtchn_restrict_domid ierd;
+
+		/* The restriction can be set once and never changed. */
+		rc = -EACCES;
+		if (u->restrict_domid != UNRESTRICTED_DOMID)
+			break;
+
+		rc = -EFAULT;
+		if (copy_from_user(&ierd, uarg, sizeof(ierd)))
+		    break;
+
+		rc = -EINVAL;
+		if (ierd.domid == 0 || ierd.domid >= DOMID_FIRST_RESERVED)
+			break;
+
+		u->restrict_domid = ierd.domid;
+		rc = 0;
+
+		break;
+	}
+
+	default:
+		rc = -ENOSYS;
+		break;
+	}
+	mutex_unlock(&u->bind_mutex);
+
+	return rc;
+}
+
+/*
+ * poll(): the device is always writable, readable while the ring holds
+ * entries, and reports only EPOLLERR once the ring has overflowed.
+ */
+static __poll_t evtchn_poll(struct file *file, poll_table *wait)
+{
+	struct per_user_data *u = file->private_data;
+	bool readable, overflowed;
+
+	poll_wait(file, &u->evtchn_wait, wait);
+
+	readable = u->ring_cons != u->ring_prod;
+	overflowed = u->ring_overflow;
+
+	if (overflowed)
+		return EPOLLERR;
+	if (readable)
+		return EPOLLOUT | EPOLLWRNORM | EPOLLIN | EPOLLRDNORM;
+	return EPOLLOUT | EPOLLWRNORM;
+}
+
+/* fasync(): maintain the list used by kill_fasync() in the interrupt handler. */
+static int evtchn_fasync(int fd, struct file *filp, int on)
+{
+	struct per_user_data *u = filp->private_data;
+	return fasync_helper(fd, filp, on, &u->evtchn_async_queue);
+}
+
+/*
+ * open(): allocate and initialise the per-user state and attach it to
+ * the file.  The state starts with no ring, no bound ports and no
+ * domain restriction.
+ *
+ * Returns -ENOMEM if either allocation fails, otherwise the result of
+ * nonseekable_open().
+ */
+static int evtchn_open(struct inode *inode, struct file *filp)
+{
+	struct per_user_data *u;
+
+	u = kzalloc(sizeof(*u), GFP_KERNEL);
+	if (u == NULL)
+		return -ENOMEM;
+
+	/* Passed as the name when binding IRQ handlers for this user. */
+	u->name = kasprintf(GFP_KERNEL, "evtchn:%s", current->comm);
+	if (u->name == NULL) {
+		kfree(u);
+		return -ENOMEM;
+	}
+
+	init_waitqueue_head(&u->evtchn_wait);
+
+	mutex_init(&u->bind_mutex);
+	mutex_init(&u->ring_cons_mutex);
+	spin_lock_init(&u->ring_prod_lock);
+
+	u->restrict_domid = UNRESTRICTED_DOMID;
+
+	filp->private_data = u;
+
+	return nonseekable_open(inode, filp);
+}
+
+/*
+ * release(): unbind every event channel still bound by this user, then
+ * free the ring, the name and the per-user state.  Always returns 0.
+ */
+static int evtchn_release(struct inode *inode, struct file *filp)
+{
+	struct per_user_data *u = filp->private_data;
+	struct rb_node *node;
+
+	/* Keep removing the root until the tree is empty. */
+	while ((node = u->evtchns.rb_node)) {
+		struct user_evtchn *evtchn;
+
+		evtchn = rb_entry(node, struct user_evtchn, node);
+		disable_irq(irq_from_evtchn(evtchn->port));
+		evtchn_unbind_from_user(u, evtchn);
+	}
+
+	evtchn_free_ring(u->ring);
+	kfree(u->name);
+	kfree(u);
+
+	return 0;
+}
+
+/* File operations of the event channel misc device (not seekable). */
+static const struct file_operations evtchn_fops = {
+	.owner   = THIS_MODULE,
+	.read    = evtchn_read,
+	.write   = evtchn_write,
+	.unlocked_ioctl = evtchn_ioctl,
+	.poll    = evtchn_poll,
+	.fasync  = evtchn_fasync,
+	.open    = evtchn_open,
+	.release = evtchn_release,
+	.llseek	 = no_llseek,
+};
+
+/* Misc device "xen/evtchn" with a dynamically assigned minor number. */
+static struct miscdevice evtchn_miscdev = {
+	.minor        = MISC_DYNAMIC_MINOR,
+	.name         = "xen/evtchn",
+	.fops         = &evtchn_fops,
+};
+/*
+ * Module init: register the event channel misc device.
+ *
+ * Returns -ENODEV when not running as a Xen domain, the error from
+ * misc_register() if registration fails, and 0 otherwise.
+ */
+static int __init evtchn_init(void)
+{
+	int err;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	/* Create '/dev/xen/evtchn'. */
+	err = misc_register(&evtchn_miscdev);
+	if (err == 0) {
+		pr_info("Event-channel device installed\n");
+		return 0;
+	}
+
+	pr_err("Could not register /dev/xen/evtchn\n");
+	return err;
+}
+
+/* Module exit: unregister the event channel misc device. */
+static void __exit evtchn_cleanup(void)
+{
+	misc_deregister(&evtchn_miscdev);
+}
+
+module_init(evtchn_init);
+module_exit(evtchn_cleanup);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/fallback.c b/drivers/xen/fallback.c
new file mode 100644
index 000000000..b04fb64c5
--- /dev/null
+++ b/drivers/xen/fallback.c
@@ -0,0 +1,81 @@
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <asm/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+int xen_event_channel_op_compat(int cmd, void *arg)	/* issue @cmd through the event_channel_op_compat hypercall */
+{
+	struct evtchn_op op;
+	int rc;
+
+	op.cmd = cmd;
+	memcpy(&op.u, arg, sizeof(op.u));	/* NOTE(review): always reads sizeof(op.u) bytes from @arg, whatever @cmd is - confirm callers' buffers allow it */
+	rc = _hypercall1(int, event_channel_op_compat, &op);
+
+	switch (cmd) {	/* rc is not examined here: results are copied back regardless of it */
+	case EVTCHNOP_close:
+	case EVTCHNOP_send:
+	case EVTCHNOP_bind_vcpu:
+	case EVTCHNOP_unmask:
+		/* no output */
+		break;
+	/* COPY_BACK(x) expands to the case for EVTCHNOP_x: copy op.u.x back into @arg. */
+#define COPY_BACK(eop) \
+	case EVTCHNOP_##eop: \
+		memcpy(arg, &op.u.eop, sizeof(op.u.eop)); \
+		break
+
+	COPY_BACK(bind_interdomain);
+	COPY_BACK(bind_virq);
+	COPY_BACK(bind_pirq);
+	COPY_BACK(status);
+	COPY_BACK(alloc_unbound);
+	COPY_BACK(bind_ipi);
+#undef COPY_BACK
+
+	default:
+		WARN_ON(rc != -ENOSYS);	/* a command not listed above: warn unless the result is -ENOSYS */
+		break;
+	}
+
+	return rc;	/* hypercall result, passed through unchanged */
+}
+EXPORT_SYMBOL_GPL(xen_event_channel_op_compat);
+
+int xen_physdev_op_compat(int cmd, void *arg)	/* issue @cmd through the physdev_op_compat hypercall */
+{
+	struct physdev_op op;
+	int rc;
+
+	op.cmd = cmd;
+	memcpy(&op.u, arg, sizeof(op.u));	/* NOTE(review): always reads sizeof(op.u) bytes from @arg, whatever @cmd is - confirm callers' buffers allow it */
+	rc = _hypercall1(int, physdev_op_compat, &op);
+
+	switch (cmd) {	/* rc is not examined here: results are copied back regardless of it */
+	case PHYSDEVOP_IRQ_UNMASK_NOTIFY:
+	case PHYSDEVOP_set_iopl:
+	case PHYSDEVOP_set_iobitmap:
+	case PHYSDEVOP_apic_write:
+		/* no output */
+		break;
+	/* COPY_BACK(p, f) expands to the case for PHYSDEVOP_p: copy op.u.f back into @arg. */
+#define COPY_BACK(pop, fld) \
+	case PHYSDEVOP_##pop: \
+		memcpy(arg, &op.u.fld, sizeof(op.u.fld)); \
+		break
+
+	COPY_BACK(irq_status_query, irq_status_query);
+	COPY_BACK(apic_read, apic_op);
+	COPY_BACK(ASSIGN_VECTOR, irq_op);
+#undef COPY_BACK
+
+	default:
+		WARN_ON(rc != -ENOSYS);	/* a command not listed above: warn unless the result is -ENOSYS */
+		break;
+	}
+
+	return rc;	/* hypercall result, passed through unchanged */
+}
+EXPORT_SYMBOL_GPL(xen_physdev_op_compat);
diff --git a/drivers/xen/features.c b/drivers/xen/features.c
new file mode 100644
index 000000000..d7d34fdfc
--- /dev/null
+++ b/drivers/xen/features.c
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * features.c
+ *
+ * Xen feature flags.
+ *
+ * Copyright (c) 2006, Ian Campbell, XenSource Inc.
+ */
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/export.h>
+
+#include <asm/xen/hypercall.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+#include <xen/features.h>
+
+u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;	/* one 0/1 byte per feature bit, filled in by xen_setup_features() */
+EXPORT_SYMBOL_GPL(xen_features);
+
+void xen_setup_features(void)	/* query each feature submap and expand it into one 0/1 byte per bit of xen_features[] */
+{
+	struct xen_feature_info fi;
+	int i, j;
+
+	for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
+		fi.submap_idx = i;
+		if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
+			break;	/* stop at the first failing query; remaining entries are left untouched */
+		for (j = 0; j < 32; j++)	/* 1U, not 1: shifting a signed int into bit 31 is undefined behaviour */
+			xen_features[i * 32 + j] = !!(fi.submap & 1U << j);
+	}
+}
diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c
new file mode 100644
index 000000000..edb0acd0b
--- /dev/null
+++ b/drivers/xen/gntalloc.c
@@ -0,0 +1,601 @@
+/******************************************************************************
+ * gntalloc.c
+ *
+ * Device for creating grant references (in user-space) that may be shared
+ * with other domains.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+ * This driver exists to allow userspace programs in Linux to allocate kernel
+ * memory that will later be shared with another domain.  Without this device,
+ * Linux userspace programs cannot create grant references.
+ *
+ * How this stuff works:
+ *   X -> granting a page to Y
+ *   Y -> mapping the grant from X
+ *
+ *   1. X uses the gntalloc device to allocate a page of kernel memory, P.
+ *   2. X creates an entry in the grant table that says domid(Y) can access P.
+ *      This is done without a hypercall unless the grant table needs expansion.
+ *   3. X gives the grant reference identifier, GREF, to Y.
+ *   4. Y maps the page, either directly into kernel memory for use in a backend
+ *      driver, or via the gntdev device to map into the address space of an
+ *      application running in Y. This is the first point at which Xen does any
+ *      tracking of the page.
+ *   5. A program in X mmap()s a segment of the gntalloc device that corresponds
+ *      to the shared page, and can now communicate with Y over the shared page.
+ *
+ *
+ * NOTE TO USERSPACE LIBRARIES:
+ *   The grant allocation and mmap()ing are, naturally, two separate operations.
+ *   You set up the sharing by calling the create ioctl() and then the mmap().
+ *   Teardown requires munmap() and either close() or ioctl().
+ *
+ * WARNING: Since Xen does not allow a guest to forcibly end the use of a grant
+ * reference, this device can be used to consume kernel memory by leaving grant
+ * references mapped by another domain when an application exits. Therefore,
+ * there is a global limit on the number of pages that can be allocated. When
+ * all references to the page are unmapped, it will be freed during the next
+ * grant operation.
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/grant_table.h>
+#include <xen/gntalloc.h>
+#include <xen/events.h>
+
+static int limit = 1024;	/* upper bound checked against gref_size + requested count in gntalloc_ioctl_alloc() */
+module_param(limit, int, 0644);
+MODULE_PARM_DESC(limit, "Maximum number of grants that may be allocated by "
+		"the gntalloc device");
+
+static LIST_HEAD(gref_list);	/* published grefs, linked through gntalloc_gref.next_gref */
+static DEFINE_MUTEX(gref_mutex);	/* held while gref_list, gref_size, per-file lists and use counts are modified */
+static int gref_size;	/* grefs currently accounted: raised in gntalloc_ioctl_alloc(), lowered in __del_gref() */
+
+struct notify_info {	/* actions __del_gref() performs when the gref is destroyed */
+	uint16_t pgoff:12;    /* Bits 0-11: Offset of the byte to clear. NOTE(review): set from index & (PAGE_SIZE - 1); 12 bits only cover offsets below 4096 - confirm for larger pages */
+	uint16_t flags:2;     /* Bits 12-13: Unmap notification flags (UNMAP_NOTIFY_CLEAR_BYTE / UNMAP_NOTIFY_SEND_EVENT) */
+	int event;            /* Port (event channel) to notify */
+};
+
+/* Metadata on a grant reference: one instance per page handed out by add_grefs(). */
+struct gntalloc_gref {
+	struct list_head next_gref;  /* list entry gref_list */
+	struct list_head next_file;  /* list entry file->list, if open */
+	struct page *page;	     /* The shared page */
+	uint64_t file_index;         /* File offset for mmap() */
+	unsigned int users;          /* Use count (1 from add_grefs(), +1 per page mapped by gntalloc_mmap()) - when zero, waiting on Xen */
+	grant_ref_t gref_id;         /* The grant reference number; zero is treated as "none" by __del_gref() */
+	struct notify_info notify;   /* Unmap notification */
+};
+
+struct gntalloc_file_private_data {	/* per-open state, kept in filp->private_data */
+	struct list_head list;	/* grefs owned by this file, linked through gntalloc_gref.next_file */
+	uint64_t index;	/* next file offset to hand out; advanced by count * PAGE_SIZE per allocation */
+};
+
+struct gntalloc_vma_private_data {	/* per-mapping state, kept in vma->vm_private_data */
+	struct gntalloc_gref *gref;	/* first gref covered by the mapping */
+	int users;	/* VMAs sharing this state: 1 at mmap, changed by gntalloc_vma_open()/gntalloc_vma_close() */
+	int count;	/* number of pages (grefs) in the mapping */
+};
+
+static void __del_gref(struct gntalloc_gref *gref);
+
+/* Destroy every entry on gref_list whose use count has dropped to zero. */
+static void do_cleanup(void)
+{
+	struct gntalloc_gref *cur, *tmp;
+	list_for_each_entry_safe(cur, tmp, &gref_list, next_gref)
+		if (cur->users == 0)
+			__del_gref(cur);
+}
+
+/* Allocate op->count zeroed pages, grant them to op->domid and publish them; returns 0 or -errno. */
+static int add_grefs(struct ioctl_gntalloc_alloc_gref *op,
+	uint32_t *gref_ids, struct gntalloc_file_private_data *priv)
+{
+	int i, rc, readonly, queued = 0;
+	LIST_HEAD(queue_gref);
+	LIST_HEAD(queue_file);
+	struct gntalloc_gref *gref, *next;
+
+	readonly = !(op->flags & GNTALLOC_FLAG_WRITABLE);
+	for (i = 0; i < op->count; i++) {
+		gref = kzalloc(sizeof(*gref), GFP_KERNEL);
+		if (!gref) {
+			rc = -ENOMEM;
+			goto undo;
+		}
+		list_add_tail(&gref->next_gref, &queue_gref);
+		list_add_tail(&gref->next_file, &queue_file);
+		queued++;	/* from here on the undo path's __del_gref() accounts for this entry */
+		gref->users = 1;
+		gref->file_index = op->index + i * PAGE_SIZE;
+		gref->page = alloc_page(GFP_KERNEL|__GFP_ZERO);
+		if (!gref->page) {
+			rc = -ENOMEM;
+			goto undo;
+		}
+		/* Grant foreign access to the page. */
+		rc = gnttab_grant_foreign_access(op->domid,
+						 xen_page_to_gfn(gref->page),
+						 readonly);
+		if (rc < 0)
+			goto undo;
+		gref_ids[i] = gref->gref_id = rc;
+	}
+
+	/* Add to gref lists. */
+	mutex_lock(&gref_mutex);
+	list_splice_tail(&queue_gref, &gref_list);
+	list_splice_tail(&queue_file, &priv->list);
+	mutex_unlock(&gref_mutex);
+
+	return 0;
+
+undo:
+	mutex_lock(&gref_mutex);
+	/* Return the quota of entries never queued; __del_gref() returns one per queued entry. */
+	gref_size -= (op->count - queued);
+	list_for_each_entry_safe(gref, next, &queue_file, next_file) {
+		list_del(&gref->next_file);
+		__del_gref(gref);
+	}
+	mutex_unlock(&gref_mutex);
+	return rc;
+}
+
+/* Run the unmap notifications, release grant and page, drop the quota, unlink from next_gref and free @gref. */
+static void __del_gref(struct gntalloc_gref *gref)
+{
+	unsigned long addr;
+
+	if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) {
+		uint8_t *tmp = kmap(gref->page);
+		tmp[gref->notify.pgoff] = 0;
+		kunmap(gref->page);
+	}
+	if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
+		notify_remote_via_evtchn(gref->notify.event);
+		evtchn_put(gref->notify.event);
+	}
+	gref->notify.flags = 0;
+
+	if (gref->gref_id) {
+		if (gref->page) {
+			addr = (unsigned long)page_to_virt(gref->page);
+			gnttab_end_foreign_access(gref->gref_id, 0, addr);	/* page ownership goes with the grant; it is not freed here */
+		} else
+			gnttab_free_grant_reference(gref->gref_id);
+	} else if (gref->page) {	/* never granted: add_grefs() failed after alloc_page() */
+		__free_page(gref->page);
+	}
+	gref_size--;
+	list_del(&gref->next_gref);
+	kfree(gref);
+}
+
+/* Return the first of @count consecutive file entries starting at offset @index, or NULL. */
+static struct gntalloc_gref *find_grefs(struct gntalloc_file_private_data *priv,
+		uint64_t index, uint32_t count)
+{
+	struct gntalloc_gref *first = NULL, *cur;
+	list_for_each_entry(cur, &priv->list, next_file) {
+		if (!first) {
+			if (cur->file_index != index)
+				continue;
+			first = cur;
+		} else if (cur->file_index != index) {
+			return NULL;
+		}
+		index += PAGE_SIZE;
+		if (--count == 0)
+			return first;
+	}
+	return NULL;
+}
+
+/*
+ * -------------------------------------
+ *  File operations.
+ * -------------------------------------
+ */
+/*
+ * Open handler: attach a fresh, zeroed per-file context to @filp.
+ *
+ * Returns 0 on success or -ENOMEM if the context cannot be allocated.
+ */
+static int gntalloc_open(struct inode *inode, struct file *filp)
+{
+	struct gntalloc_file_private_data *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+
+	if (ctx == NULL)
+		return -ENOMEM;
+
+	/* No grant references belong to this file yet. */
+	INIT_LIST_HEAD(&ctx->list);
+	filp->private_data = ctx;
+	pr_debug("%s: priv %p\n", __func__, ctx);
+	return 0;
+}
+
+/* Release handler: drop the file's use of every gref it still lists,
+ * destroying those left without users, then free the per-file context. */
+static int gntalloc_release(struct inode *inode, struct file *filp)
+{
+	struct gntalloc_file_private_data *priv = filp->private_data;
+	struct gntalloc_gref *cur, *tmp;
+
+	pr_debug("%s: priv %p\n", __func__, priv);
+
+	mutex_lock(&gref_mutex);
+	list_for_each_entry_safe(cur, tmp, &priv->list, next_file) {
+		list_del(&cur->next_file);
+		/* Entries with remaining uses (taken in gntalloc_mmap()) survive. */
+		if (--cur->users == 0)
+			__del_gref(cur);
+	}
+	kfree(priv);
+	mutex_unlock(&gref_mutex);
+
+	return 0;
+}
+
+static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv,	/* IOCTL_GNTALLOC_ALLOC_GREF handler; returns 0 or -errno */
+		struct ioctl_gntalloc_alloc_gref __user *arg)
+{
+	int rc = 0;
+	struct ioctl_gntalloc_alloc_gref op;
+	uint32_t *gref_ids;	/* kernel-side staging buffer for the new grant ids */
+
+	pr_debug("%s: priv %p\n", __func__, priv);
+
+	if (copy_from_user(&op, arg, sizeof(op))) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	gref_ids = kcalloc(op.count, sizeof(gref_ids[0]), GFP_KERNEL);
+	if (!gref_ids) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	mutex_lock(&gref_mutex);
+	/* Clean up pages that were at zero (local) users but were still mapped
+	 * by remote domains. Since those pages count towards the limit that we
+	 * are about to enforce, removing them here is a good idea.
+	 */
+	do_cleanup();
+	if (gref_size + op.count > limit) {
+		mutex_unlock(&gref_mutex);
+		rc = -ENOSPC;
+		goto out_free;
+	}
+	gref_size += op.count;	/* reserve the quota before allocating; add_grefs() undoes it on failure */
+	op.index = priv->index;	/* file offset of the first new page, reported back to user space */
+	priv->index += op.count * PAGE_SIZE;
+	mutex_unlock(&gref_mutex);
+
+	rc = add_grefs(&op, gref_ids, priv);
+	if (rc < 0)
+		goto out_free;
+
+	/* Once we finish add_grefs, it is unsafe to touch the new reference,
+	 * since it is possible for a concurrent ioctl to remove it (by guessing
+	 * its index). If the userspace application doesn't provide valid memory
+	 * to write the IDs to, then it will need to close the file in order to
+	 * release - which it will do by segfaulting when it tries to access the
+	 * IDs to close them.
+	 */
+	if (copy_to_user(arg, &op, sizeof(op))) {
+		rc = -EFAULT;
+		goto out_free;
+	}
+	if (copy_to_user(arg->gref_ids, gref_ids,
+			sizeof(gref_ids[0]) * op.count)) {
+		rc = -EFAULT;
+		goto out_free;
+	}
+
+out_free:	/* reached on success too: the staging buffer is always freed */
+	kfree(gref_ids);
+out:
+	return rc;
+}
+
+static long gntalloc_ioctl_dealloc(struct gntalloc_file_private_data *priv,	/* IOCTL_GNTALLOC_DEALLOC_GREF handler; returns 0, -EFAULT or -EINVAL */
+		void __user *arg)
+{
+	int i, rc = 0;
+	struct ioctl_gntalloc_dealloc_gref op;
+	struct gntalloc_gref *gref, *n;
+
+	pr_debug("%s: priv %p\n", __func__, priv);
+
+	if (copy_from_user(&op, arg, sizeof(op))) {
+		rc = -EFAULT;
+		goto dealloc_grant_out;
+	}
+
+	mutex_lock(&gref_mutex);
+	gref = find_grefs(priv, op.index, op.count);	/* NULL unless op.count consecutive entries start at op.index */
+	if (gref) {
+		/* Remove from the file list only, and decrease reference count.
+		 * The later call to do_cleanup() will remove from gref_list and
+		 * free the memory if the pages aren't mapped anywhere.
+		 */
+		for (i = 0; i < op.count; i++) {
+			n = list_entry(gref->next_file.next,	/* fetch the successor before unlinking gref */
+				struct gntalloc_gref, next_file);
+			list_del(&gref->next_file);
+			gref->users--;
+			gref = n;
+		}
+	} else {
+		rc = -EINVAL;
+	}
+
+	do_cleanup();	/* runs on the -EINVAL path as well */
+
+	mutex_unlock(&gref_mutex);
+dealloc_grant_out:
+	return rc;
+}
+
+static long gntalloc_ioctl_unmap_notify(struct gntalloc_file_private_data *priv,	/* IOCTL_GNTALLOC_SET_UNMAP_NOTIFY handler; returns 0 or -errno */
+		void __user *arg)
+{
+	struct ioctl_gntalloc_unmap_notify op;
+	struct gntalloc_gref *gref;
+	uint64_t index;
+	int pgoff;
+	int rc;
+
+	if (copy_from_user(&op, arg, sizeof(op)))
+		return -EFAULT;
+
+	index = op.index & ~(PAGE_SIZE - 1);	/* page-aligned file offset selecting the gref */
+	pgoff = op.index & (PAGE_SIZE - 1);	/* byte offset inside that page */
+
+	mutex_lock(&gref_mutex);
+
+	gref = find_grefs(priv, index, 1);
+	if (!gref) {
+		rc = -ENOENT;
+		goto unlock_out;
+	}
+
+	if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT)) {	/* reject any other action bit */
+		rc = -EINVAL;
+		goto unlock_out;
+	}
+
+	/* We need to grab a reference to the event channel we are going to use
+	 * to send the notify before releasing the reference we may already have
+	 * (if someone has called this ioctl twice). This is required so that
+	 * it is possible to change the clear_byte part of the notification
+	 * without disturbing the event channel part, which may now be the last
+	 * reference to that event channel.
+	 */
+	if (op.action & UNMAP_NOTIFY_SEND_EVENT) {
+		if (evtchn_get(op.event_channel_port)) {
+			rc = -EINVAL;
+			goto unlock_out;
+		}
+	}
+
+	if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT)	/* drop the reference held for the previous setting */
+		evtchn_put(gref->notify.event);
+
+	gref->notify.flags = op.action;
+	gref->notify.pgoff = pgoff;	/* stored in a 12-bit field, see struct notify_info */
+	gref->notify.event = op.event_channel_port;
+	rc = 0;
+
+ unlock_out:
+	mutex_unlock(&gref_mutex);
+	return rc;
+}
+
+/*
+ * ioctl entry point: pass the user pointer in @arg to the handler selected by
+ * @cmd.  Commands this device does not know return -ENOIOCTLCMD.
+ */
+static long gntalloc_ioctl(struct file *filp, unsigned int cmd,
+		unsigned long arg)
+{
+	struct gntalloc_file_private_data *priv = filp->private_data;
+	void __user *uarg = (void __user *)arg;
+
+	if (cmd == IOCTL_GNTALLOC_ALLOC_GREF)
+		return gntalloc_ioctl_alloc(priv, uarg);
+
+	if (cmd == IOCTL_GNTALLOC_DEALLOC_GREF)
+		return gntalloc_ioctl_dealloc(priv, uarg);
+
+	if (cmd == IOCTL_GNTALLOC_SET_UNMAP_NOTIFY)
+		return gntalloc_ioctl_unmap_notify(priv, uarg);
+
+	return -ENOIOCTLCMD;
+}
+
+static void gntalloc_vma_open(struct vm_area_struct *vma)	/* vm_ops->open: count one more VMA sharing the mapping state */
+{
+	struct gntalloc_vma_private_data *priv = vma->vm_private_data;
+
+	if (!priv)	/* no mapping state attached to this VMA */
+		return;
+
+	mutex_lock(&gref_mutex);
+	priv->users++;	/* balanced by gntalloc_vma_close() */
+	mutex_unlock(&gref_mutex);
+}
+
+static void gntalloc_vma_close(struct vm_area_struct *vma)	/* vm_ops->close: the last VMA to go drops the per-page uses taken at mmap time */
+{
+	struct gntalloc_vma_private_data *priv = vma->vm_private_data;
+	struct gntalloc_gref *gref, *next;
+	int i;
+
+	if (!priv)
+		return;
+
+	mutex_lock(&gref_mutex);
+	priv->users--;
+	if (priv->users == 0) {
+		gref = priv->gref;
+		for (i = 0; i < priv->count; i++) {
+			gref->users--;
+			next = list_entry(gref->next_gref.next,	/* NOTE(review): walks next_gref while gntalloc_mmap() walked next_file - assumes both lists order this range identically, confirm */
+					  struct gntalloc_gref, next_gref);
+			if (gref->users == 0)
+				__del_gref(gref);	/* frees gref, hence next was read beforehand */
+			gref = next;
+		}
+		kfree(priv);
+	}
+	mutex_unlock(&gref_mutex);
+}
+
+static const struct vm_operations_struct gntalloc_vmops = {	/* installed on each VMA by gntalloc_mmap() */
+	.open = gntalloc_vma_open,
+	.close = gntalloc_vma_close,
+};
+
+static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma)	/* ->mmap: map vma_pages(vma) consecutive grefs of this file, starting at vm_pgoff; 0 or -errno */
+{
+	struct gntalloc_file_private_data *priv = filp->private_data;
+	struct gntalloc_vma_private_data *vm_priv;
+	struct gntalloc_gref *gref;
+	int count = vma_pages(vma);
+	int rv, i;
+
+	if (!(vma->vm_flags & VM_SHARED)) {
+		pr_err("%s: Mapping must be shared\n", __func__);
+		return -EINVAL;
+	}
+
+	vm_priv = kmalloc(sizeof(*vm_priv), GFP_KERNEL);
+	if (!vm_priv)
+		return -ENOMEM;
+	mutex_lock(&gref_mutex);
+	pr_debug("%s: priv %p,%p, page %lu+%d\n", __func__,
+		       priv, vm_priv, vma->vm_pgoff, count);
+
+	gref = find_grefs(priv, vma->vm_pgoff << PAGE_SHIFT, count);
+	if (gref == NULL) {
+		rv = -ENOENT;
+		pr_debug("%s: Could not find grant reference\n", __func__);
+		goto out_free;
+	}
+	vm_priv->gref = gref;
+	vm_priv->users = 1;
+	vm_priv->count = count;
+	vma->vm_private_data = vm_priv;
+	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+	vma->vm_ops = &gntalloc_vmops;
+
+	for (i = 0; i < count; i++) {
+		gref->users++;	/* one use per mapped page, dropped again in gntalloc_vma_close() */
+		rv = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE,
+				gref->page);
+		if (rv)
+			goto out_undo;
+		gref = list_entry(gref->next_file.next,
+				struct gntalloc_gref, next_file);
+	}
+	mutex_unlock(&gref_mutex);
+	return 0;
+out_undo:
+	/* ->close() is not expected to run for a failed ->mmap(): drop the uses taken for pages 0..i here. */
+	for (gref = vm_priv->gref; i >= 0; i--) {
+		gref->users--;	/* cannot reach zero: the entry is still on priv->list with its initial use */
+		gref = list_entry(gref->next_file.next,
+				struct gntalloc_gref, next_file);
+	}
+	vma->vm_private_data = NULL;	/* makes gntalloc_vma_close() a no-op for this VMA */
+out_free:
+	kfree(vm_priv);
+	mutex_unlock(&gref_mutex);
+	return rv;
+}
+
+static const struct file_operations gntalloc_fops = {	/* file operations used by gntalloc_miscdev */
+	.owner = THIS_MODULE,
+	.open = gntalloc_open,
+	.release = gntalloc_release,
+	.unlocked_ioctl = gntalloc_ioctl,
+	.mmap = gntalloc_mmap
+};
+
+/*
+ * -------------------------------------
+ * Module creation/destruction.
+ * -------------------------------------
+ */
+static struct miscdevice gntalloc_miscdev = {	/* misc device registered by gntalloc_init() */
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name	= "xen/gntalloc",
+	.fops	= &gntalloc_fops,
+};
+
+/* Module init: register the "xen/gntalloc" misc device when running on Xen. */
+static int __init gntalloc_init(void)
+{
+	int rc;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	rc = misc_register(&gntalloc_miscdev);
+	if (rc) {
+		pr_err("Could not register misc gntalloc device\n");
+		return rc;
+	}
+
+	pr_debug("Created grant allocation device at %d,%d\n",
+			MISC_MAJOR, gntalloc_miscdev.minor);
+	return 0;
+}
+
+static void __exit gntalloc_exit(void)	/* module exit: undo the registration done by gntalloc_init() */
+{
+	misc_deregister(&gntalloc_miscdev);
+}
+
+module_init(gntalloc_init);
+module_exit(gntalloc_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Carter Weatherly <carter.weatherly@jhuapl.edu>, "
+		"Daniel De Graaf <dgdegra@tycho.nsa.gov>");
+MODULE_DESCRIPTION("User-space grant reference allocator driver");
diff --git a/drivers/xen/gntdev-common.h b/drivers/xen/gntdev-common.h
new file mode 100644
index 000000000..2f8b949c3
--- /dev/null
+++ b/drivers/xen/gntdev-common.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Common functionality of grant device.
+ *
+ * Copyright (c) 2006-2007, D G Murray.
+ *           (c) 2009 Gerd Hoffmann <kraxel@redhat.com>
+ *           (c) 2018 Oleksandr Andrushchenko, EPAM Systems Inc.
+ */
+
+#ifndef _GNTDEV_COMMON_H
+#define _GNTDEV_COMMON_H
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmu_notifier.h>
+#include <linux/types.h>
+
+struct gntdev_dmabuf_priv;
+
+struct gntdev_priv {	/* grant device context shared between gntdev and its dma-buf support */
+	/* Maps with visible offsets in the file descriptor. */
+	struct list_head maps;
+	/*
+	 * Maps that are not visible; will be freed on munmap.
+	 * Only populated if populate_freeable_maps == 1
+	 */
+	struct list_head freeable_maps;
+	/* lock protects maps and freeable_maps. */
+	struct mutex lock;
+	struct mm_struct *mm;
+	struct mmu_notifier mn;
+
+#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
+	/* Device for which DMA memory is allocated. */
+	struct device *dma_dev;
+#endif
+
+#ifdef CONFIG_XEN_GNTDEV_DMABUF
+	struct gntdev_dmabuf_priv *dmabuf_priv;	/* dma-buf state; the type is only forward-declared in this header */
+#endif
+};
+
+struct gntdev_unmap_notify {	/* unmap notification settings embedded in struct gntdev_grant_map */
+	int flags;	/* presumably UNMAP_NOTIFY_* bits, as in gntalloc - confirm against gntdev.c */
+	/* Address relative to the start of the gntdev_grant_map. */
+	int addr;
+	int event;	/* presumably the event channel port to signal - confirm against gntdev.c */
+};
+
+struct gntdev_grant_map {	/* one set of grant mappings handled by gntdev */
+	struct list_head next;	/* list linkage; dmabuf_exp_remove_map() unlinks it under gntdev_priv.lock */
+	struct vm_area_struct *vma;
+	int index;
+	int count;
+	int flags;
+	refcount_t users;
+	struct gntdev_unmap_notify notify;
+	struct ioctl_gntdev_grant_ref *grants;
+	struct gnttab_map_grant_ref   *map_ops;
+	struct gnttab_unmap_grant_ref *unmap_ops;
+	struct gnttab_map_grant_ref   *kmap_ops;
+	struct gnttab_unmap_grant_ref *kunmap_ops;
+	struct page **pages;
+	unsigned long pages_vm_start;
+
+#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
+	/*
+	 * If dma_vaddr is not NULL then this mapping is backed by DMA
+	 * capable memory.
+	 */
+
+	struct device *dma_dev;
+	/* Flags used to create this DMA buffer: GNTDEV_DMA_FLAG_XXX. */
+	int dma_flags;
+	void *dma_vaddr;
+	dma_addr_t dma_bus_addr;
+	/* Needed to avoid allocation in gnttab_dma_free_pages(). */
+	xen_pfn_t *frames;
+#endif
+};
+
+struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count,
+					  int dma_flags);
+
+void gntdev_add_map(struct gntdev_priv *priv, struct gntdev_grant_map *add);
+
+void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map);
+
+bool gntdev_account_mapped_pages(int count);
+
+int gntdev_map_grant_pages(struct gntdev_grant_map *map);
+
+#endif
diff --git a/drivers/xen/gntdev-dmabuf.c b/drivers/xen/gntdev-dmabuf.c
new file mode 100644
index 000000000..f6589563f
--- /dev/null
+++ b/drivers/xen/gntdev-dmabuf.c
@@ -0,0 +1,874 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Xen dma-buf functionality for gntdev.
+ *
+ * DMA buffer implementation is based on drivers/gpu/drm/drm_prime.c.
+ *
+ * Copyright (c) 2018 Oleksandr Andrushchenko, EPAM Systems Inc.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/dma-buf.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+
+#include <xen/xen.h>
+#include <xen/grant_table.h>
+
+#include "gntdev-common.h"
+#include "gntdev-dmabuf.h"
+
+#ifndef GRANT_INVALID_REF
+/*
+ * Note on usage of grant reference 0 as invalid grant reference:
+ * grant reference 0 is valid, but never exposed to a driver,
+ * because of the fact it is already in use/reserved by the PV console.
+ */
+#define GRANT_INVALID_REF	0
+#endif
+
+struct gntdev_dmabuf {	/* one dma-buf handled by gntdev; u.exp is used when exporting, u.imp when importing */
+	struct gntdev_dmabuf_priv *priv;	/* owning dma-buf context (lists, lock, filp) */
+	struct dma_buf *dmabuf;
+	struct list_head next;	/* entry on priv->exp_list once an export succeeded */
+	int fd;	/* file descriptor returned by dma_buf_fd(); lookup key in dmabuf_exp_wait_obj_get_dmabuf() */
+
+	union {
+		struct {
+			/* Exported buffers are reference counted. */
+			struct kref refcount;
+
+			struct gntdev_priv *priv;
+			struct gntdev_grant_map *map;
+		} exp;
+		struct {
+			/* Granted references of the imported buffer. */
+			grant_ref_t *refs;
+			/* Scatter-gather table of the imported buffer. */
+			struct sg_table *sgt;
+			/* dma-buf attachment of the imported buffer. */
+			struct dma_buf_attachment *attach;
+		} imp;
+	} u;
+
+	/* Number of pages this buffer has. */
+	int nr_pages;
+	/* Pages of this buffer. */
+	struct page **pages;
+};
+
+struct gntdev_dmabuf_wait_obj {	/* one waiter for the release of an exported buffer */
+	struct list_head next;	/* entry on gntdev_dmabuf_priv.exp_wait_list */
+	struct gntdev_dmabuf *gntdev_dmabuf;	/* buffer being waited for; used as a key only by dmabuf_exp_wait_obj_signal() */
+	struct completion completion;	/* completed by dmabuf_exp_wait_obj_signal() */
+};
+
+struct gntdev_dmabuf_attachment {	/* per-attachment state stored in dma_buf_attachment.priv */
+	struct sg_table *sgt;	/* cached mapping, set by dmabuf_exp_ops_map_dma_buf() and freed in dmabuf_exp_ops_detach() */
+	enum dma_data_direction dir;	/* direction of the cached mapping; DMA_NONE while nothing is mapped */
+};
+
+struct gntdev_dmabuf_priv {	/* dma-buf bookkeeping attached to a grant device context */
+	/* List of exported DMA buffers. */
+	struct list_head exp_list;
+	/* List of wait objects. */
+	struct list_head exp_wait_list;
+	/* List of imported DMA buffers. */
+	struct list_head imp_list;
+	/* This is the lock which protects dma_buf_xxx lists. */
+	struct mutex lock;
+	/*
+	 * We reference this file while exporting dma-bufs, so
+	 * the grant device context is not destroyed while there are
+	 * external users alive.
+	 */
+	struct file *filp;	/* get_file() in dmabuf_exp_from_pages(), fput() in dmabuf_exp_release() */
+};
+
+/* DMA buffer export support. */
+
+/* Implementation of wait for exported DMA buffer to be released. */
+
+static void dmabuf_exp_release(struct kref *kref);
+
+static struct gntdev_dmabuf_wait_obj *	/* new queued wait object, or ERR_PTR(-ENOMEM) */
+dmabuf_exp_wait_obj_new(struct gntdev_dmabuf_priv *priv,
+			struct gntdev_dmabuf *gntdev_dmabuf)
+{
+	struct gntdev_dmabuf_wait_obj *obj;
+
+	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+	if (!obj)
+		return ERR_PTR(-ENOMEM);	/* NOTE: the caller's reference on gntdev_dmabuf is not dropped on this path */
+
+	init_completion(&obj->completion);
+	obj->gntdev_dmabuf = gntdev_dmabuf;
+
+	mutex_lock(&priv->lock);
+	list_add(&obj->next, &priv->exp_wait_list);	/* queued before the put below, so a release triggered by it finds obj */
+	/* Put our reference and wait for gntdev_dmabuf's release to fire. */
+	kref_put(&gntdev_dmabuf->u.exp.refcount, dmabuf_exp_release);
+	mutex_unlock(&priv->lock);
+	return obj;
+}
+
+static void dmabuf_exp_wait_obj_free(struct gntdev_dmabuf_priv *priv,	/* unlink @obj from the wait list and free it */
+				     struct gntdev_dmabuf_wait_obj *obj)
+{
+	mutex_lock(&priv->lock);
+	list_del(&obj->next);
+	mutex_unlock(&priv->lock);
+	kfree(obj);
+}
+
+static int dmabuf_exp_wait_obj_wait(struct gntdev_dmabuf_wait_obj *obj,
+				    u32 wait_to_ms)
+{
+	unsigned long timeout = msecs_to_jiffies(wait_to_ms);
+
+	if (!wait_for_completion_timeout(&obj->completion, timeout))
+		return -ETIMEDOUT;	/* zero is returned only when the wait timed out */
+	return 0;
+}
+
+static void dmabuf_exp_wait_obj_signal(struct gntdev_dmabuf_priv *priv,	/* wake the first waiter queued for @gntdev_dmabuf, if any */
+				       struct gntdev_dmabuf *gntdev_dmabuf)
+{
+	struct gntdev_dmabuf_wait_obj *obj;
+
+	list_for_each_entry(obj, &priv->exp_wait_list, next)	/* no locking here: the kref_put() call sites in this file hold priv->lock */
+		if (obj->gntdev_dmabuf == gntdev_dmabuf) {
+			pr_debug("Found gntdev_dmabuf in the wait list, wake\n");
+			complete_all(&obj->completion);
+			break;	/* only the first matching wait object is completed */
+		}
+}
+
+static struct gntdev_dmabuf *	/* exported buffer whose fd matches, with an extra reference, or ERR_PTR(-ENOENT) */
+dmabuf_exp_wait_obj_get_dmabuf(struct gntdev_dmabuf_priv *priv, int fd)
+{
+	struct gntdev_dmabuf *gntdev_dmabuf, *ret = ERR_PTR(-ENOENT);
+
+	mutex_lock(&priv->lock);
+	list_for_each_entry(gntdev_dmabuf, &priv->exp_list, next)	/* searches exp_list, despite the wording of the debug message below */
+		if (gntdev_dmabuf->fd == fd) {
+			pr_debug("Found gntdev_dmabuf in the wait list\n");
+			kref_get(&gntdev_dmabuf->u.exp.refcount);	/* the caller owns this reference */
+			ret = gntdev_dmabuf;
+			break;
+		}
+	mutex_unlock(&priv->lock);
+	return ret;
+}
+
+/* Wait up to @wait_to_ms ms for the exported buffer behind @fd to be released; 0, -ENOENT, -ENOMEM or -ETIMEDOUT. */
+static int dmabuf_exp_wait_released(struct gntdev_dmabuf_priv *priv, int fd,
+				    int wait_to_ms)
+{
+	struct gntdev_dmabuf *gntdev_dmabuf;
+	struct gntdev_dmabuf_wait_obj *obj;
+	int ret;
+
+	pr_debug("Will wait for dma-buf with fd %d\n", fd);
+	/* Not found: the buffer has already been released or @fd is wrong. */
+	gntdev_dmabuf = dmabuf_exp_wait_obj_get_dmabuf(priv, fd);
+	if (IS_ERR(gntdev_dmabuf))
+		return PTR_ERR(gntdev_dmabuf);
+
+	/*
+	 * We hold a reference now.  Queueing a wait object hands it over; if
+	 * that fails, nothing else will drop the reference, so do it here.
+	 */
+	obj = dmabuf_exp_wait_obj_new(priv, gntdev_dmabuf);
+	if (IS_ERR(obj)) {
+		mutex_lock(&priv->lock);	/* dmabuf_exp_release() edits lists guarded by this lock */
+		kref_put(&gntdev_dmabuf->u.exp.refcount, dmabuf_exp_release);
+		mutex_unlock(&priv->lock);
+		return PTR_ERR(obj);
+	}
+
+	ret = dmabuf_exp_wait_obj_wait(obj, wait_to_ms);
+	dmabuf_exp_wait_obj_free(priv, obj);
+	return ret;
+}
+
+/* DMA buffer export support. */
+
+/*
+ * Build a scatter-gather table describing @nr_pages pages from @pages.
+ *
+ * Returns the new table, or an ERR_PTR() carrying -ENOMEM or the error
+ * reported by sg_alloc_table_from_pages().
+ */
+static struct sg_table *
+dmabuf_pages_to_sgt(struct page **pages, unsigned int nr_pages)
+{
+	struct sg_table *table = kmalloc(sizeof(*table), GFP_KERNEL);
+	int err;
+
+	if (!table)
+		return ERR_PTR(-ENOMEM);
+
+	err = sg_alloc_table_from_pages(table, pages, nr_pages, 0,
+					nr_pages << PAGE_SHIFT, GFP_KERNEL);
+	if (!err)
+		return table;
+
+	/* Table setup failed: release the bare descriptor. */
+	kfree(table);
+	return ERR_PTR(err);
+}
+
+/* dma-buf ->attach: give @attach a zeroed private record with nothing mapped. */
+static int dmabuf_exp_ops_attach(struct dma_buf *dma_buf,
+				 struct dma_buf_attachment *attach)
+{
+	struct gntdev_dmabuf_attachment *state;
+
+	state = kzalloc(sizeof(*state), GFP_KERNEL);
+	if (state == NULL)
+		return -ENOMEM;
+
+	state->dir = DMA_NONE;	/* no mapping exists yet */
+	attach->priv = state;
+	return 0;
+}
+
+/* dma-buf ->detach: undo any cached mapping and free the attachment record. */
+static void dmabuf_exp_ops_detach(struct dma_buf *dma_buf,
+				  struct dma_buf_attachment *attach)
+{
+	struct gntdev_dmabuf_attachment *state = attach->priv;
+	struct sg_table *sgt;
+
+	if (!state)
+		return;
+
+	sgt = state->sgt;
+	if (sgt) {
+		if (state->dir != DMA_NONE)
+			dma_unmap_sg_attrs(attach->dev, sgt->sgl, sgt->nents,
+					   state->dir, DMA_ATTR_SKIP_CPU_SYNC);
+		sg_free_table(sgt);
+	}
+
+	kfree(sgt);
+	kfree(state);
+	attach->priv = NULL;
+}
+
+static struct sg_table *	/* mapped sg table for @attach, or an ERR_PTR (-EINVAL, -EBUSY, -ENOMEM, ...) */
+dmabuf_exp_ops_map_dma_buf(struct dma_buf_attachment *attach,
+			   enum dma_data_direction dir)
+{
+	struct gntdev_dmabuf_attachment *gntdev_dmabuf_attach = attach->priv;
+	struct gntdev_dmabuf *gntdev_dmabuf = attach->dmabuf->priv;
+	struct sg_table *sgt;
+
+	pr_debug("Mapping %d pages for dev %p\n", gntdev_dmabuf->nr_pages,
+		 attach->dev);
+
+	if (dir == DMA_NONE || !gntdev_dmabuf_attach)
+		return ERR_PTR(-EINVAL);
+
+	/* Return the cached mapping when possible. */
+	if (gntdev_dmabuf_attach->dir == dir)
+		return gntdev_dmabuf_attach->sgt;
+
+	/*
+	 * Two mappings with different directions for the same attachment are
+	 * not allowed.
+	 */
+	if (gntdev_dmabuf_attach->dir != DMA_NONE)
+		return ERR_PTR(-EBUSY);
+
+	sgt = dmabuf_pages_to_sgt(gntdev_dmabuf->pages,
+				  gntdev_dmabuf->nr_pages);
+	if (!IS_ERR(sgt)) {
+		if (!dma_map_sg_attrs(attach->dev, sgt->sgl, sgt->nents, dir,
+				      DMA_ATTR_SKIP_CPU_SYNC)) {	/* a zero return is treated as a mapping failure */
+			sg_free_table(sgt);
+			kfree(sgt);
+			sgt = ERR_PTR(-ENOMEM);
+		} else {
+			gntdev_dmabuf_attach->sgt = sgt;	/* cache it; released in dmabuf_exp_ops_detach() */
+			gntdev_dmabuf_attach->dir = dir;
+		}
+	}
+	if (IS_ERR(sgt))
+		pr_debug("Failed to map sg table for dev %p\n", attach->dev);
+	return sgt;
+}
+
+static void dmabuf_exp_ops_unmap_dma_buf(struct dma_buf_attachment *attach,
+					 struct sg_table *sgt,
+					 enum dma_data_direction dir)
+{
+	/* Intentionally empty: the cached mapping is unmapped in dmabuf_exp_ops_detach(). */
+}
+
+static void dmabuf_exp_release(struct kref *kref)	/* kref release callback: wake waiters, unlink and free the exported buffer */
+{
+	struct gntdev_dmabuf *gntdev_dmabuf =
+		container_of(kref, struct gntdev_dmabuf, u.exp.refcount);
+
+	dmabuf_exp_wait_obj_signal(gntdev_dmabuf->priv, gntdev_dmabuf);
+	list_del(&gntdev_dmabuf->next);	/* no locking here: the kref_put() call sites in this file hold priv->lock */
+	fput(gntdev_dmabuf->priv->filp);	/* pairs with get_file() in dmabuf_exp_from_pages() */
+	kfree(gntdev_dmabuf);
+}
+
+static void dmabuf_exp_remove_map(struct gntdev_priv *priv,	/* unlink @map from its list and drop a reference on it */
+				  struct gntdev_grant_map *map)
+{
+	mutex_lock(&priv->lock);
+	list_del(&map->next);
+	gntdev_put_map(NULL /* already removed */, map);
+	mutex_unlock(&priv->lock);
+}
+
+static void dmabuf_exp_ops_release(struct dma_buf *dma_buf)	/* dma-buf ->release: drop the backing map and one reference on the export record */
+{
+	struct gntdev_dmabuf *gntdev_dmabuf = dma_buf->priv;
+	struct gntdev_dmabuf_priv *priv = gntdev_dmabuf->priv;	/* read up front: gntdev_dmabuf may be freed by the put below */
+
+	dmabuf_exp_remove_map(gntdev_dmabuf->u.exp.priv,
+			      gntdev_dmabuf->u.exp.map);
+	mutex_lock(&priv->lock);
+	kref_put(&gntdev_dmabuf->u.exp.refcount, dmabuf_exp_release);
+	mutex_unlock(&priv->lock);
+}
+
+static void *dmabuf_exp_ops_kmap(struct dma_buf *dma_buf,
+				 unsigned long page_num)
+{
+	/* Not implemented: always returns NULL (installed as .map in dmabuf_exp_ops). */
+	return NULL;
+}
+
+static void dmabuf_exp_ops_kunmap(struct dma_buf *dma_buf,
+				  unsigned long page_num, void *addr)
+{
+	/* Not implemented: nothing to undo, dmabuf_exp_ops_kmap() never maps anything. */
+}
+
+static int dmabuf_exp_ops_mmap(struct dma_buf *dma_buf,
+			       struct vm_area_struct *vma)
+{
+	/* Not implemented. NOTE(review): reports success (0) without mapping anything into @vma - confirm this is intended. */
+	return 0;
+}
+
+static const struct dma_buf_ops dmabuf_exp_ops =  {	/* callbacks passed to dma_buf_export() by dmabuf_exp_from_pages() */
+	.attach = dmabuf_exp_ops_attach,
+	.detach = dmabuf_exp_ops_detach,
+	.map_dma_buf = dmabuf_exp_ops_map_dma_buf,
+	.unmap_dma_buf = dmabuf_exp_ops_unmap_dma_buf,	/* empty: unmapping is deferred to detach */
+	.release = dmabuf_exp_ops_release,
+	.map = dmabuf_exp_ops_kmap,	/* stub */
+	.unmap = dmabuf_exp_ops_kunmap,	/* stub */
+	.mmap = dmabuf_exp_ops_mmap,	/* stub */
+};
+
+struct gntdev_dmabuf_export_args {	/* parameter bundle for dmabuf_exp_from_pages() */
+	struct gntdev_priv *priv;	/* in: stored as u.exp.priv */
+	struct gntdev_grant_map *map;	/* in: stored as u.exp.map */
+	struct gntdev_dmabuf_priv *dmabuf_priv;	/* in: context whose exp_list receives the new buffer */
+	struct device *dev;	/* in: its driver's owner, if set, becomes the exporter's module owner */
+	int count;	/* in: number of pages */
+	struct page **pages;	/* in: the pages to export */
+	u32 fd;	/* out: file descriptor of the exported dma-buf */
+};
+
+static int dmabuf_exp_from_pages(struct gntdev_dmabuf_export_args *args)	/* export args->pages as a dma-buf; sets args->fd; 0 or -errno */
+{
+	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+	struct gntdev_dmabuf *gntdev_dmabuf;
+	int ret;
+
+	gntdev_dmabuf = kzalloc(sizeof(*gntdev_dmabuf), GFP_KERNEL);
+	if (!gntdev_dmabuf)
+		return -ENOMEM;
+
+	kref_init(&gntdev_dmabuf->u.exp.refcount);
+
+	gntdev_dmabuf->priv = args->dmabuf_priv;
+	gntdev_dmabuf->nr_pages = args->count;
+	gntdev_dmabuf->pages = args->pages;
+	gntdev_dmabuf->u.exp.priv = args->priv;
+	gntdev_dmabuf->u.exp.map = args->map;
+
+	exp_info.exp_name = KBUILD_MODNAME;
+	if (args->dev->driver && args->dev->driver->owner)
+		exp_info.owner = args->dev->driver->owner;
+	else
+		exp_info.owner = THIS_MODULE;
+	exp_info.ops = &dmabuf_exp_ops;
+	exp_info.size = args->count << PAGE_SHIFT;
+	exp_info.flags = O_RDWR;
+	exp_info.priv = gntdev_dmabuf;	/* read back as dma_buf->priv by the dmabuf_exp_ops callbacks */
+
+	gntdev_dmabuf->dmabuf = dma_buf_export(&exp_info);
+	if (IS_ERR(gntdev_dmabuf->dmabuf)) {
+		ret = PTR_ERR(gntdev_dmabuf->dmabuf);
+		gntdev_dmabuf->dmabuf = NULL;	/* so the fail path skips dma_buf_put() */
+		goto fail;
+	}
+
+	ret = dma_buf_fd(gntdev_dmabuf->dmabuf, O_CLOEXEC);
+	if (ret < 0)
+		goto fail;
+
+	gntdev_dmabuf->fd = ret;
+	args->fd = ret;
+
+	pr_debug("Exporting DMA buffer with fd %d\n", ret);
+
+	mutex_lock(&args->dmabuf_priv->lock);
+	list_add(&gntdev_dmabuf->next, &args->dmabuf_priv->exp_list);
+	mutex_unlock(&args->dmabuf_priv->lock);
+	get_file(gntdev_dmabuf->priv->filp);	/* dropped by fput() in dmabuf_exp_release() */
+	return 0;
+
+fail:
+	if (gntdev_dmabuf->dmabuf)
+		dma_buf_put(gntdev_dmabuf->dmabuf);	/* NOTE(review): if this put invokes dmabuf_exp_ops_release(), that path list_del()s, fput()s and kfree()s state not set up yet, and the kfree() below would repeat - confirm */
+	kfree(gntdev_dmabuf);
+	return ret;
+}
+
+/*
+ * Allocate a grant map with room for @count pages to back an exported
+ * dma-buf and charge those pages against the gntdev mapping limit.
+ *
+ * Returns the new map, ERR_PTR(-EINVAL) for a non-positive @count or when
+ * both the WC and the COHERENT DMA flags are requested, and ERR_PTR(-ENOMEM)
+ * when allocation fails or the mapping limit would be exceeded.
+ */
+static struct gntdev_grant_map *
+dmabuf_exp_alloc_backing_storage(struct gntdev_priv *priv, int dmabuf_flags,
+				 int count)
+{
+	struct gntdev_grant_map *new_map;
+	bool wants_wc, wants_coherent;
+
+	if (unlikely(count <= 0))
+		return ERR_PTR(-EINVAL);
+
+	/* The two DMA allocation modes are mutually exclusive. */
+	wants_wc = !!(dmabuf_flags & GNTDEV_DMA_FLAG_WC);
+	wants_coherent = !!(dmabuf_flags & GNTDEV_DMA_FLAG_COHERENT);
+	if (wants_wc && wants_coherent) {
+		pr_debug("Wrong dma-buf flags: 0x%x\n", dmabuf_flags);
+		return ERR_PTR(-EINVAL);
+	}
+
+	new_map = gntdev_alloc_map(priv, count, dmabuf_flags);
+	if (!new_map)
+		return ERR_PTR(-ENOMEM);
+
+	if (likely(!gntdev_account_mapped_pages(count)))
+		return new_map;
+
+	/* Over the limit: dropping the map also undoes the accounting. */
+	pr_debug("can't map %d pages: over limit\n", count);
+	gntdev_put_map(NULL, new_map);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Map @count grant references @refs of domain @domid and export the
+ * resulting pages as a dma-buf.
+ *
+ * On success the descriptor of the new dma-buf is stored in @fd and 0 is
+ * returned. On failure a negative errno is returned; if the map had already
+ * been created it is removed again via dmabuf_exp_remove_map().
+ */
+static int dmabuf_exp_from_refs(struct gntdev_priv *priv, int flags,
+				int count, u32 domid, u32 *refs, u32 *fd)
+{
+	struct gntdev_dmabuf_export_args args;
+	struct gntdev_grant_map *map;
+	int idx, err;
+
+	map = dmabuf_exp_alloc_backing_storage(priv, flags, count);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	/* Every page of the buffer comes from the same foreign domain. */
+	for (idx = 0; idx < count; idx++) {
+		map->grants[idx].domid = domid;
+		map->grants[idx].ref = refs[idx];
+	}
+
+	mutex_lock(&priv->lock);
+	gntdev_add_map(priv, map);
+	mutex_unlock(&priv->lock);
+
+	map->flags |= GNTMAP_host_map;
+#if defined(CONFIG_X86)
+	map->flags |= GNTMAP_device_map;
+#endif
+
+	err = gntdev_map_grant_pages(map);
+	if (err < 0)
+		goto fail_remove_map;
+
+	args = (struct gntdev_dmabuf_export_args) {
+		.priv		= priv,
+		.map		= map,
+		.dev		= priv->dma_dev,
+		.dmabuf_priv	= priv->dmabuf_priv,
+		.count		= map->count,
+		.pages		= map->pages,
+		/* Shut up unnecessary gcc warning for i386 */
+		.fd		= -1,
+	};
+
+	err = dmabuf_exp_from_pages(&args);
+	if (err < 0)
+		goto fail_remove_map;
+
+	*fd = args.fd;
+	return 0;
+
+fail_remove_map:
+	dmabuf_exp_remove_map(priv, map);
+	return err;
+}
+
+/* DMA buffer import support. */
+
+/*
+ * Grant domain @domid access to each of the @count @pages and record the
+ * grant reference used for page i in @refs[i].
+ *
+ * Returns 0 on success or a negative error. When claiming a reference fails
+ * the still unclaimed references are handed back; entries of @refs written
+ * before the failure stay in place for the caller to revoke.
+ */
+static int
+dmabuf_imp_grant_foreign_access(struct page **pages, u32 *refs,
+				int count, int domid)
+{
+	grant_ref_t gref_head;
+	int idx, err;
+
+	err = gnttab_alloc_grant_references(count, &gref_head);
+	if (err < 0) {
+		pr_debug("Cannot allocate grant references, ret %d\n", err);
+		return err;
+	}
+
+	for (idx = 0; idx < count; idx++) {
+		int ref = gnttab_claim_grant_reference(&gref_head);
+
+		if (ref < 0) {
+			pr_debug("Cannot claim grant reference, ret %d\n", ref);
+			gnttab_free_grant_references(gref_head);
+			return ref;
+		}
+
+		gnttab_grant_foreign_access_ref(ref, domid,
+						xen_page_to_gfn(pages[idx]), 0);
+		refs[idx] = ref;
+	}
+
+	return 0;
+}
+
+/*
+ * End foreign access for every valid entry among the first @count grant
+ * references in @refs; slots still holding GRANT_INVALID_REF are skipped.
+ */
+static void dmabuf_imp_end_foreign_access(u32 *refs, int count)
+{
+	int idx;
+
+	for (idx = 0; idx < count; idx++) {
+		if (refs[idx] == GRANT_INVALID_REF)
+			continue;
+
+		gnttab_end_foreign_access(refs[idx], 0, 0UL);
+	}
+}
+
+/*
+ * Free an import descriptor together with its grant reference and page
+ * arrays. Either array may still be NULL (kfree() accepts that), which lets
+ * dmabuf_imp_alloc_storage() use this on a partially built descriptor.
+ */
+static void dmabuf_imp_free_storage(struct gntdev_dmabuf *gntdev_dmabuf)
+{
+	/* Member arrays first, the descriptor that holds them last. */
+	kfree(gntdev_dmabuf->u.imp.refs);
+	kfree(gntdev_dmabuf->pages);
+	kfree(gntdev_dmabuf);
+}
+
+/*
+ * Allocate an import descriptor with zeroed page and grant reference arrays
+ * of @count entries each; all grant references start as GRANT_INVALID_REF.
+ *
+ * Returns the descriptor or ERR_PTR(-ENOMEM).
+ */
+static struct gntdev_dmabuf *dmabuf_imp_alloc_storage(int count)
+{
+	struct gntdev_dmabuf *buf;
+	int idx;
+
+	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+
+	buf->u.imp.refs = kcalloc(count, sizeof(buf->u.imp.refs[0]),
+				  GFP_KERNEL);
+	if (!buf->u.imp.refs)
+		goto fail_free;
+
+	buf->pages = kcalloc(count, sizeof(buf->pages[0]), GFP_KERNEL);
+	if (!buf->pages)
+		goto fail_free;
+
+	buf->nr_pages = count;
+
+	/* Mark every slot unused so teardown can skip never-granted pages. */
+	for (idx = 0; idx < count; idx++)
+		buf->u.imp.refs[idx] = GRANT_INVALID_REF;
+
+	return buf;
+
+fail_free:
+	dmabuf_imp_free_storage(buf);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Import the dma-buf behind descriptor @fd and grant domain @domid access to
+ * its @count pages.
+ *
+ * The buffer is attached to @dev and mapped, its scatter-gather table is
+ * turned into an array of pages, and a grant reference is created for each
+ * page. On success the import descriptor is linked into @priv->imp_list and
+ * returned; it keeps the dma-buf reference, the attachment and the mapping
+ * until dmabuf_imp_release(). On failure an ERR_PTR() is returned and every
+ * step taken so far is undone.
+ *
+ * -EINVAL is returned when the buffer starts at a non-zero offset, when its
+ * size is not exactly @count pages, when its scatter-gather table does not
+ * describe exactly @count pages, or when a page has no valid pfn.
+ */
+static struct gntdev_dmabuf *
+dmabuf_imp_to_refs(struct gntdev_dmabuf_priv *priv, struct device *dev,
+		   int fd, int count, int domid)
+{
+	struct gntdev_dmabuf *gntdev_dmabuf, *ret;
+	struct dma_buf *dma_buf;
+	struct dma_buf_attachment *attach;
+	struct sg_table *sgt;
+	struct sg_page_iter sg_iter;
+	int i;
+
+	dma_buf = dma_buf_get(fd);
+	if (IS_ERR(dma_buf))
+		return ERR_CAST(dma_buf);
+
+	gntdev_dmabuf = dmabuf_imp_alloc_storage(count);
+	if (IS_ERR(gntdev_dmabuf)) {
+		ret = gntdev_dmabuf;
+		goto fail_put;
+	}
+
+	gntdev_dmabuf->priv = priv;
+	gntdev_dmabuf->fd = fd;
+
+	attach = dma_buf_attach(dma_buf, dev);
+	if (IS_ERR(attach)) {
+		ret = ERR_CAST(attach);
+		goto fail_free_obj;
+	}
+
+	gntdev_dmabuf->u.imp.attach = attach;
+
+	sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL);
+	if (IS_ERR(sgt)) {
+		ret = ERR_CAST(sgt);
+		goto fail_detach;
+	}
+
+	/* Check that we have zero offset. */
+	if (sgt->sgl->offset) {
+		ret = ERR_PTR(-EINVAL);
+		pr_debug("DMA buffer has %d bytes offset, user-space expects 0\n",
+			 sgt->sgl->offset);
+		goto fail_unmap;
+	}
+
+	/*
+	 * Check number of pages that imported buffer has. The page count is
+	 * widened before shifting so that large buffers cannot overflow int.
+	 */
+	if (attach->dmabuf->size !=
+	    (size_t)gntdev_dmabuf->nr_pages << PAGE_SHIFT) {
+		ret = ERR_PTR(-EINVAL);
+		pr_debug("DMA buffer has %zu bytes, user-space expects %d pages\n",
+			 attach->dmabuf->size, gntdev_dmabuf->nr_pages);
+		goto fail_unmap;
+	}
+
+	gntdev_dmabuf->u.imp.sgt = sgt;
+
+	/*
+	 * Now convert sgt to array of pages and check for page validity.
+	 * Walking the CPU pages of a mapped table requires orig_nents: nents
+	 * is the number of DMA segments and may be smaller after merging.
+	 */
+	i = 0;
+	for_each_sg_page(sgt->sgl, &sg_iter, sgt->orig_nents, 0) {
+		struct page *page = sg_page_iter_page(&sg_iter);
+
+		/* Never write past the @count entries of the pages array. */
+		if (i >= count) {
+			ret = ERR_PTR(-EINVAL);
+			goto fail_unmap;
+		}
+
+		/*
+		 * Check if page is valid: this can happen if we are given
+		 * a page from VRAM or other resources which are not backed
+		 * by a struct page.
+		 */
+		if (!pfn_valid(page_to_pfn(page))) {
+			ret = ERR_PTR(-EINVAL);
+			goto fail_unmap;
+		}
+
+		gntdev_dmabuf->pages[i++] = page;
+	}
+
+	/* A short table would leave NULL pages to be granted below. */
+	if (i != count) {
+		ret = ERR_PTR(-EINVAL);
+		goto fail_unmap;
+	}
+
+	/* ERR_PTR(0) is NULL, so IS_ERR() is true only for a real failure. */
+	ret = ERR_PTR(dmabuf_imp_grant_foreign_access(gntdev_dmabuf->pages,
+						      gntdev_dmabuf->u.imp.refs,
+						      count, domid));
+	if (IS_ERR(ret))
+		goto fail_end_access;
+
+	pr_debug("Imported DMA buffer with fd %d\n", fd);
+
+	mutex_lock(&priv->lock);
+	list_add(&gntdev_dmabuf->next, &priv->imp_list);
+	mutex_unlock(&priv->lock);
+
+	return gntdev_dmabuf;
+
+fail_end_access:
+	dmabuf_imp_end_foreign_access(gntdev_dmabuf->u.imp.refs, count);
+fail_unmap:
+	dma_buf_unmap_attachment(attach, sgt, DMA_BIDIRECTIONAL);
+fail_detach:
+	dma_buf_detach(dma_buf, attach);
+fail_free_obj:
+	dmabuf_imp_free_storage(gntdev_dmabuf);
+fail_put:
+	dma_buf_put(dma_buf);
+	return ret;
+}
+
+/*
+ * Look up an imported dma-buf by the file descriptor it was imported from
+ * and unlink it from the import list.
+ *
+ * Returns the unlinked entry or ERR_PTR(-ENOENT) if @fd is not on the list.
+ */
+static struct gntdev_dmabuf *
+dmabuf_imp_find_unlink(struct gntdev_dmabuf_priv *priv, int fd)
+{
+	struct gntdev_dmabuf *found = ERR_PTR(-ENOENT);
+	struct gntdev_dmabuf *cur;
+
+	mutex_lock(&priv->lock);
+	/* The walk stops right after the deletion, so no _safe iterator. */
+	list_for_each_entry(cur, &priv->imp_list, next) {
+		if (cur->fd != fd)
+			continue;
+
+		pr_debug("Found gntdev_dmabuf in the import list\n");
+		found = cur;
+		list_del(&cur->next);
+		break;
+	}
+	mutex_unlock(&priv->lock);
+
+	return found;
+}
+
+/*
+ * Release the dma-buf previously imported from descriptor @fd: revoke the
+ * grants, unmap and detach the attachment, drop the dma-buf reference and
+ * free the import descriptor.
+ *
+ * Returns 0 on success or -ENOENT if @fd does not name an imported buffer.
+ */
+static int dmabuf_imp_release(struct gntdev_dmabuf_priv *priv, u32 fd)
+{
+	struct gntdev_dmabuf *imported;
+	struct dma_buf_attachment *attach;
+	struct dma_buf *dma_buf;
+	struct sg_table *sgt;
+
+	imported = dmabuf_imp_find_unlink(priv, fd);
+	if (IS_ERR(imported))
+		return PTR_ERR(imported);
+
+	pr_debug("Releasing DMA buffer with fd %d\n", fd);
+
+	dmabuf_imp_end_foreign_access(imported->u.imp.refs,
+				      imported->nr_pages);
+
+	attach = imported->u.imp.attach;
+	sgt = imported->u.imp.sgt;
+	if (sgt)
+		dma_buf_unmap_attachment(attach, sgt, DMA_BIDIRECTIONAL);
+
+	/* Remember the buffer: the attachment is gone after the detach. */
+	dma_buf = attach->dmabuf;
+	dma_buf_detach(dma_buf, attach);
+	dma_buf_put(dma_buf);
+
+	dmabuf_imp_free_storage(imported);
+	return 0;
+}
+
+/* DMA buffer IOCTL support. */
+
+/*
+ * ioctl handler: export the grant references supplied by user space as a
+ * dma-buf and report its file descriptor back through @u.
+ *
+ * Returns 0 on success, -EINVAL when PTE modification is in use or the
+ * request holds no references, -EFAULT when user memory cannot be accessed,
+ * -ENOMEM when the temporary reference array cannot be allocated, or the
+ * error reported by dmabuf_exp_from_refs().
+ */
+long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv, int use_ptemod,
+				       struct ioctl_gntdev_dmabuf_exp_from_refs __user *u)
+{
+	struct ioctl_gntdev_dmabuf_exp_from_refs op;
+	u32 *refs;
+	long ret;
+
+	if (use_ptemod) {
+		pr_debug("Cannot provide dma-buf: use_ptemode %d\n",
+			 use_ptemod);
+		return -EINVAL;
+	}
+
+	if (copy_from_user(&op, u, sizeof(op)))
+		return -EFAULT;
+
+	if (unlikely(op.count <= 0))
+		return -EINVAL;
+
+	refs = kcalloc(op.count, sizeof(*refs), GFP_KERNEL);
+	if (!refs)
+		return -ENOMEM;
+
+	/* The reference array follows the fixed part of the request. */
+	if (copy_from_user(refs, u->refs, sizeof(*refs) * op.count)) {
+		ret = -EFAULT;
+		goto out_free;
+	}
+
+	ret = dmabuf_exp_from_refs(priv, op.flags, op.count,
+				   op.domid, refs, &op.fd);
+	if (!ret && copy_to_user(u, &op, sizeof(op)))
+		ret = -EFAULT;
+
+out_free:
+	kfree(refs);
+	return ret;
+}
+
+/*
+ * ioctl handler: copy the request from user space and pass its descriptor
+ * and timeout on to dmabuf_exp_wait_released().
+ *
+ * Returns -EFAULT if the request cannot be read, otherwise whatever
+ * dmabuf_exp_wait_released() returns.
+ */
+long gntdev_ioctl_dmabuf_exp_wait_released(struct gntdev_priv *priv,
+					   struct ioctl_gntdev_dmabuf_exp_wait_released __user *u)
+{
+	struct ioctl_gntdev_dmabuf_exp_wait_released op;
+
+	if (copy_from_user(&op, u, sizeof(op)))
+		return -EFAULT;
+
+	return dmabuf_exp_wait_released(priv->dmabuf_priv, op.fd,
+					op.wait_to_ms);
+}
+
+/*
+ * ioctl handler: import the dma-buf named by the request and copy the grant
+ * references created for its pages to the user-supplied array.
+ *
+ * Returns 0 on success, -EFAULT when user memory cannot be accessed (the
+ * import is released again if the references cannot be copied out),
+ * -EINVAL for an empty request, or the error from dmabuf_imp_to_refs().
+ */
+long gntdev_ioctl_dmabuf_imp_to_refs(struct gntdev_priv *priv,
+				     struct ioctl_gntdev_dmabuf_imp_to_refs __user *u)
+{
+	struct ioctl_gntdev_dmabuf_imp_to_refs op;
+	struct gntdev_dmabuf *imported;
+
+	if (copy_from_user(&op, u, sizeof(op)))
+		return -EFAULT;
+
+	if (unlikely(op.count <= 0))
+		return -EINVAL;
+
+	imported = dmabuf_imp_to_refs(priv->dmabuf_priv,
+				      priv->dma_dev, op.fd,
+				      op.count, op.domid);
+	if (IS_ERR(imported))
+		return PTR_ERR(imported);
+
+	if (!copy_to_user(u->refs, imported->u.imp.refs,
+			  sizeof(*u->refs) * op.count))
+		return 0;
+
+	/* User space never saw the references, so undo the import. */
+	dmabuf_imp_release(priv->dmabuf_priv, op.fd);
+	return -EFAULT;
+}
+
+/*
+ * ioctl handler: release the imported dma-buf identified by the descriptor
+ * in the user request.
+ *
+ * Returns -EFAULT if the request cannot be read, otherwise the result of
+ * dmabuf_imp_release().
+ */
+long gntdev_ioctl_dmabuf_imp_release(struct gntdev_priv *priv,
+				     struct ioctl_gntdev_dmabuf_imp_release __user *u)
+{
+	struct ioctl_gntdev_dmabuf_imp_release op;
+
+	if (copy_from_user(&op, u, sizeof(op)))
+		return -EFAULT;
+
+	return dmabuf_imp_release(priv->dmabuf_priv, op.fd);
+}
+
+/*
+ * Allocate the dma-buf bookkeeping for one open gntdev file: a lock, the
+ * export, export-wait and import lists (all empty), and a pointer to @filp.
+ *
+ * Returns the new structure or ERR_PTR(-ENOMEM).
+ */
+struct gntdev_dmabuf_priv *gntdev_dmabuf_init(struct file *filp)
+{
+	struct gntdev_dmabuf_priv *dmabuf_priv;
+
+	dmabuf_priv = kzalloc(sizeof(*dmabuf_priv), GFP_KERNEL);
+	if (!dmabuf_priv)
+		return ERR_PTR(-ENOMEM);
+
+	/* Only the pointer is stored; no file reference is taken here. */
+	dmabuf_priv->filp = filp;
+
+	mutex_init(&dmabuf_priv->lock);
+	INIT_LIST_HEAD(&dmabuf_priv->exp_list);
+	INIT_LIST_HEAD(&dmabuf_priv->exp_wait_list);
+	INIT_LIST_HEAD(&dmabuf_priv->imp_list);
+
+	return dmabuf_priv;
+}
+
+/*
+ * Free the structure allocated by gntdev_dmabuf_init(). Only the structure
+ * itself is freed; entries that may still be on its lists are not touched.
+ */
+void gntdev_dmabuf_fini(struct gntdev_dmabuf_priv *priv)
+{
+	kfree(priv);
+}
diff --git a/drivers/xen/gntdev-dmabuf.h b/drivers/xen/gntdev-dmabuf.h
new file mode 100644
index 000000000..3d9b9cf9d
--- /dev/null
+++ b/drivers/xen/gntdev-dmabuf.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Xen dma-buf functionality for gntdev.
+ *
+ * Copyright (c) 2018 Oleksandr Andrushchenko, EPAM Systems Inc.
+ */
+
+#ifndef _GNTDEV_DMABUF_H
+#define _GNTDEV_DMABUF_H
+
+#include <xen/gntdev.h>
+
+/* Both are opaque here; only pointers to them cross this interface. */
+struct gntdev_dmabuf_priv;
+struct gntdev_priv;
+
+/*
+ * Allocate the per-open-file dma-buf state for @filp.
+ * Returns the new state or an ERR_PTR() on allocation failure.
+ */
+struct gntdev_dmabuf_priv *gntdev_dmabuf_init(struct file *filp);
+
+/* Free the state returned by gntdev_dmabuf_init(). */
+void gntdev_dmabuf_fini(struct gntdev_dmabuf_priv *priv);
+
+/*
+ * ioctl handler: export the grant references passed in @u as a dma-buf.
+ * Rejected with -EINVAL when @use_ptemod is non-zero.
+ */
+long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv, int use_ptemod,
+				       struct ioctl_gntdev_dmabuf_exp_from_refs __user *u);
+
+/*
+ * ioctl handler: wait, bounded by the timeout carried in @u, on the exported
+ * dma-buf named by the descriptor in @u.
+ */
+long gntdev_ioctl_dmabuf_exp_wait_released(struct gntdev_priv *priv,
+					   struct ioctl_gntdev_dmabuf_exp_wait_released __user *u);
+
+/*
+ * ioctl handler: import the dma-buf named in @u and return the grant
+ * references created for its pages through @u.
+ */
+long gntdev_ioctl_dmabuf_imp_to_refs(struct gntdev_priv *priv,
+				     struct ioctl_gntdev_dmabuf_imp_to_refs __user *u);
+
+/* ioctl handler: release the imported dma-buf named by the descriptor in @u. */
+long gntdev_ioctl_dmabuf_imp_release(struct gntdev_priv *priv,
+				     struct ioctl_gntdev_dmabuf_imp_release __user *u);
+
+#endif
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
new file mode 100644
index 000000000..e519063e4
--- /dev/null
+++ b/drivers/xen/gntdev.c
@@ -0,0 +1,1241 @@
+/******************************************************************************
+ * gntdev.c
+ *
+ * Device for accessing (in user-space) pages that have been granted by other
+ * domains.
+ *
+ * Copyright (c) 2006-2007, D G Murray.
+ *           (c) 2009 Gerd Hoffmann <kraxel@redhat.com>
+ *           (c) 2018 Oleksandr Andrushchenko, EPAM Systems Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/refcount.h>
+#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
+#include <linux/of_device.h>
+#endif
+
+#include <xen/xen.h>
+#include <xen/grant_table.h>
+#include <xen/balloon.h>
+#include <xen/gntdev.h>
+#include <xen/events.h>
+#include <xen/page.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include "gntdev-common.h"
+#ifdef CONFIG_XEN_GNTDEV_DMABUF
+#include "gntdev-dmabuf.h"
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
+	      "Gerd Hoffmann <kraxel@redhat.com>");
+MODULE_DESCRIPTION("User-space granted page access driver");
+
+/*
+ * Upper bound for pages_mapped, checked by gntdev_account_mapped_pages().
+ * Writable at run time through the module parameter (mode 0644).
+ */
+static int limit = 1024*1024;
+module_param(limit, int, 0644);
+MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by "
+		"the gntdev device");
+
+/* Total page count of all grant maps currently accounted for. */
+static atomic_t pages_mapped = ATOMIC_INIT(0);
+
+/*
+ * Non-zero selects the code paths that operate on user PTEs and use the
+ * kmap_ops/kunmap_ops arrays and the MMU notifier. Zero-initialised here;
+ * the assignment is not part of this section of the file.
+ */
+static int use_ptemod;
+/* Maps are parked on priv->freeable_maps exactly when use_ptemod is set. */
+#define populate_freeable_maps use_ptemod
+
+/* Forward declaration: used by gntdev_put_map() before its definition. */
+static int unmap_grant_pages(struct gntdev_grant_map *map,
+			     int offset, int pages);
+
+/* Forward declaration: gntdev_open() needs the misc device's struct device. */
+static struct miscdevice gntdev_miscdev;
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Charge @count pages to the global mapped-page counter.
+ *
+ * Returns true if the new total exceeds the limit. The pages stay charged
+ * in that case as well; they are uncharged when the map is finally dropped
+ * in gntdev_put_map().
+ */
+bool gntdev_account_mapped_pages(int count)
+{
+	int total = atomic_add_return(count, &pages_mapped);
+
+	return total > limit;
+}
+
+/*
+ * Debug helper: list index and count of every map on priv->maps, appending
+ * @text to the entry whose index equals @text_index.
+ *
+ * The body is compiled only when DEBUG is defined; otherwise this is an
+ * empty function.
+ */
+static void gntdev_print_maps(struct gntdev_priv *priv,
+			      char *text, int text_index)
+{
+#ifdef DEBUG
+	struct gntdev_grant_map *map;
+
+	pr_debug("%s: maps list (priv %p)\n", __func__, priv);
+	list_for_each_entry(map, &priv->maps, next)
+		pr_debug("  index %2d, count %2d %s\n",
+		       map->index, map->count,
+		       map->index == text_index && text ? text : "");
+#endif
+}
+
+/*
+ * Free a grant map: its backing pages, all per-page arrays and the map
+ * itself. A NULL @map is accepted and ignored. Also used by
+ * gntdev_alloc_map() to tear down a partially constructed map.
+ */
+static void gntdev_free_map(struct gntdev_grant_map *map)
+{
+	if (map == NULL)
+		return;
+
+#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
+	/* A set dma_vaddr means the pages came from gnttab_dma_alloc_pages(). */
+	if (map->dma_vaddr) {
+		struct gnttab_dma_alloc_args args;
+
+		args.dev = map->dma_dev;
+		args.coherent = !!(map->dma_flags & GNTDEV_DMA_FLAG_COHERENT);
+		args.nr_pages = map->count;
+		args.pages = map->pages;
+		args.frames = map->frames;
+		args.vaddr = map->dma_vaddr;
+		args.dev_bus_addr = map->dma_bus_addr;
+
+		gnttab_dma_free_pages(&args);
+	} else
+#endif
+	/* With DMA allocation configured this is the else branch from above. */
+	if (map->pages)
+		gnttab_free_pages(map->count, map->pages);
+
+#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
+	kfree(map->frames);
+#endif
+	kfree(map->pages);
+	kfree(map->grants);
+	kfree(map->map_ops);
+	kfree(map->unmap_ops);
+	kfree(map->kmap_ops);
+	kfree(map->kunmap_ops);
+	kfree(map);
+}
+
+/*
+ * Allocate a grant map for @count pages.
+ *
+ * Allocates the map, its per-page arrays and the backing pages. When DMA
+ * allocation is configured and @dma_flags requests a write-combine or
+ * coherent buffer, the pages are obtained with gnttab_dma_alloc_pages() on
+ * priv->dma_dev, otherwise with gnttab_alloc_pages(). All operation handles
+ * start out as -1 (not mapped) and the map is returned holding a single
+ * user reference.
+ *
+ * Returns the new map or NULL on any allocation failure.
+ */
+struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count,
+					  int dma_flags)
+{
+	struct gntdev_grant_map *add;
+	int i;
+
+	add = kzalloc(sizeof(*add), GFP_KERNEL);
+	if (NULL == add)
+		return NULL;
+
+	/* Allocate everything first, then check all results in one go. */
+	add->grants    = kcalloc(count, sizeof(add->grants[0]), GFP_KERNEL);
+	add->map_ops   = kcalloc(count, sizeof(add->map_ops[0]), GFP_KERNEL);
+	add->unmap_ops = kcalloc(count, sizeof(add->unmap_ops[0]), GFP_KERNEL);
+	add->kmap_ops  = kcalloc(count, sizeof(add->kmap_ops[0]), GFP_KERNEL);
+	add->kunmap_ops = kcalloc(count, sizeof(add->kunmap_ops[0]), GFP_KERNEL);
+	add->pages     = kcalloc(count, sizeof(add->pages[0]), GFP_KERNEL);
+	if (NULL == add->grants    ||
+	    NULL == add->map_ops   ||
+	    NULL == add->unmap_ops ||
+	    NULL == add->kmap_ops  ||
+	    NULL == add->kunmap_ops ||
+	    NULL == add->pages)
+		goto err;
+
+#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
+	add->dma_flags = dma_flags;
+
+	/*
+	 * Check if this mapping is requested to be backed
+	 * by a DMA buffer.
+	 */
+	if (dma_flags & (GNTDEV_DMA_FLAG_WC | GNTDEV_DMA_FLAG_COHERENT)) {
+		struct gnttab_dma_alloc_args args;
+
+		add->frames = kcalloc(count, sizeof(add->frames[0]),
+				      GFP_KERNEL);
+		if (!add->frames)
+			goto err;
+
+		/* Remember the device, so we can free DMA memory. */
+		add->dma_dev = priv->dma_dev;
+
+		args.dev = priv->dma_dev;
+		args.coherent = !!(dma_flags & GNTDEV_DMA_FLAG_COHERENT);
+		args.nr_pages = count;
+		args.pages = add->pages;
+		args.frames = add->frames;
+
+		if (gnttab_dma_alloc_pages(&args))
+			goto err;
+
+		add->dma_vaddr = args.vaddr;
+		add->dma_bus_addr = args.dev_bus_addr;
+	} else
+#endif
+	/* With DMA allocation configured this is the else branch from above. */
+	if (gnttab_alloc_pages(count, add->pages))
+		goto err;
+
+	/* -1 marks a page as not mapped in each of the operation arrays. */
+	for (i = 0; i < count; i++) {
+		add->map_ops[i].handle = -1;
+		add->unmap_ops[i].handle = -1;
+		add->kmap_ops[i].handle = -1;
+		add->kunmap_ops[i].handle = -1;
+	}
+
+	add->index = 0;
+	add->count = count;
+	refcount_set(&add->users, 1);
+
+	return add;
+
+err:
+	/* add->count is still 0 here, so no backing pages are released. */
+	gntdev_free_map(add);
+	return NULL;
+}
+
+/*
+ * Insert @add into priv->maps and assign its index.
+ *
+ * Walks the list from the front, moving the candidate index past each
+ * existing map, and inserts @add in front of the first map whose index lies
+ * beyond the end of the candidate range; if there is none, @add is appended.
+ * The callers visible in this file hold priv->lock around the call.
+ */
+void gntdev_add_map(struct gntdev_priv *priv, struct gntdev_grant_map *add)
+{
+	struct gntdev_grant_map *map;
+
+	list_for_each_entry(map, &priv->maps, next) {
+		if (add->index + add->count < map->index) {
+			/* list_add_tail() on an entry inserts just before it. */
+			list_add_tail(&add->next, &map->next);
+			goto done;
+		}
+		add->index = map->index + map->count;
+	}
+	list_add_tail(&add->next, &priv->maps);
+
+done:
+	gntdev_print_maps(priv, "[new]", add->index);
+}
+
+/*
+ * Find the map on priv->maps that starts at @index. A non-zero @count must
+ * also equal the map's page count; a @count of zero matches any size.
+ *
+ * Returns the map or NULL if there is no match.
+ */
+static struct gntdev_grant_map *gntdev_find_map_index(struct gntdev_priv *priv,
+						      int index, int count)
+{
+	struct gntdev_grant_map *candidate;
+
+	list_for_each_entry(candidate, &priv->maps, next) {
+		if (candidate->index == index &&
+		    (!count || candidate->count == count))
+			return candidate;
+	}
+
+	return NULL;
+}
+
+/*
+ * Drop one user reference on @map and destroy the map when it was the last.
+ *
+ * Destruction uncharges the pages from pages_mapped, sends the unmap
+ * notification event if one was requested, unlinks the map from its list
+ * (only when maps are kept on freeable lists and @priv is given), unmaps the
+ * grants when PTE modification is not in use, and frees the map.
+ *
+ * @priv may be NULL when the caller has already unlinked the map or never
+ * linked it; a NULL @map is ignored.
+ */
+void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
+{
+	if (!map)
+		return;
+
+	if (!refcount_dec_and_test(&map->users))
+		return;
+
+	atomic_sub(map->count, &pages_mapped);
+
+	if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
+		notify_remote_via_evtchn(map->notify.event);
+		evtchn_put(map->notify.event);
+	}
+
+	if (populate_freeable_maps && priv) {
+		mutex_lock(&priv->lock);
+		list_del(&map->next);
+		mutex_unlock(&priv->lock);
+	}
+
+	if (map->pages && !use_ptemod)
+		unmap_grant_pages(map, 0, map->count);
+	gntdev_free_map(map);
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Per-PTE callback (presumably for apply_to_page_range(); the caller is not
+ * part of this section): prepare the map and unmap operations for the page
+ * that @addr selects within the map's vma.
+ *
+ * @data is the struct gntdev_grant_map. The operations target the machine
+ * address of the PTE itself (GNTMAP_contains_pte) rather than a virtual
+ * address. Always returns 0.
+ */
+static int find_grant_ptes(pte_t *pte, pgtable_t token,
+		unsigned long addr, void *data)
+{
+	struct gntdev_grant_map *map = data;
+	unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
+	int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte;
+	u64 pte_maddr;
+
+	BUG_ON(pgnr >= map->count);
+	pte_maddr = arbitrary_virt_to_machine(pte).maddr;
+
+	/*
+	 * Set the PTE as special to force get_user_pages_fast() fall
+	 * back to the slow path.  If this is not supported as part of
+	 * the grant map, it will be done afterwards.
+	 */
+	if (xen_feature(XENFEAT_gnttab_map_avail_bits))
+		flags |= (1 << _GNTMAP_guest_avail0);
+
+	gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags,
+			  map->grants[pgnr].ref,
+			  map->grants[pgnr].domid);
+	gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, flags,
+			    -1 /* handle */);
+	return 0;
+}
+
+#ifdef CONFIG_X86
+/*
+ * Per-PTE callback (x86 only): rewrite the PTE at @addr in the current
+ * process' address space with its "special" bit set. @token and @data are
+ * unused. Always returns 0.
+ */
+static int set_grant_ptes_as_special(pte_t *pte, pgtable_t token,
+				     unsigned long addr, void *data)
+{
+	set_pte_at(current->mm, addr, pte, pte_mkspecial(*pte));
+	return 0;
+}
+#endif
+
+/*
+ * Map all grants of @map.
+ *
+ * Without PTE modification the map/unmap operations are built here, aimed
+ * at the kernel addresses of the backing pages, and a map whose first
+ * handle is already set is treated as mapped (returns 0 immediately). With
+ * PTE modification the user-side map_ops are expected to have been prepared
+ * already (see find_grant_ptes()) and only the additional kernel-side
+ * kmap_ops/kunmap_ops are built here.
+ *
+ * After the mapping call the handles of successfully mapped pages are
+ * copied into the matching unmap operations so they can be unmapped later.
+ *
+ * Returns 0 on success, the error from gnttab_map_refs(), or -EINVAL if any
+ * single operation did not report GNTST_okay.
+ */
+int gntdev_map_grant_pages(struct gntdev_grant_map *map)
+{
+	int i, err = 0;
+
+	if (!use_ptemod) {
+		/* Note: it could already be mapped */
+		if (map->map_ops[0].handle != -1)
+			return 0;
+		for (i = 0; i < map->count; i++) {
+			unsigned long addr = (unsigned long)
+				pfn_to_kaddr(page_to_pfn(map->pages[i]));
+			gnttab_set_map_op(&map->map_ops[i], addr, map->flags,
+				map->grants[i].ref,
+				map->grants[i].domid);
+			gnttab_set_unmap_op(&map->unmap_ops[i], addr,
+				map->flags, -1 /* handle */);
+		}
+	} else {
+		/*
+		 * Setup the map_ops corresponding to the pte entries pointing
+		 * to the kernel linear addresses of the struct pages.
+		 * These ptes are completely different from the user ptes dealt
+		 * with find_grant_ptes.
+		 * Note that GNTMAP_device_map isn't needed here: The
+		 * dev_bus_addr output field gets consumed only from ->map_ops,
+		 * and by not requesting it when mapping we also avoid needing
+		 * to mirror dev_bus_addr into ->unmap_ops (and holding an extra
+		 * reference to the page in the hypervisor).
+		 */
+		unsigned int flags = (map->flags & ~GNTMAP_device_map) |
+				     GNTMAP_host_map;
+
+		for (i = 0; i < map->count; i++) {
+			unsigned long address = (unsigned long)
+				pfn_to_kaddr(page_to_pfn(map->pages[i]));
+			BUG_ON(PageHighMem(map->pages[i]));
+
+			gnttab_set_map_op(&map->kmap_ops[i], address, flags,
+				map->grants[i].ref,
+				map->grants[i].domid);
+			gnttab_set_unmap_op(&map->kunmap_ops[i], address,
+				flags, -1);
+		}
+	}
+
+	pr_debug("map %d+%d\n", map->index, map->count);
+	err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL,
+			map->pages, map->count);
+
+	/* Record handles even after an error so mapped pages can be undone. */
+	for (i = 0; i < map->count; i++) {
+		if (map->map_ops[i].status == GNTST_okay)
+			map->unmap_ops[i].handle = map->map_ops[i].handle;
+		else if (!err)
+			err = -EINVAL;
+
+		if (map->flags & GNTMAP_device_map)
+			map->unmap_ops[i].dev_bus_addr = map->map_ops[i].dev_bus_addr;
+
+		if (use_ptemod) {
+			if (map->kmap_ops[i].status == GNTST_okay)
+				map->kunmap_ops[i].handle = map->kmap_ops[i].handle;
+			else if (!err)
+				err = -EINVAL;
+		}
+	}
+	return err;
+}
+
+/*
+ * Unmap the @pages grants of @map starting at page @offset. The caller
+ * passes a range in which every page is currently mapped (see
+ * unmap_grant_pages()).
+ *
+ * If a clear-byte notification is pending and its byte lies inside the
+ * range, the byte is zeroed before unmapping and the request is marked as
+ * done. Afterwards the unmap handles of the range are reset to -1.
+ *
+ * Returns 0 on success, the error from gnttab_unmap_refs_sync(), or -EINVAL
+ * if any unmap operation reported a non-zero status.
+ */
+static int __unmap_grant_pages(struct gntdev_grant_map *map, int offset,
+			       int pages)
+{
+	int i, err = 0;
+	struct gntab_unmap_queue_data unmap_data;
+
+	if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) {
+		int pgno = (map->notify.addr >> PAGE_SHIFT);
+		if (pgno >= offset && pgno < offset + pages) {
+			/* No need for kmap, pages are in lowmem */
+			uint8_t *tmp = pfn_to_kaddr(page_to_pfn(map->pages[pgno]));
+			tmp[map->notify.addr & (PAGE_SIZE-1)] = 0;
+			map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE;
+		}
+	}
+
+	unmap_data.unmap_ops = map->unmap_ops + offset;
+	unmap_data.kunmap_ops = use_ptemod ? map->kunmap_ops + offset : NULL;
+	unmap_data.pages = map->pages + offset;
+	unmap_data.count = pages;
+
+	err = gnttab_unmap_refs_sync(&unmap_data);
+	if (err)
+		return err;
+
+	for (i = 0; i < pages; i++) {
+		if (map->unmap_ops[offset+i].status)
+			err = -EINVAL;
+		pr_debug("unmap handle=%d st=%d\n",
+			map->unmap_ops[offset+i].handle,
+			map->unmap_ops[offset+i].status);
+		map->unmap_ops[offset+i].handle = -1;
+	}
+	return err;
+}
+
+/*
+ * Unmap whatever is still mapped within pages [@offset, @offset + @pages)
+ * of @map.
+ *
+ * The range is split into runs of consecutively mapped pages (handle != -1)
+ * and each run is passed to __unmap_grant_pages(). Processing stops at the
+ * first error, which is returned; otherwise 0.
+ */
+static int unmap_grant_pages(struct gntdev_grant_map *map, int offset,
+			     int pages)
+{
+	int range, err = 0;
+
+	pr_debug("unmap %d+%d [%d+%d]\n", map->index, map->count, offset, pages);
+
+	/* It is possible the requested range will have a "hole" where we
+	 * already unmapped some of the grants. Only unmap valid ranges.
+	 */
+	while (pages && !err) {
+		/* Skip over pages that are not mapped. */
+		while (pages && map->unmap_ops[offset].handle == -1) {
+			offset++;
+			pages--;
+		}
+		/* Measure the run of mapped pages that starts at offset. */
+		range = 0;
+		while (range < pages) {
+			if (map->unmap_ops[offset+range].handle == -1)
+				break;
+			range++;
+		}
+		err = __unmap_grant_pages(map, offset, range);
+		offset += range;
+		pages -= range;
+	}
+
+	return err;
+}
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * vm_ops .open callback: take an additional user reference on the grant
+ * map attached to @vma; gntdev_vma_close() drops it again.
+ */
+static void gntdev_vma_open(struct vm_area_struct *vma)
+{
+	struct gntdev_grant_map *map = vma->vm_private_data;
+
+	pr_debug("gntdev_vma_open %p\n", vma);
+	refcount_inc(&map->users);
+}
+
+/*
+ * vm_ops .close callback: detach the grant map from @vma and drop the
+ * reference the vma held on it.
+ */
+static void gntdev_vma_close(struct vm_area_struct *vma)
+{
+	struct gntdev_grant_map *map = vma->vm_private_data;
+	struct file *file = vma->vm_file;
+	struct gntdev_priv *priv = file->private_data;
+
+	pr_debug("gntdev_vma_close %p\n", vma);
+	if (use_ptemod) {
+		/* It is possible that an mmu notifier could be running
+		 * concurrently, so take priv->lock to ensure that the vma
+		 * won't vanish during the unmap_grant_pages call, since we
+		 * will wait here until that completes. Such a concurrent call
+		 * will not do any unmapping, since that has been done prior to
+		 * closing the vma, but it may still iterate the unmap_ops list.
+		 */
+		mutex_lock(&priv->lock);
+		map->vma = NULL;
+		mutex_unlock(&priv->lock);
+	}
+	vma->vm_private_data = NULL;
+	gntdev_put_map(priv, map);
+}
+
+/*
+ * vm_ops .find_special_page callback: return the backing page of the grant
+ * map for user address @addr, indexed relative to map->pages_vm_start.
+ * No range check is made here; @addr is assumed to lie within the mapping.
+ */
+static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma,
+						 unsigned long addr)
+{
+	struct gntdev_grant_map *map = vma->vm_private_data;
+
+	return map->pages[(addr - map->pages_vm_start) >> PAGE_SHIFT];
+}
+
+/*
+ * VMA operations for gntdev mappings. The address of this table also serves
+ * to recognise gntdev vmas (see gntdev_ioctl_get_offset_for_vaddr()).
+ */
+static const struct vm_operations_struct gntdev_vmops = {
+	.open = gntdev_vma_open,
+	.close = gntdev_vma_close,
+	.find_special_page = gntdev_vma_find_special_page,
+};
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Return true if @map is currently attached to a vma and that vma overlaps
+ * the address range [@start, @end).
+ */
+static bool in_range(struct gntdev_grant_map *map,
+			      unsigned long start, unsigned long end)
+{
+	struct vm_area_struct *vma = map->vma;
+
+	return vma && vma->vm_start < end && vma->vm_end > start;
+}
+
+/*
+ * Unmap the part of @map whose vma overlaps [@start, @end).
+ *
+ * Returns 0 if there is no overlap or after unmapping, and -EAGAIN if there
+ * is an overlap but the caller is not allowed to block. A failing unmap
+ * only triggers a warning and is not reported to the caller.
+ */
+static int unmap_if_in_range(struct gntdev_grant_map *map,
+			      unsigned long start, unsigned long end,
+			      bool blockable)
+{
+	unsigned long mstart, mend;
+	int err;
+
+	if (!in_range(map, start, end))
+		return 0;
+
+	if (!blockable)
+		return -EAGAIN;
+
+	/* Clamp the invalidated range to the vma of this map. */
+	mstart = max(start, map->vma->vm_start);
+	mend   = min(end,   map->vma->vm_end);
+	pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
+			map->index, map->count,
+			map->vma->vm_start, map->vma->vm_end,
+			start, end, mstart, mend);
+	err = unmap_grant_pages(map,
+				(mstart - map->vma->vm_start) >> PAGE_SHIFT,
+				(mend - mstart) >> PAGE_SHIFT);
+	WARN_ON(err);
+
+	return 0;
+}
+
+/*
+ * MMU notifier .invalidate_range_start callback: unmap the grants of every
+ * map, on both the active and the freeable list, that overlaps
+ * [@start, @end).
+ *
+ * When @blockable is false the lock is only tried, and -EAGAIN is returned
+ * if it is contended or if any map overlaps the range. Returns 0 otherwise.
+ */
+static int mn_invl_range_start(struct mmu_notifier *mn,
+				struct mm_struct *mm,
+				unsigned long start, unsigned long end,
+				bool blockable)
+{
+	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+	struct gntdev_grant_map *map;
+	int ret = 0;
+
+	if (blockable)
+		mutex_lock(&priv->lock);
+	else if (!mutex_trylock(&priv->lock))
+		return -EAGAIN;
+
+	list_for_each_entry(map, &priv->maps, next) {
+		ret = unmap_if_in_range(map, start, end, blockable);
+		if (ret)
+			goto out_unlock;
+	}
+	list_for_each_entry(map, &priv->freeable_maps, next) {
+		ret = unmap_if_in_range(map, start, end, blockable);
+		if (ret)
+			goto out_unlock;
+	}
+
+out_unlock:
+	mutex_unlock(&priv->lock);
+
+	return ret;
+}
+
+/*
+ * MMU notifier .release callback: unmap all grants of every map that is
+ * still attached to a vma, on both the active and the freeable list.
+ * A failing unmap only triggers a warning.
+ */
+static void mn_release(struct mmu_notifier *mn,
+		       struct mm_struct *mm)
+{
+	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+	struct gntdev_grant_map *map;
+	int err;
+
+	mutex_lock(&priv->lock);
+	list_for_each_entry(map, &priv->maps, next) {
+		if (!map->vma)
+			continue;
+		pr_debug("map %d+%d (%lx %lx)\n",
+				map->index, map->count,
+				map->vma->vm_start, map->vma->vm_end);
+		err = unmap_grant_pages(map, /* offset */ 0, map->count);
+		WARN_ON(err);
+	}
+	/* Same treatment for maps that were unlinked but are still in use. */
+	list_for_each_entry(map, &priv->freeable_maps, next) {
+		if (!map->vma)
+			continue;
+		pr_debug("map %d+%d (%lx %lx)\n",
+				map->index, map->count,
+				map->vma->vm_start, map->vma->vm_end);
+		err = unmap_grant_pages(map, /* offset */ 0, map->count);
+		WARN_ON(err);
+	}
+	mutex_unlock(&priv->lock);
+}
+
+/* MMU notifier callbacks; registered in gntdev_open() when use_ptemod is set. */
+static const struct mmu_notifier_ops gntdev_mmu_ops = {
+	.release                = mn_release,
+	.invalidate_range_start = mn_invl_range_start,
+};
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * open() handler: allocate and initialise the per-file gntdev state.
+ *
+ * Sets up the map lists and the lock, the dma-buf state (when configured)
+ * and, when PTE modification is in use, registers an MMU notifier on the
+ * opener's address space.
+ *
+ * Returns 0 on success or a negative errno. Every failure path releases
+ * everything allocated so far, including the dma-buf state, which must not
+ * be leaked when a later step fails.
+ */
+static int gntdev_open(struct inode *inode, struct file *flip)
+{
+	struct gntdev_priv *priv;
+	int ret = 0;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&priv->maps);
+	INIT_LIST_HEAD(&priv->freeable_maps);
+	mutex_init(&priv->lock);
+
+#ifdef CONFIG_XEN_GNTDEV_DMABUF
+	priv->dmabuf_priv = gntdev_dmabuf_init(flip);
+	if (IS_ERR(priv->dmabuf_priv)) {
+		/* Nothing but priv itself exists yet. */
+		ret = PTR_ERR(priv->dmabuf_priv);
+		kfree(priv);
+		return ret;
+	}
+#endif
+
+	if (use_ptemod) {
+		priv->mm = get_task_mm(current);
+		if (!priv->mm) {
+			ret = -ENOMEM;
+			goto fail;
+		}
+		priv->mn.ops = &gntdev_mmu_ops;
+		ret = mmu_notifier_register(&priv->mn, priv->mm);
+		/* Drop the get_task_mm() reference whatever the outcome. */
+		mmput(priv->mm);
+		if (ret)
+			goto fail;
+	}
+
+	flip->private_data = priv;
+#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
+	priv->dma_dev = gntdev_miscdev.this_device;
+
+	/*
+	 * The device is not spawn from a device tree, so arch_setup_dma_ops
+	 * is not called, thus leaving the device with dummy DMA ops.
+	 * Fix this by calling of_dma_configure() with a NULL node to set
+	 * default DMA ops.
+	 */
+	of_dma_configure(priv->dma_dev, NULL, true);
+#endif
+	pr_debug("priv %p\n", priv);
+
+	return 0;
+
+fail:
+#ifdef CONFIG_XEN_GNTDEV_DMABUF
+	gntdev_dmabuf_fini(priv->dmabuf_priv);
+#endif
+	kfree(priv);
+	return ret;
+}
+
+/*
+ * release() handler: drop the list's reference on every remaining map,
+ * free the dma-buf state (when configured), unregister the MMU notifier
+ * (when PTE modification is in use) and free the per-file state.
+ * Always returns 0.
+ */
+static int gntdev_release(struct inode *inode, struct file *flip)
+{
+	struct gntdev_priv *priv = flip->private_data;
+	struct gntdev_grant_map *map;
+
+	pr_debug("priv %p\n", priv);
+
+	mutex_lock(&priv->lock);
+	while (!list_empty(&priv->maps)) {
+		map = list_entry(priv->maps.next,
+				 struct gntdev_grant_map, next);
+		list_del(&map->next);
+		/* NULL priv: unlinked above, and priv->lock is already held. */
+		gntdev_put_map(NULL /* already removed */, map);
+	}
+	WARN_ON(!list_empty(&priv->freeable_maps));
+	mutex_unlock(&priv->lock);
+
+#ifdef CONFIG_XEN_GNTDEV_DMABUF
+	gntdev_dmabuf_fini(priv->dmabuf_priv);
+#endif
+
+	if (use_ptemod)
+		mmu_notifier_unregister(&priv->mn, priv->mm);
+
+	kfree(priv);
+	return 0;
+}
+
+/*
+ * ioctl handler: create a grant map for the references passed by user space
+ * and report the offset at which it can be mmap()ed.
+ *
+ * The map is only created and linked into priv->maps here; nothing is
+ * mapped yet. Returns 0 on success, -EFAULT when user memory cannot be
+ * accessed, -EINVAL for an empty request, or -ENOMEM when allocation fails
+ * or the mapping limit would be exceeded.
+ */
+static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
+				       struct ioctl_gntdev_map_grant_ref __user *u)
+{
+	struct ioctl_gntdev_map_grant_ref op;
+	struct gntdev_grant_map *map;
+	int err;
+
+	if (copy_from_user(&op, u, sizeof(op)) != 0)
+		return -EFAULT;
+	pr_debug("priv %p, add %d\n", priv, op.count);
+	if (unlikely(op.count <= 0))
+		return -EINVAL;
+
+	err = -ENOMEM;
+	map = gntdev_alloc_map(priv, op.count, 0 /* This is not a dma-buf. */);
+	if (!map)
+		return err;
+
+	if (unlikely(gntdev_account_mapped_pages(op.count))) {
+		pr_debug("can't map: over limit\n");
+		/* Dropping the map also undoes the accounting. */
+		gntdev_put_map(NULL, map);
+		return err;
+	}
+
+	if (copy_from_user(map->grants, &u->refs,
+			   sizeof(map->grants[0]) * op.count) != 0) {
+		gntdev_put_map(NULL, map);
+		return -EFAULT;
+	}
+
+	mutex_lock(&priv->lock);
+	gntdev_add_map(priv, map);
+	/* The index is handed to user space as a byte offset. */
+	op.index = map->index << PAGE_SHIFT;
+	mutex_unlock(&priv->lock);
+
+	/* NOTE(review): on failure here the map stays linked in priv->maps. */
+	if (copy_to_user(u, &op, sizeof(op)) != 0)
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * IOCTL_GNTDEV_UNMAP_GRANT_REF: look up the map identified by the given
+ * offset and count, take it off the active list (parking it on
+ * freeable_maps when populate_freeable_maps is set) and hand it to
+ * gntdev_put_map().
+ *
+ * Returns 0 on success, -EFAULT if the request cannot be copied in and
+ * -ENOENT if no matching map exists.
+ */
+static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
+					 struct ioctl_gntdev_unmap_grant_ref __user *u)
+{
+	struct ioctl_gntdev_unmap_grant_ref op;
+	struct gntdev_grant_map *found;
+
+	if (copy_from_user(&op, u, sizeof(op)) != 0)
+		return -EFAULT;
+	pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count);
+
+	mutex_lock(&priv->lock);
+	found = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
+	if (!found) {
+		mutex_unlock(&priv->lock);
+		return -ENOENT;
+	}
+
+	list_del(&found->next);
+	if (populate_freeable_maps)
+		list_add_tail(&found->next, &priv->freeable_maps);
+	mutex_unlock(&priv->lock);
+
+	/* The put is done after the lock has been released. */
+	gntdev_put_map(priv, found);
+	return 0;
+}
+
+/*
+ * IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR: find the gntdev VMA containing (or
+ * following, per find_vma()) the given user address in the caller's
+ * address space and report the offset and page count of the map behind it.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy and -EINVAL when
+ * the address does not resolve to a gntdev VMA with a map attached.
+ */
+static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
+					      struct ioctl_gntdev_get_offset_for_vaddr __user *u)
+{
+	struct ioctl_gntdev_get_offset_for_vaddr op;
+	struct vm_area_struct *vma;
+	struct gntdev_grant_map *map;
+	int rv = -EINVAL;
+
+	if (copy_from_user(&op, u, sizeof(op)) != 0)
+		return -EFAULT;
+	pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr);
+
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, op.vaddr);
+	/* Only VMAs created by gntdev_mmap() carry gntdev_vmops. */
+	if (!vma || vma->vm_ops != &gntdev_vmops)
+		goto out_unlock;
+
+	map = vma->vm_private_data;
+	if (!map)
+		goto out_unlock;
+
+	/*
+	 * Widen the page index before shifting so that offsets of 2 GiB
+	 * and above are not wrapped in the narrower index type.
+	 */
+	op.offset = (uint64_t)map->index << PAGE_SHIFT;
+	op.count = map->count;
+	rv = 0;
+
+ out_unlock:
+	up_read(&current->mm->mmap_sem);
+
+	if (rv == 0 && copy_to_user(u, &op, sizeof(op)) != 0)
+		return -EFAULT;
+	return rv;
+}
+
+/*
+ * IOCTL_GNTDEV_SET_UNMAP_NOTIFY: record, on the map covering op.index,
+ * the action (clear a byte and/or send an event) to perform on unmap.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL for
+ * unknown action bits, an event channel that cannot be referenced, or a
+ * clear-byte request on a read-only map, and -ENOENT when no map covers
+ * op.index.
+ */
+static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
+{
+	struct ioctl_gntdev_unmap_notify op;
+	struct gntdev_grant_map *map;
+	int rc;
+	int out_flags;
+	unsigned int out_event;
+
+	if (copy_from_user(&op, u, sizeof(op)))
+		return -EFAULT;
+
+	if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT))
+		return -EINVAL;
+
+	/* We need to grab a reference to the event channel we are going to use
+	 * to send the notify before releasing the reference we may already have
+	 * (if someone has called this ioctl twice). This is required so that
+	 * it is possible to change the clear_byte part of the notification
+	 * without disturbing the event channel part, which may now be the last
+	 * reference to that event channel.
+	 */
+	if (op.action & UNMAP_NOTIFY_SEND_EVENT) {
+		if (evtchn_get(op.event_channel_port))
+			return -EINVAL;
+	}
+
+	/*
+	 * Until the new settings are stored in a map, the reference to drop
+	 * on exit is the one just taken above.
+	 */
+	out_flags = op.action;
+	out_event = op.event_channel_port;
+
+	mutex_lock(&priv->lock);
+
+	list_for_each_entry(map, &priv->maps, next) {
+		/*
+		 * Widen before shifting: doing the shift in the narrower
+		 * type of index/count would wrap for maps placed at byte
+		 * offsets of 2 GiB and above, so they could never match.
+		 */
+		uint64_t begin = (uint64_t)map->index << PAGE_SHIFT;
+		uint64_t end = begin + ((uint64_t)map->count << PAGE_SHIFT);
+		if (op.index >= begin && op.index < end)
+			goto found;
+	}
+	rc = -ENOENT;
+	goto unlock_out;
+
+ found:
+	if ((op.action & UNMAP_NOTIFY_CLEAR_BYTE) &&
+			(map->flags & GNTMAP_readonly)) {
+		rc = -EINVAL;
+		goto unlock_out;
+	}
+
+	/* Swap: the previously stored event (if any) is released on exit. */
+	out_flags = map->notify.flags;
+	out_event = map->notify.event;
+
+	map->notify.flags = op.action;
+	map->notify.addr = op.index - ((uint64_t)map->index << PAGE_SHIFT);
+	map->notify.event = op.event_channel_port;
+
+	rc = 0;
+
+ unlock_out:
+	mutex_unlock(&priv->lock);
+
+	/* Drop the reference to the event channel we did not save in the map */
+	if (out_flags & UNMAP_NOTIFY_SEND_EVENT)
+		evtchn_put(out_event);
+
+	return rc;
+}
+
+/* Maximum number of copy operations (and pinned pages) queued per batch. */
+#define GNTDEV_COPY_BATCH 16
+
+/*
+ * State accumulated by the grant-copy ioctl before the queued operations
+ * are submitted with gnttab_batch_copy().
+ */
+struct gntdev_copy_batch {
+	/* Queued copy operations; the first nr_ops entries are valid. */
+	struct gnttab_copy ops[GNTDEV_COPY_BATCH];
+	/* Local pages pinned by gntdev_get_page(); first nr_pages valid. */
+	struct page *pages[GNTDEV_COPY_BATCH];
+	/* Per-op pointer to the user-space status word of its segment. */
+	s16 __user *status[GNTDEV_COPY_BATCH];
+	unsigned int nr_ops;
+	unsigned int nr_pages;
+	/*
+	 * Passed as the write flag when pinning a page and consulted when
+	 * the pages are released to decide whether to mark them dirty.
+	 */
+	bool writeable;
+};
+
+/*
+ * Pin the single user page containing @virt, record it in the batch so
+ * that gntdev_put_pages() releases it later, and return in @gfn the Xen
+ * frame number of the Xen-sized page that contains @virt.
+ *
+ * batch->writeable selects whether the page is pinned for writing.
+ *
+ * Returns 0 on success or a negative errno; on failure nothing is added
+ * to the batch.
+ */
+static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt,
+				unsigned long *gfn)
+{
+	unsigned long addr = (unsigned long)virt;
+	struct page *page;
+	unsigned long xen_pfn;
+	int ret;
+
+	ret = get_user_pages_fast(addr, 1, batch->writeable, &page);
+	if (ret < 0)
+		return ret;
+	/*
+	 * Anything other than exactly one pinned page means 'page' was not
+	 * filled in; never store or dereference it in that case.
+	 */
+	if (ret != 1)
+		return -EFAULT;
+
+	batch->pages[batch->nr_pages++] = page;
+
+	/* A kernel page may span several Xen pages; pick the right one. */
+	xen_pfn = page_to_xen_pfn(page) + XEN_PFN_DOWN(addr & ~PAGE_MASK);
+	*gfn = pfn_to_gfn(xen_pfn);
+
+	return 0;
+}
+
+/*
+ * Release every page recorded in the batch.  When the batch is flagged
+ * writeable, pages that are not already dirty are marked dirty first.
+ * The page count and the writeable flag are reset afterwards.
+ */
+static void gntdev_put_pages(struct gntdev_copy_batch *batch)
+{
+	unsigned int idx = 0;
+
+	while (idx < batch->nr_pages) {
+		struct page *pg = batch->pages[idx++];
+
+		if (batch->writeable && !PageDirty(pg))
+			set_page_dirty_lock(pg);
+		put_page(pg);
+	}
+
+	batch->nr_pages = 0;
+	batch->writeable = false;
+}
+
+/*
+ * Submit the queued copy operations, release the pinned pages and
+ * propagate failures to the per-segment status words in user space.
+ *
+ * Returns 0 on success (the op count is reset) or -EFAULT if a status
+ * word cannot be read or written.
+ */
+static int gntdev_copy(struct gntdev_copy_batch *batch)
+{
+	unsigned int idx;
+
+	gnttab_batch_copy(batch->ops, batch->nr_ops);
+	gntdev_put_pages(batch);
+
+	/*
+	 * A segment's status is only overwritten by the first failing op:
+	 * once it no longer reads GNTST_okay it is left alone.
+	 */
+	for (idx = 0; idx < batch->nr_ops; idx++) {
+		s16 __user *uptr = batch->status[idx];
+		s16 result = batch->ops[idx].status;
+		s16 recorded;
+
+		if (result != GNTST_okay) {
+			if (__get_user(recorded, uptr))
+				return -EFAULT;
+
+			if (recorded == GNTST_okay &&
+			    __put_user(result, uptr))
+				return -EFAULT;
+		}
+	}
+
+	batch->nr_ops = 0;
+	return 0;
+}
+
+/*
+ * Queue the copy operations needed for one user-supplied segment.
+ *
+ * @batch:  batch being filled; it is flushed with gntdev_copy() whenever
+ *          it already holds GNTDEV_COPY_BATCH operations.
+ * @seg:    kernel copy of the segment description.
+ * @status: user-space status word of the segment; initialised to
+ *          GNTST_okay here and attached to every op queued for it.
+ *
+ * A segment whose local side spans several Xen pages is split into one
+ * op per Xen page.  Returns 0 on success, -EINVAL for an unsupported or
+ * out-of-range segment, -EFAULT if the status word cannot be written, or
+ * the error returned by gntdev_copy()/gntdev_get_page().
+ *
+ * NOTE(review): batch->writeable is one flag for the whole batch and is
+ * overwritten by every local source/destination op, while
+ * gntdev_put_pages() applies the final value to all pinned pages --
+ * confirm that mixing both kinds of op in one batch is acceptable.
+ */
+static int gntdev_grant_copy_seg(struct gntdev_copy_batch *batch,
+				 struct gntdev_grant_copy_segment *seg,
+				 s16 __user *status)
+{
+	uint16_t copied = 0;
+
+	/*
+	 * Disallow local -> local copies since there is only space in
+	 * batch->pages for one page per-op and this would be a very
+	 * expensive memcpy().
+	 */
+	if (!(seg->flags & (GNTCOPY_source_gref | GNTCOPY_dest_gref)))
+		return -EINVAL;
+
+	/* Can't cross page if source/dest is a grant ref. */
+	if (seg->flags & GNTCOPY_source_gref) {
+		if (seg->source.foreign.offset + seg->len > XEN_PAGE_SIZE)
+			return -EINVAL;
+	}
+	if (seg->flags & GNTCOPY_dest_gref) {
+		if (seg->dest.foreign.offset + seg->len > XEN_PAGE_SIZE)
+			return -EINVAL;
+	}
+
+	if (put_user(GNTST_okay, status))
+		return -EFAULT;
+
+	while (copied < seg->len) {
+		struct gnttab_copy *op;
+		void __user *virt;
+		size_t len, off;
+		unsigned long gfn;
+		int ret;
+
+		/* Make room before queueing another op. */
+		if (batch->nr_ops >= GNTDEV_COPY_BATCH) {
+			ret = gntdev_copy(batch);
+			if (ret < 0)
+				return ret;
+		}
+
+		len = seg->len - copied;
+
+		op = &batch->ops[batch->nr_ops];
+		op->flags = 0;
+
+		if (seg->flags & GNTCOPY_source_gref) {
+			op->source.u.ref = seg->source.foreign.ref;
+			op->source.domid = seg->source.foreign.domid;
+			op->source.offset = seg->source.foreign.offset + copied;
+			op->flags |= GNTCOPY_source_gref;
+		} else {
+			virt = seg->source.virt + copied;
+			off = (unsigned long)virt & ~XEN_PAGE_MASK;
+			/* Clamp this op to the end of the local Xen page. */
+			len = min(len, (size_t)XEN_PAGE_SIZE - off);
+			batch->writeable = false;
+
+			ret = gntdev_get_page(batch, virt, &gfn);
+			if (ret < 0)
+				return ret;
+
+			op->source.u.gmfn = gfn;
+			op->source.domid = DOMID_SELF;
+			op->source.offset = off;
+		}
+
+		if (seg->flags & GNTCOPY_dest_gref) {
+			op->dest.u.ref = seg->dest.foreign.ref;
+			op->dest.domid = seg->dest.foreign.domid;
+			op->dest.offset = seg->dest.foreign.offset + copied;
+			op->flags |= GNTCOPY_dest_gref;
+		} else {
+			virt = seg->dest.virt + copied;
+			off = (unsigned long)virt & ~XEN_PAGE_MASK;
+			/* Clamp this op to the end of the local Xen page. */
+			len = min(len, (size_t)XEN_PAGE_SIZE - off);
+			batch->writeable = true;
+
+			ret = gntdev_get_page(batch, virt, &gfn);
+			if (ret < 0)
+				return ret;
+
+			op->dest.u.gmfn = gfn;
+			op->dest.domid = DOMID_SELF;
+			op->dest.offset = off;
+		}
+
+		op->len = len;
+		copied += len;
+
+		batch->status[batch->nr_ops] = status;
+		batch->nr_ops++;
+	}
+
+	return 0;
+}
+
+/*
+ * IOCTL_GNTDEV_GRANT_COPY: read the request header, queue each user
+ * segment via gntdev_grant_copy_seg() and submit whatever is still
+ * queued at the end.
+ *
+ * Returns 0 on success, -EFAULT when the header or a segment cannot be
+ * copied in, or the error reported while queueing/submitting.  On an
+ * early exit the pages pinned so far are released without submitting.
+ */
+static long gntdev_ioctl_grant_copy(struct gntdev_priv *priv, void __user *u)
+{
+	struct ioctl_gntdev_grant_copy copy;
+	struct gntdev_copy_batch batch;
+	unsigned int idx;
+	int ret = 0;
+
+	if (copy_from_user(&copy, u, sizeof(copy)))
+		return -EFAULT;
+
+	batch.nr_ops = 0;
+	batch.nr_pages = 0;
+
+	for (idx = 0; idx < copy.count; idx++) {
+		struct gntdev_grant_copy_segment seg;
+
+		if (copy_from_user(&seg, &copy.segments[idx], sizeof(seg))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = gntdev_grant_copy_seg(&batch, &seg,
+					    &copy.segments[idx].status);
+		if (ret < 0)
+			break;
+
+		cond_resched();
+	}
+
+	/* The loop only stops short of copy.count on an error. */
+	if (idx < copy.count) {
+		gntdev_put_pages(&batch);
+		return ret;
+	}
+
+	if (batch.nr_ops)
+		ret = gntdev_copy(&batch);
+	return ret;
+}
+
+/*
+ * ioctl entry point: dispatch @cmd to the matching handler, passing @arg
+ * as a user pointer.  Unknown commands yield -ENOIOCTLCMD.
+ */
+static long gntdev_ioctl(struct file *flip,
+			 unsigned int cmd, unsigned long arg)
+{
+	struct gntdev_priv *priv = flip->private_data;
+	void __user *ptr = (void __user *)arg;
+	long rc;
+
+	switch (cmd) {
+	case IOCTL_GNTDEV_MAP_GRANT_REF:
+		rc = gntdev_ioctl_map_grant_ref(priv, ptr);
+		break;
+	case IOCTL_GNTDEV_UNMAP_GRANT_REF:
+		rc = gntdev_ioctl_unmap_grant_ref(priv, ptr);
+		break;
+	case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
+		rc = gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
+		break;
+	case IOCTL_GNTDEV_SET_UNMAP_NOTIFY:
+		rc = gntdev_ioctl_notify(priv, ptr);
+		break;
+	case IOCTL_GNTDEV_GRANT_COPY:
+		rc = gntdev_ioctl_grant_copy(priv, ptr);
+		break;
+#ifdef CONFIG_XEN_GNTDEV_DMABUF
+	case IOCTL_GNTDEV_DMABUF_EXP_FROM_REFS:
+		rc = gntdev_ioctl_dmabuf_exp_from_refs(priv, use_ptemod, ptr);
+		break;
+	case IOCTL_GNTDEV_DMABUF_EXP_WAIT_RELEASED:
+		rc = gntdev_ioctl_dmabuf_exp_wait_released(priv, ptr);
+		break;
+	case IOCTL_GNTDEV_DMABUF_IMP_TO_REFS:
+		rc = gntdev_ioctl_dmabuf_imp_to_refs(priv, ptr);
+		break;
+	case IOCTL_GNTDEV_DMABUF_IMP_RELEASE:
+		rc = gntdev_ioctl_dmabuf_imp_release(priv, ptr);
+		break;
+#endif
+	default:
+		pr_debug("priv %p, unknown cmd %x\n", priv, cmd);
+		rc = -ENOIOCTLCMD;
+		break;
+	}
+
+	return rc;
+}
+
+/*
+ * mmap handler: attach the VMA to the grant map identified by the VMA's
+ * page offset and page count, then map the granted pages.
+ *
+ * Writable private mappings are refused.  With use_ptemod a map may
+ * back only one VMA and must belong to the mm that opened the device.
+ *
+ * Returns 0 on success or a negative errno (-EINVAL for the lookup and
+ * permission failures, otherwise the error of the failing mapping step).
+ */
+static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
+{
+	struct gntdev_priv *priv = flip->private_data;
+	int index = vma->vm_pgoff;
+	int count = vma_pages(vma);
+	struct gntdev_grant_map *map;
+	int i, err = -EINVAL;
+
+	if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	pr_debug("map %d+%d at %lx (pgoff %lx)\n",
+			index, count, vma->vm_start, vma->vm_pgoff);
+
+	mutex_lock(&priv->lock);
+	map = gntdev_find_map_index(priv, index, count);
+	if (!map)
+		goto unlock_out;
+	if (use_ptemod && map->vma)
+		goto unlock_out;
+	if (use_ptemod && priv->mm != vma->vm_mm) {
+		pr_warn("Huh? Other mm?\n");
+		goto unlock_out;
+	}
+
+	/* Reference taken here is dropped by gntdev_put_map() on failure. */
+	refcount_inc(&map->users);
+
+	vma->vm_ops = &gntdev_vmops;
+
+	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP;
+
+	if (use_ptemod)
+		vma->vm_flags |= VM_DONTCOPY;
+
+	vma->vm_private_data = map;
+
+	if (use_ptemod)
+		map->vma = vma;
+
+	/*
+	 * Non-zero flags mean they were already chosen earlier: only check
+	 * that a writable VMA is not layered on a read-only map.  Otherwise
+	 * derive the flags from this VMA.
+	 */
+	if (map->flags) {
+		if ((vma->vm_flags & VM_WRITE) &&
+				(map->flags & GNTMAP_readonly))
+			goto out_unlock_put;
+	} else {
+		map->flags = GNTMAP_host_map;
+		if (!(vma->vm_flags & VM_WRITE))
+			map->flags |= GNTMAP_readonly;
+	}
+
+	mutex_unlock(&priv->lock);
+
+	if (use_ptemod) {
+		map->pages_vm_start = vma->vm_start;
+		err = apply_to_page_range(vma->vm_mm, vma->vm_start,
+					  vma->vm_end - vma->vm_start,
+					  find_grant_ptes, map);
+		if (err) {
+			pr_warn("find_grant_ptes() failure.\n");
+			goto out_put_map;
+		}
+	}
+
+	err = gntdev_map_grant_pages(map);
+	if (err)
+		goto out_put_map;
+
+	if (!use_ptemod) {
+		/* Insert the granted pages into the VMA one by one. */
+		for (i = 0; i < count; i++) {
+			err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE,
+				map->pages[i]);
+			if (err)
+				goto out_put_map;
+		}
+	} else {
+#ifdef CONFIG_X86
+		/*
+		 * If the PTEs were not made special by the grant map
+		 * hypercall, do so here.
+		 *
+		 * This is racy since the mapping is already visible
+		 * to userspace but userspace should be well-behaved
+		 * enough to not touch it until the mmap() call
+		 * returns.
+		 */
+		if (!xen_feature(XENFEAT_gnttab_map_avail_bits)) {
+			apply_to_page_range(vma->vm_mm, vma->vm_start,
+					    vma->vm_end - vma->vm_start,
+					    set_grant_ptes_as_special, NULL);
+		}
+#endif
+	}
+
+	return 0;
+
+unlock_out:
+	mutex_unlock(&priv->lock);
+	return err;
+
+out_unlock_put:
+	mutex_unlock(&priv->lock);
+out_put_map:
+	/* Undo the VMA association and any grant mappings made so far. */
+	if (use_ptemod) {
+		map->vma = NULL;
+		unmap_grant_pages(map, 0, map->count);
+	}
+	gntdev_put_map(priv, map);
+	return err;
+}
+
+/* File operations backing the gntdev character device. */
+static const struct file_operations gntdev_fops = {
+	.owner = THIS_MODULE,
+	.open = gntdev_open,
+	.release = gntdev_release,
+	.mmap = gntdev_mmap,
+	.unlocked_ioctl = gntdev_ioctl
+};
+
+/* Misc device registered as "xen/gntdev" with a dynamically chosen minor. */
+static struct miscdevice gntdev_miscdev = {
+	.minor        = MISC_DYNAMIC_MINOR,
+	.name         = "xen/gntdev",
+	.fops         = &gntdev_fops,
+};
+
+/* ------------------------------------------------------------------ */
+
+/*
+ * Module init: refuse to load outside a Xen domain, decide whether PTE
+ * modification is needed (not an auto-translated physmap) and register
+ * the misc device.  Returns 0 or a negative errno.
+ */
+static int __init gntdev_init(void)
+{
+	int rc;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap);
+
+	rc = misc_register(&gntdev_miscdev);
+	if (rc)
+		pr_err("Could not register gntdev device\n");
+
+	return rc;
+}
+
+/* Module exit: remove the misc device registered by gntdev_init(). */
+static void __exit gntdev_exit(void)
+{
+	misc_deregister(&gntdev_miscdev);
+}
+
+module_init(gntdev_init);
+module_exit(gntdev_exit);
+
+/* ------------------------------------------------------------------ */
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
new file mode 100644
index 000000000..e434a583e
--- /dev/null
+++ b/drivers/xen/grant-table.c
@@ -0,0 +1,1497 @@
+/******************************************************************************
+ * grant_table.c
+ *
+ * Granting foreign access to our memory reservation.
+ *
+ * Copyright (c) 2005-2006, Christopher Clark
+ * Copyright (c) 2004-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/bootmem.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/hardirq.h>
+#include <linux/workqueue.h>
+#include <linux/ratelimit.h>
+#include <linux/moduleparam.h>
+#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
+#include <linux/dma-mapping.h>
+#endif
+
+#include <xen/xen.h>
+#include <xen/interface/xen.h>
+#include <xen/page.h>
+#include <xen/grant_table.h>
+#include <xen/interface/memory.h>
+#include <xen/hvc-console.h>
+#include <xen/swiotlb-xen.h>
+#include <xen/balloon.h>
+#ifdef CONFIG_X86
+#include <asm/xen/cpuid.h>
+#endif
+#include <xen/mem-reservation.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/interface.h>
+
+#include <asm/pgtable.h>
+#include <asm/sync_bitops.h>
+
+/* External tools reserve first few grant table entries. */
+#define NR_RESERVED_ENTRIES 8
+/* Terminator stored in the last link of a chain of grant references. */
+#define GNTTAB_LIST_END 0xffffffff
+
+/* Pages holding the free-list links; accessed through gnttab_entry(). */
+static grant_ref_t **gnttab_list;
+/* Number of grant frames currently accounted for in gnttab_list. */
+static unsigned int nr_grant_frames;
+/* Number of references currently on the free list. */
+static int gnttab_free_count;
+/* First reference on the free list. */
+static grant_ref_t gnttab_free_head;
+/* Held while the free list, callbacks and deferred list are modified. */
+static DEFINE_SPINLOCK(gnttab_list_lock);
+struct grant_frames xen_auto_xlat_grant_frames;
+/* Exposed as the "version" module parameter. */
+static unsigned int xen_gnttab_version;
+module_param_named(version, xen_gnttab_version, uint, 0);
+
+/* The shared grant table, viewed as v1 entries, v2 entries or raw memory. */
+static union {
+	struct grant_entry_v1 *v1;
+	union grant_entry_v2 *v2;
+	void *addr;
+} gnttab_shared;
+
+/*
+ * Table of version-specific grant table operations; the active table is
+ * reached through gnttab_interface.
+ */
+struct gnttab_ops {
+	/*
+	 * Version of the grant interface.
+	 */
+	unsigned int version;
+	/*
+	 * Grant refs per grant frame.
+	 */
+	unsigned int grefs_per_grant_frame;
+	/*
+	 * Map a list of frames used for storing grant entries. The frames
+	 * parameter holds the grant table addresses while the grant table
+	 * is being set up, and nr_gframes is the number of frames to map.
+	 * Returns GNTST_okay on success and a negative value on failure.
+	 */
+	int (*map_frames)(xen_pfn_t *frames, unsigned int nr_gframes);
+	/*
+	 * Release a list of frames which are mapped in map_frames for grant
+	 * entry status.
+	 */
+	void (*unmap_frames)(void);
+	/*
+	 * Introduce a valid entry into the grant table, granting the frame
+	 * of this entry to a domain for access or transfer. Ref is the
+	 * reference of the introduced grant entry, domid is the id of the
+	 * granted domain, frame is the page frame to be granted, and flags
+	 * is the status of the grant entry to be written.
+	 */
+	void (*update_entry)(grant_ref_t ref, domid_t domid,
+			     unsigned long frame, unsigned flags);
+	/*
+	 * Stop granting a domain access to a grant entry. Ref is the
+	 * reference of the grant entry whose access is to be stopped;
+	 * readonly is not used by this function. If the grant entry is
+	 * currently mapped for reading or writing, return failure (== 0)
+	 * directly and do not tear down the grant access. Otherwise, stop
+	 * grant access for this entry and return success (== 1).
+	 */
+	int (*end_foreign_access_ref)(grant_ref_t ref, int readonly);
+	/*
+	 * Stop granting a grant entry to a domain for transfer. Ref is the
+	 * reference of the grant entry whose transfer is to be stopped. If
+	 * the transfer has not started, just reclaim the grant entry and
+	 * return failure (== 0). Otherwise, wait for the transfer to
+	 * complete and then return the frame.
+	 */
+	unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref);
+	/*
+	 * Read the frame number related to a given grant reference.
+	 */
+	unsigned long (*read_frame)(grant_ref_t ref);
+};
+
+/*
+ * Completion plus result code.
+ * NOTE(review): presumably used to wait for an asynchronous unmap of
+ * grant references -- the users are not visible here, confirm.
+ */
+struct unmap_refs_callback_data {
+	struct completion completion;
+	int result;
+};
+
+/* Version-specific operations in use; every public helper goes via this. */
+static const struct gnttab_ops *gnttab_interface;
+
+/* This reflects status of grant entries, so act as a global value. */
+static grant_status_t *grstatus;
+
+/* Singly linked list of callers waiting for free grant references. */
+static struct gnttab_free_callback *gnttab_free_callback_list;
+
+static int gnttab_expand(unsigned int req_entries);
+
+/* Number of grant_ref_t / grant_status_t items that fit in one page. */
+#define RPP (PAGE_SIZE / sizeof(grant_ref_t))
+#define SPP (PAGE_SIZE / sizeof(grant_status_t))
+
+/*
+ * Return the address of the free-list link for @entry: gnttab_list is an
+ * array of pages, each holding RPP links.
+ */
+static inline grant_ref_t *__gnttab_entry(grant_ref_t entry)
+{
+	grant_ref_t *links = gnttab_list[entry / RPP];
+
+	return links + (entry % RPP);
+}
+/* This can be used as an l-value */
+#define gnttab_entry(entry) (*__gnttab_entry(entry))
+
+/*
+ * Detach a chain of @count references from the head of the free list,
+ * expanding the grant table first when too few are free.
+ *
+ * Returns the first reference of the chain (its last link is set to
+ * GNTTAB_LIST_END), or the negative value returned by gnttab_expand().
+ */
+static int get_free_entries(unsigned count)
+{
+	unsigned long irq_flags;
+	grant_ref_t tail;
+	int first, err;
+
+	spin_lock_irqsave(&gnttab_list_lock, irq_flags);
+
+	if (gnttab_free_count < count) {
+		err = gnttab_expand(count - gnttab_free_count);
+		if (err < 0) {
+			spin_unlock_irqrestore(&gnttab_list_lock, irq_flags);
+			return err;
+		}
+	}
+
+	tail = gnttab_free_head;
+	first = tail;
+	gnttab_free_count -= count;
+
+	/* Walk to the last reference of the chain being handed out. */
+	for (; count > 1; count--)
+		tail = gnttab_entry(tail);
+
+	gnttab_free_head = gnttab_entry(tail);
+	gnttab_entry(tail) = GNTTAB_LIST_END;
+
+	spin_unlock_irqrestore(&gnttab_list_lock, irq_flags);
+
+	return first;
+}
+
+/*
+ * Run every registered callback whose requested count is now satisfied
+ * by the free list; the others are put back on the callback list.
+ */
+static void do_free_callbacks(void)
+{
+	struct gnttab_free_callback *cb = gnttab_free_callback_list;
+	struct gnttab_free_callback *following;
+
+	/* Detach the whole list; unsatisfied entries are re-added below. */
+	gnttab_free_callback_list = NULL;
+
+	for (; cb != NULL; cb = following) {
+		/* Saved first: cb->next is rewritten in both branches. */
+		following = cb->next;
+
+		if (gnttab_free_count < cb->count) {
+			cb->next = gnttab_free_callback_list;
+			gnttab_free_callback_list = cb;
+			continue;
+		}
+
+		cb->next = NULL;
+		cb->fn(cb->arg);
+	}
+}
+
+/* Process pending free callbacks, if any are registered (the rare case). */
+static inline void check_free_callbacks(void)
+{
+	if (unlikely(gnttab_free_callback_list))
+		do_free_callbacks();
+}
+
+/*
+ * Push @ref back onto the head of the free list and give waiting
+ * callbacks a chance to run.
+ */
+static void put_free_entry(grant_ref_t ref)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&gnttab_list_lock, irq_flags);
+
+	gnttab_entry(ref) = gnttab_free_head;
+	gnttab_free_head = ref;
+	gnttab_free_count++;
+	check_free_callbacks();
+
+	spin_unlock_irqrestore(&gnttab_list_lock, irq_flags);
+}
+
+/*
+ * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2.
+ * Introducing a valid entry into the grant table:
+ *  1. Write ent->domid.
+ *  2. Write ent->frame:
+ *      GTF_permit_access:   Frame to which access is permitted.
+ *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new
+ *                           frame, or zero if none.
+ *  3. Write memory barrier (WMB).
+ *  4. Write ent->flags, inc. valid type.
+ */
+static void gnttab_update_entry_v1(grant_ref_t ref, domid_t domid,
+				   unsigned long frame, unsigned flags)
+{
+	struct grant_entry_v1 *ent = &gnttab_shared.v1[ref];
+
+	ent->domid = domid;
+	ent->frame = frame;
+	/* domid and frame must be visible before the flags are written. */
+	wmb();
+	ent->flags = flags;
+}
+
+/*
+ * v2 flavour of update_entry: publish domid and frame, then the flags.
+ *
+ * The flags are stored exactly as passed, matching the v1 variant.
+ * Callers granting access already include GTF_permit_access; forcing
+ * that bit here as well would alter the grant type requested by
+ * gnttab_grant_foreign_transfer_ref(), which passes GTF_accept_transfer.
+ */
+static void gnttab_update_entry_v2(grant_ref_t ref, domid_t domid,
+				   unsigned long frame, unsigned int flags)
+{
+	gnttab_shared.v2[ref].hdr.domid = domid;
+	gnttab_shared.v2[ref].full_page.frame = frame;
+	wmb();	/* Hypervisor concurrent accesses. */
+	gnttab_shared.v2[ref].hdr.flags = flags;
+}
+
+/*
+ * Public grant-issuing interface functions
+ */
+/*
+ * Grant @domid access to @frame through the already allocated reference
+ * @ref; the grant is read-only when @readonly is non-zero.
+ */
+void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
+				     unsigned long frame, int readonly)
+{
+	unsigned int gtf = GTF_permit_access;
+
+	if (readonly)
+		gtf |= GTF_readonly;
+
+	gnttab_interface->update_entry(ref, domid, frame, gtf);
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
+
+/*
+ * Allocate a grant reference and use it to grant @domid access to
+ * @frame.  Returns the reference, or -ENOSPC when none can be allocated.
+ */
+int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
+				int readonly)
+{
+	int gref = get_free_entries(1);
+
+	if (unlikely(gref < 0))
+		return -ENOSPC;
+
+	gnttab_grant_foreign_access_ref(gref, domid, frame, readonly);
+	return gref;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
+
+/*
+ * v1: atomically clear the entry's flags unless the grant is in use.
+ * Returns 0 when GTF_reading or GTF_writing is set, 1 once the flags
+ * have been cleared.  @readonly is not used.
+ */
+static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly)
+{
+	u16 flags, nflags;
+	u16 *pflags;
+
+	pflags = &gnttab_shared.v1[ref].flags;
+	nflags = *pflags;
+	/* Retry the compare-and-swap until the observed value was current. */
+	do {
+		flags = nflags;
+		if (flags & (GTF_reading|GTF_writing))
+			return 0;
+	} while ((nflags = sync_cmpxchg(pflags, flags, 0)) != flags);
+
+	return 1;
+}
+
+/*
+ * v2: clear the entry's flags, then consult the separate status array.
+ * Returns 0 when the status still shows GTF_reading or GTF_writing,
+ * 1 otherwise.  @readonly is not used.
+ */
+static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly)
+{
+	gnttab_shared.v2[ref].hdr.flags = 0;
+	mb();	/* Concurrent access by hypervisor. */
+	if (grstatus[ref] & (GTF_reading|GTF_writing)) {
+		return 0;
+	} else {
+		/*
+		 * The read of grstatus needs to have acquire semantics.
+		 *  On x86, reads already have that, and we just need to
+		 * protect against compiler reorderings.
+		 * On other architectures we may need a full barrier.
+		 */
+#ifdef CONFIG_X86
+		barrier();
+#else
+		mb();
+#endif
+	}
+
+	return 1;
+}
+
+/* Dispatch to the end_foreign_access_ref operation of the active version. */
+static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
+{
+	return gnttab_interface->end_foreign_access_ref(ref, readonly);
+}
+
+/*
+ * Try to end foreign access to @ref.  Returns 1 on success; returns 0
+ * and logs a warning when the grant is still in use.
+ */
+int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
+{
+	if (!_gnttab_end_foreign_access_ref(ref, readonly)) {
+		pr_warn("WARNING: g.e. %#x still in use!\n", ref);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
+
+/* v1: return the frame number stored in the grant entry for @ref. */
+static unsigned long gnttab_read_frame_v1(grant_ref_t ref)
+{
+	struct grant_entry_v1 *ent = &gnttab_shared.v1[ref];
+
+	return ent->frame;
+}
+
+/* v2: return the full-page frame number stored in the entry for @ref. */
+static unsigned long gnttab_read_frame_v2(grant_ref_t ref)
+{
+	union grant_entry_v2 *ent = &gnttab_shared.v2[ref];
+
+	return ent->full_page.frame;
+}
+
+/*
+ * A grant reference whose foreign access could not be ended yet; it is
+ * retried from the deferred timer.
+ */
+struct deferred_entry {
+	struct list_head list;	/* link on deferred_list */
+	grant_ref_t ref;	/* grant reference still in use */
+	bool ro;		/* readonly argument for the retry */
+	uint16_t warn_delay;	/* retries left before "still pending" is logged */
+	struct page *page;	/* page released once the grant is ended */
+};
+/* Pending entries; modified under gnttab_list_lock. */
+static LIST_HEAD(deferred_list);
+static void gnttab_handle_deferred(struct timer_list *);
+static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
+
+/*
+ * Timer callback: retry ending foreign access for at most 10 deferred
+ * entries per run.  Entries that can now be ended are freed together
+ * with their page reference; the others go back to the tail of the
+ * list.  The timer is re-armed for HZ jiffies later while entries remain.
+ */
+static void gnttab_handle_deferred(struct timer_list *unused)
+{
+	unsigned int nr = 10;
+	struct deferred_entry *first = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&gnttab_list_lock, flags);
+	while (nr--) {
+		struct deferred_entry *entry
+			= list_first_entry(&deferred_list,
+					   struct deferred_entry, list);
+
+		/* Back at the first re-queued entry: one full pass is done. */
+		if (entry == first)
+			break;
+		list_del(&entry->list);
+		/* The retry itself runs without the list lock held. */
+		spin_unlock_irqrestore(&gnttab_list_lock, flags);
+		if (_gnttab_end_foreign_access_ref(entry->ref, entry->ro)) {
+			put_free_entry(entry->ref);
+			pr_debug("freeing g.e. %#x (pfn %#lx)\n",
+				 entry->ref, page_to_pfn(entry->page));
+			put_page(entry->page);
+			kfree(entry);
+			entry = NULL;
+		} else {
+			if (!--entry->warn_delay)
+				pr_info("g.e. %#x still pending\n", entry->ref);
+			if (!first)
+				first = entry;
+		}
+		spin_lock_irqsave(&gnttab_list_lock, flags);
+		if (entry)
+			list_add_tail(&entry->list, &deferred_list);
+		else if (list_empty(&deferred_list))
+			break;
+	}
+	if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
+		deferred_timer.expires = jiffies + HZ;
+		add_timer(&deferred_timer);
+	}
+	spin_unlock_irqrestore(&gnttab_list_lock, flags);
+}
+
+/*
+ * Queue @ref for a later retry of ending foreign access.
+ *
+ * When @page is NULL the page is looked up from the frame stored in the
+ * grant entry and a reference on it is taken.  If the tracking entry
+ * cannot be allocated nothing is queued and the grant is reported as
+ * "leaking"; otherwise it is reported as "deferring" and the timer is
+ * armed if it is not already pending.
+ */
+static void gnttab_add_deferred(grant_ref_t ref, bool readonly,
+				struct page *page)
+{
+	struct deferred_entry *entry;
+	/* May be called from atomic context, so pick the allocation mode. */
+	gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
+	const char *what = KERN_WARNING "leaking";
+
+	entry = kmalloc(sizeof(*entry), gfp);
+	if (!page) {
+		unsigned long gfn = gnttab_interface->read_frame(ref);
+
+		page = pfn_to_page(gfn_to_pfn(gfn));
+		get_page(page);
+	}
+
+	if (entry) {
+		unsigned long flags;
+
+		entry->ref = ref;
+		entry->ro = readonly;
+		entry->page = page;
+		entry->warn_delay = 60;
+		spin_lock_irqsave(&gnttab_list_lock, flags);
+		list_add_tail(&entry->list, &deferred_list);
+		if (!timer_pending(&deferred_timer)) {
+			deferred_timer.expires = jiffies + HZ;
+			add_timer(&deferred_timer);
+		}
+		spin_unlock_irqrestore(&gnttab_list_lock, flags);
+		what = KERN_DEBUG "deferring";
+	}
+	printk("%s g.e. %#x (pfn %#lx)\n",
+	       what, ref, page ? page_to_pfn(page) : -1);
+}
+
+/*
+ * End foreign access to @ref if it is not in use and, on success, put
+ * the reference back on the free list.  Returns the result of the
+ * version-specific end operation (non-zero on success).
+ */
+int gnttab_try_end_foreign_access(grant_ref_t ref)
+{
+	int ended = _gnttab_end_foreign_access_ref(ref, 0);
+
+	if (!ended)
+		return ended;
+
+	put_free_entry(ref);
+	return ended;
+}
+EXPORT_SYMBOL_GPL(gnttab_try_end_foreign_access);
+
+/*
+ * End foreign access to @ref.  If that works right away, the page at
+ * kernel virtual address @page (when non-zero) is released; otherwise
+ * the reference and page are queued for a deferred retry.
+ */
+void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
+			       unsigned long page)
+{
+	if (!gnttab_try_end_foreign_access(ref)) {
+		/* Still in use by the other side: retry from the timer. */
+		gnttab_add_deferred(ref, readonly,
+				    page ? virt_to_page(page) : NULL);
+		return;
+	}
+
+	if (page != 0)
+		put_page(virt_to_page(page));
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
+
+/*
+ * Allocate a grant reference and set it up to accept a page transfer
+ * from @domid into @pfn.  Returns the reference, or -ENOSPC when none
+ * can be allocated.
+ */
+int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
+{
+	int gref = get_free_entries(1);
+
+	if (unlikely(gref < 0))
+		return -ENOSPC;
+
+	gnttab_grant_foreign_transfer_ref(gref, domid, pfn);
+	return gref;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
+
+/*
+ * Write a GTF_accept_transfer entry for the already allocated reference
+ * @ref, naming @domid and @pfn, via the active update_entry operation.
+ */
+void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
+				       unsigned long pfn)
+{
+	gnttab_interface->update_entry(ref, domid, pfn, GTF_accept_transfer);
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
+
+/*
+ * v1: end a transfer grant.  Returns 0 when the transfer had not been
+ * committed and the entry was reclaimed; otherwise busy-waits for the
+ * transfer to complete and returns the frame stored in the entry.
+ */
+static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref)
+{
+	unsigned long frame;
+	u16           flags;
+	u16          *pflags;
+
+	pflags = &gnttab_shared.v1[ref].flags;
+
+	/*
+	 * If a transfer is not even yet started, try to reclaim the grant
+	 * reference and return failure (== 0).
+	 */
+	while (!((flags = *pflags) & GTF_transfer_committed)) {
+		if (sync_cmpxchg(pflags, flags, 0) == flags)
+			return 0;
+		cpu_relax();
+	}
+
+	/* If a transfer is in progress then wait until it is completed. */
+	while (!(flags & GTF_transfer_completed)) {
+		flags = *pflags;
+		cpu_relax();
+	}
+
+	rmb();	/* Read the frame number /after/ reading completion status. */
+	frame = gnttab_shared.v1[ref].frame;
+	BUG_ON(frame == 0);
+
+	return frame;
+}
+
+/*
+ * v2: end a transfer grant.  Returns 0 when the transfer had not been
+ * committed and the entry was reclaimed; otherwise busy-waits for the
+ * transfer to complete and returns the full-page frame of the entry.
+ */
+static unsigned long gnttab_end_foreign_transfer_ref_v2(grant_ref_t ref)
+{
+	unsigned long frame;
+	u16           flags;
+	u16          *pflags;
+
+	pflags = &gnttab_shared.v2[ref].hdr.flags;
+
+	/*
+	 * If a transfer is not even yet started, try to reclaim the grant
+	 * reference and return failure (== 0).
+	 */
+	while (!((flags = *pflags) & GTF_transfer_committed)) {
+		if (sync_cmpxchg(pflags, flags, 0) == flags)
+			return 0;
+		cpu_relax();
+	}
+
+	/* If a transfer is in progress then wait until it is completed. */
+	while (!(flags & GTF_transfer_completed)) {
+		flags = *pflags;
+		cpu_relax();
+	}
+
+	rmb();  /* Read the frame number /after/ reading completion status. */
+	frame = gnttab_shared.v2[ref].full_page.frame;
+	BUG_ON(frame == 0);
+
+	return frame;
+}
+
+/*
+ * End the transfer grant @ref through the active interface version.
+ * Returns the transferred frame, or 0 if no transfer had been committed.
+ * The reference itself is not returned to the free list.
+ */
+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
+{
+	return gnttab_interface->end_foreign_transfer_ref(ref);
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
+
+/*
+ * End the transfer grant @ref and return the reference to the free
+ * list.  Returns whatever gnttab_end_foreign_transfer_ref() reported.
+ */
+unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
+{
+	unsigned long transferred;
+
+	transferred = gnttab_end_foreign_transfer_ref(ref);
+	put_free_entry(ref);
+
+	return transferred;
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
+
+/* Return the single grant reference @ref to the free list. */
+void gnttab_free_grant_reference(grant_ref_t ref)
+{
+	put_free_entry(ref);
+}
+EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
+
+/*
+ * Return a whole chain of grant references, starting at @head and
+ * terminated by GNTTAB_LIST_END, to the front of the free list.
+ * An empty chain (@head == GNTTAB_LIST_END) is a no-op.
+ */
+void gnttab_free_grant_references(grant_ref_t head)
+{
+	unsigned long irq_flags;
+	grant_ref_t last;
+	int nr;
+
+	if (head == GNTTAB_LIST_END)
+		return;
+
+	spin_lock_irqsave(&gnttab_list_lock, irq_flags);
+
+	/* Find the last reference of the chain, counting as we go. */
+	for (last = head, nr = 1;
+	     gnttab_entry(last) != GNTTAB_LIST_END;
+	     last = gnttab_entry(last))
+		nr++;
+
+	/* Splice the chain in front of the current free list. */
+	gnttab_entry(last) = gnttab_free_head;
+	gnttab_free_head = head;
+	gnttab_free_count += nr;
+	check_free_callbacks();
+
+	spin_unlock_irqrestore(&gnttab_list_lock, irq_flags);
+}
+EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
+
+/*
+ * Reserve a private chain of @count grant references and store its
+ * first reference in *@head.  Returns 0 on success or -ENOSPC.
+ */
+int gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
+{
+	int first = get_free_entries(count);
+
+	if (first >= 0) {
+		*head = first;
+		return 0;
+	}
+
+	return -ENOSPC;
+}
+EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
+
+/* Return non-zero when the private chain at *@private_head is empty. */
+int gnttab_empty_grant_references(const grant_ref_t *private_head)
+{
+	return (*private_head == GNTTAB_LIST_END);
+}
+EXPORT_SYMBOL_GPL(gnttab_empty_grant_references);
+
+/*
+ * Take the first reference off the private chain at *@private_head.
+ * Returns the reference, or -ENOSPC when the chain is empty.
+ */
+int gnttab_claim_grant_reference(grant_ref_t *private_head)
+{
+	grant_ref_t claimed = *private_head;
+
+	if (unlikely(claimed == GNTTAB_LIST_END))
+		return -ENOSPC;
+
+	/* Advance the caller's head to the next link of the chain. */
+	*private_head = gnttab_entry(claimed);
+	return claimed;
+}
+EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
+
+/*
+ * Push @release onto the front of the private chain at *@private_head.
+ * No lock is taken here.
+ */
+void gnttab_release_grant_reference(grant_ref_t *private_head,
+				    grant_ref_t release)
+{
+	gnttab_entry(release) = *private_head;
+	*private_head = release;
+}
+EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
+
+/*
+ * Register @callback so that @fn(@arg) is invoked once at least @count
+ * grant references are free.  A callback that is already queued is left
+ * untouched.  Pending callbacks are checked right away, so @fn may run
+ * before this function returns.
+ */
+void gnttab_request_free_callback(struct gnttab_free_callback *callback,
+				  void (*fn)(void *), void *arg, u16 count)
+{
+	struct gnttab_free_callback *iter;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&gnttab_list_lock, irq_flags);
+
+	/* Check if the callback is already on the list */
+	for (iter = gnttab_free_callback_list; iter; iter = iter->next) {
+		if (iter == callback)
+			goto unlock;
+	}
+
+	callback->fn = fn;
+	callback->arg = arg;
+	callback->count = count;
+	callback->next = gnttab_free_callback_list;
+	gnttab_free_callback_list = callback;
+	check_free_callbacks();
+
+unlock:
+	spin_unlock_irqrestore(&gnttab_list_lock, irq_flags);
+}
+EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
+
+/*
+ * Remove @callback from the list of pending free callbacks; does
+ * nothing when it is not queued.
+ */
+void gnttab_cancel_free_callback(struct gnttab_free_callback *callback)
+{
+	struct gnttab_free_callback **link = &gnttab_free_callback_list;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&gnttab_list_lock, irq_flags);
+
+	/* Walk the links themselves so the match can be unlinked in place. */
+	while (*link) {
+		if (*link == callback) {
+			*link = callback->next;
+			break;
+		}
+		link = &(*link)->next;
+	}
+
+	spin_unlock_irqrestore(&gnttab_list_lock, irq_flags);
+}
+EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback);
+
+/*
+ * Number of units of @align entries needed to cover the grant
+ * references held by @frames grant frames (rounded up).
+ */
+static unsigned int gnttab_frames(unsigned int frames, unsigned int align)
+{
+	unsigned int grefs = frames * gnttab_interface->grefs_per_grant_frame;
+
+	return (grefs + align - 1) / align;
+}
+
+/*
+ * Extend the free-list bookkeeping by the entries of @more_frames
+ * additional grant frames.
+ *
+ * Allocates whatever extra gnttab_list pages the larger table needs, chains
+ * the new entries together, splices them in front of the current free list,
+ * updates nr_grant_frames and runs check_free_callbacks().
+ *
+ * Returns 0 on success, or -ENOMEM if a list page could not be allocated;
+ * pages allocated by this call are freed again in that case.
+ */
+static int grow_gnttab_list(unsigned int more_frames)
+{
+	unsigned int new_nr_grant_frames, extra_entries, i;
+	unsigned int nr_glist_frames, new_nr_glist_frames;
+	unsigned int grefs_per_frame;
+
+	BUG_ON(gnttab_interface == NULL);
+	grefs_per_frame = gnttab_interface->grefs_per_grant_frame;
+
+	new_nr_grant_frames = nr_grant_frames + more_frames;
+	extra_entries = more_frames * grefs_per_frame;
+
+	/* Allocate only the list pages beyond those already in use. */
+	nr_glist_frames = gnttab_frames(nr_grant_frames, RPP);
+	new_nr_glist_frames = gnttab_frames(new_nr_grant_frames, RPP);
+	for (i = nr_glist_frames; i < new_nr_glist_frames; i++) {
+		gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC);
+		if (!gnttab_list[i])
+			goto grow_nomem;
+	}
+
+
+	/* Link every new entry except the last one to its successor. */
+	for (i = grefs_per_frame * nr_grant_frames;
+	     i < grefs_per_frame * new_nr_grant_frames - 1; i++)
+		gnttab_entry(i) = i + 1;
+
+	/* The last new entry continues into the old free list ... */
+	gnttab_entry(i) = gnttab_free_head;
+	/* ... and the first new entry becomes the new head. */
+	gnttab_free_head = grefs_per_frame * nr_grant_frames;
+	gnttab_free_count += extra_entries;
+
+	nr_grant_frames = new_nr_grant_frames;
+
+	check_free_callbacks();
+
+	return 0;
+
+grow_nomem:
+	/* Free the list pages allocated above, newest first. */
+	while (i-- > nr_glist_frames)
+		free_page((unsigned long) gnttab_list[i]);
+	return -ENOMEM;
+}
+
+/*
+ * Query the maximum number of grant frames for this domain through
+ * GNTTABOP_query_size. Falls back to 4 if the hypercall fails or reports
+ * a status other than GNTST_okay.
+ */
+static unsigned int __max_nr_grant_frames(void)
+{
+	struct gnttab_query_size query;
+	int rc;
+
+	query.dom = DOMID_SELF;
+	rc = HYPERVISOR_grant_table_op(GNTTABOP_query_size, &query, 1);
+
+	if (rc >= 0 && query.status == GNTST_okay)
+		return query.max_nr_frames;
+
+	/* Legacy max supported number of frames */
+	return 4;
+}
+
+/*
+ * Maximum number of grant frames usable by this kernel: the current value
+ * from __max_nr_grant_frames(), capped at the value recorded the first time
+ * this function ran.
+ */
+unsigned int gnttab_max_grant_frames(void)
+{
+	static unsigned int boot_max_nr_grant_frames;
+	unsigned int xen_max = __max_nr_grant_frames();
+
+	/* Record the limit while the cached value is still unset (zero). */
+	if (boot_max_nr_grant_frames == 0)
+		boot_max_nr_grant_frames = __max_nr_grant_frames();
+
+	return xen_max > boot_max_nr_grant_frames ?
+		boot_max_nr_grant_frames : xen_max;
+}
+EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
+
+/*
+ * Map the grant-table region starting at physical address @addr and record
+ * the mapping, the per-frame PFN array and the frame count in
+ * xen_auto_xlat_grant_frames.
+ *
+ * Returns 0 on success, -EINVAL if frames were already set up, or -ENOMEM
+ * if the remap or the PFN array allocation fails.
+ */
+int gnttab_setup_auto_xlat_frames(phys_addr_t addr)
+{
+	unsigned int max_nr_gframes = __max_nr_grant_frames();
+	xen_pfn_t *pfn_list;
+	void *mapping;
+	unsigned int idx;
+
+	/* Refuse a second setup while a previous one is still recorded. */
+	if (xen_auto_xlat_grant_frames.count != 0)
+		return -EINVAL;
+
+	mapping = xen_remap(addr, XEN_PAGE_SIZE * max_nr_gframes);
+	if (!mapping) {
+		pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n",
+			&addr);
+		return -ENOMEM;
+	}
+
+	pfn_list = kcalloc(max_nr_gframes, sizeof(pfn_list[0]), GFP_KERNEL);
+	if (pfn_list == NULL) {
+		xen_unmap(mapping);
+		return -ENOMEM;
+	}
+
+	/* Consecutive Xen PFNs starting at the frame that contains @addr. */
+	for (idx = 0; idx < max_nr_gframes; idx++)
+		pfn_list[idx] = XEN_PFN_DOWN(addr) + idx;
+
+	xen_auto_xlat_grant_frames.vaddr = mapping;
+	xen_auto_xlat_grant_frames.pfn = pfn_list;
+	xen_auto_xlat_grant_frames.count = max_nr_gframes;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(gnttab_setup_auto_xlat_frames);
+
+/*
+ * Undo gnttab_setup_auto_xlat_frames(): free the PFN array, unmap the
+ * region and reset xen_auto_xlat_grant_frames. Does nothing if no frames
+ * are recorded.
+ */
+void gnttab_free_auto_xlat_frames(void)
+{
+	if (xen_auto_xlat_grant_frames.count == 0)
+		return;
+
+	kfree(xen_auto_xlat_grant_frames.pfn);
+	xen_unmap(xen_auto_xlat_grant_frames.vaddr);
+
+	/* Back to the "not set up" state. */
+	xen_auto_xlat_grant_frames.vaddr = NULL;
+	xen_auto_xlat_grant_frames.pfn = NULL;
+	xen_auto_xlat_grant_frames.count = 0;
+}
+EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames);
+
+/*
+ * Set the Private flag on each of the @nr_pages pages in @pages.
+ *
+ * When BITS_PER_LONG < 64, a zeroed struct xen_page_foreign is also
+ * allocated per page and stored as the page's private data.
+ *
+ * Returns 0 on success or -ENOMEM if such an allocation fails. Pages
+ * handled before the failure keep their Private flag; the caller is
+ * expected to clean up (see gnttab_pages_clear_private()).
+ */
+int gnttab_pages_set_private(int nr_pages, struct page **pages)
+{
+	int i;
+
+	for (i = 0; i < nr_pages; i++) {
+#if BITS_PER_LONG < 64
+		struct xen_page_foreign *foreign;
+
+		foreign = kzalloc(sizeof(*foreign), GFP_KERNEL);
+		if (!foreign)
+			return -ENOMEM;
+
+		set_page_private(pages[i], (unsigned long)foreign);
+#endif
+		SetPagePrivate(pages[i]);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(gnttab_pages_set_private);
+
+/**
+ * gnttab_alloc_pages - allocate ballooned pages prepared for grant mapping
+ * @nr_pages: how many pages to allocate
+ * @pages: array that receives the allocated pages
+ *
+ * Returns a negative error code if the balloon allocation or the private
+ * data setup fails; in the latter case the pages are released again.
+ */
+int gnttab_alloc_pages(int nr_pages, struct page **pages)
+{
+	int err = alloc_xenballooned_pages(nr_pages, pages);
+
+	if (err < 0)
+		return err;
+
+	err = gnttab_pages_set_private(nr_pages, pages);
+	if (err >= 0)
+		return err;
+
+	/* Private setup failed: hand the pages back before reporting it. */
+	gnttab_free_pages(nr_pages, pages);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(gnttab_alloc_pages);
+
+/*
+ * Clear the Private flag on every page in @pages that has it set. When
+ * BITS_PER_LONG < 64 the page's private data is freed first.
+ */
+void gnttab_pages_clear_private(int nr_pages, struct page **pages)
+{
+	int idx;
+
+	for (idx = 0; idx < nr_pages; idx++) {
+		struct page *page = pages[idx];
+
+		/* Pages without the flag carry nothing to release. */
+		if (!PagePrivate(page))
+			continue;
+
+#if BITS_PER_LONG < 64
+		kfree((void *)page_private(page));
+#endif
+		ClearPagePrivate(page);
+	}
+}
+EXPORT_SYMBOL_GPL(gnttab_pages_clear_private);
+
+/**
+ * gnttab_free_pages - free pages allocated by gnttab_alloc_pages()
+ * @nr_pages: number of pages to free
+ * @pages: the pages
+ *
+ * Clears the pages' private state and then returns them via
+ * free_xenballooned_pages().
+ */
+void gnttab_free_pages(int nr_pages, struct page **pages)
+{
+	gnttab_pages_clear_private(nr_pages, pages);
+	free_xenballooned_pages(nr_pages, pages);
+}
+EXPORT_SYMBOL_GPL(gnttab_free_pages);
+
+#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
+/**
+ * gnttab_dma_alloc_pages - alloc DMAable pages suitable for grant mapping into
+ * @args: arguments to the function
+ *
+ * Allocates a DMA buffer of @args->nr_pages pages (coherent or
+ * write-combined, depending on @args->coherent), fills @args->pages and
+ * @args->frames for it, scrubs the pages, gives their backing frames back
+ * to Xen and marks the pages private.
+ *
+ * Returns 0 on success, -ENOMEM if the page count is out of range or the
+ * DMA allocation fails, -EFAULT if the reservation could not be decreased,
+ * or the error from gnttab_pages_set_private(). On the two latter failures
+ * the buffer is released through gnttab_dma_free_pages().
+ */
+int gnttab_dma_alloc_pages(struct gnttab_dma_alloc_args *args)
+{
+	unsigned long pfn, start_pfn;
+	size_t size;
+	int i, ret;
+
+	/*
+	 * Reject negative page counts and counts whose byte size does not
+	 * fit the shift below; otherwise the size computation overflows and
+	 * the pfn loop would run far past the caller's arrays.
+	 */
+	if (args->nr_pages < 0 || args->nr_pages > (INT_MAX >> PAGE_SHIFT))
+		return -ENOMEM;
+
+	size = args->nr_pages << PAGE_SHIFT;
+	if (args->coherent)
+		args->vaddr = dma_alloc_coherent(args->dev, size,
+						 &args->dev_bus_addr,
+						 GFP_KERNEL | __GFP_NOWARN);
+	else
+		args->vaddr = dma_alloc_wc(args->dev, size,
+					   &args->dev_bus_addr,
+					   GFP_KERNEL | __GFP_NOWARN);
+	if (!args->vaddr) {
+		pr_debug("Failed to allocate DMA buffer of size %zu\n", size);
+		return -ENOMEM;
+	}
+
+	/* Record page and frame for every page of the buffer and scrub it. */
+	start_pfn = __phys_to_pfn(args->dev_bus_addr);
+	for (pfn = start_pfn, i = 0; pfn < start_pfn + args->nr_pages;
+			pfn++, i++) {
+		struct page *page = pfn_to_page(pfn);
+
+		args->pages[i] = page;
+		args->frames[i] = xen_page_to_gfn(page);
+		xenmem_reservation_scrub_page(page);
+	}
+
+	xenmem_reservation_va_mapping_reset(args->nr_pages, args->pages);
+
+	/* The call returns the number of frames processed; all must succeed. */
+	ret = xenmem_reservation_decrease(args->nr_pages, args->frames);
+	if (ret != args->nr_pages) {
+		pr_debug("Failed to decrease reservation for DMA buffer\n");
+		ret = -EFAULT;
+		goto fail;
+	}
+
+	ret = gnttab_pages_set_private(args->nr_pages, args->pages);
+	if (ret < 0)
+		goto fail;
+
+	return 0;
+
+fail:
+	gnttab_dma_free_pages(args);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(gnttab_dma_alloc_pages);
+
+/**
+ * gnttab_dma_free_pages - free DMAable pages
+ * @args: arguments to the function
+ *
+ * Clears the pages' private state, asks Xen to re-populate the frames,
+ * updates the VA mappings and finally frees the DMA buffer with the
+ * allocator matching @args->coherent.
+ *
+ * Returns 0 on success or -EFAULT if the reservation could not be fully
+ * increased; the DMA buffer is freed in either case.
+ */
+int gnttab_dma_free_pages(struct gnttab_dma_alloc_args *args)
+{
+	size_t size;
+	int i, ret;
+
+	gnttab_pages_clear_private(args->nr_pages, args->pages);
+
+	/* Rebuild the frame list from the pages' Xen PFNs. */
+	for (i = 0; i < args->nr_pages; i++)
+		args->frames[i] = page_to_xen_pfn(args->pages[i]);
+
+	ret = xenmem_reservation_increase(args->nr_pages, args->frames);
+	if (ret != args->nr_pages) {
+		pr_debug("Failed to increase reservation for DMA buffer\n");
+		ret = -EFAULT;
+	} else {
+		ret = 0;
+	}
+
+	xenmem_reservation_va_mapping_update(args->nr_pages, args->pages,
+					     args->frames);
+
+	size = args->nr_pages << PAGE_SHIFT;
+	if (args->coherent)
+		dma_free_coherent(args->dev, size,
+				  args->vaddr, args->dev_bus_addr);
+	else
+		dma_free_wc(args->dev, size,
+			    args->vaddr, args->dev_bus_addr);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(gnttab_dma_free_pages);
+#endif
+
+/* Handling of paged out grant targets (GNTST_eagain) */
+#define MAX_DELAY 256
+/*
+ * Re-issue a single grant-table operation for as long as it reports
+ * GNTST_eagain.
+ *
+ * @cmd:    grant-table operation to issue
+ * @gop:    the operation structure passed to the hypercall
+ * @status: points at the status field of @gop
+ * @func:   caller name, used only in the error message
+ *
+ * After each GNTST_eagain result the function sleeps for "delay"
+ * milliseconds, with delay growing by one each time. Once delay reaches
+ * MAX_DELAY it gives up, logs an error and sets *status to GNTST_bad_page.
+ * A non-zero return from the hypercall itself is treated as a BUG.
+ */
+static inline void
+gnttab_retry_eagain_gop(unsigned int cmd, void *gop, int16_t *status,
+						const char *func)
+{
+	unsigned delay = 1;
+
+	do {
+		BUG_ON(HYPERVISOR_grant_table_op(cmd, gop, 1));
+		if (*status == GNTST_eagain)
+			msleep(delay++);
+	} while ((*status == GNTST_eagain) && (delay < MAX_DELAY));
+
+	/* delay only grows on eagain, so reaching the cap means we gave up. */
+	if (delay >= MAX_DELAY) {
+		pr_err("%s: %s eagain grant\n", func, current->comm);
+		*status = GNTST_bad_page;
+	}
+}
+
+/*
+ * Issue @count map operations in one hypercall, then retry individually
+ * every operation that came back with GNTST_eagain. A failing batch
+ * hypercall is a BUG.
+ */
+void gnttab_batch_map(struct gnttab_map_grant_ref *batch, unsigned count)
+{
+	unsigned idx;
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, batch, count))
+		BUG();
+
+	for (idx = 0; idx < count; idx++) {
+		struct gnttab_map_grant_ref *op = &batch[idx];
+
+		if (op->status != GNTST_eagain)
+			continue;
+
+		gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, op,
+					&op->status, __func__);
+	}
+}
+EXPORT_SYMBOL_GPL(gnttab_batch_map);
+
+/*
+ * Issue @count copy operations in one hypercall, then retry individually
+ * every operation that came back with GNTST_eagain. A failing batch
+ * hypercall is a BUG.
+ */
+void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count)
+{
+	unsigned idx;
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_copy, batch, count))
+		BUG();
+
+	for (idx = 0; idx < count; idx++) {
+		struct gnttab_copy *op = &batch[idx];
+
+		if (op->status != GNTST_eagain)
+			continue;
+
+		gnttab_retry_eagain_gop(GNTTABOP_copy, op,
+					&op->status, __func__);
+	}
+}
+EXPORT_SYMBOL_GPL(gnttab_batch_copy);
+
+/*
+ * Call @fn once for every Xen-page-sized piece of the byte range
+ * [@offset, @offset + @len) inside @page. The range is first clipped to
+ * the end of the page. @fn receives the GFN, the offset within that Xen
+ * page, the length of the piece and @data.
+ */
+void gnttab_foreach_grant_in_range(struct page *page,
+				   unsigned int offset,
+				   unsigned int len,
+				   xen_grant_fn_t fn,
+				   void *data)
+{
+	unsigned long xen_pfn = page_to_xen_pfn(page) + XEN_PFN_DOWN(offset);
+	unsigned int goffset = xen_offset_in_page(offset);
+	unsigned int chunk;
+
+	len = min_t(unsigned int, PAGE_SIZE - offset, len);
+
+	/* Only the first piece can start at a non-zero offset. */
+	for (; len; len -= chunk, xen_pfn++, goffset = 0) {
+		chunk = min_t(unsigned int, XEN_PAGE_SIZE - goffset, len);
+		fn(pfn_to_gfn(xen_pfn), goffset, chunk, data);
+	}
+}
+EXPORT_SYMBOL_GPL(gnttab_foreach_grant_in_range);
+
+/*
+ * Call @fn for @nr_grefs consecutive Xen-page-sized grants laid out over
+ * the pages in @pages, XEN_PFN_PER_PAGE grants per page. @fn receives the
+ * GFN, the offset of the grant within its page, XEN_PAGE_SIZE and @data.
+ */
+void gnttab_foreach_grant(struct page **pages,
+			  unsigned int nr_grefs,
+			  xen_grant_fn_t fn,
+			  void *data)
+{
+	unsigned long xen_pfn = 0;
+	unsigned int goffset = 0;
+	unsigned int idx;
+
+	for (idx = 0; idx < nr_grefs;
+	     idx++, xen_pfn++, goffset += XEN_PAGE_SIZE) {
+		/* First grant of the next page: restart from that page. */
+		if (!(idx % XEN_PFN_PER_PAGE)) {
+			xen_pfn = page_to_xen_pfn(pages[idx / XEN_PFN_PER_PAGE]);
+			goffset = 0;
+		}
+
+		fn(pfn_to_gfn(xen_pfn), goffset, XEN_PAGE_SIZE, data);
+	}
+}
+
+/*
+ * Map @count grant references described by @map_ops and post-process the
+ * per-operation results.
+ *
+ * Returns the hypercall's error code if the batched map hypercall itself
+ * fails; otherwise returns the result of set_foreign_p2m_mapping().
+ * Per-operation failures are left in map_ops[i].status for the caller.
+ */
+int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
+		    struct gnttab_map_grant_ref *kmap_ops,
+		    struct page **pages, unsigned int count)
+{
+	int i, ret;
+
+	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < count; i++) {
+		switch (map_ops[i].status) {
+		case GNTST_okay:
+		{
+			struct xen_page_foreign *foreign;
+
+			/* Remember which domain and gref back this page. */
+			SetPageForeign(pages[i]);
+			foreign = xen_page_foreign(pages[i]);
+			foreign->domid = map_ops[i].dom;
+			foreign->gref = map_ops[i].ref;
+			break;
+		}
+
+		case GNTST_no_device_space:
+			pr_warn_ratelimited("maptrack limit reached, can't map all guest pages\n");
+			break;
+
+		case GNTST_eagain:
+			/* Retry eagain maps */
+			gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref,
+						map_ops + i,
+						&map_ops[i].status, __func__);
+			/* Test status in next loop iteration. */
+			i--;
+			break;
+
+		default:
+			break;
+		}
+	}
+
+	return set_foreign_p2m_mapping(map_ops, kmap_ops, pages, count);
+}
+EXPORT_SYMBOL_GPL(gnttab_map_refs);
+
+/*
+ * Unmap @count grant references described by @unmap_ops, clear the
+ * Foreign flag on the pages and drop their foreign p2m mappings.
+ *
+ * Returns the hypercall's error code if the unmap hypercall fails,
+ * otherwise the result of clear_foreign_p2m_mapping().
+ */
+int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
+		      struct gnttab_unmap_grant_ref *kunmap_ops,
+		      struct page **pages, unsigned int count)
+{
+	unsigned int idx;
+	int err;
+
+	err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops,
+					count);
+	if (err)
+		return err;
+
+	for (idx = 0; idx != count; idx++)
+		ClearPageForeign(pages[idx]);
+
+	return clear_foreign_p2m_mapping(unmap_ops, kunmap_ops, pages, count);
+}
+EXPORT_SYMBOL_GPL(gnttab_unmap_refs);
+
+#define GNTTAB_UNMAP_REFS_DELAY 5
+
+static void __gnttab_unmap_refs_async(struct gntab_unmap_queue_data* item);
+
+/*
+ * Delayed-work handler: bump the retry age of the queued unmap request
+ * (without wrapping past UINT_MAX) and try the unmap again.
+ */
+static void gnttab_unmap_work(struct work_struct *work)
+{
+	struct gntab_unmap_queue_data *item;
+
+	item = container_of(work, struct gntab_unmap_queue_data,
+			    gnttab_work.work);
+
+	if (item->age != UINT_MAX)
+		item->age++;
+
+	__gnttab_unmap_refs_async(item);
+}
+
+/*
+ * Try to unmap the grants described by @item.
+ *
+ * If any of the pages still has a page count above one, the attempt is
+ * postponed: the item's delayed work is scheduled with a delay of
+ * GNTTAB_UNMAP_REFS_DELAY * (age + 1) milliseconds and nothing else is
+ * done. Otherwise the grants are unmapped and item->done() is called with
+ * the result.
+ */
+static void __gnttab_unmap_refs_async(struct gntab_unmap_queue_data* item)
+{
+	int ret;
+	int pc;
+
+	for (pc = 0; pc < item->count; pc++) {
+		if (page_count(item->pages[pc]) > 1) {
+			/* Back off longer the more often we had to retry. */
+			unsigned long delay = GNTTAB_UNMAP_REFS_DELAY * (item->age + 1);
+			schedule_delayed_work(&item->gnttab_work,
+					      msecs_to_jiffies(delay));
+			return;
+		}
+	}
+
+	ret = gnttab_unmap_refs(item->unmap_ops, item->kunmap_ops,
+				item->pages, item->count);
+	item->done(ret, item);
+}
+
+/*
+ * Start an asynchronous unmap of the grants described by @item: reset its
+ * retry age, prepare its delayed work and make the first attempt.
+ * Completion is reported through item->done().
+ */
+void gnttab_unmap_refs_async(struct gntab_unmap_queue_data* item)
+{
+	item->age = 0;
+	INIT_DELAYED_WORK(&item->gnttab_work, gnttab_unmap_work);
+
+	__gnttab_unmap_refs_async(item);
+}
+EXPORT_SYMBOL_GPL(gnttab_unmap_refs_async);
+
+/*
+ * done() callback used by gnttab_unmap_refs_sync(): store the unmap result
+ * in the waiter's data and signal its completion.
+ */
+static void unmap_refs_callback(int result,
+		struct gntab_unmap_queue_data *data)
+{
+	struct unmap_refs_callback_data *waiter = data->data;
+
+	waiter->result = result;
+	complete(&waiter->completion);
+}
+
+/*
+ * Unmap the grants described by @item and wait until the operation has
+ * completed. Overwrites item->data and item->done. Returns the unmap
+ * result delivered to the completion callback.
+ */
+int gnttab_unmap_refs_sync(struct gntab_unmap_queue_data *item)
+{
+	struct unmap_refs_callback_data waiter;
+
+	init_completion(&waiter.completion);
+
+	item->done = &unmap_refs_callback;
+	item->data = &waiter;
+
+	gnttab_unmap_refs_async(item);
+	wait_for_completion(&waiter.completion);
+
+	return waiter.result;
+}
+EXPORT_SYMBOL_GPL(gnttab_unmap_refs_sync);
+
+/*
+ * Number of status frames corresponding to @nr_grant_frames grant frames,
+ * i.e. gnttab_frames() with SPP as the unit. gnttab_interface must have
+ * been selected already.
+ */
+static unsigned int nr_status_frames(unsigned int nr_grant_frames)
+{
+	BUG_ON(gnttab_interface == NULL);
+	return gnttab_frames(nr_grant_frames, SPP);
+}
+
+/*
+ * map_frames hook for the v1 layout: map the @nr_gframes shared frames in
+ * @frames at gnttab_shared.addr. A mapping failure is a BUG, so this
+ * always returns 0.
+ */
+static int gnttab_map_frames_v1(xen_pfn_t *frames, unsigned int nr_gframes)
+{
+	int err = arch_gnttab_map_shared(frames, nr_gframes,
+					 gnttab_max_grant_frames(),
+					 &gnttab_shared.addr);
+
+	BUG_ON(err);
+
+	return 0;
+}
+
+/* unmap_frames hook for the v1 layout: unmap the shared grant frames. */
+static void gnttab_unmap_frames_v1(void)
+{
+	arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames);
+}
+
+/*
+ * map_frames hook for the v2 layout: fetch and map the status frames, then
+ * map the @nr_gframes shared frames in @frames.
+ *
+ * Returns 0 on success, -ENOMEM if the temporary status-frame list cannot
+ * be allocated, or -ENOSYS if GNTTABOP_get_status_frames is not supported.
+ * Any other hypercall or mapping failure is a BUG.
+ */
+static int gnttab_map_frames_v2(xen_pfn_t *frames, unsigned int nr_gframes)
+{
+	uint64_t *sframes;
+	unsigned int nr_sframes;
+	struct gnttab_get_status_frames getframes;
+	int rc;
+
+	nr_sframes = nr_status_frames(nr_gframes);
+
+	/* No need for kzalloc as it is initialized in following hypercall
+	 * GNTTABOP_get_status_frames.
+	 */
+	sframes = kmalloc_array(nr_sframes, sizeof(uint64_t), GFP_ATOMIC);
+	if (!sframes)
+		return -ENOMEM;
+
+	getframes.dom        = DOMID_SELF;
+	getframes.nr_frames  = nr_sframes;
+	set_xen_guest_handle(getframes.frame_list, sframes);
+
+	rc = HYPERVISOR_grant_table_op(GNTTABOP_get_status_frames,
+				       &getframes, 1);
+	if (rc == -ENOSYS) {
+		kfree(sframes);
+		return -ENOSYS;
+	}
+
+	BUG_ON(rc || getframes.status);
+
+	/* Map the status frames first; the list is not needed afterwards. */
+	rc = arch_gnttab_map_status(sframes, nr_sframes,
+				    nr_status_frames(gnttab_max_grant_frames()),
+				    &grstatus);
+	BUG_ON(rc);
+	kfree(sframes);
+
+	rc = arch_gnttab_map_shared(frames, nr_gframes,
+				    gnttab_max_grant_frames(),
+				    &gnttab_shared.addr);
+	BUG_ON(rc);
+
+	return 0;
+}
+
+/*
+ * unmap_frames hook for the v2 layout: unmap the shared grant frames and
+ * the status frames.
+ */
+static void gnttab_unmap_frames_v2(void)
+{
+	arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames);
+	arch_gnttab_unmap(grstatus, nr_status_frames(nr_grant_frames));
+}
+
+/*
+ * Make grant frames @start_idx..@end_idx available to the kernel.
+ *
+ * With XENFEAT_auto_translated_physmap each frame is added to the physmap
+ * at the PFN recorded in xen_auto_xlat_grant_frames; the result of the
+ * last XENMEM_add_to_physmap call is returned.
+ *
+ * Otherwise the frame list for frames 0..@end_idx is obtained with
+ * GNTTABOP_setup_table and handed to the interface's map_frames hook.
+ * Returns -ENOMEM if the frame list cannot be allocated, -ENOSYS if the
+ * hypercall is not supported, or the hook's return value. Any other
+ * hypercall failure is a BUG.
+ */
+static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+{
+	struct gnttab_setup_table setup;
+	xen_pfn_t *frames;
+	unsigned int nr_gframes = end_idx + 1;
+	int rc;
+
+	if (xen_feature(XENFEAT_auto_translated_physmap)) {
+		struct xen_add_to_physmap xatp;
+		unsigned int i = end_idx;
+		rc = 0;
+		BUG_ON(xen_auto_xlat_grant_frames.count < nr_gframes);
+		/*
+		 * Loop backwards, so that the first hypercall has the largest
+		 * index, ensuring that the table will grow only once.
+		 */
+		do {
+			xatp.domid = DOMID_SELF;
+			xatp.idx = i;
+			xatp.space = XENMAPSPACE_grant_table;
+			xatp.gpfn = xen_auto_xlat_grant_frames.pfn[i];
+			rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
+			if (rc != 0) {
+				pr_warn("grant table add_to_physmap failed, err=%d\n",
+					rc);
+				break;
+			}
+		} while (i-- > start_idx);
+
+		return rc;
+	}
+
+	/* No need for kzalloc as it is initialized in following hypercall
+	 * GNTTABOP_setup_table.
+	 *
+	 * Size the array by its element type (xen_pfn_t) so the allocation
+	 * always matches what is written through the guest handle.
+	 */
+	frames = kmalloc_array(nr_gframes, sizeof(*frames), GFP_ATOMIC);
+	if (!frames)
+		return -ENOMEM;
+
+	setup.dom        = DOMID_SELF;
+	setup.nr_frames  = nr_gframes;
+	set_xen_guest_handle(setup.frame_list, frames);
+
+	rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
+	if (rc == -ENOSYS) {
+		kfree(frames);
+		return -ENOSYS;
+	}
+
+	BUG_ON(rc || setup.status);
+
+	rc = gnttab_interface->map_frames(frames, nr_gframes);
+
+	kfree(frames);
+
+	return rc;
+}
+
+/*
+ * Operations for the version 1 grant table layout. One grant frame holds
+ * XEN_PAGE_SIZE / sizeof(struct grant_entry_v1) entries.
+ */
+static const struct gnttab_ops gnttab_v1_ops = {
+	.version			= 1,
+	.grefs_per_grant_frame		= XEN_PAGE_SIZE /
+					  sizeof(struct grant_entry_v1),
+	.map_frames			= gnttab_map_frames_v1,
+	.unmap_frames			= gnttab_unmap_frames_v1,
+	.update_entry			= gnttab_update_entry_v1,
+	.end_foreign_access_ref		= gnttab_end_foreign_access_ref_v1,
+	.end_foreign_transfer_ref	= gnttab_end_foreign_transfer_ref_v1,
+	.read_frame			= gnttab_read_frame_v1,
+};
+
+/*
+ * Operations for the version 2 grant table layout. One grant frame holds
+ * XEN_PAGE_SIZE / sizeof(union grant_entry_v2) entries.
+ */
+static const struct gnttab_ops gnttab_v2_ops = {
+	.version			= 2,
+	.grefs_per_grant_frame		= XEN_PAGE_SIZE /
+					  sizeof(union grant_entry_v2),
+	.map_frames			= gnttab_map_frames_v2,
+	.unmap_frames			= gnttab_unmap_frames_v2,
+	.update_entry			= gnttab_update_entry_v2,
+	.end_foreign_access_ref		= gnttab_end_foreign_access_ref_v2,
+	.end_foreign_transfer_ref	= gnttab_end_foreign_transfer_ref_v2,
+	.read_frame			= gnttab_read_frame_v2,
+};
+
+/*
+ * Decide whether the version 2 grant table layout is required.
+ *
+ * On x86 PV domains the machine address width is read from the Xen CPUID
+ * leaves; v2 is needed when it exceeds 32 + PAGE_SHIFT bits. If that
+ * information is not available, v1 is used. In all other cases v2 is
+ * needed when max_possible_pfn does not fit into 32 bits.
+ */
+static bool gnttab_need_v2(void)
+{
+#ifdef CONFIG_X86
+	uint32_t base, width;
+
+	if (xen_pv_domain()) {
+		base = xen_cpuid_base();
+		if (cpuid_eax(base) < 5)
+			return false;	/* Information not available, use V1. */
+		width = cpuid_ebx(base + 5) &
+			XEN_CPUID_MACHINE_ADDRESS_WIDTH_MASK;
+		return width > 32 + PAGE_SHIFT;
+	}
+#endif
+	return !!(max_possible_pfn >> 32);
+}
+
+/*
+ * Negotiate the grant table version with the hypervisor and select the
+ * matching gnttab_interface. Version 2 is used only if it was requested
+ * and GNTTABOP_set_version succeeded; otherwise the v1 operations are
+ * selected.
+ */
+static void gnttab_request_version(void)
+{
+	struct gnttab_set_version gsv;
+	long rc;
+
+	gsv.version = gnttab_need_v2() ? 2 : 1;
+
+	/* Boot parameter overrides automatic selection. */
+	if (xen_gnttab_version >= 1 && xen_gnttab_version <= 2)
+		gsv.version = xen_gnttab_version;
+
+	rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1);
+
+	gnttab_interface = (rc == 0 && gsv.version == 2) ?
+		&gnttab_v2_ops : &gnttab_v1_ops;
+
+	pr_info("Grant tables using version %d layout\n",
+		gnttab_interface->version);
+}
+
+/*
+ * Map the grant frames currently in use (0 .. nr_grant_frames - 1).
+ *
+ * Returns -ENOSYS if more frames are in use than the hypervisor allows,
+ * -ENOMEM if an auto-translated guest has no grant-frame mapping recorded,
+ * or the result of gnttab_map().
+ */
+static int gnttab_setup(void)
+{
+	unsigned int max_nr_gframes = gnttab_max_grant_frames();
+
+	if (nr_grant_frames > max_nr_gframes)
+		return -ENOSYS;
+
+	if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) {
+		/* Adopt the mapping prepared by gnttab_setup_auto_xlat_frames(). */
+		gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr;
+		if (!gnttab_shared.addr) {
+			pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n",
+				(unsigned long)xen_auto_xlat_grant_frames.vaddr);
+			return -ENOMEM;
+		}
+	}
+
+	return gnttab_map(0, nr_grant_frames - 1);
+}
+
+/*
+ * Re-establish the grant table: renegotiate the table version and map the
+ * frames again. Returns the result of gnttab_setup().
+ */
+int gnttab_resume(void)
+{
+	gnttab_request_version();
+	return gnttab_setup();
+}
+
+/*
+ * Unmap the grant frames, except on auto-translated guests where nothing
+ * is unmapped. Always returns 0.
+ */
+int gnttab_suspend(void)
+{
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return 0;
+
+	gnttab_interface->unmap_frames();
+
+	return 0;
+}
+
+/*
+ * Grow the grant table by enough frames to hold @req_entries more entries.
+ *
+ * Returns -ENOSPC if that would exceed gnttab_max_grant_frames(); otherwise
+ * the error from gnttab_map() or, if mapping succeeded, the result of
+ * grow_gnttab_list().
+ */
+static int gnttab_expand(unsigned int req_entries)
+{
+	unsigned int cur, extra;
+	int err;
+
+	BUG_ON(gnttab_interface == NULL);
+
+	cur = nr_grant_frames;
+	/* Frames needed for req_entries, rounded up. */
+	extra = ((req_entries + gnttab_interface->grefs_per_grant_frame - 1) /
+		 gnttab_interface->grefs_per_grant_frame);
+
+	if (cur + extra > gnttab_max_grant_frames()) {
+		pr_warn_ratelimited("xen/grant-table: max_grant_frames reached"
+				    " cur=%u extra=%u limit=%u"
+				    " gnttab_free_count=%u req_entries=%u\n",
+				    cur, extra, gnttab_max_grant_frames(),
+				    gnttab_free_count, req_entries);
+		return -ENOSPC;
+	}
+
+	err = gnttab_map(cur, cur + extra - 1);
+	if (err)
+		return err;
+
+	return grow_gnttab_list(extra);
+}
+
+/*
+ * Initialize the grant table subsystem.
+ *
+ * Negotiates the table version, allocates the free-list bookkeeping for a
+ * single initial grant frame, performs the architecture setup, maps the
+ * frame and builds the initial free list from all entries except the
+ * first NR_RESERVED_ENTRIES.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, the error from
+ * arch_gnttab_init(), or -ENODEV if gnttab_setup() fails.
+ */
+int gnttab_init(void)
+{
+	int i;
+	unsigned long max_nr_grant_frames;
+	unsigned int max_nr_glist_frames, nr_glist_frames;
+	unsigned int nr_init_grefs;
+	int ret;
+
+	gnttab_request_version();
+	max_nr_grant_frames = gnttab_max_grant_frames();
+	nr_grant_frames = 1;
+
+	/* Determine the maximum number of frames required for the
+	 * grant reference free list on the current hypervisor.
+	 */
+	BUG_ON(gnttab_interface == NULL);
+	max_nr_glist_frames = (max_nr_grant_frames *
+			       gnttab_interface->grefs_per_grant_frame / RPP);
+
+	gnttab_list = kmalloc_array(max_nr_glist_frames,
+				    sizeof(grant_ref_t *),
+				    GFP_KERNEL);
+	if (gnttab_list == NULL)
+		return -ENOMEM;
+
+	/* Only the list pages for the initial grant frame are allocated now. */
+	nr_glist_frames = gnttab_frames(nr_grant_frames, RPP);
+	for (i = 0; i < nr_glist_frames; i++) {
+		gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL);
+		if (gnttab_list[i] == NULL) {
+			ret = -ENOMEM;
+			goto ini_nomem;
+		}
+	}
+
+	ret = arch_gnttab_init(max_nr_grant_frames,
+			       nr_status_frames(max_nr_grant_frames));
+	if (ret < 0)
+		goto ini_nomem;
+
+	if (gnttab_setup() < 0) {
+		ret = -ENODEV;
+		goto ini_nomem;
+	}
+
+	nr_init_grefs = nr_grant_frames *
+			gnttab_interface->grefs_per_grant_frame;
+
+	/* Chain all non-reserved entries into the initial free list. */
+	for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++)
+		gnttab_entry(i) = i + 1;
+
+	gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END;
+	gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
+	gnttab_free_head  = NR_RESERVED_ENTRIES;
+
+	printk("Grant table initialized\n");
+	return 0;
+
+ ini_nomem:
+	/* i is one past the last list page that was allocated. */
+	for (i--; i >= 0; i--)
+		free_page((unsigned long)gnttab_list[i]);
+	kfree(gnttab_list);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(gnttab_init);
+
+/*
+ * Initcall wrapper around gnttab_init(). Returns -ENODEV when not running
+ * on Xen and 0 without doing anything for HVM guests that are not PVH.
+ */
+static int __gnttab_init(void)
+{
+	if (!xen_domain())
+		return -ENODEV;
+
+	/* Delay grant-table initialization in the PV on HVM case */
+	if (!xen_hvm_domain() || xen_pvh_domain())
+		return gnttab_init();
+
+	return 0;
+}
+/* Starts after core_initcall so that xen_pvh_gnttab_setup can be called
+ * beforehand to initialize xen_auto_xlat_grant_frames. */
+core_initcall_sync(__gnttab_init);
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
new file mode 100644
index 000000000..5bb01a62f
--- /dev/null
+++ b/drivers/xen/manage.c
@@ -0,0 +1,388 @@
+/*
+ * Handle external requests for shutdown, reboot and sysrq
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/reboot.h>
+#include <linux/sysrq.h>
+#include <linux/stop_machine.h>
+#include <linux/freezer.h>
+#include <linux/syscore_ops.h>
+#include <linux/export.h>
+
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+#include <xen/grant_table.h>
+#include <xen/events.h>
+#include <xen/hvc-console.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+/*
+ * Shutdown request currently being processed; SHUTDOWN_INVALID means that
+ * no request is in progress.
+ */
+enum shutdown_state {
+	SHUTDOWN_INVALID = -1,
+	SHUTDOWN_POWEROFF = 0,
+	SHUTDOWN_SUSPEND = 2,
+	/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
+	   report a crash, not be instructed to crash!
+	   HALT is the same as POWEROFF, as far as we're concerned.  The tools use
+	   the distinction when we return the reason code to them.  */
+	 SHUTDOWN_HALT = 4,
+};
+
+/*
+ * Ignore multiple shutdown requests: shutdown_handler() returns early
+ * while this is anything other than SHUTDOWN_INVALID.
+ */
+static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
+
+/* State shared between do_suspend() and xen_suspend(). */
+struct suspend_info {
+	/* Result of HYPERVISOR_suspend(); non-zero means the suspend was cancelled. */
+	int cancelled;
+};
+
+/* Notifier chain called by do_suspend() after the suspend attempt. */
+static RAW_NOTIFIER_HEAD(xen_resume_notifier);
+
+/* Add @nb to the xen_resume_notifier chain. */
+void xen_resume_notifier_register(struct notifier_block *nb)
+{
+	raw_notifier_chain_register(&xen_resume_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(xen_resume_notifier_register);
+
+/* Remove @nb from the xen_resume_notifier chain. */
+void xen_resume_notifier_unregister(struct notifier_block *nb)
+{
+	raw_notifier_chain_unregister(&xen_resume_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister);
+
+#ifdef CONFIG_HIBERNATE_CALLBACKS
+/*
+ * Perform the actual suspend; run through stop_machine() by do_suspend()
+ * with interrupts disabled.
+ *
+ * @data points to a struct suspend_info whose ->cancelled field receives
+ * the result of HYPERVISOR_suspend().
+ *
+ * Returns 0, or the error from syscore_suspend() if that fails (in which
+ * case no suspend hypercall is made).
+ */
+static int xen_suspend(void *data)
+{
+	struct suspend_info *si = data;
+	int err;
+
+	BUG_ON(!irqs_disabled());
+
+	err = syscore_suspend();
+	if (err) {
+		pr_err("%s: system core suspend failed: %d\n", __func__, err);
+		return err;
+	}
+
+	gnttab_suspend();
+	xen_manage_runstate_time(-1);
+	xen_arch_pre_suspend();
+
+	/* PV domains pass the GFN of their start info; others pass 0. */
+	si->cancelled = HYPERVISOR_suspend(xen_pv_domain()
+                                           ? virt_to_gfn(xen_start_info)
+                                           : 0);
+
+	xen_arch_post_suspend(si->cancelled);
+	xen_manage_runstate_time(si->cancelled ? 1 : 0);
+	gnttab_resume();
+
+	/* IRQs and timers are only re-established after a real suspend. */
+	if (!si->cancelled) {
+		xen_irq_resume();
+		xen_timer_resume();
+	}
+
+	syscore_resume();
+
+	return 0;
+}
+
+/*
+ * Handle a "suspend" request: freeze user space and kernel threads,
+ * suspend devices and xenstore, run xen_suspend() on CPU 0 via
+ * stop_machine(), then resume everything in reverse order.
+ *
+ * si.cancelled selects between the "thaw" and "restore" resume variants
+ * and between xs_suspend_cancel() and xs_resume(). shutting_down is set
+ * to SHUTDOWN_SUSPEND for the duration and reset on every exit path.
+ */
+static void do_suspend(void)
+{
+	int err;
+	struct suspend_info si;
+
+	shutting_down = SHUTDOWN_SUSPEND;
+
+	err = freeze_processes();
+	if (err) {
+		pr_err("%s: freeze processes failed %d\n", __func__, err);
+		goto out;
+	}
+
+	err = freeze_kernel_threads();
+	if (err) {
+		pr_err("%s: freeze kernel threads failed %d\n", __func__, err);
+		goto out_thaw;
+	}
+
+	err = dpm_suspend_start(PMSG_FREEZE);
+	if (err) {
+		pr_err("%s: dpm_suspend_start %d\n", __func__, err);
+		goto out_thaw;
+	}
+
+	printk(KERN_DEBUG "suspending xenstore...\n");
+	xs_suspend();
+
+	err = dpm_suspend_end(PMSG_FREEZE);
+	if (err) {
+		pr_err("dpm_suspend_end failed: %d\n", err);
+		si.cancelled = 0;
+		goto out_resume;
+	}
+
+	xen_arch_suspend();
+
+	/* Treated as cancelled unless xen_suspend() stores a different result. */
+	si.cancelled = 1;
+
+	err = stop_machine(xen_suspend, &si, cpumask_of(0));
+
+	/* Resume console as early as possible. */
+	if (!si.cancelled)
+		xen_console_resume();
+
+	raw_notifier_call_chain(&xen_resume_notifier, 0, NULL);
+
+	dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE);
+
+	if (err) {
+		pr_err("failed to start xen_suspend: %d\n", err);
+		si.cancelled = 1;
+	}
+
+	xen_arch_resume();
+
+out_resume:
+	if (!si.cancelled)
+		xs_resume();
+	else
+		xs_suspend_cancel();
+
+	dpm_resume_end(si.cancelled ? PMSG_THAW : PMSG_RESTORE);
+
+out_thaw:
+	thaw_processes();
+out:
+	shutting_down = SHUTDOWN_INVALID;
+}
+#endif	/* CONFIG_HIBERNATE_CALLBACKS */
+
+/* One entry of the shutdown command table. */
+struct shutdown_handler {
+#define SHUTDOWN_CMD_SIZE 11
+	/* Command string as read from control/shutdown. */
+	const char command[SHUTDOWN_CMD_SIZE];
+	/* If true, a "feature-<command>" node is written at setup time. */
+	bool flag;
+	/* Function invoked when the command is received. */
+	void (*cb)(void);
+};
+
+/*
+ * Reboot notifier: once the system goes down, halts or powers off, record
+ * that a shutdown is in progress. Always returns NOTIFY_DONE.
+ */
+static int poweroff_nb(struct notifier_block *cb, unsigned long code, void *unused)
+{
+	if (code == SYS_DOWN || code == SYS_HALT || code == SYS_POWER_OFF)
+		shutting_down = SHUTDOWN_POWEROFF;
+
+	return NOTIFY_DONE;
+}
+/*
+ * Handle "poweroff" and "halt" requests by starting an orderly poweroff.
+ * The poweroff is forced while the system is still booting; while the
+ * system is running it is not forced. In any other system state the
+ * request is ignored.
+ */
+static void do_poweroff(void)
+{
+	switch (system_state) {
+	case SYSTEM_BOOTING:
+	case SYSTEM_SCHEDULING:
+		orderly_poweroff(true);
+		break;
+	case SYSTEM_RUNNING:
+		orderly_poweroff(false);
+		break;
+	default:
+		/* Don't do it when we are halting/rebooting. */
+		pr_info("Ignoring Xen toolstack shutdown.\n");
+		break;
+	}
+}
+
+/*
+ * Handle a "reboot" request: mark a shutdown as in progress and trigger
+ * ctrl_alt_del().
+ */
+static void do_reboot(void)
+{
+	shutting_down = SHUTDOWN_POWEROFF; /* ? */
+	ctrl_alt_del();
+}
+
+/*
+ * Commands accepted on control/shutdown: { command, flag, callback }.
+ * "suspend" is only available with CONFIG_HIBERNATE_CALLBACKS.
+ */
+static struct shutdown_handler shutdown_handlers[] = {
+	{ "poweroff",	true,	do_poweroff },
+	{ "halt",	false,	do_poweroff },
+	{ "reboot",	true,	do_reboot   },
+#ifdef CONFIG_HIBERNATE_CALLBACKS
+	{ "suspend",	true,	do_suspend  },
+#endif
+};
+
+/*
+ * Xenbus watch callback for control/shutdown.
+ *
+ * Reads the requested command inside a transaction, acknowledges it by
+ * writing an empty string if it is one of shutdown_handlers[], and then
+ * invokes the matching callback. Unknown commands are logged and ignored.
+ * Nothing is done while another shutdown request is being processed.
+ */
+static void shutdown_handler(struct xenbus_watch *watch,
+			     const char *path, const char *token)
+{
+	char *str;
+	struct xenbus_transaction xbt;
+	int err;
+	int idx;
+
+	if (shutting_down != SHUTDOWN_INVALID)
+		return;
+
+ again:
+	err = xenbus_transaction_start(&xbt);
+	if (err)
+		return;
+
+	str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
+	/* Ignore read errors and empty reads. */
+	if (XENBUS_IS_ERR_READ(str)) {
+		xenbus_transaction_end(xbt, 1);
+		return;
+	}
+
+	/* idx == ARRAY_SIZE(shutdown_handlers) afterwards means "no match". */
+	for (idx = 0; idx < ARRAY_SIZE(shutdown_handlers); idx++) {
+		if (strcmp(str, shutdown_handlers[idx].command) == 0)
+			break;
+	}
+
+	/* Only acknowledge commands which we are prepared to handle. */
+	if (idx < ARRAY_SIZE(shutdown_handlers))
+		xenbus_write(xbt, "control", "shutdown", "");
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN) {
+		/* Redo the whole transaction; the string is read again. */
+		kfree(str);
+		goto again;
+	}
+
+	if (idx < ARRAY_SIZE(shutdown_handlers)) {
+		shutdown_handlers[idx].cb();
+	} else {
+		pr_info("Ignoring shutdown request: %s\n", str);
+		shutting_down = SHUTDOWN_INVALID;
+	}
+
+	kfree(str);
+}
+
+#ifdef CONFIG_MAGIC_SYSRQ
+/*
+ * Xenbus watch callback for control/sysrq.
+ *
+ * Reads a single key character inside a transaction, acknowledges a
+ * non-NUL key by writing '\0' back, and passes the key to handle_sysrq()
+ * once the transaction has ended.
+ */
+static void sysrq_handler(struct xenbus_watch *watch, const char *path,
+			  const char *token)
+{
+	char sysrq_key = '\0';
+	struct xenbus_transaction xbt;
+	int err;
+
+ again:
+	err = xenbus_transaction_start(&xbt);
+	if (err)
+		return;
+	err = xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key);
+	if (err < 0) {
+		/*
+		 * The Xenstore watch fires directly after registering it and
+		 * after a suspend/resume cycle. So ENOENT is no error but
+		 * might happen in those cases. ERANGE is observed when we get
+		 * an empty value (''), this happens when we acknowledge the
+		 * request by writing '\0' below.
+		 */
+		if (err != -ENOENT && err != -ERANGE)
+			pr_err("Error %d reading sysrq code in control/sysrq\n",
+			       err);
+		xenbus_transaction_end(xbt, 1);
+		return;
+	}
+
+	if (sysrq_key != '\0') {
+		err = xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
+		if (err) {
+			pr_err("%s: Error %d writing sysrq in control/sysrq\n",
+			       __func__, err);
+			xenbus_transaction_end(xbt, 1);
+			return;
+		}
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN)
+		goto again;
+
+	if (sysrq_key != '\0')
+		handle_sysrq(sysrq_key);
+}
+
+/* Watch on control/sysrq, handled by sysrq_handler(). */
+static struct xenbus_watch sysrq_watch = {
+	.node = "control/sysrq",
+	.callback = sysrq_handler
+};
+#endif
+
+/* Watch on control/shutdown, handled by shutdown_handler(). */
+static struct xenbus_watch shutdown_watch = {
+	.node = "control/shutdown",
+	.callback = shutdown_handler
+};
+
+/* Reboot notifier block that routes to poweroff_nb(). */
+static struct notifier_block xen_reboot_nb = {
+	.notifier_call = poweroff_nb,
+};
+
+/*
+ * Register the shutdown watch (and, with CONFIG_MAGIC_SYSRQ, the sysrq
+ * watch) and write a "control/feature-<command>" node with value 1 for
+ * every handler whose flag is set.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int setup_shutdown_watcher(void)
+{
+	int err;
+	int idx;
+/* Room for "feature-" plus the longest command, including the NUL. */
+#define FEATURE_PATH_SIZE (SHUTDOWN_CMD_SIZE + sizeof("feature-"))
+	char node[FEATURE_PATH_SIZE];
+
+	err = register_xenbus_watch(&shutdown_watch);
+	if (err) {
+		pr_err("Failed to set shutdown watcher\n");
+		return err;
+	}
+
+
+#ifdef CONFIG_MAGIC_SYSRQ
+	err = register_xenbus_watch(&sysrq_watch);
+	if (err) {
+		pr_err("Failed to set sysrq watcher\n");
+		return err;
+	}
+#endif
+
+	for (idx = 0; idx < ARRAY_SIZE(shutdown_handlers); idx++) {
+		/* Handlers without the flag get no feature node. */
+		if (!shutdown_handlers[idx].flag)
+			continue;
+		snprintf(node, FEATURE_PATH_SIZE, "feature-%s",
+			 shutdown_handlers[idx].command);
+		err = xenbus_printf(XBT_NIL, "control", node, "%u", 1);
+		if (err) {
+			pr_err("%s: Error %d writing %s\n", __func__,
+				err, node);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Xenstore notifier callback: set up the shutdown/sysrq watches. The
+ * result of setup_shutdown_watcher() is not propagated; NOTIFY_DONE is
+ * always returned.
+ */
+static int shutdown_event(struct notifier_block *notifier,
+			  unsigned long event,
+			  void *data)
+{
+	setup_shutdown_watcher();
+	return NOTIFY_DONE;
+}
+
+/*
+ * Register the xenstore notifier that installs the shutdown watches and
+ * the reboot notifier that tracks shutdowns in progress.
+ *
+ * Returns 0 on success or -ENODEV when not running on Xen.
+ */
+int xen_setup_shutdown_event(void)
+{
+	static struct notifier_block xenstore_notifier = {
+		.notifier_call = shutdown_event
+	};
+
+	if (!xen_domain())
+		return -ENODEV;
+	register_xenstore_notifier(&xenstore_notifier);
+	register_reboot_notifier(&xen_reboot_nb);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xen_setup_shutdown_event);
+
+subsys_initcall(xen_setup_shutdown_event);
diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c
new file mode 100644
index 000000000..b8bf61abb
--- /dev/null
+++ b/drivers/xen/mcelog.c
@@ -0,0 +1,417 @@
+/******************************************************************************
+ * mcelog.c
+ * Driver for receiving and transferring machine check error information
+ *
+ * Copyright (c) 2012 Intel Corporation
+ * Author: Liu, Jinsong <jinsong.liu@intel.com>
+ * Author: Jiang, Yunhong <yunhong.jiang@intel.com>
+ * Author: Ke, Liping <liping.ke@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "xen_mcelog: " fmt
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/uaccess.h>
+#include <linux/capability.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+
+#include <xen/interface/xen.h>
+#include <xen/events.h>
+#include <xen/interface/vcpu.h>
+#include <xen/xen.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+/* Buffer handed to the hypervisor by mc_queue_handle() for XEN_MC_fetch */
+static struct mc_info g_mi;
+/* Array of ncpus entries, allocated and filled in bind_virq_for_mce() */
+static struct mcinfo_logical_cpu *g_physinfo;
+/* Number of entries in g_physinfo, as returned by XEN_MC_physcpuinfo */
+static uint32_t ncpus;
+
+/* Held by the read path and by the work function while they touch xen_mcelog */
+static DEFINE_MUTEX(mcelog_lock);
+
+/* Log of converted records that is handed out through the mcelog chrdev */
+static struct xen_mce_log xen_mcelog = {
+	.signature	= XEN_MCE_LOG_SIGNATURE,
+	.len		= XEN_MCE_LOG_LEN,
+	.recordlen	= sizeof(struct xen_mce),
+};
+
+/* Protects xen_mce_chrdev_open_count and xen_mce_chrdev_open_exclu */
+static DEFINE_SPINLOCK(xen_mce_chrdev_state_lock);
+static int xen_mce_chrdev_open_count;	/* #times opened */
+static int xen_mce_chrdev_open_exclu;	/* already open exclusive? */
+
+/* Pollers of the chrdev wait here; woken from xen_mce_work_fn() */
+static DECLARE_WAIT_QUEUE_HEAD(xen_mce_chrdev_wait);
+
+/*
+ * Open handler. An open is refused with -EBUSY when the device is already
+ * held exclusively, or when O_EXCL is requested while it is already open.
+ * Otherwise the open count is bumped (and the exclusive marker set for
+ * O_EXCL) and the result of nonseekable_open() is returned.
+ */
+static int xen_mce_chrdev_open(struct inode *inode, struct file *file)
+{
+	bool want_excl = file->f_flags & O_EXCL;
+	bool busy;
+
+	spin_lock(&xen_mce_chrdev_state_lock);
+
+	busy = xen_mce_chrdev_open_exclu ||
+	       (xen_mce_chrdev_open_count && want_excl);
+	if (!busy) {
+		if (want_excl)
+			xen_mce_chrdev_open_exclu = 1;
+		xen_mce_chrdev_open_count++;
+	}
+
+	spin_unlock(&xen_mce_chrdev_state_lock);
+
+	if (busy)
+		return -EBUSY;
+
+	return nonseekable_open(inode, file);
+}
+
+/*
+ * Release handler: drops one open reference and clears the exclusive-open
+ * marker, both under xen_mce_chrdev_state_lock. Always returns 0.
+ */
+static int xen_mce_chrdev_release(struct inode *inode, struct file *file)
+{
+	spin_lock(&xen_mce_chrdev_state_lock);
+
+	xen_mce_chrdev_open_count--;
+	xen_mce_chrdev_open_exclu = 0;
+
+	spin_unlock(&xen_mce_chrdev_state_lock);
+
+	return 0;
+}
+
+/*
+ * Read handler: copies every logged record to user space and then empties
+ * the log, all under mcelog_lock.
+ *
+ * Only full reads are supported: *off must be 0 and @usize must be able
+ * to hold XEN_MCE_LOG_LEN records, otherwise -EINVAL is returned.
+ * Returns the number of bytes stepped over in @ubuf, or -EFAULT if any
+ * copy_to_user() failed (the log is cleared in that case too).
+ */
+static ssize_t xen_mce_chrdev_read(struct file *filp, char __user *ubuf,
+				size_t usize, loff_t *off)
+{
+	char __user *dst = ubuf;
+	unsigned count;
+	int idx, err;
+
+	mutex_lock(&mcelog_lock);
+
+	count = xen_mcelog.next;
+
+	/* Only supports full reads right now */
+	if (*off != 0 || usize < XEN_MCE_LOG_LEN*sizeof(struct xen_mce)) {
+		err = -EINVAL;
+		goto unlock;
+	}
+
+	err = 0;
+	for (idx = 0; idx < count; idx++, dst += sizeof(struct xen_mce)) {
+		struct xen_mce *rec = &xen_mcelog.entry[idx];
+
+		err |= copy_to_user(dst, rec, sizeof(*rec));
+	}
+
+	/* The records are gone from the log whether or not the copy worked. */
+	memset(xen_mcelog.entry, 0, count * sizeof(struct xen_mce));
+	xen_mcelog.next = 0;
+
+	if (err)
+		err = -EFAULT;
+
+unlock:
+	mutex_unlock(&mcelog_lock);
+
+	return err ? err : dst - ubuf;
+}
+
+/*
+ * Poll handler: registers on xen_mce_chrdev_wait and reports the device
+ * as readable whenever the log holds at least one record.
+ */
+static __poll_t xen_mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+	__poll_t mask = 0;
+
+	poll_wait(file, &xen_mce_chrdev_wait, wait);
+
+	if (xen_mcelog.next)
+		mask = EPOLLIN | EPOLLRDNORM;
+
+	return mask;
+}
+
+/*
+ * ioctl handler; requires CAP_SYS_ADMIN (-EPERM otherwise).
+ *
+ * MCE_GET_RECORD_LEN  - store sizeof(struct xen_mce) at @arg
+ * MCE_GET_LOG_LEN     - store XEN_MCE_LOG_LEN at @arg
+ * MCE_GETCLEAR_FLAGS  - store the log flags at @arg and reset them to 0
+ *
+ * Any other command yields -ENOTTY. Otherwise the put_user() result is
+ * returned.
+ */
+static long xen_mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+				unsigned long arg)
+{
+	int __user *uptr = (int __user *)arg;
+	unsigned old;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (cmd == MCE_GET_RECORD_LEN)
+		return put_user(sizeof(struct xen_mce), uptr);
+
+	if (cmd == MCE_GET_LOG_LEN)
+		return put_user(XEN_MCE_LOG_LEN, uptr);
+
+	if (cmd != MCE_GETCLEAR_FLAGS)
+		return -ENOTTY;
+
+	/* Retry until the value we read is the one we replaced with 0. */
+	for (;;) {
+		old = xen_mcelog.flags;
+		if (cmpxchg(&xen_mcelog.flags, old, 0) == old)
+			break;
+	}
+
+	return put_user(old, uptr);
+}
+
+/* File operations of the mcelog character device; seeking is not supported. */
+static const struct file_operations xen_mce_chrdev_ops = {
+	.open			= xen_mce_chrdev_open,
+	.release		= xen_mce_chrdev_release,
+	.read			= xen_mce_chrdev_read,
+	.poll			= xen_mce_chrdev_poll,
+	.unlocked_ioctl		= xen_mce_chrdev_ioctl,
+	.llseek			= no_llseek,
+};
+
+/*
+ * Misc device registered by xen_late_init_mcelog(). The initializers are
+ * positional: MISC_MCELOG_MINOR, the name "mcelog" and the file operations.
+ */
+static struct miscdevice xen_mce_chrdev_device = {
+	MISC_MCELOG_MINOR,
+	"mcelog",
+	&xen_mce_chrdev_ops,
+};
+
+/*
+ * Append one record to xen_mcelog.
+ *
+ * Caller should hold the mcelog_lock
+ */
+static void xen_mce_log(struct xen_mce *mce)
+{
+	unsigned slot = xen_mcelog.next;
+
+	if (slot < XEN_MCE_LOG_LEN) {
+		memcpy(&xen_mcelog.entry[slot], mce, sizeof(*mce));
+		xen_mcelog.next++;
+		return;
+	}
+
+	/*
+	 * The buffer is full: new entries are discarded, on the assumption
+	 * that the earlier errors are the more interesting ones. Only the
+	 * overflow flag records that something was dropped.
+	 */
+	set_bit(XEN_MCE_OVERFLOW, (unsigned long *)&xen_mcelog.flags);
+}
+
+/*
+ * Convert one mc_info blob fetched from the hypervisor into xen_mce
+ * records and append them to the log with xen_mce_log().
+ *
+ * The global entry supplies mcgstatus and the apicid; the apicid is then
+ * matched against g_physinfo[] to fill in the per-cpu fields. After that
+ * one record is logged for every bank entry found.
+ *
+ * Returns 0 on success, -ENODEV if the global entry, a matching cpu or
+ * the first bank entry cannot be found. Callers hold mcelog_lock (see
+ * xen_mce_work_fn()).
+ */
+static int convert_log(struct mc_info *mi)
+{
+	struct mcinfo_common *mic;
+	struct mcinfo_global *mc_global;
+	struct mcinfo_bank *mc_bank;
+	struct xen_mce m;
+	uint32_t i;
+
+	mic = NULL;
+	x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL);
+	if (unlikely(!mic)) {
+		pr_warn("Failed to find global error info\n");
+		return -ENODEV;
+	}
+
+	memset(&m, 0, sizeof(struct xen_mce));
+
+	mc_global = (struct mcinfo_global *)mic;
+	m.mcgstatus = mc_global->mc_gstatus;
+	m.apicid = mc_global->mc_apicid;
+
+	/* Find the physical cpu this event was reported for. */
+	for (i = 0; i < ncpus; i++)
+		if (g_physinfo[i].mc_apicid == m.apicid)
+			break;
+	if (unlikely(i == ncpus)) {
+		pr_warn("Failed to match cpu with apicid %d\n", m.apicid);
+		return -ENODEV;
+	}
+
+	m.socketid = g_physinfo[i].mc_chipid;
+	m.cpu = m.extcpu = g_physinfo[i].mc_cpunr;
+	m.cpuvendor = (__u8)g_physinfo[i].mc_vendor;
+	m.mcgcap = g_physinfo[i].mc_msrvalues[__MC_MSR_MCGCAP].value;
+
+	mic = NULL;
+	x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK);
+	if (unlikely(!mic)) {
+		pr_warn("Fail to find bank error info\n");
+		return -ENODEV;
+	}
+
+	/*
+	 * Walk the entries starting at the first bank entry. The walk ends
+	 * at a NULL entry, a zero-sized entry or an entry of unknown type.
+	 */
+	do {
+		if ((!mic) || (mic->size == 0) ||
+		    (mic->type != MC_TYPE_GLOBAL   &&
+		     mic->type != MC_TYPE_BANK     &&
+		     mic->type != MC_TYPE_EXTENDED &&
+		     mic->type != MC_TYPE_RECOVERY))
+			break;
+
+		if (mic->type == MC_TYPE_BANK) {
+			mc_bank = (struct mcinfo_bank *)mic;
+			m.misc = mc_bank->mc_misc;
+			m.status = mc_bank->mc_status;
+			m.addr = mc_bank->mc_addr;
+			m.tsc = mc_bank->mc_tsc;
+			m.bank = mc_bank->mc_bank;
+			m.finished = 1;
+			/*log this record*/
+			xen_mce_log(&m);
+		}
+		mic = x86_mcinfo_next(mic);
+	} while (1);
+
+	return 0;
+}
+
+/*
+ * Drain one hypervisor mc_info queue. @flags selects the queue; the
+ * callers pass XEN_MC_URGENT or XEN_MC_NONURGENT.
+ *
+ * Each iteration fetches one entry into g_mi, converts it with
+ * convert_log() and then acknowledges it with XEN_MC_ACK. The loop ends
+ * when the hypervisor reports XEN_MC_NODATA or XEN_MC_FETCHFAILED, or
+ * when a hypercall fails.
+ *
+ * Returns the result of the last hypercall; a convert_log() failure is
+ * only logged, since ret is overwritten by the following ack hypercall.
+ */
+static int mc_queue_handle(uint32_t flags)
+{
+	struct xen_mc mc_op;
+	int ret = 0;
+
+	mc_op.cmd = XEN_MC_fetch;
+	set_xen_guest_handle(mc_op.u.mc_fetch.data, &g_mi);
+	do {
+		/* Reset the flags: the previous round left reply/ack bits here. */
+		mc_op.u.mc_fetch.flags = flags;
+		ret = HYPERVISOR_mca(&mc_op);
+		if (ret) {
+			pr_err("Failed to fetch %surgent error log\n",
+			       flags == XEN_MC_URGENT ? "" : "non");
+			break;
+		}
+
+		if (mc_op.u.mc_fetch.flags & XEN_MC_NODATA ||
+		    mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED)
+			break;
+		else {
+			ret = convert_log(&g_mi);
+			if (ret)
+				pr_warn("Failed to convert this error log, continue acking it anyway\n");
+
+			mc_op.u.mc_fetch.flags = flags | XEN_MC_ACK;
+			ret = HYPERVISOR_mca(&mc_op);
+			if (ret) {
+				pr_err("Failed to ack previous error log\n");
+				break;
+			}
+		}
+	} while (1);
+
+	return ret;
+}
+
+/*
+ * Work function scheduled from the machine check virq handler.
+ *
+ * Under mcelog_lock it drains the urgent queue and then the nonurgent
+ * queue (a failure on the first does not stop the second), and finally
+ * wakes up anyone waiting on xen_mce_chrdev_wait.
+ */
+static void xen_mce_work_fn(struct work_struct *work)
+{
+	int err;
+
+	mutex_lock(&mcelog_lock);
+
+	/* urgent mc_info */
+	err = mc_queue_handle(XEN_MC_URGENT);
+	if (err)
+		pr_err("Failed to handle urgent mc_info queue, continue handling nonurgent mc_info queue anyway\n");
+
+	/* nonurgent mc_info */
+	err = mc_queue_handle(XEN_MC_NONURGENT);
+	if (err)
+		pr_err("Failed to handle nonurgent mc_info queue\n");
+
+	/* wake processes polling /dev/mcelog */
+	wake_up_interruptible(&xen_mce_chrdev_wait);
+
+	mutex_unlock(&mcelog_lock);
+}
+static DECLARE_WORK(xen_mce_work, xen_mce_work_fn);
+
+/*
+ * Interrupt handler bound to VIRQ_MCA: does nothing but schedule
+ * xen_mce_work, which performs the actual fetching.
+ */
+static irqreturn_t xen_mce_interrupt(int irq, void *dev_id)
+{
+	schedule_work(&xen_mce_work);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Collect the physical cpu information needed by convert_log() and bind
+ * the VIRQ_MCA handler.
+ *
+ * XEN_MC_physcpuinfo is issued twice: once before g_physinfo has been
+ * allocated, to learn ncpus, and once more with a buffer of that many
+ * entries.
+ *
+ * Returns 0 on success or a negative error; g_physinfo is freed again
+ * when anything fails after its allocation.
+ */
+static int bind_virq_for_mce(void)
+{
+	struct xen_mc mc_op;
+	int ret;
+
+	memset(&mc_op, 0, sizeof(mc_op));
+
+	/* Fetch physical CPU Numbers */
+	mc_op.cmd = XEN_MC_physcpuinfo;
+	set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
+	ret = HYPERVISOR_mca(&mc_op);
+	if (ret) {
+		pr_err("Failed to get CPU numbers\n");
+		return ret;
+	}
+
+	/* Fetch each CPU Physical Info for later reference */
+	ncpus = mc_op.u.mc_physcpuinfo.ncpus;
+	g_physinfo = kcalloc(ncpus, sizeof(*g_physinfo), GFP_KERNEL);
+	if (!g_physinfo)
+		return -ENOMEM;
+
+	set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
+	ret = HYPERVISOR_mca(&mc_op);
+	if (ret) {
+		pr_err("Failed to get CPU info\n");
+		goto free_physinfo;
+	}
+
+	ret = bind_virq_to_irqhandler(VIRQ_MCA, 0,
+				      xen_mce_interrupt, 0, "mce", NULL);
+	if (ret < 0) {
+		pr_err("Failed to bind virq\n");
+		goto free_physinfo;
+	}
+
+	return 0;
+
+free_physinfo:
+	kfree(g_physinfo);
+	return ret;
+}
+
+/*
+ * Initcall: in the initial domain, register the mcelog misc device and
+ * bind the MCE virq. The misc device is deregistered again if the virq
+ * setup fails.
+ *
+ * Returns 0 on success, -ENODEV outside the initial domain, or the error
+ * from misc_register() / bind_virq_for_mce().
+ */
+static int __init xen_late_init_mcelog(void)
+{
+	int ret;
+
+	/* Only DOM0 is responsible for MCE logging */
+	if (!xen_initial_domain())
+		return -ENODEV;
+
+	/* register character device /dev/mcelog for xen mcelog */
+	ret = misc_register(&xen_mce_chrdev_device);
+	if (ret)
+		return ret;
+
+	ret = bind_virq_for_mce();
+	if (ret) {
+		misc_deregister(&xen_mce_chrdev_device);
+		return ret;
+	}
+
+	pr_info("/dev/mcelog registered by Xen\n");
+
+	return 0;
+}
+device_initcall(xen_late_init_mcelog);
diff --git a/drivers/xen/mem-reservation.c b/drivers/xen/mem-reservation.c
new file mode 100644
index 000000000..3782cf070
--- /dev/null
+++ b/drivers/xen/mem-reservation.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/******************************************************************************
+ * Xen memory reservation utilities.
+ *
+ * Copyright (c) 2003, B Dragovic
+ * Copyright (c) 2003-2004, M Williamson, K Fraser
+ * Copyright (c) 2005 Dan M. Smith, IBM Corporation
+ * Copyright (c) 2010 Daniel Kiper
+ * Copyright (c) 2018 Oleksandr Andrushchenko, EPAM Systems Inc.
+ */
+
+#include <asm/xen/hypercall.h>
+
+#include <xen/interface/memory.h>
+#include <xen/mem-reservation.h>
+#include <linux/moduleparam.h>
+
+/*
+ * Defaults to the CONFIG_XEN_SCRUB_PAGES_DEFAULT setting and is exposed
+ * as the "xen_scrub_pages" core parameter.
+ */
+bool __read_mostly xen_scrub_pages = IS_ENABLED(CONFIG_XEN_SCRUB_PAGES_DEFAULT);
+core_param(xen_scrub_pages, xen_scrub_pages, bool, 0);
+
+/*
+ * Use one extent per PAGE_SIZE to avoid breaking the page down into
+ * multiple frames.
+ */
+#define EXTENT_ORDER (fls(XEN_PFN_PER_PAGE) - 1)
+
+#ifdef CONFIG_XEN_HAVE_PVMMU
+/*
+ * For each of the @count pages, record frames[i] as the machine frame
+ * behind the page's pfn and, unless the page is highmem, map the page's
+ * kernel virtual address onto that frame.
+ *
+ * @count:  number of entries in @pages and @frames
+ * @pages:  pages to update; a NULL entry is a BUG
+ * @frames: machine frames, one per page
+ *
+ * A failing HYPERVISOR_update_va_mapping() is a BUG.
+ */
+void __xenmem_reservation_va_mapping_update(unsigned long count,
+					    struct page **pages,
+					    xen_pfn_t *frames)
+{
+	/* Same type as @count, so the comparison below is never mixed-sign. */
+	unsigned long i;
+
+	/*
+	 * We don't support PV MMU when Linux and Xen are using
+	 * different page granularity.
+	 */
+	BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE);
+
+	for (i = 0; i < count; i++) {
+		struct page *page = pages[i];
+		unsigned long pfn;
+
+		/* Validate the page before anything is derived from it. */
+		BUG_ON(!page);
+		pfn = page_to_pfn(page);
+
+		set_phys_to_machine(pfn, frames[i]);
+
+		/* Link back into the page tables if not highmem. */
+		if (!PageHighMem(page)) {
+			int ret;
+
+			ret = HYPERVISOR_update_va_mapping(
+					(unsigned long)__va(pfn << PAGE_SHIFT),
+					mfn_pte(frames[i], PAGE_KERNEL),
+					0);
+			BUG_ON(ret);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(__xenmem_reservation_va_mapping_update);
+
+/*
+ * For each of the @count pages, clear the kernel virtual mapping (unless
+ * the page is highmem) and then mark the page's pfn as having no machine
+ * frame (INVALID_P2M_ENTRY).
+ *
+ * @count: number of entries in @pages
+ * @pages: pages to reset
+ *
+ * A failing HYPERVISOR_update_va_mapping() is a BUG.
+ */
+void __xenmem_reservation_va_mapping_reset(unsigned long count,
+					   struct page **pages)
+{
+	/* Same type as @count, so the comparison below is never mixed-sign. */
+	unsigned long i;
+
+	/*
+	 * We don't support PV MMU when Linux and Xen are using
+	 * different page granularity.
+	 */
+	BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE);
+
+	for (i = 0; i < count; i++) {
+		struct page *page = pages[i];
+		unsigned long pfn = page_to_pfn(page);
+
+		if (!PageHighMem(page)) {
+			int ret;
+
+			ret = HYPERVISOR_update_va_mapping(
+					(unsigned long)__va(pfn << PAGE_SHIFT),
+					__pte_ma(0), 0);
+			BUG_ON(ret);
+		}
+		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+	}
+}
+EXPORT_SYMBOL_GPL(__xenmem_reservation_va_mapping_reset);
+#endif /* CONFIG_XEN_HAVE_PVMMU */
+
+/*
+ * Issue XENMEM_populate_physmap for @count extents of EXTENT_ORDER on
+ * behalf of DOMID_SELF and return the hypercall result.
+ *
+ * @frames is an array of PFNs
+ */
+int xenmem_reservation_increase(int count, xen_pfn_t *frames)
+{
+	struct xen_memory_reservation reservation = {
+		.nr_extents   = count,
+		.extent_order = EXTENT_ORDER,
+		.address_bits = 0,
+		.domid        = DOMID_SELF
+	};
+
+	/* XENMEM_populate_physmap requires a PFN based on Xen granularity. */
+	set_xen_guest_handle(reservation.extent_start, frames);
+
+	return HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+}
+EXPORT_SYMBOL_GPL(xenmem_reservation_increase);
+
+/*
+ * Issue XENMEM_decrease_reservation for @count extents of EXTENT_ORDER
+ * on behalf of DOMID_SELF and return the hypercall result.
+ *
+ * @frames is an array of GFNs
+ */
+int xenmem_reservation_decrease(int count, xen_pfn_t *frames)
+{
+	struct xen_memory_reservation reservation = {
+		.nr_extents   = count,
+		.extent_order = EXTENT_ORDER,
+		.address_bits = 0,
+		.domid        = DOMID_SELF
+	};
+
+	/* XENMEM_decrease_reservation requires a GFN */
+	set_xen_guest_handle(reservation.extent_start, frames);
+
+	return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+}
+EXPORT_SYMBOL_GPL(xenmem_reservation_decrease);
diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
new file mode 100644
index 000000000..db58aaa4d
--- /dev/null
+++ b/drivers/xen/pci.c
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2009, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Weidong Han <weidong.han@intel.com>
+ */
+
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/pci-acpi.h>
+#include <xen/xen.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/xen.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include "../pci/pci.h"
+#ifdef CONFIG_PCI_MMCONFIG
+#include <asm/pci_x86.h>
+
+static int xen_mcfg_late(void);
+#endif
+
+/*
+ * Cleared by xen_add_device() once PHYSDEVOP_pci_device_add returns
+ * -ENOSYS; from then on the PHYSDEVOP_manage_pci_* operations are used.
+ */
+static bool __read_mostly pci_seg_supported = true;
+
+/*
+ * Report a newly added PCI device to the hypervisor.
+ *
+ * While pci_seg_supported is set, PHYSDEVOP_pci_device_add is used; it
+ * carries the segment, virtual/extended function flags and, with ACPI,
+ * the _PXM value when one can be evaluated. If that operation returns
+ * -ENOSYS the flag is cleared and the PHYSDEVOP_manage_pci_add[_ext]
+ * operations are used instead, which cannot describe devices in a
+ * non-zero PCI domain (-ENOSYS is returned for those).
+ *
+ * Returns the result of the hypercall that was issued.
+ */
+static int xen_add_device(struct device *dev)
+{
+	int r;
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+#ifdef CONFIG_PCI_IOV
+	struct pci_dev *physfn = pci_dev->physfn;
+#endif
+#ifdef CONFIG_PCI_MMCONFIG
+	static bool pci_mcfg_reserved = false;
+	/*
+	 * Reserve MCFG areas in Xen on first invocation due to this being
+	 * potentially called from inside of acpi_init immediately after
+	 * MCFG table has been finally parsed.
+	 */
+	if (!pci_mcfg_reserved) {
+		xen_mcfg_late();
+		pci_mcfg_reserved = true;
+	}
+#endif
+	if (pci_seg_supported) {
+		/* The pxm member provides the storage behind add->optarr[0]. */
+		struct {
+			struct physdev_pci_device_add add;
+			uint32_t pxm;
+		} add_ext = {
+			.add.seg = pci_domain_nr(pci_dev->bus),
+			.add.bus = pci_dev->bus->number,
+			.add.devfn = pci_dev->devfn
+		};
+		struct physdev_pci_device_add *add = &add_ext.add;
+
+#ifdef CONFIG_ACPI
+		acpi_handle handle;
+#endif
+
+#ifdef CONFIG_PCI_IOV
+		if (pci_dev->is_virtfn) {
+			add->flags = XEN_PCI_DEV_VIRTFN;
+			add->physfn.bus = physfn->bus->number;
+			add->physfn.devfn = physfn->devfn;
+		} else
+#endif
+		if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn))
+			add->flags = XEN_PCI_DEV_EXTFN;
+
+#ifdef CONFIG_ACPI
+		handle = ACPI_HANDLE(&pci_dev->dev);
+#ifdef CONFIG_PCI_IOV
+		if (!handle && pci_dev->is_virtfn)
+			handle = ACPI_HANDLE(physfn->bus->bridge);
+#endif
+		if (!handle) {
+			/*
+			 * This device was not listed in the ACPI name space at
+			 * all. Try to get acpi handle of parent pci bus.
+			 */
+			struct pci_bus *pbus;
+			for (pbus = pci_dev->bus; pbus; pbus = pbus->parent) {
+				handle = acpi_pci_get_bridge_handle(pbus);
+				if (handle)
+					break;
+			}
+		}
+		if (handle) {
+			acpi_status status;
+
+			/*
+			 * Walk up the ACPI parents until a _PXM object can be
+			 * evaluated or there is no further parent.
+			 */
+			do {
+				unsigned long long pxm;
+
+				status = acpi_evaluate_integer(handle, "_PXM",
+							       NULL, &pxm);
+				if (ACPI_SUCCESS(status)) {
+					add->optarr[0] = pxm;
+					add->flags |= XEN_PCI_DEV_PXM;
+					break;
+				}
+				status = acpi_get_parent(handle, &handle);
+			} while (ACPI_SUCCESS(status));
+		}
+#endif /* CONFIG_ACPI */
+
+		r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, add);
+		if (r != -ENOSYS)
+			return r;
+		/* Not implemented by this hypervisor: use the older calls. */
+		pci_seg_supported = false;
+	}
+
+	if (pci_domain_nr(pci_dev->bus))
+		r = -ENOSYS;
+#ifdef CONFIG_PCI_IOV
+	else if (pci_dev->is_virtfn) {
+		struct physdev_manage_pci_ext manage_pci_ext = {
+			.bus		= pci_dev->bus->number,
+			.devfn		= pci_dev->devfn,
+			.is_virtfn 	= 1,
+			.physfn.bus	= physfn->bus->number,
+			.physfn.devfn	= physfn->devfn,
+		};
+
+		r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
+			&manage_pci_ext);
+	}
+#endif
+	else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {
+		struct physdev_manage_pci_ext manage_pci_ext = {
+			.bus		= pci_dev->bus->number,
+			.devfn		= pci_dev->devfn,
+			.is_extfn	= 1,
+		};
+
+		r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
+			&manage_pci_ext);
+	} else {
+		struct physdev_manage_pci manage_pci = {
+			.bus	= pci_dev->bus->number,
+			.devfn	= pci_dev->devfn,
+		};
+
+		r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add,
+			&manage_pci);
+	}
+
+	return r;
+}
+
+/*
+ * Report the removal of a PCI device to the hypervisor.
+ *
+ * PHYSDEVOP_pci_device_remove is used while pci_seg_supported is set.
+ * Otherwise PHYSDEVOP_manage_pci_remove is used, which only works for
+ * PCI domain 0; devices in any other domain yield -ENOSYS.
+ *
+ * Returns the hypercall result (or -ENOSYS as described above).
+ */
+static int xen_remove_device(struct device *dev)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+
+	if (pci_seg_supported) {
+		struct physdev_pci_device device = {
+			.seg = pci_domain_nr(pci_dev->bus),
+			.bus = pci_dev->bus->number,
+			.devfn = pci_dev->devfn
+		};
+
+		return HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_remove,
+					     &device);
+	}
+
+	if (!pci_domain_nr(pci_dev->bus)) {
+		struct physdev_manage_pci manage_pci = {
+			.bus = pci_dev->bus->number,
+			.devfn = pci_dev->devfn
+		};
+
+		return HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
+					     &manage_pci);
+	}
+
+	return -ENOSYS;
+}
+
+/*
+ * PCI bus notifier: forwards device additions and removals to the
+ * hypervisor. A failure is only logged; NOTIFY_OK is returned for both
+ * handled actions and NOTIFY_DONE for every other action.
+ */
+static int xen_pci_notifier(struct notifier_block *nb,
+			    unsigned long action, void *data)
+{
+	struct device *dev = data;
+	const char *what;
+	int r;
+
+	if (action == BUS_NOTIFY_ADD_DEVICE) {
+		r = xen_add_device(dev);
+		what = "add";
+	} else if (action == BUS_NOTIFY_DEL_DEVICE) {
+		r = xen_remove_device(dev);
+		what = "delete";
+	} else {
+		return NOTIFY_DONE;
+	}
+
+	if (r)
+		dev_err(dev, "Failed to %s - passthrough or MSI/MSI-X might fail!\n",
+			what);
+	return NOTIFY_OK;
+}
+
+/* Registered on pci_bus_type by register_xen_pci_notifier(). */
+static struct notifier_block device_nb = {
+	.notifier_call = xen_pci_notifier,
+};
+
+/*
+ * Initcall: in the initial domain, start listening for PCI bus events.
+ * Returns 0 in any other domain, otherwise the bus_register_notifier()
+ * result.
+ */
+static int __init register_xen_pci_notifier(void)
+{
+	if (xen_initial_domain())
+		return bus_register_notifier(&pci_bus_type, &device_nb);
+
+	return 0;
+}
+
+arch_initcall(register_xen_pci_notifier);
+
+#ifdef CONFIG_PCI_MMCONFIG
+/*
+ * Report every region on pci_mmcfg_list to the hypervisor as reserved
+ * (PHYSDEVOP_pci_mmcfg_reserved with XEN_PCI_MMCFG_RESERVED).
+ *
+ * Nothing is done outside the initial domain, when MMCONF probing is
+ * disabled, or when the list is empty. A hypercall failure other than
+ * -ENOSYS is logged and the remaining regions are still reported.
+ *
+ * Always returns 0.
+ */
+static int xen_mcfg_late(void)
+{
+	struct pci_mmcfg_region *cfg;
+	int rc;
+
+	if (!xen_initial_domain())
+		return 0;
+
+	if (!(pci_probe & PCI_PROBE_MMCONF))
+		return 0;
+
+	if (list_empty(&pci_mmcfg_list))
+		return 0;
+
+	list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+		struct physdev_pci_mmcfg_reserved r;
+
+		r.flags = XEN_PCI_MMCFG_RESERVED;
+		r.segment = cfg->segment;
+		r.start_bus = cfg->start_bus;
+		r.end_bus = cfg->end_bus;
+		r.address = cfg->address;
+
+		rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r);
+		/* -ENOSYS is treated like success and not reported. */
+		if (rc && rc != -ENOSYS)
+			pr_warn("Failed to report MMCONFIG reservation"
+				" state for %s to hypervisor"
+				" (%d)\n",
+				cfg->name, rc);
+	}
+	return 0;
+}
+#endif
diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c
new file mode 100644
index 000000000..cdc6daa7a
--- /dev/null
+++ b/drivers/xen/pcpu.c
@@ -0,0 +1,418 @@
+/******************************************************************************
+ * pcpu.c
+ * Manage physical cpus in dom0: get pcpu info and provide a sysfs interface
+ *
+ * Copyright (c) 2012 Intel Corporation
+ * Author: Liu, Jinsong <jinsong.liu@intel.com>
+ * Author: Jiang, Yunhong <yunhong.jiang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "xen_cpu: " fmt
+
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+#include <linux/stat.h>
+#include <linux/capability.h>
+
+#include <xen/xen.h>
+#include <xen/acpi.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+
+/*
+ * @list: entry in the xen_pcpus list
+ * @dev: embedded device, registered on xen_pcpu_subsys
+ * @cpu_id: Xen physical cpu logic number
+ * @flags: Xen physical cpu status flag
+ * - XEN_PCPU_FLAGS_ONLINE: cpu is online
+ * - XEN_PCPU_FLAGS_INVALID: cpu is not present
+ */
+struct pcpu {
+	struct list_head list;
+	struct device dev;
+	uint32_t cpu_id;
+	uint32_t flags;
+};
+
+/*
+ * Bus that the pcpu devices are registered on; set up with
+ * subsys_system_register() in xen_pcpu_init().
+ */
+static struct bus_type xen_pcpu_subsys = {
+	.name = "xen_cpu",
+	.dev_name = "xen_cpu",
+};
+
+/* Held by xen_sync_pcpus() while it walks and updates xen_pcpus */
+static DEFINE_MUTEX(xen_pcpu_lock);
+
+/* List of every struct pcpu that has been created */
+static LIST_HEAD(xen_pcpus);
+
+/*
+ * Issue XENPF_cpu_offline for physical cpu @cpu_id.
+ * Returns the result of the platform hypercall.
+ */
+static int xen_pcpu_down(uint32_t cpu_id)
+{
+	struct xen_platform_op op = {
+		.cmd			= XENPF_cpu_offline,
+		.interface_version	= XENPF_INTERFACE_VERSION,
+		.u.cpu_ol.cpuid		= cpu_id,
+	};
+
+	return HYPERVISOR_platform_op(&op);
+}
+
+/*
+ * Issue XENPF_cpu_online for physical cpu @cpu_id.
+ * Returns the result of the platform hypercall.
+ */
+static int xen_pcpu_up(uint32_t cpu_id)
+{
+	struct xen_platform_op op = {
+		.cmd			= XENPF_cpu_online,
+		.interface_version	= XENPF_INTERFACE_VERSION,
+		.u.cpu_ol.cpuid		= cpu_id,
+	};
+
+	return HYPERVISOR_platform_op(&op);
+}
+
+/*
+ * sysfs "online" read: prints "1\n" when XEN_PCPU_FLAGS_ONLINE is set in
+ * the cached flags of the pcpu, "0\n" otherwise.
+ */
+static ssize_t show_online(struct device *dev,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct pcpu *cpu = container_of(dev, struct pcpu, dev);
+
+	return sprintf(buf, "%u\n", !!(cpu->flags & XEN_PCPU_FLAGS_ONLINE));
+}
+
+/*
+ * sysfs "online" write: "0" asks Xen to offline the pcpu, "1" to online
+ * it. Requires CAP_SYS_ADMIN.
+ *
+ * Returns @count on success, -EPERM without the capability, -EINVAL for
+ * unparsable or other values, or the negative hypercall result.
+ */
+static ssize_t __ref store_online(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct pcpu *pcpu = container_of(dev, struct pcpu, dev);
+	unsigned long long val;
+	ssize_t ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (kstrtoull(buf, 0, &val) < 0)
+		return -EINVAL;
+
+	if (val == 0)
+		ret = xen_pcpu_down(pcpu->cpu_id);
+	else if (val == 1)
+		ret = xen_pcpu_up(pcpu->cpu_id);
+	else
+		ret = -EINVAL;
+
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+static DEVICE_ATTR(online, S_IRUGO | S_IWUSR, show_online, store_online);
+
+/* NULL-terminated attribute list of a pcpu device: just "online". */
+static struct attribute *pcpu_dev_attrs[] = {
+	&dev_attr_online.attr,
+	NULL
+};
+
+/*
+ * Attribute visibility callback: the attributes keep their own mode for
+ * every device except the one with id 0, for which they are hidden.
+ */
+static umode_t pcpu_dev_is_visible(struct kobject *kobj,
+				   struct attribute *attr, int idx)
+{
+	struct device *dev = kobj_to_dev(kobj);
+
+	/*
+	 * Xen never offlines cpu0 due to several restrictions and
+	 * assumptions, so no sysfs control is offered for it: one cannot
+	 * attempt to offline the BSP.
+	 */
+	if (!dev->id)
+		return 0;
+
+	return attr->mode;
+}
+
+/* The pcpu attributes, filtered through pcpu_dev_is_visible(). */
+static const struct attribute_group pcpu_dev_group = {
+	.attrs = pcpu_dev_attrs,
+	.is_visible = pcpu_dev_is_visible,
+};
+
+/* NULL-terminated group list assigned to dev->groups in register_pcpu(). */
+static const struct attribute_group *pcpu_dev_groups[] = {
+	&pcpu_dev_group,
+	NULL
+};
+
+/* Returns true when XEN_PCPU_FLAGS_ONLINE is set in @flags. */
+static bool xen_pcpu_online(uint32_t flags)
+{
+	return !!(flags & XEN_PCPU_FLAGS_ONLINE);
+}
+
+/*
+ * Bring the cached online flag of @pcpu in line with what Xen reported
+ * in @info, emitting KOBJ_ONLINE / KOBJ_OFFLINE when the state changed.
+ * Nothing happens when both already agree.
+ */
+static void pcpu_online_status(struct xenpf_pcpuinfo *info,
+			       struct pcpu *pcpu)
+{
+	bool online_now = xen_pcpu_online(info->flags);
+	bool online_before = xen_pcpu_online(pcpu->flags);
+
+	if (online_now == online_before)
+		return;
+
+	if (online_now) {
+		/* the pcpu is onlined */
+		pcpu->flags |= XEN_PCPU_FLAGS_ONLINE;
+		kobject_uevent(&pcpu->dev.kobj, KOBJ_ONLINE);
+	} else {
+		/* The pcpu is offlined */
+		pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE;
+		kobject_uevent(&pcpu->dev.kobj, KOBJ_OFFLINE);
+	}
+}
+
+/*
+ * Look up the pcpu with Xen cpu id @cpu_id on the xen_pcpus list.
+ * Returns NULL when there is none.
+ */
+static struct pcpu *get_pcpu(uint32_t cpu_id)
+{
+	struct pcpu *cur;
+
+	list_for_each_entry(cur, &xen_pcpus, list)
+		if (cur->cpu_id == cpu_id)
+			return cur;
+
+	return NULL;
+}
+
+/*
+ * Device release callback (installed as dev->release in register_pcpu()):
+ * unlinks the enclosing pcpu from its list and frees it.
+ */
+static void pcpu_release(struct device *dev)
+{
+	struct pcpu *pcpu = container_of(dev, struct pcpu, dev);
+
+	list_del(&pcpu->list);
+	kfree(pcpu);
+}
+
+/*
+ * Unregister the device of @pcpu; a NULL @pcpu is ignored. The pcpu is
+ * unlinked and freed implicitly, through the device release callback.
+ */
+static void unregister_and_remove_pcpu(struct pcpu *pcpu)
+{
+	if (pcpu)
+		device_unregister(&pcpu->dev);
+}
+
+/*
+ * Fill in the embedded struct device of @pcpu and register it on
+ * xen_pcpu_subsys.
+ *
+ * Returns 0 on success, -EINVAL if @pcpu is NULL, or the error from
+ * device_register(). When device_register() fails, the reference on the
+ * device is dropped with put_device(); that ends in pcpu_release(), which
+ * unlinks and frees @pcpu, so the caller must not touch @pcpu afterwards.
+ */
+static int register_pcpu(struct pcpu *pcpu)
+{
+	struct device *dev;
+	int err;
+
+	if (!pcpu)
+		return -EINVAL;
+
+	dev = &pcpu->dev;
+	dev->bus = &xen_pcpu_subsys;
+	dev->id = pcpu->cpu_id;
+	dev->release = pcpu_release;
+	dev->groups = pcpu_dev_groups;
+
+	err = device_register(dev);
+	if (err) {
+		/*
+		 * A device must never be freed directly once
+		 * device_register() has been called on it, even on failure.
+		 * Dropping the reference lets the driver core free what it
+		 * allocated (such as the device name) before it invokes
+		 * dev->release.
+		 */
+		put_device(dev);
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate a pcpu for the cpu described by @info, put it on xen_pcpus and
+ * register its device.
+ *
+ * Returns the new pcpu, or an ERR_PTR: -ENODEV when @info is flagged
+ * invalid, -ENOMEM on allocation failure, -ENOENT when registration
+ * fails (register_pcpu() has released the pcpu in that case).
+ */
+static struct pcpu *create_and_register_pcpu(struct xenpf_pcpuinfo *info)
+{
+	struct pcpu *pcpu;
+
+	if (info->flags & XEN_PCPU_FLAGS_INVALID)
+		return ERR_PTR(-ENODEV);
+
+	pcpu = kzalloc(sizeof(*pcpu), GFP_KERNEL);
+	if (!pcpu)
+		return ERR_PTR(-ENOMEM);
+
+	pcpu->cpu_id = info->xen_cpuid;
+	pcpu->flags = info->flags;
+	INIT_LIST_HEAD(&pcpu->list);
+
+	/* Need hold on xen_pcpu_lock before pcpu list manipulations */
+	list_add_tail(&pcpu->list, &xen_pcpus);
+
+	if (register_pcpu(pcpu)) {
+		pr_warn("Failed to register pcpu%u\n", info->xen_cpuid);
+		return ERR_PTR(-ENOENT);
+	}
+
+	return pcpu;
+}
+
+/*
+ * Ask Xen about physical cpu @cpu and update the local bookkeeping:
+ * a cpu flagged invalid loses its device, an unknown cpu gets one, and a
+ * known cpu has its online state refreshed. When @max_cpu is not NULL it
+ * receives the max_present value reported by Xen.
+ *
+ * Returns 0 on success, the hypercall error, or -ENODEV when a new pcpu
+ * could not be created.
+ *
+ * Caller should hold the xen_pcpu_lock
+ */
+static int sync_pcpu(uint32_t cpu, uint32_t *max_cpu)
+{
+	struct xen_platform_op op = {
+		.cmd                   = XENPF_get_cpuinfo,
+		.interface_version     = XENPF_INTERFACE_VERSION,
+		.u.pcpu_info.xen_cpuid = cpu,
+	};
+	struct xenpf_pcpuinfo *info = &op.u.pcpu_info;
+	struct pcpu *known;
+	int rc;
+
+	rc = HYPERVISOR_platform_op(&op);
+	if (rc)
+		return rc;
+
+	if (max_cpu)
+		*max_cpu = info->max_present;
+
+	known = get_pcpu(cpu);
+
+	/* Only cpus in the present map have a sysfs interface. */
+	if (info->flags & XEN_PCPU_FLAGS_INVALID) {
+		unregister_and_remove_pcpu(known);
+		return 0;
+	}
+
+	if (known) {
+		pcpu_online_status(info, known);
+		return 0;
+	}
+
+	if (IS_ERR_OR_NULL(create_and_register_pcpu(info)))
+		return -ENODEV;
+
+	return 0;
+}
+
+/*
+ * Sync dom0's pcpu information with xen hypervisor's.
+ *
+ * Walks the cpu ids from 0 up to the max_present value that Xen reports
+ * along the way. On the first error the walk stops and every pcpu on the
+ * list is unregistered. Returns 0 or that first error.
+ */
+static int xen_sync_pcpus(void)
+{
+	/* Boot cpu always have cpu_id 0 in xen */
+	uint32_t cpu, max_cpu = 0;
+	struct pcpu *cur, *next;
+	int err = 0;
+
+	mutex_lock(&xen_pcpu_lock);
+
+	for (cpu = 0; !err && cpu <= max_cpu; cpu++)
+		err = sync_pcpu(cpu, &max_cpu);
+
+	if (err) {
+		list_for_each_entry_safe(cur, next, &xen_pcpus, list)
+			unregister_and_remove_pcpu(cur);
+	}
+
+	mutex_unlock(&xen_pcpu_lock);
+
+	return err;
+}
+
+/* Work function behind xen_pcpu_work: resyncs the pcpu list; the result is ignored. */
+static void xen_pcpu_work_fn(struct work_struct *work)
+{
+	xen_sync_pcpus();
+}
+static DECLARE_WORK(xen_pcpu_work, xen_pcpu_work_fn);
+
+/*
+ * Interrupt handler bound to VIRQ_PCPU_STATE: only schedules
+ * xen_pcpu_work, which performs the resync.
+ */
+static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id)
+{
+	schedule_work(&xen_pcpu_work);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Sync with Xen hypervisor after cpu hotadded.
+ * The sync runs asynchronously: this only schedules xen_pcpu_work.
+ */
+void xen_pcpu_hotplug_sync(void)
+{
+	schedule_work(&xen_pcpu_work);
+}
+EXPORT_SYMBOL_GPL(xen_pcpu_hotplug_sync);
+
+/*
+ * Translate an ACPI id into Xen's logic cpu id by querying
+ * XENPF_get_cpuinfo for each cpu id in turn. The upper bound of the scan
+ * grows with the max_present values Xen reports; ids whose query fails
+ * are skipped.
+ *
+ * For hypervisor presented cpu, return logic cpu id;
+ * For hypervisor non-presented cpu, return -ENODEV.
+ */
+int xen_pcpu_id(uint32_t acpi_id)
+{
+	struct xen_platform_op op;
+	int cpu_id, max_id = 0;
+
+	op.cmd = XENPF_get_cpuinfo;
+	for (cpu_id = 0; cpu_id <= max_id; cpu_id++) {
+		op.u.pcpu_info.xen_cpuid = cpu_id;
+		if (HYPERVISOR_platform_op(&op))
+			continue;
+
+		if (acpi_id == op.u.pcpu_info.acpi_id)
+			return cpu_id;
+
+		if (op.u.pcpu_info.max_present > max_id)
+			max_id = op.u.pcpu_info.max_present;
+	}
+
+	return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(xen_pcpu_id);
+
+/*
+ * Initcall: in the initial domain, bind the VIRQ_PCPU_STATE handler,
+ * register the xen_cpu subsystem and run a first sync. Whatever has been
+ * set up is undone again when a later step fails.
+ *
+ * Returns 0 on success, -ENODEV outside the initial domain, or the error
+ * of the failing step.
+ */
+static int __init xen_pcpu_init(void)
+{
+	int irq, ret;
+
+	if (!xen_initial_domain())
+		return -ENODEV;
+
+	irq = bind_virq_to_irqhandler(VIRQ_PCPU_STATE, 0,
+				      xen_pcpu_interrupt, 0,
+				      "xen-pcpu", NULL);
+	if (irq < 0) {
+		pr_warn("Failed to bind pcpu virq\n");
+		return irq;
+	}
+
+	ret = subsys_system_register(&xen_pcpu_subsys, NULL);
+	if (ret) {
+		pr_warn("Failed to register pcpu subsys\n");
+		unbind_from_irqhandler(irq, NULL);
+		return ret;
+	}
+
+	ret = xen_sync_pcpus();
+	if (ret) {
+		pr_warn("Failed to sync pcpu info\n");
+		bus_unregister(&xen_pcpu_subsys);
+		unbind_from_irqhandler(irq, NULL);
+		return ret;
+	}
+
+	return 0;
+}
+arch_initcall(xen_pcpu_init);
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
new file mode 100644
index 000000000..4cec81466
--- /dev/null
+++ b/drivers/xen/platform-pci.c
@@ -0,0 +1,192 @@
+/******************************************************************************
+ * platform-pci.c
+ *
+ * Xen platform PCI device driver
+ *
+ * Authors: ssmith@xensource.com and stefano.stabellini@eu.citrix.com
+ *
+ * Copyright (c) 2005, Intel Corporation.
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+
+#include <xen/platform_pci.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/hvm.h>
+#include <xen/xen-ops.h>
+
+#define DRV_NAME    "xen-platform-pci"
+
+static unsigned long platform_mmio;
+static unsigned long platform_mmio_alloc;
+static unsigned long platform_mmiolen;
+static uint64_t callback_via;
+
+/*
+ * Carve @len bytes out of the platform device's MMIO window.
+ *
+ * Simple bump allocation: the returned address is platform_mmio plus the
+ * amount handed out so far.  Running past platform_mmiolen is a BUG.
+ */
+static unsigned long alloc_xen_mmio(unsigned long len)
+{
+	unsigned long base = platform_mmio + platform_mmio_alloc;
+
+	platform_mmio_alloc += len;
+	BUG_ON(platform_mmio_alloc > platform_mmiolen);
+
+	return base;
+}
+
+/*
+ * Build the value handed to xen_set_callback_via() for @pdev.
+ *
+ * IRQ numbers below 16 are returned as-is (ISA IRQ).  Otherwise the PCI
+ * INTx line is encoded from the device's domain, bus, devfn and pin.
+ */
+static uint64_t get_callback_via(struct pci_dev *pdev)
+{
+	int irq = pdev->irq;
+	uint64_t via;
+	u8 pin;
+
+	if (irq < 16)
+		return irq; /* ISA IRQ */
+
+	pin = pdev->pin;
+
+	/* We don't know the GSI. Specify the PCI INTx line instead. */
+	via = (uint64_t)0x01 << HVM_CALLBACK_VIA_TYPE_SHIFT; /* PCI INTx identifier */
+	via |= (uint64_t)pci_domain_nr(pdev->bus) << 32;
+	via |= (uint64_t)pdev->bus->number << 16;
+	via |= (uint64_t)(pdev->devfn & 0xff) << 8;
+	via |= (uint64_t)(pin - 1) & 3;
+
+	return via;
+}
+
+/*
+ * Handler installed by xen_allocate_irq() on the platform device's IRQ:
+ * forwards the interrupt to xen_hvm_evtchn_do_upcall().
+ */
+static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id)
+{
+	xen_hvm_evtchn_do_upcall();
+	return IRQ_HANDLED;
+}
+
+/*
+ * Request @pdev's IRQ for event channel delivery, with @pdev as dev_id.
+ * Returns the request_irq() result (0 on success, negative errno on
+ * failure).
+ */
+static int xen_allocate_irq(struct pci_dev *pdev)
+{
+	return request_irq(pdev->irq, do_hvm_evtchn_intr,
+			IRQF_NOBALANCING | IRQF_TRIGGER_RISING,
+			"xen-platform-pci", pdev);
+}
+
+/*
+ * Early-resume hook: re-register the callback via that probe stored in
+ * callback_via.  Nothing to do when the vector callback is in use.
+ *
+ * Returns 0 on success or the error from xen_set_callback_via().
+ */
+static int platform_pci_resume(struct pci_dev *pdev)
+{
+	int rc;
+
+	if (xen_have_vector_callback)
+		return 0;
+
+	rc = xen_set_callback_via(callback_via);
+	if (rc)
+		dev_err(&pdev->dev, "platform_pci_resume failure!\n");
+
+	return rc;
+}
+
+/*
+ * Probe the Xen platform PCI device.
+ *
+ * Enables the device, claims BAR 0 (I/O) and BAR 1 (MMIO), records the
+ * MMIO window for alloc_xen_mmio(), and - when no vector callback is
+ * available - requests the device IRQ and registers it as the event
+ * channel callback.  Finally the grant table frames are placed in the
+ * MMIO window and the grant table is initialised.
+ *
+ * Returns 0 on success or a negative errno; on failure everything
+ * acquired so far is released, including the IRQ.
+ */
+static int platform_pci_probe(struct pci_dev *pdev,
+			      const struct pci_device_id *ent)
+{
+	int i, ret;
+	long ioaddr;
+	long mmio_addr, mmio_len;
+	unsigned int max_nr_gframes;
+	unsigned long grant_frames;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	i = pci_enable_device(pdev);
+	if (i)
+		return i;
+
+	ioaddr = pci_resource_start(pdev, 0);
+
+	mmio_addr = pci_resource_start(pdev, 1);
+	mmio_len = pci_resource_len(pdev, 1);
+
+	if (mmio_addr == 0 || ioaddr == 0) {
+		dev_err(&pdev->dev, "no resources found\n");
+		ret = -ENOENT;
+		goto pci_out;
+	}
+
+	ret = pci_request_region(pdev, 1, DRV_NAME);
+	if (ret < 0)
+		goto pci_out;
+
+	ret = pci_request_region(pdev, 0, DRV_NAME);
+	if (ret < 0)
+		goto mem_out;
+
+	platform_mmio = mmio_addr;
+	platform_mmiolen = mmio_len;
+	if (!xen_have_vector_callback) {
+		ret = xen_allocate_irq(pdev);
+		if (ret) {
+			dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret);
+			goto out;
+		}
+		callback_via = get_callback_via(pdev);
+		ret = xen_set_callback_via(callback_via);
+		if (ret) {
+			dev_warn(&pdev->dev, "Unable to set the evtchn callback "
+					 "err=%d\n", ret);
+			goto irq_out;
+		}
+	}
+
+	max_nr_gframes = gnttab_max_grant_frames();
+	grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
+	ret = gnttab_setup_auto_xlat_frames(grant_frames);
+	if (ret)
+		goto irq_out;
+	ret = gnttab_init();
+	if (ret)
+		goto grant_out;
+	return 0;
+grant_out:
+	gnttab_free_auto_xlat_frames();
+irq_out:
+	/* The IRQ was only requested when no vector callback is in use. */
+	if (!xen_have_vector_callback)
+		free_irq(pdev->irq, pdev);
+out:
+	pci_release_region(pdev, 0);
+mem_out:
+	pci_release_region(pdev, 1);
+pci_out:
+	pci_disable_device(pdev);
+	return ret;
+}
+
+/* Match the Xen platform device with any subsystem vendor/device id. */
+static const struct pci_device_id platform_pci_tbl[] = {
+	{PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM,
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{0,}
+};
+
+/*
+ * PCI driver description.  No .remove callback is provided; the resume
+ * hook is only wired up when CONFIG_PM is enabled.
+ */
+static struct pci_driver platform_driver = {
+	.name =           DRV_NAME,
+	.probe =          platform_pci_probe,
+	.id_table =       platform_pci_tbl,
+#ifdef CONFIG_PM
+	.resume_early =   platform_pci_resume,
+#endif
+};
+
+builtin_pci_driver(platform_driver);
diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c
new file mode 100644
index 000000000..128375ff8
--- /dev/null
+++ b/drivers/xen/preempt.c
@@ -0,0 +1,46 @@
+/*
+ * Preemptible hypercalls
+ *
+ * Copyright (C) 2014 Citrix Systems R&D ltd.
+ *
+ * This source code is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <xen/xen-ops.h>
+
+#ifndef CONFIG_PREEMPT
+
+/*
+ * Some hypercalls issued by the toolstack can take many 10s of
+ * seconds. Allow tasks running hypercalls via the privcmd driver to
+ * be voluntarily preempted even if full kernel preemption is
+ * disabled.
+ *
+ * Such preemptible hypercalls are bracketed by
+ * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
+ * calls.
+ */
+
+/* Per-cpu flag: true while this cpu is inside a preemptible hypercall. */
+DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
+EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
+
+/*
+ * Voluntarily reschedule if this cpu is flagged as being inside a
+ * preemptible hypercall, a reschedule is pending and preempt_count() is
+ * zero.  Interrupts are enabled around cond_resched() and disabled again
+ * afterwards.
+ *
+ * NOTE(review): presumably entered with interrupts disabled, since they
+ * are unconditionally disabled before returning - confirm at the caller.
+ */
+asmlinkage __visible void xen_maybe_preempt_hcall(void)
+{
+	if (unlikely(__this_cpu_read(xen_in_preemptible_hcall)
+		     && need_resched() && !preempt_count())) {
+		/*
+		 * Clear flag as we may be rescheduled on a different
+		 * cpu.
+		 */
+		__this_cpu_write(xen_in_preemptible_hcall, false);
+		local_irq_enable();
+		cond_resched();
+		local_irq_disable();
+		/* Set the flag again on whichever cpu we are running on now. */
+		__this_cpu_write(xen_in_preemptible_hcall, true);
+	}
+}
+#endif /* CONFIG_PREEMPT */
diff --git a/drivers/xen/privcmd-buf.c b/drivers/xen/privcmd-buf.c
new file mode 100644
index 000000000..de01a6d00
--- /dev/null
+++ b/drivers/xen/privcmd-buf.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+
+/******************************************************************************
+ * privcmd-buf.c
+ *
+ * Mmap of hypercall buffers.
+ *
+ * Copyright (c) 2018 Juergen Gross
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "privcmd.h"
+
+MODULE_LICENSE("GPL");
+
+/* Per-open-file state, allocated in privcmd_buf_open(). */
+struct privcmd_buf_private {
+	struct mutex lock;	/* protects @list and the per-mapping user counts */
+	struct list_head list;	/* privcmd_buf_vma_private entries of this file */
+};
+
+/* Per-mapping state, allocated in privcmd_buf_mmap(). */
+struct privcmd_buf_vma_private {
+	struct privcmd_buf_private *file_priv;	/* owning file state */
+	struct list_head list;			/* link in file_priv->list */
+	unsigned int users;			/* VMAs sharing this mapping */
+	unsigned int n_pages;			/* valid entries in @pages */
+	struct page *pages[];			/* backing pages */
+};
+
+/*
+ * open(): allocate the per-file state with an initialised lock and an
+ * empty mapping list.  Returns 0, or -ENOMEM if the allocation fails.
+ */
+static int privcmd_buf_open(struct inode *ino, struct file *file)
+{
+	struct privcmd_buf_private *priv;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv == NULL)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&priv->list);
+	mutex_init(&priv->lock);
+	file->private_data = priv;
+
+	return 0;
+}
+
+/*
+ * Unlink @vma_priv from its file's list, release its n_pages backing
+ * pages and free the structure itself.
+ */
+static void privcmd_buf_vmapriv_free(struct privcmd_buf_vma_private *vma_priv)
+{
+	unsigned int idx = 0;
+
+	list_del(&vma_priv->list);
+
+	while (idx < vma_priv->n_pages) {
+		__free_page(vma_priv->pages[idx]);
+		idx++;
+	}
+
+	kfree(vma_priv);
+}
+
+/*
+ * release(): free every mapping still linked on the file's list, then the
+ * per-file state itself.  Always returns 0.
+ */
+static int privcmd_buf_release(struct inode *ino, struct file *file)
+{
+	struct privcmd_buf_private *file_priv = file->private_data;
+	struct privcmd_buf_vma_private *cur, *next;
+
+	mutex_lock(&file_priv->lock);
+
+	/* _safe variant: privcmd_buf_vmapriv_free() unlinks and frees @cur. */
+	list_for_each_entry_safe(cur, next, &file_priv->list, list)
+		privcmd_buf_vmapriv_free(cur);
+
+	mutex_unlock(&file_priv->lock);
+
+	kfree(file_priv);
+
+	return 0;
+}
+
+/*
+ * vm_ops->open: take another user reference on the mapping state, if the
+ * VMA has one attached.
+ */
+static void privcmd_buf_vma_open(struct vm_area_struct *vma)
+{
+	struct privcmd_buf_vma_private *priv = vma->vm_private_data;
+
+	if (priv) {
+		mutex_lock(&priv->file_priv->lock);
+		priv->users++;
+		mutex_unlock(&priv->file_priv->lock);
+	}
+}
+
+/*
+ * vm_ops->close: drop one user reference on the mapping state and free it
+ * when the last user goes away.  VMAs without private data are ignored.
+ */
+static void privcmd_buf_vma_close(struct vm_area_struct *vma)
+{
+	struct privcmd_buf_vma_private *priv = vma->vm_private_data;
+	struct privcmd_buf_private *owner;
+
+	if (!priv)
+		return;
+
+	/* Remember the owner: @priv may be freed before the unlock below. */
+	owner = priv->file_priv;
+
+	mutex_lock(&owner->lock);
+
+	if (--priv->users == 0)
+		privcmd_buf_vmapriv_free(priv);
+
+	mutex_unlock(&owner->lock);
+}
+
+/*
+ * vm_ops->fault: log the faulting VMA/address at debug level and fail the
+ * access with SIGBUS.  No page is ever supplied from here;
+ * privcmd_buf_mmap() inserts the pages itself.
+ */
+static vm_fault_t privcmd_buf_vma_fault(struct vm_fault *vmf)
+{
+	pr_debug("fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
+		 vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end,
+		 vmf->pgoff, (void *)vmf->address);
+
+	return VM_FAULT_SIGBUS;
+}
+
+/* VMA callbacks installed on every hypercall-buffer mapping. */
+static const struct vm_operations_struct privcmd_buf_vm_ops = {
+	.open = privcmd_buf_vma_open,
+	.close = privcmd_buf_vma_close,
+	.fault = privcmd_buf_vma_fault,
+};
+
+/*
+ * mmap(): back the whole VMA with freshly allocated, zeroed kernel pages.
+ *
+ * Only shared mappings are accepted (-EINVAL otherwise).  One page per
+ * VMA page is allocated and inserted with vm_insert_page().  If not all
+ * pages could be allocated (-ENOMEM) or an insertion fails, the mapping
+ * state and all pages are freed again and the error is returned.
+ */
+static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct privcmd_buf_private *file_priv = file->private_data;
+	struct privcmd_buf_vma_private *vma_priv;
+	unsigned long count = vma_pages(vma);
+	unsigned int i;
+	int ret = 0;
+
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	vma_priv = kzalloc(sizeof(*vma_priv) + count * sizeof(void *),
+			   GFP_KERNEL);
+	if (!vma_priv)
+		return -ENOMEM;
+
+	/* Stop at the first failed allocation; n_pages counts the successes. */
+	for (i = 0; i < count; i++) {
+		vma_priv->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!vma_priv->pages[i])
+			break;
+		vma_priv->n_pages++;
+	}
+
+	mutex_lock(&file_priv->lock);
+
+	vma_priv->file_priv = file_priv;
+	vma_priv->users = 1;
+
+	vma->vm_flags |= VM_IO | VM_DONTEXPAND;
+	vma->vm_ops = &privcmd_buf_vm_ops;
+	vma->vm_private_data = vma_priv;
+
+	/*
+	 * Link before the error check: privcmd_buf_vmapriv_free() below
+	 * unconditionally does a list_del().
+	 */
+	list_add(&vma_priv->list, &file_priv->list);
+
+	if (vma_priv->n_pages != count)
+		ret = -ENOMEM;
+	else
+		for (i = 0; i < vma_priv->n_pages; i++) {
+			ret = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE,
+					     vma_priv->pages[i]);
+			if (ret)
+				break;
+		}
+
+	/*
+	 * NOTE(review): on failure vma->vm_private_data still holds the
+	 * freed pointer; this relies on vm_ops->close not being invoked for
+	 * a failed mmap - confirm against the mm core.
+	 */
+	if (ret)
+		privcmd_buf_vmapriv_free(vma_priv);
+
+	mutex_unlock(&file_priv->lock);
+
+	return ret;
+}
+
+/* File operations of the hypercall buffer device: open, release, mmap. */
+const struct file_operations xen_privcmdbuf_fops = {
+	.owner = THIS_MODULE,
+	.open = privcmd_buf_open,
+	.release = privcmd_buf_release,
+	.mmap = privcmd_buf_mmap,
+};
+EXPORT_SYMBOL_GPL(xen_privcmdbuf_fops);
+
+/* Misc device "xen/hypercall" with a dynamically assigned minor. */
+struct miscdevice xen_privcmdbuf_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "xen/hypercall",
+	.fops = &xen_privcmdbuf_fops,
+};
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
new file mode 100644
index 000000000..74ff28fda
--- /dev/null
+++ b/drivers/xen/privcmd.c
@@ -0,0 +1,1042 @@
+/******************************************************************************
+ * privcmd.c
+ *
+ * Interface to privileged domain-0 commands.
+ *
+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/uaccess.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/miscdevice.h>
+#include <linux/moduleparam.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/privcmd.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/memory.h>
+#include <xen/interface/hvm/dm_op.h>
+#include <xen/features.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+#include <xen/balloon.h>
+
+#include "privcmd.h"
+
+MODULE_LICENSE("GPL");
+
+#define PRIV_VMA_LOCKED ((void *)1)
+
+static unsigned int privcmd_dm_op_max_num = 16;
+module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644);
+MODULE_PARM_DESC(dm_op_max_nr_bufs,
+		 "Maximum number of buffers per dm_op hypercall");
+
+static unsigned int privcmd_dm_op_buf_max_size = 4096;
+module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint,
+		   0644);
+MODULE_PARM_DESC(dm_op_buf_max_size,
+		 "Maximum size of a dm_op hypercall buffer");
+
+/* Per-open-file state of the privcmd device. */
+struct privcmd_data {
+	/* Domain this file is restricted to; DOMID_INVALID means unrestricted. */
+	domid_t domid;
+};
+
+static int privcmd_vma_range_is_mapped(
+               struct vm_area_struct *vma,
+               unsigned long addr,
+               unsigned long nr_pages);
+
+/*
+ * IOCTL_PRIVCMD_HYPERCALL: issue the hypercall described by the user's
+ * struct privcmd_hypercall and return its result.
+ *
+ * Returns -EPERM if the file has been restricted to a domain and -EFAULT
+ * if the request cannot be copied from user space.
+ */
+static long privcmd_ioctl_hypercall(struct file *file, void __user *udata)
+{
+	struct privcmd_data *priv = file->private_data;
+	struct privcmd_hypercall hc;
+	long result;
+
+	/* Disallow arbitrary hypercalls if restricted */
+	if (priv->domid != DOMID_INVALID)
+		return -EPERM;
+
+	if (copy_from_user(&hc, udata, sizeof(hc)))
+		return -EFAULT;
+
+	xen_preemptible_hcall_begin();
+	result = privcmd_call(hc.op,
+			      hc.arg[0], hc.arg[1],
+			      hc.arg[2], hc.arg[3],
+			      hc.arg[4]);
+	xen_preemptible_hcall_end();
+
+	return result;
+}
+
+/*
+ * Free every page linked on @pages through its lru member and leave
+ * @pages as an empty, reusable list head.
+ */
+static void free_page_list(struct list_head *pages)
+{
+	struct page *cur, *next;
+
+	/* Entries are not unlinked one by one; the head is reset afterwards. */
+	list_for_each_entry_safe(cur, next, pages, lru) {
+		__free_page(cur);
+	}
+
+	INIT_LIST_HEAD(pages);
+}
+
+/*
+ * Copy an array of @nelem items of @size bytes each from user space into
+ * a list of freshly allocated pages appended to @pagelist.  Items never
+ * straddle a page boundary.
+ *
+ * Returns 0 on success (also when @size exceeds PAGE_SIZE, in which case
+ * nothing is copied), -ENOMEM if a page cannot be allocated, or -EFAULT
+ * if user memory cannot be read.  On error it's up to the caller to
+ * dispose of the partial list.
+ */
+static int gather_array(struct list_head *pagelist,
+			unsigned nelem, size_t size,
+			const void __user *data)
+{
+	void *kaddr = NULL;		/* quiet, gcc */
+	unsigned offset = PAGE_SIZE;	/* forces a page for the first item */
+
+	if (size > PAGE_SIZE)
+		return 0;
+
+	while (nelem--) {
+		/* Start a new page when the next item would not fit. */
+		if (offset > PAGE_SIZE-size) {
+			struct page *page = alloc_page(GFP_KERNEL);
+
+			if (page == NULL)
+				return -ENOMEM;
+
+			kaddr = page_address(page);
+
+			list_add_tail(&page->lru, pagelist);
+			offset = 0;
+		}
+
+		if (copy_from_user(kaddr + offset, data, size))
+			return -EFAULT;
+
+		data += size;
+		offset += size;
+	}
+
+	return 0;
+}
+
+/*
+ * Invoke @fn on each of the @nelem elements of @size bytes stored in the
+ * page list whose head is @pos (laid out as gather_array() does).
+ *
+ * Stops at the first non-zero return from @fn and returns that value;
+ * returns 0 if every call succeeded.
+ */
+static int traverse_pages(unsigned nelem, size_t size,
+			  struct list_head *pos,
+			  int (*fn)(void *data, void *state),
+			  void *state)
+{
+	void *kaddr = NULL;		/* hush, gcc */
+	unsigned offset = PAGE_SIZE;	/* forces a page step for the first item */
+
+	BUG_ON(size > PAGE_SIZE);
+
+	for (; nelem; nelem--) {
+		int rc;
+
+		/* Advance to the next page once the current one is used up. */
+		if (offset > PAGE_SIZE-size) {
+			pos = pos->next;
+			kaddr = page_address(list_entry(pos, struct page, lru));
+			offset = 0;
+		}
+
+		rc = (*fn)(kaddr + offset, state);
+		if (rc)
+			return rc;
+		offset += size;
+	}
+
+	return 0;
+}
+
+/*
+ * Like traverse_pages(), but hand @fn one whole page worth of elements
+ * at a time: @fn receives the page's data and the number of elements it
+ * holds (fewer for the final page).
+ *
+ * Stops at the first non-zero return from @fn and returns that value;
+ * returns 0 if every call succeeded.
+ */
+static int traverse_pages_block(unsigned nelem, size_t size,
+				struct list_head *pos,
+				int (*fn)(void *data, int nr, void *state),
+				void *state)
+{
+	BUG_ON(size > PAGE_SIZE);
+
+	while (nelem) {
+		struct page *page;
+		int chunk = (PAGE_SIZE/size);
+		int rc;
+
+		if (chunk > nelem)
+			chunk = nelem;
+
+		pos = pos->next;
+		page = list_entry(pos, struct page, lru);
+
+		rc = (*fn)(page_address(page), chunk, state);
+		if (rc)
+			return rc;
+
+		nelem -= chunk;
+	}
+
+	return 0;
+}
+
+/* Cursor passed to mmap_gfn_range() while processing IOCTL_PRIVCMD_MMAP. */
+struct mmap_gfn_state {
+	unsigned long va;		/* next expected virtual address */
+	struct vm_area_struct *vma;	/* VMA being populated */
+	domid_t domain;			/* domain owning the frames */
+};
+
+/*
+ * traverse_pages() callback for IOCTL_PRIVCMD_MMAP: map one
+ * privcmd_mmap_entry (@data) into the VMA described by @state.
+ *
+ * Entries must follow each other without gaps in virtual address space
+ * and must stay inside the VMA.  Returns 0 on success, -EINVAL for a
+ * malformed entry, or the error from xen_remap_domain_gfn_range().
+ */
+static int mmap_gfn_range(void *data, void *state)
+{
+	struct privcmd_mmap_entry *msg = data;
+	struct mmap_gfn_state *st = state;
+	struct vm_area_struct *vma = st->vma;
+	int rc;
+
+	/* Do not allow range to wrap the address space. */
+	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
+	    ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
+		return -EINVAL;
+
+	/* Range chunks must be contiguous in va space. */
+	if ((msg->va != st->va) ||
+	    ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
+		return -EINVAL;
+
+	rc = xen_remap_domain_gfn_range(vma,
+					msg->va & PAGE_MASK,
+					msg->mfn, msg->npages,
+					vma->vm_page_prot,
+					st->domain, NULL);
+	if (rc < 0)
+		return rc;
+
+	/* Advance the cursor so the next entry must start right here. */
+	st->va += msg->npages << PAGE_SHIFT;
+
+	return 0;
+}
+
+/*
+ * IOCTL_PRIVCMD_MMAP: map the frame ranges listed in the user's
+ * struct privcmd_mmap into a VMA of the calling process.
+ *
+ * Not available for auto-translated guests (-ENOSYS).  The first entry
+ * must start exactly at the beginning of a VMA that has no private data
+ * yet; that VMA is then marked PRIV_VMA_LOCKED.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static long privcmd_ioctl_mmap(struct file *file, void __user *udata)
+{
+	struct privcmd_data *data = file->private_data;
+	struct privcmd_mmap mmapcmd;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	int rc;
+	LIST_HEAD(pagelist);
+	struct mmap_gfn_state state;
+
+	/* We only support privcmd_ioctl_mmap_batch for auto translated. */
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return -ENOSYS;
+
+	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
+		return -EFAULT;
+
+	/* If restriction is in place, check the domid matches */
+	if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom)
+		return -EPERM;
+
+	/* Pull the whole entry array into kernel pages first. */
+	rc = gather_array(&pagelist,
+			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+			  mmapcmd.entry);
+
+	/* An empty list with rc == 0 means there is nothing to map. */
+	if (rc || list_empty(&pagelist))
+		goto out;
+
+	down_write(&mm->mmap_sem);
+
+	{
+		struct page *page = list_first_entry(&pagelist,
+						     struct page, lru);
+		struct privcmd_mmap_entry *msg = page_address(page);
+
+		vma = find_vma(mm, msg->va);
+		rc = -EINVAL;
+
+		/* A VMA that already has private data has been used before. */
+		if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data)
+			goto out_up;
+		vma->vm_private_data = PRIV_VMA_LOCKED;
+	}
+
+	state.va = vma->vm_start;
+	state.vma = vma;
+	state.domain = mmapcmd.dom;
+
+	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+			    &pagelist,
+			    mmap_gfn_range, &state);
+
+
+out_up:
+	up_write(&mm->mmap_sem);
+
+out:
+	free_page_list(&pagelist);
+
+	return rc;
+}
+
+/* State shared by both passes of privcmd_ioctl_mmap_batch(). */
+struct mmap_batch_state {
+	domid_t domain;			/* domain owning the frames */
+	unsigned long va;		/* next virtual address to map */
+	struct vm_area_struct *vma;	/* VMA being populated */
+	int index;			/* next slot in the VMA's page array */
+	/* A tristate:
+	 *      0 for no errors
+	 *      1 if at least one error has happened (and no
+	 *          -ENOENT errors have happened)
+	 *      -ENOENT if at least 1 -ENOENT has happened.
+	 */
+	int global_error;
+	int version;			/* ioctl flavour: 1 or 2 */
+
+	/* User-space gfn array to store errors in the second pass for V1. */
+	xen_pfn_t __user *user_gfn;
+	/* User-space int array to store errors in the second pass for V2. */
+	int __user *user_err;
+};
+
+/*
+ * traverse_pages_block() callback for the first pass of
+ * privcmd_ioctl_mmap_batch(): map one page worth (@nr entries) of gfns.
+ *
+ * The gfn buffer doubles as the per-frame error array, so after the call
+ * @data holds ints, not gfns.  Failures are only recorded in
+ * st->global_error; the function itself always returns 0.
+ *
+ * auto translated dom0 note: if domU being created is PV, then gfn is
+ * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP).
+ */
+static int mmap_batch_fn(void *data, int nr, void *state)
+{
+	xen_pfn_t *gfnp = data;
+	struct mmap_batch_state *st = state;
+	struct vm_area_struct *vma = st->vma;
+	struct page **pages = vma->vm_private_data;
+	struct page **cur_pages = NULL;
+	int ret;
+
+	/* Only auto-translated guests have a real page array in the VMA. */
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		cur_pages = &pages[st->index];
+
+	BUG_ON(nr < 0);
+	ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr,
+					 (int *)gfnp, st->vma->vm_page_prot,
+					 st->domain, cur_pages);
+
+	/* Adjust the global_error? */
+	if (ret != nr) {
+		if (ret == -ENOENT)
+			st->global_error = -ENOENT;
+		else {
+			/* Record that at least one error has happened. */
+			if (st->global_error == 0)
+				st->global_error = 1;
+		}
+	}
+	/* Move the cursor on even after errors so later blocks line up. */
+	st->va += XEN_PAGE_SIZE * nr;
+	st->index += nr / XEN_PFN_PER_PAGE;
+
+	return 0;
+}
+
+/*
+ * Report the status @err of one frame back to user space and advance the
+ * matching user pointer in @st.
+ *
+ * V1 ORs an error marker into the user's gfn entry, V2 stores the errno
+ * in the user's error array.  Nothing is written for a frame without
+ * error.  Returns 0 or the error from accessing user memory.
+ */
+static int mmap_return_error(int err, struct mmap_batch_state *st)
+{
+	int ret;
+
+	if (st->version == 1) {
+		xen_pfn_t gfn;
+
+		if (!err) {
+			st->user_gfn++;
+			return 0;
+		}
+
+		ret = get_user(gfn, st->user_gfn);
+		if (ret < 0)
+			return ret;
+		/*
+		 * V1 encodes the error codes in the 32bit top
+		 * nibble of the gfn (with its known
+		 * limitations vis-a-vis 64 bit callers).
+		 */
+		gfn |= (err == -ENOENT) ?
+			PRIVCMD_MMAPBATCH_PAGED_ERROR :
+			PRIVCMD_MMAPBATCH_MFN_ERROR;
+		return __put_user(gfn, st->user_gfn++);
+	}
+
+	/* st->version == 2 */
+	if (!err) {
+		st->user_err++;
+		return 0;
+	}
+
+	return __put_user(err, st->user_err++);
+}
+
+/*
+ * traverse_pages_block() callback for the second pass of
+ * privcmd_ioctl_mmap_batch(): write back the @nr per-frame error codes
+ * stored in @data.  Stops at and returns the first negative result of
+ * mmap_return_error(); returns 0 otherwise.
+ */
+static int mmap_return_errors(void *data, int nr, void *state)
+{
+	struct mmap_batch_state *st = state;
+	int *err = data;
+	int *end = err + nr;
+
+	for (; err < end; err++) {
+		int rc = mmap_return_error(*err, st);
+
+		if (rc < 0)
+			return rc;
+	}
+	return 0;
+}
+
+/*
+ * Allocate @numpgs ballooned pages whose pfns are later mapped with gfns
+ * from a foreign domain, and attach the page array to @vma as its
+ * private data.  The VMA must not have private data yet.
+ *
+ * Returns: 0 on success, -ENOMEM otherwise.
+ */
+static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
+{
+	struct page **page_array;
+	int err;
+
+	page_array = kcalloc(numpgs, sizeof(page_array[0]), GFP_KERNEL);
+	if (!page_array)
+		return -ENOMEM;
+
+	err = alloc_xenballooned_pages(numpgs, page_array);
+	if (err) {
+		pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__,
+			numpgs, err);
+		kfree(page_array);
+		return -ENOMEM;
+	}
+
+	BUG_ON(vma->vm_private_data != NULL);
+	vma->vm_private_data = page_array;
+
+	return 0;
+}
+
+static const struct vm_operations_struct privcmd_vm_ops;
+
+/*
+ * IOCTL_PRIVCMD_MMAPBATCH (@version 1) and IOCTL_PRIVCMD_MMAPBATCH_V2
+ * (@version 2): map an array of foreign frames into a privcmd VMA.
+ *
+ * The work is done in two passes over the gathered gfn array: the first
+ * maps the frames under mmap_sem, overwriting each gfn with its status;
+ * the second, only run if something failed, copies the per-frame status
+ * back to user space (into m.arr for V1, m.err for V2).
+ *
+ * Returns 0 on success, -ENOENT if at least one frame failed with
+ * -ENOENT and writing back the errors succeeded, or another negative
+ * errno.
+ */
+static long privcmd_ioctl_mmap_batch(
+	struct file *file, void __user *udata, int version)
+{
+	struct privcmd_data *data = file->private_data;
+	int ret;
+	struct privcmd_mmapbatch_v2 m;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long nr_pages;
+	LIST_HEAD(pagelist);
+	struct mmap_batch_state state;
+
+	switch (version) {
+	case 1:
+		/* Only the V1-sized prefix of @m is filled in from user space. */
+		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
+			return -EFAULT;
+		/* Returns per-frame error in m.arr. */
+		m.err = NULL;
+		if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr)))
+			return -EFAULT;
+		break;
+	case 2:
+		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2)))
+			return -EFAULT;
+		/* Returns per-frame error code in m.err. */
+		if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err))))
+			return -EFAULT;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* If restriction is in place, check the domid matches */
+	if (data->domid != DOMID_INVALID && data->domid != m.dom)
+		return -EPERM;
+
+	nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE);
+	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
+		return -EINVAL;
+
+	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr);
+
+	if (ret)
+		goto out;
+	if (list_empty(&pagelist)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (version == 2) {
+		/* Zero error array now to only copy back actual errors. */
+		if (clear_user(m.err, sizeof(int) * m.num)) {
+			ret = -EFAULT;
+			goto out;
+		}
+	}
+
+	down_write(&mm->mmap_sem);
+
+	/* Only VMAs created by privcmd_mmap() are acceptable targets. */
+	vma = find_vma(mm, m.addr);
+	if (!vma ||
+	    vma->vm_ops != &privcmd_vm_ops) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/*
+	 * Caller must either:
+	 *
+	 * Map the whole VMA range, which will also allocate all the
+	 * pages required for the auto_translated_physmap case.
+	 *
+	 * Or
+	 *
+	 * Map unmapped holes left from a previous map attempt (e.g.,
+	 * because those foreign frames were previously paged out).
+	 */
+	if (vma->vm_private_data == NULL) {
+		if (m.addr != vma->vm_start ||
+		    m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+		if (xen_feature(XENFEAT_auto_translated_physmap)) {
+			ret = alloc_empty_pages(vma, nr_pages);
+			if (ret < 0)
+				goto out_unlock;
+		} else
+			vma->vm_private_data = PRIV_VMA_LOCKED;
+	} else {
+		if (m.addr < vma->vm_start ||
+		    m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+		/* Refuse to map over ptes that are already populated. */
+		if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+	}
+
+	state.domain        = m.dom;
+	state.vma           = vma;
+	state.va            = m.addr;
+	state.index         = 0;
+	state.global_error  = 0;
+	state.version       = version;
+
+	BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0);
+	/* mmap_batch_fn guarantees ret == 0 */
+	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
+				    &pagelist, mmap_batch_fn, &state));
+
+	up_write(&mm->mmap_sem);
+
+	if (state.global_error) {
+		/* Write back errors in second pass. */
+		state.user_gfn = (xen_pfn_t *)m.arr;
+		state.user_err = m.err;
+		ret = traverse_pages_block(m.num, sizeof(xen_pfn_t),
+					   &pagelist, mmap_return_errors, &state);
+	} else
+		ret = 0;
+
+	/* If we have not had any EFAULT-like global errors then set the global
+	 * error to -ENOENT if necessary. */
+	if ((ret == 0) && (state.global_error == -ENOENT))
+		ret = -ENOENT;
+
+out:
+	free_page_list(&pagelist);
+	return ret;
+
+out_unlock:
+	up_write(&mm->mmap_sem);
+	goto out;
+}
+
+/*
+ * Pin, for writing, the user pages backing each of the @num buffers in
+ * @kbufs and store them consecutively in @pages (capacity @nr_pages).
+ *
+ * get_user_pages_fast() may pin fewer pages than requested; in that case
+ * the remainder of the buffer is retried, so that on success every
+ * buffer is pinned completely.
+ *
+ * Returns 0 on success, -ENOSPC if @pages is too small, -EFAULT if no
+ * further page of a buffer can be pinned, or the error returned by
+ * get_user_pages_fast().  On failure the pages pinned so far remain in
+ * @pages; the caller releases them with unlock_pages().
+ */
+static int lock_pages(
+	struct privcmd_dm_op_buf kbufs[], unsigned int num,
+	struct page *pages[], unsigned int nr_pages)
+{
+	unsigned int i;
+
+	for (i = 0; i < num; i++) {
+		unsigned long start = (unsigned long) kbufs[i].uptr;
+		unsigned int requested;
+
+		requested = DIV_ROUND_UP(
+			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
+			PAGE_SIZE);
+		if (requested > nr_pages)
+			return -ENOSPC;
+
+		while (requested) {
+			int pinned;
+
+			pinned = get_user_pages_fast(start, requested,
+						     FOLL_WRITE, pages);
+			if (pinned < 0)
+				return pinned;
+			/* No progress at all: give up instead of spinning. */
+			if (pinned == 0)
+				return -EFAULT;
+
+			nr_pages -= pinned;
+			pages += pinned;
+			requested -= pinned;
+			/* Continue with the first page not pinned yet. */
+			start = (start & PAGE_MASK) +
+				((unsigned long)pinned << PAGE_SHIFT);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Release the pages pinned by lock_pages().
+ *
+ * @pages may be NULL and may contain NULL slots (it is zero-allocated and
+ * only filled as far as pinning got).  The pages were pinned for writing
+ * and may have been modified by the hypervisor behind the kernel's back,
+ * so each one is marked dirty before its reference is dropped.
+ */
+static void unlock_pages(struct page *pages[], unsigned int nr_pages)
+{
+	unsigned int i;
+
+	if (!pages)
+		return;
+
+	for (i = 0; i < nr_pages; i++) {
+		if (!pages[i])
+			continue;
+
+		if (!PageDirty(pages[i]))
+			set_page_dirty_lock(pages[i]);
+		put_page(pages[i]);
+	}
+}
+
+/*
+ * IOCTL_PRIVCMD_DM_OP: issue a dm_op hypercall on behalf of user space.
+ *
+ * The user's buffer descriptors are copied in, checked against the
+ * dm_op_max_nr_bufs / dm_op_buf_max_size module parameters, and the pages
+ * backing them are pinned for the duration of the hypercall.
+ *
+ * Returns the hypercall result, 0 for an empty request, or a negative
+ * errno (-EFAULT, -EPERM, -E2BIG, -ENOMEM, or an error from lock_pages()).
+ */
+static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
+{
+	struct privcmd_data *data = file->private_data;
+	struct privcmd_dm_op kdata;
+	struct privcmd_dm_op_buf *kbufs;
+	unsigned int nr_pages = 0;
+	struct page **pages = NULL;
+	struct xen_dm_op_buf *xbufs = NULL;
+	unsigned int i;
+	long rc;
+
+	if (copy_from_user(&kdata, udata, sizeof(kdata)))
+		return -EFAULT;
+
+	/* If restriction is in place, check the domid matches */
+	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
+		return -EPERM;
+
+	if (kdata.num == 0)
+		return 0;
+
+	if (kdata.num > privcmd_dm_op_max_num)
+		return -E2BIG;
+
+	kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL);
+	if (!kbufs)
+		return -ENOMEM;
+
+	if (copy_from_user(kbufs, kdata.ubufs,
+			   sizeof(*kbufs) * kdata.num)) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	/* Validate each buffer and count the pages it touches. */
+	for (i = 0; i < kdata.num; i++) {
+		if (kbufs[i].size > privcmd_dm_op_buf_max_size) {
+			rc = -E2BIG;
+			goto out;
+		}
+
+		if (!access_ok(VERIFY_WRITE, kbufs[i].uptr,
+			       kbufs[i].size)) {
+			rc = -EFAULT;
+			goto out;
+		}
+
+		nr_pages += DIV_ROUND_UP(
+			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
+			PAGE_SIZE);
+	}
+
+	/* Zero-filled so unlock_pages() can skip slots never pinned. */
+	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
+	if (!pages) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL);
+	if (!xbufs) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	rc = lock_pages(kbufs, kdata.num, pages, nr_pages);
+	if (rc)
+		goto out;
+
+	/* Translate the user descriptors into hypervisor buffer handles. */
+	for (i = 0; i < kdata.num; i++) {
+		set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
+		xbufs[i].size = kbufs[i].size;
+	}
+
+	xen_preemptible_hcall_begin();
+	rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs);
+	xen_preemptible_hcall_end();
+
+out:
+	/* Safe on every path: pages may be NULL or only partly filled. */
+	unlock_pages(pages, nr_pages);
+	kfree(xbufs);
+	kfree(pages);
+	kfree(kbufs);
+
+	return rc;
+}
+
+/*
+ * IOCTL_PRIVCMD_RESTRICT: restrict this file to operations on one domain.
+ *
+ * The first call sets the restriction.  Later calls succeed only if they
+ * name the same domain (-EINVAL otherwise).  Returns -EFAULT if the
+ * domid cannot be read from user space.
+ */
+static long privcmd_ioctl_restrict(struct file *file, void __user *udata)
+{
+	struct privcmd_data *priv = file->private_data;
+	domid_t dom;
+
+	if (copy_from_user(&dom, udata, sizeof(dom)))
+		return -EFAULT;
+
+	/* Set restriction to the specified domain, or check it matches */
+	if (priv->domid == DOMID_INVALID) {
+		priv->domid = dom;
+		return 0;
+	}
+
+	return (priv->domid != dom) ? -EINVAL : 0;
+}
+
+/* Cursor passed to remap_pfn_fn() by privcmd_ioctl_mmap_resource(). */
+struct remap_pfn {
+	struct mm_struct *mm;	/* address space being modified */
+	struct page **pages;	/* pages to install, in order */
+	pgprot_t prot;		/* protection for the new ptes */
+	unsigned long i;	/* index of the next page to install */
+};
+
+/*
+ * apply_to_page_range() callback: install a special pte for the next
+ * page of the struct remap_pfn passed in @data at @addr, then advance the
+ * cursor.  Always returns 0.
+ */
+static int remap_pfn_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
+			void *data)
+{
+	struct remap_pfn *r = data;
+	struct page *page = r->pages[r->i];
+	pte_t pte = pte_mkspecial(pfn_pte(page_to_pfn(page), r->prot));
+
+	set_pte_at(r->mm, addr, ptep, pte);
+	r->i++;
+
+	return 0;
+}
+
+/*
+ * IOCTL_PRIVCMD_MMAP_RESOURCE: acquire a hypervisor resource and map its
+ * frames into a privcmd VMA of the caller.
+ *
+ * With both addr and num zero the call only queries the size of the
+ * resource and stores it in udata->num.  Otherwise num frames starting
+ * at index idx are acquired and mapped at addr.
+ *
+ * Returns 0 on success or a negative errno.  If only some of the frames
+ * could be mapped, the error of the first failing frame is returned.
+ */
+static long privcmd_ioctl_mmap_resource(struct file *file,
+				struct privcmd_mmap_resource __user *udata)
+{
+	struct privcmd_data *data = file->private_data;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	struct privcmd_mmap_resource kdata;
+	xen_pfn_t *pfns = NULL;
+	struct xen_mem_acquire_resource xdata = { };
+	int rc;
+
+	if (copy_from_user(&kdata, udata, sizeof(kdata)))
+		return -EFAULT;
+
+	/* If restriction is in place, check the domid matches */
+	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
+		return -EPERM;
+
+	/* Both fields must be set or unset */
+	if (!!kdata.addr != !!kdata.num)
+		return -EINVAL;
+
+	xdata.domid = kdata.dom;
+	xdata.type = kdata.type;
+	xdata.id = kdata.id;
+
+	if (!kdata.addr && !kdata.num) {
+		/* Query the size of the resource. */
+		rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
+		if (rc)
+			return rc;
+		return __put_user(xdata.nr_frames, &udata->num);
+	}
+
+	down_write(&mm->mmap_sem);
+
+	/* Only VMAs created by privcmd_mmap() are acceptable targets. */
+	vma = find_vma(mm, kdata.addr);
+	if (!vma || vma->vm_ops != &privcmd_vm_ops) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	pfns = kcalloc(kdata.num, sizeof(*pfns), GFP_KERNEL);
+	if (!pfns) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	if (xen_feature(XENFEAT_auto_translated_physmap)) {
+		unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE);
+		struct page **pages;
+		unsigned int i;
+
+		rc = alloc_empty_pages(vma, nr);
+		if (rc < 0)
+			goto out;
+
+		/* Tell the hypervisor where to place each resource frame. */
+		pages = vma->vm_private_data;
+		for (i = 0; i < kdata.num; i++) {
+			xen_pfn_t pfn =
+				page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]);
+
+			pfns[i] = pfn + (i % XEN_PFN_PER_PAGE);
+		}
+	} else
+		vma->vm_private_data = PRIV_VMA_LOCKED;
+
+	xdata.frame = kdata.idx;
+	xdata.nr_frames = kdata.num;
+	set_xen_guest_handle(xdata.frame_list, pfns);
+
+	xen_preemptible_hcall_begin();
+	rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
+	xen_preemptible_hcall_end();
+
+	if (rc)
+		goto out;
+
+	if (xen_feature(XENFEAT_auto_translated_physmap)) {
+		struct remap_pfn r = {
+			.mm = vma->vm_mm,
+			.pages = vma->vm_private_data,
+			.prot = vma->vm_page_prot,
+		};
+
+		rc = apply_to_page_range(r.mm, kdata.addr,
+					 kdata.num << PAGE_SHIFT,
+					 remap_pfn_fn, &r);
+	} else {
+		unsigned int domid =
+			(xdata.flags & XENMEM_rsrc_acq_caller_owned) ?
+			DOMID_SELF : kdata.dom;
+		/* The pfn buffer is reused for the per-frame error codes. */
+		int num, *errs = (int *)pfns;
+
+		BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns));
+		num = xen_remap_domain_mfn_array(vma,
+						 kdata.addr & PAGE_MASK,
+						 pfns, kdata.num, errs,
+						 vma->vm_page_prot,
+						 domid,
+						 vma->vm_private_data);
+		if (num < 0)
+			rc = num;
+		else if (num != kdata.num) {
+			unsigned int i;
+
+			/*
+			 * @num only counts the frames that were mapped; a
+			 * failing frame can sit at any index, so every
+			 * entry has to be inspected.  A partial mapping
+			 * must never be reported as success.
+			 */
+			rc = -EINVAL;
+			for (i = 0; i < kdata.num; i++) {
+				if (errs[i] < 0) {
+					rc = errs[i];
+					break;
+				}
+			}
+		} else
+			rc = 0;
+	}
+
+out:
+	up_write(&mm->mmap_sem);
+	kfree(pfns);
+
+	return rc;
+}
+
+/*
+ * unlocked_ioctl entry point: dispatch @cmd to its handler, passing
+ * @data as a user pointer.  Unknown commands yield -ENOTTY.
+ */
+static long privcmd_ioctl(struct file *file,
+			  unsigned int cmd, unsigned long data)
+{
+	void __user *uptr = (void __user *) data;
+	int ret;
+
+	switch (cmd) {
+	case IOCTL_PRIVCMD_HYPERCALL:
+		ret = privcmd_ioctl_hypercall(file, uptr);
+		break;
+	case IOCTL_PRIVCMD_MMAP:
+		ret = privcmd_ioctl_mmap(file, uptr);
+		break;
+	case IOCTL_PRIVCMD_MMAPBATCH:
+		ret = privcmd_ioctl_mmap_batch(file, uptr, 1);
+		break;
+	case IOCTL_PRIVCMD_MMAPBATCH_V2:
+		ret = privcmd_ioctl_mmap_batch(file, uptr, 2);
+		break;
+	case IOCTL_PRIVCMD_DM_OP:
+		ret = privcmd_ioctl_dm_op(file, uptr);
+		break;
+	case IOCTL_PRIVCMD_RESTRICT:
+		ret = privcmd_ioctl_restrict(file, uptr);
+		break;
+	case IOCTL_PRIVCMD_MMAP_RESOURCE:
+		ret = privcmd_ioctl_mmap_resource(file, uptr);
+		break;
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * open(): allocate the per-file state, initially unrestricted.
+ * Returns 0, or -ENOMEM if the allocation fails.
+ */
+static int privcmd_open(struct inode *ino, struct file *file)
+{
+	struct privcmd_data *priv;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv == NULL)
+		return -ENOMEM;
+
+	/* DOMID_INVALID implies no restriction */
+	priv->domid = DOMID_INVALID;
+	file->private_data = priv;
+
+	return 0;
+}
+
+/* release(): free the per-file state allocated by privcmd_open(). */
+static int privcmd_release(struct inode *ino, struct file *file)
+{
+	kfree(file->private_data);
+
+	return 0;
+}
+
+/*
+ * vm_ops->close: for auto-translated guests, unmap the foreign frames
+ * and give the ballooned pages back.  If unmapping fails the pages are
+ * deliberately leaked (only the array holding them is freed).
+ *
+ * Nothing is done for non-auto-translated guests, for an empty VMA, or
+ * when the VMA has no private data.
+ */
+static void privcmd_close(struct vm_area_struct *vma)
+{
+	struct page **pages = vma->vm_private_data;
+	int numpgs = vma_pages(vma);
+	int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
+	int rc;
+
+	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
+		return;
+
+	rc = xen_unmap_domain_gfn_range(vma, numgfns, pages);
+	if (rc == 0)
+		free_xenballooned_pages(numpgs, pages);
+	else
+		pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n",
+			numpgs, rc);
+	kfree(pages);
+}
+
+/*
+ * vm_ops->fault: log the faulting VMA/address and fail the access with
+ * SIGBUS.  Pages are never supplied from the fault path.
+ */
+static vm_fault_t privcmd_fault(struct vm_fault *vmf)
+{
+	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
+	       vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end,
+	       vmf->pgoff, (void *)vmf->address);
+
+	return VM_FAULT_SIGBUS;
+}
+
+/*
+ * VMA callbacks of privcmd mappings.  The address of this table is also
+ * how the ioctl handlers recognise a VMA created by privcmd_mmap().
+ */
+static const struct vm_operations_struct privcmd_vm_ops = {
+	.close = privcmd_close,
+	.fault = privcmd_fault
+};
+
+/*
+ * mmap(): set up an empty privcmd VMA.  Nothing is mapped here; the
+ * mapping ioctls populate the VMA later.  Always returns 0.
+ */
+static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	/* DONTCOPY is essential for Xen because copy_page_range doesn't know
+	 * how to recreate these mappings */
+	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY |
+			 VM_DONTEXPAND | VM_DONTDUMP;
+	vma->vm_ops = &privcmd_vm_ops;
+	/* NULL private data marks the VMA as not yet used by any ioctl. */
+	vma->vm_private_data = NULL;
+
+	return 0;
+}
+
+/*
+ * For MMAPBATCH*. This allows asserting the singleshot mapping
+ * on a per pfn/pte basis. Mapping calls that fail with ENOENT
+ * can then be retried until they succeed.
+ *
+ * apply_to_page_range() callback: returns 0 for an empty pte and -EBUSY
+ * for a populated one.
+ */
+static int is_mapped_fn(pte_t *pte, struct page *pmd_page,
+	                unsigned long addr, void *data)
+{
+	return pte_none(*pte) ? 0 : -EBUSY;
+}
+
+/*
+ * Check the @nr_pages pages starting at @addr in @vma's address space.
+ * Returns non-zero if apply_to_page_range() reports an error, which
+ * includes finding a populated pte, and 0 if the whole range is empty.
+ */
+static int privcmd_vma_range_is_mapped(
+	           struct vm_area_struct *vma,
+	           unsigned long addr,
+	           unsigned long nr_pages)
+{
+	return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT,
+				   is_mapped_fn, NULL) != 0;
+}
+
+/* File operations of the privcmd device. */
+const struct file_operations xen_privcmd_fops = {
+	.owner = THIS_MODULE,
+	.unlocked_ioctl = privcmd_ioctl,
+	.open = privcmd_open,
+	.release = privcmd_release,
+	.mmap = privcmd_mmap,
+};
+EXPORT_SYMBOL_GPL(xen_privcmd_fops);
+
+/* Misc device "xen/privcmd" with a dynamically assigned minor. */
+static struct miscdevice privcmd_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "xen/privcmd",
+	.fops = &xen_privcmd_fops,
+};
+
+/*
+ * Module init: when running on Xen, register the privcmd device and the
+ * hypercall buffer device.  If the second registration fails the first
+ * one is undone.  Returns 0 or a negative errno.
+ */
+static int __init privcmd_init(void)
+{
+	int rc;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	rc = misc_register(&privcmd_dev);
+	if (rc != 0) {
+		pr_err("Could not register Xen privcmd device\n");
+		return rc;
+	}
+
+	rc = misc_register(&xen_privcmdbuf_dev);
+	if (rc == 0)
+		return 0;
+
+	pr_err("Could not register Xen hypercall-buf device\n");
+	misc_deregister(&privcmd_dev);
+
+	return rc;
+}
+
+/* Module exit: deregister both devices registered by privcmd_init(). */
+static void __exit privcmd_exit(void)
+{
+	misc_deregister(&privcmd_dev);
+	misc_deregister(&xen_privcmdbuf_dev);
+}
+
+module_init(privcmd_init);
+module_exit(privcmd_exit);
diff --git a/drivers/xen/privcmd.h b/drivers/xen/privcmd.h
new file mode 100644
index 000000000..0dd9f8f67
--- /dev/null
+++ b/drivers/xen/privcmd.h
@@ -0,0 +1,6 @@
+#include <linux/fs.h>
+
+/*
+ * File operation tables; xen_privcmd_fops is defined in privcmd.c,
+ * xen_privcmdbuf_fops is defined outside of it.
+ */
+extern const struct file_operations xen_privcmd_fops;
+extern const struct file_operations xen_privcmdbuf_fops;
+
+/* Misc device that privcmd_init() registers after the privcmd device. */
+extern struct miscdevice xen_privcmdbuf_dev;
diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c
new file mode 100644
index 000000000..f94bb6034
--- /dev/null
+++ b/drivers/xen/pvcalls-back.c
@@ -0,0 +1,1252 @@
+/*
+ * (c) 2017 Stefano Stabellini <stefano@aporeto.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/inet.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/radix-tree.h>
+#include <linux/module.h>
+#include <linux/semaphore.h>
+#include <linux/wait.h>
+#include <net/sock.h>
+#include <net/inet_common.h>
+#include <net/inet_connection_sock.h>
+#include <net/request_sock.h>
+
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/pvcalls.h>
+
+#define PVCALLS_VERSIONS "1"
+#define MAX_RING_ORDER XENBUS_MAX_RING_GRANT_ORDER
+
+/*
+ * Driver-wide state: the list of connected frontends (struct pvcalls_fedata
+ * linked through ->list) and the semaphore that guards it. Both members are
+ * initialised in pvcalls_back_init().
+ *
+ * The object is file-local; NOTE(review): confirm that no other translation
+ * unit declares it extern.
+ */
+static struct pvcalls_back_global {
+	struct list_head frontends;
+	struct semaphore frontends_lock;
+} pvcalls_back_global;
+
+/*
+ * Per-frontend data structure. It contains pointers to the command
+ * ring, its event channel, a list of active sockets and a tree of
+ * passive sockets.
+ */
+struct pvcalls_fedata {
+	/* link in pvcalls_back_global.frontends */
+	struct list_head list;
+	struct xenbus_device *dev;
+	/* shared command ring page mapped in backend_connect() */
+	struct xen_pvcalls_sring *sring;
+	/* backend view of the command ring, initialised over sring */
+	struct xen_pvcalls_back_ring ring;
+	/* command-ring irq; -1 until bound in backend_connect() */
+	int irq;
+	/* active sockets (struct sock_mapping, linked through ->list) */
+	struct list_head socket_mappings;
+	/* passive sockets (struct sockpass_mapping) indexed by their id */
+	struct radix_tree_root socketpass_mappings;
+	/* taken around accesses to socket_mappings and socketpass_mappings */
+	struct semaphore socket_lock;
+};
+
+/*
+ * I/O worker of one active socket: register_work runs
+ * pvcalls_back_ioworker() on the dedicated workqueue wq.
+ */
+struct pvcalls_ioworker {
+	struct work_struct register_work;
+	struct workqueue_struct *wq;
+};
+
+/* State of one active (connected or accepted) socket. */
+struct sock_mapping {
+	/* link in pvcalls_fedata.socket_mappings */
+	struct list_head list;
+	struct pvcalls_fedata *fedata;
+	/* listening mapping this socket was accepted from, if any */
+	struct sockpass_mapping *sockpass;
+	struct socket *sock;
+	/* id taken from the frontend request */
+	uint64_t id;
+	/* grant reference of the page holding the ring indexes */
+	grant_ref_t ref;
+	/* mapped indexes page */
+	struct pvcalls_data_intf *ring;
+	/* mapped data area; data.in / data.out point into it */
+	void *bytes;
+	struct pvcalls_data data;
+	/* copied from ring->ring_order when the socket is set up */
+	uint32_t ring_order;
+	/* per-socket event channel irq */
+	int irq;
+	/* counters consumed by pvcalls_back_ioworker() */
+	atomic_t read;
+	atomic_t write;
+	atomic_t io;
+	/* set by pvcalls_back_release_active() to make the worker bail out */
+	atomic_t release;
+	/* pending late EOI requested by pvcalls_back_conn_event() */
+	atomic_t eoi;
+	/* original sk_data_ready, restored on release */
+	void (*saved_data_ready)(struct sock *sk);
+	struct pvcalls_ioworker ioworker;
+};
+
+/* State of one passive (bound/listening) socket. */
+struct sockpass_mapping {
+	struct list_head list;
+	struct pvcalls_fedata *fedata;
+	struct socket *sock;
+	/* key in pvcalls_fedata.socketpass_mappings */
+	uint64_t id;
+	/* pending ACCEPT or POLL request; cmd == 0 means none outstanding */
+	struct xen_pvcalls_request reqcopy;
+	/* protects reqcopy */
+	spinlock_t copy_lock;
+	/* workqueue running register_work (__pvcalls_back_accept) */
+	struct workqueue_struct *wq;
+	struct work_struct register_work;
+	/* original sk_data_ready, restored on release */
+	void (*saved_data_ready)(struct sock *sk);
+};
+
+static irqreturn_t pvcalls_back_conn_event(int irq, void *sock_map);
+static int pvcalls_back_release_active(struct xenbus_device *dev,
+				       struct pvcalls_fedata *fedata,
+				       struct sock_mapping *map);
+
+/*
+ * Moves data received on the socket into the "in" ring shared with the
+ * frontend.
+ *
+ * @opaque: the struct sock_mapping of the active socket.
+ *
+ * Returns false when the frontend-visible in_error is already set or the
+ * ring is full (nothing was attempted); returns true in every other case,
+ * including an empty receive queue. On a receive error the negative errno
+ * is published through intf->in_error, otherwise in_prod is advanced by
+ * the number of bytes copied; the frontend is notified in both cases.
+ */
+static bool pvcalls_conn_back_read(void *opaque)
+{
+	struct sock_mapping *map = (struct sock_mapping *)opaque;
+	struct msghdr msg;
+	struct kvec vec[2];
+	RING_IDX cons, prod, size, wanted, array_size, masked_prod, masked_cons;
+	int32_t error;
+	struct pvcalls_data_intf *intf = map->ring;
+	struct pvcalls_data *data = &map->data;
+	unsigned long flags;
+	int ret;
+
+	array_size = XEN_FLEX_RING_SIZE(map->ring_order);
+	cons = intf->in_cons;
+	prod = intf->in_prod;
+	error = intf->in_error;
+	/* read the indexes first, then deal with the data */
+	virt_mb();
+
+	if (error)
+		return false;
+
+	size = pvcalls_queued(prod, cons, array_size);
+	if (size >= array_size)
+		return false;
+	spin_lock_irqsave(&map->sock->sk->sk_receive_queue.lock, flags);
+	if (skb_queue_empty(&map->sock->sk->sk_receive_queue)) {
+		atomic_set(&map->read, 0);
+		spin_unlock_irqrestore(&map->sock->sk->sk_receive_queue.lock,
+				flags);
+		return true;
+	}
+	spin_unlock_irqrestore(&map->sock->sk->sk_receive_queue.lock, flags);
+	wanted = array_size - size;
+	masked_prod = pvcalls_mask(prod, array_size);
+	masked_cons = pvcalls_mask(cons, array_size);
+
+	memset(&msg, 0, sizeof(msg));
+	if (masked_prod < masked_cons) {
+		/* free space is one contiguous chunk */
+		vec[0].iov_base = data->in + masked_prod;
+		vec[0].iov_len = wanted;
+		iov_iter_kvec(&msg.msg_iter, ITER_KVEC|WRITE, vec, 1, wanted);
+	} else {
+		/* free space wraps around the end of the ring */
+		vec[0].iov_base = data->in + masked_prod;
+		vec[0].iov_len = array_size - masked_prod;
+		vec[1].iov_base = data->in;
+		vec[1].iov_len = wanted - vec[0].iov_len;
+		iov_iter_kvec(&msg.msg_iter, ITER_KVEC|WRITE, vec, 2, wanted);
+	}
+
+	atomic_set(&map->read, 0);
+	ret = inet_recvmsg(map->sock, &msg, wanted, MSG_DONTWAIT);
+	/*
+	 * Only a positive byte count can exceed the request. ret is signed
+	 * and wanted is not, so comparing them directly would make every
+	 * negative errno look larger than wanted and fire the warning.
+	 */
+	WARN_ON(ret > 0 && (RING_IDX)ret > wanted);
+	if (ret == -EAGAIN) /* shouldn't happen */
+		return true;
+	if (!ret)
+		ret = -ENOTCONN;
+	spin_lock_irqsave(&map->sock->sk->sk_receive_queue.lock, flags);
+	/* more data is still queued: ask the worker for another pass */
+	if (ret > 0 && !skb_queue_empty(&map->sock->sk->sk_receive_queue))
+		atomic_inc(&map->read);
+	spin_unlock_irqrestore(&map->sock->sk->sk_receive_queue.lock, flags);
+
+	/* write the data, then modify the indexes */
+	virt_wmb();
+	if (ret < 0) {
+		atomic_set(&map->read, 0);
+		intf->in_error = ret;
+	} else
+		intf->in_prod = prod + ret;
+	/* update the indexes, then notify the other end */
+	virt_wmb();
+	notify_remote_via_irq(map->irq);
+
+	return true;
+}
+
+/*
+ * Sends the data queued by the frontend in the "out" ring over the socket.
+ *
+ * Returns false when the out ring is empty, true otherwise. When the send
+ * would block (-EAGAIN) the write and io counters are bumped again so the
+ * worker retries. Any other result is published to the frontend: a negative
+ * errno through out_error, a byte count by advancing out_cons.
+ */
+static bool pvcalls_conn_back_write(struct sock_mapping *map)
+{
+	struct pvcalls_data_intf *intf = map->ring;
+	struct pvcalls_data *data = &map->data;
+	struct msghdr msg;
+	struct kvec vec[2];
+	RING_IDX cons, prod, size, array_size;
+	int ret;
+
+	cons = intf->out_cons;
+	prod = intf->out_prod;
+	/* read the indexes before dealing with the data */
+	virt_mb();
+
+	array_size = XEN_FLEX_RING_SIZE(map->ring_order);
+	size = pvcalls_queued(prod, cons, array_size);
+	if (size == 0)
+		return false;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_flags |= MSG_DONTWAIT;
+	if (pvcalls_mask(prod, array_size) > pvcalls_mask(cons, array_size)) {
+		/* queued data is one contiguous chunk */
+		vec[0].iov_base = data->out + pvcalls_mask(cons, array_size);
+		vec[0].iov_len = size;
+		iov_iter_kvec(&msg.msg_iter, ITER_KVEC|READ, vec, 1, size);
+	} else {
+		/* queued data wraps around the end of the ring */
+		vec[0].iov_base = data->out + pvcalls_mask(cons, array_size);
+		vec[0].iov_len = array_size - pvcalls_mask(cons, array_size);
+		vec[1].iov_base = data->out;
+		vec[1].iov_len = size - vec[0].iov_len;
+		iov_iter_kvec(&msg.msg_iter, ITER_KVEC|READ, vec, 2, size);
+	}
+
+	atomic_set(&map->write, 0);
+	ret = inet_sendmsg(map->sock, &msg, size);
+	if (ret == -EAGAIN) {
+		atomic_inc(&map->write);
+		atomic_inc(&map->io);
+		return true;
+	}
+
+	/* write the data, then update the indexes */
+	virt_wmb();
+	if (ret < 0) {
+		intf->out_error = ret;
+	} else {
+		intf->out_error = 0;
+		intf->out_cons = cons + ret;
+		prod = intf->out_prod;
+	}
+	/* update the indexes, then notify the other end */
+	virt_wmb();
+	/*
+	 * Data is left in the ring: schedule another pass.
+	 * NOTE(review): on the error branch ret is negative here, so this
+	 * comparison looks like it re-arms the worker as well - confirm that
+	 * is intended.
+	 */
+	if (prod != cons + ret) {
+		atomic_inc(&map->write);
+		atomic_inc(&map->io);
+	}
+	notify_remote_via_irq(map->irq);
+
+	return true;
+}
+
+/*
+ * Work function of an active socket. It loops while map->io is positive;
+ * each pass performs the pending read and/or write, issues the deferred
+ * late EOI once no write is pending, and then decrements map->io. The loop
+ * is abandoned as soon as map->release is found set (the flag is cleared
+ * on the way out).
+ */
+static void pvcalls_back_ioworker(struct work_struct *work)
+{
+	struct pvcalls_ioworker *ioworker = container_of(work,
+		struct pvcalls_ioworker, register_work);
+	struct sock_mapping *map = container_of(ioworker, struct sock_mapping,
+		ioworker);
+	/* stays "spurious" until a read or write pass reports activity */
+	unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;
+
+	while (atomic_read(&map->io) > 0) {
+		if (atomic_read(&map->release) > 0) {
+			atomic_set(&map->release, 0);
+			return;
+		}
+
+		if (atomic_read(&map->read) > 0 &&
+		    pvcalls_conn_back_read(map))
+			eoi_flags = 0;
+		if (atomic_read(&map->write) > 0 &&
+		    pvcalls_conn_back_write(map))
+			eoi_flags = 0;
+
+		/* an EOI was requested by the irq handler and no write is left */
+		if (atomic_read(&map->eoi) > 0 && !atomic_read(&map->write)) {
+			atomic_set(&map->eoi, 0);
+			xen_irq_lateeoi(map->irq, eoi_flags);
+			eoi_flags = XEN_EOI_FLAG_SPURIOUS;
+		}
+
+		atomic_dec(&map->io);
+	}
+}
+
+/*
+ * PVCALLS_SOCKET handler: only validates the requested domain/type/protocol
+ * and queues the response (-EAFNOSUPPORT when unsupported, 0 otherwise).
+ * Always returns 0, i.e. a response has been queued.
+ */
+static int pvcalls_back_socket(struct xenbus_device *dev,
+		struct xen_pvcalls_request *req)
+{
+	struct pvcalls_fedata *fedata = dev_get_drvdata(&dev->dev);
+	struct xen_pvcalls_response *rsp;
+	bool supported;
+
+	supported = req->u.socket.domain == AF_INET &&
+		    req->u.socket.type == SOCK_STREAM &&
+		    (req->u.socket.protocol == IPPROTO_IP ||
+		     req->u.socket.protocol == AF_INET);
+
+	/* the socket itself is only allocated later on */
+
+	rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++);
+	rsp->req_id = req->req_id;
+	rsp->cmd = req->cmd;
+	rsp->u.socket.id = req->u.socket.id;
+	rsp->ret = supported ? 0 : -EAFNOSUPPORT;
+
+	return 0;
+}
+
+/*
+ * sk_state_change callback of active sockets: flags a pending read and
+ * kicks the frontend. Does nothing once the mapping has been detached.
+ */
+static void pvcalls_sk_state_change(struct sock *sock)
+{
+	struct sock_mapping *map = sock->sk_user_data;
+
+	if (map != NULL) {
+		atomic_inc(&map->read);
+		notify_remote_via_irq(map->irq);
+	}
+}
+
+/*
+ * sk_data_ready callback of active sockets: flags a pending read and
+ * schedules the I/O worker. Does nothing once the mapping has been detached.
+ */
+static void pvcalls_sk_data_ready(struct sock *sock)
+{
+	struct sock_mapping *map = sock->sk_user_data;
+
+	if (map != NULL) {
+		struct pvcalls_ioworker *iow = &map->ioworker;
+
+		atomic_inc(&map->read);
+		atomic_inc(&map->io);
+		queue_work(iow->wq, &iow->register_work);
+	}
+}
+
+/*
+ * Creates the backend state of an active socket: maps the indexes page and
+ * the data ring granted by the frontend, binds the per-socket event
+ * channel, creates the I/O workqueue, links the mapping into
+ * fedata->socket_mappings and installs the socket callbacks.
+ *
+ * @fedata: frontend the socket belongs to
+ * @id:     socket id taken from the frontend request
+ * @ref:    grant reference of the indexes page
+ * @evtchn: remote event channel port of the socket
+ * @sock:   connected socket
+ *
+ * Returns the new mapping, or NULL on failure. On failure every resource
+ * acquired here is undone step by step; @sock is NOT released and remains
+ * owned by the caller. The mapping is never passed to
+ * pvcalls_back_release_active() on this path: it is not linked into the
+ * list yet and only partially set up.
+ */
+static struct sock_mapping *pvcalls_new_active_socket(
+		struct pvcalls_fedata *fedata,
+		uint64_t id,
+		grant_ref_t ref,
+		uint32_t evtchn,
+		struct socket *sock)
+{
+	int ret;
+	struct sock_mapping *map;
+	void *page;
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (map == NULL)
+		return NULL;
+
+	map->fedata = fedata;
+	map->sock = sock;
+	map->id = id;
+	map->ref = ref;
+
+	ret = xenbus_map_ring_valloc(fedata->dev, &ref, 1, &page);
+	if (ret < 0)
+		goto out_free;
+	map->ring = page;
+	map->ring_order = map->ring->ring_order;
+	/* first read the order, then map the data ring */
+	virt_rmb();
+	if (map->ring_order > MAX_RING_ORDER) {
+		pr_warn("%s frontend requested ring_order %u, which is > MAX (%u)\n",
+				__func__, map->ring_order, MAX_RING_ORDER);
+		goto out_unmap_ring;
+	}
+	ret = xenbus_map_ring_valloc(fedata->dev, map->ring->ref,
+				     (1 << map->ring_order), &page);
+	if (ret < 0)
+		goto out_unmap_ring;
+	map->bytes = page;
+
+	ret = bind_interdomain_evtchn_to_irqhandler_lateeoi(
+			fedata->dev->otherend_id, evtchn,
+			pvcalls_back_conn_event, 0, "pvcalls-backend", map);
+	if (ret < 0)
+		goto out_unmap_bytes;
+	map->irq = ret;
+
+	/* the "in" half comes first, the "out" half follows it */
+	map->data.in = map->bytes;
+	map->data.out = map->bytes + XEN_FLEX_RING_SIZE(map->ring_order);
+
+	map->ioworker.wq = alloc_workqueue("pvcalls_io", WQ_UNBOUND, 1);
+	if (!map->ioworker.wq)
+		goto out_unbind;
+	atomic_set(&map->io, 1);
+	INIT_WORK(&map->ioworker.register_work,	pvcalls_back_ioworker);
+
+	down(&fedata->socket_lock);
+	list_add_tail(&map->list, &fedata->socket_mappings);
+	up(&fedata->socket_lock);
+
+	write_lock_bh(&map->sock->sk->sk_callback_lock);
+	map->saved_data_ready = map->sock->sk->sk_data_ready;
+	map->sock->sk->sk_user_data = map;
+	map->sock->sk->sk_data_ready = pvcalls_sk_data_ready;
+	map->sock->sk->sk_state_change = pvcalls_sk_state_change;
+	write_unlock_bh(&map->sock->sk->sk_callback_lock);
+
+	return map;
+
+	/*
+	 * Until sk_user_data points at the mapping, pvcalls_back_conn_event()
+	 * only EOIs the interrupt, so no work can be queued on this path.
+	 */
+out_unbind:
+	unbind_from_irqhandler(map->irq, map);
+out_unmap_bytes:
+	xenbus_unmap_ring_vfree(fedata->dev, map->bytes);
+out_unmap_ring:
+	xenbus_unmap_ring_vfree(fedata->dev, (void *)map->ring);
+out_free:
+	kfree(map);
+	return NULL;
+}
+
+/*
+ * PVCALLS_CONNECT handler: validates the address, creates and connects a
+ * TCP socket and turns it into an active socket mapping. The outcome is
+ * reported in the queued response; the function itself always returns 0.
+ */
+static int pvcalls_back_connect(struct xenbus_device *dev,
+				struct xen_pvcalls_request *req)
+{
+	struct pvcalls_fedata *fedata = dev_get_drvdata(&dev->dev);
+	struct sockaddr *sa = (struct sockaddr *)&req->u.connect.addr;
+	struct xen_pvcalls_response *rsp;
+	struct sock_mapping *map;
+	struct socket *sock;
+	int ret;
+
+	/* the length must cover sa_family and fit the request's address */
+	if (req->u.connect.len < sizeof(sa->sa_family) ||
+	    req->u.connect.len > sizeof(req->u.connect.addr) ||
+	    sa->sa_family != AF_INET) {
+		ret = -EINVAL;
+		goto respond;
+	}
+
+	ret = sock_create(AF_INET, SOCK_STREAM, 0, &sock);
+	if (ret < 0)
+		goto respond;
+
+	ret = inet_stream_connect(sock, sa, req->u.connect.len, 0);
+	if (ret >= 0) {
+		map = pvcalls_new_active_socket(fedata,
+						req->u.connect.id,
+						req->u.connect.ref,
+						req->u.connect.evtchn,
+						sock);
+		if (map == NULL)
+			ret = -EFAULT;
+	}
+	/* connect or mapping setup failed: drop the socket again */
+	if (ret < 0)
+		sock_release(sock);
+
+respond:
+	rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++);
+	rsp->req_id = req->req_id;
+	rsp->cmd = req->cmd;
+	rsp->u.connect.id = req->u.connect.id;
+	rsp->ret = ret;
+
+	return 0;
+}
+
+/*
+ * Tears down an active socket mapping: silences its irq, detaches the
+ * socket callbacks, stops the I/O worker, destroys the worker's workqueue,
+ * unmaps both rings, unbinds the irq, releases the socket and frees the
+ * mapping. The caller must already have unlinked @map from
+ * fedata->socket_mappings. Always returns 0.
+ */
+static int pvcalls_back_release_active(struct xenbus_device *dev,
+				       struct pvcalls_fedata *fedata,
+				       struct sock_mapping *map)
+{
+	disable_irq(map->irq);
+	if (map->sock->sk != NULL) {
+		write_lock_bh(&map->sock->sk->sk_callback_lock);
+		map->sock->sk->sk_user_data = NULL;
+		map->sock->sk->sk_data_ready = map->saved_data_ready;
+		write_unlock_bh(&map->sock->sk->sk_callback_lock);
+	}
+
+	/* make a running worker bail out, then wait for it */
+	atomic_set(&map->release, 1);
+	flush_work(&map->ioworker.register_work);
+	/*
+	 * The workqueue is allocated per socket in
+	 * pvcalls_new_active_socket(); without this it would be leaked on
+	 * every release. It may be NULL if setup failed before allocating it.
+	 */
+	if (map->ioworker.wq)
+		destroy_workqueue(map->ioworker.wq);
+
+	xenbus_unmap_ring_vfree(dev, map->bytes);
+	xenbus_unmap_ring_vfree(dev, (void *)map->ring);
+	unbind_from_irqhandler(map->irq, map);
+
+	sock_release(map->sock);
+	kfree(map);
+
+	return 0;
+}
+
+/*
+ * Tears down a passive socket mapping: detaches the socket callbacks,
+ * drains and destroys the accept workqueue, releases the socket and frees
+ * the mapping. The caller must already have removed @mappass from
+ * fedata->socketpass_mappings. Always returns 0.
+ */
+static int pvcalls_back_release_passive(struct xenbus_device *dev,
+					struct pvcalls_fedata *fedata,
+					struct sockpass_mapping *mappass)
+{
+	if (mappass->sock->sk != NULL) {
+		write_lock_bh(&mappass->sock->sk->sk_callback_lock);
+		mappass->sock->sk->sk_user_data = NULL;
+		mappass->sock->sk->sk_data_ready = mappass->saved_data_ready;
+		write_unlock_bh(&mappass->sock->sk->sk_callback_lock);
+	}
+	/*
+	 * Drain the accept work before the socket goes away:
+	 * __pvcalls_back_accept() dereferences mappass->sock, so a work item
+	 * that is still pending must not run after sock_release().
+	 */
+	flush_workqueue(mappass->wq);
+	destroy_workqueue(mappass->wq);
+	sock_release(mappass->sock);
+	kfree(mappass);
+
+	return 0;
+}
+
+/*
+ * PVCALLS_RELEASE handler: looks the id up first among the active sockets,
+ * then among the passive ones, and tears down the match. A response is
+ * always queued (ret stays 0 when no socket matches); returns 0.
+ */
+static int pvcalls_back_release(struct xenbus_device *dev,
+				struct xen_pvcalls_request *req)
+{
+	struct pvcalls_fedata *fedata;
+	struct sock_mapping *map, *n;
+	struct sockpass_mapping *mappass;
+	int ret = 0;
+	struct xen_pvcalls_response *rsp;
+
+	fedata = dev_get_drvdata(&dev->dev);
+
+	down(&fedata->socket_lock);
+	list_for_each_entry_safe(map, n, &fedata->socket_mappings, list) {
+		if (map->id == req->u.release.id) {
+			/* unlink under the lock, tear down without it */
+			list_del(&map->list);
+			up(&fedata->socket_lock);
+			ret = pvcalls_back_release_active(dev, fedata, map);
+			goto out;
+		}
+	}
+	mappass = radix_tree_lookup(&fedata->socketpass_mappings,
+				    req->u.release.id);
+	if (mappass != NULL) {
+		radix_tree_delete(&fedata->socketpass_mappings, mappass->id);
+		up(&fedata->socket_lock);
+		ret = pvcalls_back_release_passive(dev, fedata, mappass);
+	} else
+		up(&fedata->socket_lock);
+
+out:
+	rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++);
+	rsp->req_id = req->req_id;
+	rsp->u.release.id = req->u.release.id;
+	rsp->cmd = req->cmd;
+	rsp->ret = ret;
+	return 0;
+}
+
+/*
+ * Work function of a passive socket: serves the ACCEPT request stored in
+ * mappass->reqcopy. It returns without answering when no ACCEPT is pending
+ * or when no connection is ready yet (-EAGAIN; the request stays pending).
+ * In every other case it pushes the response itself, notifies the frontend
+ * if needed and clears the pending request.
+ */
+static void __pvcalls_back_accept(struct work_struct *work)
+{
+	struct sockpass_mapping *mappass = container_of(
+		work, struct sockpass_mapping, register_work);
+	struct sock_mapping *map;
+	struct pvcalls_ioworker *iow;
+	struct pvcalls_fedata *fedata;
+	struct socket *sock;
+	struct xen_pvcalls_response *rsp;
+	struct xen_pvcalls_request *req;
+	int notify;
+	int ret = -EINVAL;
+	unsigned long flags;
+
+	fedata = mappass->fedata;
+	/*
+	 * __pvcalls_back_accept can race against pvcalls_back_accept.
+	 * We only need to check the value of "cmd" on read. It could be
+	 * done atomically, but to simplify the code on the write side, we
+	 * use a spinlock.
+	 */
+	spin_lock_irqsave(&mappass->copy_lock, flags);
+	req = &mappass->reqcopy;
+	if (req->cmd != PVCALLS_ACCEPT) {
+		spin_unlock_irqrestore(&mappass->copy_lock, flags);
+		return;
+	}
+	spin_unlock_irqrestore(&mappass->copy_lock, flags);
+
+	sock = sock_alloc();
+	if (sock == NULL)
+		goto out_error;
+	sock->type = mappass->sock->type;
+	sock->ops = mappass->sock->ops;
+
+	ret = inet_accept(mappass->sock, sock, O_NONBLOCK, true);
+	if (ret == -EAGAIN) {
+		/* nothing to accept yet; the request stays pending */
+		sock_release(sock);
+		return;
+	}
+	if (ret < 0) {
+		/*
+		 * Any other failure leaves the new socket without a struct
+		 * sock attached, so it must not be handed to
+		 * pvcalls_new_active_socket(); report the error instead.
+		 */
+		sock_release(sock);
+		goto out_error;
+	}
+
+	map = pvcalls_new_active_socket(fedata,
+					req->u.accept.id_new,
+					req->u.accept.ref,
+					req->u.accept.evtchn,
+					sock);
+	if (!map) {
+		ret = -EFAULT;
+		sock_release(sock);
+		goto out_error;
+	}
+
+	map->sockpass = mappass;
+	iow = &map->ioworker;
+	/* data may already be waiting on the accepted socket */
+	atomic_inc(&map->read);
+	atomic_inc(&map->io);
+	queue_work(iow->wq, &iow->register_work);
+
+out_error:
+	rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++);
+	rsp->req_id = req->req_id;
+	rsp->cmd = req->cmd;
+	rsp->u.accept.id = req->u.accept.id;
+	rsp->ret = ret;
+	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&fedata->ring, notify);
+	if (notify)
+		notify_remote_via_irq(fedata->irq);
+
+	/* the slot is free for the next accept or poll */
+	mappass->reqcopy.cmd = 0;
+}
+
+/*
+ * sk_data_ready callback of passive sockets. With a POLL request pending
+ * it completes that request right away (ret 0); otherwise it schedules the
+ * accept work. Does nothing once the mapping has been detached.
+ */
+static void pvcalls_pass_sk_data_ready(struct sock *sock)
+{
+	struct sockpass_mapping *mappass = sock->sk_user_data;
+	struct xen_pvcalls_response *rsp;
+	struct pvcalls_fedata *fedata;
+	unsigned long flags;
+	int notify;
+
+	if (!mappass)
+		return;
+
+	fedata = mappass->fedata;
+
+	spin_lock_irqsave(&mappass->copy_lock, flags);
+	if (mappass->reqcopy.cmd != PVCALLS_POLL) {
+		/* no poll outstanding: let the accept worker have a look */
+		spin_unlock_irqrestore(&mappass->copy_lock, flags);
+		queue_work(mappass->wq, &mappass->register_work);
+		return;
+	}
+
+	rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++);
+	rsp->req_id = mappass->reqcopy.req_id;
+	rsp->u.poll.id = mappass->reqcopy.u.poll.id;
+	rsp->cmd = mappass->reqcopy.cmd;
+	rsp->ret = 0;
+
+	/* the poll has been answered, free the slot */
+	mappass->reqcopy.cmd = 0;
+	spin_unlock_irqrestore(&mappass->copy_lock, flags);
+
+	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&fedata->ring, notify);
+	if (notify)
+		notify_remote_via_irq(mappass->fedata->irq);
+}
+
+/*
+ * PVCALLS_BIND handler: allocates a passive socket mapping with its accept
+ * workqueue, creates and binds a TCP socket, stores the mapping in
+ * fedata->socketpass_mappings under the request's id and installs the
+ * data-ready callback. On any failure everything allocated so far is
+ * released. The outcome is reported in the queued response; returns 0.
+ */
+static int pvcalls_back_bind(struct xenbus_device *dev,
+			     struct xen_pvcalls_request *req)
+{
+	struct pvcalls_fedata *fedata;
+	int ret;
+	struct sockpass_mapping *map;
+	struct xen_pvcalls_response *rsp;
+
+	fedata = dev_get_drvdata(&dev->dev);
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (map == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	INIT_WORK(&map->register_work, __pvcalls_back_accept);
+	spin_lock_init(&map->copy_lock);
+	map->wq = alloc_workqueue("pvcalls_wq", WQ_UNBOUND, 1);
+	if (!map->wq) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = sock_create(AF_INET, SOCK_STREAM, 0, &map->sock);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * NOTE(review): unlike pvcalls_back_connect(), req->u.bind.len is
+	 * handed over without a bounds check here - confirm inet_bind()
+	 * validates it sufficiently.
+	 */
+	ret = inet_bind(map->sock, (struct sockaddr *)&req->u.bind.addr,
+			req->u.bind.len);
+	if (ret < 0)
+		goto out;
+
+	map->fedata = fedata;
+	map->id = req->u.bind.id;
+
+	down(&fedata->socket_lock);
+	ret = radix_tree_insert(&fedata->socketpass_mappings, map->id,
+				map);
+	up(&fedata->socket_lock);
+	if (ret)
+		goto out;
+
+	write_lock_bh(&map->sock->sk->sk_callback_lock);
+	map->saved_data_ready = map->sock->sk->sk_data_ready;
+	map->sock->sk->sk_user_data = map;
+	map->sock->sk->sk_data_ready = pvcalls_pass_sk_data_ready;
+	write_unlock_bh(&map->sock->sk->sk_callback_lock);
+
+out:
+	/* the success path falls through here with ret == 0 */
+	if (ret) {
+		if (map && map->sock)
+			sock_release(map->sock);
+		if (map && map->wq)
+			destroy_workqueue(map->wq);
+		kfree(map);
+	}
+	rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++);
+	rsp->req_id = req->req_id;
+	rsp->cmd = req->cmd;
+	rsp->u.bind.id = req->u.bind.id;
+	rsp->ret = ret;
+	return 0;
+}
+
+/*
+ * PVCALLS_LISTEN handler: puts the passive socket with the given id into
+ * listening state. The response carries -EINVAL for an unknown id,
+ * otherwise the result of inet_listen(). Always returns 0.
+ */
+static int pvcalls_back_listen(struct xenbus_device *dev,
+			       struct xen_pvcalls_request *req)
+{
+	struct pvcalls_fedata *fedata = dev_get_drvdata(&dev->dev);
+	struct xen_pvcalls_response *rsp;
+	struct sockpass_mapping *map;
+	int ret;
+
+	down(&fedata->socket_lock);
+	map = radix_tree_lookup(&fedata->socketpass_mappings, req->u.listen.id);
+	up(&fedata->socket_lock);
+
+	if (map != NULL)
+		ret = inet_listen(map->sock, req->u.listen.backlog);
+	else
+		ret = -EINVAL;
+
+	rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++);
+	rsp->req_id = req->req_id;
+	rsp->cmd = req->cmd;
+	rsp->u.listen.id = req->u.listen.id;
+	rsp->ret = ret;
+	return 0;
+}
+
+/*
+ * PVCALLS_ACCEPT handler: stores the request in the passive mapping and
+ * schedules the accept work, which answers later.
+ *
+ * Returns -1 when the request was deferred (no response queued yet), or 0
+ * after queuing an error response: -EINVAL for an unknown id, -EINTR when
+ * another accept or poll is already outstanding on the socket.
+ */
+static int pvcalls_back_accept(struct xenbus_device *dev,
+			       struct xen_pvcalls_request *req)
+{
+	struct pvcalls_fedata *fedata = dev_get_drvdata(&dev->dev);
+	struct xen_pvcalls_response *rsp;
+	struct sockpass_mapping *mappass;
+	unsigned long flags;
+	int ret = -EINVAL;
+	bool busy;
+
+	down(&fedata->socket_lock);
+	mappass = radix_tree_lookup(&fedata->socketpass_mappings,
+		req->u.accept.id);
+	up(&fedata->socket_lock);
+	if (!mappass)
+		goto respond;
+
+	/*
+	 * Limitation of the current implementation: only one accept or
+	 * poll call may be in flight on a socket at any time.
+	 */
+	spin_lock_irqsave(&mappass->copy_lock, flags);
+	busy = mappass->reqcopy.cmd != 0;
+	if (!busy)
+		mappass->reqcopy = *req;
+	spin_unlock_irqrestore(&mappass->copy_lock, flags);
+
+	if (busy) {
+		ret = -EINTR;
+		goto respond;
+	}
+
+	queue_work(mappass->wq, &mappass->register_work);
+
+	/* Tell the caller we don't need to send back a notification yet */
+	return -1;
+
+respond:
+	rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++);
+	rsp->req_id = req->req_id;
+	rsp->cmd = req->cmd;
+	rsp->u.accept.id = req->u.accept.id;
+	rsp->ret = ret;
+	return 0;
+}
+
+/*
+ * PVCALLS_POLL handler for passive sockets.
+ *
+ * Returns -1 when the poll was stored and will be answered later by
+ * pvcalls_pass_sk_data_ready(). Returns 0 after queuing a response:
+ * ret 0 when a connection is already waiting in the accept queue,
+ * -EINTR when another accept or poll is outstanding on the socket, and
+ * -EINVAL for an unknown id. The unknown-id case is answered like in the
+ * other handlers; returning without a response would leave the frontend's
+ * request unanswered.
+ */
+static int pvcalls_back_poll(struct xenbus_device *dev,
+			     struct xen_pvcalls_request *req)
+{
+	struct pvcalls_fedata *fedata;
+	struct sockpass_mapping *mappass;
+	struct xen_pvcalls_response *rsp;
+	struct inet_connection_sock *icsk;
+	struct request_sock_queue *queue;
+	unsigned long flags;
+	int ret;
+	bool data;
+
+	fedata = dev_get_drvdata(&dev->dev);
+
+	down(&fedata->socket_lock);
+	mappass = radix_tree_lookup(&fedata->socketpass_mappings,
+				    req->u.poll.id);
+	up(&fedata->socket_lock);
+	if (mappass == NULL) {
+		ret = -EINVAL;
+		goto respond;
+	}
+
+	/*
+	 * Limitation of the current implementation: only support one
+	 * concurrent accept or poll call on one socket.
+	 */
+	spin_lock_irqsave(&mappass->copy_lock, flags);
+	if (mappass->reqcopy.cmd != 0) {
+		ret = -EINTR;
+		goto unlock;
+	}
+
+	mappass->reqcopy = *req;
+	icsk = inet_csk(mappass->sock->sk);
+	queue = &icsk->icsk_accept_queue;
+	data = READ_ONCE(queue->rskq_accept_head) != NULL;
+	if (data) {
+		/* a connection is already waiting: answer immediately */
+		mappass->reqcopy.cmd = 0;
+		ret = 0;
+		goto unlock;
+	}
+	spin_unlock_irqrestore(&mappass->copy_lock, flags);
+
+	/* Tell the caller we don't need to send back a notification yet */
+	return -1;
+
+unlock:
+	spin_unlock_irqrestore(&mappass->copy_lock, flags);
+
+respond:
+	rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++);
+	rsp->req_id = req->req_id;
+	rsp->cmd = req->cmd;
+	rsp->u.poll.id = req->u.poll.id;
+	rsp->ret = ret;
+	return 0;
+}
+
+/*
+ * Dispatches one request to its handler and hands back the handler's
+ * return value (0 means a response has been queued). Unknown commands are
+ * answered with -ENOTSUPP and yield 0.
+ */
+static int pvcalls_back_handle_cmd(struct xenbus_device *dev,
+				   struct xen_pvcalls_request *req)
+{
+	struct pvcalls_fedata *fedata;
+	struct xen_pvcalls_response *rsp;
+
+	switch (req->cmd) {
+	case PVCALLS_SOCKET:
+		return pvcalls_back_socket(dev, req);
+	case PVCALLS_CONNECT:
+		return pvcalls_back_connect(dev, req);
+	case PVCALLS_RELEASE:
+		return pvcalls_back_release(dev, req);
+	case PVCALLS_BIND:
+		return pvcalls_back_bind(dev, req);
+	case PVCALLS_LISTEN:
+		return pvcalls_back_listen(dev, req);
+	case PVCALLS_ACCEPT:
+		return pvcalls_back_accept(dev, req);
+	case PVCALLS_POLL:
+		return pvcalls_back_poll(dev, req);
+	default:
+		break;
+	}
+
+	/* unknown command */
+	fedata = dev_get_drvdata(&dev->dev);
+	rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++);
+	rsp->req_id = req->req_id;
+	rsp->cmd = req->cmd;
+	rsp->ret = -ENOTSUPP;
+
+	return 0;
+}
+
+/*
+ * Drains the command ring: every request is copied out and dispatched;
+ * responses are pushed for the handlers that returned 0, and the frontend
+ * is notified once per batch when any push asked for it. Repeats until the
+ * final check finds no further requests.
+ */
+static void pvcalls_back_work(struct pvcalls_fedata *fedata)
+{
+	struct xenbus_device *dev = fedata->dev;
+	struct xen_pvcalls_request req;
+	int notify, notify_all = 0, more;
+
+	do {
+		while (RING_HAS_UNCONSUMED_REQUESTS(&fedata->ring)) {
+			RING_COPY_REQUEST(&fedata->ring,
+					  fedata->ring.req_cons++,
+					  &req);
+
+			/* non-zero: the handler answers later by itself */
+			if (pvcalls_back_handle_cmd(dev, &req))
+				continue;
+
+			RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&fedata->ring,
+							     notify);
+			notify_all += notify;
+		}
+
+		if (notify_all) {
+			notify_remote_via_irq(fedata->irq);
+			notify_all = 0;
+		}
+
+		RING_FINAL_CHECK_FOR_REQUESTS(&fedata->ring, more);
+	} while (more);
+}
+
+/*
+ * Threaded irq handler of the command ring. Processes the ring when the
+ * device has frontend data attached; otherwise the late EOI is flagged as
+ * spurious. Always returns IRQ_HANDLED.
+ */
+static irqreturn_t pvcalls_back_event(int irq, void *dev_id)
+{
+	struct xenbus_device *dev = dev_id;
+	struct pvcalls_fedata *fedata;
+	unsigned int eoi_flags;
+
+	fedata = dev ? dev_get_drvdata(&dev->dev) : NULL;
+	if (fedata) {
+		pvcalls_back_work(fedata);
+		eoi_flags = 0;
+	} else {
+		eoi_flags = XEN_EOI_FLAG_SPURIOUS;
+	}
+
+	xen_irq_lateeoi(irq, eoi_flags);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Irq handler of a per-socket event channel. While the mapping is attached
+ * to its socket it flags a pending write plus a deferred EOI and schedules
+ * the I/O worker (which issues the late EOI); otherwise it EOIs the
+ * interrupt right away. Always returns IRQ_HANDLED.
+ */
+static irqreturn_t pvcalls_back_conn_event(int irq, void *sock_map)
+{
+	struct sock_mapping *map = sock_map;
+	bool attached;
+
+	attached = map != NULL && map->sock != NULL &&
+		   map->sock->sk != NULL &&
+		   map->sock->sk->sk_user_data == map;
+
+	if (attached) {
+		struct pvcalls_ioworker *iow = &map->ioworker;
+
+		atomic_inc(&map->write);
+		atomic_inc(&map->eoi);
+		atomic_inc(&map->io);
+		queue_work(iow->wq, &iow->register_work);
+	} else {
+		xen_irq_lateeoi(irq, 0);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Connects the backend to a frontend: reads "port" and "ring-ref" from the
+ * frontend's xenstore directory, binds the event channel with a threaded
+ * handler, maps the command ring, initialises the per-frontend state and
+ * adds it to the global frontend list.
+ *
+ * Returns 0 on success or a negative errno; on failure the irq and ring
+ * mapping acquired so far are released and the state is freed.
+ */
+static int backend_connect(struct xenbus_device *dev)
+{
+	int err;
+	/* unsigned to match the "%u" conversion used to read it */
+	unsigned int evtchn;
+	grant_ref_t ring_ref;
+	struct pvcalls_fedata *fedata = NULL;
+
+	fedata = kzalloc(sizeof(struct pvcalls_fedata), GFP_KERNEL);
+	if (!fedata)
+		return -ENOMEM;
+
+	/* marks "no irq bound yet" for the error path */
+	fedata->irq = -1;
+	err = xenbus_scanf(XBT_NIL, dev->otherend, "port", "%u",
+			   &evtchn);
+	if (err != 1) {
+		err = -EINVAL;
+		/* name the key that was actually read */
+		xenbus_dev_fatal(dev, err, "reading %s/port",
+				 dev->otherend);
+		goto error;
+	}
+
+	err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", "%u", &ring_ref);
+	if (err != 1) {
+		err = -EINVAL;
+		xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
+				 dev->otherend);
+		goto error;
+	}
+
+	err = bind_interdomain_evtchn_to_irq_lateeoi(dev->otherend_id, evtchn);
+	if (err < 0)
+		goto error;
+	fedata->irq = err;
+
+	err = request_threaded_irq(fedata->irq, NULL, pvcalls_back_event,
+				   IRQF_ONESHOT, "pvcalls-back", dev);
+	if (err < 0)
+		goto error;
+
+	err = xenbus_map_ring_valloc(dev, &ring_ref, 1,
+				     (void **)&fedata->sring);
+	if (err < 0)
+		goto error;
+
+	BACK_RING_INIT(&fedata->ring, fedata->sring, XEN_PAGE_SIZE * 1);
+	fedata->dev = dev;
+
+	INIT_LIST_HEAD(&fedata->socket_mappings);
+	INIT_RADIX_TREE(&fedata->socketpass_mappings, GFP_KERNEL);
+	sema_init(&fedata->socket_lock, 1);
+	/* from here on pvcalls_back_event() will process the ring */
+	dev_set_drvdata(&dev->dev, fedata);
+
+	down(&pvcalls_back_global.frontends_lock);
+	list_add_tail(&fedata->list, &pvcalls_back_global.frontends);
+	up(&pvcalls_back_global.frontends_lock);
+
+	return 0;
+
+ error:
+	if (fedata->irq >= 0)
+		unbind_from_irqhandler(fedata->irq, dev);
+	if (fedata->sring != NULL)
+		xenbus_unmap_ring_vfree(dev, fedata->sring);
+	kfree(fedata);
+	return err;
+}
+
+/*
+ * Disconnects a frontend: releases all its active and passive sockets
+ * under socket_lock, unbinds the command irq, unmaps the command ring,
+ * unlinks the per-frontend state from the global list and frees it.
+ * The visible callers hold pvcalls_back_global.frontends_lock around this
+ * call. Always returns 0.
+ */
+static int backend_disconnect(struct xenbus_device *dev)
+{
+	struct pvcalls_fedata *fedata;
+	struct sock_mapping *map, *n;
+	struct sockpass_mapping *mappass;
+	struct radix_tree_iter iter;
+	void **slot;
+
+
+	fedata = dev_get_drvdata(&dev->dev);
+
+	down(&fedata->socket_lock);
+	list_for_each_entry_safe(map, n, &fedata->socket_mappings, list) {
+		list_del(&map->list);
+		pvcalls_back_release_active(dev, fedata, map);
+	}
+
+	radix_tree_for_each_slot(slot, &fedata->socketpass_mappings, &iter, 0) {
+		mappass = radix_tree_deref_slot(slot);
+		if (!mappass)
+			continue;
+		if (radix_tree_exception(mappass)) {
+			if (radix_tree_deref_retry(mappass))
+				slot = radix_tree_iter_retry(&iter);
+		} else {
+			radix_tree_delete(&fedata->socketpass_mappings,
+					  mappass->id);
+			pvcalls_back_release_passive(dev, fedata, mappass);
+		}
+	}
+	up(&fedata->socket_lock);
+
+	unbind_from_irqhandler(fedata->irq, dev);
+	xenbus_unmap_ring_vfree(dev, fedata->sring);
+
+	list_del(&fedata->list);
+	kfree(fedata);
+	dev_set_drvdata(&dev->dev, NULL);
+
+	return 0;
+}
+
+/*
+ * Probe callback: advertises "versions", "max-page-order" and
+ * "function-calls" in the backend's xenstore node within one transaction,
+ * then switches the device to XenbusStateInitWait.
+ *
+ * The transaction is restarted when ending it reports -EAGAIN. Returns 0 on
+ * success, the xenbus error when starting or ending the transaction fails,
+ * and -EFAULT when one of the writes failed and the transaction was
+ * aborted.
+ */
+static int pvcalls_back_probe(struct xenbus_device *dev,
+			      const struct xenbus_device_id *id)
+{
+	int err, abort;
+	struct xenbus_transaction xbt;
+
+again:
+	/* stays set until all three writes went through */
+	abort = 1;
+
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		pr_warn("%s cannot create xenstore transaction\n", __func__);
+		return err;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename, "versions", "%s",
+			    PVCALLS_VERSIONS);
+	if (err) {
+		pr_warn("%s write out 'versions' failed\n", __func__);
+		goto abort;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename, "max-page-order", "%u",
+			    MAX_RING_ORDER);
+	if (err) {
+		pr_warn("%s write out 'max-page-order' failed\n", __func__);
+		goto abort;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename, "function-calls",
+			    XENBUS_FUNCTIONS_CALLS);
+	if (err) {
+		pr_warn("%s write out 'function-calls' failed\n", __func__);
+		goto abort;
+	}
+
+	abort = 0;
+abort:
+	err = xenbus_transaction_end(xbt, abort);
+	if (err) {
+		/* only a commit (not an abort) is worth retrying */
+		if (err == -EAGAIN && !abort)
+			goto again;
+		pr_warn("%s cannot complete xenstore transaction\n", __func__);
+		return err;
+	}
+
+	if (abort)
+		return -EFAULT;
+
+	xenbus_switch_state(dev, XenbusStateInitWait);
+
+	return 0;
+}
+
+/*
+ * Drives the backend's xenbus state towards @state one transition at a
+ * time, looping until dev->state equals the target. Going to Connected
+ * calls backend_connect() (the function gives up when that fails); leaving
+ * Connected calls backend_disconnect() under the global frontends lock.
+ *
+ * NOTE(review): the WARN_ON(1) branches leave dev->state untouched, so the
+ * loop would appear to repeat for an unexpected combination - confirm.
+ */
+static void set_backend_state(struct xenbus_device *dev,
+			      enum xenbus_state state)
+{
+	while (dev->state != state) {
+		switch (dev->state) {
+		case XenbusStateClosed:
+			switch (state) {
+			case XenbusStateInitWait:
+			case XenbusStateConnected:
+				xenbus_switch_state(dev, XenbusStateInitWait);
+				break;
+			case XenbusStateClosing:
+				xenbus_switch_state(dev, XenbusStateClosing);
+				break;
+			default:
+				WARN_ON(1);
+			}
+			break;
+		case XenbusStateInitWait:
+		case XenbusStateInitialised:
+			switch (state) {
+			case XenbusStateConnected:
+				if (backend_connect(dev))
+					return;
+				xenbus_switch_state(dev, XenbusStateConnected);
+				break;
+			case XenbusStateClosing:
+			case XenbusStateClosed:
+				xenbus_switch_state(dev, XenbusStateClosing);
+				break;
+			default:
+				WARN_ON(1);
+			}
+			break;
+		case XenbusStateConnected:
+			switch (state) {
+			case XenbusStateInitWait:
+			case XenbusStateClosing:
+			case XenbusStateClosed:
+				down(&pvcalls_back_global.frontends_lock);
+				backend_disconnect(dev);
+				up(&pvcalls_back_global.frontends_lock);
+				xenbus_switch_state(dev, XenbusStateClosing);
+				break;
+			default:
+				WARN_ON(1);
+			}
+			break;
+		case XenbusStateClosing:
+			switch (state) {
+			case XenbusStateInitWait:
+			case XenbusStateConnected:
+			case XenbusStateClosed:
+				xenbus_switch_state(dev, XenbusStateClosed);
+				break;
+			default:
+				WARN_ON(1);
+			}
+			break;
+		default:
+			WARN_ON(1);
+		}
+	}
+}
+
+/*
+ * otherend_changed callback: maps the frontend's new state onto the state
+ * the backend should move to. A closed frontend additionally gets the
+ * device unregistered unless it is still online; an unknown frontend state
+ * always does. Unhandled states are reported as fatal.
+ */
+static void pvcalls_back_changed(struct xenbus_device *dev,
+				 enum xenbus_state frontend_state)
+{
+	switch (frontend_state) {
+	case XenbusStateInitialising:
+		set_backend_state(dev, XenbusStateInitWait);
+		return;
+
+	case XenbusStateInitialised:
+	case XenbusStateConnected:
+		set_backend_state(dev, XenbusStateConnected);
+		return;
+
+	case XenbusStateClosing:
+		set_backend_state(dev, XenbusStateClosing);
+		return;
+
+	case XenbusStateClosed:
+		set_backend_state(dev, XenbusStateClosed);
+		if (!xenbus_dev_is_online(dev))
+			device_unregister(&dev->dev);
+		return;
+
+	case XenbusStateUnknown:
+		set_backend_state(dev, XenbusStateClosed);
+		device_unregister(&dev->dev);
+		return;
+
+	default:
+		xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+				 frontend_state);
+		return;
+	}
+}
+
+/* Remove callback: nothing is done here; always returns 0. */
+static int pvcalls_back_remove(struct xenbus_device *dev)
+{
+	return 0;
+}
+
+/* uevent callback: adds nothing to the environment; always returns 0. */
+static int pvcalls_back_uevent(struct xenbus_device *xdev,
+			       struct kobj_uevent_env *env)
+{
+	return 0;
+}
+
+/* Device types handled by this driver; the empty entry ends the table. */
+static const struct xenbus_device_id pvcalls_back_ids[] = {
+	{ "pvcalls" },
+	{ "" }
+};
+
+/* xenbus backend driver description registered by pvcalls_back_init(). */
+static struct xenbus_driver pvcalls_back_driver = {
+	.ids = pvcalls_back_ids,
+	.probe = pvcalls_back_probe,
+	.remove = pvcalls_back_remove,
+	.uevent = pvcalls_back_uevent,
+	.otherend_changed = pvcalls_back_changed,
+};
+
+/*
+ * Module init: sets up the global frontend list and its lock, then
+ * registers the xenbus backend driver.
+ *
+ * Returns -ENODEV outside a Xen domain, the negative error from
+ * xenbus_register_backend() on failure, 0 on success.
+ */
+static int __init pvcalls_back_init(void)
+{
+	int ret;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	/*
+	 * The global state has to be ready before the driver is registered:
+	 * its callbacks take frontends_lock and link into frontends, and
+	 * they may be invoked once registration has happened.
+	 */
+	sema_init(&pvcalls_back_global.frontends_lock, 1);
+	INIT_LIST_HEAD(&pvcalls_back_global.frontends);
+
+	ret = xenbus_register_backend(&pvcalls_back_driver);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+module_init(pvcalls_back_init);
+
+/*
+ * Module exit: disconnects every frontend still on the global list (under
+ * the list lock), then unregisters the xenbus driver.
+ */
+static void __exit pvcalls_back_fin(void)
+{
+	struct pvcalls_fedata *cur, *next;
+
+	down(&pvcalls_back_global.frontends_lock);
+	/* backend_disconnect() unlinks and frees the entry being visited */
+	list_for_each_entry_safe(cur, next, &pvcalls_back_global.frontends,
+				 list)
+		backend_disconnect(cur->dev);
+	up(&pvcalls_back_global.frontends_lock);
+
+	xenbus_unregister_driver(&pvcalls_back_driver);
+}
+
+module_exit(pvcalls_back_fin);
+
+MODULE_DESCRIPTION("Xen PV Calls backend driver");
+MODULE_AUTHOR("Stefano Stabellini <sstabellini@kernel.org>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
new file mode 100644
index 000000000..285bb8390
--- /dev/null
+++ b/drivers/xen/pvcalls-front.c
@@ -0,0 +1,1300 @@
+/*
+ * (c) 2017 Stefano Stabellini <stefano@aporeto.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+
+#include <net/sock.h>
+
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/pvcalls.h>
+
+#include "pvcalls-front.h"
+
+/* Value kept in a response slot's req_id while the slot holds no response. */
+#define PVCALLS_INVALID_ID UINT_MAX
+/* Page order of the data buffer allocated for each active socket. */
+#define PVCALLS_RING_ORDER XENBUS_MAX_RING_GRANT_ORDER
+/* Number of entries in the per-connection response array. */
+#define PVCALLS_NR_RSP_PER_RING __CONST_RING_SIZE(xen_pvcalls, XEN_PAGE_SIZE)
+/* Upper bound on the ring-write attempts made by one sendmsg call. */
+#define PVCALLS_FRONT_MAX_SPIN 5000
+
+/* Protocol descriptor passed to sk_alloc() for sockets created by accept. */
+static struct proto pvcalls_proto = {
+	.name	= "PVCalls",
+	.owner	= THIS_MODULE,
+	.obj_size = sizeof(struct sock),
+};
+
+/* Per-connection frontend state, stored as the xenbus device's drvdata. */
+struct pvcalls_bedata {
+	/* Command ring plus the grant reference and irq set up for it in probe. */
+	struct xen_pvcalls_front_ring ring;
+	grant_ref_t ref;
+	int irq;
+
+	/* List of the sock_mapping structures belonging to this connection. */
+	struct list_head socket_mappings;
+	/* Held while reserving a request slot and pushing it onto the ring. */
+	spinlock_t socket_lock;
+
+	/* Woken by the event handler once it has consumed responses. */
+	wait_queue_head_t inflight_req;
+	/* Local copies of the responses, indexed by request id. */
+	struct xen_pvcalls_response rsp[PVCALLS_NR_RSP_PER_RING];
+};
+/* Only one front/back connection supported. */
+static struct xenbus_device *pvcalls_front_dev;
+static atomic_t pvcalls_refcount;
+
+/* first increment refcount, then proceed */
+#define pvcalls_enter() {               \
+	atomic_inc(&pvcalls_refcount);      \
+}
+
+/* first complete other operations, then decrement refcount */
+#define pvcalls_exit() {                \
+	atomic_dec(&pvcalls_refcount);      \
+}
+
+/*
+ * Frontend state of one socket. A pointer to this structure is kept in
+ * sock->sk->sk_send_head and is also sent to the backend as the socket id.
+ */
+struct sock_mapping {
+	/* set to true by create_active(), set to false by bind */
+	bool active_socket;
+	/* entry in pvcalls_bedata.socket_mappings */
+	struct list_head list;
+	struct socket *sock;
+	/* number of operations currently using this mapping */
+	atomic_t refcount;
+	union {
+		/* state used by connected and accepted sockets */
+		struct {
+			int irq;
+			/* grant reference of the interface page 'ring' */
+			grant_ref_t ref;
+			struct pvcalls_data_intf *ring;
+			/* in/out halves of the data buffer */
+			struct pvcalls_data data;
+			/* serialise recvmsg and sendmsg respectively */
+			struct mutex in_mutex;
+			struct mutex out_mutex;
+
+			/* woken by the per-socket event channel handler */
+			wait_queue_head_t inflight_conn_req;
+		} active;
+		/* state used by bound/listening sockets */
+		struct {
+		/*
+		 * Socket status, needs to be 64-bit aligned due to the
+		 * test_and_* functions which have this requirement on arm64.
+		 */
+#define PVCALLS_STATUS_UNINITALIZED  0
+#define PVCALLS_STATUS_BIND          1
+#define PVCALLS_STATUS_LISTEN        2
+			uint8_t status __attribute__((aligned(8)));
+		/*
+		 * Internal state-machine flags.
+		 * Only one accept operation can be inflight for a socket.
+		 * Only one poll operation can be inflight for a given socket.
+		 * flags needs to be 64-bit aligned due to the test_and_*
+		 * functions which have this requirement on arm64.
+		 */
+#define PVCALLS_FLAG_ACCEPT_INFLIGHT 0
+#define PVCALLS_FLAG_POLL_INFLIGHT   1
+#define PVCALLS_FLAG_POLL_RET        2
+			uint8_t flags __attribute__((aligned(8)));
+			/* request id of an accept sent in non-blocking mode */
+			uint32_t inflight_req_id;
+			/* mapping prepared for the connection being accepted */
+			struct sock_mapping *accept_map;
+			/* woken when an inflight accept has been completed */
+			wait_queue_head_t inflight_accept_req;
+		} passive;
+	};
+};
+
+/*
+ * Begin an operation on @sock: look up its sock_mapping and take both the
+ * global and the per-mapping reference.
+ *
+ * Returns the mapping, ERR_PTR(-ENOTCONN) when there is no frontend device
+ * (or it has no driver data), or ERR_PTR(-ENOTSOCK) when the socket has no
+ * mapping attached. No reference is taken on the error paths.
+ */
+static inline struct sock_mapping *pvcalls_enter_sock(struct socket *sock)
+{
+	struct sock_mapping *mapping;
+
+	if (pvcalls_front_dev == NULL)
+		return ERR_PTR(-ENOTCONN);
+	if (!dev_get_drvdata(&pvcalls_front_dev->dev))
+		return ERR_PTR(-ENOTCONN);
+
+	mapping = (struct sock_mapping *)sock->sk->sk_send_head;
+	if (!mapping)
+		return ERR_PTR(-ENOTSOCK);
+
+	pvcalls_enter();
+	atomic_inc(&mapping->refcount);
+	return mapping;
+}
+
+/*
+ * End an operation started with pvcalls_enter_sock(): drop the per-mapping
+ * reference first, then the global one.
+ */
+static inline void pvcalls_exit_sock(struct socket *sock)
+{
+	struct sock_mapping *mapping =
+		(struct sock_mapping *)sock->sk->sk_send_head;
+
+	atomic_dec(&mapping->refcount);
+	pvcalls_exit();
+}
+
+/*
+ * Pick the request slot for the next command. *req_id is always set to the
+ * ring index derived from req_prod_pvt.
+ *
+ * Returns 0 when the slot can be used, or -EAGAIN when the ring is full or
+ * the matching response slot is still occupied.
+ */
+static inline int get_request(struct pvcalls_bedata *bedata, int *req_id)
+{
+	int slot = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1);
+
+	*req_id = slot;
+	if (RING_FULL(&bedata->ring))
+		return -EAGAIN;
+	if (bedata->rsp[slot].req_id != PVCALLS_INVALID_ID)
+		return -EAGAIN;
+	return 0;
+}
+
+/*
+ * Report whether a write on @map could make progress: false when out_error
+ * is -ENOTCONN, true for any other error, otherwise true exactly when the
+ * out ring is not completely full.
+ */
+static bool pvcalls_front_write_todo(struct sock_mapping *map)
+{
+	struct pvcalls_data_intf *ring = map->active.ring;
+	RING_IDX ring_size = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER);
+	RING_IDX out_cons, out_prod;
+	int32_t err = ring->out_error;
+
+	if (err == -ENOTCONN)
+		return false;
+	if (err != 0)
+		return true;
+
+	out_cons = ring->out_cons;
+	out_prod = ring->out_prod;
+	/* room is left whenever fewer than ring_size bytes are queued */
+	return pvcalls_queued(out_prod, out_cons, ring_size) != ring_size;
+}
+
+/*
+ * Report whether a read on @map has something to return: true when
+ * in_error is set or when at least one byte is queued in the in ring.
+ */
+static bool pvcalls_front_read_todo(struct sock_mapping *map)
+{
+	struct pvcalls_data_intf *ring = map->active.ring;
+	RING_IDX in_cons = ring->in_cons;
+	RING_IDX in_prod = ring->in_prod;
+	int32_t err = ring->in_error;
+
+	if (err != 0)
+		return true;
+	return pvcalls_queued(in_prod, in_cons,
+			      XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER)) != 0;
+}
+
+/*
+ * Interrupt handler of the command ring event channel.
+ *
+ * Consumes every pending response. A PVCALLS_POLL response only updates the
+ * poll flags of the socket named in the response; any other response is
+ * copied into bedata->rsp[] at the index given by its req_id, where the
+ * requester waiting on inflight_req picks it up. Responses whose req_id
+ * does not fit the array are dropped. Waiters on inflight_req are woken
+ * once at least one response has been consumed.
+ *
+ * Always returns IRQ_HANDLED.
+ */
+static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id)
+{
+	struct xenbus_device *dev = dev_id;
+	struct pvcalls_bedata *bedata;
+	struct xen_pvcalls_response *rsp;
+	uint8_t *src, *dst;
+	uint32_t req_id;
+	int more = 0, done = 0;
+
+	if (dev == NULL)
+		return IRQ_HANDLED;
+
+	pvcalls_enter();
+	bedata = dev_get_drvdata(&dev->dev);
+	if (bedata == NULL) {
+		pvcalls_exit();
+		return IRQ_HANDLED;
+	}
+
+again:
+	while (RING_HAS_UNCONSUMED_RESPONSES(&bedata->ring)) {
+		rsp = RING_GET_RESPONSE(&bedata->ring, bedata->ring.rsp_cons);
+
+		/*
+		 * The response lives in memory shared with the backend:
+		 * read req_id exactly once so the value that is checked is
+		 * the value that is used as an index below.
+		 */
+		req_id = READ_ONCE(rsp->req_id);
+		if (rsp->cmd == PVCALLS_POLL) {
+			struct sock_mapping *map = (struct sock_mapping *)(uintptr_t)
+						   rsp->u.poll.id;
+
+			clear_bit(PVCALLS_FLAG_POLL_INFLIGHT,
+				  (void *)&map->passive.flags);
+			/*
+			 * clear INFLIGHT, then set RET. It pairs with
+			 * the checks at the beginning of
+			 * pvcalls_front_poll_passive.
+			 */
+			smp_wmb();
+			set_bit(PVCALLS_FLAG_POLL_RET,
+				(void *)&map->passive.flags);
+		} else if (req_id < PVCALLS_NR_RSP_PER_RING) {
+			dst = (uint8_t *)&bedata->rsp[req_id] +
+			      sizeof(rsp->req_id);
+			src = (uint8_t *)rsp + sizeof(rsp->req_id);
+			memcpy(dst, src, sizeof(*rsp) - sizeof(rsp->req_id));
+			/*
+			 * First copy the rest of the data, then req_id. It is
+			 * paired with the barrier when accessing bedata->rsp.
+			 */
+			smp_wmb();
+			bedata->rsp[req_id].req_id = req_id;
+		} else {
+			/* never index rsp[] with an id the array cannot hold */
+			pr_warn_ratelimited("pvcalls-front: ignoring response with invalid req_id %u\n",
+					    req_id);
+		}
+
+		done = 1;
+		bedata->ring.rsp_cons++;
+	}
+
+	RING_FINAL_CHECK_FOR_RESPONSES(&bedata->ring, more);
+	if (more)
+		goto again;
+	if (done)
+		wake_up(&bedata->inflight_req);
+	pvcalls_exit();
+	return IRQ_HANDLED;
+}
+
+/*
+ * Tear down an active socket mapping: unbind its event channel, unlink it
+ * from the connection's list, end the grants on the data buffer and on the
+ * interface page, free both allocations and finally the mapping itself.
+ *
+ * @map must have gone through alloc_active_ring() and create_active().
+ */
+static void pvcalls_front_free_map(struct pvcalls_bedata *bedata,
+				   struct sock_mapping *map)
+{
+	int i;
+
+	unbind_from_irqhandler(map->active.irq, map);
+
+	spin_lock(&bedata->socket_lock);
+	if (!list_empty(&map->list))
+		list_del_init(&map->list);
+	spin_unlock(&bedata->socket_lock);
+
+	for (i = 0; i < (1 << PVCALLS_RING_ORDER); i++)
+		gnttab_end_foreign_access(map->active.ring->ref[i], 0, 0);
+	gnttab_end_foreign_access(map->active.ref, 0, 0);
+	/*
+	 * The data buffer is a separate allocation from the interface page
+	 * (see alloc_active_ring()); release it as well, with the size that
+	 * was used to allocate it, so it is not leaked.
+	 */
+	if (map->active.data.in)
+		free_pages_exact(map->active.data.in,
+				 PAGE_SIZE << PVCALLS_RING_ORDER);
+	free_page((unsigned long)map->active.ring);
+
+	kfree(map);
+}
+
+/*
+ * Interrupt handler of a per-socket event channel: wake whoever sleeps on
+ * the mapping's inflight_conn_req queue. Always returns IRQ_HANDLED.
+ */
+static irqreturn_t pvcalls_front_conn_handler(int irq, void *sock_map)
+{
+	struct sock_mapping *mapping = sock_map;
+
+	if (mapping != NULL)
+		wake_up_interruptible(&mapping->active.inflight_conn_req);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Create the frontend state for a new socket and ask the backend to create
+ * the matching AF_INET/SOCK_STREAM socket.
+ *
+ * Returns the result reported by the backend, or -EOPNOTSUPP for any other
+ * socket type, -EACCES when there is no frontend device, -ENOMEM when the
+ * mapping cannot be allocated, or -EAGAIN when no request slot is free.
+ */
+int pvcalls_front_socket(struct socket *sock)
+{
+	struct pvcalls_bedata *bedata;
+	struct sock_mapping *map = NULL;
+	struct xen_pvcalls_request *req;
+	int notify, req_id, ret;
+
+	/*
+	 * PVCalls only supports domain AF_INET,
+	 * type SOCK_STREAM and protocol 0 sockets for now.
+	 *
+	 * Check socket type here, AF_INET and protocol checks are done
+	 * by the caller.
+	 */
+	if (sock->type != SOCK_STREAM)
+		return -EOPNOTSUPP;
+
+	pvcalls_enter();
+	if (!pvcalls_front_dev) {
+		pvcalls_exit();
+		return -EACCES;
+	}
+	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (map == NULL) {
+		pvcalls_exit();
+		return -ENOMEM;
+	}
+
+	spin_lock(&bedata->socket_lock);
+
+	ret = get_request(bedata, &req_id);
+	if (ret < 0) {
+		kfree(map);
+		spin_unlock(&bedata->socket_lock);
+		pvcalls_exit();
+		return ret;
+	}
+
+	/*
+	 * sock->sk->sk_send_head is not used for ip sockets: reuse the
+	 * field to store a pointer to the struct sock_mapping
+	 * corresponding to the socket. This way, we can easily get the
+	 * struct sock_mapping from the struct socket.
+	 */
+	sock->sk->sk_send_head = (void *)map;
+	/*
+	 * Record the owning socket before the mapping becomes reachable
+	 * through the list: pvcalls_front_remove() follows map->sock for
+	 * every list entry, including sockets never connected or bound.
+	 */
+	map->sock = sock;
+	list_add_tail(&map->list, &bedata->socket_mappings);
+
+	req = RING_GET_REQUEST(&bedata->ring, req_id);
+	req->req_id = req_id;
+	req->cmd = PVCALLS_SOCKET;
+	req->u.socket.id = (uintptr_t) map;
+	req->u.socket.domain = AF_INET;
+	req->u.socket.type = SOCK_STREAM;
+	req->u.socket.protocol = IPPROTO_IP;
+
+	bedata->ring.req_prod_pvt++;
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
+	spin_unlock(&bedata->socket_lock);
+	if (notify)
+		notify_remote_via_irq(bedata->irq);
+
+	wait_event(bedata->inflight_req,
+		   READ_ONCE(bedata->rsp[req_id].req_id) == req_id);
+
+	/* read req_id, then the content */
+	smp_rmb();
+	ret = bedata->rsp[req_id].ret;
+	bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID;
+
+	pvcalls_exit();
+	return ret;
+}
+
+/*
+ * Release what alloc_active_ring() allocated for @map: the data buffer (if
+ * it was allocated) and the interface page. Does nothing when the
+ * interface page is missing.
+ */
+static void free_active_ring(struct sock_mapping *map)
+{
+	if (!map->active.ring)
+		return;
+
+	/*
+	 * alloc_active_ring() calls us with the interface page present but
+	 * no data buffer when the second allocation fails, so the buffer
+	 * pointer must be checked before it is freed. The size is the
+	 * constant used for the allocation rather than the ring_order field
+	 * stored in the interface page, which is granted to the backend.
+	 */
+	if (map->active.data.in)
+		free_pages_exact(map->active.data.in,
+				 PAGE_SIZE << PVCALLS_RING_ORDER);
+	free_page((unsigned long)map->active.ring);
+}
+
+/*
+ * Allocate the zeroed interface page and the zeroed data buffer of an
+ * active socket, and point data.in/data.out at the two halves of the
+ * buffer.
+ *
+ * Returns 0 on success. On allocation failure, free_active_ring() is
+ * called and -ENOMEM is returned.
+ */
+static int alloc_active_ring(struct sock_mapping *map)
+{
+	struct pvcalls_data_intf *intf;
+	void *buf;
+
+	intf = (struct pvcalls_data_intf *)get_zeroed_page(GFP_KERNEL);
+	map->active.ring = intf;
+	if (intf == NULL)
+		goto fail;
+
+	intf->ring_order = PVCALLS_RING_ORDER;
+	buf = alloc_pages_exact(PAGE_SIZE << PVCALLS_RING_ORDER,
+				GFP_KERNEL | __GFP_ZERO);
+	if (buf == NULL)
+		goto fail;
+
+	/* the "out" half starts one flex-ring size after the "in" half */
+	map->active.data.in = buf;
+	map->active.data.out = buf + XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER);
+	return 0;
+
+fail:
+	free_active_ring(map);
+	return -ENOMEM;
+}
+
+/*
+ * Turn @map into an active socket mapping: grant the backend access to the
+ * data buffer pages and to the interface page, allocate an event channel
+ * and bind it to pvcalls_front_conn_handler().
+ *
+ * The buffers are expected to have been set up by alloc_active_ring().
+ * On success the event channel is stored in *evtchn and 0 is returned. On
+ * failure an already allocated event channel is freed and a negative error
+ * is returned.
+ *
+ * NOTE(review): the return values of gnttab_grant_foreign_access() are not
+ * checked here, and the grants are not ended on the out_error path -
+ * confirm whether that is intended.
+ */
+static int create_active(struct sock_mapping *map, int *evtchn)
+{
+	void *bytes;
+	int ret = -ENOMEM, irq = -1, i;
+
+	*evtchn = -1;
+	init_waitqueue_head(&map->active.inflight_conn_req);
+
+	/* one grant per page of the data buffer, recorded in the interface page */
+	bytes = map->active.data.in;
+	for (i = 0; i < (1 << PVCALLS_RING_ORDER); i++)
+		map->active.ring->ref[i] = gnttab_grant_foreign_access(
+			pvcalls_front_dev->otherend_id,
+			pfn_to_gfn(virt_to_pfn(bytes) + i), 0);
+
+	/* grant for the interface page itself */
+	map->active.ref = gnttab_grant_foreign_access(
+		pvcalls_front_dev->otherend_id,
+		pfn_to_gfn(virt_to_pfn((void *)map->active.ring)), 0);
+
+	ret = xenbus_alloc_evtchn(pvcalls_front_dev, evtchn);
+	if (ret)
+		goto out_error;
+	irq = bind_evtchn_to_irqhandler(*evtchn, pvcalls_front_conn_handler,
+					0, "pvcalls-frontend", map);
+	if (irq < 0) {
+		ret = irq;
+		goto out_error;
+	}
+
+	map->active.irq = irq;
+	map->active_socket = true;
+	mutex_init(&map->active.in_mutex);
+	mutex_init(&map->active.out_mutex);
+
+	return 0;
+
+out_error:
+	if (*evtchn >= 0)
+		xenbus_free_evtchn(pvcalls_front_dev, *evtchn);
+	return ret;
+}
+
+/*
+ * Connect @sock to @addr through the backend.
+ *
+ * Allocates the data ring of the socket, makes it an active mapping, sends
+ * a PVCALLS_CONNECT request and waits (uninterruptibly) for the response.
+ *
+ * Returns the result reported by the backend, -EOPNOTSUPP for anything but
+ * an AF_INET address on a SOCK_STREAM socket, the error from
+ * pvcalls_enter_sock(), or the error of the failing setup step.
+ */
+int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr,
+				int addr_len, int flags)
+{
+	struct pvcalls_bedata *bedata;
+	struct sock_mapping *map = NULL;
+	struct xen_pvcalls_request *req;
+	int notify, req_id, ret, evtchn;
+
+	if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM)
+		return -EOPNOTSUPP;
+
+	map = pvcalls_enter_sock(sock);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+	/* allocate before taking socket_lock */
+	ret = alloc_active_ring(map);
+	if (ret < 0) {
+		pvcalls_exit_sock(sock);
+		return ret;
+	}
+
+	spin_lock(&bedata->socket_lock);
+	ret = get_request(bedata, &req_id);
+	if (ret < 0) {
+		spin_unlock(&bedata->socket_lock);
+		free_active_ring(map);
+		pvcalls_exit_sock(sock);
+		return ret;
+	}
+	ret = create_active(map, &evtchn);
+	if (ret < 0) {
+		spin_unlock(&bedata->socket_lock);
+		free_active_ring(map);
+		pvcalls_exit_sock(sock);
+		return ret;
+	}
+
+	req = RING_GET_REQUEST(&bedata->ring, req_id);
+	req->req_id = req_id;
+	req->cmd = PVCALLS_CONNECT;
+	req->u.connect.id = (uintptr_t)map;
+	req->u.connect.len = addr_len;
+	req->u.connect.flags = flags;
+	req->u.connect.ref = map->active.ref;
+	req->u.connect.evtchn = evtchn;
+	memcpy(req->u.connect.addr, addr, sizeof(*addr));
+
+	map->sock = sock;
+
+	bedata->ring.req_prod_pvt++;
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
+	spin_unlock(&bedata->socket_lock);
+
+	if (notify)
+		notify_remote_via_irq(bedata->irq);
+
+	/* the event handler publishes the response by writing rsp[].req_id */
+	wait_event(bedata->inflight_req,
+		   READ_ONCE(bedata->rsp[req_id].req_id) == req_id);
+
+	/* read req_id, then the content */
+	smp_rmb();
+	ret = bedata->rsp[req_id].ret;
+	/* mark the response slot as free again */
+	bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID;
+	pvcalls_exit_sock(sock);
+	return ret;
+}
+
+/*
+ * Copy up to @len bytes from @msg_iter into the "out" data ring and advance
+ * out_prod by the number of bytes copied.
+ *
+ * Returns the number of bytes copied (0 when the ring is full), the
+ * negative out_error found in @intf, or -EINVAL when the ring indexes
+ * claim more queued data than the ring can hold.
+ */
+static int __write_ring(struct pvcalls_data_intf *intf,
+			struct pvcalls_data *data,
+			struct iov_iter *msg_iter,
+			int len)
+{
+	RING_IDX cons, prod, size, masked_prod, masked_cons;
+	RING_IDX array_size = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER);
+	int32_t error;
+
+	error = intf->out_error;
+	if (error < 0)
+		return error;
+	cons = intf->out_cons;
+	prod = intf->out_prod;
+	/* read indexes before continuing */
+	virt_mb();
+
+	size = pvcalls_queued(prod, cons, array_size);
+	if (size > array_size)
+		return -EINVAL;
+	if (size == array_size)
+		return 0;
+	/* never copy more than the free space */
+	if (len > array_size - size)
+		len = array_size - size;
+
+	masked_prod = pvcalls_mask(prod, array_size);
+	masked_cons = pvcalls_mask(cons, array_size);
+
+	if (masked_prod < masked_cons) {
+		/* free space is one contiguous region */
+		len = copy_from_iter(data->out + masked_prod, len, msg_iter);
+	} else {
+		if (len > array_size - masked_prod) {
+			/* wraps: fill up to the end, then continue at the start */
+			int ret = copy_from_iter(data->out + masked_prod,
+				       array_size - masked_prod, msg_iter);
+			if (ret != array_size - masked_prod) {
+				len = ret;
+				goto out;
+			}
+			len = ret + copy_from_iter(data->out, len - ret, msg_iter);
+		} else {
+			len = copy_from_iter(data->out + masked_prod, len, msg_iter);
+		}
+	}
+out:
+	/* write to ring before updating pointer */
+	virt_wmb();
+	intf->out_prod += len;
+
+	return len;
+}
+
+/*
+ * Send the data described by @msg on an active socket.
+ *
+ * The data is written to the out ring in as many steps as needed, up to
+ * PVCALLS_FRONT_MAX_SPIN attempts; the backend is notified after every
+ * step that wrote something.
+ *
+ * Returns the number of bytes written, the negative error of the last
+ * write attempt, -EAGAIN when MSG_DONTWAIT is set and nothing can be
+ * written, -EOPNOTSUPP for unsupported flags, or the error from
+ * pvcalls_enter_sock().
+ */
+int pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg,
+			  size_t len)
+{
+	struct sock_mapping *map;
+	int sent, tot_sent = 0;
+	int count = 0, flags;
+
+	flags = msg->msg_flags;
+	if (flags & (MSG_CONFIRM|MSG_DONTROUTE|MSG_EOR|MSG_OOB))
+		return -EOPNOTSUPP;
+
+	map = pvcalls_enter_sock(sock);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	mutex_lock(&map->active.out_mutex);
+	if ((flags & MSG_DONTWAIT) && !pvcalls_front_write_todo(map)) {
+		mutex_unlock(&map->active.out_mutex);
+		pvcalls_exit_sock(sock);
+		return -EAGAIN;
+	}
+	/* __write_ring() takes the length as an int */
+	if (len > INT_MAX)
+		len = INT_MAX;
+
+again:
+	count++;
+	sent = __write_ring(map->active.ring,
+			    &map->active.data, &msg->msg_iter,
+			    len);
+	if (sent > 0) {
+		len -= sent;
+		tot_sent += sent;
+		notify_remote_via_irq(map->active.irq);
+	}
+	/* retry while data is left, no error occurred and the budget lasts */
+	if (sent >= 0 && len > 0 && count < PVCALLS_FRONT_MAX_SPIN)
+		goto again;
+	if (sent < 0)
+		tot_sent = sent;
+
+	mutex_unlock(&map->active.out_mutex);
+	pvcalls_exit_sock(sock);
+	return tot_sent;
+}
+
+/*
+ * Copy up to @len bytes from the "in" data ring to @msg_iter. Unless
+ * MSG_PEEK is set in @flags, in_cons is advanced by the number of bytes
+ * copied.
+ *
+ * Returns the number of bytes copied. When the ring is empty, returns
+ * in_error if it is set and 0 otherwise.
+ */
+static int __read_ring(struct pvcalls_data_intf *intf,
+		       struct pvcalls_data *data,
+		       struct iov_iter *msg_iter,
+		       size_t len, int flags)
+{
+	RING_IDX cons, prod, size, masked_prod, masked_cons;
+	RING_IDX array_size = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER);
+	int32_t error;
+
+	cons = intf->in_cons;
+	prod = intf->in_prod;
+	error = intf->in_error;
+	/* get pointers before reading from the ring */
+	virt_rmb();
+
+	size = pvcalls_queued(prod, cons, array_size);
+	masked_prod = pvcalls_mask(prod, array_size);
+	masked_cons = pvcalls_mask(cons, array_size);
+
+	if (size == 0)
+		return error ?: size;
+
+	/* never copy more than what is queued */
+	if (len > size)
+		len = size;
+
+	if (masked_prod > masked_cons) {
+		/* queued data is one contiguous region */
+		len = copy_to_iter(data->in + masked_cons, len, msg_iter);
+	} else {
+		if (len > (array_size - masked_cons)) {
+			/* wraps: read up to the end, then continue at the start */
+			int ret = copy_to_iter(data->in + masked_cons,
+				     array_size - masked_cons, msg_iter);
+			if (ret != array_size - masked_cons) {
+				len = ret;
+				goto out;
+			}
+			len = ret + copy_to_iter(data->in, len - ret, msg_iter);
+		} else {
+			len = copy_to_iter(data->in + masked_cons, len, msg_iter);
+		}
+	}
+out:
+	/* read data from the ring before increasing the index */
+	virt_mb();
+	if (!(flags & MSG_PEEK))
+		intf->in_cons += len;
+
+	return len;
+}
+
+/*
+ * Receive data from an active socket into @msg.
+ *
+ * Unless MSG_DONTWAIT is set, sleeps until data or an error is available
+ * on the in ring. At most one ring's worth of data is returned per call.
+ *
+ * Returns the number of bytes read, 0 when nothing was read on a blocking
+ * call or when the ring reports -ENOTCONN, -EAGAIN when MSG_DONTWAIT is
+ * set and nothing is queued, -EINTR when the wait was interrupted by a
+ * signal, -EOPNOTSUPP for unsupported flags, another negative ring error,
+ * or the error from pvcalls_enter_sock().
+ */
+int pvcalls_front_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+		     int flags)
+{
+	int ret;
+	struct sock_mapping *map;
+
+	if (flags & (MSG_CMSG_CLOEXEC|MSG_ERRQUEUE|MSG_OOB|MSG_TRUNC))
+		return -EOPNOTSUPP;
+
+	map = pvcalls_enter_sock(sock);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	mutex_lock(&map->active.in_mutex);
+	if (len > XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER))
+		len = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER);
+
+	while (!(flags & MSG_DONTWAIT) && !pvcalls_front_read_todo(map)) {
+		/*
+		 * A non-zero return means a signal is pending. Looping again
+		 * would return immediately every time and spin with in_mutex
+		 * held, so give up and report the interruption instead.
+		 */
+		if (wait_event_interruptible(map->active.inflight_conn_req,
+					     pvcalls_front_read_todo(map))) {
+			ret = -EINTR;
+			goto out;
+		}
+	}
+	ret = __read_ring(map->active.ring, &map->active.data,
+			  &msg->msg_iter, len, flags);
+
+	/* tell the backend that ring space has been freed */
+	if (ret > 0)
+		notify_remote_via_irq(map->active.irq);
+	if (ret == 0)
+		ret = (flags & MSG_DONTWAIT) ? -EAGAIN : 0;
+	if (ret == -ENOTCONN)
+		ret = 0;
+
+out:
+	mutex_unlock(&map->active.in_mutex);
+	pvcalls_exit_sock(sock);
+	return ret;
+}
+
+/*
+ * Bind @sock to @addr through the backend and turn its mapping into a
+ * passive one.
+ *
+ * Returns 0 on success, the negative result reported by the backend,
+ * -EOPNOTSUPP for anything but an AF_INET address on a SOCK_STREAM socket,
+ * -EAGAIN when no request slot is free, or the error from
+ * pvcalls_enter_sock().
+ */
+int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+{
+	struct pvcalls_bedata *bedata;
+	struct sock_mapping *map = NULL;
+	struct xen_pvcalls_request *req;
+	int notify, req_id, ret;
+
+	if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM)
+		return -EOPNOTSUPP;
+
+	map = pvcalls_enter_sock(sock);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+
+	spin_lock(&bedata->socket_lock);
+	ret = get_request(bedata, &req_id);
+	if (ret < 0) {
+		spin_unlock(&bedata->socket_lock);
+		pvcalls_exit_sock(sock);
+		return ret;
+	}
+	req = RING_GET_REQUEST(&bedata->ring, req_id);
+	req->req_id = req_id;
+	map->sock = sock;
+	req->cmd = PVCALLS_BIND;
+	req->u.bind.id = (uintptr_t)map;
+	memcpy(req->u.bind.addr, addr, sizeof(*addr));
+	req->u.bind.len = addr_len;
+
+	init_waitqueue_head(&map->passive.inflight_accept_req);
+
+	map->active_socket = false;
+
+	bedata->ring.req_prod_pvt++;
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
+	spin_unlock(&bedata->socket_lock);
+	if (notify)
+		notify_remote_via_irq(bedata->irq);
+
+	wait_event(bedata->inflight_req,
+		   READ_ONCE(bedata->rsp[req_id].req_id) == req_id);
+
+	/* read req_id, then the content */
+	smp_rmb();
+	ret = bedata->rsp[req_id].ret;
+	bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID;
+
+	/*
+	 * Propagate a backend failure instead of discarding it: the socket
+	 * must not be reported, nor recorded, as bound when the backend
+	 * could not bind it.
+	 */
+	if (ret < 0) {
+		pvcalls_exit_sock(sock);
+		return ret;
+	}
+
+	map->passive.status = PVCALLS_STATUS_BIND;
+	pvcalls_exit_sock(sock);
+	return 0;
+}
+
+/*
+ * Put a bound socket into the listening state through the backend.
+ *
+ * Returns the result reported by the backend, -EOPNOTSUPP when the socket
+ * is not in the bound state, -EAGAIN when no request slot is free, or the
+ * error from pvcalls_enter_sock().
+ */
+int pvcalls_front_listen(struct socket *sock, int backlog)
+{
+	struct pvcalls_bedata *bedata;
+	struct sock_mapping *map;
+	struct xen_pvcalls_request *req;
+	int notify, req_id, ret;
+
+	map = pvcalls_enter_sock(sock);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+
+	if (map->passive.status != PVCALLS_STATUS_BIND) {
+		pvcalls_exit_sock(sock);
+		return -EOPNOTSUPP;
+	}
+
+	spin_lock(&bedata->socket_lock);
+	ret = get_request(bedata, &req_id);
+	if (ret < 0) {
+		spin_unlock(&bedata->socket_lock);
+		pvcalls_exit_sock(sock);
+		return ret;
+	}
+	req = RING_GET_REQUEST(&bedata->ring, req_id);
+	req->req_id = req_id;
+	req->cmd = PVCALLS_LISTEN;
+	req->u.listen.id = (uintptr_t) map;
+	req->u.listen.backlog = backlog;
+
+	bedata->ring.req_prod_pvt++;
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
+	spin_unlock(&bedata->socket_lock);
+	if (notify)
+		notify_remote_via_irq(bedata->irq);
+
+	wait_event(bedata->inflight_req,
+		   READ_ONCE(bedata->rsp[req_id].req_id) == req_id);
+
+	/* read req_id, then the content */
+	smp_rmb();
+	ret = bedata->rsp[req_id].ret;
+	bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID;
+
+	/*
+	 * Only record the listening state when the backend accepted the
+	 * request; after a failure the socket stays bound so that listen
+	 * can be retried and accept keeps being refused.
+	 */
+	if (ret >= 0)
+		map->passive.status = PVCALLS_STATUS_LISTEN;
+	pvcalls_exit_sock(sock);
+	return ret;
+}
+
+/*
+ * Accept a connection on the listening socket @sock and attach it to
+ * @newsock.
+ *
+ * Only one accept request per listening socket is in flight at a time,
+ * tracked with PVCALLS_FLAG_ACCEPT_INFLIGHT. With SOCK_NONBLOCK set in
+ * @flags the request is sent, its id is remembered in inflight_req_id and
+ * -EAGAIN is returned; a later call that finds the response present jumps
+ * straight to the completion code at "received".
+ *
+ * Returns the result reported by the backend, -EINVAL when @sock is not
+ * listening, -EAGAIN (non-blocking, or no request slot free), -EINTR when
+ * a wait was interrupted, -ENOMEM, or the error from pvcalls_enter_sock()
+ * or from setting up the new mapping.
+ */
+int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	struct pvcalls_bedata *bedata;
+	struct sock_mapping *map;
+	struct sock_mapping *map2 = NULL;
+	struct xen_pvcalls_request *req;
+	int notify, req_id, ret, evtchn, nonblock;
+
+	map = pvcalls_enter_sock(sock);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+
+	if (map->passive.status != PVCALLS_STATUS_LISTEN) {
+		pvcalls_exit_sock(sock);
+		return -EINVAL;
+	}
+
+	nonblock = flags & SOCK_NONBLOCK;
+	/*
+	 * Backend only supports 1 inflight accept request, will return
+	 * errors for the others
+	 */
+	if (test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+			     (void *)&map->passive.flags)) {
+		/* an accept is already in flight: has its response arrived? */
+		req_id = READ_ONCE(map->passive.inflight_req_id);
+		if (req_id != PVCALLS_INVALID_ID &&
+		    READ_ONCE(bedata->rsp[req_id].req_id) == req_id) {
+			map2 = map->passive.accept_map;
+			goto received;
+		}
+		if (nonblock) {
+			pvcalls_exit_sock(sock);
+			return -EAGAIN;
+		}
+		/* sleep until we manage to take the INFLIGHT flag ourselves */
+		if (wait_event_interruptible(map->passive.inflight_accept_req,
+			!test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+					  (void *)&map->passive.flags))) {
+			pvcalls_exit_sock(sock);
+			return -EINTR;
+		}
+	}
+
+	/* map2 becomes the mapping of the accepted connection */
+	map2 = kzalloc(sizeof(*map2), GFP_KERNEL);
+	if (map2 == NULL) {
+		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+			  (void *)&map->passive.flags);
+		pvcalls_exit_sock(sock);
+		return -ENOMEM;
+	}
+	ret = alloc_active_ring(map2);
+	if (ret < 0) {
+		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+				(void *)&map->passive.flags);
+		kfree(map2);
+		pvcalls_exit_sock(sock);
+		return ret;
+	}
+	spin_lock(&bedata->socket_lock);
+	ret = get_request(bedata, &req_id);
+	if (ret < 0) {
+		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+			  (void *)&map->passive.flags);
+		spin_unlock(&bedata->socket_lock);
+		free_active_ring(map2);
+		kfree(map2);
+		pvcalls_exit_sock(sock);
+		return ret;
+	}
+
+	ret = create_active(map2, &evtchn);
+	if (ret < 0) {
+		free_active_ring(map2);
+		kfree(map2);
+		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+			  (void *)&map->passive.flags);
+		spin_unlock(&bedata->socket_lock);
+		pvcalls_exit_sock(sock);
+		return ret;
+	}
+	list_add_tail(&map2->list, &bedata->socket_mappings);
+
+	req = RING_GET_REQUEST(&bedata->ring, req_id);
+	req->req_id = req_id;
+	req->cmd = PVCALLS_ACCEPT;
+	req->u.accept.id = (uintptr_t) map;
+	req->u.accept.ref = map2->active.ref;
+	req->u.accept.id_new = (uintptr_t) map2;
+	req->u.accept.evtchn = evtchn;
+	/* remembered so that a later call can complete this accept */
+	map->passive.accept_map = map2;
+
+	bedata->ring.req_prod_pvt++;
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
+	spin_unlock(&bedata->socket_lock);
+	if (notify)
+		notify_remote_via_irq(bedata->irq);
+	/* We could check if we have received a response before returning. */
+	if (nonblock) {
+		WRITE_ONCE(map->passive.inflight_req_id, req_id);
+		pvcalls_exit_sock(sock);
+		return -EAGAIN;
+	}
+
+	/*
+	 * NOTE(review): on interruption the request stays outstanding while
+	 * ACCEPT_INFLIGHT remains set and inflight_req_id is not recorded -
+	 * confirm how this accept is expected to be completed afterwards.
+	 */
+	if (wait_event_interruptible(bedata->inflight_req,
+		READ_ONCE(bedata->rsp[req_id].req_id) == req_id)) {
+		pvcalls_exit_sock(sock);
+		return -EINTR;
+	}
+	/* read req_id, then the content */
+	smp_rmb();
+
+received:
+	map2->sock = newsock;
+	newsock->sk = sk_alloc(sock_net(sock->sk), PF_INET, GFP_KERNEL, &pvcalls_proto, false);
+	if (!newsock->sk) {
+		/* release the response slot, the accept state and the new mapping */
+		bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID;
+		map->passive.inflight_req_id = PVCALLS_INVALID_ID;
+		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+			  (void *)&map->passive.flags);
+		pvcalls_front_free_map(bedata, map2);
+		pvcalls_exit_sock(sock);
+		return -ENOMEM;
+	}
+	newsock->sk->sk_send_head = (void *)map2;
+
+	ret = bedata->rsp[req_id].ret;
+	bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID;
+	map->passive.inflight_req_id = PVCALLS_INVALID_ID;
+
+	/* let the next accept caller proceed */
+	clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags);
+	wake_up(&map->passive.inflight_accept_req);
+
+	pvcalls_exit_sock(sock);
+	return ret;
+}
+
+/*
+ * Poll a passive (bound/listening) socket.
+ *
+ * While an accept is in flight, readiness is derived from whether its
+ * response has arrived. Otherwise a PVCALLS_POLL request is sent to the
+ * backend (at most one at a time per socket) and readiness is reported
+ * once the event handler has set PVCALLS_FLAG_POLL_RET.
+ *
+ * Returns EPOLLIN | EPOLLRDNORM when a connection can be accepted, 0 when
+ * the caller has to wait, or the value of get_request() when no request
+ * slot is available.
+ */
+static __poll_t pvcalls_front_poll_passive(struct file *file,
+					       struct pvcalls_bedata *bedata,
+					       struct sock_mapping *map,
+					       poll_table *wait)
+{
+	int notify, req_id, ret;
+	struct xen_pvcalls_request *req;
+
+	if (test_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
+		     (void *)&map->passive.flags)) {
+		uint32_t req_id = READ_ONCE(map->passive.inflight_req_id);
+
+		if (req_id != PVCALLS_INVALID_ID &&
+		    READ_ONCE(bedata->rsp[req_id].req_id) == req_id)
+			return EPOLLIN | EPOLLRDNORM;
+
+		poll_wait(file, &map->passive.inflight_accept_req, wait);
+		return 0;
+	}
+
+	if (test_and_clear_bit(PVCALLS_FLAG_POLL_RET,
+			       (void *)&map->passive.flags))
+		return EPOLLIN | EPOLLRDNORM;
+
+	/*
+	 * First check RET, then INFLIGHT. No barriers necessary to
+	 * ensure execution ordering because of the conditional
+	 * instructions creating control dependencies.
+	 */
+
+	if (test_and_set_bit(PVCALLS_FLAG_POLL_INFLIGHT,
+			     (void *)&map->passive.flags)) {
+		poll_wait(file, &bedata->inflight_req, wait);
+		return 0;
+	}
+
+	spin_lock(&bedata->socket_lock);
+	ret = get_request(bedata, &req_id);
+	if (ret < 0) {
+		spin_unlock(&bedata->socket_lock);
+		/*
+		 * No request was sent, so no response will ever clear the
+		 * INFLIGHT flag taken above. Drop it here, otherwise every
+		 * later poll would wait for a response that cannot arrive.
+		 */
+		clear_bit(PVCALLS_FLAG_POLL_INFLIGHT,
+			  (void *)&map->passive.flags);
+		return ret;
+	}
+	req = RING_GET_REQUEST(&bedata->ring, req_id);
+	req->req_id = req_id;
+	req->cmd = PVCALLS_POLL;
+	req->u.poll.id = (uintptr_t) map;
+
+	bedata->ring.req_prod_pvt++;
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
+	spin_unlock(&bedata->socket_lock);
+	if (notify)
+		notify_remote_via_irq(bedata->irq);
+
+	poll_wait(file, &bedata->inflight_req, wait);
+	return 0;
+}
+
+/*
+ * Poll an active socket: register on the socket's wait queue and report
+ * writability, readability and error state of its data rings.
+ */
+static __poll_t pvcalls_front_poll_active(struct file *file,
+					      struct pvcalls_bedata *bedata,
+					      struct sock_mapping *map,
+					      poll_table *wait)
+{
+	struct pvcalls_data_intf *ring = map->active.ring;
+	/* sample both error fields before registering on the wait queue */
+	int32_t out_err = ring->out_error;
+	int32_t in_err = ring->in_error;
+	__poll_t events = 0;
+
+	poll_wait(file, &map->active.inflight_conn_req, wait);
+
+	if (pvcalls_front_write_todo(map))
+		events |= EPOLLOUT | EPOLLWRNORM;
+	if (pvcalls_front_read_todo(map))
+		events |= EPOLLIN | EPOLLRDNORM;
+	if (in_err || out_err)
+		events |= EPOLLERR;
+
+	return events;
+}
+
+/*
+ * Poll entry point: dispatch to the active or the passive implementation
+ * depending on the kind of mapping attached to @sock.
+ *
+ * Returns EPOLLNVAL when the socket cannot be entered, otherwise the mask
+ * computed by the selected implementation.
+ */
+__poll_t pvcalls_front_poll(struct file *file, struct socket *sock,
+			       poll_table *wait)
+{
+	struct sock_mapping *map = pvcalls_enter_sock(sock);
+	struct pvcalls_bedata *bedata;
+	__poll_t mask;
+
+	if (IS_ERR(map))
+		return EPOLLNVAL;
+
+	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+	mask = map->active_socket ?
+		pvcalls_front_poll_active(file, bedata, map, wait) :
+		pvcalls_front_poll_passive(file, bedata, map, wait);
+
+	pvcalls_exit_sock(sock);
+	return mask;
+}
+
+/*
+ * Release @sock: detach its mapping, send PVCALLS_RELEASE to the backend,
+ * wait for the response and free the mapping once no other operation is
+ * using it.
+ *
+ * Returns 0 on success or when the socket has nothing to release, -EIO
+ * when pvcalls_enter_sock() reports -ENOTCONN, or -EAGAIN when no request
+ * slot is free.
+ */
+int pvcalls_front_release(struct socket *sock)
+{
+	struct pvcalls_bedata *bedata;
+	struct sock_mapping *map;
+	int req_id, notify, ret;
+	struct xen_pvcalls_request *req;
+
+	if (sock->sk == NULL)
+		return 0;
+
+	map = pvcalls_enter_sock(sock);
+	if (IS_ERR(map)) {
+		if (PTR_ERR(map) == -ENOTCONN)
+			return -EIO;
+		else
+			return 0;
+	}
+	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+
+	spin_lock(&bedata->socket_lock);
+	ret = get_request(bedata, &req_id);
+	if (ret < 0) {
+		spin_unlock(&bedata->socket_lock);
+		pvcalls_exit_sock(sock);
+		return ret;
+	}
+	/* from here on new operations on the socket fail in pvcalls_enter_sock() */
+	sock->sk->sk_send_head = NULL;
+
+	req = RING_GET_REQUEST(&bedata->ring, req_id);
+	req->req_id = req_id;
+	req->cmd = PVCALLS_RELEASE;
+	req->u.release.id = (uintptr_t)map;
+
+	bedata->ring.req_prod_pvt++;
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify);
+	spin_unlock(&bedata->socket_lock);
+	if (notify)
+		notify_remote_via_irq(bedata->irq);
+
+	wait_event(bedata->inflight_req,
+		   READ_ONCE(bedata->rsp[req_id].req_id) == req_id);
+
+	if (map->active_socket) {
+		/*
+		 * Set in_error and wake up inflight_conn_req to force
+		 * recvmsg waiters to exit.
+		 */
+		map->active.ring->in_error = -EBADF;
+		wake_up_interruptible(&map->active.inflight_conn_req);
+
+		/*
+		 * We need to make sure that sendmsg/recvmsg on this socket have
+		 * not started before we've cleared sk_send_head here. The
+		 * easiest way to guarantee this is to see that no pvcalls
+		 * (other than us) is in progress on this socket.
+		 */
+		while (atomic_read(&map->refcount) > 1)
+			cpu_relax();
+
+		pvcalls_front_free_map(bedata, map);
+	} else {
+		wake_up(&bedata->inflight_req);
+		wake_up(&map->passive.inflight_accept_req);
+
+		/* wait until we hold the only reference to the mapping */
+		while (atomic_read(&map->refcount) > 1)
+			cpu_relax();
+
+		spin_lock(&bedata->socket_lock);
+		list_del(&map->list);
+		spin_unlock(&bedata->socket_lock);
+		/*
+		 * Free the mapping of an accept that was started but never
+		 * completed. NOTE(review): 0 is treated like "no request"
+		 * here, presumably because the field starts out zeroed -
+		 * confirm, since 0 is also a usable request id.
+		 */
+		if (READ_ONCE(map->passive.inflight_req_id) != PVCALLS_INVALID_ID &&
+			READ_ONCE(map->passive.inflight_req_id) != 0) {
+			pvcalls_front_free_map(bedata,
+					       map->passive.accept_map);
+		}
+		kfree(map);
+	}
+	WRITE_ONCE(bedata->rsp[req_id].req_id, PVCALLS_INVALID_ID);
+
+	/* sk_send_head is already NULL, so only the global count is dropped */
+	pvcalls_exit();
+	return 0;
+}
+
+/* Device types claimed by the frontend driver; the last entry is empty. */
+static const struct xenbus_device_id pvcalls_front_ids[] = {
+	{ "pvcalls" },
+	{ "" }
+};
+
+/*
+ * Tear down the frontend connection: detach the driver data, unbind the
+ * command ring interrupt, detach every socket from its mapping, wait until
+ * no frontend operation is in progress, then free all mappings, the ring
+ * and the connection state. Also used by probe to undo a partial setup.
+ *
+ * Always returns 0.
+ */
+static int pvcalls_front_remove(struct xenbus_device *dev)
+{
+	struct pvcalls_bedata *bedata;
+	struct sock_mapping *map = NULL, *n;
+
+	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
+	dev_set_drvdata(&dev->dev, NULL);
+	pvcalls_front_dev = NULL;
+	if (bedata->irq >= 0)
+		unbind_from_irqhandler(bedata->irq, dev);
+
+	list_for_each_entry_safe(map, n, &bedata->socket_mappings, list) {
+		/*
+		 * A mapping can be on the list before map->sock is set (for
+		 * example the mapping prepared by an accept that has not
+		 * completed yet), so do not follow a NULL pointer here.
+		 */
+		if (map->sock && map->sock->sk)
+			map->sock->sk->sk_send_head = NULL;
+		if (map->active_socket) {
+			map->active.ring->in_error = -EBADF;
+			wake_up_interruptible(&map->active.inflight_conn_req);
+		}
+	}
+
+	/* make the updates above visible before polling the refcount */
+	smp_mb();
+	while (atomic_read(&pvcalls_refcount) > 0)
+		cpu_relax();
+	list_for_each_entry_safe(map, n, &bedata->socket_mappings, list) {
+		if (map->active_socket) {
+			/* No need to lock, refcount is 0 */
+			pvcalls_front_free_map(bedata, map);
+		} else {
+			list_del(&map->list);
+			kfree(map);
+		}
+	}
+	if (bedata->ref != -1)
+		gnttab_end_foreign_access(bedata->ref, 0, 0);
+	/*
+	 * The shared ring is a page obtained with __get_free_page() in
+	 * probe, so it has to go back to the page allocator rather than to
+	 * kfree(). free_page() ignores a zero address, which covers a probe
+	 * failure before the ring was allocated.
+	 */
+	free_page((unsigned long)bedata->ring.sring);
+	kfree(bedata);
+	xenbus_switch_state(dev, XenbusStateClosed);
+	return 0;
+}
+
+/*
+ * Set up the single supported frontend connection for @dev.
+ *
+ * Checks the backend's "versions", "max-page-order" and "function-calls"
+ * xenstore keys, allocates the connection state and the shared command
+ * ring, binds the event channel, grants the ring to the backend and
+ * publishes "version", "ring-ref" and "port" in one xenstore transaction
+ * before switching to XenbusStateInitialised.
+ *
+ * Returns 0 on success or a negative error. Failures after the connection
+ * state has been allocated are undone through pvcalls_front_remove().
+ */
+static int pvcalls_front_probe(struct xenbus_device *dev,
+			  const struct xenbus_device_id *id)
+{
+	int ret = -ENOMEM, evtchn, i;
+	unsigned int max_page_order, function_calls, len;
+	char *versions;
+	grant_ref_t gref_head = 0;
+	struct xenbus_transaction xbt;
+	struct pvcalls_bedata *bedata = NULL;
+	struct xen_pvcalls_sring *sring;
+
+	if (pvcalls_front_dev != NULL) {
+		dev_err(&dev->dev, "only one PV Calls connection supported\n");
+		return -EINVAL;
+	}
+
+	versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len);
+	if (IS_ERR(versions))
+		return PTR_ERR(versions);
+	/*
+	 * The buffer returned by xenbus_read() is ours to free on every
+	 * path, including the one where the value is empty.
+	 */
+	if (!len || strcmp(versions, "1")) {
+		kfree(versions);
+		return -EINVAL;
+	}
+	kfree(versions);
+	max_page_order = xenbus_read_unsigned(dev->otherend,
+					      "max-page-order", 0);
+	if (max_page_order < PVCALLS_RING_ORDER)
+		return -ENODEV;
+	function_calls = xenbus_read_unsigned(dev->otherend,
+					      "function-calls", 0);
+	/* See XENBUS_FUNCTIONS_CALLS in pvcalls.h */
+	if (function_calls != 1)
+		return -ENODEV;
+	pr_info("%s max-page-order is %u\n", __func__, max_page_order);
+
+	bedata = kzalloc(sizeof(struct pvcalls_bedata), GFP_KERNEL);
+	if (!bedata)
+		return -ENOMEM;
+
+	dev_set_drvdata(&dev->dev, bedata);
+	pvcalls_front_dev = dev;
+	init_waitqueue_head(&bedata->inflight_req);
+	INIT_LIST_HEAD(&bedata->socket_mappings);
+	spin_lock_init(&bedata->socket_lock);
+	/* "not set up yet" markers checked by pvcalls_front_remove() */
+	bedata->irq = -1;
+	bedata->ref = -1;
+
+	/* every response slot starts out free */
+	for (i = 0; i < PVCALLS_NR_RSP_PER_RING; i++)
+		bedata->rsp[i].req_id = PVCALLS_INVALID_ID;
+
+	sring = (struct xen_pvcalls_sring *) __get_free_page(GFP_KERNEL |
+							     __GFP_ZERO);
+	if (!sring)
+		goto error;
+	SHARED_RING_INIT(sring);
+	FRONT_RING_INIT(&bedata->ring, sring, XEN_PAGE_SIZE);
+
+	ret = xenbus_alloc_evtchn(dev, &evtchn);
+	if (ret)
+		goto error;
+
+	bedata->irq = bind_evtchn_to_irqhandler(evtchn,
+						pvcalls_front_event_handler,
+						0, "pvcalls-frontend", dev);
+	if (bedata->irq < 0) {
+		ret = bedata->irq;
+		goto error;
+	}
+
+	ret = gnttab_alloc_grant_references(1, &gref_head);
+	if (ret < 0)
+		goto error;
+	ret = gnttab_claim_grant_reference(&gref_head);
+	if (ret < 0)
+		goto error;
+	bedata->ref = ret;
+	gnttab_grant_foreign_access_ref(bedata->ref, dev->otherend_id,
+					virt_to_gfn((void *)sring), 0);
+
+ again:
+	ret = xenbus_transaction_start(&xbt);
+	if (ret) {
+		xenbus_dev_fatal(dev, ret, "starting transaction");
+		goto error;
+	}
+	ret = xenbus_printf(xbt, dev->nodename, "version", "%u", 1);
+	if (ret)
+		goto error_xenbus;
+	ret = xenbus_printf(xbt, dev->nodename, "ring-ref", "%d", bedata->ref);
+	if (ret)
+		goto error_xenbus;
+	ret = xenbus_printf(xbt, dev->nodename, "port", "%u",
+			    evtchn);
+	if (ret)
+		goto error_xenbus;
+	ret = xenbus_transaction_end(xbt, 0);
+	if (ret) {
+		/* the transaction has to be redone from the start */
+		if (ret == -EAGAIN)
+			goto again;
+		xenbus_dev_fatal(dev, ret, "completing transaction");
+		goto error;
+	}
+	xenbus_switch_state(dev, XenbusStateInitialised);
+
+	return 0;
+
+ error_xenbus:
+	xenbus_transaction_end(xbt, 1);
+	xenbus_dev_fatal(dev, ret, "writing xenstore");
+ error:
+	pvcalls_front_remove(dev);
+	return ret;
+}
+
+/*
+ * React to a state change of the backend: follow it into the connected
+ * state, and close the frontend when the backend is closing or has closed.
+ * All other backend states need no action.
+ */
+static void pvcalls_front_changed(struct xenbus_device *dev,
+			    enum xenbus_state backend_state)
+{
+	switch (backend_state) {
+	case XenbusStateConnected:
+		xenbus_switch_state(dev, XenbusStateConnected);
+		break;
+
+	case XenbusStateClosed:
+		if (dev->state == XenbusStateClosed)
+			break;
+		/* Missed the backend's CLOSING state */
+		/* fall through */
+	case XenbusStateClosing:
+		xenbus_frontend_closed(dev);
+		break;
+
+	/* nothing to do for the remaining states */
+	case XenbusStateUnknown:
+	case XenbusStateInitialising:
+	case XenbusStateInitWait:
+	case XenbusStateInitialised:
+	case XenbusStateReconfiguring:
+	case XenbusStateReconfigured:
+		break;
+	}
+}
+
+/* Callback table handed to xenbus when the frontend driver is registered. */
+static struct xenbus_driver pvcalls_front_driver = {
+	.ids = pvcalls_front_ids,
+	.probe = pvcalls_front_probe,
+	.remove = pvcalls_front_remove,
+	.otherend_changed = pvcalls_front_changed,
+};
+
+/*
+ * Module entry point: register the frontend driver with xenbus.
+ *
+ * Returns -ENODEV when not running on Xen, otherwise the result of
+ * xenbus_register_frontend().
+ */
+static int __init pvcalls_frontend_init(void)
+{
+	int ret = -ENODEV;
+
+	if (xen_domain()) {
+		pr_info("Initialising Xen pvcalls frontend driver\n");
+		ret = xenbus_register_frontend(&pvcalls_front_driver);
+	}
+
+	return ret;
+}
+
+module_init(pvcalls_frontend_init);
+
+MODULE_DESCRIPTION("Xen PV Calls frontend driver");
+MODULE_AUTHOR("Stefano Stabellini <sstabellini@kernel.org>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h
new file mode 100644
index 000000000..f694ad773
--- /dev/null
+++ b/drivers/xen/pvcalls-front.h
@@ -0,0 +1,28 @@
+#ifndef __PVCALLS_FRONT_H__
+#define __PVCALLS_FRONT_H__
+
+#include <linux/net.h>
+
+/* Socket-level entry points provided by the Xen pvcalls frontend. */
+
+int pvcalls_front_socket(struct socket *sock);
+int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr,
+			  int addr_len, int flags);
+int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr,
+		       int addr_len);
+int pvcalls_front_listen(struct socket *sock, int backlog);
+int pvcalls_front_accept(struct socket *sock, struct socket *newsock,
+			 int flags);
+int pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg, size_t len);
+int pvcalls_front_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+			  int flags);
+__poll_t pvcalls_front_poll(struct file *file, struct socket *sock,
+			    poll_table *wait);
+int pvcalls_front_release(struct socket *sock);
+
+#endif /* __PVCALLS_FRONT_H__ */
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
new file mode 100644
index 000000000..3d9997595
--- /dev/null
+++ b/drivers/xen/swiotlb-xen.c
@@ -0,0 +1,721 @@
+/*
+ *  Copyright 2010
+ *  by Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ *
+ * This code provides an IOMMU for Xen PV guests with PCI passthrough.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License v2.0 as published by
+ * the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * PV guests under Xen are running in a non-contiguous memory architecture.
+ *
+ * When PCI pass-through is utilized, this necessitates an IOMMU for
+ * translating bus (DMA) to virtual and vice-versa and also providing a
+ * mechanism to have contiguous pages for device drivers operations (say DMA
+ * operations).
+ *
+ * Specifically, under Xen the Linux idea of pages is an illusion. It
+ * assumes that pages start at zero and go up to the available memory. To
+ * help with that, the Linux Xen MMU provides a lookup mechanism to
+ * translate the page frame numbers (PFN) to machine frame numbers (MFN)
+ * and vice-versa. The MFN are the "real" frame numbers. Furthermore
+ * memory is not contiguous. Xen hypervisor stitches memory for guests
+ * from different pools, which means there is no guarantee that PFN==MFN
+ * and PFN+1==MFN+1. Lastly with Xen 4.0, pages (in debug mode) are
+ * allocated in descending order (high to low), meaning the guest might
+ * never get any MFN's under the 4GB mark.
+ *
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/bootmem.h>
+#include <linux/dma-direct.h>
+#include <linux/export.h>
+#include <xen/swiotlb-xen.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+#include <xen/hvc-console.h>
+
+#include <asm/dma-mapping.h>
+#include <asm/xen/page-coherent.h>
+
+#include <trace/events/swiotlb.h>
+/*
+ * Used to do a quick range check in swiotlb_tbl_unmap_single and
+ * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
+ * API.
+ */
+
+#define XEN_SWIOTLB_ERROR_CODE	(~(dma_addr_t)0x0)	/* all-ones handle returned when a mapping fails */
+
+static char *xen_io_tlb_start, *xen_io_tlb_end;	/* virtual range [start, end) of the bounce buffer */
+static unsigned long xen_io_tlb_nslabs;	/* bounce buffer size, in slabs of (1 << IO_TLB_SHIFT) bytes */
+/*
+ * Quick lookup value of the bus address of the IOTLB, i.e. the bus address
+ * of xen_io_tlb_start as computed in xen_swiotlb_init().
+ */
+
+static u64 start_dma_addr;
+
+/*
+ * Translate a (pseudo-)physical address into a bus (DMA) address: the Xen
+ * frame number is mapped through pfn_to_bfn() and the offset within the Xen
+ * page is carried over unchanged.
+ *
+ * Like xen_bus_to_phys(), this avoids XEN_PFN_PHYS because phys_addr_t can
+ * be 32bit when dma_addr_t is 64bit, leading to a loss in information if
+ * the shift is done before casting to 64bit.
+ */
+static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
+{
+	unsigned long bfn = pfn_to_bfn(XEN_PFN_DOWN(paddr));
+
+	return ((dma_addr_t)bfn << XEN_PAGE_SHIFT) | (paddr & ~XEN_PAGE_MASK);
+}
+
+/*
+ * Translate a bus (DMA) address back into a (pseudo-)physical address: the
+ * bus frame number is mapped through bfn_to_pfn() and the offset within the
+ * Xen page is carried over unchanged.  The frame number is widened to
+ * dma_addr_t before shifting so that no bits are lost.
+ */
+static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
+{
+	unsigned long pfn = bfn_to_pfn(XEN_PFN_DOWN(baddr));
+	phys_addr_t page_base = (dma_addr_t)pfn << XEN_PAGE_SHIFT;
+
+	return page_base | (baddr & ~XEN_PAGE_MASK);
+}
+
+/* Bus (DMA) address of a kernel virtual address, via virt_to_phys(). */
+static inline dma_addr_t xen_virt_to_bus(void *address)
+{
+	phys_addr_t paddr = virt_to_phys(address);
+
+	return xen_phys_to_bus(paddr);
+}
+
+/*
+ * Check whether the Xen pages covering @length bytes starting @offset bytes
+ * into frame @xen_pfn map to consecutive bus frame numbers.
+ *
+ * Returns 1 if every following page's bfn is exactly one more than its
+ * predecessor's (trivially true for a single page), 0 otherwise.
+ */
+static int check_pages_physically_contiguous(unsigned long xen_pfn,
+					     unsigned int offset,
+					     size_t length)
+{
+	const unsigned long first_bfn = pfn_to_bfn(xen_pfn);
+	int nr_pages = (offset + length + XEN_PAGE_SIZE-1) >> XEN_PAGE_SHIFT;
+	int idx;
+
+	for (idx = 1; idx < nr_pages; idx++) {
+		if (pfn_to_bfn(xen_pfn + idx) != first_bfn + idx)
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Returns 1 when the range [p, p + size) crosses a Xen page boundary AND
+ * the pages it touches are not contiguous in bus address space; returns 0
+ * when the range fits in one Xen page or the pages are contiguous.
+ */
+static inline int range_straddles_page_boundary(phys_addr_t p, size_t size)
+{
+	unsigned int offset = p & ~XEN_PAGE_MASK;
+	unsigned long xen_pfn = XEN_PFN_DOWN(p);
+
+	/* Entirely within a single Xen page: nothing to straddle. */
+	if (offset + size <= XEN_PAGE_SIZE)
+		return 0;
+
+	return !check_pages_physically_contiguous(xen_pfn, offset, size);
+}
+
+/*
+ * Returns non-zero when @dma_addr translates to a local physical address
+ * inside the bounce buffer [xen_io_tlb_start, xen_io_tlb_end).
+ */
+static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
+{
+	unsigned long bfn = XEN_PFN_DOWN(dma_addr);
+	unsigned long local_pfn = bfn_to_local_pfn(bfn);
+	phys_addr_t paddr = XEN_PFN_PHYS(local_pfn);
+
+	/* If the address is outside our domain, it CAN
+	 * have the same virtual address as another address
+	 * in our domain. Therefore _only_ check address within our domain.
+	 */
+	if (!pfn_valid(PFN_DOWN(paddr)))
+		return 0;
+
+	return paddr >= virt_to_phys(xen_io_tlb_start) &&
+	       paddr < virt_to_phys(xen_io_tlb_end);
+}
+
+/* Upper bound for the address width tried by xen_swiotlb_fixup(). */
+static int max_dma_bits = 32;
+
+/*
+ * Exchange the memory backing @buf for machine-contiguous regions, one
+ * segment of at most IO_TLB_SEGSIZE slabs at a time.
+ *
+ * @buf:    start of the bounce buffer (kernel virtual address)
+ * @size:   unused
+ * @nslabs: number of slabs in the buffer
+ *
+ * For each segment xen_create_contiguous_region() is first tried with an
+ * address width just large enough for one full segment; on failure the width
+ * is increased by one bit per attempt up to max_dma_bits.  The width is not
+ * reset between segments, so once widened it stays widened.
+ *
+ * Returns 0 on success or the error from xen_create_contiguous_region().
+ *
+ * The slab index and the byte offset derived from it are kept in unsigned
+ * long / phys_addr_t: shifting an int index by IO_TLB_SHIFT would overflow
+ * for very large bounce buffers.
+ */
+static int
+xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
+{
+	int rc;
+	int dma_bits;
+	unsigned long i;
+	dma_addr_t dma_handle;
+	phys_addr_t p = virt_to_phys(buf);
+
+	dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
+
+	i = 0;
+	do {
+		unsigned long slabs = min(nslabs - i,
+					  (unsigned long)IO_TLB_SEGSIZE);
+
+		do {
+			rc = xen_create_contiguous_region(
+				p + ((phys_addr_t)i << IO_TLB_SHIFT),
+				get_order(slabs << IO_TLB_SHIFT),
+				dma_bits, &dma_handle);
+		} while (rc && dma_bits++ < max_dma_bits);
+		if (rc)
+			return rc;
+
+		i += slabs;
+	} while (i < nslabs);
+	return 0;
+}
+/*
+ * Record the bounce buffer size in xen_io_tlb_nslabs and return it in bytes.
+ * A zero @nr_tbl selects the default of 64MB worth of slabs, rounded up to
+ * a multiple of IO_TLB_SEGSIZE.
+ */
+static unsigned long xen_set_nslabs(unsigned long nr_tbl)
+{
+	unsigned long nslabs = nr_tbl;
+
+	if (!nslabs) {
+		nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
+		nslabs = ALIGN(nslabs, IO_TLB_SEGSIZE);
+	}
+	xen_io_tlb_nslabs = nslabs;
+
+	return nslabs << IO_TLB_SHIFT;
+}
+
+enum xen_swiotlb_err {
+	XEN_SWIOTLB_UNKNOWN = 0,	/* no specific cause recorded */
+	XEN_SWIOTLB_ENOMEM,	/* the bounce buffer could not be allocated */
+	XEN_SWIOTLB_EFIXUP	/* xen_swiotlb_fixup() failed */
+};
+
+/*
+ * Map an xen_swiotlb_err code to a human readable message; codes without a
+ * dedicated message yield the empty string.
+ */
+static const char *xen_swiotlb_error(enum xen_swiotlb_err err)
+{
+	if (err == XEN_SWIOTLB_ENOMEM)
+		return "Cannot allocate Xen-SWIOTLB buffer\n";
+
+	if (err == XEN_SWIOTLB_EFIXUP)
+		return "Failed to get contiguous memory for DMA from Xen!\n"
+		       "You either: don't have the permissions, do not have"
+		       " enough free memory under 4GB, or the hypervisor memory"
+		       " is too fragmented!";
+
+	return "";
+}
+int __ref xen_swiotlb_init(int verbose, bool early)
+{
+	unsigned long bytes, order;
+	int rc = -ENOMEM;
+	enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN;
+	unsigned int repeat = 3;	/* retries left; each retry halves the table */
+
+	xen_io_tlb_nslabs = swiotlb_nr_tbl();
+retry:
+	bytes = xen_set_nslabs(xen_io_tlb_nslabs);
+	order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT);
+	/*
+	 * Get IO TLB memory from any location: from bootmem when called
+	 * early, otherwise from xen_get_swiotlb_free_pages(), lowering the
+	 * order after each failed attempt until the minimum table size
+	 * (IO_TLB_MIN_SLABS) is reached.
+	 */
+	if (early)
+		xen_io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(bytes));
+	else {
+#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
+#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
+		while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
+			xen_io_tlb_start = (void *)xen_get_swiotlb_free_pages(order);
+			if (xen_io_tlb_start)
+				break;
+			order--;
+		}
+		if (order != get_order(bytes)) {	/* had to settle for a smaller order */
+			pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n",
+				(PAGE_SIZE << order) >> 20);
+			xen_io_tlb_nslabs = SLABS_PER_PAGE << order;
+			bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT;
+		}
+	}
+	if (!xen_io_tlb_start) {
+		m_ret = XEN_SWIOTLB_ENOMEM;
+		goto error;
+	}
+	xen_io_tlb_end = xen_io_tlb_start + bytes;
+	/*
+	 * And replace that memory with pages under 4GB.
+	 */
+	rc = xen_swiotlb_fixup(xen_io_tlb_start,
+			       bytes,
+			       xen_io_tlb_nslabs);
+	if (rc) {
+		if (early)
+			free_bootmem(__pa(xen_io_tlb_start), PAGE_ALIGN(bytes));
+		else {
+			free_pages((unsigned long)xen_io_tlb_start, order);
+			xen_io_tlb_start = NULL;	/* the error path below passes this pointer to free_pages() again */
+		}
+		m_ret = XEN_SWIOTLB_EFIXUP;
+		goto error;
+	}
+	start_dma_addr = xen_virt_to_bus(xen_io_tlb_start);
+	if (early) {
+		if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs,
+			 verbose))
+			panic("Cannot allocate SWIOTLB buffer");
+		rc = 0;
+	} else
+		rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs);
+
+	if (!rc)
+		swiotlb_set_max_segment(PAGE_SIZE);
+
+	return rc;
+error:
+	if (repeat--) {	/* try again with half the slabs, but never fewer than 1024 */
+		xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */
+					(xen_io_tlb_nslabs >> 1));
+		pr_info("Lowering to %luMB\n",
+			(xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20);
+		goto retry;
+	}
+	pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc);
+	if (early)	/* an early caller gets no error return: failure is fatal */
+		panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc);
+	else
+		free_pages((unsigned long)xen_io_tlb_start, order);
+	return rc;
+}
+
+static void *
+xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+			   dma_addr_t *dma_handle, gfp_t flags,
+			   unsigned long attrs)
+{
+	void *ret;
+	int order = get_order(size);
+	u64 dma_mask = DMA_BIT_MASK(32);	/* default when the device sets no coherent mask */
+	phys_addr_t phys;
+	dma_addr_t dev_addr;
+
+	/*
+	 * Ignore region specifiers - the kernel's ideas of
+	 * pseudo-phys memory layout has nothing to do with the
+	 * machine physical layout.  We can't allocate highmem
+	 * because we can't return a pointer to it.
+	 */
+	flags &= ~(__GFP_DMA | __GFP_HIGHMEM);
+
+	/* Convert the size to actually allocated. */
+	size = 1UL << (order + XEN_PAGE_SHIFT);
+
+	/* On ARM this function returns an ioremap'ped virtual address for
+	 * which virt_to_phys doesn't return the corresponding physical
+	 * address. In fact on ARM virt_to_phys only works for kernel direct
+	 * mapped RAM memory. Also see comment below.
+	 */
+	ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs);
+
+	if (!ret)
+		return ret;
+
+	if (hwdev && hwdev->coherent_dma_mask)
+		dma_mask = hwdev->coherent_dma_mask;
+
+	/* At this point dma_handle is the physical address, next we are
+	 * going to set it to the machine address.
+	 * Do not use virt_to_phys(ret) because on ARM it doesn't correspond
+	 * to *dma_handle. */
+	phys = *dma_handle;
+	dev_addr = xen_phys_to_bus(phys);
+	if (((dev_addr + size - 1 <= dma_mask)) &&
+	    !range_straddles_page_boundary(phys, size))
+		*dma_handle = dev_addr;	/* already within the mask and contiguous: use as is */
+	else {
+		if (xen_create_contiguous_region(phys, order,
+						 fls64(dma_mask), dma_handle) != 0) {
+			xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs);
+			return NULL;
+		}
+	}
+	memset(ret, 0, size);	/* the buffer is always returned zeroed */
+	return ret;
+}
+
+/*
+ * .free hook of xen_swiotlb_dma_ops.
+ *
+ * The contiguous region is destroyed only when the buffer lies within the
+ * device's coherent DMA mask (32 bits when none is set) and does not
+ * straddle non-contiguous pages; otherwise a warning is emitted and that
+ * step is skipped.  The pages are handed to xen_free_coherent_pages() in
+ * both cases.
+ */
+static void
+xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
+			  dma_addr_t dev_addr, unsigned long attrs)
+{
+	u64 dma_mask = DMA_BIT_MASK(32);
+	int order = get_order(size);
+	phys_addr_t phys;
+	bool unexpected;
+
+	if (hwdev && hwdev->coherent_dma_mask)
+		dma_mask = hwdev->coherent_dma_mask;
+
+	/* do not use virt_to_phys because on ARM it doesn't return you the
+	 * physical address */
+	phys = xen_bus_to_phys(dev_addr);
+
+	/* Convert the size to actually allocated. */
+	size = 1UL << (order + XEN_PAGE_SHIFT);
+
+	unexpected = WARN_ON((dev_addr + size - 1 > dma_mask) ||
+			     range_straddles_page_boundary(phys, size));
+	if (!unexpected)
+		xen_destroy_contiguous_region(phys, order);
+
+	xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs);
+}
+
+/*
+ * Map a single buffer of the indicated size for DMA in streaming mode.  The
+ * bus (DMA) address to use is returned, or XEN_SWIOTLB_ERROR_CODE when no
+ * usable mapping could be set up.
+ *
+ * Once the device is given the dma address, the device owns this memory until
+ * either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is performed.
+ */
+static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
+				unsigned long offset, size_t size,
+				enum dma_data_direction dir,
+				unsigned long attrs)
+{
+	phys_addr_t map, phys = page_to_phys(page) + offset;
+	dma_addr_t dev_addr = xen_phys_to_bus(phys);
+
+	BUG_ON(dir == DMA_NONE);
+	/*
+	 * If the address happens to be in the device's DMA window,
+	 * we can safely return the device addr and not worry about bounce
+	 * buffering it.
+	 */
+	if (dma_capable(dev, dev_addr, size) &&
+	    !range_straddles_page_boundary(phys, size) &&
+		!xen_arch_need_swiotlb(dev, phys, dev_addr) &&
+		(swiotlb_force != SWIOTLB_FORCE)) {
+		/* we are not interested in the dma_addr returned by
+		 * xen_dma_map_page, only in the potential cache flushes executed
+		 * by the function. */
+		xen_dma_map_page(dev, page, dev_addr, offset, size, dir, attrs);
+		return dev_addr;
+	}
+
+	/*
+	 * Oh well, have to allocate and map a bounce buffer.
+	 */
+	trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force);
+
+	map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir,
+				     attrs);
+	if (map == SWIOTLB_MAP_ERROR)
+		return XEN_SWIOTLB_ERROR_CODE;
+
+	dev_addr = xen_phys_to_bus(map);
+	xen_dma_map_page(dev, pfn_to_page(map >> PAGE_SHIFT),
+					dev_addr, map & ~PAGE_MASK, size, dir, attrs);
+
+	/*
+	 * Ensure that the address returned is DMA'ble
+	 */
+	if (dma_capable(dev, dev_addr, size))
+		return dev_addr;
+
+	attrs |= DMA_ATTR_SKIP_CPU_SYNC;	/* passed to the unmap below while releasing the unusable slot */
+	swiotlb_tbl_unmap_single(dev, map, size, dir, attrs);
+
+	return XEN_SWIOTLB_ERROR_CODE;
+}
+
+/*
+ * Tear down a single streaming mode DMA translation.  The dma_addr and size
+ * must match what was provided for in a previous xen_swiotlb_map_page call.
+ * All other usages are undefined.
+ *
+ * After this call, reads by the cpu to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ */
+static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+			     size_t size, enum dma_data_direction dir,
+			     unsigned long attrs)
+{
+	const phys_addr_t paddr = xen_bus_to_phys(dev_addr);
+
+	BUG_ON(dir == DMA_NONE);
+
+	xen_dma_unmap_page(hwdev, dev_addr, size, dir, attrs);
+
+	/* NOTE: We use dev_addr here, not paddr! */
+	if (is_xen_swiotlb_buffer(dev_addr)) {
+		swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
+	} else if (dir == DMA_FROM_DEVICE) {
+		/*
+		 * phys_to_virt doesn't work with highmem page but we could
+		 * call dma_mark_clean() with highmem page here. However, we
+		 * are fine since dma_mark_clean() is null on POWERPC. We can
+		 * make dma_mark_clean() take a physical address if necessary.
+		 */
+		dma_mark_clean(phys_to_virt(paddr), size);
+	}
+}
+
+static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
+			    size_t size, enum dma_data_direction dir,
+			    unsigned long attrs)
+{
+	xen_unmap_single(hwdev, dev_addr, size, dir, attrs);	/* .unmap_page hook: plain forwarder */
+}
+
+/*
+ * Make physical memory consistent for a single streaming mode DMA
+ * translation, either for the CPU or for the device depending on @target.
+ *
+ * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer
+ * using the cpu, yet do not wish to teardown the dma mapping, you must
+ * sync for the CPU before doing so.  At the next point you give the dma
+ * address back to the card, you must first sync for the device, and then
+ * the device again owns the buffer.
+ */
+static void
+xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
+			size_t size, enum dma_data_direction dir,
+			enum dma_sync_target target)
+{
+	const phys_addr_t paddr = xen_bus_to_phys(dev_addr);
+
+	BUG_ON(dir == DMA_NONE);
+
+	if (target == SYNC_FOR_CPU)
+		xen_dma_sync_single_for_cpu(hwdev, dev_addr, size, dir);
+
+	/* NOTE: We use dev_addr here, not paddr! */
+	if (is_xen_swiotlb_buffer(dev_addr))
+		swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target);
+
+	if (target == SYNC_FOR_DEVICE)
+		xen_dma_sync_single_for_device(hwdev, dev_addr, size, dir);
+
+	if (dir == DMA_FROM_DEVICE)
+		dma_mark_clean(phys_to_virt(paddr), size);
+}
+
+void
+xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+				size_t size, enum dma_data_direction dir)
+{
+	xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);	/* fixed target: CPU */
+}
+
+void
+xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
+				   size_t size, enum dma_data_direction dir)
+{
+	xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);	/* fixed target: device */
+}
+
+/*
+ * Unmap a set of streaming mode DMA translations by unmapping the first
+ * @nelems entries of @sgl one by one.  The cpu read rules are the same as
+ * for xen_unmap_single().
+ */
+static void
+xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+			   int nelems, enum dma_data_direction dir,
+			   unsigned long attrs)
+{
+	struct scatterlist *entry;
+	int idx;
+
+	BUG_ON(dir == DMA_NONE);
+
+	for_each_sg(sgl, entry, nelems, idx) {
+		xen_unmap_single(hwdev, sg_dma_address(entry),
+				 sg_dma_len(entry), dir, attrs);
+	}
+}
+
+/*
+ * Map a set of buffers described by scatterlist in streaming mode for DMA.
+ * This is the scatter-gather version of xen_swiotlb_map_page: every element
+ * is tagged with its dma address and length, to be read back via
+ * sg_dma_{address,length}(SG).
+ *
+ * An element is bounced through the swiotlb when bouncing is forced, when
+ * the architecture requires it, when the device cannot reach its bus
+ * address, or when it straddles non-contiguous pages.
+ *
+ * Returns @nelems on success.  If a bounce slot cannot be obtained, the
+ * elements mapped so far are unmapped again, the dma length of the first
+ * element is set to 0 and 0 is returned.
+ *
+ * Device ownership issues as mentioned for xen_swiotlb_map_page are the
+ * same here.
+ */
+static int
+xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+			 int nelems, enum dma_data_direction dir,
+			 unsigned long attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+	BUG_ON(dir == DMA_NONE);
+
+	for_each_sg(sgl, sg, nelems, i) {
+		phys_addr_t paddr = sg_phys(sg);
+		/* Physical address that ends up being mapped (bounce or original). */
+		phys_addr_t target = paddr;
+		dma_addr_t dev_addr = xen_phys_to_bus(paddr);
+
+		if (swiotlb_force == SWIOTLB_FORCE ||
+		    xen_arch_need_swiotlb(hwdev, paddr, dev_addr) ||
+		    !dma_capable(hwdev, dev_addr, sg->length) ||
+		    range_straddles_page_boundary(paddr, sg->length)) {
+			target = swiotlb_tbl_map_single(hwdev, start_dma_addr,
+							paddr, sg->length,
+							dir, attrs);
+			if (target == SWIOTLB_MAP_ERROR) {
+				dev_warn(hwdev, "swiotlb buffer is full\n");
+				/* Don't panic here, we expect map_sg users
+				   to do proper error handling. */
+				attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+				xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
+							   attrs);
+				sg_dma_len(sgl) = 0;
+				return 0;
+			}
+			dev_addr = xen_phys_to_bus(target);
+		}
+
+		/* we are not interested in the dma_addr returned by
+		 * xen_dma_map_page, only in the potential cache flushes executed
+		 * by the function. */
+		xen_dma_map_page(hwdev, pfn_to_page(target >> PAGE_SHIFT),
+				 dev_addr, target & ~PAGE_MASK, sg->length,
+				 dir, attrs);
+		sg->dma_address = dev_addr;
+		sg_dma_len(sg) = sg->length;
+	}
+	return nelems;
+}
+
+/*
+ * Make physical memory consistent for a set of streaming mode DMA
+ * translations after a transfer: xen_swiotlb_sync_single() applied to each
+ * of the first @nelems entries of @sgl, same rules and usage.
+ */
+static void
+xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
+		    int nelems, enum dma_data_direction dir,
+		    enum dma_sync_target target)
+{
+	struct scatterlist *entry;
+	int idx;
+
+	for_each_sg(sgl, entry, nelems, idx) {
+		xen_swiotlb_sync_single(hwdev, sg_dma_address(entry),
+					sg_dma_len(entry), dir, target);
+	}
+}
+
+static void
+xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+			    int nelems, enum dma_data_direction dir)
+{
+	xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);	/* fixed target: CPU */
+}
+
+static void
+xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+			       int nelems, enum dma_data_direction dir)
+{
+	xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);	/* fixed target: device */
+}
+
+/*
+ * Return whether the given device DMA address mask can be supported
+ * properly, i.e. whether the bus address of the last byte of the bounce
+ * buffer fits within @mask.  For example, if your device can only drive the
+ * low 24-bits during bus mastering, then you would pass 0x00ffffff as the
+ * mask to this function.
+ */
+static int
+xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
+{
+	dma_addr_t last_byte = xen_virt_to_bus(xen_io_tlb_end - 1);
+
+	return last_byte <= mask;
+}
+
+/*
+ * Create userspace mapping for the DMA-coherent memory.
+ * This function should be called with the pages from the current domain only,
+ * passing pages mapped from other domains would lead to memory corruption.
+ *
+ * On ARM/ARM64 the request is delegated to the mmap hook of
+ * xen_get_dma_ops(dev) when that hook is set; in every other case
+ * dma_common_mmap() is used.
+ */
+static int
+xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+		     void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		     unsigned long attrs)
+{
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+	if (xen_get_dma_ops(dev)->mmap)
+		return xen_get_dma_ops(dev)->mmap(dev, vma, cpu_addr,
+						    dma_addr, size, attrs);
+#endif
+	return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size);
+}
+
+/*
+ * Build a scatter-gather table for DMA-coherent memory.
+ * This function should be called with the pages from the current domain only,
+ * passing pages mapped from other domains would lead to memory corruption.
+ *
+ * On ARM/ARM64 the request is delegated to the get_sgtable hook of
+ * xen_get_dma_ops(dev) when that hook is set; in every other case
+ * dma_common_get_sgtable() is used.
+ */
+static int
+xen_swiotlb_get_sgtable(struct device *dev, struct sg_table *sgt,
+			void *cpu_addr, dma_addr_t handle, size_t size,
+			unsigned long attrs)
+{
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+	if (xen_get_dma_ops(dev)->get_sgtable) {
+#if 0
+	/*
+	 * This check verifies that the page belongs to the current domain and
+	 * is not one mapped from another domain.
+	 * This check is for debug only, and should not go to production build
+	 */
+		unsigned long bfn = PHYS_PFN(dma_to_phys(dev, handle));
+		BUG_ON (!page_is_ram(bfn));
+#endif
+		return xen_get_dma_ops(dev)->get_sgtable(dev, sgt, cpu_addr,
+							   handle, size, attrs);
+	}
+#endif
+	return dma_common_get_sgtable(dev, sgt, cpu_addr, handle, size);
+}
+
+/* .mapping_error hook: non-zero iff @dma_addr is XEN_SWIOTLB_ERROR_CODE. */
+static int xen_swiotlb_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return XEN_SWIOTLB_ERROR_CODE == dma_addr;
+}
+
+const struct dma_map_ops xen_swiotlb_dma_ops = {	/* DMA operations table wiring up the hooks defined in this file */
+	.alloc = xen_swiotlb_alloc_coherent,
+	.free = xen_swiotlb_free_coherent,
+	.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
+	.sync_single_for_device = xen_swiotlb_sync_single_for_device,
+	.sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
+	.sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
+	.map_sg = xen_swiotlb_map_sg_attrs,
+	.unmap_sg = xen_swiotlb_unmap_sg_attrs,
+	.map_page = xen_swiotlb_map_page,
+	.unmap_page = xen_swiotlb_unmap_page,
+	.dma_supported = xen_swiotlb_dma_supported,
+	.mmap = xen_swiotlb_dma_mmap,
+	.get_sgtable = xen_swiotlb_get_sgtable,
+	.mapping_error	= xen_swiotlb_mapping_error,
+};
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
new file mode 100644
index 000000000..9d314bba7
--- /dev/null
+++ b/drivers/xen/sys-hypervisor.c
@@ -0,0 +1,613 @@
+/*
+ *  copyright (c) 2006 IBM Corporation
+ *  Authored by: Mike D. Day <ncmike@us.ibm.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/err.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+#ifdef CONFIG_XEN_HAVE_VPMU
+#include <xen/interface/xenpmu.h>
+#endif
+
+#define HYPERVISOR_ATTR_RO(_name) \
+static struct hyp_sysfs_attr  _name##_attr = __ATTR_RO(_name)
+
+#define HYPERVISOR_ATTR_RW(_name) \
+static struct hyp_sysfs_attr _name##_attr = \
+	__ATTR(_name, 0644, _name##_show, _name##_store)
+
+struct hyp_sysfs_attr {	/* one attribute exposed under hypervisor_kobj */
+	struct attribute attr;
+	ssize_t (*show)(struct hyp_sysfs_attr *, char *);	/* read handler: formats the value into the buffer */
+	ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);	/* write handler: parses the buffer of the given length */
+	void *hyp_attr_data;
+};
+
+static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	return sprintf(buffer, "xen\n");	/* constant contents; attr is not used */
+}
+
+HYPERVISOR_ATTR_RO(type);
+
+static int __init xen_sysfs_type_init(void)
+{
+	return sysfs_create_file(hypervisor_kobj, &type_attr.attr);	/* adds the "type" file to hypervisor_kobj */
+}
+
+/*
+ * Report the guest type derived from xen_domain_type as "Xen", "PV", "PVH"
+ * or "HVM".  Any other domain type yields -EINVAL.
+ */
+static ssize_t guest_type_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	const char *type;
+
+	if (xen_domain_type == XEN_NATIVE)
+		type = "Xen";	/* ARM only. */
+	else if (xen_domain_type == XEN_PV_DOMAIN)
+		type = "PV";
+	else if (xen_domain_type == XEN_HVM_DOMAIN)
+		type = xen_pvh_domain() ? "PVH" : "HVM";
+	else
+		return -EINVAL;
+
+	return sprintf(buffer, "%s\n", type);
+}
+
+HYPERVISOR_ATTR_RO(guest_type);
+
+static int __init xen_sysfs_guest_type_init(void)
+{
+	return sysfs_create_file(hypervisor_kobj, &guest_type_attr.attr);	/* adds the "guest_type" file to hypervisor_kobj */
+}
+
+/* xen version attributes */
+/*
+ * Show the major Xen version: bits 16 and up of the XENVER_version result.
+ * A result of 0 is reported as -ENODEV.
+ */
+static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int version = HYPERVISOR_xen_version(XENVER_version, NULL);
+
+	if (!version)
+		return -ENODEV;
+
+	return sprintf(buffer, "%d\n", version >> 16);
+}
+
+HYPERVISOR_ATTR_RO(major);
+
+/*
+ * Show the minor Xen version.  A XENVER_version result of 0 is reported as
+ * -ENODEV.
+ *
+ * major_show() takes the major number from bits 16 and up of the same
+ * value, so the minor number is the whole low 16 bits; masking with 0xff
+ * would drop its upper byte.
+ */
+static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int version = HYPERVISOR_xen_version(XENVER_version, NULL);
+	if (version)
+		return sprintf(buffer, "%d\n", version & 0xffff);
+	return -ENODEV;
+}
+
+HYPERVISOR_ATTR_RO(minor);
+
+/*
+ * Show the XENVER_extraversion string followed by a newline.
+ * Returns -ENOMEM if the temporary buffer cannot be allocated, or the
+ * non-zero hypercall result on failure.
+ */
+static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	char *extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
+	int ret;
+
+	if (!extra)
+		return -ENOMEM;
+
+	ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
+	if (!ret)
+		ret = sprintf(buffer, "%s\n", extra);
+	kfree(extra);
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(extra);
+
+static struct attribute *version_attrs[] = {	/* NULL-terminated member list of version_group */
+	&major_attr.attr,
+	&minor_attr.attr,
+	&extra_attr.attr,
+	NULL
+};
+
+static const struct attribute_group version_group = {
+	.name = "version",	/* group name handed to sysfs_create_group() */
+	.attrs = version_attrs,
+};
+
+static int __init xen_sysfs_version_init(void)
+{
+	return sysfs_create_group(hypervisor_kobj, &version_group);	/* registers major/minor/extra */
+}
+
+/* UUID */
+
+/*
+ * Xenstore based UUID lookup, used by uuid_show() when the hypercall fails:
+ * read the "vm" node, then the "uuid" entry below the path it names.
+ *
+ * Returns -EBUSY while xenstored_ready is not set, or the error encoded in
+ * the pointer returned by a failing xenbus_read().
+ */
+static ssize_t uuid_show_fallback(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	extern int xenstored_ready;
+	char *vm_path, *uuid;
+	int len;
+
+	if (!xenstored_ready)
+		return -EBUSY;
+
+	vm_path = xenbus_read(XBT_NIL, "vm", "", NULL);
+	if (IS_ERR(vm_path))
+		return PTR_ERR(vm_path);
+
+	uuid = xenbus_read(XBT_NIL, vm_path, "uuid", NULL);
+	kfree(vm_path);
+	if (IS_ERR(uuid))
+		return PTR_ERR(uuid);
+
+	len = sprintf(buffer, "%s\n", uuid);
+	kfree(uuid);
+	return len;
+}
+
+/*
+ * Show the domain UUID.  The handle is fetched with XENVER_guest_handle;
+ * if that hypercall returns non-zero, uuid_show_fallback() is used instead.
+ */
+static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	xen_domain_handle_t uuid;
+
+	if (HYPERVISOR_xen_version(XENVER_guest_handle, uuid))
+		return uuid_show_fallback(attr, buffer);
+
+	return sprintf(buffer, "%pU\n", uuid);
+}
+
+HYPERVISOR_ATTR_RO(uuid);
+
+static int __init xen_sysfs_uuid_init(void)
+{
+	return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);	/* adds the "uuid" file to hypervisor_kobj */
+}
+
+/* xen compilation attributes */
+
+/*
+ * Show the "compiler" field of XENVER_compile_info.
+ * Returns -ENOMEM if the temporary buffer cannot be allocated, or the
+ * non-zero hypercall result on failure.
+ */
+static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	struct xen_compile_info *info;
+	int ret;
+
+	info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+	if (!ret)
+		ret = sprintf(buffer, "%s\n", info->compiler);
+	kfree(info);
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(compiler);
+
+/*
+ * Show the "compile_by" field of XENVER_compile_info.
+ * Returns -ENOMEM if the temporary buffer cannot be allocated, or the
+ * non-zero hypercall result on failure.
+ */
+static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	struct xen_compile_info *info;
+	int ret;
+
+	info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+	if (!ret)
+		ret = sprintf(buffer, "%s\n", info->compile_by);
+	kfree(info);
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(compiled_by);
+
+/*
+ * Show the "compile_date" field of XENVER_compile_info.
+ * Returns -ENOMEM if the temporary buffer cannot be allocated, or the
+ * non-zero hypercall result on failure.
+ */
+static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	struct xen_compile_info *info;
+	int ret;
+
+	info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+	if (!ret)
+		ret = sprintf(buffer, "%s\n", info->compile_date);
+	kfree(info);
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(compile_date);
+
+static struct attribute *xen_compile_attrs[] = {	/* NULL-terminated member list of xen_compilation_group */
+	&compiler_attr.attr,
+	&compiled_by_attr.attr,
+	&compile_date_attr.attr,
+	NULL
+};
+
+static const struct attribute_group xen_compilation_group = {
+	.name = "compilation",	/* group name handed to sysfs_create_group() */
+	.attrs = xen_compile_attrs,
+};
+
+static int __init xen_sysfs_compilation_init(void)
+{
+	return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);	/* registers compiler/compiled_by/compile_date */
+}
+
+/* xen properties info */
+
+/*
+ * Show the XENVER_capabilities string followed by a newline.
+ * Returns -ENOMEM if the temporary buffer cannot be allocated, or the
+ * non-zero hypercall result on failure.
+ */
+static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	char *caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
+	int ret;
+
+	if (!caps)
+		return -ENOMEM;
+
+	ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
+	if (!ret)
+		ret = sprintf(buffer, "%s\n", caps);
+	kfree(caps);
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(capabilities);
+
+/*
+ * Show the XENVER_changeset string followed by a newline.
+ * Returns -ENOMEM if the temporary buffer cannot be allocated, or the
+ * non-zero hypercall result on failure.
+ */
+static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	char *cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
+	int ret;
+
+	if (!cset)
+		return -ENOMEM;
+
+	ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
+	if (!ret)
+		ret = sprintf(buffer, "%s\n", cset);
+	kfree(cset);
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(changeset);
+
+/*
+ * Show the virt_start field of XENVER_platform_parameters.
+ * Returns -ENOMEM if the temporary buffer cannot be allocated, or the
+ * non-zero hypercall result on failure.
+ */
+static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	struct xen_platform_parameters *parms;
+	int ret;
+
+	parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
+	if (!parms)
+		return -ENOMEM;
+
+	ret = HYPERVISOR_xen_version(XENVER_platform_parameters, parms);
+	if (!ret)
+		ret = sprintf(buffer, "%"PRI_xen_ulong"\n",
+			      parms->virt_start);
+	kfree(parms);
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(virtual_start);
+
+/*
+ * Show the XENVER_pagesize result in hexadecimal.  A zero or negative
+ * hypercall result is returned unchanged.
+ */
+static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL);
+
+	if (ret <= 0)
+		return ret;
+
+	return sprintf(buffer, "%x\n", ret);
+}
+
+HYPERVISOR_ATTR_RO(pagesize);
+
+/*
+ * Format feature submap @index as eight hex digits (no newline) into
+ * @buffer.  Returns the number of characters written, or the non-zero
+ * hypercall result on failure.
+ */
+static ssize_t xen_feature_show(int index, char *buffer)
+{
+	struct xen_feature_info info;
+	ssize_t ret;
+
+	info.submap_idx = index;
+	ret = HYPERVISOR_xen_version(XENVER_get_features, &info);
+	if (ret)
+		return ret;
+
+	return sprintf(buffer, "%08x", info.submap);
+}
+
+/*
+ * Show all feature submaps as one hex string, highest submap index first,
+ * terminated by a newline.
+ *
+ * Formatting stops at the first submap that fails.  If nothing was written
+ * before the failure its error code is returned, otherwise the partial
+ * string is returned.
+ */
+static ssize_t features_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	ssize_t total = 0;
+	int idx = XENFEAT_NR_SUBMAPS;
+
+	while (idx-- > 0) {
+		int n = xen_feature_show(idx, buffer + total);
+
+		if (n < 0) {
+			if (total == 0)
+				total = n;
+			break;
+		}
+		total += n;
+	}
+
+	if (total > 0)
+		buffer[total++] = '\n';
+
+	return total;
+}
+
+HYPERVISOR_ATTR_RO(features);
+
+/*
+ * Show the hypervisor build id.
+ *
+ * XENVER_build_id is issued twice: first with a NULL argument, whose
+ * non-negative result is used as the id length, then with a buffer of that
+ * size to fetch the id.  -EPERM from the first call is reported as the
+ * literal "<denied>"; other errors are returned as is.
+ *
+ * Nothing here NUL-terminates buildid->buf, so the copy into the sysfs
+ * buffer is bounded by the returned length (and never by more than was
+ * allocated) instead of relying on a terminator being present.
+ */
+static ssize_t buildid_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	ssize_t ret;
+	unsigned int len;
+	struct xen_build_id *buildid;
+
+	ret = HYPERVISOR_xen_version(XENVER_build_id, NULL);
+	if (ret < 0) {
+		if (ret == -EPERM)
+			ret = sprintf(buffer, "<denied>");
+		return ret;
+	}
+
+	len = ret;
+	buildid = kmalloc(sizeof(*buildid) + len, GFP_KERNEL);
+	if (!buildid)
+		return -ENOMEM;
+
+	buildid->len = len;
+	ret = HYPERVISOR_xen_version(XENVER_build_id, buildid);
+	if (ret > 0)
+		ret = sprintf(buffer, "%.*s",
+			      (int)min_t(ssize_t, ret, len), buildid->buf);
+	kfree(buildid);
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(buildid);
+
+static struct attribute *xen_properties_attrs[] = {	/* NULL-terminated member list of xen_properties_group */
+	&capabilities_attr.attr,
+	&changeset_attr.attr,
+	&virtual_start_attr.attr,
+	&pagesize_attr.attr,
+	&features_attr.attr,
+	&buildid_attr.attr,
+	NULL
+};
+
+static const struct attribute_group xen_properties_group = {
+	.name = "properties",	/* group name handed to sysfs_create_group() */
+	.attrs = xen_properties_attrs,
+};
+
+static int __init xen_sysfs_properties_init(void)
+{
+	return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
+}
+
+#ifdef CONFIG_XEN_HAVE_VPMU
+/* Pairs the user-visible name of a PMU mode with its XENPMU_MODE_* value. */
+struct pmu_mode {
+	const char *name;
+	uint32_t mode;
+};
+
+/* Table searched by pmu_mode_store() (by name) and pmu_mode_show() (by mode). */
+static struct pmu_mode pmu_modes[] = {
+	{"off", XENPMU_MODE_OFF},
+	{"self", XENPMU_MODE_SELF},
+	{"hv", XENPMU_MODE_HV},
+	{"all", XENPMU_MODE_ALL}
+};
+
+/*
+ * "pmu_mode" store: look the written string up in pmu_modes[] and ask the
+ * hypervisor to switch to that mode.
+ *
+ * Returns @len on success, -EINVAL when the string names no known mode, or
+ * the non-zero result of the XENPMU_mode_set hypercall.
+ */
+static ssize_t pmu_mode_store(struct hyp_sysfs_attr *attr,
+			      const char *buffer, size_t len)
+{
+	int ret;
+	struct xen_pmu_params xp;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(pmu_modes); i++) {
+		/*
+		 * sysfs_streq() compares whole strings and tolerates one
+		 * trailing newline.  The former strncmp(..., len - 1) matched
+		 * an empty line (selecting "off"), any prefix of a mode name,
+		 * and ignored the last character when no newline was written.
+		 */
+		if (sysfs_streq(buffer, pmu_modes[i].name)) {
+			xp.val = pmu_modes[i].mode;
+			break;
+		}
+	}
+
+	if (i == ARRAY_SIZE(pmu_modes))
+		return -EINVAL;
+
+	xp.version.maj = XENPMU_VER_MAJ;
+	xp.version.min = XENPMU_VER_MIN;
+	ret = HYPERVISOR_xenpmu_op(XENPMU_mode_set, &xp);
+	if (ret)
+		return ret;
+
+	return len;
+}
+
+/*
+ * "pmu_mode" show: fetch the current mode with XENPMU_mode_get and print
+ * the matching name from pmu_modes[].  Returns the hypercall error, or
+ * -EINVAL when the reported mode is not in the table.
+ */
+static ssize_t pmu_mode_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	struct xen_pmu_params params;
+	const struct pmu_mode *entry;
+	const struct pmu_mode *end = pmu_modes + ARRAY_SIZE(pmu_modes);
+	uint32_t current_mode;
+	int rc;
+
+	params.version.maj = XENPMU_VER_MAJ;
+	params.version.min = XENPMU_VER_MIN;
+	rc = HYPERVISOR_xenpmu_op(XENPMU_mode_get, &params);
+	if (rc)
+		return rc;
+
+	current_mode = (uint32_t)params.val;
+	for (entry = pmu_modes; entry < end; entry++) {
+		if (entry->mode == current_mode)
+			return sprintf(buffer, "%s\n", entry->name);
+	}
+
+	return -EINVAL;
+}
+HYPERVISOR_ATTR_RW(pmu_mode);
+
+/*
+ * "pmu_features" store: parse the written number (base auto-detected by
+ * kstrtou32) and pass it to the hypervisor via XENPMU_feature_set.
+ * Returns @len on success, otherwise the parse or hypercall error.
+ */
+static ssize_t pmu_features_store(struct hyp_sysfs_attr *attr,
+				  const char *buffer, size_t len)
+{
+	struct xen_pmu_params params;
+	uint32_t mask;
+	int rc;
+
+	rc = kstrtou32(buffer, 0, &mask);
+	if (rc)
+		return rc;
+
+	params.version.maj = XENPMU_VER_MAJ;
+	params.version.min = XENPMU_VER_MIN;
+	params.val = mask;
+
+	rc = HYPERVISOR_xenpmu_op(XENPMU_feature_set, &params);
+	if (rc)
+		return rc;
+
+	return len;
+}
+
+/*
+ * "pmu_features" show: read the feature word with XENPMU_feature_get and
+ * print its low 32 bits as "0x..." hex.  A hypercall error is returned as is.
+ */
+static ssize_t pmu_features_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	struct xen_pmu_params params;
+	int rc;
+
+	params.version.maj = XENPMU_VER_MAJ;
+	params.version.min = XENPMU_VER_MIN;
+
+	rc = HYPERVISOR_xenpmu_op(XENPMU_feature_get, &params);
+	if (rc != 0)
+		return rc;
+
+	return sprintf(buffer, "0x%x\n", (uint32_t)params.val);
+}
+HYPERVISOR_ATTR_RW(pmu_features);
+
+/* NULL-terminated list of the attributes placed in the "pmu" group. */
+static struct attribute *xen_pmu_attrs[] = {
+	&pmu_mode_attr.attr,
+	&pmu_features_attr.attr,
+	NULL
+};
+
+/* sysfs attribute group named "pmu". */
+static const struct attribute_group xen_pmu_group = {
+	.name = "pmu",
+	.attrs = xen_pmu_attrs,
+};
+
+/*
+ * Create the "pmu" group under hypervisor_kobj.
+ * Returns the result of sysfs_create_group().
+ */
+static int __init xen_sysfs_pmu_init(void)
+{
+	return sysfs_create_group(hypervisor_kobj, &xen_pmu_group);
+}
+#endif
+
+/*
+ * Register all Xen sysfs entries under hypervisor_kobj.
+ *
+ * The entries are created one after another; when a step fails, every entry
+ * created by the earlier steps is removed again in reverse order through the
+ * label ladder at the bottom.  Each label is named after the step that
+ * failed, so jumping to it undoes only the steps before it.
+ *
+ * Returns 0 on success, -ENODEV when not running on Xen, or the error of the
+ * step that failed.
+ */
+static int __init hyper_sysfs_init(void)
+{
+	int ret;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	ret = xen_sysfs_type_init();
+	if (ret)
+		goto out;
+	ret = xen_sysfs_guest_type_init();
+	if (ret)
+		goto guest_type_out;
+	ret = xen_sysfs_version_init();
+	if (ret)
+		goto version_out;
+	ret = xen_sysfs_compilation_init();
+	if (ret)
+		goto comp_out;
+	ret = xen_sysfs_uuid_init();
+	if (ret)
+		goto uuid_out;
+	ret = xen_sysfs_properties_init();
+	if (ret)
+		goto prop_out;
+#ifdef CONFIG_XEN_HAVE_VPMU
+	/* the "pmu" group is only created in the initial domain */
+	if (xen_initial_domain()) {
+		ret = xen_sysfs_pmu_init();
+		if (ret) {
+			/*
+			 * prop_out does not remove the properties group
+			 * (it is the target for a failed properties step),
+			 * so drop it here before unwinding the rest.
+			 */
+			sysfs_remove_group(hypervisor_kobj,
+					   &xen_properties_group);
+			goto prop_out;
+		}
+	}
+#endif
+	/* success: ret is 0 here, skip the unwind ladder */
+	goto out;
+
+prop_out:
+	sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
+uuid_out:
+	sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
+comp_out:
+	sysfs_remove_group(hypervisor_kobj, &version_group);
+version_out:
+	sysfs_remove_file(hypervisor_kobj, &guest_type_attr.attr);
+guest_type_out:
+	sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
+out:
+	return ret;
+}
+device_initcall(hyper_sysfs_init);
+
+/*
+ * sysfs_ops ->show: forward to the show() callback of the hyp_sysfs_attr
+ * that embeds @attr; returns 0 when the attribute has no show() callback.
+ */
+static ssize_t hyp_sysfs_show(struct kobject *kobj,
+			      struct attribute *attr,
+			      char *buffer)
+{
+	struct hyp_sysfs_attr *hyp_attr =
+		container_of(attr, struct hyp_sysfs_attr, attr);
+
+	if (!hyp_attr->show)
+		return 0;
+
+	return hyp_attr->show(hyp_attr, buffer);
+}
+
+/*
+ * sysfs_ops ->store: forward to the store() callback of the hyp_sysfs_attr
+ * that embeds @attr; returns 0 when the attribute has no store() callback.
+ */
+static ssize_t hyp_sysfs_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buffer,
+			       size_t len)
+{
+	struct hyp_sysfs_attr *hyp_attr =
+		container_of(attr, struct hyp_sysfs_attr, attr);
+
+	if (!hyp_attr->store)
+		return 0;
+
+	return hyp_attr->store(hyp_attr, buffer, len);
+}
+
+/* Dispatchers that route sysfs reads/writes to the hyp_sysfs_attr callbacks. */
+static const struct sysfs_ops hyp_sysfs_ops = {
+	.show = hyp_sysfs_show,
+	.store = hyp_sysfs_store,
+};
+
+/* kobj_type assigned to hypervisor_kobj by hypervisor_subsys_init(). */
+static struct kobj_type hyp_sysfs_kobj_type = {
+	.sysfs_ops = &hyp_sysfs_ops,
+};
+
+/*
+ * When running on Xen, point hypervisor_kobj at hyp_sysfs_kobj_type so its
+ * attributes are served by hyp_sysfs_ops.  Returns -ENODEV otherwise.
+ */
+static int __init hypervisor_subsys_init(void)
+{
+	if (xen_domain()) {
+		hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
+		return 0;
+	}
+
+	return -ENODEV;
+}
+device_initcall(hypervisor_subsys_init);
diff --git a/drivers/xen/time.c b/drivers/xen/time.c
new file mode 100644
index 000000000..3e741cd14
--- /dev/null
+++ b/drivers/xen/time.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Xen stolen ticks accounting.
+ */
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/math64.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+
+#include <asm/paravirt.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/events.h>
+#include <xen/features.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+#include <xen/xen-ops.h>
+
+/* runstate info updated by Xen */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
+
+static DEFINE_PER_CPU(u64[4], old_runstate_time);
+
+/*
+ * Return a consistent snapshot of the 64-bit time/counter value at @p.
+ *
+ * When BITS_PER_LONG is 64 or more a single READ_ONCE() is used.  On
+ * narrower builds the value is read as two 32-bit halves and the read is
+ * repeated until the high half is the same before and after reading the
+ * low half.
+ */
+static u64 get64(const u64 *p)
+{
+	u64 ret;
+
+	if (BITS_PER_LONG < 64) {
+		u32 *p32 = (u32 *)p;
+		u32 h, l, h2;
+
+		/*
+		 * Read high then low, and then make sure high is
+		 * still the same; this will only loop if low wraps
+		 * and carries into high.
+		 * XXX some clean way to make this endian-proof?
+		 */
+		do {
+			h = READ_ONCE(p32[1]);
+			l = READ_ONCE(p32[0]);
+			h2 = READ_ONCE(p32[1]);
+		} while(h2 != h);
+
+		ret = (((u64)h) << 32) | l;
+	} else
+		ret = READ_ONCE(*p);
+
+	return ret;
+}
+
+/*
+ * Copy @cpu's runstate area into @res, retrying until the copy is
+ * consistent.
+ *
+ * The copy is accepted only when state_entry_time read before and after it
+ * is identical and does not have XEN_RUNSTATE_UPDATE set; otherwise the
+ * whole sequence is repeated.  Must be called with preemption disabled
+ * (enforced by the BUG_ON below).
+ */
+static void xen_get_runstate_snapshot_cpu_delta(
+			      struct vcpu_runstate_info *res, unsigned int cpu)
+{
+	u64 state_time;
+	struct vcpu_runstate_info *state;
+
+	BUG_ON(preemptible());
+
+	state = per_cpu_ptr(&xen_runstate, cpu);
+
+	do {
+		state_time = get64(&state->state_entry_time);
+		rmb();	/* Hypervisor might update data. */
+		*res = READ_ONCE(*state);
+		rmb();	/* Hypervisor might update data. */
+	} while (get64(&state->state_entry_time) != state_time ||
+		 (state_time & XEN_RUNSTATE_UPDATE));
+}
+
+/*
+ * Snapshot @cpu's runstate into @res and add the per-cpu totals kept in
+ * old_runstate_time to each of the four time[] slots.
+ */
+static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
+					  unsigned int cpu)
+{
+	const u64 *carried;
+	int slot = 0;
+
+	xen_get_runstate_snapshot_cpu_delta(res, cpu);
+
+	carried = per_cpu(old_runstate_time, cpu);
+	while (slot < 4) {
+		res->time[slot] += carried[slot];
+		slot++;
+	}
+}
+
+/*
+ * Carry runstate times across a suspend/resume cycle.
+ *
+ * @action == -1: before suspend; snapshot the runstate times of every
+ *                possible CPU into a temporary array.
+ * @action ==  0: after resume; add the saved snapshot to the per-cpu
+ *                old_runstate_time totals, then free the array.
+ * any other    : checkpointing; free the array without accumulating.
+ */
+void xen_manage_runstate_time(int action)
+{
+	static struct vcpu_runstate_info *runstate_delta;
+	struct vcpu_runstate_info state;
+	int cpu, i;
+
+	switch (action) {
+	case -1: /* backup runstate time before suspend */
+		if (unlikely(runstate_delta))
+			pr_warn_once("%s: memory leak as runstate_delta is not NULL\n",
+					__func__);
+
+		/*
+		 * The array is indexed by CPU number below, so it must hold
+		 * nr_cpu_ids entries: with a sparse possible mask the highest
+		 * CPU number can exceed num_possible_cpus() and the old
+		 * sizing would be overrun.
+		 */
+		runstate_delta = kmalloc_array(nr_cpu_ids,
+					sizeof(*runstate_delta),
+					GFP_ATOMIC);
+		if (unlikely(!runstate_delta)) {
+			pr_warn("%s: failed to allocate runstate_delta\n",
+					__func__);
+			return;
+		}
+
+		for_each_possible_cpu(cpu) {
+			xen_get_runstate_snapshot_cpu_delta(&state, cpu);
+			memcpy(runstate_delta[cpu].time, state.time,
+					sizeof(runstate_delta[cpu].time));
+		}
+
+		break;
+
+	case 0: /* accumulate the saved runstate time after resume */
+		if (unlikely(!runstate_delta)) {
+			pr_warn("%s: cannot accumulate runstate time as runstate_delta is NULL\n",
+					__func__);
+			return;
+		}
+
+		for_each_possible_cpu(cpu) {
+			for (i = 0; i < 4; i++)
+				per_cpu(old_runstate_time, cpu)[i] +=
+					runstate_delta[cpu].time[i];
+		}
+
+		break;
+
+	default: /* do not accumulate runstate time for checkpointing */
+		break;
+	}
+
+	/* the snapshot is only kept between the -1 call and the next call */
+	if (action != -1 && runstate_delta) {
+		kfree(runstate_delta);
+		runstate_delta = NULL;
+	}
+}
+
+/*
+ * Runstate accounting: snapshot the runstate of the CPU this code is
+ * currently running on into @res.
+ */
+void xen_get_runstate_snapshot(struct vcpu_runstate_info *res)
+{
+	unsigned int this_cpu = smp_processor_id();
+
+	xen_get_runstate_snapshot_cpu(res, this_cpu);
+}
+
+/*
+ * Return true when @vcpu could run but has no real cpu to run on, i.e. its
+ * runstate is RUNSTATE_runnable.
+ */
+bool xen_vcpu_stolen(int vcpu)
+{
+	const struct vcpu_runstate_info *rs = &per_cpu(xen_runstate, vcpu);
+
+	return rs->state == RUNSTATE_runnable;
+}
+
+/*
+ * Steal time of @cpu: the sum of its accumulated RUNSTATE_runnable and
+ * RUNSTATE_offline times.
+ */
+u64 xen_steal_clock(int cpu)
+{
+	struct vcpu_runstate_info snap;
+	u64 stolen;
+
+	xen_get_runstate_snapshot_cpu(&snap, cpu);
+
+	stolen = snap.time[RUNSTATE_runnable];
+	stolen += snap.time[RUNSTATE_offline];
+
+	return stolen;
+}
+
+/*
+ * Register @cpu's per-cpu xen_runstate area with the hypervisor as the
+ * runstate memory area of the corresponding vcpu.  Failure is fatal.
+ */
+void xen_setup_runstate_info(int cpu)
+{
+	struct vcpu_register_runstate_memory_area area;
+	int rc;
+
+	area.addr.v = &per_cpu(xen_runstate, cpu);
+
+	rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
+				xen_vcpu_nr(cpu), &area);
+	if (rc)
+		BUG();
+}
+
+/*
+ * Install xen_steal_clock as the paravirt steal clock and enable steal time
+ * accounting.  paravirt_steal_rq_enabled is additionally enabled only when
+ * the VMASST_TYPE_runstate_update_flag assist could be switched on.
+ */
+void __init xen_time_setup_guest(void)
+{
+	bool update_flag_ok;
+
+	update_flag_ok = !HYPERVISOR_vm_assist(VMASST_CMD_enable,
+					VMASST_TYPE_runstate_update_flag);
+
+	pv_time_ops.steal_clock = xen_steal_clock;
+	static_key_slow_inc(&paravirt_steal_enabled);
+
+	if (!update_flag_ok)
+		return;
+
+	static_key_slow_inc(&paravirt_steal_rq_enabled);
+}
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
new file mode 100644
index 000000000..04e7b3b29
--- /dev/null
+++ b/drivers/xen/tmem.c
@@ -0,0 +1,418 @@
+/*
+ * Xen implementation for transcendent memory (tmem)
+ *
+ * Copyright (C) 2009-2011 Oracle Corp.  All rights reserved.
+ * Author: Dan Magenheimer
+ */
+
+#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/cleancache.h>
+#include <linux/frontswap.h>
+
+#include <xen/xen.h>
+#include <xen/interface/xen.h>
+#include <xen/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/tmem.h>
+
+#ifndef CONFIG_XEN_TMEM_MODULE
+/* Off by default; checked by xen_tmem_init() before registering any ops. */
+bool __read_mostly tmem_enabled = false;
+
+/* Handler for the "tmem" kernel command-line option: turns tmem on. */
+static int __init enable_tmem(char *s)
+{
+	tmem_enabled = true;
+	return 1;
+}
+__setup("tmem", enable_tmem);
+#endif
+
+/*
+ * Read-only (S_IRUGO) boolean module parameters, all defaulting to true.
+ * xen_tmem_init() consults them to decide which tmem users to set up.
+ */
+#ifdef CONFIG_CLEANCACHE
+static bool cleancache __read_mostly = true;
+module_param(cleancache, bool, S_IRUGO);
+static bool selfballooning __read_mostly = true;
+module_param(selfballooning, bool, S_IRUGO);
+#endif /* CONFIG_CLEANCACHE */
+
+#ifdef CONFIG_FRONTSWAP
+static bool frontswap __read_mostly = true;
+module_param(frontswap, bool, S_IRUGO);
+#else /* CONFIG_FRONTSWAP */
+/* without frontswap support the flag is a constant false */
+#define frontswap (0)
+#endif /* CONFIG_FRONTSWAP */
+
+#ifdef CONFIG_XEN_SELFBALLOONING
+static bool selfshrinking __read_mostly = true;
+module_param(selfshrinking, bool, S_IRUGO);
+#endif /* CONFIG_XEN_SELFBALLOONING */
+
+/* Command codes stored in tmem_op.cmd */
+#define TMEM_CONTROL               0
+#define TMEM_NEW_POOL              1
+#define TMEM_DESTROY_POOL          2
+#define TMEM_NEW_PAGE              3
+#define TMEM_PUT_PAGE              4
+#define TMEM_GET_PAGE              5
+#define TMEM_FLUSH_PAGE            6
+#define TMEM_FLUSH_OBJECT          7
+#define TMEM_READ                  8
+#define TMEM_WRITE                 9
+#define TMEM_XCHG                 10
+
+/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
+#define TMEM_POOL_PERSIST          1
+#define TMEM_POOL_SHARED           2
+#define TMEM_POOL_PAGESIZE_SHIFT   4
+#define TMEM_VERSION_SHIFT        24
+
+
+/* 128-bit pool identifier, split into two 64-bit halves. */
+struct tmem_pool_uuid {
+	u64 uuid_lo;
+	u64 uuid_hi;
+};
+
+/* 192-bit object identifier passed to the tmem hypercall. */
+struct tmem_oid {
+	u64 oid[3];
+};
+
+/* all-zero uuid used for the private (non-shared) pools created below */
+#define TMEM_POOL_PRIVATE_UUID	{ 0, 0 }
+
+/*
+ * flags for tmem_ops.new_pool
+ * NOTE(review): identical re-definition of the two macros above; harmless,
+ * but one of the pairs is redundant.
+ */
+#define TMEM_POOL_PERSIST          1
+#define TMEM_POOL_SHARED           2
+
+/* xen tmem foundation ops/hypercalls */
+
+/*
+ * Fill in the generic ("gen") form of a tmem_op from the arguments and
+ * issue the tmem hypercall.  Returns the hypercall result.
+ */
+static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid,
+	u32 index, unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len)
+{
+	struct tmem_op op;
+	int word;
+
+	op.cmd = tmem_cmd;
+	op.pool_id = tmem_pool;
+	for (word = 0; word < 3; word++)
+		op.u.gen.oid[word] = oid.oid[word];
+	op.u.gen.index = index;
+	op.u.gen.tmem_offset = tmem_offset;
+	op.u.gen.pfn_offset = pfn_offset;
+	op.u.gen.len = len;
+	set_xen_guest_handle(op.u.gen.gmfn, (void *)gmfn);
+
+	return HYPERVISOR_tmem_op(&op);
+}
+
+/*
+ * Ask the hypervisor for a new tmem pool.
+ *
+ * @uuid:     pool uuid (TMEM_POOL_PRIVATE_UUID for a private pool)
+ * @flags:    TMEM_POOL_PERSIST and/or TMEM_POOL_SHARED
+ * @pagesize: page size in bytes; encoded into the flags word as
+ *            log2(pagesize) - 12
+ *
+ * Returns the hypercall result, or -EINVAL when @pagesize is smaller than
+ * 4K.  Callers in this file treat a negative value as "no pool".
+ */
+static int xen_tmem_new_pool(struct tmem_pool_uuid uuid,
+				u32 flags, unsigned long pagesize)
+{
+	const int min_pageshift = 12;
+	struct tmem_op op;
+	int pageshift;
+
+	/*
+	 * A pagesize of 0 would never terminate the loop below, and anything
+	 * under 4K would yield a negative shift count in the flags word.
+	 */
+	if (pagesize < (1UL << min_pageshift))
+		return -EINVAL;
+
+	for (pageshift = 0; pagesize != 1; pageshift++)
+		pagesize >>= 1;
+	flags |= (pageshift - min_pageshift) << TMEM_POOL_PAGESIZE_SHIFT;
+	flags |= TMEM_SPEC_VERSION << TMEM_VERSION_SHIFT;
+
+	/* do not hand uninitialised stack contents to the hypervisor */
+	memset(&op, 0, sizeof(op));
+	op.cmd = TMEM_NEW_POOL;
+	op.u.new.uuid[0] = uuid.uuid_lo;
+	op.u.new.uuid[1] = uuid.uuid_hi;
+	op.u.new.flags = flags;
+
+	return HYPERVISOR_tmem_op(&op);
+}
+
+/* xen generic tmem ops */
+
+/* TMEM_PUT_PAGE for @page (passed by its gfn) under (@pool_id, @oid, @index). */
+static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid,
+			     u32 index, struct page *page)
+{
+	unsigned long gfn = xen_page_to_gfn(page);
+
+	return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index, gfn, 0, 0, 0);
+}
+
+/* TMEM_GET_PAGE into @page (passed by its gfn) for (@pool_id, @oid, @index). */
+static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid,
+			     u32 index, struct page *page)
+{
+	unsigned long gfn = xen_page_to_gfn(page);
+
+	return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index, gfn, 0, 0, 0);
+}
+
+/* TMEM_FLUSH_PAGE for the single page (@pool_id, @oid, @index). */
+static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index)
+{
+	const unsigned long no_gmfn = 0;
+
+	return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, oid, index,
+			   no_gmfn, 0, 0, 0);
+}
+
+/* TMEM_FLUSH_OBJECT for the whole object (@pool_id, @oid). */
+static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid)
+{
+	const u32 no_index = 0;
+
+	return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, no_index,
+			   0, 0, 0, 0);
+}
+
+
+#ifdef CONFIG_CLEANCACHE
+/* TMEM_DESTROY_POOL for @pool_id; the object id is passed as all zeroes. */
+static int xen_tmem_destroy_pool(u32 pool_id)
+{
+	struct tmem_oid zero_oid = { .oid = { 0, 0, 0 } };
+
+	return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, zero_oid,
+			   0, 0, 0, 0, 0);
+}
+
+/* cleancache ops */
+
+/*
+ * cleancache ->put_page: offer @page to tmem.  Silently does nothing for a
+ * negative @pool or an @index that does not fit in 32 bits; the hypercall
+ * result is ignored.
+ */
+static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key,
+				     pgoff_t index, struct page *page)
+{
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+	u32 ind = (u32) index;
+
+	if (pool < 0 || ind != index)
+		return;
+
+	mb(); /* ensure page is quiescent; tmem may address it with an alias */
+	(void)xen_tmem_put_page((u32)pool, oid, ind, page);
+}
+
+/*
+ * cleancache ->get_page: try to fetch @page from tmem.
+ * Returns 0 when the hypercall returned 1, and -1 in every other case
+ * (negative @pool, @index wider than 32 bits, or any other result).
+ */
+static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key,
+				    pgoff_t index, struct page *page)
+{
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+	u32 ind = (u32) index;
+
+	if (pool < 0 || ind != index)
+		return -1;
+
+	/* translate return values to linux semantics */
+	return xen_tmem_get_page((u32)pool, oid, ind, page) == 1 ? 0 : -1;
+}
+
+/*
+ * cleancache ->invalidate_page: flush one page from tmem.  Does nothing for
+ * a negative @pool or an @index that does not fit in 32 bits.
+ */
+static void tmem_cleancache_flush_page(int pool, struct cleancache_filekey key,
+				       pgoff_t index)
+{
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+	u32 ind = (u32) index;
+
+	if (pool < 0 || ind != index)
+		return;
+
+	(void)xen_tmem_flush_page((u32)pool, oid, ind);
+}
+
+/* cleancache ->invalidate_inode: flush the whole object named by @key. */
+static void tmem_cleancache_flush_inode(int pool, struct cleancache_filekey key)
+{
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+
+	if (pool >= 0)
+		(void)xen_tmem_flush_object((u32)pool, oid);
+}
+
+/* cleancache ->invalidate_fs: destroy @pool unless it is negative. */
+static void tmem_cleancache_flush_fs(int pool)
+{
+	if (pool >= 0)
+		(void)xen_tmem_destroy_pool((u32)pool);
+}
+
+/* cleancache ->init_fs: create a private pool (no flags) for @pagesize. */
+static int tmem_cleancache_init_fs(size_t pagesize)
+{
+	struct tmem_pool_uuid private_uuid = TMEM_POOL_PRIVATE_UUID;
+	const u32 no_flags = 0;
+
+	return xen_tmem_new_pool(private_uuid, no_flags, pagesize);
+}
+
+/*
+ * cleancache ->init_shared_fs: create a TMEM_POOL_SHARED pool identified by
+ * the 16 bytes of @uuid (bytes 0-7 -> uuid_lo, bytes 8-15 -> uuid_hi, in
+ * native byte order).
+ */
+static int tmem_cleancache_init_shared_fs(uuid_t *uuid, size_t pagesize)
+{
+	struct tmem_pool_uuid shared_uuid;
+
+	/*
+	 * uuid->b is a byte array, so copy the halves instead of
+	 * dereferencing it through a u64 pointer, which assumed an
+	 * alignment the type does not provide.
+	 */
+	memcpy(&shared_uuid.uuid_lo, &uuid->b[0], sizeof(shared_uuid.uuid_lo));
+	memcpy(&shared_uuid.uuid_hi, &uuid->b[8], sizeof(shared_uuid.uuid_hi));
+
+	return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize);
+}
+
+/* cleancache backend operations, registered by xen_tmem_init(). */
+static const struct cleancache_ops tmem_cleancache_ops = {
+	.put_page = tmem_cleancache_put_page,
+	.get_page = tmem_cleancache_get_page,
+	.invalidate_page = tmem_cleancache_flush_page,
+	.invalidate_inode = tmem_cleancache_flush_inode,
+	.invalidate_fs = tmem_cleancache_flush_fs,
+	.init_shared_fs = tmem_cleancache_init_shared_fs,
+	.init_fs = tmem_cleancache_init_fs
+};
+#endif
+
+#ifdef CONFIG_FRONTSWAP
+/* frontswap tmem operations */
+
+/*
+ * a single tmem poolid is used for all frontswap "types" (swapfiles);
+ * xen_tmem_init() sets it to -1 and tmem_frontswap_init() replaces that
+ * with the id returned for a newly created pool
+ */
+static int tmem_frontswap_poolid;
+
+/*
+ * Swizzling increases objects per swaptype, increasing tmem concurrency
+ * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
+ *
+ * _oswiz() builds an object id from the swap type and the low SWIZ_BITS of
+ * the page index; iswiz() yields the remaining high bits of the index.
+ * Arguments are parenthesised so that expressions can be passed safely.
+ */
+#define SWIZ_BITS		4
+#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
+#define _oswiz(_type, _ind)	(((_type) << SWIZ_BITS) | ((_ind) & SWIZ_MASK))
+#define iswiz(_ind)		((_ind) >> SWIZ_BITS)
+
+/* Build the tmem object id for swap @type and page index @ind. */
+static inline struct tmem_oid oswiz(unsigned type, u32 ind)
+{
+	struct tmem_oid swizzled = { .oid = { _oswiz(type, ind), 0, 0 } };
+
+	return swizzled;
+}
+
+/*
+ * frontswap ->store: put @page into the frontswap pool.
+ * Returns 0 when the hypercall returned 1, and -1 otherwise, including for
+ * transparent huge pages (not supported), a negative pool id, or an
+ * @offset that does not fit in 32 bits.
+ */
+static int tmem_frontswap_store(unsigned type, pgoff_t offset,
+				   struct page *page)
+{
+	int pool = tmem_frontswap_poolid;
+	u64 wide = (u64)offset;
+	u32 ind = (u32)offset;
+
+	/* THP isn't supported */
+	if (PageTransHuge(page) || pool < 0 || wide != ind)
+		return -1;
+
+	mb(); /* ensure page is quiescent; tmem may address it with an alias */
+
+	/* translate Xen tmem return values to linux semantics */
+	if (xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), page) != 1)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * frontswap ->load: fetch @page from the frontswap pool.
+ * Returns 0 if the page was successfully gotten, -1 if it was not present
+ * (should never happen!), the pool id is negative, or @offset does not fit
+ * in 32 bits.
+ */
+static int tmem_frontswap_load(unsigned type, pgoff_t offset,
+				   struct page *page)
+{
+	int pool = tmem_frontswap_poolid;
+	u64 wide = (u64)offset;
+	u32 ind = (u32)offset;
+
+	if (pool < 0 || wide != ind)
+		return -1;
+
+	/* translate Xen tmem return values to linux semantics */
+	if (xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), page) != 1)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * frontswap ->invalidate_page: flush a single page.  Does nothing when the
+ * pool id is negative or @offset does not fit in 32 bits.
+ */
+static void tmem_frontswap_flush_page(unsigned type, pgoff_t offset)
+{
+	int pool = tmem_frontswap_poolid;
+	u64 wide = (u64)offset;
+	u32 ind = (u32)offset;
+
+	if (pool < 0 || wide != ind)
+		return;
+
+	(void) xen_tmem_flush_page(pool, oswiz(type, ind), iswiz(ind));
+}
+
+/*
+ * frontswap ->invalidate_area: flush every object of swap @type, i.e. one
+ * object per swizzle value from SWIZ_MASK down to 0.
+ */
+static void tmem_frontswap_flush_area(unsigned type)
+{
+	int pool = tmem_frontswap_poolid;
+	int swiz = SWIZ_MASK + 1;
+
+	if (pool < 0)
+		return;
+
+	while (swiz-- > 0)
+		(void)xen_tmem_flush_object(pool, oswiz(type, swiz));
+}
+
+/*
+ * frontswap ->init: create the persistent private pool on first use.
+ * A single tmem poolid is used for all frontswap "types" (swapfiles), so
+ * nothing is done once a non-negative pool id is recorded.
+ */
+static void tmem_frontswap_init(unsigned ignored)
+{
+	struct tmem_pool_uuid private = TMEM_POOL_PRIVATE_UUID;
+
+	if (tmem_frontswap_poolid >= 0)
+		return;
+
+	tmem_frontswap_poolid =
+		xen_tmem_new_pool(private, TMEM_POOL_PERSIST, PAGE_SIZE);
+}
+
+/* frontswap backend operations, registered by xen_tmem_init(). */
+static struct frontswap_ops tmem_frontswap_ops = {
+	.store = tmem_frontswap_store,
+	.load = tmem_frontswap_load,
+	.invalidate_page = tmem_frontswap_flush_page,
+	.invalidate_area = tmem_frontswap_flush_area,
+	.init = tmem_frontswap_init
+};
+#endif
+
+/*
+ * Module init: when running on Xen and tmem is enabled, register the
+ * frontswap and cleancache backends that are configured in and not turned
+ * off by module parameter, then start selfballooning if configured.
+ * Always returns 0.
+ */
+static int __init xen_tmem_init(void)
+{
+	if (!xen_domain())
+		return 0;
+#ifdef CONFIG_FRONTSWAP
+	if (tmem_enabled && frontswap) {
+		/* no pool yet; tmem_frontswap_init() creates it on demand */
+		tmem_frontswap_poolid = -1;
+		frontswap_register_ops(&tmem_frontswap_ops);
+		pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory\n");
+	}
+#endif
+#ifdef CONFIG_CLEANCACHE
+	/* cleancache keys are reinterpreted as tmem object ids */
+	BUILD_BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid));
+	if (tmem_enabled && cleancache) {
+		int err;
+
+		err = cleancache_register_ops(&tmem_cleancache_ops);
+		if (err)
+			pr_warn("xen-tmem: failed to enable cleancache: %d\n",
+				err);
+		else
+			pr_info("cleancache enabled, RAM provided by "
+				"Xen Transcendent Memory\n");
+	}
+#endif
+#ifdef CONFIG_XEN_SELFBALLOONING
+	/*
+	 * There is no point of driving pages to the swap system if they
+	 * aren't going anywhere in tmem universe.
+	 */
+	if (!frontswap) {
+		selfshrinking = false;
+		selfballooning = false;
+	}
+	xen_selfballoon_init(selfballooning, selfshrinking);
+#endif
+	return 0;
+}
+
+module_init(xen_tmem_init)
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dan Magenheimer <dan.magenheimer@oracle.com>");
+MODULE_DESCRIPTION("Shim to Xen transcendent memory");
diff --git a/drivers/xen/xen-acpi-cpuhotplug.c b/drivers/xen/xen-acpi-cpuhotplug.c
new file mode 100644
index 000000000..fdc9e67b8
--- /dev/null
+++ b/drivers/xen/xen-acpi-cpuhotplug.c
@@ -0,0 +1,456 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ *    Author: Liu Jinsong <jinsong.liu@intel.com>
+ *    Author: Jiang Yunhong <yunhong.jiang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/cpu.h>
+#include <linux/acpi.h>
+#include <linux/uaccess.h>
+#include <acpi/processor.h>
+#include <xen/acpi.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+
+#define PREFIX "ACPI:xen_cpu_hotplug:"
+
+#define INSTALL_NOTIFY_HANDLER		0
+#define UNINSTALL_NOTIFY_HANDLER	1
+
+static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr);
+
+/* --------------------------------------------------------------------------
+				Driver Interface
+-------------------------------------------------------------------------- */
+
+/*
+ * Determine the ACPI id of the processor behind @device, look up its Xen
+ * pcpu id, and hot-add the CPU when that id is invalid.
+ *
+ * Returns 0 on success and -ENODEV when the ACPI evaluation or the hot-add
+ * fails.
+ */
+static int xen_acpi_processor_enable(struct acpi_device *device)
+{
+	struct acpi_processor *pr = acpi_driver_data(device);
+	union acpi_object object = { 0 };
+	struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
+	unsigned long long uid;
+	acpi_status status;
+
+	if (strcmp(acpi_device_hid(device), ACPI_PROCESSOR_OBJECT_HID) == 0) {
+		/* Declared with "Processor" statement; match ProcessorID */
+		status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer);
+		if (ACPI_FAILURE(status)) {
+			pr_err(PREFIX "Evaluating processor object\n");
+			return -ENODEV;
+		}
+
+		pr->acpi_id = object.processor.proc_id;
+	} else {
+		/* Declared with "Device" statement; match _UID */
+		status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID,
+						NULL, &uid);
+		if (ACPI_FAILURE(status)) {
+			pr_err(PREFIX "Evaluating processor _UID\n");
+			return -ENODEV;
+		}
+
+		pr->acpi_id = uid;
+	}
+
+	pr->id = xen_pcpu_id(pr->acpi_id);
+	if (!invalid_logical_cpuid(pr->id))
+		return 0;
+
+	/* This cpu is not presented at hypervisor, try to hotadd it */
+	if (ACPI_FAILURE(xen_acpi_cpu_hotadd(pr))) {
+		pr_err(PREFIX "Hotadd CPU (acpi_id = %d) failed.\n",
+				pr->acpi_id);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/*
+ * ACPI driver ->add: allocate the acpi_processor for @device, attach it as
+ * driver data and enable the processor.
+ *
+ * Returns 0 on success, -EINVAL for a NULL @device, -ENOMEM when the
+ * allocation fails, or the error from xen_acpi_processor_enable().
+ */
+static int xen_acpi_processor_add(struct acpi_device *device)
+{
+	int ret;
+	struct acpi_processor *pr;
+
+	if (!device)
+		return -EINVAL;
+
+	pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
+	if (!pr)
+		return -ENOMEM;
+
+	pr->handle = device->handle;
+	strcpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME);
+	strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS);
+	device->driver_data = pr;
+
+	ret = xen_acpi_processor_enable(device);
+	if (ret) {
+		pr_err(PREFIX "Error when enabling Xen processor\n");
+		/*
+		 * Release the processor object on failure so it is neither
+		 * leaked nor left reachable through driver_data.
+		 */
+		device->driver_data = NULL;
+		kfree(pr);
+	}
+
+	return ret;
+}
+
+/*
+ * ACPI driver ->remove: free the acpi_processor attached to @device.
+ * Returns -EINVAL when @device or its driver data is NULL, else 0.
+ */
+static int xen_acpi_processor_remove(struct acpi_device *device)
+{
+	struct acpi_processor *pr = NULL;
+
+	if (device)
+		pr = acpi_driver_data(device);
+	if (!pr)
+		return -EINVAL;
+
+	kfree(pr);
+	return 0;
+}
+
+/*--------------------------------------------------------------
+		Acpi processor hotplug support
+--------------------------------------------------------------*/
+
+/*
+ * Evaluate _STA on @handle.  Returns 1 when the evaluation succeeds and
+ * ACPI_STA_DEVICE_PRESENT is set, 0 otherwise (after logging why).
+ */
+static int is_processor_present(acpi_handle handle)
+{
+	acpi_status status;
+	unsigned long long sta = 0;
+
+
+	status = acpi_evaluate_integer(handle, "_STA", NULL, &sta);
+
+	if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT))
+		return 1;
+
+	/*
+	 * _STA is mandatory for a processor that supports hot plug
+	 */
+	if (status == AE_NOT_FOUND)
+		pr_info(PREFIX "Processor does not support hot plug\n");
+	else
+		/* terminate the line like every other message in this file */
+		pr_info(PREFIX "Processor Device is not present\n");
+	return 0;
+}
+
+/*
+ * Evaluate _MAT on @handle and extract the local APIC id from it.
+ *
+ * Returns the APIC id, or -EINVAL when _MAT cannot be evaluated, does not
+ * return a buffer large enough for a local APIC entry, or the entry is not
+ * an enabled ACPI_MADT_TYPE_LOCAL_APIC.
+ */
+static int xen_apic_id(acpi_handle handle)
+{
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	union acpi_object *obj;
+	struct acpi_madt_local_apic *lapic;
+	int apic_id = -EINVAL;
+
+	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
+		return -EINVAL;
+
+	if (!buffer.length || !buffer.pointer)
+		goto out;
+
+	obj = buffer.pointer;
+	if (obj->type != ACPI_TYPE_BUFFER ||
+	    obj->buffer.length < sizeof(*lapic))
+		goto out;
+
+	lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer;
+
+	if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC ||
+	    !(lapic->lapic_flags & ACPI_MADT_ENABLED))
+		goto out;
+
+	apic_id = (uint32_t)lapic->id;
+
+out:
+	/* single exit: the _MAT result buffer is released on every path */
+	kfree(buffer.pointer);
+	return apic_id;
+}
+
+/*
+ * Ask the hypervisor to hot-add the CPU described by @pr.
+ *
+ * Returns the result of the XENPF_cpu_hotadd platform op (stored by the
+ * caller as the cpu id), -ENODEV when the APIC id cannot be determined, or
+ * the negative value returned by xen_acpi_get_pxm().
+ */
+static int xen_hotadd_cpu(struct acpi_processor *pr)
+{
+	struct xen_platform_op op;
+	int apic_id, pxm, rc;
+
+	apic_id = xen_apic_id(pr->handle);
+	if (apic_id < 0) {
+		pr_err(PREFIX "Failed to get apic_id for acpi_id %d\n",
+				pr->acpi_id);
+		return -ENODEV;
+	}
+
+	pxm = xen_acpi_get_pxm(pr->handle);
+	if (pxm < 0) {
+		pr_err(PREFIX "Failed to get _PXM for acpi_id %d\n",
+				pr->acpi_id);
+		return pxm;
+	}
+
+	op.cmd = XENPF_cpu_hotadd;
+	op.u.cpu_add.apic_id = apic_id;
+	op.u.cpu_add.acpi_id = pr->acpi_id;
+	op.u.cpu_add.pxm = pxm;
+
+	rc = HYPERVISOR_platform_op(&op);
+	if (rc >= 0)
+		return rc;
+
+	pr_err(PREFIX "Failed to hotadd CPU for acpi_id %d\n",
+			pr->acpi_id);
+	return rc;
+}
+
+/*
+ * Hot-add the processor @pr if it is present.  Returns AE_OK on success and
+ * AE_ERROR when the processor is absent or the hot-add yields an invalid
+ * cpu id.
+ */
+static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr)
+{
+	if (is_processor_present(pr->handle)) {
+		pr->id = xen_hotadd_cpu(pr);
+		if (!invalid_logical_cpuid(pr->id)) {
+			/*
+			 * Sync with Xen hypervisor, providing new
+			 * /sys/.../xen_cpuX interface after cpu hotadded.
+			 */
+			xen_pcpu_hotplug_sync();
+			return AE_OK;
+		}
+	}
+
+	return AE_ERROR;
+}
+
+/*
+ * Placeholder for CPU hot-remove: only emits a debug message and reports
+ * -ENOSYS; @device is not touched.
+ */
+static int acpi_processor_device_remove(struct acpi_device *device)
+{
+	const int unsupported = -ENOSYS;
+
+	pr_debug(PREFIX "Xen does not support CPU hotremove\n");
+
+	return unsupported;
+}
+
+/*
+ * ACPI system-notify handler installed on processor objects.
+ *
+ * Bus/device check: if the processor is present and its device is not yet
+ * enumerated, scan the handle to add it.  Eject request: validated, but the
+ * actual removal is a stub.  For these events the outcome is reported to
+ * the firmware through _OST (default: non-specific failure).  Any other
+ * event is ignored without an _OST evaluation.
+ *
+ * The whole handler runs under the ACPI scan lock.
+ */
+static void acpi_processor_hotplug_notify(acpi_handle handle,
+					  u32 event, void *data)
+{
+	struct acpi_processor *pr;
+	struct acpi_device *device = NULL;
+	u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */
+	int result;
+
+	acpi_scan_lock_acquire();
+
+	switch (event) {
+	case ACPI_NOTIFY_BUS_CHECK:
+	case ACPI_NOTIFY_DEVICE_CHECK:
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+			"Processor driver received %s event\n",
+			(event == ACPI_NOTIFY_BUS_CHECK) ?
+			"ACPI_NOTIFY_BUS_CHECK" : "ACPI_NOTIFY_DEVICE_CHECK"));
+
+		if (!is_processor_present(handle))
+			break;
+
+		/*
+		 * Return value intentionally unchecked: device stays NULL on
+		 * failure.  NOTE(review): relies on acpi_device_enumerated()
+		 * accepting a NULL device - confirm.
+		 */
+		acpi_bus_get_device(handle, &device);
+		if (acpi_device_enumerated(device))
+			break;
+
+		result = acpi_bus_scan(handle);
+		if (result) {
+			pr_err(PREFIX "Unable to add the device\n");
+			break;
+		}
+		/* look the device up again now that the scan has run */
+		device = NULL;
+		acpi_bus_get_device(handle, &device);
+		if (!acpi_device_enumerated(device)) {
+			pr_err(PREFIX "Missing device object\n");
+			break;
+		}
+		ost_code = ACPI_OST_SC_SUCCESS;
+		break;
+
+	case ACPI_NOTIFY_EJECT_REQUEST:
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+				  "received ACPI_NOTIFY_EJECT_REQUEST\n"));
+
+		if (acpi_bus_get_device(handle, &device)) {
+			pr_err(PREFIX "Device don't exist, dropping EJECT\n");
+			break;
+		}
+		pr = acpi_driver_data(device);
+		if (!pr) {
+			pr_err(PREFIX "Driver data is NULL, dropping EJECT\n");
+			break;
+		}
+
+		/*
+		 * TBD: implement acpi_processor_device_remove if Xen supports
+		 * CPU hotremove in the future.
+		 */
+		acpi_processor_device_remove(device);
+		break;
+
+	default:
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+				  "Unsupported event [0x%x]\n", event));
+
+		/* non-hotplug event; possibly handled by other handler */
+		goto out;
+	}
+
+	/* report the outcome of the hotplug event to the firmware */
+	(void) acpi_evaluate_ost(handle, event, ost_code, NULL);
+
+out:
+	acpi_scan_lock_release();
+}
+
+/*
+ * Classify @handle.  Returns AE_OK for a processor object
+ * (ACPI_TYPE_PROCESSOR) or a device whose hardware id equals
+ * ACPI_PROCESSOR_DEVICE_HID, the failure status of acpi_get_object_info()
+ * if the lookup fails, and AE_ERROR otherwise.
+ */
+static acpi_status is_processor_device(acpi_handle handle)
+{
+	struct acpi_device_info *info;
+	acpi_status status;
+	char *hid;
+
+	status = acpi_get_object_info(handle, &info);
+	if (ACPI_FAILURE(status))
+		return status;
+
+	if (info->type == ACPI_TYPE_PROCESSOR) {
+		/* found a processor object */
+		status = AE_OK;
+	} else if (!(info->valid & ACPI_VALID_HID)) {
+		status = AE_ERROR;
+	} else {
+		hid = info->hardware_id.string;
+		if (hid != NULL && strcmp(hid, ACPI_PROCESSOR_DEVICE_HID) == 0)
+			status = AE_OK;	/* found a processor device object */
+		else
+			status = AE_ERROR;
+	}
+
+	kfree(info);
+	return status;
+}
+
+/*
+ * Namespace-walk callback: for every processor found, install or remove the
+ * hotplug notify handler depending on the int that @context points to.
+ * Non-processor nodes are skipped and the walk continues below them.
+ */
+static acpi_status
+processor_walk_namespace_cb(acpi_handle handle,
+			    u32 lvl, void *context, void **rv)
+{
+	int *action = context;
+
+	if (ACPI_FAILURE(is_processor_device(handle)))
+		return AE_OK;	/* not a processor; continue to walk */
+
+	if (*action == INSTALL_NOTIFY_HANDLER)
+		acpi_install_notify_handler(handle,
+					    ACPI_SYSTEM_NOTIFY,
+					    acpi_processor_hotplug_notify,
+					    NULL);
+	else if (*action == UNINSTALL_NOTIFY_HANDLER)
+		acpi_remove_notify_handler(handle,
+					   ACPI_SYSTEM_NOTIFY,
+					   acpi_processor_hotplug_notify);
+
+	/* found a processor; skip walking underneath */
+	return AE_CTRL_DEPTH;
+}
+
+/* Walk the whole ACPI namespace and install the hotplug notify handler. */
+static void acpi_processor_install_hotplug_notify(void)
+{
+	int walk_action = INSTALL_NOTIFY_HANDLER;
+
+	acpi_walk_namespace(ACPI_TYPE_ANY, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX,
+			    processor_walk_namespace_cb, NULL,
+			    &walk_action, NULL);
+}
+
+/* Walk the whole ACPI namespace and remove the hotplug notify handler. */
+static void acpi_processor_uninstall_hotplug_notify(void)
+{
+	int walk_action = UNINSTALL_NOTIFY_HANDLER;
+
+	acpi_walk_namespace(ACPI_TYPE_ANY, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX,
+			    processor_walk_namespace_cb, NULL,
+			    &walk_action, NULL);
+}
+
+/*
+ * Hardware ids this driver binds to: processors declared as objects and as
+ * devices.  The list ends with an empty-string entry.
+ */
+static const struct acpi_device_id processor_device_ids[] = {
+	{ACPI_PROCESSOR_OBJECT_HID, 0},
+	{ACPI_PROCESSOR_DEVICE_HID, 0},
+	{"", 0},
+};
+MODULE_DEVICE_TABLE(acpi, processor_device_ids);
+
+/* ACPI driver registered by xen_acpi_processor_init(). */
+static struct acpi_driver xen_acpi_processor_driver = {
+	.name = "processor",
+	.class = ACPI_PROCESSOR_CLASS,
+	.ids = processor_device_ids,
+	.ops = {
+		.add = xen_acpi_processor_add,
+		.remove = xen_acpi_processor_remove,
+		},
+};
+
+/*
+ * Module init (initial domain only): replace the stub processor driver with
+ * the real one and install the hotplug notify handlers.  If registration
+ * fails the stub is put back and the error is returned.
+ */
+static int __init xen_acpi_processor_init(void)
+{
+	int rc;
+
+	if (!xen_initial_domain())
+		return -ENODEV;
+
+	/* unregister the stub which only used to reserve driver space */
+	xen_stub_processor_exit();
+
+	rc = acpi_bus_register_driver(&xen_acpi_processor_driver);
+	if (rc >= 0) {
+		acpi_processor_install_hotplug_notify();
+		return 0;
+	}
+
+	xen_stub_processor_init();
+	return rc;
+}
+
+/*
+ * Module exit (initial domain only): remove the notify handlers, unregister
+ * the driver and re-install the stub.
+ */
+static void __exit xen_acpi_processor_exit(void)
+{
+	if (xen_initial_domain()) {
+		acpi_processor_uninstall_hotplug_notify();
+
+		acpi_bus_unregister_driver(&xen_acpi_processor_driver);
+
+		/*
+		 * stub reserve space again to prevent any chance of native
+		 * driver loading.
+		 */
+		xen_stub_processor_init();
+	}
+}
+
+module_init(xen_acpi_processor_init);
+module_exit(xen_acpi_processor_exit);
+ACPI_MODULE_NAME("xen-acpi-cpuhotplug");
+MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>");
+MODULE_DESCRIPTION("Xen Hotplug CPU Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/xen-acpi-memhotplug.c b/drivers/xen/xen-acpi-memhotplug.c
new file mode 100644
index 000000000..4fc886cd5
--- /dev/null
+++ b/drivers/xen/xen-acpi-memhotplug.c
@@ -0,0 +1,485 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ *    Author: Liu Jinsong <jinsong.liu@intel.com>
+ *    Author: Jiang Yunhong <yunhong.jiang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/acpi.h>
+#include <xen/acpi.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+
+#define PREFIX "ACPI:xen_memory_hotplug:"
+
+/* One memory range collected from the device's _CRS resources. */
+struct acpi_memory_info {
+	struct list_head list;	/* link in acpi_memory_device.res_list */
+	u64 start_addr;		/* Memory Range start physical addr */
+	u64 length;		/* Memory Range length */
+	unsigned short caching;	/* memory cache attribute */
+	unsigned short write_protect;	/* memory read/write attribute */
+				/* copied from buffer getting from _CRS */
+	unsigned int enabled:1;	/* set once the range was hot-added to Xen */
+};
+
+/* Per-device driver data, stored in acpi_device.driver_data. */
+struct acpi_memory_device {
+	struct acpi_device *device;	/* owning ACPI device */
+	struct list_head res_list;	/* list of struct acpi_memory_info */
+};
+
+/*
+ * Set at the end of module init. Until then the .add callback only records
+ * the ranges and does not issue hot-add hypercalls.
+ */
+static bool acpi_hotmem_initialized __read_mostly;
+
+/*
+ * Ask the hypervisor to hot-add one memory range.
+ *
+ * @pxm:  proximity domain, passed through as mem_add.pxm
+ * @info: range to add; start_addr and length are converted to page frame
+ *        numbers (spfn inclusive start, epfn end)
+ *
+ * Returns 0 on success or the error code returned by the hypercall.
+ */
+static int xen_hotadd_memory(int pxm, struct acpi_memory_info *info)
+{
+	int rc;
+	/* Zero-initialise so no stack garbage is handed to the hypervisor. */
+	struct xen_platform_op op = {
+		.cmd			= XENPF_mem_hotadd,
+		.interface_version	= XENPF_INTERFACE_VERSION,
+	};
+
+	op.u.mem_add.spfn = info->start_addr >> PAGE_SHIFT;
+	op.u.mem_add.epfn = (info->start_addr + info->length) >> PAGE_SHIFT;
+	op.u.mem_add.pxm = pxm;
+
+	/* Same hypercall wrapper the other Xen ACPI drivers use. */
+	rc = HYPERVISOR_platform_op(&op);
+	if (rc)
+		/* start_addr/length are u64: print them without truncation. */
+		pr_err(PREFIX "Xen Hotplug Memory Add failed on "
+			"0x%llx -> 0x%llx, _PXM: %d, error: %d\n",
+			(unsigned long long)info->start_addr,
+			(unsigned long long)(info->start_addr + info->length),
+			pxm, rc);
+
+	return rc;
+}
+
+/*
+ * Hot-add every not-yet-enabled, non-empty range of @mem_device to Xen.
+ *
+ * Returns 0 if at least one range is enabled afterwards, -EINVAL for a NULL
+ * device, the negative value from xen_acpi_get_pxm() if the proximity domain
+ * lookup fails, or -ENODEV if no range could be enabled.
+ */
+static int xen_acpi_memory_enable_device(struct acpi_memory_device *mem_device)
+{
+	struct acpi_memory_info *range;
+	int enabled_cnt = 0;
+	int pxm;
+
+	if (!mem_device)
+		return -EINVAL;
+
+	pxm = xen_acpi_get_pxm(mem_device->device->handle);
+	if (pxm < 0)
+		return pxm;
+
+	list_for_each_entry(range, &mem_device->res_list, list) {
+		if (!range->enabled) {
+			/* Skip empty ranges and ranges Xen refused. */
+			if (!range->length || xen_hotadd_memory(pxm, range))
+				continue;
+			range->enabled = 1;
+		}
+		enabled_cnt++;
+	}
+
+	return enabled_cnt ? 0 : -ENODEV;
+}
+
+/*
+ * acpi_walk_resources() callback: record one _CRS memory range.
+ *
+ * Resources that are not 64-bit-convertible memory ranges are ignored. A
+ * range that starts exactly where an already recorded range with the same
+ * caching/write_protect attributes ends is merged into it; otherwise a new
+ * entry is appended to mem_device->res_list. Returns AE_ERROR only when the
+ * allocation of a new entry fails.
+ */
+static acpi_status
+acpi_memory_get_resource(struct acpi_resource *resource, void *context)
+{
+	struct acpi_memory_device *mem_device = context;
+	struct acpi_resource_address64 addr;
+	struct acpi_memory_info *cur, *entry;
+
+	if (ACPI_FAILURE(acpi_resource_to_address64(resource, &addr)))
+		return AE_OK;
+	if (addr.resource_type != ACPI_MEMORY_RANGE)
+		return AE_OK;
+
+	list_for_each_entry(cur, &mem_device->res_list, list) {
+		if (cur->caching != addr.info.mem.caching)
+			continue;
+		if (cur->write_protect != addr.info.mem.write_protect)
+			continue;
+		if (cur->start_addr + cur->length != addr.address.minimum)
+			continue;
+
+		/* Contiguous with an existing entry: just grow it. */
+		cur->length += addr.address.address_length;
+		return AE_OK;
+	}
+
+	entry = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL);
+	if (!entry)
+		return AE_ERROR;
+
+	INIT_LIST_HEAD(&entry->list);
+	entry->start_addr = addr.address.minimum;
+	entry->length = addr.address.address_length;
+	entry->caching = addr.info.mem.caching;
+	entry->write_protect = addr.info.mem.write_protect;
+	list_add_tail(&entry->list, &mem_device->res_list);
+
+	return AE_OK;
+}
+
+/*
+ * Fill mem_device->res_list from the device's _CRS, unless it is already
+ * populated. On a failed walk any partially collected entries are freed and
+ * the list is reset. Returns 0 on success, -EINVAL on failure.
+ */
+static int
+acpi_memory_get_device_resources(struct acpi_memory_device *mem_device)
+{
+	struct acpi_memory_info *entry, *next;
+	acpi_status status;
+
+	if (!list_empty(&mem_device->res_list))
+		return 0;
+
+	status = acpi_walk_resources(mem_device->device->handle,
+				     METHOD_NAME__CRS,
+				     acpi_memory_get_resource, mem_device);
+	if (ACPI_SUCCESS(status))
+		return 0;
+
+	list_for_each_entry_safe(entry, next, &mem_device->res_list, list)
+		kfree(entry);
+	INIT_LIST_HEAD(&mem_device->res_list);
+
+	return -EINVAL;
+}
+
+/*
+ * Look up (scanning the namespace first if needed) the driver data for the
+ * ACPI object @handle and store it in *@mem_device.
+ *
+ * Runs entirely under the ACPI scan lock. Returns 0 on success, -EINVAL if
+ * the bus scan fails or does not produce an enumerated device, and -ENODEV
+ * if the device has no driver data attached.
+ */
+static int acpi_memory_get_device(acpi_handle handle,
+				  struct acpi_memory_device **mem_device)
+{
+	struct acpi_device *device = NULL;
+	int result = 0;
+
+	acpi_scan_lock_acquire();
+
+	/* Already enumerated: skip the scan and go fetch the driver data. */
+	acpi_bus_get_device(handle, &device);
+	if (acpi_device_enumerated(device))
+		goto end;
+
+	/*
+	 * Now add the notified device.  This creates the acpi_device
+	 * and invokes .add function
+	 */
+	result = acpi_bus_scan(handle);
+	if (result) {
+		pr_warn(PREFIX "ACPI namespace scan failed\n");
+		result = -EINVAL;
+		goto out;
+	}
+	/* Re-read the device now that the scan has run. */
+	device = NULL;
+	acpi_bus_get_device(handle, &device);
+	if (!acpi_device_enumerated(device)) {
+		pr_warn(PREFIX "Missing device object\n");
+		result = -EINVAL;
+		goto out;
+	}
+
+end:
+	*mem_device = acpi_driver_data(device);
+	if (!(*mem_device)) {
+		pr_err(PREFIX "driver data not found\n");
+		result = -ENODEV;
+		goto out;
+	}
+
+out:
+	acpi_scan_lock_release();
+	return result;
+}
+
+/*
+ * Evaluate _STA and require the device to be present, enabled and
+ * functioning. Returns 0 if so, -ENODEV if _STA cannot be evaluated or any
+ * of the three status bits is clear.
+ */
+static int acpi_memory_check_device(struct acpi_memory_device *mem_device)
+{
+	const unsigned long long required = ACPI_STA_DEVICE_PRESENT |
+					    ACPI_STA_DEVICE_ENABLED |
+					    ACPI_STA_DEVICE_FUNCTIONING;
+	unsigned long long sta;
+	acpi_status status;
+
+	/* Get device present/absent information from the _STA */
+	status = acpi_evaluate_integer(mem_device->device->handle,
+				       "_STA", NULL, &sta);
+	if (ACPI_FAILURE(status))
+		return -ENODEV;
+
+	if ((sta & required) != required)
+		return -ENODEV;
+
+	return 0;
+}
+
+/*
+ * Placeholder for memory hot-remove: only logs a debug message and always
+ * returns -ENOSYS. @mem_device is not used.
+ */
+static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
+{
+	pr_debug(PREFIX "Xen does not support memory hotremove\n");
+
+	return -ENOSYS;
+}
+
+/*
+ * ACPI system-notify handler installed on memory devices.
+ *
+ * BUS CHECK / DEVICE CHECK: make sure the device is scanned and has driver
+ * data (acpi_memory_get_device()), then report success through _OST.
+ * EJECT REQUEST: hot-remove is unsupported, so _OST reports the default
+ * non-specific failure.
+ * Any other event returns without evaluating _OST.
+ */
+static void acpi_memory_device_notify(acpi_handle handle, u32 event, void *data)
+{
+	struct acpi_memory_device *mem_device;
+	struct acpi_device *device;
+	u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */
+
+	switch (event) {
+	case ACPI_NOTIFY_BUS_CHECK:
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+			"\nReceived BUS CHECK notification for device\n"));
+		/* Fall Through */
+	case ACPI_NOTIFY_DEVICE_CHECK:
+		if (event == ACPI_NOTIFY_DEVICE_CHECK)
+			ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+			"\nReceived DEVICE CHECK notification for device\n"));
+
+		/* On failure, fall out with ost_code still set to failure. */
+		if (acpi_memory_get_device(handle, &mem_device)) {
+			pr_err(PREFIX "Cannot find driver data\n");
+			break;
+		}
+
+		ost_code = ACPI_OST_SC_SUCCESS;
+		break;
+
+	case ACPI_NOTIFY_EJECT_REQUEST:
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+			"\nReceived EJECT REQUEST notification for device\n"));
+
+		/* Every exit from this case must drop the scan lock. */
+		acpi_scan_lock_acquire();
+		if (acpi_bus_get_device(handle, &device)) {
+			acpi_scan_lock_release();
+			pr_err(PREFIX "Device doesn't exist\n");
+			break;
+		}
+		mem_device = acpi_driver_data(device);
+		if (!mem_device) {
+			acpi_scan_lock_release();
+			pr_err(PREFIX "Driver Data is NULL\n");
+			break;
+		}
+
+		/*
+		 * TBD: implement acpi_memory_disable_device and invoke
+		 * acpi_bus_remove if Xen support hotremove in the future
+		 */
+		acpi_memory_disable_device(mem_device);
+		acpi_scan_lock_release();
+		break;
+
+	default:
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+				  "Unsupported event [0x%x]\n", event));
+		/* non-hotplug event; possibly handled by other handler */
+		return;
+	}
+
+	/* Report the outcome to firmware; the _OST result itself is ignored. */
+	(void) acpi_evaluate_ost(handle, event, ost_code, NULL);
+	return;
+}
+
+/*
+ * ACPI .add callback: allocate the per-device state, read the memory ranges
+ * from _CRS and, once the module is fully initialised, hot-add them to Xen.
+ *
+ * Returns 0 on success, -EINVAL for a NULL device, -ENOMEM on allocation
+ * failure, or the error from resource collection or the hot-add.
+ *
+ * On any failure after the allocation, the resource list and the device
+ * state are freed and device->driver_data is cleared. This keeps later
+ * acpi_driver_data() lookups from seeing a freed or half-initialised object.
+ */
+static int xen_acpi_memory_device_add(struct acpi_device *device)
+{
+	int result;
+	struct acpi_memory_device *mem_device = NULL;
+	struct acpi_memory_info *info, *n;
+
+	if (!device)
+		return -EINVAL;
+
+	mem_device = kzalloc(sizeof(struct acpi_memory_device), GFP_KERNEL);
+	if (!mem_device)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&mem_device->res_list);
+	mem_device->device = device;
+	sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME);
+	sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS);
+	device->driver_data = mem_device;
+
+	/* Get the range from the _CRS */
+	result = acpi_memory_get_device_resources(mem_device);
+	if (result)
+		goto err_free;
+
+	/*
+	 * For booting existed memory devices, early boot code has recognized
+	 * memory area by EFI/E820. If DSDT shows these memory devices on boot,
+	 * hotplug is not necessary for them.
+	 * For hot-added memory devices during runtime, it need hypercall to
+	 * Xen hypervisor to add memory.
+	 */
+	if (!acpi_hotmem_initialized)
+		return 0;
+
+	/* A device that is not present/enabled/functioning is left alone. */
+	if (!acpi_memory_check_device(mem_device)) {
+		result = xen_acpi_memory_enable_device(mem_device);
+		if (result)
+			goto err_free;
+	}
+
+	return 0;
+
+err_free:
+	/* The list is already empty if the _CRS walk itself failed. */
+	list_for_each_entry_safe(info, n, &mem_device->res_list, list)
+		kfree(info);
+	device->driver_data = NULL;
+	kfree(mem_device);
+	return result;
+}
+
+/*
+ * ACPI .remove callback: release everything xen_acpi_memory_device_add()
+ * attached to @device.
+ *
+ * Frees each recorded memory range as well as the device state itself, and
+ * clears device->driver_data so no stale pointer remains.
+ * Returns 0 on success, -EINVAL if there is no device or no driver data.
+ */
+static int xen_acpi_memory_device_remove(struct acpi_device *device)
+{
+	struct acpi_memory_device *mem_device = NULL;
+	struct acpi_memory_info *info, *n;
+
+	if (!device || !acpi_driver_data(device))
+		return -EINVAL;
+
+	mem_device = acpi_driver_data(device);
+
+	/* The range entries are separate allocations owned by the list. */
+	list_for_each_entry_safe(info, n, &mem_device->res_list, list)
+		kfree(info);
+
+	device->driver_data = NULL;
+	kfree(mem_device);
+
+	return 0;
+}
+
+/*
+ * Check whether the ACPI object @handle carries the memory device HID.
+ * Returns the status of acpi_get_object_info() when the HID matches (or when
+ * that call itself failed), AE_ERROR when the HID is missing or different.
+ */
+static acpi_status is_memory_device(acpi_handle handle)
+{
+	struct acpi_device_info *info;
+	const char *hid = NULL;
+	acpi_status status;
+
+	status = acpi_get_object_info(handle, &info);
+	if (ACPI_FAILURE(status))
+		return status;
+
+	if (info->valid & ACPI_VALID_HID)
+		hid = info->hardware_id.string;
+
+	if (!hid || strcmp(hid, ACPI_MEMORY_DEVICE_HID))
+		status = AE_ERROR;
+
+	kfree(info);
+	return status;
+}
+
+/*
+ * acpi_walk_namespace() callback: install acpi_memory_device_notify() on
+ * every memory device. Always returns AE_OK so the walk continues, whether
+ * or not the object matched or the installation succeeded.
+ */
+static acpi_status
+acpi_memory_register_notify_handler(acpi_handle handle,
+				    u32 level, void *ctxt, void **retv)
+{
+	if (ACPI_SUCCESS(is_memory_device(handle)))
+		(void)acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY,
+						  acpi_memory_device_notify,
+						  NULL);
+
+	return AE_OK;	/* continue */
+}
+
+/*
+ * acpi_walk_namespace() callback: remove acpi_memory_device_notify() from
+ * every memory device. Always returns AE_OK so the walk continues, whether
+ * or not the object matched or the removal succeeded.
+ */
+static acpi_status
+acpi_memory_deregister_notify_handler(acpi_handle handle,
+				      u32 level, void *ctxt, void **retv)
+{
+	if (ACPI_SUCCESS(is_memory_device(handle)))
+		(void)acpi_remove_notify_handler(handle, ACPI_SYSTEM_NOTIFY,
+						 acpi_memory_device_notify);
+
+	return AE_OK;	/* continue */
+}
+
+/* ACPI hardware IDs this driver matches; the empty entry ends the table. */
+static const struct acpi_device_id memory_device_ids[] = {
+	{ACPI_MEMORY_DEVICE_HID, 0},
+	{"", 0},
+};
+MODULE_DEVICE_TABLE(acpi, memory_device_ids);
+
+/* ACPI driver descriptor: binds the add/remove callbacks to the IDs above. */
+static struct acpi_driver xen_acpi_memory_device_driver = {
+	.name = "acpi_memhotplug",
+	.class = ACPI_MEMORY_DEVICE_CLASS,
+	.ids = memory_device_ids,
+	.ops = {
+		.add = xen_acpi_memory_device_add,
+		.remove = xen_acpi_memory_device_remove,
+		},
+};
+
+/*
+ * Module init: in the initial domain, replace the stub with the real memory
+ * hotplug driver and install notify handlers on all memory devices.
+ * Returns 0 on success, -ENODEV otherwise; the stub is re-registered on
+ * every failure after it was removed.
+ */
+static int __init xen_acpi_memory_device_init(void)
+{
+	acpi_status status;
+
+	if (!xen_initial_domain())
+		return -ENODEV;
+
+	/* unregister the stub which only used to reserve driver space */
+	xen_stub_memory_device_exit();
+
+	if (acpi_bus_register_driver(&xen_acpi_memory_device_driver) < 0)
+		goto restore_stub;
+
+	status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT,
+				     ACPI_UINT32_MAX,
+				     acpi_memory_register_notify_handler,
+				     NULL, NULL, NULL);
+	if (ACPI_FAILURE(status)) {
+		pr_warn(PREFIX "walk_namespace failed\n");
+		acpi_bus_unregister_driver(&xen_acpi_memory_device_driver);
+		goto restore_stub;
+	}
+
+	/* From here on .add issues hot-add hypercalls for new devices. */
+	acpi_hotmem_initialized = true;
+	return 0;
+
+restore_stub:
+	xen_stub_memory_device_init();
+	return -ENODEV;
+}
+
+/*
+ * Module exit: remove the notify handlers, unregister the driver and put the
+ * stub back. Does nothing outside the initial domain.
+ */
+static void __exit xen_acpi_memory_device_exit(void)
+{
+	if (!xen_initial_domain())
+		return;
+
+	if (ACPI_FAILURE(acpi_walk_namespace(ACPI_TYPE_DEVICE,
+					ACPI_ROOT_OBJECT, ACPI_UINT32_MAX,
+					acpi_memory_deregister_notify_handler,
+					NULL, NULL, NULL)))
+		pr_warn(PREFIX "walk_namespace failed\n");
+
+	acpi_bus_unregister_driver(&xen_acpi_memory_device_driver);
+
+	/*
+	 * stub reserve space again to prevent any chance of native
+	 * driver loading.
+	 */
+	xen_stub_memory_device_init();
+}
+
+module_init(xen_acpi_memory_device_init);
+module_exit(xen_acpi_memory_device_exit);
+ACPI_MODULE_NAME("xen-acpi-memhotplug");
+MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>");
+MODULE_DESCRIPTION("Xen Hotplug Mem Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c
new file mode 100644
index 000000000..23d1808fe
--- /dev/null
+++ b/drivers/xen/xen-acpi-pad.c
@@ -0,0 +1,170 @@
+/*
+ * xen-acpi-pad.c - Xen pad interface
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *    Author: Liu, Jinsong <jinsong.liu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/acpi.h>
+#include <xen/interface/version.h>
+#include <xen/xen-ops.h>
+#include <asm/xen/hypercall.h>
+
+#define ACPI_PROCESSOR_AGGREGATOR_CLASS	"acpi_pad"
+#define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator"
+#define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80
+static DEFINE_MUTEX(xen_cpu_lock);
+
+/*
+ * Issue a XENPF_core_parking SET request for @idle_nums CPUs and return the
+ * hypercall result.
+ */
+static int xen_acpi_pad_idle_cpus(unsigned int idle_nums)
+{
+	struct xen_platform_op parking;
+
+	parking.cmd = XENPF_core_parking;
+	parking.u.core_parking.idle_nums = idle_nums;
+	parking.u.core_parking.type = XEN_CORE_PARKING_SET;
+
+	return HYPERVISOR_platform_op(&parking);
+}
+
+/*
+ * Issue a XENPF_core_parking GET request. Returns the hypercall's non-zero
+ * result if it has one, otherwise the idle_nums value Xen filled in.
+ */
+static int xen_acpi_pad_idle_cpus_num(void)
+{
+	struct xen_platform_op query;
+	int rc;
+
+	query.cmd = XENPF_core_parking;
+	query.u.core_parking.type = XEN_CORE_PARKING_GET;
+
+	rc = HYPERVISOR_platform_op(&query);
+	if (rc)
+		return rc;
+
+	return query.u.core_parking.idle_nums;
+}
+
+/*
+ * Evaluate _PUR to learn how many CPUs firmware wants idled.
+ * Accepts only a two-element package whose first element is 1 (rev 1) and
+ * returns the second element; returns -1 on any failure.
+ */
+static int acpi_pad_pur(acpi_handle handle)
+{
+	struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
+	union acpi_object *pkg;
+	int idle = -1;
+
+	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_PUR", NULL, &buffer)))
+		return -1;
+
+	if (!buffer.length || !buffer.pointer)
+		return -1;
+
+	pkg = buffer.pointer;
+	if (pkg->type != ACPI_TYPE_PACKAGE)
+		goto out;
+	if (pkg->package.count != 2)
+		goto out;
+	if (pkg->package.elements[0].integer.value != 1) /* rev 1 */
+		goto out;
+
+	idle = pkg->package.elements[1].integer.value;
+out:
+	kfree(buffer.pointer);
+	return idle;
+}
+
+/*
+ * Handle a processor-aggregator notification: read the requested idle CPU
+ * count from _PUR, ask Xen to park that many CPUs, read back how many are
+ * idle, and report that number to firmware through _OST.
+ * The whole sequence runs under xen_cpu_lock.
+ */
+static void acpi_pad_handle_notify(acpi_handle handle)
+{
+	int idle_nums;
+	/* _OST argument buffer: aliases idle_nums, so it carries the final value. */
+	struct acpi_buffer param = {
+		.length = 4,
+		.pointer = (void *)&idle_nums,
+	};
+
+
+	mutex_lock(&xen_cpu_lock);
+	idle_nums = acpi_pad_pur(handle);
+	if (idle_nums < 0) {
+		mutex_unlock(&xen_cpu_lock);
+		return;
+	}
+
+	/* Non-zero SET result is kept; otherwise use the GET result. */
+	idle_nums = xen_acpi_pad_idle_cpus(idle_nums)
+		    ?: xen_acpi_pad_idle_cpus_num();
+	if (idle_nums >= 0)
+		acpi_evaluate_ost(handle, ACPI_PROCESSOR_AGGREGATOR_NOTIFY,
+				  0, &param);
+	mutex_unlock(&xen_cpu_lock);
+}
+
+/*
+ * Device notify handler: dispatch the processor-aggregator event and warn
+ * about anything else. @data is unused.
+ */
+static void acpi_pad_notify(acpi_handle handle, u32 event,
+	void *data)
+{
+	if (event == ACPI_PROCESSOR_AGGREGATOR_NOTIFY) {
+		acpi_pad_handle_notify(handle);
+		return;
+	}
+
+	pr_warn("Unsupported event [0x%x]\n", event);
+}
+
+/*
+ * ACPI .add callback: name the device and install acpi_pad_notify() as its
+ * device notify handler. Returns 0 on success, -ENODEV if the handler could
+ * not be installed.
+ */
+static int acpi_pad_add(struct acpi_device *device)
+{
+	acpi_status status;
+
+	strcpy(acpi_device_name(device), ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME);
+	strcpy(acpi_device_class(device), ACPI_PROCESSOR_AGGREGATOR_CLASS);
+
+	status = acpi_install_notify_handler(device->handle,
+					     ACPI_DEVICE_NOTIFY,
+					     acpi_pad_notify, device);
+
+	return ACPI_FAILURE(status) ? -ENODEV : 0;
+}
+
+/*
+ * ACPI .remove callback: request zero idle CPUs from Xen (result ignored)
+ * under xen_cpu_lock, then remove the notify handler. Always returns 0.
+ */
+static int acpi_pad_remove(struct acpi_device *device)
+{
+	mutex_lock(&xen_cpu_lock);
+	xen_acpi_pad_idle_cpus(0);
+	mutex_unlock(&xen_cpu_lock);
+
+	acpi_remove_notify_handler(device->handle,
+		ACPI_DEVICE_NOTIFY, acpi_pad_notify);
+	return 0;
+}
+
+/* ACPI hardware IDs this driver matches; the empty entry ends the table. */
+static const struct acpi_device_id pad_device_ids[] = {
+	{"ACPI000C", 0},
+	{"", 0},
+};
+
+/* ACPI driver descriptor: binds the add/remove callbacks to the IDs above. */
+static struct acpi_driver acpi_pad_driver = {
+	.name = "processor_aggregator",
+	.class = ACPI_PROCESSOR_AGGREGATOR_CLASS,
+	.ids = pad_device_ids,
+	.ops = {
+		.add = acpi_pad_add,
+		.remove = acpi_pad_remove,
+	},
+};
+
+/*
+ * Register the pad driver. Only DOM0 is responsible for Xen acpi pad, and
+ * only Xen 4.2 or later supports it; anything else gets -ENODEV.
+ */
+static int __init xen_acpi_pad_init(void)
+{
+	if (!xen_initial_domain() ||
+	    !xen_running_on_version_or_later(4, 2))
+		return -ENODEV;
+
+	return acpi_bus_register_driver(&acpi_pad_driver);
+}
+subsys_initcall(xen_acpi_pad_init);
diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c
new file mode 100644
index 000000000..fbb9137c7
--- /dev/null
+++ b/drivers/xen/xen-acpi-processor.c
@@ -0,0 +1,612 @@
+/*
+ * Copyright 2012 by Oracle Inc
+ * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ *
+ * This code borrows ideas from https://lkml.org/lkml/2011/11/30/249
+ * so many thanks go to Kevin Tian <kevin.tian@intel.com>
+ * and Yu Ke <ke.yu@intel.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpumask.h>
+#include <linux/cpufreq.h>
+#include <linux/freezer.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/syscore_ops.h>
+#include <linux/acpi.h>
+#include <acpi/processor.h>
+#include <xen/xen.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+
+static int no_hypercall;
+MODULE_PARM_DESC(off, "Inhibit the hypercall.");
+module_param_named(off, no_hypercall, int, 0400);
+
+/*
+ * Note: Do not convert the acpi_id* below to cpumask_var_t or use cpumask_bit
+ * - as those shrink to nr_cpu_bits (which is dependent on possible_cpu), which
+ * can be less than what we want to put in. Instead use the 'nr_acpi_bits'
+ * which is dynamically computed based on the MADT or x2APIC table.
+ */
+static unsigned int nr_acpi_bits;
+/* Mutex to protect the acpi_ids_done - for CPU hotplug use. */
+static DEFINE_MUTEX(acpi_ids_mutex);
+/* Which ACPI ID we have processed from 'struct acpi_processor'. */
+static unsigned long *acpi_ids_done;
+/* Which ACPI ID exist in the SSDT/DSDT processor definitions. */
+static unsigned long *acpi_id_present;
+/* And if there is an _CST definition (or a PBLK) for the ACPI IDs */
+static unsigned long *acpi_id_cst_present;
+/* ACPI P-State dependencies (_PSD) of each enumerated processor, by ACPI ID */
+static struct acpi_psd_package *acpi_psd;
+
+/*
+ * Upload the valid C-states of @_pr to the hypervisor with a
+ * XENPF_set_processor_pminfo / XEN_PM_CX request.
+ *
+ * Returns 0 on success (or when the hypercall is inhibited by the 'off'
+ * module parameter), -ENOMEM on allocation failure, -EINVAL if the processor
+ * has no valid C-state, or the hypercall's error code.
+ */
+static int push_cxx_to_hypervisor(struct acpi_processor *_pr)
+{
+	struct xen_platform_op op = {
+		.cmd			= XENPF_set_processor_pminfo,
+		.interface_version	= XENPF_INTERFACE_VERSION,
+		.u.set_pminfo.id	= _pr->acpi_id,
+		.u.set_pminfo.type	= XEN_PM_CX,
+	};
+	struct xen_processor_cx *dst_cx, *dst_cx_states = NULL;
+	struct acpi_processor_cx *cx;
+	unsigned int i, ok;
+	int ret = 0;
+
+	dst_cx_states = kcalloc(_pr->power.count,
+				sizeof(struct xen_processor_cx), GFP_KERNEL);
+	if (!dst_cx_states)
+		return -ENOMEM;
+
+	/* Source states are indexed from 1; 'ok' counts the valid ones copied. */
+	for (ok = 0, i = 1; i <= _pr->power.count; i++) {
+		cx = &_pr->power.states[i];
+		if (!cx->valid)
+			continue;
+
+		dst_cx = &(dst_cx_states[ok++]);
+
+		dst_cx->reg.space_id = ACPI_ADR_SPACE_SYSTEM_IO;
+		if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) {
+			dst_cx->reg.bit_width = 8;
+			dst_cx->reg.bit_offset = 0;
+			dst_cx->reg.access_size = 1;
+		} else {
+			/* Everything that is not system I/O is sent as fixed hardware. */
+			dst_cx->reg.space_id = ACPI_ADR_SPACE_FIXED_HARDWARE;
+			if (cx->entry_method == ACPI_CSTATE_FFH) {
+				/* NATIVE_CSTATE_BEYOND_HALT */
+				dst_cx->reg.bit_offset = 2;
+				dst_cx->reg.bit_width = 1; /* VENDOR_INTEL */
+			}
+			dst_cx->reg.access_size = 0;
+		}
+		dst_cx->reg.address = cx->address;
+
+		dst_cx->type = cx->type;
+		dst_cx->latency = cx->latency;
+
+		/* No dependency data is passed along. */
+		dst_cx->dpcnt = 0;
+		set_xen_guest_handle(dst_cx->dp, NULL);
+	}
+	if (!ok) {
+		pr_debug("No _Cx for ACPI CPU %u\n", _pr->acpi_id);
+		kfree(dst_cx_states);
+		return -EINVAL;
+	}
+	op.u.set_pminfo.power.count = ok;
+	op.u.set_pminfo.power.flags.bm_control = _pr->flags.bm_control;
+	op.u.set_pminfo.power.flags.bm_check = _pr->flags.bm_check;
+	op.u.set_pminfo.power.flags.has_cst = _pr->flags.has_cst;
+	op.u.set_pminfo.power.flags.power_setup_done =
+		_pr->flags.power_setup_done;
+
+	set_xen_guest_handle(op.u.set_pminfo.power.states, dst_cx_states);
+
+	/* With the hypercall inhibited, ret stays 0 and the states are only logged. */
+	if (!no_hypercall)
+		ret = HYPERVISOR_platform_op(&op);
+
+	if (!ret) {
+		pr_debug("ACPI CPU%u - C-states uploaded.\n", _pr->acpi_id);
+		for (i = 1; i <= _pr->power.count; i++) {
+			cx = &_pr->power.states[i];
+			if (!cx->valid)
+				continue;
+			pr_debug("     C%d: %s %d uS\n",
+				 cx->type, cx->desc, (u32)cx->latency);
+		}
+	} else if ((ret != -EINVAL) && (ret != -ENOSYS))
+		/* EINVAL means the ACPI ID is incorrect - meaning the ACPI
+		 * table is referencing a non-existing CPU - which can happen
+		 * with broken ACPI tables. */
+		pr_err("(CX): Hypervisor error (%d) for ACPI CPU%u\n",
+		       ret, _pr->acpi_id);
+
+	kfree(dst_cx_states);
+
+	return ret;
+}
+/*
+ * Duplicate the P-state table of @_pr into a newly allocated array of
+ * struct xen_processor_px and record its length in @dst_perf->state_count.
+ * Returns the array (caller frees it) or ERR_PTR(-ENOMEM).
+ */
+static struct xen_processor_px *
+xen_copy_pss_data(struct acpi_processor *_pr,
+		  struct xen_processor_performance *dst_perf)
+{
+	struct acpi_processor_performance *perf = _pr->performance;
+	struct xen_processor_px *copy;
+	unsigned int idx;
+
+	/* The element-wise memcpy below relies on identical sizes. */
+	BUILD_BUG_ON(sizeof(struct xen_processor_px) !=
+		     sizeof(struct acpi_processor_px));
+
+	copy = kcalloc(perf->state_count, sizeof(struct xen_processor_px),
+		       GFP_KERNEL);
+	if (!copy)
+		return ERR_PTR(-ENOMEM);
+
+	dst_perf->state_count = perf->state_count;
+	for (idx = 0; idx < perf->state_count; idx++)
+		memcpy(&copy[idx], &perf->states[idx],
+		       sizeof(struct acpi_processor_px));
+
+	return copy;
+}
+/*
+ * Copy the _PSD (P-state dependency) data of @_pr into @dst: the shared type
+ * plus the raw domain_info package. Always returns 0.
+ */
+static int xen_copy_psd_data(struct acpi_processor *_pr,
+			     struct xen_processor_performance *dst)
+{
+	struct acpi_psd_package *pdomain = &(_pr->performance->domain_info);
+
+	BUILD_BUG_ON(sizeof(struct xen_psd_package) !=
+		     sizeof(struct acpi_psd_package));
+
+	/* This information is enumerated only if acpi_processor_preregister_performance
+	 * has been called.
+	 */
+	dst->shared_type = _pr->performance->shared_type;
+
+	/* 'acpi_processor_preregister_performance' does not parse if the
+	 * num_processors <= 1, but Xen still requires it. Do it manually here.
+	 */
+	if (pdomain->num_processors <= 1) {
+		switch (pdomain->coord_type) {
+		case DOMAIN_COORD_TYPE_SW_ALL:
+			dst->shared_type = CPUFREQ_SHARED_TYPE_ALL;
+			break;
+		case DOMAIN_COORD_TYPE_HW_ALL:
+			dst->shared_type = CPUFREQ_SHARED_TYPE_HW;
+			break;
+		case DOMAIN_COORD_TYPE_SW_ANY:
+			dst->shared_type = CPUFREQ_SHARED_TYPE_ANY;
+			break;
+		default:
+			/* Unknown coordination type: keep the copied value. */
+			break;
+		}
+	}
+
+	memcpy(&(dst->domain_info), pdomain, sizeof(struct acpi_psd_package));
+	return 0;
+}
+/*
+ * Copy a _PCT register description from the ACPI layout (@pct) into the Xen
+ * layout (@dst_pct), field by field. Always returns 0.
+ */
+static int xen_copy_pct_data(struct acpi_pct_register *pct,
+			     struct xen_pct_register *dst_pct)
+{
+	/* It would be nice if you could just do 'memcpy(dst_pct, pct)' but
+	 * sadly the Xen structure did not have the proper padding so the
+	 * descriptor field takes two (dst_pct) bytes instead of one (pct).
+	 */
+	dst_pct->descriptor = pct->descriptor;
+	dst_pct->length = pct->length;
+	dst_pct->space_id = pct->space_id;
+	dst_pct->bit_width = pct->bit_width;
+	dst_pct->bit_offset = pct->bit_offset;
+	dst_pct->reserved = pct->reserved;
+	dst_pct->address = pct->address;
+	return 0;
+}
+/*
+ * Upload the P-state data (_PPC, _PCT, _PSS, _PSD) of @_pr to the hypervisor
+ * with a XENPF_set_processor_pminfo / XEN_PM_PX request.
+ *
+ * Returns 0 on success (or when the hypercall is inhibited by the 'off'
+ * module parameter), -ENODEV if one of the four pieces could not be
+ * gathered, or the hypercall's error code.
+ */
+static int push_pxx_to_hypervisor(struct acpi_processor *_pr)
+{
+	int ret = 0;
+	struct xen_platform_op op = {
+		.cmd			= XENPF_set_processor_pminfo,
+		.interface_version	= XENPF_INTERFACE_VERSION,
+		.u.set_pminfo.id	= _pr->acpi_id,
+		.u.set_pminfo.type	= XEN_PM_PX,
+	};
+	struct xen_processor_performance *dst_perf;
+	struct xen_processor_px *dst_states = NULL;
+
+	dst_perf = &op.u.set_pminfo.perf;
+
+	/* Each successfully gathered piece sets its own bit in flags. */
+	dst_perf->platform_limit = _pr->performance_platform_limit;
+	dst_perf->flags |= XEN_PX_PPC;
+	xen_copy_pct_data(&(_pr->performance->control_register),
+			  &dst_perf->control_register);
+	xen_copy_pct_data(&(_pr->performance->status_register),
+			  &dst_perf->status_register);
+	dst_perf->flags |= XEN_PX_PCT;
+	dst_states = xen_copy_pss_data(_pr, dst_perf);
+	if (!IS_ERR_OR_NULL(dst_states)) {
+		set_xen_guest_handle(dst_perf->states, dst_states);
+		dst_perf->flags |= XEN_PX_PSS;
+	}
+	if (!xen_copy_psd_data(_pr, dst_perf))
+		dst_perf->flags |= XEN_PX_PSD;
+
+	/* Upload only when all four pieces are present. */
+	if (dst_perf->flags != (XEN_PX_PSD | XEN_PX_PSS | XEN_PX_PCT | XEN_PX_PPC)) {
+		pr_warn("ACPI CPU%u missing some P-state data (%x), skipping\n",
+			_pr->acpi_id, dst_perf->flags);
+		ret = -ENODEV;
+		goto err_free;
+	}
+
+	/* With the hypercall inhibited, ret stays 0 and the states are only logged. */
+	if (!no_hypercall)
+		ret = HYPERVISOR_platform_op(&op);
+
+	if (!ret) {
+		struct acpi_processor_performance *perf;
+		unsigned int i;
+
+		perf = _pr->performance;
+		pr_debug("ACPI CPU%u - P-states uploaded.\n", _pr->acpi_id);
+		for (i = 0; i < perf->state_count; i++) {
+			pr_debug("     %cP%d: %d MHz, %d mW, %d uS\n",
+			(i == perf->state ? '*' : ' '), i,
+			(u32) perf->states[i].core_frequency,
+			(u32) perf->states[i].power,
+			(u32) perf->states[i].transition_latency);
+		}
+	} else if ((ret != -EINVAL) && (ret != -ENOSYS))
+		/* EINVAL means the ACPI ID is incorrect - meaning the ACPI
+		 * table is referencing a non-existing CPU - which can happen
+		 * with broken ACPI tables. */
+		pr_warn("(_PXX): Hypervisor error (%d) for ACPI CPU%u\n",
+			ret, _pr->acpi_id);
+err_free:
+	/* dst_states may be an ERR_PTR, which must not be passed to kfree(). */
+	if (!IS_ERR_OR_NULL(dst_states))
+		kfree(dst_states);
+
+	return ret;
+}
+/*
+ * Upload the C-state and P-state data of @_pr to the hypervisor, once per
+ * ACPI ID.
+ *
+ * The ACPI ID is marked in acpi_ids_done under acpi_ids_mutex. If it was
+ * already marked, nothing is uploaded and -EBUSY is returned. Both uploads
+ * are attempted independently; the return value is 0 if everything that was
+ * attempted succeeded, otherwise the first error encountered (a real errno,
+ * not a bitwise mix of two).
+ */
+static int upload_pm_data(struct acpi_processor *_pr)
+{
+	int err = 0;
+
+	mutex_lock(&acpi_ids_mutex);
+	if (__test_and_set_bit(_pr->acpi_id, acpi_ids_done)) {
+		err = -EBUSY;
+		goto out;
+	}
+	if (_pr->flags.power)
+		err = push_cxx_to_hypervisor(_pr);
+
+	if (_pr->performance && _pr->performance->states) {
+		int px_err = push_pxx_to_hypervisor(_pr);
+
+		/* Keep the earlier C-state error if there was one. */
+		if (!err)
+			err = px_err;
+	}
+
+out:
+	mutex_unlock(&acpi_ids_mutex);
+	return err;
+}
+/*
+ * Ask the hypervisor (XENPF_get_cpuinfo) for the ACPI ID of every physical
+ * CPU up to max_present and return twice the largest one found.
+ * Returns NR_CPUS if the very first query fails.
+ */
+static unsigned int __init get_max_acpi_id(void)
+{
+	struct xen_platform_op op = {
+		.cmd = XENPF_get_cpuinfo,
+		.interface_version = XENPF_INTERFACE_VERSION,
+	};
+	struct xenpf_pcpuinfo *info = &op.u.pcpu_info;
+	unsigned int cpu, last_cpu, highest = 0;
+
+	info->xen_cpuid = 0;
+	if (HYPERVISOR_platform_op(&op))
+		return NR_CPUS;
+
+	/* The max_present is the same regardless of the xen_cpuid */
+	last_cpu = info->max_present;
+	for (cpu = 0; cpu <= last_cpu; cpu++) {
+		info->xen_cpuid = cpu;
+		if (HYPERVISOR_platform_op(&op))
+			continue;
+		if (info->acpi_id > highest)
+			highest = info->acpi_id;
+	}
+
+	highest *= 2; /* Slack for CPU hotplug support. */
+	pr_debug("Max ACPI ID: %u\n", highest);
+	return highest;
+}
+/*
+ * The read_acpi_id and check_acpi_ids are there to support the Xen
+ * oddity of virtual CPUs != physical CPUs in the initial domain.
+ * The user can supply 'xen_max_vcpus=X' on the Xen hypervisor line
+ * which will bound the number of CPUs the initial domain can see.
+ * In general that is OK, except it plays havoc with any of the
+ * for_each_[present|online]_cpu macros which are bounded by the virtual
+ * CPU count.
+ */
+/*
+ * Namespace-walk callback: record one ACPI processor object or device.
+ *
+ * Determines the ACPI ID (from the Processor object itself, or from _UID for
+ * a processor device), skips CPUs whose physical ID is invalid or whose ACPI
+ * ID does not fit in nr_acpi_bits, then:
+ *   - sets the ID's bit in acpi_id_present,
+ *   - caches its _PSD package in acpi_psd[] if one can be read,
+ *   - sets the ID's bit in acpi_id_cst_present if the object has a PBLK
+ *     address or a _CST method.
+ *
+ * Always returns AE_OK so the walk continues. @lvl, @context and @rv are
+ * unused.
+ */
+static acpi_status
+read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv)
+{
+	u32 acpi_id;
+	acpi_status status;
+	acpi_object_type acpi_type;
+	unsigned long long tmp;
+	union acpi_object object = { 0 };
+	struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
+	acpi_io_address pblk = 0;
+
+	status = acpi_get_type(handle, &acpi_type);
+	if (ACPI_FAILURE(status))
+		return AE_OK;
+
+	switch (acpi_type) {
+	case ACPI_TYPE_PROCESSOR:
+		status = acpi_evaluate_object(handle, NULL, NULL, &buffer);
+		if (ACPI_FAILURE(status))
+			return AE_OK;
+		acpi_id = object.processor.proc_id;
+		pblk = object.processor.pblk_address;
+		break;
+	case ACPI_TYPE_DEVICE:
+		status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp);
+		if (ACPI_FAILURE(status))
+			return AE_OK;
+		acpi_id = tmp;
+		break;
+	default:
+		return AE_OK;
+	}
+	if (invalid_phys_cpuid(acpi_get_phys_id(handle,
+						acpi_type == ACPI_TYPE_DEVICE,
+						acpi_id))) {
+		pr_debug("CPU with ACPI ID %u is unavailable\n", acpi_id);
+		return AE_OK;
+	}
+	/* There are more ACPI Processor objects than in x2APIC or MADT.
+	 * This can happen with incorrect ACPI SSDT declarations. */
+	if (acpi_id >= nr_acpi_bits) {
+		pr_debug("max acpi id %u, trying to set %u\n",
+			 nr_acpi_bits - 1, acpi_id);
+		return AE_OK;
+	}
+	/* OK, There is a ACPI Processor object */
+	__set_bit(acpi_id, acpi_id_present);
+
+	pr_debug("ACPI CPU%u w/ PBLK:0x%lx\n", acpi_id, (unsigned long)pblk);
+
+	/* It has P-state dependencies */
+	if (!acpi_processor_get_psd(handle, &acpi_psd[acpi_id])) {
+		pr_debug("ACPI CPU%u w/ PST:coord_type = %llu domain = %llu\n",
+			 acpi_id, acpi_psd[acpi_id].coord_type,
+			 acpi_psd[acpi_id].domain);
+	}
+
+	/*
+	 * Only the existence of _CST matters here. Evaluating it into the
+	 * single-object buffer above cannot work for a real _CST package
+	 * (the result does not fit), which made CPUs without a PBLK look as
+	 * if they had no C-states. Test for the method instead.
+	 */
+	if (!pblk && !acpi_has_method(handle, "_CST"))
+		return AE_OK;
+
+	/* .. and it has a C-state */
+	__set_bit(acpi_id, acpi_id_cst_present);
+
+	return AE_OK;
+}
+/*
+ * Upload PM data for ACPI processors that exist in the namespace but were
+ * not covered by the per-CPU upload (acpi_ids_done).
+ *
+ * On the first call the presence bitmaps and the _PSD cache are allocated
+ * and filled by walking the namespace with read_acpi_id(). @pr_backup is a
+ * scratch copy of a real processor whose acpi_id, flags.power and _PSD data
+ * are overwritten for each such ACPI ID before it is uploaded.
+ *
+ * Returns 0 on success, -ENODEV without @pr_backup, -ENOMEM on allocation
+ * failure.
+ */
+static int check_acpi_ids(struct acpi_processor *pr_backup)
+{
+
+	if (!pr_backup)
+		return -ENODEV;
+
+	if (acpi_id_present && acpi_id_cst_present)
+		/* OK, done this once .. skip to uploading */
+		goto upload;
+
+	/* All online CPUs have been processed at this stage. Now verify
+	 * whether in fact "online CPUs" == physical CPUs.
+	 */
+	acpi_id_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL);
+	if (!acpi_id_present)
+		return -ENOMEM;
+
+	acpi_id_cst_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL);
+	if (!acpi_id_cst_present) {
+		kfree(acpi_id_present);
+		return -ENOMEM;
+	}
+
+	acpi_psd = kcalloc(nr_acpi_bits, sizeof(struct acpi_psd_package),
+			   GFP_KERNEL);
+	if (!acpi_psd) {
+		kfree(acpi_id_present);
+		kfree(acpi_id_cst_present);
+		return -ENOMEM;
+	}
+
+	/* Cover both Processor objects and processor devices. */
+	acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
+			    ACPI_UINT32_MAX,
+			    read_acpi_id, NULL, NULL, NULL);
+	acpi_get_devices(ACPI_PROCESSOR_DEVICE_HID, read_acpi_id, NULL, NULL);
+
+upload:
+	/* upload_pm_data() itself skips IDs already marked in acpi_ids_done. */
+	if (!bitmap_equal(acpi_id_present, acpi_ids_done, nr_acpi_bits)) {
+		unsigned int i;
+		for_each_set_bit(i, acpi_id_present, nr_acpi_bits) {
+			pr_backup->acpi_id = i;
+			/* Mask out C-states if there are no _CST or PBLK */
+			pr_backup->flags.power = test_bit(i, acpi_id_cst_present);
+			/* num_entries is non-zero if we evaluated _PSD */
+			if (acpi_psd[i].num_entries) {
+				memcpy(&pr_backup->performance->domain_info,
+				       &acpi_psd[i],
+				       sizeof(struct acpi_psd_package));
+			}
+			(void)upload_pm_data(pr_backup);
+		}
+	}
+
+	return 0;
+}
+
+/* acpi_perf_data is a pointer to percpu data. */
+static struct acpi_processor_performance __percpu *acpi_perf_data;
+
+/*
+ * Release the per-CPU performance data: every CPU's shared_cpu_map first,
+ * then the per-CPU area itself.
+ */
+static void free_acpi_perf_data(void)
+{
+	unsigned int cpu;
+
+	/* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
+	for_each_possible_cpu(cpu) {
+		struct acpi_processor_performance *perf;
+
+		perf = per_cpu_ptr(acpi_perf_data, cpu);
+		free_cpumask_var(perf->shared_cpu_map);
+	}
+
+	free_percpu(acpi_perf_data);
+}
+
+/*
+ * Upload PM data for every possible CPU that has an acpi_processor, then let
+ * check_acpi_ids() handle the ACPI IDs that were not covered. A copy of the
+ * first processor found serves as the template for that second pass.
+ * Returns the result of check_acpi_ids().
+ */
+static int xen_upload_processor_pm_data(void)
+{
+	struct acpi_processor *pr_backup = NULL;
+	unsigned int cpu;
+	int rc;
+
+	pr_info("Uploading Xen processor PM info\n");
+
+	for_each_possible_cpu(cpu) {
+		struct acpi_processor *pr = per_cpu(processors, cpu /* APIC ID */);
+
+		if (!pr)
+			continue;
+
+		/* Retried on the next processor if the allocation fails. */
+		if (!pr_backup) {
+			pr_backup = kzalloc(sizeof(struct acpi_processor),
+					    GFP_KERNEL);
+			if (pr_backup)
+				memcpy(pr_backup, pr,
+				       sizeof(struct acpi_processor));
+		}
+
+		(void)upload_pm_data(pr);
+	}
+
+	rc = check_acpi_ids(pr_backup);
+	kfree(pr_backup);
+
+	return rc;
+}
+
+/*
+ * Work item run after resume: forget which ACPI IDs were uploaded and upload
+ * everything again. @dummy is unused.
+ */
+static void xen_acpi_processor_resume_worker(struct work_struct *dummy)
+{
+	int rc;
+
+	/* Clearing the bitmap makes upload_pm_data() process every ID again. */
+	bitmap_zero(acpi_ids_done, nr_acpi_bits);
+
+	rc = xen_upload_processor_pm_data();
+	if (rc != 0)
+		pr_info("ACPI data upload failed, error = %d\n", rc);
+}
+
+/* syscore resume hook: only schedules the worker above. */
+static void xen_acpi_processor_resume(void)
+{
+	static DECLARE_WORK(wq, xen_acpi_processor_resume_worker);
+
+	/*
+	 * xen_upload_processor_pm_data() calls non-atomic code.
+	 * However, the context for xen_acpi_processor_resume is syscore
+	 * with only the boot CPU online and in an atomic context.
+	 *
+	 * So defer the upload for some point safer.
+	 */
+	schedule_work(&wq);
+}
+
+/* Registered at the end of init, unregistered first thing in exit. */
+static struct syscore_ops xap_syscore_ops = {
+	.resume	= xen_acpi_processor_resume,
+};
+
+/*
+ * Driver init (initial domain only): size and allocate the acpi_ids_done
+ * bitmap, set up per-CPU performance data, gather each processor's
+ * performance info, upload everything to the hypervisor and register the
+ * resume hook.
+ *
+ * Returns 0 on success, -ENODEV outside the initial domain, -ENOMEM on
+ * allocation failure, or the error from gathering/uploading the data.
+ */
+static int __init xen_acpi_processor_init(void)
+{
+	unsigned int i;
+	int rc;
+
+	if (!xen_initial_domain())
+		return -ENODEV;
+
+	/* +1 because IDs run from 0 to get_max_acpi_id() inclusive. */
+	nr_acpi_bits = get_max_acpi_id() + 1;
+	acpi_ids_done = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL);
+	if (!acpi_ids_done)
+		return -ENOMEM;
+
+	acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
+	if (!acpi_perf_data) {
+		pr_debug("Memory allocation error for acpi_perf_data\n");
+		kfree(acpi_ids_done);
+		return -ENOMEM;
+	}
+	for_each_possible_cpu(i) {
+		if (!zalloc_cpumask_var_node(
+			&per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
+			GFP_KERNEL, cpu_to_node(i))) {
+			rc = -ENOMEM;
+			goto err_out;
+		}
+	}
+
+	/* Do initialization in ACPI core. It is OK to fail here. */
+	(void)acpi_processor_preregister_performance(acpi_perf_data);
+
+	/* Attach the per-CPU performance slot to each known processor. */
+	for_each_possible_cpu(i) {
+		struct acpi_processor *pr;
+		struct acpi_processor_performance *perf;
+
+		pr = per_cpu(processors, i);
+		perf = per_cpu_ptr(acpi_perf_data, i);
+		if (!pr)
+			continue;
+
+		pr->performance = perf;
+		rc = acpi_processor_get_performance_info(pr);
+		if (rc)
+			goto err_out;
+	}
+
+	rc = xen_upload_processor_pm_data();
+	if (rc)
+		goto err_unregister;
+
+	register_syscore_ops(&xap_syscore_ops);
+
+	return 0;
+err_unregister:
+	for_each_possible_cpu(i)
+		acpi_processor_unregister_performance(i);
+
+err_out:
+	/* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
+	free_acpi_perf_data();
+	kfree(acpi_ids_done);
+	return rc;
+}
+/*
+ * Driver exit: drop the resume hook, free the ACPI ID bitmaps and the _PSD
+ * cache, unregister performance data for every possible CPU and release the
+ * per-CPU performance area.
+ */
+static void __exit xen_acpi_processor_exit(void)
+{
+	int cpu;
+
+	unregister_syscore_ops(&xap_syscore_ops);
+
+	kfree(acpi_ids_done);
+	kfree(acpi_id_present);
+	kfree(acpi_id_cst_present);
+	kfree(acpi_psd);
+
+	for_each_possible_cpu(cpu)
+		acpi_processor_unregister_performance(cpu);
+
+	free_acpi_perf_data();
+}
+
+MODULE_AUTHOR("Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>");
+MODULE_DESCRIPTION("Xen ACPI Processor P-states (and Cx) driver which uploads PM data to Xen hypervisor");
+MODULE_LICENSE("GPL");
+
+/* We want to be loaded before the CPU freq scaling drivers are loaded.
+ * They are loaded in late_initcall. */
+device_initcall(xen_acpi_processor_init);
+module_exit(xen_acpi_processor_exit);
diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c
new file mode 100644
index 000000000..3aab77916
--- /dev/null
+++ b/drivers/xen/xen-balloon.c
@@ -0,0 +1,261 @@
+/******************************************************************************
+ * Xen balloon driver - enables returning/claiming memory to/from Xen.
+ *
+ * Copyright (c) 2003, B Dragovic
+ * Copyright (c) 2003-2004, M Williamson, K Fraser
+ * Copyright (c) 2005 Dan M. Smith, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm_types.h>
+#include <linux/init.h>
+#include <linux/capability.h>
+
+#include <xen/xen.h>
+#include <xen/interface/xen.h>
+#include <xen/balloon.h>
+#include <xen/xenbus.h>
+#include <xen/features.h>
+#include <xen/page.h>
+#include <xen/mem-reservation.h>
+
+#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
+
+#define BALLOON_CLASS_NAME "xen_memory"
+
+static struct device balloon_dev;
+
+static int register_balloon(struct device *dev);
+
+/* Xenstore watch callback: re-read memory/target and retarget the balloon. */
+static void watch_target(struct xenbus_watch *watch,
+			 const char *path, const char *token)
+{
+	unsigned long long new_target, static_max;
+	int err;
+	static bool watch_fired;	/* set on the first event that parses */
+	static long target_diff;	/* computed once, subtracted every time */
+
+	err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
+	if (err != 1) {
+		/* This is ok (for domain0 at least) - so just return */
+		return;
+	}
+
+	/* The given memory/target value is in KiB, so it needs converting to
+	 * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
+	 */
+	new_target >>= PAGE_SHIFT - 10;
+
+	if (!watch_fired) {
+		watch_fired = true;
+		/* Try "static-max", then "memory_static_max"; same KiB scaling. */
+		if ((xenbus_scanf(XBT_NIL, "memory", "static-max",
+				  "%llu", &static_max) == 1) ||
+		    (xenbus_scanf(XBT_NIL, "memory", "memory_static_max",
+				  "%llu", &static_max) == 1))
+			static_max >>= PAGE_SHIFT - 10;
+		else
+			static_max = balloon_stats.current_pages;
+		/* PV and initial domains get no offset; others keep the gap. */
+		target_diff = (xen_pv_domain() || xen_initial_domain()) ? 0
+				: static_max - balloon_stats.target_pages;
+	}
+
+	balloon_set_new_target(new_target - target_diff);
+}
+static struct xenbus_watch target_watch = {
+	.node = "memory/target",	/* key that watch_target() re-reads */
+	.callback = watch_target,
+};
+
+
+static int balloon_init_watcher(struct notifier_block *notifier,
+				unsigned long event,
+				void *data)
+{
+	int err;
+	/* Notifier callback: install the memory/target watch. */
+	err = register_xenbus_watch(&target_watch);
+	if (err)
+		pr_err("Failed to set balloon watcher\n");
+	/* A failure is only logged; NOTIFY_DONE is returned either way. */
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block xenstore_notifier = {
+	.notifier_call = balloon_init_watcher,	/* registered by xen_balloon_init() */
+};
+
+void xen_balloon_init(void)
+{
+	register_balloon(&balloon_dev);
+	/* NOTE(review): register_balloon()'s return value is not checked. */
+	register_xen_selfballooning(&balloon_dev);
+	/* balloon_init_watcher() runs when the xenstore notifier fires. */
+	register_xenstore_notifier(&xenstore_notifier);
+}
+EXPORT_SYMBOL_GPL(xen_balloon_init);
+
+#define BALLOON_SHOW(name, format, args...)				\
+	static ssize_t show_##name(struct device *dev,			\
+				   struct device_attribute *attr,	\
+				   char *buf)				\
+	{								\
+		return sprintf(buf, format, ##args);			\
+	}								\
+	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+/* Read-only attributes built by BALLOON_SHOW; PAGES2KB turns pages into KiB. */
+BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages));
+BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low));
+BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high));
+/* Direct views of balloon_stats fields: mode 0444 or 0644 per attribute. */
+static DEVICE_ULONG_ATTR(schedule_delay, 0444, balloon_stats.schedule_delay);
+static DEVICE_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay);
+static DEVICE_ULONG_ATTR(retry_count, 0444, balloon_stats.retry_count);
+static DEVICE_ULONG_ATTR(max_retry_count, 0644, balloon_stats.max_retry_count);
+static DEVICE_BOOL_ATTR(scrub_pages, 0644, xen_scrub_pages);
+
+static ssize_t show_target_kb(struct device *dev, struct device_attribute *attr,
+			      char *buf)
+{ /* sysfs read: balloon_stats.target_pages converted to KiB */
+	return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages));
+}
+
+static ssize_t store_target_kb(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf,
+			       size_t count)
+{
+	char *endchar;
+	unsigned long long target_bytes;
+	/* sysfs write: parse a KiB count and hand it to the balloon as pages. */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/* NOTE(review): endchar is never inspected, so unparsed text is ignored. */
+	target_bytes = simple_strtoull(buf, &endchar, 0) * 1024;
+
+	balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+	/* Always claims the whole buffer was consumed. */
+	return count;
+}
+
+static DEVICE_ATTR(target_kb, S_IRUGO | S_IWUSR,
+		   show_target_kb, store_target_kb);
+
+
+static ssize_t show_target(struct device *dev, struct device_attribute *attr,
+			      char *buf)
+{ /* sysfs read: target in bytes; widened first so the shift cannot truncate */
+	return sprintf(buf, "%llu\n",
+		       (unsigned long long)balloon_stats.target_pages
+		       << PAGE_SHIFT);
+}
+
+static ssize_t store_target(struct device *dev,
+			    struct device_attribute *attr,
+			    const char *buf,
+			    size_t count)
+{
+	char *endchar;
+	unsigned long long target_bytes;
+	/* sysfs write: parse a byte count and hand it to the balloon as pages. */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/* NOTE(review): endchar is never inspected after memparse(). */
+	target_bytes = memparse(buf, &endchar);
+
+	balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+	/* Always claims the whole buffer was consumed. */
+	return count;
+}
+
+static DEVICE_ATTR(target, S_IRUGO | S_IWUSR,
+		   show_target, store_target);
+
+
+static struct attribute *balloon_attrs[] = {
+	&dev_attr_target_kb.attr,
+	&dev_attr_target.attr,
+	&dev_attr_schedule_delay.attr.attr,
+	&dev_attr_max_schedule_delay.attr.attr,
+	&dev_attr_retry_count.attr.attr,
+	&dev_attr_max_retry_count.attr.attr,
+	&dev_attr_scrub_pages.attr.attr,
+	NULL
+};
+/* Unnamed group: the attributes listed in balloon_attrs. */
+static const struct attribute_group balloon_group = {
+	.attrs = balloon_attrs
+};
+/* The three BALLOON_SHOW attributes, exposed through the "info" group. */
+static struct attribute *balloon_info_attrs[] = {
+	&dev_attr_current_kb.attr,
+	&dev_attr_low_kb.attr,
+	&dev_attr_high_kb.attr,
+	NULL
+};
+
+static const struct attribute_group balloon_info_group = {
+	.name = "info",
+	.attrs = balloon_info_attrs
+};
+/* NULL-terminated list that register_balloon() assigns to dev->groups. */
+static const struct attribute_group *balloon_groups[] = {
+	&balloon_group,
+	&balloon_info_group,
+	NULL
+};
+
+static struct bus_type balloon_subsys = { /* bus set on the device by register_balloon() */
+	.name = BALLOON_CLASS_NAME,
+	.dev_name = BALLOON_CLASS_NAME,
+};
+
+static int register_balloon(struct device *dev)
+{
+	int error;
+	/* Register the subsystem first, then the device with its sysfs groups. */
+	error = subsys_system_register(&balloon_subsys, NULL);
+	if (error)
+		return error;
+
+	dev->id = 0;
+	dev->bus = &balloon_subsys;
+	dev->groups = balloon_groups;
+
+	error = device_register(dev);
+	if (error) {
+		bus_unregister(&balloon_subsys);
+		return error;
+	}
+	/* Returns 0 on success, otherwise the error from the failing step. */
+	return 0;
+}
diff --git a/drivers/xen/xen-pciback/Makefile b/drivers/xen/xen-pciback/Makefile
new file mode 100644
index 000000000..e8d981d43
--- /dev/null
+++ b/drivers/xen/xen-pciback/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o
+
+xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o
+xen-pciback-y += conf_space.o conf_space_header.o \
+		 conf_space_capability.o \
+		 conf_space_quirks.o vpci.o \
+		 passthrough.o
diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c
new file mode 100644
index 000000000..60111719b
--- /dev/null
+++ b/drivers/xen/xen-pciback/conf_space.c
@@ -0,0 +1,431 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI Backend - Functions for creating a virtual configuration space for
+ *               exported PCI Devices.
+ *               It's dangerous to allow PCI Driver Domains to change their
+ *               device's resources (memory, i/o ports, interrupts). We need to
+ *               restrict changes to certain PCI Configuration registers:
+ *               BARs, INTERRUPT_PIN, most registers in the header...
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+bool xen_pcibk_permissive;
+module_param_named(permissive, xen_pcibk_permissive, bool, 0644);
+
+/* Generates xen_pcibk_{read,write}_config_{byte,word,dword}: thin wrappers
+ * around pci_{read,write}_config_*() that accept and ignore a 'data' arg. */
+#define DEFINE_PCI_CONFIG(op, size, type)			\
+int xen_pcibk_##op##_config_##size				\
+(struct pci_dev *dev, int offset, type value, void *data)	\
+{								\
+	return pci_##op##_config_##size(dev, offset, value);	\
+}
+
+DEFINE_PCI_CONFIG(read, byte, u8 *)
+DEFINE_PCI_CONFIG(read, word, u16 *)
+DEFINE_PCI_CONFIG(read, dword, u32 *)
+
+DEFINE_PCI_CONFIG(write, byte, u8)
+DEFINE_PCI_CONFIG(write, word, u16)
+DEFINE_PCI_CONFIG(write, dword, u32)
+
+static int conf_space_read(struct pci_dev *dev,
+			   const struct config_field_entry *entry,
+			   int offset, u32 *value)
+{
+	int ret = 0;
+	const struct config_field *field = entry->field;
+	/* A field without a read handler reads as 0 and returns success. */
+	*value = 0;
+	/* Pick the handler by width; narrow ones write via a narrowed pointer. */
+	switch (field->size) {
+	case 1:
+		if (field->u.b.read)
+			ret = field->u.b.read(dev, offset, (u8 *) value,
+					      entry->data);
+		break;
+	case 2:
+		if (field->u.w.read)
+			ret = field->u.w.read(dev, offset, (u16 *) value,
+					      entry->data);
+		break;
+	case 4:
+		if (field->u.dw.read)
+			ret = field->u.dw.read(dev, offset, value, entry->data);
+		break;
+	}
+	return ret;
+}
+
+static int conf_space_write(struct pci_dev *dev,
+			    const struct config_field_entry *entry,
+			    int offset, u32 value)
+{
+	int ret = 0;
+	const struct config_field *field = entry->field;
+	/* No write handler for the field's width: do nothing, return 0. */
+	switch (field->size) {
+	case 1:
+		if (field->u.b.write)
+			ret = field->u.b.write(dev, offset, (u8) value,
+					       entry->data);
+		break;
+	case 2:
+		if (field->u.w.write)
+			ret = field->u.w.write(dev, offset, (u16) value,
+					       entry->data);
+		break;
+	case 4:
+		if (field->u.dw.write)
+			ret = field->u.dw.write(dev, offset, value,
+						entry->data);
+		break;
+	}
+	return ret;
+}
+
+static inline u32 get_mask(int size)
+{ /* all-ones mask covering the low `size` bytes */
+	if (size == 1)
+		return 0xff;
+	else if (size == 2)
+		return 0xffff;
+	else /* any other size is treated as a full dword */
+		return 0xffffffff;
+}
+
+static inline int valid_request(int offset, int size)
+{
+	/* Accept only 1-, 2- or 4-byte accesses aligned to their own size */
+	if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
+		return 1;
+	return 0;
+}
+
+static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
+			      int offset)
+{ /* overlay new_val on val; offset is a signed byte displacement */
+	if (offset >= 0) {
+		new_val_mask <<= (offset * 8);
+		new_val <<= (offset * 8);
+	} else {
+		new_val_mask >>= (offset * -8);
+		new_val >>= (offset * -8);
+	}
+	val = (val & ~new_val_mask) | (new_val & new_val_mask);
+	/* Only the bits selected by the shifted mask were replaced. */
+	return val;
+}
+
+static int xen_pcibios_err_to_errno(int err)
+{ /* translate PCIBIOS_* status codes into XEN_PCI_ERR_* codes */
+	switch (err) {
+	case PCIBIOS_SUCCESSFUL:
+		return XEN_PCI_ERR_success;
+	case PCIBIOS_DEVICE_NOT_FOUND:
+		return XEN_PCI_ERR_dev_not_found;
+	case PCIBIOS_BAD_REGISTER_NUMBER:
+		return XEN_PCI_ERR_invalid_offset;
+	case PCIBIOS_FUNC_NOT_SUPPORTED:
+		return XEN_PCI_ERR_not_implemented;
+	case PCIBIOS_SET_FAILED:
+		return XEN_PCI_ERR_access_denied;
+	}
+	return err; /* any other value is passed through unchanged */
+}
+
+int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size,
+			  u32 *ret_val)
+{
+	int err = 0;
+	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+	const struct config_field_entry *cfg_entry;
+	const struct config_field *field;
+	int field_start, field_end;
+	/* if read fails for any reason, return 0
+	 * (as if device didn't respond) */
+	u32 value = 0, tmp_val;
+	/* Config read: the hardware value overlaid with the virtual fields. */
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x\n",
+		       pci_name(dev), size, offset);
+
+	if (!valid_request(offset, size)) {
+		err = XEN_PCI_ERR_invalid_offset;
+		goto out;
+	}
+
+	/* Get the real value first, then modify as appropriate */
+	switch (size) {
+	case 1:
+		err = pci_read_config_byte(dev, offset, (u8 *) &value);
+		break;
+	case 2:
+		err = pci_read_config_word(dev, offset, (u16 *) &value);
+		break;
+	case 4:
+		err = pci_read_config_dword(dev, offset, &value);
+		break;
+	}
+	/* Overlay every registered field overlapping [offset, offset + size). */
+	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+		field = cfg_entry->field;
+
+		field_start = OFFSET(cfg_entry);
+		field_end = OFFSET(cfg_entry) + field->size;
+		/* Half-open interval overlap test: request vs. field. */
+		if (offset + size > field_start && field_end > offset) {
+			err = conf_space_read(dev, cfg_entry, field_start,
+					      &tmp_val);
+			if (err)
+				goto out;
+			/* Place the field at its byte position in the request. */
+			value = merge_value(value, tmp_val,
+					    get_mask(field->size),
+					    field_start - offset);
+		}
+	}
+
+out:
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x = %x\n",
+		       pci_name(dev), size, offset, value);
+	/* *ret_val is written on every path, including the error ones. */
+	*ret_val = value;
+	return xen_pcibios_err_to_errno(err);
+}
+
+int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value)
+{
+	int err = 0, handled = 0;
+	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+	const struct config_field_entry *cfg_entry;
+	const struct config_field *field;
+	u32 tmp_val;
+	int field_start, field_end;
+	/* Config write: routed through every overlapping virtual field. */
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG
+		       DRV_NAME ": %s: write request %d bytes at 0x%x = %x\n",
+		       pci_name(dev), size, offset, value);
+
+	if (!valid_request(offset, size))
+		return XEN_PCI_ERR_invalid_offset;
+
+	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+		field = cfg_entry->field;
+
+		field_start = OFFSET(cfg_entry);
+		field_end = OFFSET(cfg_entry) + field->size;
+
+		if (offset + size > field_start && field_end > offset) {
+			err = conf_space_read(dev, cfg_entry, field_start,
+					      &tmp_val);
+			if (err)
+				break;
+			/* Read-modify-write: splice the new bytes into the field. */
+			tmp_val = merge_value(tmp_val, value, get_mask(size),
+					      offset - field_start);
+
+			err = conf_space_write(dev, cfg_entry, field_start,
+					       tmp_val);
+			/* NOTE(review): err is not checked; a later field overwrites it. */
+			/* handled is set true here, but not every byte
+			 * may have been written! Properly detecting if
+			 * every byte is handled is unnecessary as the
+			 * flag is used to detect devices that need
+			 * special helpers to work correctly.
+			 */
+			handled = 1;
+		}
+	}
+
+	if (!handled && !err) {
+		/* By default, anything not specifically handled above is
+		 * read-only. The permissive flag changes this behavior so
+		 * that anything not specifically handled above is writable.
+		 * This means that some fields may still be read-only because
+		 * they have entries in the config_field list that intercept
+		 * the write and do nothing. */
+		if (dev_data->permissive || xen_pcibk_permissive) {
+			switch (size) {
+			case 1:
+				err = pci_write_config_byte(dev, offset,
+							    (u8) value);
+				break;
+			case 2:
+				err = pci_write_config_word(dev, offset,
+							    (u16) value);
+				break;
+			case 4:
+				err = pci_write_config_dword(dev, offset,
+							     (u32) value);
+				break;
+			}
+		} else if (!dev_data->warned_on_write) {
+			dev_data->warned_on_write = 1;
+			dev_warn(&dev->dev, "Driver tried to write to a "
+				 "read-only configuration space field at offset"
+				 " 0x%x, size %d. This may be harmless, but if "
+				 "you have problems with your device:\n"
+				 "1) see permissive attribute in sysfs\n"
+				 "2) report problems to the xen-devel "
+				 "mailing list along with details of your "
+				 "device obtained from lspci.\n", offset, size);
+		}
+	}
+
+	return xen_pcibios_err_to_errno(err);
+}
+
+void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev)
+{
+	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+	struct config_field_entry *cfg_entry, *t;
+	const struct config_field *field;
+	/* Drop only the entries whose field provides a clean() hook. */
+	dev_dbg(&dev->dev, "free-ing dynamically allocated virtual "
+			   "configuration space fields\n");
+	if (!dev_data)
+		return;
+	/* _safe variant: entries are unlinked and freed while iterating. */
+	list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+		field = cfg_entry->field;
+
+		if (field->clean) {
+			field->clean((struct config_field *)field);
+
+			kfree(cfg_entry->data);
+
+			list_del(&cfg_entry->list);
+			kfree(cfg_entry);
+		}
+
+	}
+}
+
+void xen_pcibk_config_reset_dev(struct pci_dev *dev)
+{
+	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+	const struct config_field_entry *cfg_entry;
+	const struct config_field *field;
+	/* Invoke the optional reset() hook of every registered field. */
+	dev_dbg(&dev->dev, "resetting virtual configuration space\n");
+	if (!dev_data)
+		return;
+
+	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+		field = cfg_entry->field;
+
+		if (field->reset)
+			field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
+	}
+}
+
+void xen_pcibk_config_free_dev(struct pci_dev *dev)
+{
+	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+	struct config_field_entry *cfg_entry, *t;
+	const struct config_field *field;
+	/* Unlink and free every entry; release() gets the per-entry data. */
+	dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n");
+	if (!dev_data)
+		return;
+
+	list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+		list_del(&cfg_entry->list);
+
+		field = cfg_entry->field;
+		/* cfg_entry->data itself is not freed here, only passed on. */
+		if (field->release)
+			field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
+
+		kfree(cfg_entry);
+	}
+}
+
+int xen_pcibk_config_add_field_offset(struct pci_dev *dev,
+				    const struct config_field *field,
+				    unsigned int base_offset)
+{
+	int err = 0;
+	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+	struct config_field_entry *cfg_entry;
+	void *tmp;
+	/* Wrap 'field' in a new entry and append it to the device's list. */
+	cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
+	if (!cfg_entry) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	cfg_entry->data = NULL;
+	cfg_entry->field = field;
+	cfg_entry->base_offset = base_offset;
+
+	/* silently ignore duplicate fields */
+	err = xen_pcibk_field_is_dup(dev, OFFSET(cfg_entry));
+	if (err)
+		goto out;
+	/* NOTE(review): a non-zero result above is returned as err, not dropped. */
+	if (field->init) {
+		tmp = field->init(dev, OFFSET(cfg_entry));
+		/* init() signals failure via ERR_PTR; anything else is data. */
+		if (IS_ERR(tmp)) {
+			err = PTR_ERR(tmp);
+			goto out;
+		}
+
+		cfg_entry->data = tmp;
+	}
+
+	dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
+		OFFSET(cfg_entry));
+	list_add_tail(&cfg_entry->list, &dev_data->config_fields);
+
+out:
+	if (err)
+		kfree(cfg_entry);
+	/* kfree(NULL) is a no-op, so the allocation-failure path is safe. */
+	return err;
+}
+
+/* This sets up the device's virtual configuration space to keep track of
+ * certain registers (like the base address registers (BARs)) so that we can
+ * keep the client from manipulating them directly.
+ */
+int xen_pcibk_config_init_dev(struct pci_dev *dev)
+{
+	int err = 0;
+	struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+
+	dev_dbg(&dev->dev, "initializing virtual configuration space\n");
+
+	INIT_LIST_HEAD(&dev_data->config_fields);
+	/* Header fields, then capability fields, then quirks; stop on error. */
+	err = xen_pcibk_config_header_add_fields(dev);
+	if (err)
+		goto out;
+
+	err = xen_pcibk_config_capability_add_fields(dev);
+	if (err)
+		goto out;
+
+	err = xen_pcibk_config_quirks_init(dev);
+
+out:
+	return err;
+}
+
+int xen_pcibk_config_init(void)
+{ /* only step: register the capability handlers */
+	return xen_pcibk_config_capability_init();
+}
diff --git a/drivers/xen/xen-pciback/conf_space.h b/drivers/xen/xen-pciback/conf_space.h
new file mode 100644
index 000000000..22db63071
--- /dev/null
+++ b/drivers/xen/xen-pciback/conf_space.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PCI Backend - Common data structures for overriding the configuration space
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#ifndef __XEN_PCIBACK_CONF_SPACE_H__
+#define __XEN_PCIBACK_CONF_SPACE_H__
+
+#include <linux/list.h>
+#include <linux/err.h>
+
+/* conf_field_init can return an errno in a ptr with ERR_PTR() */
+typedef void *(*conf_field_init) (struct pci_dev *dev, int offset);
+typedef void (*conf_field_reset) (struct pci_dev *dev, int offset, void *data);
+typedef void (*conf_field_free) (struct pci_dev *dev, int offset, void *data);
+/* Accessors: non-zero return means failure; 'data' is the entry's data. */
+typedef int (*conf_dword_write) (struct pci_dev *dev, int offset, u32 value,
+				 void *data);
+typedef int (*conf_word_write) (struct pci_dev *dev, int offset, u16 value,
+				void *data);
+typedef int (*conf_byte_write) (struct pci_dev *dev, int offset, u8 value,
+				void *data);
+typedef int (*conf_dword_read) (struct pci_dev *dev, int offset, u32 *value,
+				void *data);
+typedef int (*conf_word_read) (struct pci_dev *dev, int offset, u16 *value,
+			       void *data);
+typedef int (*conf_byte_read) (struct pci_dev *dev, int offset, u8 *value,
+			       void *data);
+
+/* These are the fields within the configuration space which we
+ * are interested in intercepting reads/writes to and changing their
+ * values.
+ */
+struct config_field {
+	unsigned int offset;	/* added to the entry's base_offset by OFFSET() */
+	unsigned int size;	/* 1, 2 or 4 picks u.b/u.w/u.dw; 0 ends an array */
+	unsigned int mask;	/* NOTE(review): users not shown here - confirm */
+	conf_field_init init;	/* optional; result becomes the entry's data */
+	conf_field_reset reset;	/* optional; run by xen_pcibk_config_reset_dev() */
+	conf_field_free release; /* optional; run by xen_pcibk_config_free_dev() */
+	void (*clean) (struct config_field *field); /* used by free_dyn_fields */
+	union {
+		struct {
+			conf_dword_write write;
+			conf_dword_read read;
+		} dw;
+		struct {
+			conf_word_write write;
+			conf_word_read read;
+		} w;
+		struct {
+			conf_byte_write write;
+			conf_byte_read read;
+		} b;
+	} u;	/* a NULL handler means the access is skipped */
+	struct list_head list;	/* NOTE(review): users not shown here - confirm */
+};
+
+struct config_field_entry {
+	struct list_head list;		/* link in dev_data->config_fields */
+	const struct config_field *field;	/* handlers and relative offset */
+	unsigned int base_offset;	/* added to field->offset by OFFSET() */
+	void *data;			/* value returned by field->init, or NULL */
+};
+
+extern bool xen_pcibk_permissive;
+
+#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
+
+/* Add fields to a device - the add_fields helpers expect a pointer to
+ * the first entry in an array (of which the ending is marked by size==0)
+ */
+int xen_pcibk_config_add_field_offset(struct pci_dev *dev,
+				    const struct config_field *field,
+				    unsigned int offset);
+
+static inline int xen_pcibk_config_add_field(struct pci_dev *dev,
+					   const struct config_field *field)
+{ /* single field with base offset 0 */
+	return xen_pcibk_config_add_field_offset(dev, field, 0);
+}
+
+static inline int xen_pcibk_config_add_fields(struct pci_dev *dev,
+					    const struct config_field *field)
+{ /* add each array element up to the size == 0 terminator */
+	int i, err = 0;
+	for (i = 0; field[i].size != 0; i++) {
+		err = xen_pcibk_config_add_field(dev, &field[i]);
+		if (err) /* stop at the first failure; earlier fields stay added */
+			break;
+	}
+	return err;
+}
+
+static inline int xen_pcibk_config_add_fields_offset(struct pci_dev *dev,
+					const struct config_field *field,
+					unsigned int offset)
+{ /* like xen_pcibk_config_add_fields(), with a caller-supplied base offset */
+	int i, err = 0;
+	for (i = 0; field[i].size != 0; i++) {
+		err = xen_pcibk_config_add_field_offset(dev, &field[i], offset);
+		if (err) /* stop at the first failure; earlier fields stay added */
+			break;
+	}
+	return err;
+}
+
+/* Read/Write the real configuration space */
+int xen_pcibk_read_config_byte(struct pci_dev *dev, int offset, u8 *value,
+			       void *data);
+int xen_pcibk_read_config_word(struct pci_dev *dev, int offset, u16 *value,
+			       void *data);
+int xen_pcibk_read_config_dword(struct pci_dev *dev, int offset, u32 *value,
+				void *data);
+int xen_pcibk_write_config_byte(struct pci_dev *dev, int offset, u8 value,
+				 void *data);
+int xen_pcibk_write_config_word(struct pci_dev *dev, int offset, u16 value,
+				void *data);
+int xen_pcibk_write_config_dword(struct pci_dev *dev, int offset, u32 value,
+				 void *data);
+
+int xen_pcibk_config_capability_init(void);
+
+int xen_pcibk_config_header_add_fields(struct pci_dev *dev);
+int xen_pcibk_config_capability_add_fields(struct pci_dev *dev);
+
+#endif				/* __XEN_PCIBACK_CONF_SPACE_H__ */
diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c
new file mode 100644
index 000000000..42f0f64fc
--- /dev/null
+++ b/drivers/xen/xen-pciback/conf_space_capability.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI Backend - Handles the virtual fields found on the capability lists
+ *               in the configuration space.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+
+static LIST_HEAD(capabilities);
+struct xen_pcibk_config_capability {
+	struct list_head cap_list;	/* link in the file-local 'capabilities' list */
+
+	int capability;	/* ID passed to pci_find_capability() */
+
+	/* If the device has the capability found above, add these fields */
+	const struct config_field *fields;
+};
+
+static const struct config_field caplist_header[] = { /* added per capability found */
+	{
+	 .offset    = PCI_CAP_LIST_ID,
+	 .size      = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
+	 .u.w.read  = xen_pcibk_read_config_word,
+	 .u.w.write = NULL, /* no write handler */
+	},
+	{}
+};
+
+static inline void register_capability(struct xen_pcibk_config_capability *cap)
+{ /* append to the list walked by xen_pcibk_config_capability_add_fields() */
+	list_add_tail(&cap->cap_list, &capabilities);
+}
+
+int xen_pcibk_config_capability_add_fields(struct pci_dev *dev)
+{
+	int err = 0;
+	struct xen_pcibk_config_capability *cap;
+	int cap_offset;
+	/* For each registered capability the device has, add its fields. */
+	list_for_each_entry(cap, &capabilities, cap_list) {
+		cap_offset = pci_find_capability(dev, cap->capability);
+		if (cap_offset) {
+			dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
+				cap->capability, cap_offset);
+			/* Generic list header first, then the specific fields. */
+			err = xen_pcibk_config_add_fields_offset(dev,
+							       caplist_header,
+							       cap_offset);
+			if (err)
+				goto out;
+			err = xen_pcibk_config_add_fields_offset(dev,
+							       cap->fields,
+							       cap_offset);
+			if (err)
+				goto out;
+		}
+	}
+
+out:
+	return err;
+}
+
+static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
+			     void *data)
+{ /* write handler installed for PCI_VPD_ADDR in caplist_vpd */
+	/* Disallow writes to the vital product data */
+	if (value & PCI_VPD_ADDR_F)
+		return PCIBIOS_SET_FAILED;
+	else /* flag clear: pass the value through to the device */
+		return pci_write_config_word(dev, offset, value);
+}
+
+static const struct config_field caplist_vpd[] = { /* fields for PCI_CAP_ID_VPD */
+	{
+	 .offset    = PCI_VPD_ADDR,
+	 .size      = 2,
+	 .u.w.read  = xen_pcibk_read_config_word,
+	 .u.w.write = vpd_address_write, /* filtered, see handler */
+	 },
+	{
+	 .offset     = PCI_VPD_DATA,
+	 .size       = 4,
+	 .u.dw.read  = xen_pcibk_read_config_dword,
+	 .u.dw.write = NULL, /* no write handler */
+	 },
+	{}
+};
+
+static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
+			void *data)
+{
+	int err;
+	u16 real_value;
+	/* Read the real register, then hide the PME bits from the caller. */
+	err = pci_read_config_word(dev, offset, &real_value);
+	if (err)
+		goto out;
+	/* *value is left untouched when the read above fails. */
+	*value = real_value & ~PCI_PM_CAP_PME_MASK;
+
+out:
+	return err;
+}
+
+/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
+ * Can't allow driver domain to enable PMEs - they're shared */
+#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
+
+static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
+			 void *data)
+{
+	int err;
+	u16 old_value;
+	pci_power_t new_state;
+	/* Filtered write: only PM_OK_BITS reach the register directly. */
+	err = pci_read_config_word(dev, offset, &old_value);
+	if (err)
+		goto out;
+	/* Capture the requested state before new_value is masked below. */
+	new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
+
+	new_value &= PM_OK_BITS;
+	if ((old_value & PM_OK_BITS) != new_value) {
+		new_value = (old_value & ~PM_OK_BITS) | new_value;
+		err = pci_write_config_word(dev, offset, new_value);
+		if (err)
+			goto out;
+	}
+
+	/* Let pci core handle the power management change */
+	dev_dbg(&dev->dev, "set power state to %x\n", new_state);
+	err = pci_set_power_state(dev, new_state);
+	if (err) {
+		err = PCIBIOS_SET_FAILED;
+		goto out;
+	}
+
+ out:
+	return err;
+}
+
+/* Ensure PMEs are disabled; returns NULL on success or ERR_PTR(-errno). */
+static void *pm_ctrl_init(struct pci_dev *dev, int offset)
+{
+	int err;
+	u16 value;
+
+	err = pci_read_config_word(dev, offset, &value);
+	if (err)
+		goto out;
+
+	if (value & PCI_PM_CTRL_PME_ENABLE) {
+		value &= ~PCI_PM_CTRL_PME_ENABLE;
+		err = pci_write_config_word(dev, offset, value);
+	}
+	/* PCIBIOS_* codes are positive; IS_ERR() only sees negative errnos. */
+out:
+	return err ? ERR_PTR(pcibios_err_to_errno(err)) : NULL;
+}
+
+static const struct config_field caplist_pm[] = { /* fields for PCI_CAP_ID_PM */
+	{
+		.offset     = PCI_PM_PMC,
+		.size       = 2,
+		.u.w.read   = pm_caps_read, /* masks PCI_PM_CAP_PME_MASK */
+	},
+	{
+		.offset     = PCI_PM_CTRL,
+		.size       = 2,
+		.init       = pm_ctrl_init, /* clears PME enable at setup */
+		.u.w.read   = xen_pcibk_read_config_word,
+		.u.w.write  = pm_ctrl_write, /* filtered, see handler */
+	},
+	{
+		.offset     = PCI_PM_PPB_EXTENSIONS,
+		.size       = 1,
+		.u.b.read   = xen_pcibk_read_config_byte, /* no write handler */
+	},
+	{
+		.offset     = PCI_PM_DATA_REGISTER,
+		.size       = 1,
+		.u.b.read   = xen_pcibk_read_config_byte, /* no write handler */
+	},
+	{}
+};
+
+static struct xen_pcibk_config_capability xen_pcibk_config_capability_pm = {
+	.capability = PCI_CAP_ID_PM,
+	.fields = caplist_pm,	/* added at the capability's config offset */
+};
+static struct xen_pcibk_config_capability xen_pcibk_config_capability_vpd = {
+	.capability = PCI_CAP_ID_VPD,
+	.fields = caplist_vpd,	/* added at the capability's config offset */
+};
+
+int xen_pcibk_config_capability_init(void)
+{
+	register_capability(&xen_pcibk_config_capability_vpd);
+	register_capability(&xen_pcibk_config_capability_pm);
+	/* Registration cannot fail, so this always reports success. */
+	return 0;
+}
diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c
new file mode 100644
index 000000000..10ae24b5a
--- /dev/null
+++ b/drivers/xen/xen-pciback/conf_space_header.c
@@ -0,0 +1,424 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI Backend - Handles the virtual fields in the configuration space headers.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+
+struct pci_cmd_info {
+	u16 val;	/* COMMAND value: read at init, then last guest write */
+};
+
+struct pci_bar_info {
+	u32 val;	/* value reads normally return (address | flags) */
+	u32 len_val;	/* value reads return after a sizing probe */
+	int which;	/* 0: report val, nonzero: report len_val */
+};
+
+#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
+#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
+
+/* Bits guests are allowed to control in permissive mode. */
+#define PCI_COMMAND_GUEST (PCI_COMMAND_MASTER|PCI_COMMAND_SPECIAL| \
+			   PCI_COMMAND_INVALIDATE|PCI_COMMAND_VGA_PALETTE| \
+			   PCI_COMMAND_WAIT|PCI_COMMAND_FAST_BACK)
+
+/* Allocate the COMMAND field state, seeded from the device's register. */
+static void *command_init(struct pci_dev *dev, int offset)
+{
+	struct pci_cmd_info *info;
+	int rc;
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return ERR_PTR(-ENOMEM);
+
+	rc = pci_read_config_word(dev, PCI_COMMAND, &info->val);
+	if (!rc)
+		return info;
+	kfree(info);
+	return ERR_PTR(rc);
+}
+
+/* Merge hardware's PCI_COMMAND_GUEST bits with the cached remaining bits. */
+static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data)
+{
+	const struct pci_cmd_info *info = data;
+	int rc = pci_read_config_word(dev, offset, value);
+
+	*value = (*value & PCI_COMMAND_GUEST) |
+		 (info->val & ~PCI_COMMAND_GUEST);
+	return rc;
+}
+
+/* Handle a guest write to PCI_COMMAND, acting through the PCI core helpers. */
+static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
+{
+	struct xen_pcibk_dev_data *dev_data;
+	int err;
+	u16 val;
+	struct pci_cmd_info *cmd = data;
+
+	dev_data = pci_get_drvdata(dev);
+	if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
+		if (unlikely(verbose_request))
+			printk(KERN_DEBUG DRV_NAME ": %s: enable\n",
+			       pci_name(dev));
+		err = pci_enable_device(dev);
+		if (err)
+			return err;
+		if (dev_data)
+			dev_data->enable_intx = 1;
+	} else if (pci_is_enabled(dev) && !is_enable_cmd(value)) {
+		if (unlikely(verbose_request))
+			printk(KERN_DEBUG DRV_NAME ": %s: disable\n",
+			       pci_name(dev));
+		pci_disable_device(dev);
+		if (dev_data)
+			dev_data->enable_intx = 0;
+	}
+	/* Bus mastering follows the requested PCI_COMMAND_MASTER bit. */
+	if (!dev->is_busmaster && is_master_cmd(value)) {
+		if (unlikely(verbose_request))
+			printk(KERN_DEBUG DRV_NAME ": %s: set bus master\n",
+			       pci_name(dev));
+		pci_set_master(dev);
+	} else if (dev->is_busmaster && !is_master_cmd(value)) {
+		if (unlikely(verbose_request))
+			printk(KERN_DEBUG DRV_NAME ": %s: clear bus master\n",
+			       pci_name(dev));
+		pci_clear_master(dev);
+	}
+	/* MWI changes are judged against the cached cmd->val, not the device. */
+	if (!(cmd->val & PCI_COMMAND_INVALIDATE) &&
+	    (value & PCI_COMMAND_INVALIDATE)) {
+		if (unlikely(verbose_request))
+			printk(KERN_DEBUG
+			       DRV_NAME ": %s: enable memory-write-invalidate\n",
+			       pci_name(dev));
+		err = pci_set_mwi(dev);
+		if (err) {
+			pr_warn("%s: cannot enable memory-write-invalidate (%d)\n",
+				pci_name(dev), err);
+			value &= ~PCI_COMMAND_INVALIDATE;
+		}
+	} else if ((cmd->val & PCI_COMMAND_INVALIDATE) &&
+		   !(value & PCI_COMMAND_INVALIDATE)) {
+		if (unlikely(verbose_request))
+			printk(KERN_DEBUG
+			       DRV_NAME ": %s: disable memory-write-invalidate\n",
+			       pci_name(dev));
+		pci_clear_mwi(dev);
+	}
+	/* Cache the request (MWI bit already dropped if enabling it failed). */
+	cmd->val = value;
+	if (!xen_pcibk_permissive && (!dev_data || !dev_data->permissive))
+		return 0;	/* not permissive: no direct register write */
+
+	/* Only allow the guest to control certain bits. */
+	err = pci_read_config_word(dev, offset, &val);
+	if (err || val == value)
+		return err;
+
+	value &= PCI_COMMAND_GUEST;
+	value |= val & ~PCI_COMMAND_GUEST;
+
+	return pci_write_config_word(dev, offset, value);
+}
+
+/* ROM BAR write: select length vs. address view; allow restoring the BAR. */
+static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
+	struct pci_bar_info *info = data;
+	u32 hw_val;
+
+	if (unlikely(!info)) {
+		pr_warn(DRV_NAME ": driver data not found for %s\n",
+		       pci_name(dev));
+		return XEN_PCI_ERR_op_failed;
+	}
+
+	/* A sizing probe is only recognised as a full 32-bit write;
+	 * writing individual bytes is not (yet) supported.
+	 */
+	if ((value | ~PCI_ROM_ADDRESS_MASK) == ~0U) {
+		info->which = 1;
+		return 0;
+	}
+
+	pci_read_config_dword(dev, offset, &hw_val);
+	/* Only a write of the saved value is honoured, to restore the BAR. */
+	if (hw_val != info->val && value == info->val)
+		pci_write_config_dword(dev, offset, info->val);
+	info->which = 0;
+
+	/* Do we need to support enabling/disabling the rom address here? */
+	return 0;
+}
+
+/* BAR writes are never passed through verbatim: a guest may only write ~0
+ * (to size the BAR, as a driver does when probing resource usage) or the
+ * BAR's correct saved value.
+ */
+static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
+	const unsigned int idx = (offset - PCI_BASE_ADDRESS_0) / 4;
+	const struct resource *res = dev->resource;
+	struct pci_bar_info *info = data;
+	u32 ignore, hw_val;
+
+	if (unlikely(!info)) {
+		pr_warn(DRV_NAME ": driver data not found for %s\n",
+		       pci_name(dev));
+		return XEN_PCI_ERR_op_failed;
+	}
+
+	/* Bits that need not be set for a write to count as a sizing probe:
+	 * the non-address bits of I/O and memory BARs; none for the upper
+	 * half of a 64-bit memory BAR.
+	 */
+	if (res[idx].flags & IORESOURCE_IO)
+		ignore = ~PCI_BASE_ADDRESS_IO_MASK;
+	else if (idx && (res[idx - 1].flags & IORESOURCE_MEM_64))
+		ignore = 0;
+	else
+		ignore = ~PCI_BASE_ADDRESS_MEM_MASK;
+
+	/* Sizing probes must be full 32-bit writes (no byte writes yet). */
+	if ((value | ignore) == ~0U) {
+		info->which = 1;
+		return 0;
+	}
+
+	pci_read_config_dword(dev, offset, &hw_val);
+	if (hw_val != info->val && value == info->val)
+		pci_write_config_dword(dev, offset, info->val); /* restore */
+	info->which = 0;
+	return 0;
+}
+
+/* Report the emulated BAR: len_val after a sizing probe, otherwise val. */
+static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data)
+{
+	const struct pci_bar_info *info = data;
+
+	if (unlikely(!info)) {
+		pr_warn(DRV_NAME ": driver data not found for %s\n",
+		       pci_name(dev));
+		return XEN_PCI_ERR_op_failed;
+	}
+
+	*value = info->which ? info->len_val : info->val;
+	return 0;
+}
+
+/* Derive a BAR's read-back and sizing values from dev->resource. */
+static void *bar_init(struct pci_dev *dev, int offset)
+{
+	const struct resource *r = dev->resource;
+	struct pci_bar_info *info;
+	unsigned int idx;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return ERR_PTR(-ENOMEM);
+
+	if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1) {
+		idx = PCI_ROM_RESOURCE;
+	} else {
+		idx = (offset - PCI_BASE_ADDRESS_0) / 4;
+		/* Upper dword of the 64-bit memory BAR in the previous slot. */
+		if (idx && (r[idx - 1].flags & IORESOURCE_MEM_64)) {
+			info->val = r[idx - 1].start >> 32;
+			info->len_val = -resource_size(&r[idx - 1]) >> 32;
+			return info;
+		}
+	}
+	r += idx;
+
+	/* No flags, or disabled/unset/busy: keep the zeroed state. */
+	if (!r->flags || (r->flags & (IORESOURCE_DISABLED | IORESOURCE_UNSET |
+				      IORESOURCE_BUSY)))
+		return info;
+	info->val = r->start | (r->flags & PCI_REGION_FLAG_MASK);
+	info->len_val = -resource_size(r) | (r->flags & PCI_REGION_FLAG_MASK);
+	return info;
+}
+
+/* Field reset hook: forget any pending sizing probe. */
+static void bar_reset(struct pci_dev *dev, int offset, void *data)
+{
+	/* Subsequent reads report 'val' again rather than 'len_val'. */
+	((struct pci_bar_info *)data)->which = 0;
+}
+
+static void bar_release(struct pci_dev *dev, int offset, void *data)
+{
+	kfree(data);	/* state from bar_init(); also used for command_init() */
+}
+
+/* Report the vendor ID recorded in struct pci_dev; no config access. */
+static int xen_pcibk_read_vendor(struct pci_dev *dev, int offset,
+			       u16 *value, void *data)
+{
+	*value = dev->vendor;
+	return 0;
+}
+
+/* Report the device ID recorded in struct pci_dev; no config access. */
+static int xen_pcibk_read_device(struct pci_dev *dev, int offset,
+			       u16 *value, void *data)
+{
+	*value = dev->device;
+	return 0;
+}
+
+/* Report dev->irq truncated to one byte; no config access. */
+static int interrupt_read(struct pci_dev *dev, int offset, u8 * value,
+			  void *data)
+{
+	*value = (u8) dev->irq;
+	return 0;
+}
+
+/* Let a BIST write through only if it leaves every bit other than
+ * PCI_BIST_START unchanged, or if it is exactly PCI_BIST_START.
+ */
+static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
+{
+	u8 hw;
+	int rc = pci_read_config_byte(dev, offset, &hw);
+
+	if (rc)
+		return rc;
+	if (value != PCI_BIST_START &&
+	    ((hw ^ value) & ~PCI_BIST_START))
+		return 0;	/* other writes are dropped without error */
+
+	return pci_write_config_byte(dev, offset, value);
+}
+
+static const struct config_field header_common[] = {
+	{ /* read-only; served from dev->vendor */
+	 .offset    = PCI_VENDOR_ID,
+	 .size      = 2,
+	 .u.w.read  = xen_pcibk_read_vendor,
+	},
+	{ /* read-only; served from dev->device */
+	 .offset    = PCI_DEVICE_ID,
+	 .size      = 2,
+	 .u.w.read  = xen_pcibk_read_device,
+	},
+	{ /* state from command_init() is kfree'd through bar_release() */
+	 .offset    = PCI_COMMAND,
+	 .size      = 2,
+	 .init      = command_init,
+	 .release   = bar_release,
+	 .u.w.read  = command_read,
+	 .u.w.write = command_write,
+	},
+	{ /* read-only; served from dev->irq */
+	 .offset    = PCI_INTERRUPT_LINE,
+	 .size      = 1,
+	 .u.b.read  = interrupt_read,
+	},
+	{ /* read-only: no write hook */
+	 .offset    = PCI_INTERRUPT_PIN,
+	 .size      = 1,
+	 .u.b.read  = xen_pcibk_read_config_byte,
+	},
+	{
+	 /* Any side effects of letting driver domain control cache line? */
+	 .offset    = PCI_CACHE_LINE_SIZE,
+	 .size      = 1,
+	 .u.b.read  = xen_pcibk_read_config_byte,
+	 .u.b.write = xen_pcibk_write_config_byte,
+	},
+	{ /* read-only: no write hook */
+	 .offset    = PCI_LATENCY_TIMER,
+	 .size      = 1,
+	 .u.b.read  = xen_pcibk_read_config_byte,
+	},
+	{ /* writes are filtered by bist_write() */
+	 .offset    = PCI_BIST,
+	 .size      = 1,
+	 .u.b.read  = xen_pcibk_read_config_byte,
+	 .u.b.write = bist_write,
+	},
+	{}
+};
+
+#define CFG_FIELD_BAR(reg_offset)			\
+	{ /* 32-bit BAR field using bar_* hooks */	\
+	.offset     = reg_offset,			\
+	.size       = 4,				\
+	.init       = bar_init,				\
+	.reset      = bar_reset,			\
+	.release    = bar_release,			\
+	.u.dw.read  = bar_read,				\
+	.u.dw.write = bar_write,			\
+	}
+
+#define CFG_FIELD_ROM(reg_offset)			\
+	{ /* as CFG_FIELD_BAR, but rom_write */		\
+	.offset     = reg_offset,			\
+	.size       = 4,				\
+	.init       = bar_init,				\
+	.reset      = bar_reset,			\
+	.release    = bar_release,			\
+	.u.dw.read  = bar_read,				\
+	.u.dw.write = rom_write,			\
+	}
+
+static const struct config_field header_0[] = { /* normal header BARs */
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
+	CFG_FIELD_ROM(PCI_ROM_ADDRESS),
+	{}
+};
+
+static const struct config_field header_1[] = { /* bridge header BARs */
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
+	CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
+	{}
+};
+
+/* Register the common header fields, then the BAR/ROM fields matching the
+ * device's header type. Unsupported header types yield -EINVAL.
+ */
+int xen_pcibk_config_header_add_fields(struct pci_dev *dev)
+{
+	const struct config_field *type_fields;
+	int rc;
+
+	rc = xen_pcibk_config_add_fields(dev, header_common);
+	if (rc)
+		return rc;
+
+	switch (dev->hdr_type) {
+	case PCI_HEADER_TYPE_NORMAL:
+		type_fields = header_0;
+		break;
+	case PCI_HEADER_TYPE_BRIDGE:
+		type_fields = header_1;
+		break;
+	default:
+		pr_err("%s: Unsupported header type %d!\n",
+		       pci_name(dev), dev->hdr_type);
+		return -EINVAL;
+	}
+
+	return xen_pcibk_config_add_fields(dev, type_fields);
+}
diff --git a/drivers/xen/xen-pciback/conf_space_quirks.c b/drivers/xen/xen-pciback/conf_space_quirks.c
new file mode 100644
index 000000000..ed593d104
--- /dev/null
+++ b/drivers/xen/xen-pciback/conf_space_quirks.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI Backend - Handle special overlays for broken devices.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Author: Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+LIST_HEAD(xen_pcibk_quirks);
+static inline const struct pci_device_id *
+match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
+{
+	/* Each ID component must be PCI_ANY_ID or equal the device's value. */
+	if ((id->vendor != PCI_ANY_ID && id->vendor != dev->vendor) ||
+	    (id->device != PCI_ANY_ID && id->device != dev->device))
+		return NULL;
+	if ((id->subvendor != PCI_ANY_ID &&
+	     id->subvendor != dev->subsystem_vendor) ||
+	    (id->subdevice != PCI_ANY_ID &&
+	     id->subdevice != dev->subsystem_device))
+		return NULL;
+	return ((id->class ^ dev->class) & id->class_mask) ? NULL : id;
+}
+
+static struct xen_pcibk_config_quirk *xen_pcibk_find_quirk(struct pci_dev *dev)
+{
+	struct xen_pcibk_config_quirk *q;
+
+	/* The first registered quirk whose device ID matches is returned. */
+	list_for_each_entry(q, &xen_pcibk_quirks, quirks_list) {
+		if (match_one_device(&q->devid, dev))
+			return q;
+	}
+
+	printk(KERN_DEBUG DRV_NAME ": quirk didn't match any device known\n");
+	return NULL;
+}
+
+static inline void register_quirk(struct xen_pcibk_config_quirk *quirk)
+{
+	list_add_tail(&quirk->quirks_list, &xen_pcibk_quirks);
+}
+
+/* Report whether a config field is already registered for this device at
+ * config-space offset reg: 1 if so, 0 otherwise.
+ */
+int xen_pcibk_field_is_dup(struct pci_dev *dev, unsigned int reg)
+{
+	struct xen_pcibk_dev_data *priv = pci_get_drvdata(dev);
+	struct config_field_entry *entry;
+
+	list_for_each_entry(entry, &priv->config_fields, list) {
+		if (OFFSET(entry) == reg)
+			return 1;
+	}
+	return 0;
+}
+
+/* Give a quirk field plain pass-through accessors for its size and register
+ * it with the device. Returns 0 on success, -EINVAL for a size other than
+ * 1, 2 or 4, or the result of xen_pcibk_config_add_field() on failure.
+ */
+int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field
+				    *field)
+{
+	switch (field->size) {
+	case 1:
+		field->u.b.read = xen_pcibk_read_config_byte;
+		field->u.b.write = xen_pcibk_write_config_byte;
+		break;
+	case 2:
+		field->u.w.read = xen_pcibk_read_config_word;
+		field->u.w.write = xen_pcibk_write_config_word;
+		break;
+	case 4:
+		field->u.dw.read = xen_pcibk_read_config_dword;
+		field->u.dw.write = xen_pcibk_write_config_dword;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* Propagate registration failures so the caller never assumes a
+	 * field is installed when it is not. */
+	return xen_pcibk_config_add_field(dev, field);
+}
+
+/* Register a quirk entry keyed on the device's vendor/device and subsystem
+ * IDs. Returns 0, or -ENOMEM if the entry cannot be allocated.
+ */
+int xen_pcibk_config_quirks_init(struct pci_dev *dev)
+{
+	struct xen_pcibk_config_quirk *q;
+
+	q = kzalloc(sizeof(*q), GFP_KERNEL);
+	if (!q)
+		return -ENOMEM;
+
+	q->pdev = dev;
+
+	q->devid.vendor = dev->vendor;
+	q->devid.device = dev->device;
+	q->devid.subvendor = dev->subsystem_vendor;
+	q->devid.subdevice = dev->subsystem_device;
+	/* A zero class_mask makes match_one_device() ignore the class. */
+	q->devid.class = 0;
+	q->devid.class_mask = 0;
+	q->devid.driver_data = 0UL;
+
+	register_quirk(q);
+	return 0;
+}
+
+void xen_pcibk_config_field_free(struct config_field *field)
+{
+	kfree(field);	/* NULL is accepted: kfree() ignores it */
+}
+
+/* Drop the quirk entry registered for dev. The lookup is by device ID, so
+ * the first list entry whose IDs match is the one removed and freed.
+ * Returns 0 on success or -ENXIO if no entry matches.
+ */
+int xen_pcibk_config_quirk_release(struct pci_dev *dev)
+{
+	struct xen_pcibk_config_quirk *q;
+
+	q = xen_pcibk_find_quirk(dev);
+	if (!q)
+		return -ENXIO;
+
+	list_del(&q->quirks_list);
+	kfree(q);
+
+	return 0;
+}
diff --git a/drivers/xen/xen-pciback/conf_space_quirks.h b/drivers/xen/xen-pciback/conf_space_quirks.h
new file mode 100644
index 000000000..d873abe35
--- /dev/null
+++ b/drivers/xen/xen-pciback/conf_space_quirks.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PCI Backend - Data structures for special overlays for broken devices.
+ *
+ * Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+
+#include <linux/pci.h>
+#include <linux/list.h>
+
+struct xen_pcibk_config_quirk {
+	struct list_head quirks_list;	/* link in xen_pcibk_quirks */
+	struct pci_device_id devid;	/* IDs the quirk is looked up by */
+	struct pci_dev *pdev;		/* device the entry was created for */
+};
+
+int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field
+				    *field);
+
+int xen_pcibk_config_quirks_remove_field(struct pci_dev *dev, int reg);
+
+int xen_pcibk_config_quirks_init(struct pci_dev *dev);
+
+void xen_pcibk_config_field_free(struct config_field *field);
+
+int xen_pcibk_config_quirk_release(struct pci_dev *dev);
+
+int xen_pcibk_field_is_dup(struct pci_dev *dev, unsigned int reg);
+
+#endif
diff --git a/drivers/xen/xen-pciback/passthrough.c b/drivers/xen/xen-pciback/passthrough.c
new file mode 100644
index 000000000..66e9b814c
--- /dev/null
+++ b/drivers/xen/xen-pciback/passthrough.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI Backend - Provides restricted access to the real PCI bus topology
+ *               to the frontend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/mutex.h>
+#include "pciback.h"
+
+struct passthrough_dev_data {
+	/* Access to dev_list must be protected by lock */
+	struct list_head dev_list;	/* of struct pci_dev_entry */
+	struct mutex lock;
+};
+
+/* Look up a listed device by its own domain/bus/devfn; NULL if not listed. */
+static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev,
+					       unsigned int domain,
+					       unsigned int bus,
+					       unsigned int devfn)
+{
+	struct passthrough_dev_data *priv = pdev->pci_dev_data;
+	struct pci_dev_entry *entry;
+	struct pci_dev *match = NULL;
+
+	mutex_lock(&priv->lock);
+	list_for_each_entry(entry, &priv->dev_list, list) {
+		if (domain != (unsigned int)pci_domain_nr(entry->dev->bus) ||
+		    bus != (unsigned int)entry->dev->bus->number ||
+		    devfn != entry->dev->devfn)
+			continue;
+		match = entry->dev;
+		break;
+	}
+
+	mutex_unlock(&priv->lock);
+
+	return match;
+}
+
+/* Track dev in this backend's list and publish it under its own
+ * domain/bus/devfn. Returns -ENOMEM or the publish callback's result.
+ */
+static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
+				   struct pci_dev *dev,
+				   int devid, publish_pci_dev_cb publish_cb)
+{
+	struct passthrough_dev_data *priv = pdev->pci_dev_data;
+	struct pci_dev_entry *entry;
+	unsigned int domain, bus;
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+	entry->dev = dev;
+
+	mutex_lock(&priv->lock);
+	list_add_tail(&entry->list, &priv->dev_list);
+	mutex_unlock(&priv->lock);
+
+	/* Publish the device at the address it has on the real bus. */
+	domain = (unsigned int)pci_domain_nr(dev->bus);
+	bus = (unsigned int)dev->bus->number;
+
+	return publish_cb(pdev, domain, bus, dev->devfn, devid);
+}
+
+/* Stop tracking dev and return it to pcistub (device-locked if 'lock'). */
+static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
+					struct pci_dev *dev, bool lock)
+{
+	struct passthrough_dev_data *priv = pdev->pci_dev_data;
+	struct pci_dev_entry *entry, *next;
+	bool found = false;
+
+	mutex_lock(&priv->lock);
+	list_for_each_entry_safe(entry, next, &priv->dev_list, list) {
+		if (entry->dev != dev)
+			continue;
+		list_del(&entry->list);
+		kfree(entry);
+		found = true;
+	}
+	mutex_unlock(&priv->lock);
+
+	if (!found)
+		return;
+
+	if (lock)
+		device_lock(&dev->dev);
+	pcistub_put_pci_dev(dev);
+	if (lock)
+		device_unlock(&dev->dev);
+}
+
+/* Allocate pdev's passthrough state with an empty device list. */
+static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev)
+{
+	struct passthrough_dev_data *priv;
+
+	priv = kmalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&priv->dev_list);
+	mutex_init(&priv->lock);
+
+	pdev->pci_dev_data = priv;
+
+	return 0;
+}
+
+/* Publish a root for each listed device that has no listed parent bridge. */
+static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev,
+					 publish_pci_root_cb publish_root_cb)
+{
+	int err = 0;
+	struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+	struct pci_dev_entry *dev_entry, *e;
+	struct pci_dev *dev;
+	int found;
+	unsigned int domain, bus;
+
+	mutex_lock(&dev_data->lock);
+	list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
+		/* Only publish this device as a root if none of its
+		 * parent bridges are exported
+		 */
+		found = 0;
+		dev = dev_entry->dev->bus->self;
+		for (; !found && dev != NULL; dev = dev->bus->self) {
+			list_for_each_entry(e, &dev_data->dev_list, list) {
+				if (dev == e->dev) {
+					found = 1;
+					break;
+				}
+			}
+		}
+
+		domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
+		bus = (unsigned int)dev_entry->dev->bus->number;
+
+		if (!found) {
+			err = publish_root_cb(pdev, domain, bus);
+			if (err)
+				break;	/* first failure stops the walk */
+		}
+	}
+
+	mutex_unlock(&dev_data->lock);
+
+	return err;
+}
+
+/* Return every tracked device to pcistub, then free the backend state. */
+static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev)
+{
+	struct passthrough_dev_data *priv = pdev->pci_dev_data;
+	struct pci_dev_entry *entry, *next;
+
+	list_for_each_entry_safe(entry, next, &priv->dev_list, list) {
+		list_del(&entry->list);
+		device_lock(&entry->dev->dev);
+		pcistub_put_pci_dev(entry->dev);
+		device_unlock(&entry->dev->dev);
+		kfree(entry);
+	}
+
+	kfree(priv);
+	pdev->pci_dev_data = NULL;
+}
+
+static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
+					struct xen_pcibk_device *pdev,
+					unsigned int *domain, unsigned int *bus,
+					unsigned int *devfn)
+{
+	*domain = pci_domain_nr(pcidev->bus);	/* device's own address... */
+	*bus = pcidev->bus->number;
+	*devfn = pcidev->devfn;			/* ...is reported unchanged */
+	return 1;	/* unconditional: pdev's list is not consulted */
+}
+
+const struct xen_pcibk_backend xen_pcibk_passthrough_backend = {
+	.name           = "passthrough",
+	.init           = __xen_pcibk_init_devices,
+	.free		= __xen_pcibk_release_devices,
+	.find           = __xen_pcibk_get_pcifront_dev,
+	.publish        = __xen_pcibk_publish_pci_roots,
+	.release        = __xen_pcibk_release_pci_dev,
+	.add            = __xen_pcibk_add_pci_dev,
+	.get            = __xen_pcibk_get_pci_dev,
+};
diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c
new file mode 100644
index 000000000..adf3aae29
--- /dev/null
+++ b/drivers/xen/xen-pciback/pci_stub.c
@@ -0,0 +1,1652 @@
+/*
+ * PCI Stub Driver - Grabs devices in backend to be exported later
+ *
+ * Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rwsem.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+#include <linux/pci.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/interface/physdev.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+#define PCISTUB_DRIVER_NAME "pciback"
+
+static char *pci_devs_to_hide;
+wait_queue_head_t xen_pcibk_aer_wait_queue;
+/* Serializes AER handling against xen_pcibk remove/reconfigure operations:
+ * we must avoid xen_pcibk devices being removed in the middle of AER handling.
+ */
+static DECLARE_RWSEM(pcistub_sem);
+module_param_named(hide, pci_devs_to_hide, charp, 0444);
+
+struct pcistub_device_id {
+	struct list_head slot_list;	/* link in pcistub_device_ids */
+	int domain;
+	unsigned char bus;
+	unsigned int devfn;
+};
+static LIST_HEAD(pcistub_device_ids);
+static DEFINE_SPINLOCK(device_ids_lock);
+
+struct pcistub_device {
+	struct kref kref;		/* released by pcistub_device_release() */
+	struct list_head dev_list;	/* on pcistub_devices or seized_devices */
+	spinlock_t lock;		/* taken around accesses to pdev */
+
+	struct pci_dev *dev;		/* reference held via pci_dev_get() */
+	struct xen_pcibk_device *pdev;/* non-NULL if struct pci_dev is in use */
+};
+
+/* Access to pcistub_devices & seized_devices lists and the initialize_devices
+ * flag must be locked with pcistub_devices_lock
+ */
+static DEFINE_SPINLOCK(pcistub_devices_lock);
+static LIST_HEAD(pcistub_devices);
+
+/* wait for device_initcall before initializing our devices
+ * (see pcistub_init_devices_late)
+ */
+static int initialize_devices;
+static LIST_HEAD(seized_devices);
+
+/* New stub entry: refcount of one plus a pci_dev_get() reference on dev. */
+static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
+{
+	struct pcistub_device *stub;
+
+	dev_dbg(&dev->dev, "pcistub_device_alloc\n");
+
+	stub = kzalloc(sizeof(*stub), GFP_KERNEL);
+	if (!stub)
+		return NULL;
+
+	stub->dev = pci_dev_get(dev);
+	if (stub->dev) {
+		kref_init(&stub->kref);
+		spin_lock_init(&stub->lock);
+		return stub;
+	}
+
+	kfree(stub);
+	return NULL;
+}
+
+/* kref release callback for pcistub_device_put(); never call it directly. */
+static void pcistub_device_release(struct kref *kref)
+{
+	struct pcistub_device *psdev;
+	struct pci_dev *dev;
+	struct xen_pcibk_dev_data *dev_data;
+
+	psdev = container_of(kref, struct pcistub_device, kref);
+	dev = psdev->dev;
+	dev_data = pci_get_drvdata(dev);
+
+	dev_dbg(&dev->dev, "pcistub_device_release\n");
+
+	xen_unregister_device_domain_owner(dev);
+
+	/* Call the reset function which does not take lock as this
+	 * is called from "unbind" which takes a device_lock mutex.
+	 */
+	__pci_reset_function_locked(dev);
+	if (dev_data &&
+	    pci_load_and_free_saved_state(dev, &dev_data->pci_saved_state))
+		dev_info(&dev->dev, "Could not reload PCI state\n");
+	else	/* saved state loaded, or there is no dev_data at all */
+		pci_restore_state(dev);
+	/* Release MSI-X in the hypervisor; -ENOSYS is not reported. */
+	if (dev->msix_cap) {
+		struct physdev_pci_device ppdev = {
+			.seg = pci_domain_nr(dev->bus),
+			.bus = dev->bus->number,
+			.devfn = dev->devfn
+		};
+		int err = HYPERVISOR_physdev_op(PHYSDEVOP_release_msix,
+						&ppdev);
+
+		if (err && err != -ENOSYS)
+			dev_warn(&dev->dev, "MSI-X release failed (%d)\n",
+				 err);
+	}
+
+	/* Disable the device */
+	xen_pcibk_reset_device(dev);
+
+	kfree(dev_data);
+	pci_set_drvdata(dev, NULL);
+
+	/* Clean-up the device */
+	xen_pcibk_config_free_dyn_fields(dev);
+	xen_pcibk_config_free_dev(dev);
+
+	pci_clear_dev_assigned(dev);
+	pci_dev_put(dev);	/* ref from pcistub_device_alloc() */
+
+	kfree(psdev);
+}
+
+static inline void pcistub_device_get(struct pcistub_device *psdev)
+{
+	kref_get(&psdev->kref);
+}
+
+static inline void pcistub_device_put(struct pcistub_device *psdev)
+{
+	kref_put(&psdev->kref, pcistub_device_release);
+}
+
+/* List walk only: no lock and no reference is taken here. */
+static struct pcistub_device *pcistub_device_find_locked(int domain, int bus,
+							 int slot, int func)
+{
+	struct pcistub_device *stub;
+
+	list_for_each_entry(stub, &pcistub_devices, dev_list) {
+		struct pci_dev *pci = stub->dev;
+
+		if (pci && domain == pci_domain_nr(pci->bus) &&
+		    bus == pci->bus->number && slot == PCI_SLOT(pci->devfn) &&
+		    func == PCI_FUNC(pci->devfn))
+			return stub;
+	}
+
+	return NULL;
+}
+
+/* Locked lookup by address; a match is returned with an extra kref taken. */
+static struct pcistub_device *pcistub_device_find(int domain, int bus,
+						  int slot, int func)
+{
+	struct pcistub_device *psdev;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+	psdev = pcistub_device_find_locked(domain, bus, slot, func);
+	if (psdev)
+		pcistub_device_get(psdev);
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	return psdev;
+}
+
+/* Claim psdev for pdev; returns its pci_dev, or NULL if already claimed. */
+static struct pci_dev *pcistub_device_get_pci_dev(struct xen_pcibk_device *pdev,
+						  struct pcistub_device *psdev)
+{
+	struct pci_dev *pci_dev = NULL;
+	unsigned long flags;
+
+	pcistub_device_get(psdev);	/* kept only if the claim succeeds */
+	spin_lock_irqsave(&psdev->lock, flags);
+	if (!psdev->pdev) {
+		psdev->pdev = pdev;
+		pci_dev = psdev->dev;
+	}
+	spin_unlock_irqrestore(&psdev->lock, flags);
+	/* Another xen_pcibk_device owns it: give the reference back. */
+	if (!pci_dev)
+		pcistub_device_put(psdev);
+
+	return pci_dev;
+}
+
+/* Claim the stub device at the given address for pdev; NULL if unavailable. */
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev,
+					    int domain, int bus,
+					    int slot, int func)
+{
+	struct pcistub_device *psdev;
+	struct pci_dev *found_dev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+	psdev = pcistub_device_find_locked(domain, bus, slot, func);
+	if (psdev)
+		found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	return found_dev;
+}
+
+/* Claim the stub entry wrapping dev for pdev; NULL if absent or in use. */
+struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev,
+				    struct pci_dev *dev)
+{
+	struct pcistub_device *psdev;
+	struct pci_dev *found_dev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (psdev->dev == dev) {
+			found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	return found_dev;
+}
+
+/*
+ * Called when:
+ *  - XenBus state has been reconfigured (pci unplug). See xen_pcibk_remove_device
+ *  - XenBus state has been disconnected (guest shutdown). See xen_pcibk_xenbus_remove
+ *  - 'echo BDF > unbind' on pciback module with no guest attached. See pcistub_remove
+ *  - 'echo BDF > unbind' with a guest still using it. See pcistub_remove
+ *
+ *  As such we have to be careful.
+ *
+ *  To make this easier, the caller has to hold the device lock.
+ */
+void pcistub_put_pci_dev(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev, *found_psdev = NULL;
+	unsigned long flags;
+	struct xen_pcibk_dev_data *dev_data;
+	int ret;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (psdev->dev == dev) {
+			found_psdev = psdev;
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	if (WARN_ON(!found_psdev))
+		return;
+
+	/* Hold this lock so the link between pcistub and xen_pcibk is not
+	 * broken while AER handling is in progress.
+	 */
+	down_write(&pcistub_sem);
+	/* Cleanup our device
+	 * (so it's ready for the next domain)
+	 */
+	device_lock_assert(&dev->dev);
+	__pci_reset_function_locked(dev);
+	/* NOTE(review): dev_data is dereferenced below without a NULL check. */
+	dev_data = pci_get_drvdata(dev);
+	ret = pci_load_saved_state(dev, dev_data->pci_saved_state);
+	if (!ret) {
+		/*
+		 * The usual sequence is pci_save_state & pci_restore_state
+		 * but the guest might have messed the configuration space up.
+		 * Use the initial version (when device was bound to us).
+		 */
+		pci_restore_state(dev);
+	} else
+		dev_info(&dev->dev, "Could not reload PCI state\n");
+	/* This disables the device. */
+	xen_pcibk_reset_device(dev);
+
+	/* And clean up our emulated fields. */
+	xen_pcibk_config_reset_dev(dev);
+	xen_pcibk_config_free_dyn_fields(dev);
+
+	xen_unregister_device_domain_owner(dev);
+
+	spin_lock_irqsave(&found_psdev->lock, flags);
+	found_psdev->pdev = NULL;
+	spin_unlock_irqrestore(&found_psdev->lock, flags);
+
+	pcistub_device_put(found_psdev);
+	up_write(&pcistub_sem);
+}
+
+static int pcistub_match_one(struct pci_dev *dev,
+			     struct pcistub_device_id *pdev_id)
+{
+	/* Walk from the device up through its parent bridges; a hit on
+	 * any of them (by domain, bus and devfn) counts as a match.
+	 */
+	while (dev) {
+		if (pci_domain_nr(dev->bus) == pdev_id->domain &&
+		    dev->bus->number == pdev_id->bus &&
+		    dev->devfn == pdev_id->devfn)
+			return 1;
+		/* Sometimes topmost bridge links to itself. */
+		if (dev == dev->bus->self)
+			break;
+		dev = dev->bus->self;
+	}
+
+	return 0;
+}
+
+/* Return 1 if dev matches any entry of pcistub_device_ids, else 0. */
+static int pcistub_match(struct pci_dev *dev)
+{
+	struct pcistub_device_id *id;
+	unsigned long flags;
+	int matched = 0;
+
+	spin_lock_irqsave(&device_ids_lock, flags);
+	list_for_each_entry(id, &pcistub_device_ids, slot_list) {
+		matched = pcistub_match_one(dev, id);
+		if (matched)
+			break;
+	}
+	spin_unlock_irqrestore(&device_ids_lock, flags);
+
+	return matched;
+}
+
+/* Set up driver data and config-space emulation, then disable the device. */
+static int pcistub_init_device(struct pci_dev *dev)
+{
+	struct xen_pcibk_dev_data *dev_data;
+	int err = 0;
+
+	dev_dbg(&dev->dev, "initializing...\n");
+	/* The PCI backend is not intended to be a module (or to work with
+	 * removable PCI devices (yet). If it were, xen_pcibk_config_free()
+	 * would need to be called somewhere to free the memory allocated
+	 * here and then to call kfree(pci_get_drvdata(psdev->dev)).
+	 */
+	dev_data = kzalloc(sizeof(*dev_data) +  strlen(DRV_NAME "[]")
+				+ strlen(pci_name(dev)) + 1, GFP_KERNEL);
+	if (!dev_data) {
+		err = -ENOMEM;
+		goto out;
+	}
+	pci_set_drvdata(dev, dev_data);
+
+	/*
+	 * Setup name for fake IRQ handler. It will only be enabled
+	 * once the device is turned on by the guest.
+	 */
+	sprintf(dev_data->irq_name, DRV_NAME "[%s]", pci_name(dev));
+
+	dev_dbg(&dev->dev, "initializing config\n");
+	/* NOTE(review): re-initializes the global AER wait queue per device. */
+	init_waitqueue_head(&xen_pcibk_aer_wait_queue);
+	err = xen_pcibk_config_init_dev(dev);
+	if (err)
+		goto out;
+
+	/* HACK: Force device (& ACPI) to determine what IRQ it's on - we
+	 * must do this here because pcibios_enable_device may specify
+	 * the pci device's true irq (and possibly its other resources)
+	 * if they differ from what's in the configuration space.
+	 * This makes the assumption that the device's resources won't
+	 * change after this point (otherwise this code may break!)
+	 */
+	dev_dbg(&dev->dev, "enabling device\n");
+	err = pci_enable_device(dev);
+	if (err)
+		goto config_release;
+
+	if (dev->msix_cap) {
+		struct physdev_pci_device ppdev = {
+			.seg = pci_domain_nr(dev->bus),
+			.bus = dev->bus->number,
+			.devfn = dev->devfn
+		};
+
+		err = HYPERVISOR_physdev_op(PHYSDEVOP_prepare_msix, &ppdev);
+		if (err && err != -ENOSYS)
+			dev_err(&dev->dev, "MSI-X preparation failed (%d)\n",
+				err);
+	}
+
+	/* We need the device active to save the state. */
+	dev_dbg(&dev->dev, "save state of device\n");
+	pci_save_state(dev);
+	dev_data->pci_saved_state = pci_store_saved_state(dev);
+	if (!dev_data->pci_saved_state)
+		dev_err(&dev->dev, "Could not store PCI conf saved state!\n");
+	else {
+		dev_dbg(&dev->dev, "resetting (FLR, D3, etc) the device\n");
+		__pci_reset_function_locked(dev);
+		pci_restore_state(dev);
+	}
+	/* Now disable the device (this also ensures some private device
+	 * data is setup before we export)
+	 */
+	dev_dbg(&dev->dev, "reset device\n");
+	xen_pcibk_reset_device(dev);
+
+	pci_set_dev_assigned(dev);
+	return 0;
+
+config_release:
+	xen_pcibk_config_free_dev(dev);
+
+out:
+	pci_set_drvdata(dev, NULL);
+	kfree(dev_data);	/* NULL here if the allocation itself failed */
+	return err;
+}
+
+/*
+ * Drain seized_devices: run pcistub_init_device() on each entry and move
+ * it to pcistub_devices, or free it if that fails. This full initialization
+ * is deferred until device_initcall because some initialization still
+ * happens on devices during fs_initcall. Always returns 0.
+ */
+static int __init pcistub_init_devices_late(void)
+{
+	struct pcistub_device *psdev;
+	unsigned long flags;
+	int err = 0;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	while (!list_empty(&seized_devices)) {
+		psdev = container_of(seized_devices.next,
+				     struct pcistub_device, dev_list);
+		list_del(&psdev->dev_list);
+
+		spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+		err = pcistub_init_device(psdev->dev);
+		if (err) {
+			dev_err(&psdev->dev->dev,
+				"error %d initializing device\n", err);
+			kfree(psdev);
+			psdev = NULL; /* skip the list_add_tail() below */
+		}
+
+		spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+		if (psdev)
+			list_add_tail(&psdev->dev_list, &pcistub_devices);
+	}
+	/* from now on pcistub_seize() initializes devices immediately */
+	initialize_devices = 1;
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+	return 0;
+}
+/* Add @new as domain:bus:devfn to pcistub_device_ids; kfree() a duplicate. */
+static void pcistub_device_id_add_list(struct pcistub_device_id *new,
+				       int domain, int bus, unsigned int devfn)
+{
+	struct pcistub_device_id *pci_dev_id;
+	unsigned long flags;
+	int found = 0;
+
+	spin_lock_irqsave(&device_ids_lock, flags);
+
+	list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
+		if (pci_dev_id->domain == domain && pci_dev_id->bus == bus &&
+		    pci_dev_id->devfn == devfn) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		new->domain = domain;
+		new->bus = bus;
+		new->devfn = devfn;
+		list_add_tail(&new->slot_list, &pcistub_device_ids);
+	}
+
+	spin_unlock_irqrestore(&device_ids_lock, flags);
+	/* @new was not linked in, so it is freed here, outside the lock */
+	if (found)
+		kfree(new);
+}
+/* Allocate a pcistub_device for @dev and initialize it now or defer it. */
+static int pcistub_seize(struct pci_dev *dev,
+			 struct pcistub_device_id *pci_dev_id)
+{
+	struct pcistub_device *psdev;
+	unsigned long flags;
+	int err = 0;
+
+	psdev = pcistub_device_alloc(dev);
+	if (!psdev) {
+		kfree(pci_dev_id);
+		return -ENOMEM;
+	}
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	if (initialize_devices) {
+		spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+		/* don't want irqs disabled when calling pcistub_init_device */
+		err = pcistub_init_device(psdev->dev);
+
+		spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+		if (!err)
+			list_add(&psdev->dev_list, &pcistub_devices);
+	} else {
+		dev_dbg(&dev->dev, "deferring initialization\n");
+		list_add(&psdev->dev_list, &seized_devices);
+	}
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	/* @pci_dev_id is consumed either way: freed on error, else listed */
+	if (err) {
+		kfree(pci_dev_id);
+		pcistub_device_put(psdev);
+	} else if (pci_dev_id)
+		pcistub_device_id_add_list(pci_dev_id, pci_domain_nr(dev->bus),
+					   dev->bus->number, dev->devfn);
+
+	return err;
+}
+
+/* PCI .probe callback, called on 'bind'. This means we must _NOT_ call
+ * pci_reset_function or other functions that take the sysfs lock. */
+static int pcistub_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	int err = 0, match;
+	struct pcistub_device_id *pci_dev_id = NULL;
+
+	dev_dbg(&dev->dev, "probing...\n");
+
+	match = pcistub_match(dev);
+
+	if ((dev->driver_override &&
+	     !strcmp(dev->driver_override, PCISTUB_DRIVER_NAME)) ||
+	    match) {
+
+		if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
+		    && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
+			dev_err(&dev->dev, "can't export pci devices that "
+				"don't have a normal (0) or bridge (1) "
+				"header type!\n");
+			err = -ENODEV;
+			goto out;
+		}
+		/* claimed via driver_override only: allocate a new list id */
+		if (!match) {
+			pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
+			if (!pci_dev_id) {
+				err = -ENOMEM;
+				goto out;
+			}
+		}
+
+		dev_info(&dev->dev, "seizing device\n");
+		err = pcistub_seize(dev, pci_dev_id);
+	} else
+		/* Didn't find the device */
+		err = -ENODEV;
+
+out:
+	return err;
+}
+
+/* PCI .remove callback, called on 'unbind'. This means we must _NOT_ call
+ * pci_reset_function or other functions that take the sysfs lock. */
+static void pcistub_remove(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev, *found_psdev = NULL;
+	unsigned long flags;
+
+	dev_dbg(&dev->dev, "removing\n");
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	xen_pcibk_config_quirk_release(dev);
+
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (psdev->dev == dev) {
+			found_psdev = psdev;
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+	if (found_psdev) {
+		dev_dbg(&dev->dev, "found device to remove %s\n",
+			found_psdev->pdev ? "- in-use" : "");
+		/* a non-NULL pdev means the device is still in use */
+		if (found_psdev->pdev) {
+			int domid = xen_find_device_domain_owner(dev);
+
+			pr_warn("****** removing device %s while still in-use by domain %d! ******\n",
+			       pci_name(found_psdev->dev), domid);
+			pr_warn("****** driver domain may still access this device's i/o resources!\n");
+			pr_warn("****** shutdown driver domain before binding device\n");
+			pr_warn("****** to other drivers or domains\n");
+
+			/* N.B. This ends up calling pcistub_put_pci_dev which ends up
+			 * doing the FLR. */
+			xen_pcibk_release_pci_dev(found_psdev->pdev,
+						found_psdev->dev,
+						false /* caller holds the lock. */);
+		}
+
+		spin_lock_irqsave(&pcistub_devices_lock, flags);
+		list_del(&found_psdev->dev_list);
+		spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+		/* the final put for releasing from the list */
+		pcistub_device_put(found_psdev);
+	}
+}
+/* Wildcard id table: vendor, device and subsystem ids are all PCI_ANY_ID. */
+static const struct pci_device_id pcistub_ids[] = {
+	{
+	 .vendor = PCI_ANY_ID,
+	 .device = PCI_ANY_ID,
+	 .subvendor = PCI_ANY_ID,
+	 .subdevice = PCI_ANY_ID,
+	 },
+	{0,}, /* terminating entry */
+};
+/* Write aerState = "aerfail" to the xenstore backend node of @psdev's guest. */
+#define PCI_NODENAME_MAX 40
+static void kill_domain_by_device(struct pcistub_device *psdev)
+{
+	struct xenbus_transaction xbt;
+	int err;
+	char nodename[PCI_NODENAME_MAX];
+
+	BUG_ON(!psdev);
+	snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0",
+		psdev->pdev->xdev->otherend_id);
+
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		dev_err(&psdev->dev->dev,
+			"error %d when start xenbus transaction\n", err);
+		return;
+	}
+	/* PV AER handlers set this flag. NOTE(review): result is not checked */
+	xenbus_printf(xbt, nodename, "aerState" , "aerfail");
+	err = xenbus_transaction_end(xbt, 0);
+	if (err) {
+		if (err == -EAGAIN)
+			goto again; /* retry the whole transaction */
+		dev_err(&psdev->dev->dev,
+			"error %d when end xenbus transaction\n", err);
+		return;
+	}
+}
+
+/* Each AER recovery step (error_detected, mmio_enabled, etc.) needs the
+ * frontend and the backend to cooperate. In xen_pcibk all steps do the same
+ * job: post the request in sh_info->aer_op, notify pcifront, wait for its ack.
+ */
+static pci_ers_result_t common_process(struct pcistub_device *psdev,
+				       pci_channel_state_t state, int aer_cmd,
+				       pci_ers_result_t result)
+{
+	pci_ers_result_t res = result;
+	struct xen_pcie_aer_op *aer_op;
+	struct xen_pcibk_device *pdev = psdev->pdev;
+	struct xen_pci_sharedinfo *sh_info = pdev->sh_info;
+	int ret;
+
+	/* with PV AER drivers */
+	aer_op = &(sh_info->aer_op);
+	aer_op->cmd = aer_cmd ;
+	/* useful for error_detected callback */
+	aer_op->err = state;
+	/* pcifront_end BDF */
+	ret = xen_pcibk_get_pcifront_dev(psdev->dev, psdev->pdev,
+		&aer_op->domain, &aer_op->bus, &aer_op->devfn);
+	if (!ret) {
+		dev_err(&psdev->dev->dev,
+			DRV_NAME ": failed to get pcifront device\n");
+		return PCI_ERS_RESULT_NONE;
+	}
+	wmb();
+
+	dev_dbg(&psdev->dev->dev,
+			DRV_NAME ": aer_op %x dom %x bus %x devfn %x\n",
+			aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn);
+	/* Local flag marking an outstanding AER request; the xen_pcibk
+	 * callback uses this flag to judge whether it needs to check for
+	 * pcifront's AER service ack signal.
+	 */
+	set_bit(_PCIB_op_pending, (unsigned long *)&pdev->flags);
+
+	/* It is possible that a pcifront conf_read_write ops request invokes
+	 * the callback, which causes a spurious execution of wake_up.
+	 * Yet it is harmless and better than a spinlock here.
+	 */
+	set_bit(_XEN_PCIB_active,
+		(unsigned long *)&sh_info->flags);
+	wmb();
+	notify_remote_via_irq(pdev->evtchn_irq);
+
+	/* Enable IRQ to signal "request done". */
+	xen_pcibk_lateeoi(pdev, 0);
+
+	ret = wait_event_timeout(xen_pcibk_aer_wait_queue,
+				 !(test_bit(_XEN_PCIB_active, (unsigned long *)
+				 &sh_info->flags)), 300*HZ);
+
+	/* Enable IRQ for pcifront request if not already active. */
+	if (!test_bit(_PDEVF_op_active, &pdev->flags))
+		xen_pcibk_lateeoi(pdev, 0);
+	/* NOTE(review): the timeout path returns with _PCIB_op_pending set */
+	if (!ret) {
+		if (test_bit(_XEN_PCIB_active,
+			(unsigned long *)&sh_info->flags)) {
+			dev_err(&psdev->dev->dev,
+				"pcifront aer process not responding!\n");
+			clear_bit(_XEN_PCIB_active,
+			  (unsigned long *)&sh_info->flags);
+			aer_op->err = PCI_ERS_RESULT_NONE;
+			return res; /* still the caller-supplied @result */
+		}
+	}
+	clear_bit(_PCIB_op_pending, (unsigned long *)&pdev->flags);
+
+	res = (pci_ers_result_t)aer_op->err;
+	return res;
+}
+
+/*
+ * xen_pcibk_slot_reset: send the slot_reset request to pcifront, provided the
+ * guest has set the _XEN_PCIB_AERHANDLER flag, and then wait for pcifront's
+ * ack.
+ * @dev: pointer to the PCI device
+ * The return value is used by the aer_core do_recovery policy.
+ */
+static pci_ers_result_t xen_pcibk_slot_reset(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev;
+	pci_ers_result_t result;
+
+	result = PCI_ERS_RESULT_RECOVERED;
+	dev_dbg(&dev->dev, "xen_pcibk_slot_reset(bus:%x,devfn:%x)\n",
+		dev->bus->number, dev->devfn);
+
+	down_write(&pcistub_sem);
+	psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+				dev->bus->number,
+				PCI_SLOT(dev->devfn),
+				PCI_FUNC(dev->devfn));
+
+	if (!psdev || !psdev->pdev) {
+		dev_err(&dev->dev,
+			DRV_NAME " device is not found/assigned\n");
+		goto end;
+	}
+
+	if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+			" by HVM, kill it\n");
+		kill_domain_by_device(psdev);
+		goto end;
+	}
+
+	if (!test_bit(_XEN_PCIB_AERHANDLER,
+		(unsigned long *)&psdev->pdev->sh_info->flags)) {
+		dev_err(&dev->dev,
+			"guest with no AER driver should have been killed\n");
+		goto end;
+	}
+	result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result);
+
+	if (result == PCI_ERS_RESULT_NONE ||
+		result == PCI_ERS_RESULT_DISCONNECT) {
+		dev_dbg(&dev->dev,
+			"No AER slot_reset service or disconnected!\n");
+		kill_domain_by_device(psdev);
+	}
+end:
+	if (psdev)
+		pcistub_device_put(psdev);
+	up_write(&pcistub_sem);
+	return result;
+
+}
+
+
+/* xen_pcibk_mmio_enabled: send the mmio_enabled request to pcifront,
+ * provided the guest has set the _XEN_PCIB_AERHANDLER flag, and then wait
+ * for pcifront's ack.
+ * @dev: pointer to the PCI device
+ * The return value is used by the aer_core do_recovery policy.
+ */
+
+static pci_ers_result_t xen_pcibk_mmio_enabled(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev;
+	pci_ers_result_t result;
+
+	result = PCI_ERS_RESULT_RECOVERED;
+	dev_dbg(&dev->dev, "xen_pcibk_mmio_enabled(bus:%x,devfn:%x)\n",
+		dev->bus->number, dev->devfn);
+
+	down_write(&pcistub_sem);
+	psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+				dev->bus->number,
+				PCI_SLOT(dev->devfn),
+				PCI_FUNC(dev->devfn));
+
+	if (!psdev || !psdev->pdev) {
+		dev_err(&dev->dev,
+			DRV_NAME " device is not found/assigned\n");
+		goto end;
+	}
+
+	if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+			" by HVM, kill it\n");
+		kill_domain_by_device(psdev);
+		goto end;
+	}
+
+	if (!test_bit(_XEN_PCIB_AERHANDLER,
+		(unsigned long *)&psdev->pdev->sh_info->flags)) {
+		dev_err(&dev->dev,
+			"guest with no AER driver should have been killed\n");
+		goto end;
+	}
+	result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result);
+
+	if (result == PCI_ERS_RESULT_NONE ||
+		result == PCI_ERS_RESULT_DISCONNECT) {
+		dev_dbg(&dev->dev,
+			"No AER mmio_enabled service or disconnected!\n");
+		kill_domain_by_device(psdev);
+	}
+end:
+	if (psdev)
+		pcistub_device_put(psdev);
+	up_write(&pcistub_sem);
+	return result;
+}
+
+/* xen_pcibk_error_detected: send the error_detected request to pcifront,
+ * provided the guest has set the _XEN_PCIB_AERHANDLER flag, and then wait
+ * for pcifront's ack.
+ * @dev: pointer to the PCI device
+ * @error: the current PCI connection state
+ * The return value is used by the aer_core do_recovery policy.
+ */
+
+static pci_ers_result_t xen_pcibk_error_detected(struct pci_dev *dev,
+	pci_channel_state_t error)
+{
+	struct pcistub_device *psdev;
+	pci_ers_result_t result;
+
+	result = PCI_ERS_RESULT_CAN_RECOVER;
+	dev_dbg(&dev->dev, "xen_pcibk_error_detected(bus:%x,devfn:%x)\n",
+		dev->bus->number, dev->devfn);
+
+	down_write(&pcistub_sem);
+	psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+				dev->bus->number,
+				PCI_SLOT(dev->devfn),
+				PCI_FUNC(dev->devfn));
+
+	if (!psdev || !psdev->pdev) {
+		dev_err(&dev->dev,
+			DRV_NAME " device is not found/assigned\n");
+		goto end;
+	}
+
+	if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+			" by HVM, kill it\n");
+		kill_domain_by_device(psdev);
+		goto end;
+	}
+
+	/* Guest owns the device yet no AER handler registered, kill guest */
+	if (!test_bit(_XEN_PCIB_AERHANDLER,
+		(unsigned long *)&psdev->pdev->sh_info->flags)) {
+		dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n");
+		kill_domain_by_device(psdev);
+		goto end;
+	}
+	result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result);
+
+	if (result == PCI_ERS_RESULT_NONE ||
+		result == PCI_ERS_RESULT_DISCONNECT) {
+		dev_dbg(&dev->dev,
+			"No AER error_detected service or disconnected!\n");
+		kill_domain_by_device(psdev);
+	}
+end:
+	if (psdev)
+		pcistub_device_put(psdev);
+	up_write(&pcistub_sem);
+	return result;
+}
+
+/* xen_pcibk_error_resume: send the error_resume request to pcifront,
+ * provided the guest has set the _XEN_PCIB_AERHANDLER flag, and then wait
+ * for pcifront's ack.
+ * @dev: pointer to the PCI device
+ */
+
+static void xen_pcibk_error_resume(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev;
+
+	dev_dbg(&dev->dev, "xen_pcibk_error_resume(bus:%x,devfn:%x)\n",
+		dev->bus->number, dev->devfn);
+
+	down_write(&pcistub_sem);
+	psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+				dev->bus->number,
+				PCI_SLOT(dev->devfn),
+				PCI_FUNC(dev->devfn));
+
+	if (!psdev || !psdev->pdev) {
+		dev_err(&dev->dev,
+			DRV_NAME " device is not found/assigned\n");
+		goto end;
+	}
+
+	if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, DRV_NAME " device is not connected or owned"
+			" by HVM, kill it\n");
+		kill_domain_by_device(psdev);
+		goto end;
+	}
+
+	if (!test_bit(_XEN_PCIB_AERHANDLER,
+		(unsigned long *)&psdev->pdev->sh_info->flags)) {
+		dev_err(&dev->dev,
+			"guest with no AER driver should have been killed\n");
+		kill_domain_by_device(psdev);
+		goto end;
+	}
+	common_process(psdev, 1, XEN_PCI_OP_aer_resume,
+		       PCI_ERS_RESULT_RECOVERED); /* result is not used */
+end:
+	if (psdev)
+		pcistub_device_put(psdev);
+	up_write(&pcistub_sem);
+	return;
+}
+
+/* xen_pcibk AER handling: callbacks installed as the driver's .err_handler */
+static const struct pci_error_handlers xen_pcibk_error_handler = {
+	.error_detected = xen_pcibk_error_detected,
+	.mmio_enabled = xen_pcibk_mmio_enabled,
+	.slot_reset = xen_pcibk_slot_reset,
+	.resume = xen_pcibk_error_resume,
+};
+
+/*
+ * Note: there is deliberately no MODULE_DEVICE_TABLE entry here: this is not
+ * a driver for a normal device and it must not be loaded automatically.
+ */
+
+static struct pci_driver xen_pcibk_pci_driver = {
+	/* The name should be xen_pciback, but until the tools are updated
+	 * we will keep it as pciback. */
+	.name = PCISTUB_DRIVER_NAME,
+	.id_table = pcistub_ids,
+	.probe = pcistub_probe,
+	.remove = pcistub_remove,
+	.err_handler = &xen_pcibk_error_handler,
+};
+/* Parse hex "[domain:]bus:slot.func"; a '*' slot or func is returned as -1. */
+static inline int str_to_slot(const char *buf, int *domain, int *bus,
+			      int *slot, int *func)
+{
+	int parsed = 0;
+	/* a short sscanf count means '*' may stand for func (or slot+func) */
+	switch (sscanf(buf, " %x:%x:%x.%x %n", domain, bus, slot, func,
+		       &parsed)) {
+	case 3:
+		*func = -1;
+		sscanf(buf, " %x:%x:%x.* %n", domain, bus, slot, &parsed);
+		break;
+	case 2:
+		*slot = *func = -1;
+		sscanf(buf, " %x:%x:*.* %n", domain, bus, &parsed);
+		break;
+	}
+	if (parsed && !buf[parsed]) /* the whole string was consumed */
+		return 0;
+
+	/* try again without domain */
+	*domain = 0;
+	switch (sscanf(buf, " %x:%x.%x %n", bus, slot, func, &parsed)) {
+	case 2:
+		*func = -1;
+		sscanf(buf, " %x:%x.* %n", bus, slot, &parsed);
+		break;
+	case 1:
+		*slot = *func = -1;
+		sscanf(buf, " %x:*.* %n", bus, &parsed);
+		break;
+	}
+	if (parsed && !buf[parsed])
+		return 0;
+
+	return -EINVAL;
+}
+/* Parse hex "[domain:]bus:slot.func-reg:size:mask"; returns 0 or -EINVAL. */
+static inline int str_to_quirk(const char *buf, int *domain, int *bus, int
+			       *slot, int *func, int *reg, int *size, int *mask)
+{
+	int parsed = 0;
+	/* %n is only stored when every field before it has matched */
+	sscanf(buf, " %x:%x:%x.%x-%x:%x:%x %n", domain, bus, slot, func,
+	       reg, size, mask, &parsed);
+	if (parsed && !buf[parsed])
+		return 0;
+
+	/* try again without domain */
+	*domain = 0;
+	sscanf(buf, " %x:%x.%x-%x:%x:%x %n", bus, slot, func, reg, size,
+	       mask, &parsed);
+	if (parsed && !buf[parsed])
+		return 0;
+
+	return -EINVAL;
+}
+/* Add one id to the seize list; slot/func < 0 expands to all slots/funcs. */
+static int pcistub_device_id_add(int domain, int bus, int slot, int func)
+{
+	struct pcistub_device_id *pci_dev_id;
+	int rc = 0, devfn = PCI_DEVFN(slot, func);
+
+	if (slot < 0) {
+		for (slot = 0; !rc && slot < 32; ++slot)
+			rc = pcistub_device_id_add(domain, bus, slot, func);
+		return rc;
+	}
+
+	if (func < 0) {
+		for (func = 0; !rc && func < 8; ++func)
+			rc = pcistub_device_id_add(domain, bus, slot, func);
+		return rc;
+	}
+	/* range checks; slot/func must survive the PCI_DEVFN round trip */
+	if ((
+#if !defined(MODULE) /* pci_domains_supported is not being exported */ \
+    || !defined(CONFIG_PCI_DOMAINS)
+	     !pci_domains_supported ? domain :
+#endif
+	     domain < 0 || domain > 0xffff)
+	    || bus < 0 || bus > 0xff
+	    || PCI_SLOT(devfn) != slot
+	    || PCI_FUNC(devfn) != func)
+		return -EINVAL;
+
+	pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
+	if (!pci_dev_id)
+		return -ENOMEM;
+
+	pr_debug("wants to seize %04x:%02x:%02x.%d\n",
+		 domain, bus, slot, func);
+	/* the list takes ownership of pci_dev_id (or frees a duplicate) */
+	pcistub_device_id_add_list(pci_dev_id, domain, bus, devfn);
+
+	return 0;
+}
+/* Drop all listed ids matching domain:bus and slot/func (< 0 matches any). */
+static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
+{
+	struct pcistub_device_id *pci_dev_id, *t;
+	int err = -ENOENT; /* stays -ENOENT when nothing matched */
+	unsigned long flags;
+
+	spin_lock_irqsave(&device_ids_lock, flags);
+	list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids,
+				 slot_list) {
+		if (pci_dev_id->domain == domain && pci_dev_id->bus == bus
+		    && (slot < 0 || PCI_SLOT(pci_dev_id->devfn) == slot)
+		    && (func < 0 || PCI_FUNC(pci_dev_id->devfn) == func)) {
+			/* Don't break; here because it's possible the same
+			 * slot could be in the list more than once
+			 */
+			list_del(&pci_dev_id->slot_list);
+			kfree(pci_dev_id);
+
+			err = 0;
+
+			pr_debug("removed %04x:%02x:%02x.%d from seize list\n",
+				 domain, bus, slot, func);
+		}
+	}
+	spin_unlock_irqrestore(&device_ids_lock, flags);
+
+	return err;
+}
+/* Add a quirk config field (reg/size/mask) to the device at the given slot. */
+static int pcistub_reg_add(int domain, int bus, int slot, int func,
+			   unsigned int reg, unsigned int size,
+			   unsigned int mask)
+{
+	int err = 0;
+	struct pcistub_device *psdev;
+	struct pci_dev *dev;
+	struct config_field *field;
+	/* for size < 4, @mask must not have bits above @size bytes */
+	if (reg > 0xfff || (size < 4 && (mask >> (size * 8))))
+		return -EINVAL;
+
+	psdev = pcistub_device_find(domain, bus, slot, func);
+	if (!psdev) {
+		err = -ENODEV;
+		goto out;
+	}
+	dev = psdev->dev;
+
+	field = kzalloc(sizeof(*field), GFP_KERNEL);
+	if (!field) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	field->offset = reg;
+	field->size = size;
+	field->mask = mask;
+	field->init = NULL;
+	field->reset = NULL;
+	field->release = NULL;
+	field->clean = xen_pcibk_config_field_free;
+
+	err = xen_pcibk_config_quirks_add_field(dev, field);
+	if (err)
+		kfree(field); /* not added, so still owned here */
+out:
+	if (psdev)
+		pcistub_device_put(psdev);
+	return err;
+}
+/* sysfs new_slot (write-only): add the slot given in @buf to the seize list. */
+static ssize_t new_slot_store(struct device_driver *drv, const char *buf,
+			      size_t count)
+{
+	int domain, bus, slot, func;
+	int err;
+
+	err = str_to_slot(buf, &domain, &bus, &slot, &func);
+	if (err)
+		goto out;
+
+	err = pcistub_device_id_add(domain, bus, slot, func);
+
+out:
+	if (!err)
+		err = count; /* success: report the whole write as consumed */
+	return err;
+}
+static DRIVER_ATTR_WO(new_slot);
+/* sysfs remove_slot (write-only): drop the given slot from the seize list. */
+static ssize_t remove_slot_store(struct device_driver *drv, const char *buf,
+				 size_t count)
+{
+	int domain, bus, slot, func;
+	int err;
+
+	err = str_to_slot(buf, &domain, &bus, &slot, &func);
+	if (err)
+		goto out;
+
+	err = pcistub_device_id_remove(domain, bus, slot, func);
+
+out:
+	if (!err)
+		err = count; /* success: report the whole write as consumed */
+	return err;
+}
+static DRIVER_ATTR_WO(remove_slot);
+/* sysfs slots (read-only): print every id on the seize list, one per line. */
+static ssize_t slots_show(struct device_driver *drv, char *buf)
+{
+	struct pcistub_device_id *pci_dev_id;
+	size_t count = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&device_ids_lock, flags);
+	list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
+		if (count >= PAGE_SIZE)
+			break;
+		/* output is capped at PAGE_SIZE bytes */
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+				   "%04x:%02x:%02x.%d\n",
+				   pci_dev_id->domain, pci_dev_id->bus,
+				   PCI_SLOT(pci_dev_id->devfn),
+				   PCI_FUNC(pci_dev_id->devfn));
+	}
+	spin_unlock_irqrestore(&device_ids_lock, flags);
+
+	return count;
+}
+static DRIVER_ATTR_RO(slots);
+/* sysfs irq_handlers (read-only): per device isr_on, ack_intr and handled. */
+static ssize_t irq_handlers_show(struct device_driver *drv, char *buf)
+{
+	struct pcistub_device *psdev;
+	struct xen_pcibk_dev_data *dev_data;
+	size_t count = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (count >= PAGE_SIZE)
+			break;
+		if (!psdev->dev)
+			continue;
+		dev_data = pci_get_drvdata(psdev->dev);
+		if (!dev_data)
+			continue;
+		count += /* handled is unsigned long, hence %lu */
+		    scnprintf(buf + count, PAGE_SIZE - count,
+			      "%s:%s:%sing:%lu\n",
+			      pci_name(psdev->dev),
+			      dev_data->isr_on ? "on" : "off",
+			      dev_data->ack_intr ? "ack" : "not ack",
+			      dev_data->handled);
+	}
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	return count;
+}
+static DRIVER_ATTR_RO(irq_handlers);
+/* sysfs irq_handler_state (write-only): toggle isr_on for the given slot. */
+static ssize_t irq_handler_state_store(struct device_driver *drv,
+				       const char *buf, size_t count)
+{
+	struct pcistub_device *psdev;
+	struct xen_pcibk_dev_data *dev_data;
+	int domain, bus, slot, func;
+	int err;
+
+	err = str_to_slot(buf, &domain, &bus, &slot, &func);
+	if (err)
+		return err;
+
+	psdev = pcistub_device_find(domain, bus, slot, func);
+	if (!psdev) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	dev_data = pci_get_drvdata(psdev->dev);
+	if (!dev_data) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	dev_dbg(&psdev->dev->dev, "%s fake irq handler: %d->%d\n",
+		dev_data->irq_name, dev_data->isr_on,
+		!dev_data->isr_on);
+	/* switching the handler on also turns ack_intr on */
+	dev_data->isr_on = !(dev_data->isr_on);
+	if (dev_data->isr_on)
+		dev_data->ack_intr = 1;
+out:
+	if (psdev)
+		pcistub_device_put(psdev);
+	if (!err)
+		err = count;
+	return err;
+}
+static DRIVER_ATTR_WO(irq_handler_state);
+/* sysfs quirks (write side): parse a quirk from @buf and add it to its device. */
+static ssize_t quirks_store(struct device_driver *drv, const char *buf,
+			    size_t count)
+{
+	int domain, bus, slot, func, reg, size, mask;
+	int err;
+
+	err = str_to_quirk(buf, &domain, &bus, &slot, &func, &reg, &size,
+			   &mask);
+	if (err)
+		goto out;
+
+	err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask);
+
+out:
+	if (!err)
+		err = count; /* success: report the whole write as consumed */
+	return err;
+}
+/* sysfs quirks (read side): print each quirk's device, ids and config fields. */
+static ssize_t quirks_show(struct device_driver *drv, char *buf)
+{
+	int count = 0;
+	unsigned long flags;
+	struct xen_pcibk_config_quirk *quirk;
+	struct xen_pcibk_dev_data *dev_data;
+	const struct config_field *field;
+	const struct config_field_entry *cfg_entry;
+
+	spin_lock_irqsave(&device_ids_lock, flags);
+	list_for_each_entry(quirk, &xen_pcibk_quirks, quirks_list) {
+		if (count >= PAGE_SIZE)
+			goto out;
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+				   "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n",
+				   quirk->pdev->bus->number,
+				   PCI_SLOT(quirk->pdev->devfn),
+				   PCI_FUNC(quirk->pdev->devfn),
+				   quirk->devid.vendor, quirk->devid.device,
+				   quirk->devid.subvendor,
+				   quirk->devid.subdevice);
+
+		dev_data = pci_get_drvdata(quirk->pdev);
+		if (!dev_data) /* drvdata may be NULL: no fields to walk */
+			continue;
+		list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+			field = cfg_entry->field;
+			if (count >= PAGE_SIZE)
+				goto out;
+
+			count += scnprintf(buf + count, PAGE_SIZE - count,
+					   "\t\t%08x:%01x:%08x\n",
+					   cfg_entry->base_offset +
+					   field->offset, field->size,
+					   field->mask);
+		}
+	}
+
+out:
+	spin_unlock_irqrestore(&device_ids_lock, flags);
+
+	return count;
+}
+static DRIVER_ATTR_RW(quirks);
+/* sysfs permissive (write side): set the permissive flag on the given slot. */
+static ssize_t permissive_store(struct device_driver *drv, const char *buf,
+				size_t count)
+{
+	int domain, bus, slot, func;
+	int err;
+	struct pcistub_device *psdev;
+	struct xen_pcibk_dev_data *dev_data;
+
+	err = str_to_slot(buf, &domain, &bus, &slot, &func);
+	if (err)
+		goto out;
+
+	psdev = pcistub_device_find(domain, bus, slot, func);
+	if (!psdev) {
+		err = -ENODEV;
+		goto out;
+	}
+
+	dev_data = pci_get_drvdata(psdev->dev);
+	/* the driver data for a device should never be null at this point */
+	if (!dev_data) {
+		err = -ENXIO;
+		goto release;
+	}
+	if (!dev_data->permissive) { /* warn only on the first enable */
+		dev_data->permissive = 1;
+		/* Let user know that what they're doing could be unsafe */
+		dev_warn(&psdev->dev->dev, "enabling permissive mode "
+			 "configuration space accesses!\n");
+		dev_warn(&psdev->dev->dev,
+			 "permissive mode is potentially unsafe!\n");
+	}
+release:
+	pcistub_device_put(psdev);
+out:
+	if (!err)
+		err = count;
+	return err;
+}
+/* sysfs permissive (read side): list devices that have permissive set. */
+static ssize_t permissive_show(struct device_driver *drv, char *buf)
+{
+	struct pcistub_device *psdev;
+	struct xen_pcibk_dev_data *dev_data;
+	size_t count = 0;
+	unsigned long flags;
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (count >= PAGE_SIZE) /* output is capped at PAGE_SIZE */
+			break;
+		if (!psdev->dev)
+			continue;
+		dev_data = pci_get_drvdata(psdev->dev);
+		if (!dev_data || !dev_data->permissive)
+			continue;
+		count +=
+		    scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
+			      pci_name(psdev->dev));
+	}
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	return count;
+}
+static DRIVER_ATTR_RW(permissive);
+/* Remove the driver's sysfs attribute files, then unregister the PCI driver. */
+static void pcistub_exit(void)
+{
+	driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_new_slot);
+	driver_remove_file(&xen_pcibk_pci_driver.driver,
+			   &driver_attr_remove_slot);
+	driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_slots);
+	driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_quirks);
+	driver_remove_file(&xen_pcibk_pci_driver.driver,
+			   &driver_attr_permissive);
+	driver_remove_file(&xen_pcibk_pci_driver.driver,
+			   &driver_attr_irq_handlers);
+	driver_remove_file(&xen_pcibk_pci_driver.driver,
+			   &driver_attr_irq_handler_state);
+	pci_unregister_driver(&xen_pcibk_pci_driver);
+}
+/* Parse pci_devs_to_hide into the seize list, then register driver + sysfs. */
+static int __init pcistub_init(void)
+{
+	int pos = 0;
+	int err = 0;
+	int domain, bus, slot, func;
+	int parsed;
+
+	if (pci_devs_to_hide && *pci_devs_to_hide) {
+		do {
+			parsed = 0;
+			/* first try "(domain:bus:slot.func)"; '*' = wildcard */
+			err = sscanf(pci_devs_to_hide + pos,
+				     " (%x:%x:%x.%x) %n",
+				     &domain, &bus, &slot, &func, &parsed);
+			switch (err) {
+			case 3:
+				func = -1;
+				sscanf(pci_devs_to_hide + pos,
+				       " (%x:%x:%x.*) %n",
+				       &domain, &bus, &slot, &parsed);
+				break;
+			case 2:
+				slot = func = -1;
+				sscanf(pci_devs_to_hide + pos,
+				       " (%x:%x:*.*) %n",
+				       &domain, &bus, &parsed);
+				break;
+			}
+			/* then retry without a domain, which defaults to 0 */
+			if (!parsed) {
+				domain = 0;
+				err = sscanf(pci_devs_to_hide + pos,
+					     " (%x:%x.%x) %n",
+					     &bus, &slot, &func, &parsed);
+				switch (err) {
+				case 2:
+					func = -1;
+					sscanf(pci_devs_to_hide + pos,
+					       " (%x:%x.*) %n",
+					       &bus, &slot, &parsed);
+					break;
+				case 1:
+					slot = func = -1;
+					sscanf(pci_devs_to_hide + pos,
+					       " (%x:*.*) %n",
+					       &bus, &parsed);
+					break;
+				}
+			}
+
+			if (parsed <= 0)
+				goto parse_error;
+
+			err = pcistub_device_id_add(domain, bus, slot, func);
+			if (err)
+				goto out;
+
+			pos += parsed; /* advance to the next "(...)" entry */
+		} while (pci_devs_to_hide[pos]);
+	}
+
+	/* If we're the first PCI Device Driver to register, we're the
+	 * first one to get offered PCI devices as they become
+	 * available (and thus we can be the first to grab them)
+	 */
+	err = pci_register_driver(&xen_pcibk_pci_driver);
+	if (err < 0)
+		goto out;
+
+	err = driver_create_file(&xen_pcibk_pci_driver.driver,
+				 &driver_attr_new_slot);
+	if (!err)
+		err = driver_create_file(&xen_pcibk_pci_driver.driver,
+					 &driver_attr_remove_slot);
+	if (!err)
+		err = driver_create_file(&xen_pcibk_pci_driver.driver,
+					 &driver_attr_slots);
+	if (!err)
+		err = driver_create_file(&xen_pcibk_pci_driver.driver,
+					 &driver_attr_quirks);
+	if (!err)
+		err = driver_create_file(&xen_pcibk_pci_driver.driver,
+					 &driver_attr_permissive);
+
+	if (!err)
+		err = driver_create_file(&xen_pcibk_pci_driver.driver,
+					 &driver_attr_irq_handlers);
+	if (!err)
+		err = driver_create_file(&xen_pcibk_pci_driver.driver,
+					&driver_attr_irq_handler_state);
+	if (err) /* removes all attribute files and unregisters the driver */
+		pcistub_exit();
+
+out:
+	return err;
+
+parse_error:
+	pr_err("Error parsing pci_devs_to_hide at \"%s\"\n",
+	       pci_devs_to_hide + pos);
+	return -EINVAL;
+}
+
+#ifndef MODULE
+/*
+ * Built-in only: fs_initcall happens before device_initcall, so
+ * xen_pcibk *should* get called first. We want to be the first
+ * pci device driver to register so that we can suck up any device
+ * before other drivers get a chance. (When built as a module,
+ * xen_pcibk_init() calls pcistub_init() instead.)
+ */
+fs_initcall(pcistub_init);
+#endif
+
+#ifdef CONFIG_PCI_IOV
+static struct pcistub_device *find_vfs(const struct pci_dev *pdev)
+{
+	struct pcistub_device *psdev = NULL;
+	unsigned long flags;
+	bool found = false;
+	/* first stub entry with no pdev set whose pci_physfn() is @pdev */
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (!psdev->pdev && psdev->dev != pdev
+		    && pci_physfn(psdev->dev) == pdev) {
+			found = true;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	if (found) /* psdev is only meaningful when the loop hit break */
+		return psdev;
+	return NULL;
+}
+/* On unbind of a physical function, release the driver from its stub VFs. */
+static int pci_stub_notifier(struct notifier_block *nb,
+			     unsigned long action, void *data)
+{
+	struct device *dev = data;
+	const struct pci_dev *pdev = to_pci_dev(dev);
+
+	if (action != BUS_NOTIFY_UNBIND_DRIVER)
+		return NOTIFY_DONE;
+
+	if (!pdev->is_physfn)
+		return NOTIFY_DONE;
+	/* repeat until find_vfs() no longer returns a matching VF */
+	for (;;) {
+		struct pcistub_device *psdev = find_vfs(pdev);
+		if (!psdev)
+			break;
+		device_release_driver(&psdev->dev->dev);
+	}
+	return NOTIFY_DONE;
+}
+/* Registered on pci_bus_type by xen_pcibk_init() when xenbus setup succeeds. */
+static struct notifier_block pci_stub_nb = {
+	.notifier_call = pci_stub_notifier,
+};
+#endif
+/* Module init: needs xen_initial_domain(); sets up config, stubs and xenbus. */
+static int __init xen_pcibk_init(void)
+{
+	int err;
+
+	if (!xen_initial_domain())
+		return -ENODEV;
+
+	err = xen_pcibk_config_init();
+	if (err)
+		return err;
+
+#ifdef MODULE
+	err = pcistub_init();
+	if (err < 0)
+		return err;
+#endif
+	/* built-in kernels ran pcistub_init() earlier, from fs_initcall */
+	pcistub_init_devices_late();
+	err = xen_pcibk_xenbus_register();
+	if (err)
+		pcistub_exit();
+#ifdef CONFIG_PCI_IOV
+	else
+		bus_register_notifier(&pci_bus_type, &pci_stub_nb);
+#endif
+
+	return err;
+}
+/* Module exit: drop the bus notifier, the xenbus backend and the pci stub. */
+static void __exit xen_pcibk_cleanup(void)
+{
+#ifdef CONFIG_PCI_IOV
+	bus_unregister_notifier(&pci_bus_type, &pci_stub_nb);
+#endif
+	xen_pcibk_xenbus_unregister();
+	pcistub_exit();
+}
+
+module_init(xen_pcibk_init);
+module_exit(xen_pcibk_cleanup);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("xen-backend:pci");
diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h
new file mode 100644
index 000000000..235cdfe13
--- /dev/null
+++ b/drivers/xen/xen-pciback/pciback.h
@@ -0,0 +1,202 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PCI Backend Common Data Structures & Function Declarations
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#ifndef __XEN_PCIBACK_H__
+#define __XEN_PCIBACK_H__
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <xen/xenbus.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/atomic.h>
+#include <xen/events.h>
+#include <xen/interface/io/pciif.h>
+
+#define DRV_NAME	"xen-pciback"
+
+/* List node that carries one pci_dev pointer. */
+struct pci_dev_entry {
+	struct list_head list;	/* linkage into the owning list */
+	struct pci_dev *dev;	/* device referenced by this node */
+};
+
+/*
+ * Bits of xen_pcibk_device.flags. The names with a leading underscore are
+ * bit numbers (as passed to test_bit() and friends); the names without are
+ * the corresponding masks.
+ */
+#define _PDEVF_op_active	(0)
+#define PDEVF_op_active		(1<<(_PDEVF_op_active))
+#define _PCIB_op_pending	(1)
+#define PCIB_op_pending		(1<<(_PCIB_op_pending))
+#define _EOI_pending		(2)
+#define EOI_pending		(1<<(_EOI_pending))
+
+/* State of one PCI backend instance, tied to one xenbus device. */
+struct xen_pcibk_device {
+	/* Private data owned by the active xen_pcibk_backend. */
+	void *pci_dev_data;
+	struct mutex dev_lock;
+	/* The xenbus device this instance was allocated for. */
+	struct xenbus_device *xdev;
+	struct xenbus_watch be_watch;
+	u8 be_watching;
+	/* IRQ handed to notify_remote_via_irq() and xen_irq_lateeoi(). */
+	int evtchn_irq;
+	/* Shared area holding the frontend-visible flags and request. */
+	struct xen_pci_sharedinfo *sh_info;
+	/* Bit field using the _PDEVF_ / _PCIB_ / _EOI_ bit numbers. */
+	unsigned long flags;
+	/* Work item whose handler is xen_pcibk_do_op(). */
+	struct work_struct op_work;
+	/* Private copy of sh_info->op used while a request is processed. */
+	struct xen_pci_op op;
+};
+
+/* Per-PCI-device state, retrieved through pci_get_drvdata(). */
+struct xen_pcibk_dev_data {
+	struct list_head config_fields;
+	struct pci_saved_state *pci_saved_state;
+	unsigned int permissive:1;
+	unsigned int warned_on_write:1;
+	unsigned int enable_intx:1; /* Fake IRQ handler is wanted. */
+	unsigned int isr_on:1; /* Whether the IRQ handler is installed. */
+	unsigned int ack_intr:1; /* .. and ACK-ing */
+	unsigned long handled; /* Count of interrupts reported as handled. */
+	unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */
+	char irq_name[0]; /* xen-pcibk[000:04:00.0] */
+};
+
+/* Used by XenBus and xen_pcibk_ops.c */
+extern wait_queue_head_t xen_pcibk_aer_wait_queue;
+/* Used by pcistub.c and conf_space_quirks.c */
+extern struct list_head xen_pcibk_quirks;
+
+/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev,
+					    int domain, int bus,
+					    int slot, int func);
+struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev,
+				    struct pci_dev *dev);
+void pcistub_put_pci_dev(struct pci_dev *dev);
+
+/* Ensure a device is turned off or reset */
+void xen_pcibk_reset_device(struct pci_dev *pdev);
+
+/* Access a virtual configuration space for a PCI device */
+int xen_pcibk_config_init(void);
+int xen_pcibk_config_init_dev(struct pci_dev *dev);
+void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev);
+void xen_pcibk_config_reset_dev(struct pci_dev *dev);
+void xen_pcibk_config_free_dev(struct pci_dev *dev);
+int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size,
+			  u32 *ret_val);
+int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size,
+			   u32 value);
+
+/* Handle requests for specific devices from the frontend */
+typedef int (*publish_pci_dev_cb) (struct xen_pcibk_device *pdev,
+				   unsigned int domain, unsigned int bus,
+				   unsigned int devfn, unsigned int devid);
+typedef int (*publish_pci_root_cb) (struct xen_pcibk_device *pdev,
+				    unsigned int domain, unsigned int bus);
+
+/* Backend registration for the two types of BDF representation:
+ *  vpci - BDFs start at 00
+ *  passthrough - BDFs are exactly like in the host.
+ */
+/*
+ * Hook table of one backend flavour. Each hook is reached through the
+ * static inline wrapper named in its comment; the wrappers tolerate a NULL
+ * hook.
+ */
+struct xen_pcibk_backend {
+	const char *name;
+	/* via xen_pcibk_init_devices() */
+	int (*init)(struct xen_pcibk_device *pdev);
+	/* via xen_pcibk_release_devices() */
+	void (*free)(struct xen_pcibk_device *pdev);
+	/* via xen_pcibk_get_pcifront_dev() */
+	int (*find)(struct pci_dev *pcidev, struct xen_pcibk_device *pdev,
+		    unsigned int *domain, unsigned int *bus,
+		    unsigned int *devfn);
+	/* via xen_pcibk_publish_pci_roots() */
+	int (*publish)(struct xen_pcibk_device *pdev, publish_pci_root_cb cb);
+	/* via xen_pcibk_release_pci_dev() */
+	void (*release)(struct xen_pcibk_device *pdev, struct pci_dev *dev,
+                        bool lock);
+	/* via xen_pcibk_add_pci_dev() */
+	int (*add)(struct xen_pcibk_device *pdev, struct pci_dev *dev,
+		   int devid, publish_pci_dev_cb publish_cb);
+	/* via xen_pcibk_get_pci_dev() */
+	struct pci_dev *(*get)(struct xen_pcibk_device *pdev,
+			       unsigned int domain, unsigned int bus,
+			       unsigned int devfn);
+};
+
+extern const struct xen_pcibk_backend xen_pcibk_vpci_backend;
+extern const struct xen_pcibk_backend xen_pcibk_passthrough_backend;
+extern const struct xen_pcibk_backend *xen_pcibk_backend;
+
+/*
+ * Forward to the active backend's ->add hook.
+ * Returns the hook's result, or -1 when no backend or no hook is set.
+ */
+static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
+					struct pci_dev *dev,
+					int devid,
+					publish_pci_dev_cb publish_cb)
+{
+	if (!xen_pcibk_backend || !xen_pcibk_backend->add)
+		return -1;
+
+	return xen_pcibk_backend->add(pdev, dev, devid, publish_cb);
+}
+
+/*
+ * Forward to the active backend's ->release hook; does nothing when no
+ * backend or no hook is set. @lock is passed through to the hook unchanged.
+ */
+static inline void xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
+					     struct pci_dev *dev, bool lock)
+{
+	/* A void function must not "return" an expression in ISO C. */
+	if (xen_pcibk_backend && xen_pcibk_backend->release)
+		xen_pcibk_backend->release(pdev, dev, lock);
+}
+
+/*
+ * Forward to the active backend's ->get hook.
+ * Returns the hook's result, or NULL when no backend or no hook is set.
+ */
+static inline struct pci_dev *
+xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, unsigned int domain,
+		      unsigned int bus, unsigned int devfn)
+{
+	if (!xen_pcibk_backend || !xen_pcibk_backend->get)
+		return NULL;
+
+	return xen_pcibk_backend->get(pdev, domain, bus, devfn);
+}
+
+/**
+* Add for domain0 PCIE-AER handling. Get guest domain/bus/devfn in xen_pcibk
+* before sending aer request to pcifront, so that guest could identify
+* device, cooperate with xen_pcibk to finish aer recovery job if device driver
+* has the capability
+*/
+static inline int xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
+					     struct xen_pcibk_device *pdev,
+					     unsigned int *domain,
+					     unsigned int *bus,
+					     unsigned int *devfn)
+{
+	/* Without a backend or a ->find hook there is nothing to ask. */
+	if (!xen_pcibk_backend || !xen_pcibk_backend->find)
+		return -1;
+
+	return xen_pcibk_backend->find(pcidev, pdev, domain, bus, devfn);
+}
+
+/*
+ * Forward to the active backend's ->init hook.
+ * Returns the hook's result, or -1 when no backend or no hook is set.
+ */
+static inline int xen_pcibk_init_devices(struct xen_pcibk_device *pdev)
+{
+	if (!xen_pcibk_backend || !xen_pcibk_backend->init)
+		return -1;
+
+	return xen_pcibk_backend->init(pdev);
+}
+
+/*
+ * Forward to the active backend's ->publish hook with callback @cb.
+ * Returns the hook's result, or -1 when no backend or no hook is set.
+ */
+static inline int xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev,
+					      publish_pci_root_cb cb)
+{
+	if (!xen_pcibk_backend || !xen_pcibk_backend->publish)
+		return -1;
+
+	return xen_pcibk_backend->publish(pdev, cb);
+}
+
+/*
+ * Forward to the active backend's ->free hook; does nothing when no backend
+ * or no hook is set.
+ */
+static inline void xen_pcibk_release_devices(struct xen_pcibk_device *pdev)
+{
+	/* A void function must not "return" an expression in ISO C. */
+	if (xen_pcibk_backend && xen_pcibk_backend->free)
+		xen_pcibk_backend->free(pdev);
+}
+
+/* Handles events from front-end */
+irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id);
+void xen_pcibk_do_op(struct work_struct *data);
+
+/*
+ * Signal a late EOI on pdev->evtchn_irq with @eoi_flag, but only when the
+ * _EOI_pending bit was set; the bit is cleared by the same atomic test.
+ */
+static inline void xen_pcibk_lateeoi(struct xen_pcibk_device *pdev,
+				     unsigned int eoi_flag)
+{
+	if (!test_and_clear_bit(_EOI_pending, &pdev->flags))
+		return;
+
+	xen_irq_lateeoi(pdev->evtchn_irq, eoi_flag);
+}
+
+int xen_pcibk_xenbus_register(void);
+void xen_pcibk_xenbus_unregister(void);
+
+extern int verbose_request;
+
+/* Handles shared IRQs that can go to the device domain and control domain. */
+void xen_pcibk_irq_handler(struct pci_dev *dev, int reset);
+#endif /* __XEN_PCIBACK_H__ */
diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c
new file mode 100644
index 000000000..c4ed2c634
--- /dev/null
+++ b/drivers/xen/xen-pciback/pciback_ops.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI Backend Operations - respond to PCI requests from Frontend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/moduleparam.h>
+#include <linux/wait.h>
+#include <linux/bitops.h>
+#include <xen/events.h>
+#include <linux/sched.h>
+#include "pciback.h"
+
+int verbose_request;
+module_param(verbose_request, int, 0644);
+
+static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id);
+
+/* Ensure a device has the fake IRQ handler "turned on/off" and is
+ * ready to be exported. This MUST be run after xen_pcibk_reset_device
+ * which does the actual PCI device enable/disable.
+ */
+static void xen_pcibk_control_isr(struct pci_dev *dev, int reset)
+{
+	struct xen_pcibk_dev_data *dev_data;
+	int rc;
+	int enable = 0;
+
+	dev_data = pci_get_drvdata(dev);
+	if (!dev_data)
+		return;
+
+	/* We don't deal with bridges */
+	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+		return;
+
+	/* A reset forces the "disable" path below. */
+	if (reset) {
+		dev_data->enable_intx = 0;
+		dev_data->ack_intr = 0;
+	}
+	enable =  dev_data->enable_intx;
+
+	/* Asked to disable, but ISR isn't running */
+	if (!enable && !dev_data->isr_on)
+		return;
+
+	/* Squirrel away the IRQs in the dev_data. We need this
+	 * b/c when device transitions to MSI, the dev->irq is
+	 * overwritten with the MSI vector.
+	 */
+	if (enable)
+		dev_data->irq = dev->irq;
+
+	/*
+	 * SR-IOV devices in all use MSI-X and have no legacy
+	 * interrupts, so inhibit creating a fake IRQ handler for them.
+	 */
+	if (dev_data->irq == 0)
+		goto out;
+
+	/* Log the transition: current handler state -> requested state. */
+	dev_dbg(&dev->dev, "%s: #%d %s %s%s %s-> %s\n",
+		dev_data->irq_name,
+		dev_data->irq,
+		pci_is_enabled(dev) ? "on" : "off",
+		dev->msi_enabled ? "MSI" : "",
+		dev->msix_enabled ? "MSI/X" : "",
+		dev_data->isr_on ? "enable" : "disable",
+		enable ? "enable" : "disable");
+
+	if (enable) {
+		/*
+		 * The MSI or MSI-X should not have an IRQ handler. Otherwise
+		 * if the guest terminates we BUG_ON in free_msi_irqs.
+		 */
+		if (dev->msi_enabled || dev->msix_enabled)
+			goto out;
+
+		rc = request_irq(dev_data->irq,
+				xen_pcibk_guest_interrupt, IRQF_SHARED,
+				dev_data->irq_name, dev);
+		if (rc) {
+			dev_err(&dev->dev, "%s: failed to install fake IRQ " \
+				"handler for IRQ %d! (rc:%d)\n",
+				dev_data->irq_name, dev_data->irq, rc);
+			goto out;
+		}
+	} else {
+		free_irq(dev_data->irq, dev);
+		dev_data->irq = 0;
+	}
+	/* Only reached when the handler was actually installed or removed. */
+	dev_data->isr_on = enable;
+	dev_data->ack_intr = enable;
+out:
+	/* Log the outcome, comparing the request with the resulting isr_on. */
+	dev_dbg(&dev->dev, "%s: #%d %s %s%s %s\n",
+		dev_data->irq_name,
+		dev_data->irq,
+		pci_is_enabled(dev) ? "on" : "off",
+		dev->msi_enabled ? "MSI" : "",
+		dev->msix_enabled ? "MSI/X" : "",
+		enable ? (dev_data->isr_on ? "enabled" : "failed to enable") :
+			(dev_data->isr_on ? "failed to disable" : "disabled"));
+}
+
+/* Ensure a device is "turned off" and ready to be exported.
+ * (Also see xen_pcibk_config_reset to ensure virtual configuration space is
+ * ready to be re-exported)
+ */
+void xen_pcibk_reset_device(struct pci_dev *dev)
+{
+	u16 cmd;
+
+	/* Drop the fake IRQ handler state first. */
+	xen_pcibk_control_isr(dev, 1 /* reset device */);
+
+	/* Disable devices (but not bridges) */
+	if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
+#ifdef CONFIG_PCI_MSI
+		/* The guest could have been abruptly killed without
+		 * disabling MSI/MSI-X interrupts.*/
+		if (dev->msix_enabled)
+			pci_disable_msix(dev);
+		if (dev->msi_enabled)
+			pci_disable_msi(dev);
+#endif
+		if (pci_is_enabled(dev))
+			pci_disable_device(dev);
+
+		dev->is_busmaster = 0;
+	} else {
+		/*
+		 * Other header types: only clear PCI_COMMAND_INVALIDATE in
+		 * the command register, and only if it is currently set.
+		 */
+		pci_read_config_word(dev, PCI_COMMAND, &cmd);
+		if (cmd & (PCI_COMMAND_INVALIDATE)) {
+			cmd &= ~(PCI_COMMAND_INVALIDATE);
+			pci_write_config_word(dev, PCI_COMMAND, cmd);
+
+			dev->is_busmaster = 0;
+		}
+	}
+}
+
+#ifdef CONFIG_PCI_MSI
+/*
+ * Handle a frontend request to enable MSI on @dev.
+ *
+ * On failure op->value is zeroed and XEN_PCI_ERR_op_failed is returned. On
+ * success op->value receives the pirq for dev->irq (0 when dev->irq is 0),
+ * ack_intr is cleared in the device data if present, and 0 is returned.
+ */
+static
+int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev,
+			 struct pci_dev *dev, struct xen_pci_op *op)
+{
+	struct xen_pcibk_dev_data *data;
+	int rc;
+
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG DRV_NAME ": %s: enable MSI\n", pci_name(dev));
+
+	/* Refuse when MSI is already on or MSI-X is in use. */
+	rc = -EALREADY;
+	if (!dev->msi_enabled)
+		rc = dev->msix_enabled ? -ENXIO : pci_enable_msi(dev);
+
+	if (rc != 0) {
+		pr_warn_ratelimited("%s: error enabling MSI for guest %u: err %d\n",
+				    pci_name(dev), pdev->xdev->otherend_id,
+				    rc);
+		op->value = 0;
+		return XEN_PCI_ERR_op_failed;
+	}
+
+	/* The value the guest needs is actually the IDT vector, not the
+	 * the local domain's IRQ number. */
+	if (dev->irq)
+		op->value = xen_pirq_from_irq(dev->irq);
+	else
+		op->value = 0;
+
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev),
+			op->value);
+
+	data = pci_get_drvdata(dev);
+	if (data)
+		data->ack_intr = 0;
+
+	return 0;
+}
+
+/*
+ * Handle a frontend request to disable MSI on @dev.
+ *
+ * If MSI is enabled it is turned off and ack_intr is set in the device data
+ * (when present). op->value receives the pirq for dev->irq, or 0 when
+ * dev->irq is 0. Always returns 0.
+ */
+static
+int xen_pcibk_disable_msi(struct xen_pcibk_device *pdev,
+			  struct pci_dev *dev, struct xen_pci_op *op)
+{
+	struct xen_pcibk_dev_data *data;
+
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG DRV_NAME ": %s: disable MSI\n",
+		       pci_name(dev));
+
+	if (dev->msi_enabled) {
+		pci_disable_msi(dev);
+
+		data = pci_get_drvdata(dev);
+		if (data)
+			data->ack_intr = 1;
+	}
+
+	if (dev->irq)
+		op->value = xen_pirq_from_irq(dev->irq);
+	else
+		op->value = 0;
+
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev),
+			op->value);
+	return 0;
+}
+
+/*
+ * Handle a frontend request to enable MSI-X on @dev.
+ *
+ * On entry op->value is the number of entries in op->msix_entries. The
+ * function returns -EINVAL, -EALREADY, -ENXIO or -ENOMEM before touching the
+ * device when the request is rejected. Otherwise op->value is overwritten
+ * with the result of pci_enable_msix_exact() and, on success, each non-zero
+ * vector in op->msix_entries is replaced by its pirq.
+ */
+static
+int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev,
+			  struct pci_dev *dev, struct xen_pci_op *op)
+{
+	struct xen_pcibk_dev_data *dev_data;
+	int i, result;
+	struct msix_entry *entries;
+	u16 cmd;
+
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG DRV_NAME ": %s: enable MSI-X\n",
+		       pci_name(dev));
+
+	/* Bound the entry count before it is used as an array length. */
+	if (op->value > SH_INFO_MAX_VEC)
+		return -EINVAL;
+
+	if (dev->msix_enabled)
+		return -EALREADY;
+
+	/*
+	 * PCI_COMMAND_MEMORY must be enabled, otherwise we may not be able
+	 * to access the BARs where the MSI-X entries reside.
+	 * But VF devices are unique in which the PF needs to be checked.
+	 */
+	pci_read_config_word(pci_physfn(dev), PCI_COMMAND, &cmd);
+	if (dev->msi_enabled || !(cmd & PCI_COMMAND_MEMORY))
+		return -ENXIO;
+
+	entries = kmalloc_array(op->value, sizeof(*entries), GFP_KERNEL);
+	if (entries == NULL)
+		return -ENOMEM;
+
+	/* Copy the requested entries out of the op. */
+	for (i = 0; i < op->value; i++) {
+		entries[i].entry = op->msix_entries[i].entry;
+		entries[i].vector = op->msix_entries[i].vector;
+	}
+
+	result = pci_enable_msix_exact(dev, entries, op->value);
+	if (result == 0) {
+		/* Report the granted entries back, vectors as pirqs. */
+		for (i = 0; i < op->value; i++) {
+			op->msix_entries[i].entry = entries[i].entry;
+			if (entries[i].vector) {
+				op->msix_entries[i].vector =
+					xen_pirq_from_irq(entries[i].vector);
+				if (unlikely(verbose_request))
+					printk(KERN_DEBUG DRV_NAME ": %s: " \
+						"MSI-X[%d]: %d\n",
+						pci_name(dev), i,
+						op->msix_entries[i].vector);
+			}
+		}
+	} else
+		pr_warn_ratelimited("%s: error enabling MSI-X for guest %u: err %d!\n",
+				    pci_name(dev), pdev->xdev->otherend_id,
+				    result);
+	kfree(entries);
+
+	/* From here on op->value no longer holds the entry count. */
+	op->value = result;
+	/* ack_intr is cleared whether or not enabling succeeded. */
+	dev_data = pci_get_drvdata(dev);
+	if (dev_data)
+		dev_data->ack_intr = 0;
+
+	/* A positive result is reported to the caller as 0. */
+	return result > 0 ? 0 : result;
+}
+
+/*
+ * Handle a frontend request to disable MSI-X on @dev.
+ *
+ * If MSI-X is enabled it is turned off and ack_intr is set in the device
+ * data (when present). op->value receives the pirq for dev->irq, or 0 when
+ * dev->irq is 0. Always returns 0.
+ */
+static
+int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev,
+			   struct pci_dev *dev, struct xen_pci_op *op)
+{
+	struct xen_pcibk_dev_data *data;
+
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG DRV_NAME ": %s: disable MSI-X\n",
+			pci_name(dev));
+
+	if (dev->msix_enabled) {
+		pci_disable_msix(dev);
+
+		data = pci_get_drvdata(dev);
+		if (data)
+			data->ack_intr = 1;
+	}
+
+	/*
+	 * SR-IOV devices (which don't have any legacy IRQ) have
+	 * an undefined IRQ value of zero.
+	 */
+	if (dev->irq)
+		op->value = xen_pirq_from_irq(dev->irq);
+	else
+		op->value = 0;
+
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG DRV_NAME ": %s: MSI-X: %d\n",
+		       pci_name(dev), op->value);
+	return 0;
+}
+#endif
+
+/*
+ * Returns true when _XEN_PCIF_active is set in the shared flags AND this
+ * call is the one that set _PDEVF_op_active in pdev->flags (the bit was
+ * clear before). The second test has a side effect and is only evaluated
+ * when the first one is true.
+ */
+static inline bool xen_pcibk_test_op_pending(struct xen_pcibk_device *pdev)
+{
+	return test_bit(_XEN_PCIF_active,
+			(unsigned long *)&pdev->sh_info->flags) &&
+	       !test_and_set_bit(_PDEVF_op_active, &pdev->flags);
+}
+
+/*
+* Now the same evtchn is used for both pcifront conf_read_write request
+* as well as pcie aer front end ack. We use a new work_queue to schedule
+* xen_pcibk conf_read_write service for avoiding conflict with aer_core
+* do_recovery job which also use the system default work_queue
+*/
+static void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev)
+{
+	/* Stays true only if neither branch below found work to do. */
+	bool eoi = true;
+
+	/* Check that frontend is requesting an operation and that we are not
+	 * already processing a request */
+	if (xen_pcibk_test_op_pending(pdev)) {
+		schedule_work(&pdev->op_work);
+		eoi = false;
+	}
+	/* _XEN_PCIB_active should have been cleared by pcifront. Also make
+	 * sure xen_pcibk is waiting for an ack by checking _PCIB_op_pending. */
+	if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
+	    && test_bit(_PCIB_op_pending, &pdev->flags)) {
+		wake_up(&xen_pcibk_aer_wait_queue);
+		eoi = false;
+	}
+
+	/* EOI if there was nothing to do. */
+	if (eoi)
+		xen_pcibk_lateeoi(pdev, XEN_EOI_FLAG_SPURIOUS);
+}
+
+/* Performing the configuration space reads/writes must not be done in atomic
+ * context because some of the pci_* functions can sleep (mostly due to ACPI
+ * use of semaphores). This function is intended to be called from a work
+ * queue in process context taking a struct xen_pcibk_device as a parameter */
+
+static void xen_pcibk_do_one_op(struct xen_pcibk_device *pdev)
+{
+	struct pci_dev *dev;
+	struct xen_pcibk_dev_data *dev_data = NULL;
+	struct xen_pci_op *op = &pdev->op;
+	int test_intx = 0;
+#ifdef CONFIG_PCI_MSI
+	unsigned int nr = 0;
+#endif
+
+	/*
+	 * Take a private copy of the request; everything below reads the
+	 * copy, not the shared page.
+	 * NOTE(review): presumably so the frontend cannot alter a request
+	 * while it is being processed - confirm.
+	 */
+	*op = pdev->sh_info->op;
+	barrier();
+	dev = xen_pcibk_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
+
+	if (dev == NULL)
+		op->err = XEN_PCI_ERR_dev_not_found;
+	else {
+		dev_data = pci_get_drvdata(dev);
+		/* Remember enable_intx so a change can be detected below. */
+		if (dev_data)
+			test_intx = dev_data->enable_intx;
+		switch (op->cmd) {
+		case XEN_PCI_OP_conf_read:
+			op->err = xen_pcibk_config_read(dev,
+				  op->offset, op->size, &op->value);
+			break;
+		case XEN_PCI_OP_conf_write:
+			op->err = xen_pcibk_config_write(dev,
+				  op->offset, op->size,	op->value);
+			break;
+#ifdef CONFIG_PCI_MSI
+		case XEN_PCI_OP_enable_msi:
+			op->err = xen_pcibk_enable_msi(pdev, dev, op);
+			break;
+		case XEN_PCI_OP_disable_msi:
+			op->err = xen_pcibk_disable_msi(pdev, dev, op);
+			break;
+		case XEN_PCI_OP_enable_msix:
+			/* Save the count: the handler overwrites op->value. */
+			nr = op->value;
+			op->err = xen_pcibk_enable_msix(pdev, dev, op);
+			break;
+		case XEN_PCI_OP_disable_msix:
+			op->err = xen_pcibk_disable_msix(pdev, dev, op);
+			break;
+#endif
+		default:
+			op->err = XEN_PCI_ERR_not_implemented;
+			break;
+		}
+	}
+	if (!op->err && dev && dev_data) {
+		/* Transition detected */
+		if ((dev_data->enable_intx != test_intx))
+			xen_pcibk_control_isr(dev, 0 /* no reset */);
+	}
+	/* Publish the result fields back into the shared page. */
+	pdev->sh_info->op.err = op->err;
+	pdev->sh_info->op.value = op->value;
+#ifdef CONFIG_PCI_MSI
+	if (op->cmd == XEN_PCI_OP_enable_msix && op->err == 0) {
+		unsigned int i;
+
+		/* Only the vectors of the nr requested entries are copied. */
+		for (i = 0; i < nr; i++)
+			pdev->sh_info->op.msix_entries[i].vector =
+				op->msix_entries[i].vector;
+	}
+#endif
+	/* Tell the driver domain that we're done. */
+	wmb();
+	clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
+	notify_remote_via_irq(pdev->evtchn_irq);
+
+	/* Mark that we're done. */
+	smp_mb__before_atomic(); /* /after/ clearing PCIF_active */
+	clear_bit(_PDEVF_op_active, &pdev->flags);
+	smp_mb__after_atomic(); /* /before/ final check for work */
+}
+
+/*
+ * Work handler for xen_pcibk_device.op_work.
+ *
+ * Processes one request, then keeps going for as long as
+ * xen_pcibk_test_op_pending() reports another one, and finally issues the
+ * late EOI with flag 0.
+ */
+void xen_pcibk_do_op(struct work_struct *data)
+{
+	struct xen_pcibk_device *pdev;
+
+	pdev = container_of(data, struct xen_pcibk_device, op_work);
+
+	for (;;) {
+		xen_pcibk_do_one_op(pdev);
+		if (!xen_pcibk_test_op_pending(pdev))
+			break;
+	}
+
+	xen_pcibk_lateeoi(pdev, 0);
+}
+
+/*
+ * Interrupt handler for frontend notifications; @dev_id is the
+ * xen_pcibk_device. Marks an EOI as pending, then lets
+ * xen_pcibk_test_and_schedule_op() decide what to do. Always returns
+ * IRQ_HANDLED.
+ */
+irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id)
+{
+	struct xen_pcibk_device *pdev = dev_id;
+	bool eoi;
+
+	/* IRQs might come in before pdev->evtchn_irq is written. */
+	if (unlikely(pdev->evtchn_irq != irq))
+		pdev->evtchn_irq = irq;
+
+	/* The bit being set already means the previous EOI is still owed. */
+	eoi = test_and_set_bit(_EOI_pending, &pdev->flags);
+	WARN(eoi, "IRQ while EOI pending\n");
+
+	xen_pcibk_test_and_schedule_op(pdev);
+
+	return IRQ_HANDLED;
+}
+/*
+ * Fake handler installed by xen_pcibk_control_isr(); @dev_id is the pci_dev.
+ *
+ * Returns IRQ_NONE unless both isr_on and ack_intr are set. Otherwise the
+ * interrupt is counted and reported as IRQ_HANDLED. On every 1000th counted
+ * interrupt xen_test_irq_shared() is consulted; a non-zero result logs the
+ * message below and clears ack_intr.
+ * NOTE(review): the log text implies a non-zero result means the line is
+ * NOT shared - confirm against xen_test_irq_shared().
+ */
+static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id)
+{
+	struct pci_dev *pci = dev_id;
+	struct xen_pcibk_dev_data *data = pci_get_drvdata(pci);
+
+	if (!data->isr_on || !data->ack_intr)
+		return IRQ_NONE;
+
+	data->handled++;
+	if ((data->handled % 1000) == 0 && xen_test_irq_shared(irq)) {
+		pr_info("%s IRQ line is not shared "
+			"with other domains. Turning ISR off\n",
+			 data->irq_name);
+		data->ack_intr = 0;
+	}
+
+	return IRQ_HANDLED;
+}
diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c
new file mode 100644
index 000000000..30313084f
--- /dev/null
+++ b/drivers/xen/xen-pciback/vpci.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
+ *               to the frontend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/mutex.h>
+#include "pciback.h"
+
+/* Number of slots on the virtual bus, and so of per-slot device lists. */
+#define PCI_SLOT_MAX 32
+
+/* Backend-private data stored in xen_pcibk_device.pci_dev_data. */
+struct vpci_dev_data {
+	/* Access to dev_list must be protected by lock */
+	struct list_head dev_list[PCI_SLOT_MAX];
+	struct mutex lock;
+};
+
+/* Return head->next; for an empty list that is @head itself. */
+static inline struct list_head *list_first(struct list_head *head)
+{
+	return head->next;
+}
+
+/*
+ * Look up the device exported at virtual @domain:@bus:@devfn.
+ *
+ * Only domain 0, bus 0 exists on the virtual bus. Within the slot's list the
+ * first entry whose real function number equals PCI_FUNC(@devfn) is
+ * returned; NULL when nothing matches.
+ */
+static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev,
+					       unsigned int domain,
+					       unsigned int bus,
+					       unsigned int devfn)
+{
+	struct vpci_dev_data *vpci = pdev->pci_dev_data;
+	unsigned int slot = PCI_SLOT(devfn);
+	struct pci_dev_entry *e;
+	struct pci_dev *match = NULL;
+
+	if (domain != 0 || bus != 0)
+		return NULL;
+	if (slot >= PCI_SLOT_MAX)
+		return NULL;
+
+	mutex_lock(&vpci->lock);
+	list_for_each_entry(e, &vpci->dev_list[slot], list) {
+		if (PCI_FUNC(e->dev->devfn) != PCI_FUNC(devfn))
+			continue;
+
+		match = e->dev;
+		break;
+	}
+	mutex_unlock(&vpci->lock);
+
+	return match;
+}
+
+/*
+ * Returns 1 when @l and @r share PCI domain, bus and slot number (the
+ * function number is ignored), 0 otherwise.
+ */
+static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
+{
+	return pci_domain_nr(l->bus) == pci_domain_nr(r->bus) &&
+	       l->bus == r->bus &&
+	       PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn);
+}
+
+/*
+ * Place @dev on the virtual bus and publish it through @publish_cb.
+ *
+ * Returns -EFAULT for bridges, -ENOMEM when the entry cannot be allocated or
+ * no slot is free, otherwise the result of @publish_cb. The error cases
+ * raised here are also reported with xenbus_dev_fatal().
+ */
+static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
+				   struct pci_dev *dev, int devid,
+				   publish_pci_dev_cb publish_cb)
+{
+	int err = 0, slot, func = PCI_FUNC(dev->devfn);
+	struct pci_dev_entry *t, *dev_entry;
+	struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+
+	if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
+		err = -EFAULT;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Can't export bridges on the virtual PCI bus");
+		goto out;
+	}
+
+	dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
+	if (!dev_entry) {
+		err = -ENOMEM;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error adding entry to virtual PCI bus");
+		goto out;
+	}
+
+	dev_entry->dev = dev;
+
+	mutex_lock(&vpci_dev->lock);
+
+	/*
+	 * Keep multi-function devices together on the virtual PCI bus, except
+	 * that we want to keep virtual functions at func 0 on their own. They
+	 * aren't multi-function devices and hence their presence at func 0
+	 * may cause guests to not scan the other functions.
+	 */
+	if (!dev->is_virtfn || func) {
+		for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+			if (list_empty(&vpci_dev->dev_list[slot]))
+				continue;
+
+			/* Only the first entry of a slot is compared. */
+			t = list_entry(list_first(&vpci_dev->dev_list[slot]),
+				       struct pci_dev_entry, list);
+			if (t->dev->is_virtfn && !PCI_FUNC(t->dev->devfn))
+				continue;
+
+			if (match_slot(dev, t->dev)) {
+				pr_info("vpci: %s: assign to virtual slot %d func %d\n",
+					pci_name(dev), slot,
+					func);
+				list_add_tail(&dev_entry->list,
+					      &vpci_dev->dev_list[slot]);
+				goto unlock;
+			}
+		}
+	}
+
+	/* Assign to a new slot on the virtual PCI bus */
+	for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+		if (list_empty(&vpci_dev->dev_list[slot])) {
+			pr_info("vpci: %s: assign to virtual slot %d\n",
+				pci_name(dev), slot);
+			list_add_tail(&dev_entry->list,
+				      &vpci_dev->dev_list[slot]);
+			goto unlock;
+		}
+	}
+
+	/* Neither loop found a slot. */
+	err = -ENOMEM;
+	xenbus_dev_fatal(pdev->xdev, err,
+			 "No more space on root virtual PCI bus");
+
+unlock:
+	mutex_unlock(&vpci_dev->lock);
+
+	/* Publish this device. */
+	if (!err)
+		err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid);
+	else
+		kfree(dev_entry);
+
+out:
+	return err;
+}
+
+/*
+ * Remove @dev from the virtual bus and hand it to pcistub_put_pci_dev().
+ *
+ * When @lock is true the device lock is taken around the put; when false
+ * the put is done without taking it here. Nothing happens if @dev is not on
+ * any slot list.
+ */
+static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
+					struct pci_dev *dev, bool lock)
+{
+	int slot;
+	struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+	struct pci_dev *found_dev = NULL;
+
+	mutex_lock(&vpci_dev->lock);
+
+	for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+		struct pci_dev_entry *e;
+
+		list_for_each_entry(e, &vpci_dev->dev_list[slot], list) {
+			if (e->dev == dev) {
+				/* Leaves the loop right after freeing e. */
+				list_del(&e->list);
+				found_dev = e->dev;
+				kfree(e);
+				goto out;
+			}
+		}
+	}
+
+out:
+	mutex_unlock(&vpci_dev->lock);
+
+	/* The put is done after vpci_dev->lock has been dropped. */
+	if (found_dev) {
+		if (lock)
+			device_lock(&found_dev->dev);
+		pcistub_put_pci_dev(found_dev);
+		if (lock)
+			device_unlock(&found_dev->dev);
+	}
+}
+
+/*
+ * Allocate the virtual bus state for @pdev: an initialised mutex and one
+ * empty device list per slot, stored in pdev->pci_dev_data.
+ * Returns 0 on success or -ENOMEM.
+ */
+static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev)
+{
+	struct vpci_dev_data *vpci;
+	int i;
+
+	vpci = kmalloc(sizeof(*vpci), GFP_KERNEL);
+	if (vpci == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < PCI_SLOT_MAX; i++)
+		INIT_LIST_HEAD(&vpci->dev_list[i]);
+	mutex_init(&vpci->lock);
+
+	pdev->pci_dev_data = vpci;
+	return 0;
+}
+
+/* Publish the single root (domain 0, bus 0); returns @publish_cb's result. */
+static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev,
+					 publish_pci_root_cb publish_cb)
+{
+	/* The Virtual PCI bus has only one root */
+	return publish_cb(pdev, 0, 0);
+}
+
+/*
+ * Tear down the virtual bus of @pdev: every entry on every slot list is
+ * unlinked, its device is put (with the device lock held) and the entry is
+ * freed; then the bus state itself is freed and pdev->pci_dev_data cleared.
+ * Note that vpci_dev->lock is not taken here.
+ */
+static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev)
+{
+	int slot;
+	struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+
+	for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+		struct pci_dev_entry *e, *tmp;
+		/* _safe variant: entries are deleted while iterating. */
+		list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
+					 list) {
+			struct pci_dev *dev = e->dev;
+			list_del(&e->list);
+			device_lock(&dev->dev);
+			pcistub_put_pci_dev(dev);
+			device_unlock(&dev->dev);
+			kfree(e);
+		}
+	}
+
+	kfree(vpci_dev);
+	pdev->pci_dev_data = NULL;
+}
+
+/*
+ * Translate the real device @pcidev into its address on the virtual bus.
+ *
+ * Every slot list is scanned for an entry with the same domain, bus number
+ * and devfn as @pcidev. For each hit *domain and *bus are set to 0 and
+ * *devfn to (virtual slot, real function number). The scan does not stop at
+ * the first hit. Returns 1 if at least one entry matched, 0 otherwise (the
+ * output parameters are then left untouched).
+ */
+static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
+					struct xen_pcibk_device *pdev,
+					unsigned int *domain, unsigned int *bus,
+					unsigned int *devfn)
+{
+	struct vpci_dev_data *vpci = pdev->pci_dev_data;
+	struct pci_dev_entry *e;
+	int slot;
+	int found = 0;
+
+	mutex_lock(&vpci->lock);
+	for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+		list_for_each_entry(e, &vpci->dev_list[slot], list) {
+			struct pci_dev *cand = e->dev;
+
+			if (!cand)
+				continue;
+			if (cand->bus->number != pcidev->bus->number)
+				continue;
+			if (pci_domain_nr(cand->bus) !=
+			    pci_domain_nr(pcidev->bus))
+				continue;
+			if (cand->devfn != pcidev->devfn)
+				continue;
+
+			found = 1;
+			*domain = 0;
+			*bus = 0;
+			*devfn = PCI_DEVFN(slot, PCI_FUNC(pcidev->devfn));
+		}
+	}
+	mutex_unlock(&vpci->lock);
+
+	return found;
+}
+
+/* Hook table binding the "vpci" backend to the functions in this file. */
+const struct xen_pcibk_backend xen_pcibk_vpci_backend = {
+	.name		= "vpci",
+	.init		= __xen_pcibk_init_devices,
+	.free		= __xen_pcibk_release_devices,
+	.find		= __xen_pcibk_get_pcifront_dev,
+	.publish	= __xen_pcibk_publish_pci_roots,
+	.release	= __xen_pcibk_release_pci_dev,
+	.add		= __xen_pcibk_add_pci_dev,
+	.get		= __xen_pcibk_get_pci_dev,
+};
diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c
new file mode 100644
index 000000000..4fb6aacf9
--- /dev/null
+++ b/drivers/xen/xen-pciback/xenbus.c
@@ -0,0 +1,755 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI Backend Xenbus Setup - handles setup with frontend and xend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+#include "pciback.h"
+
+/* Value kept in pdev->evtchn_irq while no event-channel IRQ is bound. */
+#define INVALID_EVTCHN_IRQ  (-1)
+
+/*
+ * Backend selector, evaluated once in xen_pcibk_xenbus_register():
+ * false picks the "vpci" backend, true the passthrough backend.
+ * Registered with S_IRUGO permissions.
+ */
+static bool __read_mostly passthrough;
+module_param(passthrough, bool, S_IRUGO);
+MODULE_PARM_DESC(passthrough,
+	"Option to specify how to export PCI topology to guest:\n"\
+	" 0 - (default) Hide the true PCI topology and make the frontend think\n"\
+	"   there is a single PCI bus with only the exported devices on it.\n"\
+	"   For example, a device at 03:05.0 will be re-assigned to 00:00.0\n"\
+	"   while second device at 02:1a.1 will be re-assigned to 00:01.1.\n"\
+	" 1 - Passthrough provides a real view of the PCI topology to the\n"\
+	"   frontend (for example, a device at 06:01.b will still appear at\n"\
+	"   06:01.b to the frontend). This is similar to how Xen 2.0.x\n"\
+	"   exposed PCI devices to its driver domains. This may be required\n"\
+	"   for drivers which depend on finding their hardware in certain\n"\
+	"   bus/slot locations.");
+
+/*
+ * Allocate and initialise the per-xenbus-device backend state.
+ *
+ * The structure is zero-allocated, its lock and op_work are set up, and
+ * xen_pcibk_init_devices() is called on it.  If that call fails the
+ * structure is freed again and NULL is returned.
+ *
+ * Unless the allocation itself failed, the result (NULL on the init
+ * failure path) is stored as driver data of @xdev.
+ *
+ * Returns the new structure, or NULL on allocation or init failure.
+ */
+static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev)
+{
+	struct xen_pcibk_device *pdev;
+
+	pdev = kzalloc(sizeof(struct xen_pcibk_device), GFP_KERNEL);
+	if (pdev == NULL)
+		goto out;
+	dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
+
+	pdev->xdev = xdev;
+
+	mutex_init(&pdev->dev_lock);
+
+	/* Start out "not connected": no shared page, no IRQ, no watch */
+	pdev->sh_info = NULL;
+	pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+	pdev->be_watching = 0;
+
+	INIT_WORK(&pdev->op_work, xen_pcibk_do_op);
+
+	if (xen_pcibk_init_devices(pdev)) {
+		kfree(pdev);
+		pdev = NULL;
+	}
+
+	/* pdev may be NULL here, which simply clears the driver data */
+	dev_set_drvdata(&xdev->dev, pdev);
+
+out:
+	return pdev;
+}
+
+/*
+ * Tear down the connection to the frontend, under pdev->dev_lock.
+ *
+ * Unbinds the event-channel IRQ (if one is bound), waits for op_work to
+ * finish, then unmaps the shared page (if mapped).  Both fields are reset
+ * to their "not connected" values, so the guarded steps are skipped when
+ * this is called again.
+ */
+static void xen_pcibk_disconnect(struct xen_pcibk_device *pdev)
+{
+	mutex_lock(&pdev->dev_lock);
+	/* Ensure the guest can't trigger our handler before removing devices */
+	if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) {
+		unbind_from_irqhandler(pdev->evtchn_irq, pdev);
+		pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+	}
+
+	/* If the driver domain started an op, make sure we complete it
+	 * before releasing the shared memory */
+
+	flush_work(&pdev->op_work);
+
+	if (pdev->sh_info != NULL) {
+		xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info);
+		pdev->sh_info = NULL;
+	}
+	mutex_unlock(&pdev->dev_lock);
+}
+
+/*
+ * Undo alloc_pdev() and everything set up since.
+ *
+ * Order: stop the backend watch (if registered), disconnect from the
+ * frontend, release all exported devices, clear the driver data of the
+ * xenbus device and finally free @pdev.  @pdev must not be used afterwards.
+ */
+static void free_pdev(struct xen_pcibk_device *pdev)
+{
+	if (pdev->be_watching) {
+		unregister_xenbus_watch(&pdev->be_watch);
+		pdev->be_watching = 0;
+	}
+
+	xen_pcibk_disconnect(pdev);
+
+	/* N.B. This calls pcistub_put_pci_dev which does the FLR on all
+	 * of the PCIe devices. */
+	xen_pcibk_release_devices(pdev);
+
+	dev_set_drvdata(&pdev->xdev->dev, NULL);
+	pdev->xdev = NULL;
+
+	kfree(pdev);
+}
+
+/*
+ * Map the frontend's shared page and bind its event channel.
+ *
+ * @gnt_ref:       grant reference of the single page to map
+ * @remote_evtchn: frontend event channel to bind to xen_pcibk_handle_event
+ *
+ * On success pdev->sh_info holds the mapped address and pdev->evtchn_irq
+ * the bound IRQ.  Returns 0 on success or a negative error, which has then
+ * already been reported with xenbus_dev_fatal().
+ *
+ * If binding the event channel fails, the page stays mapped in
+ * pdev->sh_info; it is unmapped by xen_pcibk_disconnect().
+ */
+static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref,
+			     int remote_evtchn)
+{
+	int err = 0;
+	void *vaddr;
+
+	dev_dbg(&pdev->xdev->dev,
+		"Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
+		gnt_ref, remote_evtchn);
+
+	err = xenbus_map_ring_valloc(pdev->xdev, &gnt_ref, 1, &vaddr);
+	if (err < 0) {
+		xenbus_dev_fatal(pdev->xdev, err,
+				"Error mapping other domain page in ours.");
+		goto out;
+	}
+
+	pdev->sh_info = vaddr;
+
+	err = bind_interdomain_evtchn_to_irqhandler_lateeoi(
+		pdev->xdev->otherend_id, remote_evtchn, xen_pcibk_handle_event,
+		0, DRV_NAME, pdev);
+	if (err < 0) {
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error binding event channel to IRQ");
+		goto out;
+	}
+	/* A non-negative return value is the IRQ number */
+	pdev->evtchn_irq = err;
+	err = 0;
+
+	dev_dbg(&pdev->xdev->dev, "Attached!\n");
+out:
+	return err;
+}
+
+/*
+ * Connect to the frontend once both ends are in XenbusStateInitialised.
+ *
+ * Under pdev->dev_lock: returns 0 without doing anything unless both our
+ * own and the frontend's state are Initialised; otherwise reads
+ * "pci-op-ref", "event-channel" and "magic" from the frontend, checks the
+ * magic against XEN_PCI_MAGIC, attaches to the shared page and event
+ * channel and switches to XenbusStateConnected.
+ *
+ * Returns 0 on success (or when there was nothing to do), a negative error
+ * otherwise.
+ */
+static int xen_pcibk_attach(struct xen_pcibk_device *pdev)
+{
+	int err = 0;
+	int gnt_ref, remote_evtchn;
+	char *magic = NULL;
+
+
+	mutex_lock(&pdev->dev_lock);
+	/* Make sure we only do this setup once */
+	if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+	    XenbusStateInitialised)
+		goto out;
+
+	/* Wait for frontend to state that it has published the configuration */
+	if (xenbus_read_driver_state(pdev->xdev->otherend) !=
+	    XenbusStateInitialised)
+		goto out;
+
+	dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
+
+	/* NOTE(review): "%u" is used with int variables here - confirm intended */
+	err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
+			    "pci-op-ref", "%u", &gnt_ref,
+			    "event-channel", "%u", &remote_evtchn,
+			    "magic", NULL, &magic, NULL);
+	if (err) {
+		/* The configuration could not be read: report a fatal error */
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error reading configuration from frontend");
+		goto out;
+	}
+
+	if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
+		xenbus_dev_fatal(pdev->xdev, -EFAULT,
+				 "version mismatch (%s/%s) with pcifront - "
+				 "halting " DRV_NAME,
+				 magic, XEN_PCI_MAGIC);
+		err = -EFAULT;
+		goto out;
+	}
+
+	err = xen_pcibk_do_attach(pdev, gnt_ref, remote_evtchn);
+	if (err)
+		goto out;
+
+	dev_dbg(&pdev->xdev->dev, "Connecting...\n");
+
+	err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+	if (err)
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error switching to connected state!");
+
+	dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
+out:
+	mutex_unlock(&pdev->dev_lock);
+
+	/* magic is still NULL (a no-op for kfree) on the early exits */
+	kfree(magic);
+
+	return err;
+}
+
+/*
+ * Write the frontend-visible address of exported device @devid to the
+ * "vdev-<devid>" key below our xenstore node.
+ *
+ * Returns the result of xenbus_printf(), or -ENOMEM if the key name does
+ * not fit the local buffer.
+ */
+static int xen_pcibk_publish_pci_dev(struct xen_pcibk_device *pdev,
+				   unsigned int domain, unsigned int bus,
+				   unsigned int devfn, unsigned int devid)
+{
+	char node[64];
+	int n;
+
+	n = snprintf(node, sizeof(node), "vdev-%d", devid);
+	if (unlikely(n >= (sizeof(node) - 1)))
+		return -ENOMEM;
+
+	/* Note: The PV protocol uses %02x, don't change it */
+	return xenbus_printf(XBT_NIL, pdev->xdev->nodename, node,
+			     "%04x:%02x:%02x.%02x", domain, bus,
+			     PCI_SLOT(devfn), PCI_FUNC(devfn));
+}
+
+/*
+ * Export the physical device domain:bus:slot.func to the frontend as
+ * device number @devid.
+ *
+ * The device is fetched with pcistub_get_pci_dev_by_slot(), added to the
+ * backend (which publishes it through xen_pcibk_publish_pci_dev) and then
+ * registered as owned by the frontend domain.  If that registration fails,
+ * the existing owner entry is removed and the registration is repeated.
+ *
+ * Returns 0 on success, -EINVAL if the device cannot be located, or the
+ * error from xen_pcibk_add_pci_dev().
+ */
+static int xen_pcibk_export_device(struct xen_pcibk_device *pdev,
+				 int domain, int bus, int slot, int func,
+				 int devid)
+{
+	struct pci_dev *dev;
+	int err = 0;
+
+	dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
+		domain, bus, slot, func);
+
+	dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
+	if (!dev) {
+		err = -EINVAL;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Couldn't locate PCI device "
+				 "(%04x:%02x:%02x.%d)! "
+				 "perhaps already in-use?",
+				 domain, bus, slot, func);
+		goto out;
+	}
+
+	err = xen_pcibk_add_pci_dev(pdev, dev, devid,
+				    xen_pcibk_publish_pci_dev);
+	if (err)
+		goto out;
+
+	dev_info(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id);
+	if (xen_register_device_domain_owner(dev,
+					     pdev->xdev->otherend_id) != 0) {
+		/* Registration failed: drop the current owner and retry */
+		dev_err(&dev->dev, "Stealing ownership from dom%d.\n",
+			xen_find_device_domain_owner(dev));
+		xen_unregister_device_domain_owner(dev);
+		xen_register_device_domain_owner(dev, pdev->xdev->otherend_id);
+	}
+
+	/* TODO: It'd be nice to export a bridge and have all of its children
+	 * get exported with it. This may be best done in xend (which will
+	 * have to calculate resource usage anyway) but we probably want to
+	 * put something in here to ensure that if a bridge gets given to a
+	 * driver domain, that all devices under that bridge are not given
+	 * to other driver domains (as he who controls the bridge can disable
+	 * it and stop the other devices from working).
+	 */
+out:
+	return err;
+}
+
+/*
+ * Take the device at (frontend-visible) domain:bus:slot.func away from
+ * the frontend.
+ *
+ * Returns 0 on success or -EINVAL if this backend does not hold such a
+ * device.
+ */
+static int xen_pcibk_remove_device(struct xen_pcibk_device *pdev,
+				 int domain, int bus, int slot, int func)
+{
+	struct pci_dev *pcidev;
+
+	dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n",
+		domain, bus, slot, func);
+
+	pcidev = xen_pcibk_get_pci_dev(pdev, domain, bus,
+				       PCI_DEVFN(slot, func));
+	if (pcidev == NULL) {
+		dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device "
+			"(%04x:%02x:%02x.%d)! not owned by this domain\n",
+			domain, bus, slot, func);
+		return -EINVAL;
+	}
+
+	dev_dbg(&pcidev->dev, "unregistering for %d\n",
+		pdev->xdev->otherend_id);
+	xen_unregister_device_domain_owner(pcidev);
+
+	/* N.B. This ends up calling pcistub_put_pci_dev which ends up
+	 * doing the FLR. */
+	xen_pcibk_release_pci_dev(pdev, pcidev, true /* use the lock. */);
+
+	return 0;
+}
+
+/*
+ * Publish PCI root @domain:@bus to the frontend, unless already published.
+ *
+ * The number of published roots is read from "root_num" (treated as 0 when
+ * the key is missing).  Existing "root-<i>" entries are compared against
+ * @domain/@bus; if one matches nothing is written.  Otherwise a new
+ * "root-<root_num>" entry is written and "root_num" is incremented.
+ *
+ * Returns 0 on success or if the root was already published, a negative
+ * error otherwise.
+ */
+static int xen_pcibk_publish_pci_root(struct xen_pcibk_device *pdev,
+				    unsigned int domain, unsigned int bus)
+{
+	unsigned int d, b;
+	int i, root_num, len, err;
+	char str[64];
+
+	dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
+
+	err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+			   "root_num", "%d", &root_num);
+	/* Nothing parsed or key absent: no roots published so far */
+	if (err == 0 || err == -ENOENT)
+		root_num = 0;
+	else if (err < 0)
+		goto out;
+
+	/* Verify that we haven't already published this pci root */
+	for (i = 0; i < root_num; i++) {
+		len = snprintf(str, sizeof(str), "root-%d", i);
+		if (unlikely(len >= (sizeof(str) - 1))) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+				   str, "%x:%x", &d, &b);
+		if (err < 0)
+			goto out;
+		/* Both fields (domain and bus) must have been parsed */
+		if (err != 2) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (d == domain && b == bus) {
+			err = 0;
+			goto out;
+		}
+	}
+
+	len = snprintf(str, sizeof(str), "root-%d", root_num);
+	if (unlikely(len >= (sizeof(str) - 1))) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
+		root_num, domain, bus);
+
+	err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+			    "%04x:%02x", domain, bus);
+	if (err)
+		goto out;
+
+	err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
+			    "root_num", "%d", (root_num + 1));
+
+out:
+	return err;
+}
+
+/*
+ * Process per-device hot-plug requests found under our xenstore node.
+ *
+ * Runs, under pdev->dev_lock, only if our own xenbus state still equals
+ * @state.  For each index below "num_devs" the "state-<i>" substate is read:
+ *   - XenbusStateInitialising: parse "dev-<i>", export that device,
+ *     publish the PCI roots and set the substate to XenbusStateInitialised;
+ *   - XenbusStateClosing: parse "vdev-<i>" and remove that device;
+ *   - anything else is ignored.
+ * When @state is XenbusStateReconfiguring the backend is finally switched
+ * to XenbusStateReconfigured.
+ *
+ * Processing stops at the first failure; most failures are reported with
+ * xenbus_dev_fatal().  The return value is always 0.
+ */
+static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev,
+				 enum xenbus_state state)
+{
+	int err = 0;
+	int num_devs;
+	int domain, bus, slot, func;
+	unsigned int substate;
+	int i, len;
+	char state_str[64];
+	char dev_str[64];
+
+
+	dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n");
+
+	mutex_lock(&pdev->dev_lock);
+	if (xenbus_read_driver_state(pdev->xdev->nodename) != state)
+		goto out;
+
+	err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
+			   &num_devs);
+	if (err != 1) {
+		if (err >= 0)
+			err = -EINVAL;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error reading number of devices");
+		goto out;
+	}
+
+	for (i = 0; i < num_devs; i++) {
+		len = snprintf(state_str, sizeof(state_str), "state-%d", i);
+		if (unlikely(len >= (sizeof(state_str) - 1))) {
+			err = -ENOMEM;
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "String overflow while reading "
+					 "configuration");
+			goto out;
+		}
+		substate = xenbus_read_unsigned(pdev->xdev->nodename, state_str,
+						XenbusStateUnknown);
+
+		switch (substate) {
+		case XenbusStateInitialising:
+			dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i);
+
+			len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
+			if (unlikely(len >= (sizeof(dev_str) - 1))) {
+				err = -ENOMEM;
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "String overflow while "
+						 "reading configuration");
+				goto out;
+			}
+			err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+					   dev_str, "%x:%x:%x.%x",
+					   &domain, &bus, &slot, &func);
+			if (err < 0) {
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "Error reading device "
+						 "configuration");
+				goto out;
+			}
+			if (err != 4) {
+				err = -EINVAL;
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "Error parsing pci device "
+						 "configuration");
+				goto out;
+			}
+
+			err = xen_pcibk_export_device(pdev, domain, bus, slot,
+						    func, i);
+			if (err)
+				goto out;
+
+			/* Publish pci roots. */
+			err = xen_pcibk_publish_pci_roots(pdev,
+						xen_pcibk_publish_pci_root);
+			if (err) {
+				/* Same wording as in xen_pcibk_setup_backend() */
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "Error while publish PCI root "
+						 "buses for frontend");
+				goto out;
+			}
+
+			err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
+					    state_str, "%d",
+					    XenbusStateInitialised);
+			if (err) {
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "Error switching substate of "
+						 "dev-%d\n", i);
+				goto out;
+			}
+			break;
+
+		case XenbusStateClosing:
+			dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i);
+
+			/* Removal works on the published virtual address */
+			len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i);
+			if (unlikely(len >= (sizeof(dev_str) - 1))) {
+				err = -ENOMEM;
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "String overflow while "
+						 "reading configuration");
+				goto out;
+			}
+			err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+					   dev_str, "%x:%x:%x.%x",
+					   &domain, &bus, &slot, &func);
+			if (err < 0) {
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "Error reading device "
+						 "configuration");
+				goto out;
+			}
+			if (err != 4) {
+				err = -EINVAL;
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "Error parsing pci device "
+						 "configuration");
+				goto out;
+			}
+
+			err = xen_pcibk_remove_device(pdev, domain, bus, slot,
+						    func);
+			if (err)
+				goto out;
+
+			/* TODO: If at some point we implement support for pci
+			 * root hot-remove on pcifront side, we'll need to
+			 * remove unnecessary xenstore nodes of pci roots here.
+			 */
+
+			break;
+
+		default:
+			break;
+		}
+	}
+
+	if (state != XenbusStateReconfiguring)
+		/* Make sure we only reconfigure once. */
+		goto out;
+
+	err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured);
+	if (err) {
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error switching to reconfigured state!");
+		goto out;
+	}
+
+out:
+	mutex_unlock(&pdev->dev_lock);
+	return 0;
+}
+
+/*
+ * xenbus "otherend_changed" callback: react to a frontend state change.
+ *
+ *   Initialised   -> try to attach to the frontend
+ *   Reconfiguring -> process per-device hot-plug requests
+ *   Connected     -> switch ourselves to Connected
+ *   Closing       -> disconnect, then switch to Closing
+ *   Closed        -> disconnect, switch to Closed and, if the device is
+ *                    no longer online, unregister it
+ *   Unknown       -> unregister the device
+ * All other states are ignored.
+ */
+static void xen_pcibk_frontend_changed(struct xenbus_device *xdev,
+				     enum xenbus_state fe_state)
+{
+	struct xen_pcibk_device *pdev = dev_get_drvdata(&xdev->dev);
+
+	dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
+
+	switch (fe_state) {
+	case XenbusStateInitialised:
+		xen_pcibk_attach(pdev);
+		break;
+
+	case XenbusStateReconfiguring:
+		xen_pcibk_reconfigure(pdev, XenbusStateReconfiguring);
+		break;
+
+	case XenbusStateConnected:
+		/* pcifront switched its state from reconfiguring to connected.
+		 * Then switch to connected state.
+		 */
+		xenbus_switch_state(xdev, XenbusStateConnected);
+		break;
+
+	case XenbusStateClosing:
+		xen_pcibk_disconnect(pdev);
+		xenbus_switch_state(xdev, XenbusStateClosing);
+		break;
+
+	case XenbusStateClosed:
+		xen_pcibk_disconnect(pdev);
+		xenbus_switch_state(xdev, XenbusStateClosed);
+		if (xenbus_dev_is_online(xdev))
+			break;
+		/* fall through if not online */
+	case XenbusStateUnknown:
+		dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
+		device_unregister(&xdev->dev);
+		break;
+
+	default:
+		break;
+	}
+}
+
+/*
+ * Initial backend setup, performed while we are in XenbusStateInitWait.
+ *
+ * Under pdev->dev_lock: reads "num_devs", then for every "dev-<i>" entry
+ * parses the address, exports the device and writes
+ * XenbusStateInitialised to "state-<i>".  Afterwards the PCI roots are
+ * published and our state is switched to XenbusStateInitialised.
+ *
+ * If nothing failed (including the case where we were not in InitWait),
+ * xen_pcibk_attach() is called after the lock has been dropped.
+ *
+ * Returns 0 on success or a negative error.
+ */
+static int xen_pcibk_setup_backend(struct xen_pcibk_device *pdev)
+{
+	/* Get configuration from xend (if available now) */
+	int domain, bus, slot, func;
+	int err = 0;
+	int i, num_devs;
+	char dev_str[64];
+	char state_str[64];
+
+	mutex_lock(&pdev->dev_lock);
+	/* It's possible we could get the call to setup twice, so make sure
+	 * we're not already connected.
+	 */
+	if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+	    XenbusStateInitWait)
+		goto out;
+
+	dev_dbg(&pdev->xdev->dev, "getting be setup\n");
+
+	err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
+			   &num_devs);
+	if (err != 1) {
+		if (err >= 0)
+			err = -EINVAL;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error reading number of devices");
+		goto out;
+	}
+
+	for (i = 0; i < num_devs; i++) {
+		int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
+		if (unlikely(l >= (sizeof(dev_str) - 1))) {
+			err = -ENOMEM;
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "String overflow while reading "
+					 "configuration");
+			goto out;
+		}
+
+		err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
+				   "%x:%x:%x.%x", &domain, &bus, &slot, &func);
+		if (err < 0) {
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "Error reading device configuration");
+			goto out;
+		}
+		/* All four address fields must have been parsed */
+		if (err != 4) {
+			err = -EINVAL;
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "Error parsing pci device "
+					 "configuration");
+			goto out;
+		}
+
+		err = xen_pcibk_export_device(pdev, domain, bus, slot, func, i);
+		if (err)
+			goto out;
+
+		/* Switch substate of this device. */
+		l = snprintf(state_str, sizeof(state_str), "state-%d", i);
+		if (unlikely(l >= (sizeof(state_str) - 1))) {
+			err = -ENOMEM;
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "String overflow while reading "
+					 "configuration");
+			goto out;
+		}
+		err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str,
+				    "%d", XenbusStateInitialised);
+		if (err) {
+			xenbus_dev_fatal(pdev->xdev, err, "Error switching "
+					 "substate of dev-%d\n", i);
+			goto out;
+		}
+	}
+
+	err = xen_pcibk_publish_pci_roots(pdev, xen_pcibk_publish_pci_root);
+	if (err) {
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error while publish PCI root buses "
+				 "for frontend");
+		goto out;
+	}
+
+	err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
+	if (err)
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error switching to initialised state!");
+
+out:
+	mutex_unlock(&pdev->dev_lock);
+	if (!err)
+		/* see if pcifront is already configured (if not, we'll wait) */
+		xen_pcibk_attach(pdev);
+	return err;
+}
+
+/*
+ * Watch callback for our own xenstore node.
+ *
+ * Dispatches on the current backend state: InitWait triggers the initial
+ * setup, Initialised triggers a reconfigure pass; any other state is
+ * ignored.  @path and @token are not used.
+ */
+static void xen_pcibk_be_watch(struct xenbus_watch *watch,
+			       const char *path, const char *token)
+{
+	struct xen_pcibk_device *pdev;
+	enum xenbus_state be_state;
+
+	pdev = container_of(watch, struct xen_pcibk_device, be_watch);
+	be_state = xenbus_read_driver_state(pdev->xdev->nodename);
+
+	if (be_state == XenbusStateInitWait) {
+		xen_pcibk_setup_backend(pdev);
+	} else if (be_state == XenbusStateInitialised) {
+		/*
+		 * We typically move to Initialised when the first device was
+		 * added. Hence subsequent devices getting added may need
+		 * reconfiguring.
+		 */
+		xen_pcibk_reconfigure(pdev, XenbusStateInitialised);
+	}
+}
+
+/*
+ * xenbus probe callback for a new "pci" backend device.
+ *
+ * Allocates the backend state, switches to XenbusStateInitWait, installs
+ * a watch on our own xenstore node and then invokes the watch callback
+ * once by hand in case the configuration is already present.
+ *
+ * Returns 0 on success or a negative error.
+ *
+ * NOTE(review): on the error paths after alloc_pdev() succeeded, pdev is
+ * not freed here - confirm whether something else releases it.
+ */
+static int xen_pcibk_xenbus_probe(struct xenbus_device *dev,
+				const struct xenbus_device_id *id)
+{
+	int err = 0;
+	struct xen_pcibk_device *pdev = alloc_pdev(dev);
+
+	if (pdev == NULL) {
+		err = -ENOMEM;
+		xenbus_dev_fatal(dev, err,
+				 "Error allocating xen_pcibk_device struct");
+		goto out;
+	}
+
+	/* wait for xend to configure us */
+	err = xenbus_switch_state(dev, XenbusStateInitWait);
+	if (err)
+		goto out;
+
+	/* watch the backend node for backend configuration information */
+	err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
+				NULL, xen_pcibk_be_watch);
+	if (err)
+		goto out;
+
+	pdev->be_watching = 1;
+
+	/* We need to force a call to our callback here in case
+	 * xend already configured us!
+	 */
+	xen_pcibk_be_watch(&pdev->be_watch, NULL, NULL);
+
+out:
+	return err;
+}
+
+/*
+ * xenbus remove callback: release the backend state attached to @dev,
+ * if there is any.  Always returns 0.
+ */
+static int xen_pcibk_xenbus_remove(struct xenbus_device *dev)
+{
+	struct xen_pcibk_device *pdev;
+
+	pdev = dev_get_drvdata(&dev->dev);
+	if (pdev)
+		free_pdev(pdev);
+
+	return 0;
+}
+
+/* Device types handled by this driver; the empty entry ends the table. */
+static const struct xenbus_device_id xen_pcibk_ids[] = {
+	{"pci"},
+	{""},
+};
+
+/* xenbus driver description handed to xenbus_register_backend(). */
+static struct xenbus_driver xen_pcibk_driver = {
+	.name                   = DRV_NAME,
+	.ids                    = xen_pcibk_ids,
+	.probe			= xen_pcibk_xenbus_probe,
+	.remove			= xen_pcibk_xenbus_remove,
+	.otherend_changed	= xen_pcibk_frontend_changed,
+};
+
+/* Active backend operations; assigned in xen_pcibk_xenbus_register(). */
+const struct xen_pcibk_backend *__read_mostly xen_pcibk_backend;
+
+/*
+ * Choose the backend implementation according to the "passthrough"
+ * module parameter, log the choice and register the xenbus backend
+ * driver.  Returns the result of xenbus_register_backend().
+ */
+int __init xen_pcibk_xenbus_register(void)
+{
+	xen_pcibk_backend = passthrough ? &xen_pcibk_passthrough_backend
+					: &xen_pcibk_vpci_backend;
+
+	pr_info("backend is %s\n", xen_pcibk_backend->name);
+
+	return xenbus_register_backend(&xen_pcibk_driver);
+}
+
+/* Counterpart of xen_pcibk_xenbus_register(): unregister the xenbus driver. */
+void __exit xen_pcibk_xenbus_unregister(void)
+{
+	xenbus_unregister_driver(&xen_pcibk_driver);
+}
diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c
new file mode 100644
index 000000000..614d067ff
--- /dev/null
+++ b/drivers/xen/xen-scsiback.c
@@ -0,0 +1,1910 @@
+/*
+ * Xen SCSI backend driver
+ *
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * Based on the blkback driver code.
+ * Adaptation to kernel target core infrastructure taken from vhost/scsi.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "xen-pvscsi: " fmt
+
+#include <stdarg.h>
+
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/gfp.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/configfs.h>
+
+#include <generated/utsrelease.h>
+
+#include <scsi/scsi_host.h> /* SG_ALL */
+
+#include <target/target_core_base.h>
+#include <target/target_core_fabric.h>
+
+#include <asm/hypervisor.h>
+
+#include <xen/xen.h>
+#include <xen/balloon.h>
+#include <xen/events.h>
+#include <xen/xenbus.h>
+#include <xen/grant_table.h>
+#include <xen/page.h>
+
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/vscsiif.h>
+
+/* Version string of this backend. */
+#define VSCSI_VERSION	"v0.1"
+/* Size of the name buffers in scsiback_tport and scsiback_tpg. */
+#define VSCSI_NAMELEN	32
+
+/* SCSI address as four numeric components; used as the "from" side of a v2p_entry. */
+struct ids_tuple {
+	unsigned int hst;		/* host    */
+	unsigned int chn;		/* channel */
+	unsigned int tgt;		/* target  */
+	unsigned int lun;		/* LUN     */
+};
+
+/*
+ * One translation entry, mapping the address in @v to a target portal
+ * group plus LUN.  Reference counted through @kref; the final put runs
+ * scsiback_free_translation_entry().
+ */
+struct v2p_entry {
+	struct ids_tuple v;		/* translate from */
+	struct scsiback_tpg *tpg;	/* translate to   */
+	unsigned int lun;		/* LUN passed to the target core */
+	struct kref kref;
+	struct list_head l;		/* presumably links into vscsibk_info.v2p_entry_lists - confirm */
+};
+
+/* Backend state of one vscsi interface: xenbus device, ring, IRQ and translation list. */
+struct vscsibk_info {
+	struct xenbus_device *dev;
+
+	domid_t domid;		/* domain whose grant references get mapped */
+	unsigned int irq;	/* notified after responses have been pushed */
+
+	struct vscsiif_back_ring ring;
+
+	/* Held while a response is put on @ring */
+	spinlock_t ring_lock;
+	/* Raised by scsiback_get(), dropped by scsiback_put() */
+	atomic_t nr_unreplied_reqs;
+
+	/* NOTE(review): presumably protects v2p_entry_lists - confirm */
+	spinlock_t v2p_lock;
+	struct list_head v2p_entry_lists;
+
+	/* Woken by scsiback_put() when nr_unreplied_reqs reaches zero */
+	wait_queue_head_t waiting_to_free;
+};
+
+/*
+ * Theoretical maximum of grants for one request; sizes the grant_handles[]
+ * and pages[] arrays of struct vscsibk_pend.
+ */
+#define VSCSI_MAX_GRANTS	(SG_ALL + VSCSIIF_SG_TABLESIZE)
+
+/*
+ * VSCSI_GRANT_BATCH is the maximum number of grants to be processed in one
+ * call to map/unmap grants. Don't choose it too large, as there are arrays
+ * with VSCSI_GRANT_BATCH elements allocated on the stack.
+ */
+#define VSCSI_GRANT_BATCH	16
+
+/* State of one request while it is being processed by the backend. */
+struct vscsibk_pend {
+	uint16_t rqid;		/* echoed back in the ring response */
+
+	uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE];	/* CDB given to the target core */
+	uint8_t cmd_len;
+
+	uint8_t sc_data_direction;
+	uint16_t n_sg;		/* real length of SG list */
+	uint16_t n_grants;	/* SG pages and potentially SG list */
+	uint32_t data_len;
+	uint32_t result;	/* reported as the response result */
+
+	struct vscsibk_info *info;
+	struct v2p_entry *v2p;	/* may be NULL; reference dropped after the response */
+	struct scatterlist *sgl;	/* freed in scsiback_fast_flush_area() */
+
+	uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE];
+
+	/* Parallel arrays: entry i describes grant i of this request */
+	grant_handle_t grant_handles[VSCSI_MAX_GRANTS];
+	struct page *pages[VSCSI_MAX_GRANTS];
+
+	struct se_cmd se_cmd;
+
+	/* NOTE(review): not used in the visible part of the file */
+	struct completion tmr_done;
+};
+
+/* NOTE(review): not referenced in the visible part of the file; presumably a per-session tag count - confirm */
+#define VSCSI_DEFAULT_SESSION_TAGS	128
+
+/* Wrapper around the TCM session; reached through scsiback_tpg.tpg_nexus. */
+struct scsiback_nexus {
+	/* Pointer to TCM session for I_T Nexus */
+	struct se_session *tvn_se_sess;
+};
+
+/* Description of one pvscsi target port. */
+struct scsiback_tport {
+	/* SCSI protocol the tport is providing */
+	u8 tport_proto_id;
+	/* Binary World Wide unique Port Name for pvscsi Target port */
+	u64 tport_wwpn;
+	/* ASCII formatted WWPN for pvscsi Target port */
+	char tport_name[VSCSI_NAMELEN];
+	/* Returned by scsiback_make_tport() */
+	struct se_wwn tport_wwn;
+};
+
+/* Description of one target portal group belonging to a scsiback_tport. */
+struct scsiback_tpg {
+	/* scsiback port target portal group tag for TCM */
+	u16 tport_tpgt;
+	/* track number of TPG Port/Lun Links wrt explicit I_T Nexus shutdown */
+	int tv_tpg_port_count;
+	/*
+	 * xen-pvscsi references to tpg_nexus, protected by tv_tpg_mutex;
+	 * decremented when a translation entry is freed
+	 */
+	int tv_tpg_fe_count;
+	/* list for scsiback_list */
+	struct list_head tv_tpg_list;
+	/* Used to protect access for tpg_nexus */
+	struct mutex tv_tpg_mutex;
+	/* Pointer to the TCM pvscsi I_T Nexus for this TPG endpoint */
+	struct scsiback_nexus *tpg_nexus;
+	/* Pointer back to scsiback_tport */
+	struct scsiback_tport *tport;
+	/* Returned by scsiback_make_tpg() */
+	struct se_portal_group se_tpg;
+	/* alias used in xenstore */
+	char param_alias[VSCSI_NAMELEN];
+	/* list of info structures related to this target portal group */
+	struct list_head info_list;
+};
+
+/* Marks a grant_handles[] slot that holds no mapped grant. */
+#define SCSIBACK_INVALID_HANDLE (~0)
+
+/* When set, scsiback_cmd_done() logs the status of failed commands. */
+static bool log_print_stat;
+module_param(log_print_stat, bool, 0644);
+
+/* Upper bound for the page pool below; see put_free_pages(). */
+static int scsiback_max_buffer_pages = 1024;
+module_param_named(max_buffer_pages, scsiback_max_buffer_pages, int, 0644);
+MODULE_PARM_DESC(max_buffer_pages,
+"Maximum number of free pages to keep in backend buffer");
+
+/* Pool of unused grant pages: list, element count and the lock for both. */
+static DEFINE_SPINLOCK(free_pages_lock);
+static int free_pages_num;
+static LIST_HEAD(scsiback_free_pages);
+
+/* Global mutex to protect scsiback TPG list */
+static DEFINE_MUTEX(scsiback_mutex);
+static LIST_HEAD(scsiback_list);
+
+/* Account for one more request that has not been answered yet. */
+static void scsiback_get(struct vscsibk_info *info)
+{
+	atomic_inc(&info->nr_unreplied_reqs);
+}
+
+/*
+ * Counterpart of scsiback_get(); wakes waiters on info->waiting_to_free
+ * when the last outstanding request has been accounted for.
+ */
+static void scsiback_put(struct vscsibk_info *info)
+{
+	if (atomic_dec_and_test(&info->nr_unreplied_reqs))
+		wake_up(&info->waiting_to_free);
+}
+
+/*
+ * Return @num pages to the free-page pool.
+ *
+ * If the pool would grow beyond scsiback_max_buffer_pages, the excess is
+ * taken from the end of @page and released with gnttab_free_pages(); only
+ * the remaining leading pages are added to the pool.
+ *
+ * NOTE(review): free_pages_num is sampled before free_pages_lock is
+ * taken, so the limit check can be stale - confirm this is acceptable.
+ */
+static void put_free_pages(struct page **page, int num)
+{
+	unsigned long flags;
+	int i = free_pages_num + num, n = num;
+
+	if (num == 0)
+		return;
+	if (i > scsiback_max_buffer_pages) {
+		/* n: number of pages to free now, then number to keep */
+		n = min(num, i - scsiback_max_buffer_pages);
+		gnttab_free_pages(n, page + num - n);
+		n = num - n;
+	}
+	spin_lock_irqsave(&free_pages_lock, flags);
+	for (i = 0; i < n; i++)
+		list_add(&page[i]->lru, &scsiback_free_pages);
+	free_pages_num += n;
+	spin_unlock_irqrestore(&free_pages_lock, flags);
+}
+
+/*
+ * Obtain one page, preferably from the free-page pool.
+ *
+ * If the pool is empty a fresh page is requested with
+ * gnttab_alloc_pages() and its return value is passed on.  Otherwise the
+ * first pooled page is removed from the pool, stored in page[0] and 0 is
+ * returned.
+ */
+static int get_free_page(struct page **page)
+{
+	struct page *cached = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&free_pages_lock, flags);
+	if (!list_empty(&scsiback_free_pages)) {
+		cached = list_first_entry(&scsiback_free_pages,
+					  struct page, lru);
+		list_del(&cached->lru);
+		free_pages_num--;
+	}
+	spin_unlock_irqrestore(&free_pages_lock, flags);
+
+	if (!cached)
+		return gnttab_alloc_pages(1, page);
+
+	page[0] = cached;
+	return 0;
+}
+
+/* Kernel virtual address of @page, as an unsigned long. */
+static unsigned long vaddr_page(struct page *page)
+{
+	return (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+}
+
+/* Kernel virtual address of page number @seg of request @req. */
+static unsigned long vaddr(struct vscsibk_pend *req, int seg)
+{
+	return vaddr_page(req->pages[seg]);
+}
+
+/*
+ * Log target port name, LUN, first CDB byte and the status, message, host
+ * and driver bytes of @errors for a request.
+ *
+ * @sense_buffer is not used.  pending_req->v2p is dereferenced without a
+ * NULL check.
+ */
+static void scsiback_print_status(char *sense_buffer, int errors,
+					struct vscsibk_pend *pending_req)
+{
+	struct scsiback_tpg *tpg = pending_req->v2p->tpg;
+
+	pr_err("[%s:%d] cmnd[0]=%02x -> st=%02x msg=%02x host=%02x drv=%02x\n",
+	       tpg->tport->tport_name, pending_req->v2p->lun,
+	       pending_req->cmnd[0], status_byte(errors), msg_byte(errors),
+	       host_byte(errors), driver_byte(errors));
+}
+
+/*
+ * Release the data buffers of a request.
+ *
+ * Frees the scatterlist, then unmaps every grant that still has a valid
+ * handle, in batches of at most VSCSI_GRANT_BATCH, dropping the page
+ * reference held for each of them.  Finally all req->n_grants pages go
+ * back to the free-page pool and n_grants is reset to 0.
+ *
+ * An unmap failure triggers BUG_ON().
+ */
+static void scsiback_fast_flush_area(struct vscsibk_pend *req)
+{
+	struct gnttab_unmap_grant_ref unmap[VSCSI_GRANT_BATCH];
+	struct page *pages[VSCSI_GRANT_BATCH];
+	unsigned int i, invcount = 0;
+	grant_handle_t handle;
+	int err;
+
+	kfree(req->sgl);
+	req->sgl = NULL;
+	req->n_sg = 0;
+
+	if (!req->n_grants)
+		return;
+
+	for (i = 0; i < req->n_grants; i++) {
+		handle = req->grant_handles[i];
+		/* Slot was never mapped successfully: nothing to unmap */
+		if (handle == SCSIBACK_INVALID_HANDLE)
+			continue;
+		gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
+				    GNTMAP_host_map, handle);
+		req->grant_handles[i] = SCSIBACK_INVALID_HANDLE;
+		pages[invcount] = req->pages[i];
+		put_page(pages[invcount]);
+		invcount++;
+		/* Keep collecting until the batch is full */
+		if (invcount < VSCSI_GRANT_BATCH)
+			continue;
+		err = gnttab_unmap_refs(unmap, NULL, pages, invcount);
+		BUG_ON(err);
+		invcount = 0;
+	}
+
+	/* Unmap the final, partially filled batch */
+	if (invcount) {
+		err = gnttab_unmap_refs(unmap, NULL, pages, invcount);
+		BUG_ON(err);
+	}
+
+	put_free_pages(req->pages, req->n_grants);
+	req->n_grants = 0;
+}
+
+/*
+ * kref release function of a v2p_entry: drops the frontend count of the
+ * entry's target portal group under tv_tpg_mutex and frees the entry.
+ */
+static void scsiback_free_translation_entry(struct kref *kref)
+{
+	struct v2p_entry *entry = container_of(kref, struct v2p_entry, kref);
+	struct scsiback_tpg *tpg = entry->tpg;
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+	tpg->tv_tpg_fe_count--;
+	mutex_unlock(&tpg->tv_tpg_mutex);
+
+	kfree(entry);
+}
+
+/*
+ * Put one response on the ring and notify the frontend if required.
+ *
+ * @sense_buffer: sense data of the command, or NULL if there is none
+ * @result:       value stored in the response's rslt field
+ * @resid:        value stored in the response's residual_len field
+ * @rqid:         request id the response belongs to
+ *
+ * Sense data is copied only if scsi_normalize_sense() accepts it.  The
+ * copied length is 8 plus byte 7 of the sense data, capped at
+ * VSCSIIF_SENSE_BUFFERSIZE; otherwise sense_len is set to 0.
+ *
+ * The ring is updated under info->ring_lock; the notification is sent
+ * after the lock has been released.
+ */
+static void scsiback_send_response(struct vscsibk_info *info,
+			char *sense_buffer, int32_t result, uint32_t resid,
+			uint16_t rqid)
+{
+	struct vscsiif_response *ring_res;
+	int notify;
+	struct scsi_sense_hdr sshdr;
+	unsigned long flags;
+	unsigned len;
+
+	spin_lock_irqsave(&info->ring_lock, flags);
+
+	ring_res = RING_GET_RESPONSE(&info->ring, info->ring.rsp_prod_pvt);
+	info->ring.rsp_prod_pvt++;
+
+	ring_res->rslt   = result;
+	ring_res->rqid   = rqid;
+
+	if (sense_buffer != NULL &&
+	    scsi_normalize_sense(sense_buffer, VSCSIIF_SENSE_BUFFERSIZE,
+				 &sshdr)) {
+		/*
+		 * Read the length byte as unsigned: with a signed char a
+		 * value >= 0x80 would be sign-extended and could shrink
+		 * the computed length below 8.
+		 */
+		len = min_t(unsigned, 8 + (u8)sense_buffer[7],
+			    VSCSIIF_SENSE_BUFFERSIZE);
+		memcpy(ring_res->sense_buffer, sense_buffer, len);
+		ring_res->sense_len = len;
+	} else {
+		ring_res->sense_len = 0;
+	}
+
+	ring_res->residual_len = resid;
+
+	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&info->ring, notify);
+	spin_unlock_irqrestore(&info->ring_lock, flags);
+
+	if (notify)
+		notify_remote_via_irq(info->irq);
+}
+
+/*
+ * Send the response for @pending_req and afterwards drop the request's
+ * reference on its translation entry, if it has one.
+ */
+static void scsiback_do_resp_with_sense(char *sense_buffer, int32_t result,
+			uint32_t resid, struct vscsibk_pend *pending_req)
+{
+	struct v2p_entry *entry;
+
+	scsiback_send_response(pending_req->info, sense_buffer, result,
+			       resid, pending_req->rqid);
+
+	entry = pending_req->v2p;
+	if (!entry)
+		return;
+
+	kref_put(&entry->kref, scsiback_free_translation_entry);
+}
+
+/*
+ * Complete a request: optionally log a failure, release its data buffers,
+ * send the response (sense data, result and residual count), drop the
+ * outstanding-request count and put the se_cmd reference.
+ */
+static void scsiback_cmd_done(struct vscsibk_pend *pending_req)
+{
+	/* Saved up front; info is still needed after the response is sent */
+	struct vscsibk_info *info = pending_req->info;
+	unsigned char *sense_buffer;
+	unsigned int resid;
+	int errors;
+
+	sense_buffer = pending_req->sense_buffer;
+	resid        = pending_req->se_cmd.residual_count;
+	errors       = pending_req->result;
+
+	if (errors && log_print_stat)
+		scsiback_print_status(sense_buffer, errors, pending_req);
+
+	scsiback_fast_flush_area(pending_req);
+	scsiback_do_resp_with_sense(sense_buffer, errors, resid, pending_req);
+	scsiback_put(info);
+	/*
+	 * Drop the extra KREF_ACK reference taken by target_submit_cmd_map_sgls()
+	 * ahead of scsiback_check_stop_free() ->  transport_generic_free_cmd()
+	 * final se_cmd->cmd_kref put.
+	 */
+	target_put_sess_cmd(&pending_req->se_cmd);
+}
+
+/*
+ * Hand a prepared request to the target core.
+ *
+ * Counts the request as outstanding, uses the request id as command tag
+ * and submits CDB, sense buffer, LUN, data length, direction and
+ * scatterlist with TARGET_SCF_ACK_KREF.  If submission fails, a check
+ * condition (logical unit communication failure) is sent and the command
+ * is freed.
+ *
+ * Dereferences pending_req->v2p->tpg->tpg_nexus without NULL checks.
+ */
+static void scsiback_cmd_exec(struct vscsibk_pend *pending_req)
+{
+	struct se_cmd *se_cmd = &pending_req->se_cmd;
+	struct se_session *sess = pending_req->v2p->tpg->tpg_nexus->tvn_se_sess;
+	int rc;
+
+	scsiback_get(pending_req->info);
+	se_cmd->tag = pending_req->rqid;
+	rc = target_submit_cmd_map_sgls(se_cmd, sess, pending_req->cmnd,
+			pending_req->sense_buffer, pending_req->v2p->lun,
+			pending_req->data_len, 0,
+			pending_req->sc_data_direction, TARGET_SCF_ACK_KREF,
+			pending_req->sgl, pending_req->n_sg,
+			NULL, 0, NULL, 0);
+	if (rc < 0) {
+		transport_send_check_condition_and_sense(se_cmd,
+				TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE, 0);
+		transport_generic_free_cmd(se_cmd, 0);
+	}
+}
+
+/*
+ * Map one batch of @cnt prepared grant operations.
+ *
+ * For each operation that succeeded a reference on the page is taken; for
+ * each one that failed the handle is replaced by SCSIBACK_INVALID_HANDLE.
+ * All handles are copied to @grant.
+ *
+ * Returns 0 if @cnt is 0 or everything was mapped; otherwise the error of
+ * gnttab_map_refs() or, if that call itself reported success, -ENOMEM.
+ */
+static int scsiback_gnttab_data_map_batch(struct gnttab_map_grant_ref *map,
+	struct page **pg, grant_handle_t *grant, int cnt)
+{
+	int err, i;
+
+	if (!cnt)
+		return 0;
+
+	err = gnttab_map_refs(map, NULL, pg, cnt);
+	/* Every entry is inspected even if the call as a whole failed */
+	for (i = 0; i < cnt; i++) {
+		if (unlikely(map[i].status != GNTST_okay)) {
+			pr_err("invalid buffer -- could not remap it\n");
+			map[i].handle = SCSIBACK_INVALID_HANDLE;
+			if (!err)
+				err = -ENOMEM;
+		} else {
+			get_page(pg[i]);
+		}
+		grant[i] = map[i].handle;
+	}
+	return err;
+}
+
+/*
+ * Grant-map the @cnt segments in @seg on behalf of @pending_req.
+ *
+ * For every segment a page is obtained with get_free_page() and a map
+ * operation for it is queued in a local array of VSCSI_GRANT_BATCH entries.
+ * A full array is submitted through scsiback_gnttab_data_map_batch(), as is
+ * whatever is left over after the loop.  Pages are stored through @pg and
+ * grant handles through @grant; pending_req->n_grants is advanced by the
+ * size of every submitted batch, also when that batch reported an error.
+ *
+ * Returns 0 on success, -ENOMEM if no page could be obtained, or the error
+ * returned for a batch.
+ */
+static int scsiback_gnttab_data_map_list(struct vscsibk_pend *pending_req,
+			struct scsiif_request_segment *seg, struct page **pg,
+			grant_handle_t *grant, int cnt, u32 flags)
+{
+	int mapcount = 0, i, err = 0;
+	struct gnttab_map_grant_ref map[VSCSI_GRANT_BATCH];
+	struct vscsibk_info *info = pending_req->info;
+
+	for (i = 0; i < cnt; i++) {
+		if (get_free_page(pg + mapcount)) {
+			/* Only the pages of the not yet submitted batch. */
+			put_free_pages(pg, mapcount);
+			pr_err("no grant page\n");
+			return -ENOMEM;
+		}
+		gnttab_set_map_op(&map[mapcount], vaddr_page(pg[mapcount]),
+				  flags, seg[i].gref, info->domid);
+		mapcount++;
+		/* Keep collecting until the batch array is full. */
+		if (mapcount < VSCSI_GRANT_BATCH)
+			continue;
+		err = scsiback_gnttab_data_map_batch(map, pg, grant, mapcount);
+		/* Step past the entries consumed by this batch. */
+		pg += mapcount;
+		grant += mapcount;
+		pending_req->n_grants += mapcount;
+		if (err)
+			return err;
+		mapcount = 0;
+	}
+	/* Submit the remaining partial batch (a no-op for mapcount == 0). */
+	err = scsiback_gnttab_data_map_batch(map, pg, grant, mapcount);
+	pending_req->n_grants += mapcount;
+	return err;
+}
+
+/*
+ * Map the data buffers of a SCSI request and build its scatterlist.
+ *
+ * The value of ring_req->nr_segments without the VSCSIIF_SG_GRANT bit is the
+ * number of entries used in ring_req->seg[].  Without VSCSIIF_SG_GRANT these
+ * entries are the data segments themselves.  With VSCSIIF_SG_GRANT each
+ * entry references a page area holding an array of
+ * struct scsiif_request_segment: those pages are mapped read-only first and
+ * the data segments are then read from them.
+ *
+ * On success pending_req->sgl, ->n_sg and ->data_len describe the data.
+ *
+ * Returns 0 on success (also for a request without segments) or a negative
+ * errno.  Nothing mapped or allocated up to the point of failure is released
+ * here; the caller in scsiback_do_cmd_fn() calls scsiback_fast_flush_area()
+ * on error.
+ */
+static int scsiback_gnttab_data_map(struct vscsiif_request *ring_req,
+					struct vscsibk_pend *pending_req)
+{
+	u32 flags;
+	int i, err, n_segs, i_seg = 0;
+	struct page **pg;
+	struct scsiif_request_segment *seg;
+	unsigned long end_seg = 0;
+	unsigned int nr_segments = (unsigned int)ring_req->nr_segments;
+	unsigned int nr_sgl = 0;
+	struct scatterlist *sg;
+	grant_handle_t *grant;
+
+	pending_req->n_sg = 0;
+	pending_req->n_grants = 0;
+	pending_req->data_len = 0;
+
+	nr_segments &= ~VSCSIIF_SG_GRANT;
+	if (!nr_segments)
+		return 0;
+
+	if (nr_segments > VSCSIIF_SG_TABLESIZE) {
+		pr_debug("invalid parameter nr_seg = %d\n",
+			ring_req->nr_segments);
+		return -EINVAL;
+	}
+
+	if (ring_req->nr_segments & VSCSIIF_SG_GRANT) {
+		/* Map the pages that hold the real segment descriptors. */
+		err = scsiback_gnttab_data_map_list(pending_req, ring_req->seg,
+			pending_req->pages, pending_req->grant_handles,
+			nr_segments, GNTMAP_host_map | GNTMAP_readonly);
+		if (err)
+			return err;
+		nr_sgl = nr_segments;
+		nr_segments = 0;
+		for (i = 0; i < nr_sgl; i++) {
+			n_segs = ring_req->seg[i].length /
+				 sizeof(struct scsiif_request_segment);
+			/*
+			 * Every list entry must describe at least one whole
+			 * descriptor and must not cross its page.  An empty
+			 * entry is rejected: it would let the total drop to
+			 * zero (sg_init_table() cannot handle zero entries)
+			 * and would desynchronise the descriptor walk below,
+			 * which advances by at most one list entry per
+			 * segment.
+			 */
+			if (!n_segs ||
+			    (unsigned)ring_req->seg[i].offset +
+			    (unsigned)ring_req->seg[i].length > PAGE_SIZE ||
+			    n_segs * sizeof(struct scsiif_request_segment) !=
+			    ring_req->seg[i].length)
+				return -EINVAL;
+			nr_segments += n_segs;
+		}
+		if (nr_segments > SG_ALL) {
+			pr_debug("invalid nr_seg = %d\n", nr_segments);
+			return -EINVAL;
+		}
+	}
+
+	/* free of (sgl) in fast_flush_area() */
+	pending_req->sgl = kmalloc_array(nr_segments,
+					sizeof(struct scatterlist), GFP_KERNEL);
+	if (!pending_req->sgl)
+		return -ENOMEM;
+
+	sg_init_table(pending_req->sgl, nr_segments);
+	pending_req->n_sg = nr_segments;
+
+	/* Data written to the device only needs to be readable here. */
+	flags = GNTMAP_host_map;
+	if (pending_req->sc_data_direction == DMA_TO_DEVICE)
+		flags |= GNTMAP_readonly;
+
+	/* Data pages and handles follow those used for the list pages. */
+	pg = pending_req->pages + nr_sgl;
+	grant = pending_req->grant_handles + nr_sgl;
+	if (!nr_sgl) {
+		seg = ring_req->seg;
+		err = scsiback_gnttab_data_map_list(pending_req, seg,
+			pg, grant, nr_segments, flags);
+		if (err)
+			return err;
+	} else {
+		for (i = 0; i < nr_sgl; i++) {
+			seg = (struct scsiif_request_segment *)(
+			      vaddr(pending_req, i) + ring_req->seg[i].offset);
+			n_segs = ring_req->seg[i].length /
+				 sizeof(struct scsiif_request_segment);
+			err = scsiback_gnttab_data_map_list(pending_req, seg,
+				pg, grant, n_segs, flags);
+			if (err)
+				return err;
+			pg += n_segs;
+			grant += n_segs;
+		}
+		/* Restart at the first descriptor for the walk below. */
+		end_seg = vaddr(pending_req, 0) + ring_req->seg[0].offset;
+		seg = (struct scsiif_request_segment *)end_seg;
+		end_seg += ring_req->seg[0].length;
+		pg = pending_req->pages + nr_sgl;
+	}
+
+	for_each_sg(pending_req->sgl, sg, nr_segments, i) {
+		sg_set_page(sg, pg[i], seg->length, seg->offset);
+		pending_req->data_len += seg->length;
+		seg++;
+		/*
+		 * End of the current descriptor area: continue in the next.
+		 * NOTE(review): after the very last segment i_seg equals
+		 * nr_sgl, so ring_req->seg[] and the page array are indexed
+		 * one entry past the list entries; the values are not used
+		 * afterwards -- confirm this read is harmless.
+		 */
+		if (nr_sgl && (unsigned long)seg >= end_seg) {
+			i_seg++;
+			end_seg = vaddr(pending_req, i_seg) +
+				  ring_req->seg[i_seg].offset;
+			seg = (struct scsiif_request_segment *)end_seg;
+			end_seg += ring_req->seg[i_seg].length;
+		}
+		/* Each data segment has to stay within a single page. */
+		if (sg->offset >= PAGE_SIZE ||
+		    sg->length > PAGE_SIZE ||
+		    sg->offset + sg->length > PAGE_SIZE)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Tear down the connection to the frontend: wait until nr_unreplied_reqs has
+ * dropped to zero, then unbind the interrupt handler and unmap the ring.
+ * info->irq is reset to 0, which the callers in this file test to decide
+ * whether a disconnect is needed.
+ */
+static void scsiback_disconnect(struct vscsibk_info *info)
+{
+	wait_event(info->waiting_to_free,
+		atomic_read(&info->nr_unreplied_reqs) == 0);
+
+	unbind_from_irqhandler(info->irq, info);
+	info->irq = 0;
+	xenbus_unmap_ring_vfree(info->dev, info->ring.sring);
+}
+
+/*
+ * Submit the task management function @act for @pending_req, wait for it to
+ * complete and report SUCCESS or FAILED to the frontend.
+ *
+ * If the submission itself fails, FAILED is reported right away and the
+ * command is not freed here.
+ */
+static void scsiback_device_action(struct vscsibk_pend *pending_req,
+	enum tcm_tmreq_table act, int tag)
+{
+	struct se_cmd *se_cmd = &pending_req->se_cmd;
+	struct se_session *sess = pending_req->v2p->tpg->tpg_nexus->tvn_se_sess;
+	u64 lun = pending_req->v2p->lun;
+	int status;
+
+	init_completion(&pending_req->tmr_done);
+
+	if (target_submit_tmr(se_cmd, sess, &pending_req->sense_buffer[0],
+			      lun, NULL, act, GFP_KERNEL,
+			      tag, TARGET_SCF_ACK_KREF)) {
+		scsiback_do_resp_with_sense(NULL, FAILED, 0, pending_req);
+		return;
+	}
+
+	/* Completed by scsiback_queue_tm_rsp(). */
+	wait_for_completion(&pending_req->tmr_done);
+
+	if (se_cmd->se_tmr_req->response == TMR_FUNCTION_COMPLETE)
+		status = SUCCESS;
+	else
+		status = FAILED;
+
+	scsiback_do_resp_with_sense(NULL, status, 0, pending_req);
+	transport_generic_free_cmd(se_cmd, 0);
+}
+
+/*
+ * Perform virtual to physical translation.
+ *
+ * Look up the entry whose virtual channel/target/LUN match @v while holding
+ * info->v2p_lock.  A reference is taken on the entry that is returned;
+ * NULL is returned when there is no match.
+ */
+static struct v2p_entry *scsiback_do_translation(struct vscsibk_info *info,
+			struct ids_tuple *v)
+{
+	struct v2p_entry *cur, *found = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&info->v2p_lock, flags);
+	list_for_each_entry(cur, &info->v2p_entry_lists, l) {
+		if (cur->v.chn != v->chn ||
+		    cur->v.tgt != v->tgt ||
+		    cur->v.lun != v->lun)
+			continue;
+
+		kref_get(&cur->kref);
+		found = cur;
+		break;
+	}
+	spin_unlock_irqrestore(&info->v2p_lock, flags);
+
+	return found;
+}
+
+/*
+ * Take a pending request from the command map of the session belonging to
+ * @v2p.
+ *
+ * A tag is obtained from the session's tag pool and selects the request in
+ * sess_cmd_map.  The request is zeroed, remembers its tag and cpu, and has
+ * all grant handles set to SCSIBACK_INVALID_HANDLE.  @ring is not used.
+ *
+ * Returns the request, or ERR_PTR(-ENOMEM) if no tag is available.
+ */
+static struct vscsibk_pend *scsiback_get_pend_req(struct vscsiif_back_ring *ring,
+				struct v2p_entry *v2p)
+{
+	struct se_session *se_sess = v2p->tpg->tpg_nexus->tvn_se_sess;
+	struct vscsibk_pend *pool, *req;
+	int tag, cpu, idx;
+
+	tag = sbitmap_queue_get(&se_sess->sess_tag_pool, &cpu);
+	if (tag < 0) {
+		pr_err("Unable to obtain tag for vscsiif_request\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	pool = se_sess->sess_cmd_map;
+	req = pool + tag;
+
+	memset(req, 0, sizeof(*req));
+	req->se_cmd.map_tag = tag;
+	req->se_cmd.map_cpu = cpu;
+
+	for (idx = 0; idx < VSCSI_MAX_GRANTS; idx++)
+		req->grant_handles[idx] = SCSIBACK_INVALID_HANDLE;
+
+	return req;
+}
+
+/*
+ * Validate a ring request and set up a pending request for it.
+ *
+ * The data direction and the command length supplied by the frontend are
+ * checked, the virtual device is translated and a pending request is
+ * fetched and filled from @ring_req.  The pending request keeps the
+ * reference on the translation entry taken by scsiback_do_translation().
+ *
+ * Returns the pending request or ERR_PTR(): -EINVAL for a bad direction or
+ * command length, -ENODEV for an unknown virtual device, -ENOMEM if no
+ * pending request could be obtained.
+ */
+static struct vscsibk_pend *prepare_pending_reqs(struct vscsibk_info *info,
+				struct vscsiif_back_ring *ring,
+				struct vscsiif_request *ring_req)
+{
+	struct vscsibk_pend *pending_req;
+	struct v2p_entry *v2p;
+	struct ids_tuple vir;
+
+	/* request range check from frontend */
+	switch (ring_req->sc_data_direction) {
+	case DMA_BIDIRECTIONAL:
+	case DMA_TO_DEVICE:
+	case DMA_FROM_DEVICE:
+	case DMA_NONE:
+		break;
+	default:
+		pr_debug("invalid parameter data_dir = %d\n",
+			ring_req->sc_data_direction);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (ring_req->cmd_len > VSCSIIF_MAX_COMMAND_SIZE) {
+		pr_debug("invalid parameter cmd_len = %d\n",
+			ring_req->cmd_len);
+		return ERR_PTR(-EINVAL);
+	}
+
+	vir.chn = ring_req->channel;
+	vir.tgt = ring_req->id;
+	vir.lun = ring_req->lun;
+
+	v2p = scsiback_do_translation(info, &vir);
+	if (!v2p) {
+		pr_debug("the v2p of (chn:%d, tgt:%d, lun:%d) doesn't exist.\n",
+			 vir.chn, vir.tgt, vir.lun);
+		return ERR_PTR(-ENODEV);
+	}
+
+	pending_req = scsiback_get_pend_req(ring, v2p);
+	if (IS_ERR(pending_req)) {
+		/* Give back the reference taken by the translation. */
+		kref_put(&v2p->kref, scsiback_free_translation_entry);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	pending_req->info = info;
+	pending_req->v2p = v2p;
+	pending_req->rqid = ring_req->rqid;
+	pending_req->sc_data_direction = ring_req->sc_data_direction;
+	pending_req->cmd_len = ring_req->cmd_len;
+	memcpy(pending_req->cmnd, ring_req->cmnd, pending_req->cmd_len);
+
+	return pending_req;
+}
+
+/*
+ * Consume the requests currently on the ring of @info.
+ *
+ * Every request is copied off the ring, turned into a pending request by
+ * prepare_pending_reqs() and dispatched on its ->act field.  The
+ * XEN_EOI_FLAG_SPURIOUS bit is cleared in *@eoi_flags as soon as the loop
+ * body is entered.
+ *
+ * Returns -EINVAL if the frontend's producer index overflows the ring, 1
+ * right after answering a request that could not be prepared, and otherwise
+ * the more_to_do value set by RING_FINAL_CHECK_FOR_REQUESTS().
+ */
+static int scsiback_do_cmd_fn(struct vscsibk_info *info,
+			      unsigned int *eoi_flags)
+{
+	struct vscsiif_back_ring *ring = &info->ring;
+	struct vscsiif_request ring_req;
+	struct vscsibk_pend *pending_req;
+	RING_IDX rc, rp;
+	int more_to_do;
+	uint32_t result;
+
+	rc = ring->req_cons;
+	rp = ring->sring->req_prod;
+	rmb();	/* guest system is accessing ring, too */
+
+	if (RING_REQUEST_PROD_OVERFLOW(ring, rp)) {
+		/* rc is only reused for the diagnostic below. */
+		rc = ring->rsp_prod_pvt;
+		pr_warn("Dom%d provided bogus ring requests (%#x - %#x = %u). Halting ring processing\n",
+			   info->domid, rp, rc, rp - rc);
+		return -EINVAL;
+	}
+
+	while ((rc != rp)) {
+		*eoi_flags &= ~XEN_EOI_FLAG_SPURIOUS;
+
+		if (RING_REQUEST_CONS_OVERFLOW(ring, rc))
+			break;
+
+		/* Work on a private copy of the request. */
+		RING_COPY_REQUEST(ring, rc, &ring_req);
+		ring->req_cons = ++rc;
+
+		pending_req = prepare_pending_reqs(info, ring, &ring_req);
+		if (IS_ERR(pending_req)) {
+			/* No pending request exists: answer directly. */
+			switch (PTR_ERR(pending_req)) {
+			case -ENODEV:
+				result = DID_NO_CONNECT;
+				break;
+			default:
+				result = DRIVER_ERROR;
+				break;
+			}
+			scsiback_send_response(info, NULL, result << 24, 0,
+					       ring_req.rqid);
+			/* A positive value makes scsiback_irq_fn() call again. */
+			return 1;
+		}
+
+		switch (ring_req.act) {
+		case VSCSIIF_ACT_SCSI_CDB:
+			if (scsiback_gnttab_data_map(&ring_req, pending_req)) {
+				/* Mapping failed: flush, report and free. */
+				scsiback_fast_flush_area(pending_req);
+				scsiback_do_resp_with_sense(NULL,
+						DRIVER_ERROR << 24, 0, pending_req);
+				transport_generic_free_cmd(&pending_req->se_cmd, 0);
+			} else {
+				scsiback_cmd_exec(pending_req);
+			}
+			break;
+		case VSCSIIF_ACT_SCSI_ABORT:
+			scsiback_device_action(pending_req, TMR_ABORT_TASK,
+				ring_req.ref_rqid);
+			break;
+		case VSCSIIF_ACT_SCSI_RESET:
+			scsiback_device_action(pending_req, TMR_LUN_RESET, 0);
+			break;
+		default:
+			pr_err_ratelimited("invalid request\n");
+			scsiback_do_resp_with_sense(NULL, DRIVER_ERROR << 24, 0,
+						    pending_req);
+			transport_generic_free_cmd(&pending_req->se_cmd, 0);
+			break;
+		}
+
+		/* Yield point for this unbounded loop. */
+		cond_resched();
+	}
+
+	RING_FINAL_CHECK_FOR_REQUESTS(&info->ring, more_to_do);
+	return more_to_do;
+}
+
+/*
+ * Threaded interrupt handler of the backend: keep calling
+ * scsiback_do_cmd_fn() for as long as it returns a positive value, yielding
+ * between the calls.
+ */
+static irqreturn_t scsiback_irq_fn(int irq, void *dev_id)
+{
+	struct vscsibk_info *info = dev_id;
+	unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;
+	int rc;
+
+	for (;;) {
+		rc = scsiback_do_cmd_fn(info, &eoi_flags);
+		if (rc <= 0)
+			break;
+		cond_resched();
+	}
+
+	/* In case of a ring error we keep the event channel masked. */
+	if (rc == 0)
+		xen_irq_lateeoi(irq, eoi_flags);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Map the shared ring referenced by @ring_ref, initialise the back ring and
+ * bind event channel @evtchn to scsiback_irq_fn() as a threaded handler.
+ *
+ * Returns 0 on success, -1 if info->irq is already set, or the error of the
+ * failing step; on such a failure the steps already done are undone.
+ */
+static int scsiback_init_sring(struct vscsibk_info *info, grant_ref_t ring_ref,
+			evtchn_port_t evtchn)
+{
+	void *area;
+	struct vscsiif_sring *sring;
+	int err;
+
+	/* A non-zero irq means a ring is already set up. */
+	if (info->irq)
+		return -1;
+
+	err = xenbus_map_ring_valloc(info->dev, &ring_ref, 1, &area);
+	if (err)
+		return err;
+
+	sring = (struct vscsiif_sring *)area;
+	BACK_RING_INIT(&info->ring, sring, PAGE_SIZE);
+
+	/* A non-negative return value is the irq number. */
+	err = bind_interdomain_evtchn_to_irq_lateeoi(info->domid, evtchn);
+	if (err < 0)
+		goto unmap_page;
+
+	info->irq = err;
+
+	err = request_threaded_irq(info->irq, NULL, scsiback_irq_fn,
+				   IRQF_ONESHOT, "vscsiif-backend", info);
+	if (err)
+		goto free_irq;
+
+	return 0;
+
+free_irq:
+	/*
+	 * NOTE(review): reached when request_threaded_irq() failed, i.e. no
+	 * handler is installed -- confirm unbind_from_irqhandler() is the
+	 * right way to undo the binding in that case.
+	 */
+	unbind_from_irqhandler(info->irq, info);
+	info->irq = 0;
+unmap_page:
+	xenbus_unmap_ring_vfree(info->dev, area);
+
+	return err;
+}
+
+/*
+ * Read "ring-ref" and "event-channel" from the frontend's xenstore node and
+ * set up the shared ring with them.
+ *
+ * Returns 0 on success.  A failure to read the values is reported with
+ * xenbus_dev_fatal() and returned; otherwise the result of
+ * scsiback_init_sring() is returned.
+ */
+static int scsiback_map(struct vscsibk_info *info)
+{
+	struct xenbus_device *dev = info->dev;
+	unsigned int ring_ref, evtchn;
+	int err;
+
+	err = xenbus_gather(XBT_NIL, dev->otherend,
+			"ring-ref", "%u", &ring_ref,
+			"event-channel", "%u", &evtchn, NULL);
+	if (!err)
+		return scsiback_init_sring(info, ring_ref, evtchn);
+
+	xenbus_dev_fatal(dev, err, "reading %s ring", dev->otherend);
+	return err;
+}
+
+/*
+ * Check for a translation entry being present.
+ *
+ * Returns the entry of @info whose virtual channel/target/LUN equal @v, or
+ * NULL.  No lock and no reference are taken here; the callers in this file
+ * hold info->v2p_lock around the call.
+ */
+static struct v2p_entry *scsiback_chk_translation_entry(
+	struct vscsibk_info *info, struct ids_tuple *v)
+{
+	struct v2p_entry *entry;
+
+	list_for_each_entry(entry, &info->v2p_entry_lists, l) {
+		if (entry->v.chn != v->chn)
+			continue;
+		if (entry->v.tgt != v->tgt)
+			continue;
+		if (entry->v.lun != v->lun)
+			continue;
+
+		return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * Add a new translation entry.
+ *
+ * @phy has the form "<name>:<lun>" and is split in place at its last ':'.
+ * <name> is compared with the tport name and the alias of every registered
+ * tpg; the first tpg that matches is searched for a LUN numbered <lun> and
+ * must have a nexus.  While the entry exists it is counted in the tpg's
+ * tv_tpg_fe_count.
+ *
+ * Returns 0 on success, -EINVAL if @phy contains no ':', the kstrtoull()
+ * error for a malformed LUN, -ENODEV if no usable tpg/LUN was found,
+ * -ENOMEM on allocation failure and -EEXIST if the virtual ID @v already
+ * has an entry.
+ */
+static int scsiback_add_translation_entry(struct vscsibk_info *info,
+					  char *phy, struct ids_tuple *v)
+{
+	int err = 0;
+	struct v2p_entry *new;
+	unsigned long flags;
+	char *lunp;
+	unsigned long long unpacked_lun;
+	struct se_lun *se_lun;
+	struct scsiback_tpg *tpg_entry, *tpg = NULL;
+	char *error = "doesn't exist";
+
+	lunp = strrchr(phy, ':');
+	if (!lunp) {
+		pr_err("illegal format of physical device %s\n", phy);
+		return -EINVAL;
+	}
+	/* Terminate the name part; lunp then points at the LUN digits. */
+	*lunp = 0;
+	lunp++;
+	err = kstrtoull(lunp, 10, &unpacked_lun);
+	if (err < 0) {
+		pr_err("lun number not valid: %s\n", lunp);
+		return err;
+	}
+
+	mutex_lock(&scsiback_mutex);
+	list_for_each_entry(tpg_entry, &scsiback_list, tv_tpg_list) {
+		if (!strcmp(phy, tpg_entry->tport->tport_name) ||
+		    !strcmp(phy, tpg_entry->param_alias)) {
+			mutex_lock(&tpg_entry->se_tpg.tpg_lun_mutex);
+			hlist_for_each_entry(se_lun, &tpg_entry->se_tpg.tpg_lun_hlist, link) {
+				if (se_lun->unpacked_lun == unpacked_lun) {
+					if (!tpg_entry->tpg_nexus)
+						error = "nexus undefined";
+					else
+						tpg = tpg_entry;
+					break;
+				}
+			}
+			mutex_unlock(&tpg_entry->se_tpg.tpg_lun_mutex);
+			/* Only the first tpg matching the name is considered. */
+			break;
+		}
+	}
+	if (tpg) {
+		/* Count the frontend user while scsiback_mutex is still held. */
+		mutex_lock(&tpg->tv_tpg_mutex);
+		tpg->tv_tpg_fe_count++;
+		mutex_unlock(&tpg->tv_tpg_mutex);
+	}
+	mutex_unlock(&scsiback_mutex);
+
+	if (!tpg) {
+		pr_err("%s:%llu %s\n", phy, unpacked_lun, error);
+		return -ENODEV;
+	}
+
+	new = kmalloc(sizeof(struct v2p_entry), GFP_KERNEL);
+	if (new == NULL) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	spin_lock_irqsave(&info->v2p_lock, flags);
+
+	/* Check double assignment to identical virtual ID */
+	if (scsiback_chk_translation_entry(info, v)) {
+		pr_warn("Virtual ID is already used. Assignment was not performed.\n");
+		err = -EEXIST;
+		goto out;
+	}
+
+	/* Create a new translation entry and add to the list */
+	kref_init(&new->kref);
+	new->v = *v;
+	new->tpg = tpg;
+	new->lun = unpacked_lun;
+	list_add_tail(&new->l, &info->v2p_entry_lists);
+
+out:
+	spin_unlock_irqrestore(&info->v2p_lock, flags);
+
+out_free:
+	if (err) {
+		/* Undo the frontend count; kfree() also copes with new == NULL. */
+		mutex_lock(&tpg->tv_tpg_mutex);
+		tpg->tv_tpg_fe_count--;
+		mutex_unlock(&tpg->tv_tpg_mutex);
+		kfree(new);
+	}
+
+	return err;
+}
+
+/*
+ * Unlink @entry from its list and drop one reference on it.  The callers in
+ * this file hold v2p_lock around the call.
+ */
+static void __scsiback_del_translation_entry(struct v2p_entry *entry)
+{
+	list_del(&entry->l);
+	kref_put(&entry->kref, scsiback_free_translation_entry);
+}
+
+/*
+ * Delete the translation entry specified.
+ *
+ * Returns 0 if an entry for the virtual ID @v was found and removed,
+ * -ENOENT otherwise.
+ */
+static int scsiback_del_translation_entry(struct vscsibk_info *info,
+					  struct ids_tuple *v)
+{
+	struct v2p_entry *entry;
+	unsigned long flags;
+	int ret = -ENOENT;
+
+	spin_lock_irqsave(&info->v2p_lock, flags);
+
+	/* Find out the translation entry specified */
+	entry = scsiback_chk_translation_entry(info, v);
+	if (entry) {
+		__scsiback_del_translation_entry(entry);
+		ret = 0;
+	}
+
+	spin_unlock_irqrestore(&info->v2p_lock, flags);
+
+	return ret;
+}
+
+/*
+ * Add a translation entry for @vir -> @phy and publish the result in the
+ * xenstore node @state.
+ *
+ * With @try set nothing is done if an entry for @vir already exists, and a
+ * failure to add the entry is not reported.  Without @try a failure to add
+ * the entry writes XenbusStateClosed to @state.  On success
+ * XenbusStateInitialised is written; if that write fails the new entry is
+ * removed again.
+ */
+static void scsiback_do_add_lun(struct vscsibk_info *info, const char *state,
+				char *phy, struct ids_tuple *vir, int try)
+{
+	struct v2p_entry *entry;
+	unsigned long flags;
+	int err;
+
+	if (try) {
+		spin_lock_irqsave(&info->v2p_lock, flags);
+		entry = scsiback_chk_translation_entry(info, vir);
+		spin_unlock_irqrestore(&info->v2p_lock, flags);
+		if (entry)
+			return;
+	}
+
+	if (scsiback_add_translation_entry(info, phy, vir)) {
+		if (try)
+			return;
+
+		err = xenbus_printf(XBT_NIL, info->dev->nodename, state,
+			      "%d", XenbusStateClosed);
+		if (err)
+			xenbus_dev_error(info->dev, err,
+				"%s: writing %s", __func__, state);
+		return;
+	}
+
+	if (!xenbus_printf(XBT_NIL, info->dev->nodename, state,
+			   "%d", XenbusStateInitialised))
+		return;
+
+	pr_err("xenbus_printf error %s\n", state);
+	scsiback_del_translation_entry(info, vir);
+}
+
+/*
+ * Remove the translation entry for @vir and, if one was removed, write
+ * XenbusStateClosed to the xenstore node @state.
+ */
+static void scsiback_do_del_lun(struct vscsibk_info *info, const char *state,
+				struct ids_tuple *vir)
+{
+	if (scsiback_del_translation_entry(info, vir))
+		return;
+
+	if (xenbus_printf(XBT_NIL, info->dev->nodename, state,
+			  "%d", XenbusStateClosed))
+		pr_err("xenbus_printf error %s\n", state);
+}
+
+/* Values of the "op" argument of the LUN hotplug functions below. */
+#define VSCSIBACK_OP_ADD_OR_DEL_LUN	1
+#define VSCSIBACK_OP_UPDATEDEV_STATE	2
+
+/*
+ * Process one "vscsi-devs/<ent>" entry below the backend's xenstore node.
+ *
+ * The entry's "state", "p-dev" (physical device) and "v-dev" (virtual
+ * host:channel:target:lun) values are read.  If "state" cannot be read the
+ * entry is skipped; if "p-dev" or "v-dev" cannot be read or parsed,
+ * XenbusStateClosed is written to "state".
+ *
+ * For VSCSIBACK_OP_ADD_OR_DEL_LUN the translation entry is added or removed
+ * depending on the state read.  For VSCSIBACK_OP_UPDATEDEV_STATE an entry in
+ * state XenbusStateInitialised is moved to XenbusStateConnected.
+ */
+static void scsiback_do_1lun_hotplug(struct vscsibk_info *info, int op,
+				     char *ent)
+{
+	int err;
+	struct ids_tuple vir;
+	char *val;
+	int device_state;
+	char phy[VSCSI_NAMELEN];
+	char str[64];
+	char state[64];
+	struct xenbus_device *dev = info->dev;
+
+	/* read status */
+	snprintf(state, sizeof(state), "vscsi-devs/%s/state", ent);
+	err = xenbus_scanf(XBT_NIL, dev->nodename, state, "%u", &device_state);
+	/* Anything but one converted value leaves device_state unset. */
+	if (err != 1)
+		return;
+
+	/* physical SCSI device */
+	snprintf(str, sizeof(str), "vscsi-devs/%s/p-dev", ent);
+	val = xenbus_read(XBT_NIL, dev->nodename, str, NULL);
+	if (IS_ERR(val)) {
+		err = xenbus_printf(XBT_NIL, dev->nodename, state,
+			      "%d", XenbusStateClosed);
+		if (err)
+			xenbus_dev_error(info->dev, err,
+				"%s: writing %s", __func__, state);
+		return;
+	}
+	strlcpy(phy, val, VSCSI_NAMELEN);
+	kfree(val);
+
+	/* virtual SCSI device */
+	snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", ent);
+	err = xenbus_scanf(XBT_NIL, dev->nodename, str, "%u:%u:%u:%u",
+			   &vir.hst, &vir.chn, &vir.tgt, &vir.lun);
+	/* All four fields are needed; a partial parse leaves vir unset. */
+	if (err != 4) {
+		err = xenbus_printf(XBT_NIL, dev->nodename, state,
+			      "%d", XenbusStateClosed);
+		if (err)
+			xenbus_dev_error(info->dev, err,
+				"%s: writing %s", __func__, state);
+		return;
+	}
+
+	switch (op) {
+	case VSCSIBACK_OP_ADD_OR_DEL_LUN:
+		switch (device_state) {
+		case XenbusStateInitialising:
+			scsiback_do_add_lun(info, state, phy, &vir, 0);
+			break;
+		case XenbusStateConnected:
+			scsiback_do_add_lun(info, state, phy, &vir, 1);
+			break;
+		case XenbusStateClosing:
+			scsiback_do_del_lun(info, state, &vir);
+			break;
+		default:
+			break;
+		}
+		break;
+
+	case VSCSIBACK_OP_UPDATEDEV_STATE:
+		if (device_state == XenbusStateInitialised) {
+			/* modify vscsi-devs/dev-x/state */
+			if (xenbus_printf(XBT_NIL, dev->nodename, state,
+					  "%d", XenbusStateConnected)) {
+				/* Name the node whose write failed. */
+				pr_err("xenbus_printf error %s\n", state);
+				scsiback_del_translation_entry(info, &vir);
+				/* Best effort; the result is not checked. */
+				xenbus_printf(XBT_NIL, dev->nodename, state,
+					      "%d", XenbusStateClosed);
+			}
+		}
+		break;
+	/* When it is necessary, processing is added here. */
+	default:
+		break;
+	}
+}
+
+/*
+ * Run scsiback_do_1lun_hotplug() with @op for every entry listed below
+ * "vscsi-devs" in the backend's xenstore node.  Nothing is done if the
+ * directory cannot be read.
+ */
+static void scsiback_do_lun_hotplug(struct vscsibk_info *info, int op)
+{
+	unsigned int idx, ndir = 0;
+	char **dir;
+
+	dir = xenbus_directory(XBT_NIL, info->dev->nodename, "vscsi-devs",
+			       &ndir);
+	if (IS_ERR(dir))
+		return;
+
+	for (idx = 0; idx < ndir; idx++)
+		scsiback_do_1lun_hotplug(info, op, dir[idx]);
+
+	kfree(dir);
+}
+
+/*
+ * React to a state change of the frontend.
+ *
+ * Initialised maps the ring and adds the configured LUNs, Connected and
+ * Reconfiguring update the LUN state, Closing disconnects the ring, and
+ * Closed/Unknown unregister the device (Closed only if the device is no
+ * longer online).  Any other state is reported as a fatal error.
+ */
+static void scsiback_frontend_changed(struct xenbus_device *dev,
+					enum xenbus_state frontend_state)
+{
+	struct vscsibk_info *info = dev_get_drvdata(&dev->dev);
+
+	switch (frontend_state) {
+	case XenbusStateInitialising:
+		break;
+
+	case XenbusStateInitialised:
+		if (!scsiback_map(info)) {
+			scsiback_do_lun_hotplug(info,
+						VSCSIBACK_OP_ADD_OR_DEL_LUN);
+			xenbus_switch_state(dev, XenbusStateConnected);
+		}
+		break;
+
+	case XenbusStateConnected:
+		scsiback_do_lun_hotplug(info, VSCSIBACK_OP_UPDATEDEV_STATE);
+
+		if (dev->state != XenbusStateConnected)
+			xenbus_switch_state(dev, XenbusStateConnected);
+		break;
+
+	case XenbusStateReconfiguring:
+		scsiback_do_lun_hotplug(info, VSCSIBACK_OP_ADD_OR_DEL_LUN);
+		xenbus_switch_state(dev, XenbusStateReconfigured);
+		break;
+
+	case XenbusStateClosing:
+		if (info->irq)
+			scsiback_disconnect(info);
+
+		xenbus_switch_state(dev, XenbusStateClosing);
+		break;
+
+	case XenbusStateClosed:
+		xenbus_switch_state(dev, XenbusStateClosed);
+		if (xenbus_dev_is_online(dev))
+			break;
+		/* fall through if not online */
+	case XenbusStateUnknown:
+		device_unregister(&dev->dev);
+		break;
+
+	default:
+		xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+					frontend_state);
+		break;
+	}
+}
+
+/*
+ * Release all translation entries of @info: every entry on the list is
+ * unlinked and has one reference dropped, under info->v2p_lock.
+ */
+static void scsiback_release_translation_entry(struct vscsibk_info *info)
+{
+	struct v2p_entry *entry, *tmp;
+	struct list_head *head = &(info->v2p_entry_lists);
+	unsigned long flags;
+
+	spin_lock_irqsave(&info->v2p_lock, flags);
+
+	/* The _safe variant is needed because entries are unlinked. */
+	list_for_each_entry_safe(entry, tmp, head, l)
+		__scsiback_del_translation_entry(entry);
+
+	spin_unlock_irqrestore(&info->v2p_lock, flags);
+}
+
+/*
+ * Remove callback: disconnect the ring if one is connected, release all
+ * translation entries and clear the device's driver data.  Always returns 0.
+ *
+ * NOTE(review): the vscsibk_info allocated in scsiback_probe() is not freed
+ * here -- confirm whether it is released elsewhere.
+ */
+static int scsiback_remove(struct xenbus_device *dev)
+{
+	struct vscsibk_info *info = dev_get_drvdata(&dev->dev);
+
+	if (info->irq)
+		scsiback_disconnect(info);
+
+	scsiback_release_translation_entry(info);
+
+	dev_set_drvdata(&dev->dev, NULL);
+
+	return 0;
+}
+
+/*
+ * Probe callback: allocate and initialise the per-device backend state,
+ * advertise "feature-sg-grant" in xenstore and switch the backend to
+ * XenbusStateInitWait.
+ *
+ * A failure to write "feature-sg-grant" is reported but does not fail the
+ * probe.  Returns 0 on success, -ENOMEM if the state cannot be allocated,
+ * or the error of the state switch.
+ */
+static int scsiback_probe(struct xenbus_device *dev,
+			   const struct xenbus_device_id *id)
+{
+	int err;
+
+	struct vscsibk_info *info = kzalloc(sizeof(struct vscsibk_info),
+					    GFP_KERNEL);
+
+	pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id);
+
+	if (!info) {
+		xenbus_dev_fatal(dev, -ENOMEM, "allocating backend structure");
+		return -ENOMEM;
+	}
+	info->dev = dev;
+	dev_set_drvdata(&dev->dev, info);
+
+	info->domid = dev->otherend_id;
+	spin_lock_init(&info->ring_lock);
+	atomic_set(&info->nr_unreplied_reqs, 0);
+	init_waitqueue_head(&info->waiting_to_free);
+	info->irq = 0;
+	INIT_LIST_HEAD(&info->v2p_entry_lists);
+	spin_lock_init(&info->v2p_lock);
+
+	err = xenbus_printf(XBT_NIL, dev->nodename, "feature-sg-grant", "%u",
+			    SG_ALL);
+	if (err)
+		xenbus_dev_error(dev, err, "writing feature-sg-grant");
+
+	err = xenbus_switch_state(dev, XenbusStateInitWait);
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	pr_warn("%s failed\n", __func__);
+	scsiback_remove(dev);
+	/*
+	 * scsiback_remove() only detaches info from the device.  Nothing
+	 * else refers to it at this point, so free it to avoid a leak.
+	 */
+	kfree(info);
+
+	return err;
+}
+
+/*
+ * Return a printable name ("SAS", "FCP", "iSCSI" or "Unknown") for the
+ * protocol identifier of @tport.
+ */
+static char *scsiback_dump_proto_id(struct scsiback_tport *tport)
+{
+	switch (tport->tport_proto_id) {
+	case SCSI_PROTOCOL_SAS:
+		return "SAS";
+	case SCSI_PROTOCOL_FCP:
+		return "FCP";
+	case SCSI_PROTOCOL_ISCSI:
+		return "iSCSI";
+	default:
+		return "Unknown";
+	}
+}
+
+/* Return the name of the tport that the portal group @se_tpg belongs to. */
+static char *scsiback_get_fabric_wwn(struct se_portal_group *se_tpg)
+{
+	struct scsiback_tpg *tpg;
+
+	tpg = container_of(se_tpg, struct scsiback_tpg, se_tpg);
+
+	return tpg->tport->tport_name;
+}
+
+/* Return the tport_tpgt value of the portal group @se_tpg. */
+static u16 scsiback_get_tag(struct se_portal_group *se_tpg)
+{
+	return container_of(se_tpg, struct scsiback_tpg, se_tpg)->tport_tpgt;
+}
+
+/*
+ * Create a target port for the configfs directory @name.
+ *
+ * The protocol identifier is derived from the "naa.", "fc." or "iqn." marker
+ * found in @name; for "fc." the first three characters of @name are left out
+ * of the stored port name.
+ *
+ * Returns the se_wwn embedded in the new port, ERR_PTR(-ENOMEM) on
+ * allocation failure, or ERR_PTR(-EINVAL) if @name carries no known marker
+ * or is VSCSI_NAMELEN characters or longer.
+ */
+static struct se_wwn *
+scsiback_make_tport(struct target_fabric_configfs *tf,
+		     struct config_group *group,
+		     const char *name)
+{
+	struct scsiback_tport *tport;
+	char *ptr;
+	u64 wwpn = 0;
+	int off = 0;
+
+	tport = kzalloc(sizeof(struct scsiback_tport), GFP_KERNEL);
+	if (!tport)
+		return ERR_PTR(-ENOMEM);
+
+	tport->tport_wwpn = wwpn;
+	/*
+	 * Determine the emulated Protocol Identifier and Target Port Name
+	 * based on the incoming configfs directory name.
+	 */
+	ptr = strstr(name, "naa.");
+	if (ptr) {
+		tport->tport_proto_id = SCSI_PROTOCOL_SAS;
+		goto check_len;
+	}
+	ptr = strstr(name, "fc.");
+	if (ptr) {
+		tport->tport_proto_id = SCSI_PROTOCOL_FCP;
+		off = 3; /* Skip over "fc." */
+		goto check_len;
+	}
+	ptr = strstr(name, "iqn.");
+	if (ptr) {
+		tport->tport_proto_id = SCSI_PROTOCOL_ISCSI;
+		goto check_len;
+	}
+
+	pr_err("Unable to locate prefix for emulated Target Port: %s\n", name);
+	kfree(tport);
+	return ERR_PTR(-EINVAL);
+
+check_len:
+	if (strlen(name) >= VSCSI_NAMELEN) {
+		/* Protocol name first, then the address, as in the format. */
+		pr_err("Emulated %s Address: %s, exceeds max: %d\n",
+			scsiback_dump_proto_id(tport), name, VSCSI_NAMELEN);
+		kfree(tport);
+		return ERR_PTR(-EINVAL);
+	}
+	snprintf(&tport->tport_name[0], VSCSI_NAMELEN, "%s", &name[off]);
+
+	pr_debug("Allocated emulated Target %s Address: %s\n",
+		 scsiback_dump_proto_id(tport), name);
+
+	return &tport->tport_wwn;
+}
+
+/* Free the target port that embeds @wwn. */
+static void scsiback_drop_tport(struct se_wwn *wwn)
+{
+	struct scsiback_tport *tport = container_of(wwn,
+				struct scsiback_tport, tport_wwn);
+
+	pr_debug("Deallocating emulated Target %s Address: %s\n",
+		 scsiback_dump_proto_id(tport), tport->tport_name);
+
+	kfree(tport);
+}
+
+/* Always returns 1; @se_tpg is not examined. */
+static u32 scsiback_tpg_get_inst_index(struct se_portal_group *se_tpg)
+{
+	return 1;
+}
+
+/* Hand @se_cmd to transport_generic_free_cmd() and return its result. */
+static int scsiback_check_stop_free(struct se_cmd *se_cmd)
+{
+	return transport_generic_free_cmd(se_cmd, 0);
+}
+
+/* Give the tag of @se_cmd back to its session via target_free_tag(). */
+static void scsiback_release_cmd(struct se_cmd *se_cmd)
+{
+	target_free_tag(se_cmd->se_sess, se_cmd);
+}
+
+/* Always returns 0; @se_sess is not examined. */
+static u32 scsiback_sess_get_index(struct se_session *se_sess)
+{
+	return 0;
+}
+
+/* Execute @se_cmd right away; always returns 0. */
+static int scsiback_write_pending(struct se_cmd *se_cmd)
+{
+	/* Go ahead and process the write immediately */
+	target_execute_cmd(se_cmd);
+
+	return 0;
+}
+
+/* Always returns 0; @se_cmd is not examined. */
+static int scsiback_write_pending_status(struct se_cmd *se_cmd)
+{
+	return 0;
+}
+
+/* Intentionally empty: no attributes of @nacl are changed. */
+static void scsiback_set_default_node_attrs(struct se_node_acl *nacl)
+{
+}
+
+/* Always returns 0; @se_cmd is not examined. */
+static int scsiback_get_cmd_state(struct se_cmd *se_cmd)
+{
+	return 0;
+}
+
+/*
+ * Complete the pending request embedding @se_cmd with result SAM_STAT_GOOD.
+ * Always returns 0.
+ */
+static int scsiback_queue_data_in(struct se_cmd *se_cmd)
+{
+	struct vscsibk_pend *pending_req = container_of(se_cmd,
+				struct vscsibk_pend, se_cmd);
+
+	pending_req->result = SAM_STAT_GOOD;
+	scsiback_cmd_done(pending_req);
+	return 0;
+}
+
+/*
+ * Complete the pending request embedding @se_cmd with its SCSI status.
+ *
+ * If the command has a sense buffer and one of the task sense flags is set,
+ * the result is a check condition tagged with DRIVER_SENSE; otherwise it is
+ * se_cmd->scsi_status.  Always returns 0.
+ */
+static int scsiback_queue_status(struct se_cmd *se_cmd)
+{
+	struct vscsibk_pend *pending_req = container_of(se_cmd,
+				struct vscsibk_pend, se_cmd);
+	bool has_sense;
+
+	has_sense = se_cmd->sense_buffer &&
+		    (se_cmd->se_cmd_flags & (SCF_TRANSPORT_TASK_SENSE |
+					     SCF_EMULATED_TASK_SENSE));
+
+	if (has_sense)
+		pending_req->result = (DRIVER_SENSE << 24) |
+				      SAM_STAT_CHECK_CONDITION;
+	else
+		pending_req->result = se_cmd->scsi_status;
+
+	scsiback_cmd_done(pending_req);
+	return 0;
+}
+
+/*
+ * Signal tmr_done of the pending request embedding @se_cmd, which
+ * scsiback_device_action() waits for.
+ */
+static void scsiback_queue_tm_rsp(struct se_cmd *se_cmd)
+{
+	struct vscsibk_pend *pending_req = container_of(se_cmd,
+				struct vscsibk_pend, se_cmd);
+
+	complete(&pending_req->tmr_done);
+}
+
+/* Intentionally empty: nothing is done for an aborted @se_cmd here. */
+static void scsiback_aborted_task(struct se_cmd *se_cmd)
+{
+}
+
+/*
+ * configfs show handler for the "alias" parameter: print the tpg's alias
+ * followed by a newline into @page, under tv_tpg_mutex.
+ */
+static ssize_t scsiback_tpg_param_alias_show(struct config_item *item,
+					     char *page)
+{
+	struct scsiback_tpg *tpg = container_of(param_to_tpg(item),
+						struct scsiback_tpg, se_tpg);
+	ssize_t len;
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+	len = snprintf(page, PAGE_SIZE, "%s\n", tpg->param_alias);
+	mutex_unlock(&tpg->tv_tpg_mutex);
+
+	return len;
+}
+
+/*
+ * configfs store handler for the "alias" parameter: copy @page into the
+ * tpg's alias under tv_tpg_mutex, dropping one trailing newline.
+ *
+ * Returns @count on success or -EINVAL if @page is VSCSI_NAMELEN characters
+ * or longer.
+ */
+static ssize_t scsiback_tpg_param_alias_store(struct config_item *item,
+					      const char *page, size_t count)
+{
+	struct se_portal_group *se_tpg = param_to_tpg(item);
+	struct scsiback_tpg *tpg = container_of(se_tpg, struct scsiback_tpg,
+						se_tpg);
+	int len;
+
+	if (strlen(page) >= VSCSI_NAMELEN) {
+		pr_err("param alias: %s, exceeds max: %d\n", page,
+			VSCSI_NAMELEN);
+		return -EINVAL;
+	}
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+	len = snprintf(tpg->param_alias, VSCSI_NAMELEN, "%s", page);
+	/* An empty string has no last character to inspect. */
+	if (len > 0 && tpg->param_alias[len - 1] == '\n')
+		tpg->param_alias[len - 1] = '\0';
+	mutex_unlock(&tpg->tv_tpg_mutex);
+
+	return count;
+}
+
+/* "alias" attribute, served by the alias show/store handlers above. */
+CONFIGFS_ATTR(scsiback_tpg_param_, alias);
+
+/* NULL-terminated list of the tpg parameter attributes. */
+static struct configfs_attribute *scsiback_param_attrs[] = {
+	&scsiback_tpg_param_attr_alias,
+	NULL,
+};
+
+/*
+ * Callback passed to target_setup_session() by scsiback_make_nexus():
+ * record @p (the nexus passed there) as the nexus of the tpg embedding
+ * @se_tpg.  Always returns 0.
+ */
+static int scsiback_alloc_sess_cb(struct se_portal_group *se_tpg,
+				  struct se_session *se_sess, void *p)
+{
+	struct scsiback_tpg *tpg = container_of(se_tpg,
+				struct scsiback_tpg, se_tpg);
+
+	tpg->tpg_nexus = p;
+	return 0;
+}
+
+/*
+ * Create the I_T nexus of @tpg for the initiator @name.
+ *
+ * A nexus is allocated and a session with VSCSI_DEFAULT_SESSION_TAGS tags
+ * of sizeof(struct vscsibk_pend) is set up for it; the nexus is attached to
+ * the tpg by scsiback_alloc_sess_cb().  Runs under tv_tpg_mutex.
+ *
+ * Returns 0 on success, -EEXIST if the tpg already has a nexus, and -ENOMEM
+ * if the allocation or the session setup fails.
+ */
+static int scsiback_make_nexus(struct scsiback_tpg *tpg,
+				const char *name)
+{
+	struct scsiback_nexus *nexus;
+	struct se_session *sess;
+	int ret = 0;
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+
+	if (tpg->tpg_nexus) {
+		pr_debug("tpg->tpg_nexus already exists\n");
+		ret = -EEXIST;
+		goto out_unlock;
+	}
+
+	nexus = kzalloc(sizeof(struct scsiback_nexus), GFP_KERNEL);
+	if (!nexus) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	sess = target_setup_session(&tpg->se_tpg,
+				    VSCSI_DEFAULT_SESSION_TAGS,
+				    sizeof(struct vscsibk_pend),
+				    TARGET_PROT_NORMAL, name,
+				    nexus, scsiback_alloc_sess_cb);
+	if (IS_ERR(sess)) {
+		kfree(nexus);
+		ret = -ENOMEM;
+	} else {
+		nexus->tvn_se_sess = sess;
+	}
+
+out_unlock:
+	mutex_unlock(&tpg->tv_tpg_mutex);
+	return ret;
+}
+
+/*
+ * Remove the I_T nexus of @tpg.
+ *
+ * Returns 0 on success, -ENODEV if the tpg has no nexus or the nexus has no
+ * session, and -EBUSY while the tpg's port count or frontend count is not
+ * zero.
+ */
+static int scsiback_drop_nexus(struct scsiback_tpg *tpg)
+{
+	struct scsiback_nexus *nexus;
+	struct se_session *sess;
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+
+	nexus = tpg->tpg_nexus;
+	if (!nexus || !nexus->tvn_se_sess) {
+		mutex_unlock(&tpg->tv_tpg_mutex);
+		return -ENODEV;
+	}
+	sess = nexus->tvn_se_sess;
+
+	if (tpg->tv_tpg_port_count != 0) {
+		mutex_unlock(&tpg->tv_tpg_mutex);
+		pr_err("Unable to remove xen-pvscsi I_T Nexus with active TPG port count: %d\n",
+			tpg->tv_tpg_port_count);
+		return -EBUSY;
+	}
+
+	if (tpg->tv_tpg_fe_count != 0) {
+		mutex_unlock(&tpg->tv_tpg_mutex);
+		pr_err("Unable to remove xen-pvscsi I_T Nexus with active TPG frontend count: %d\n",
+			tpg->tv_tpg_fe_count);
+		return -EBUSY;
+	}
+
+	pr_debug("Removing I_T Nexus to emulated %s Initiator Port: %s\n",
+		scsiback_dump_proto_id(tpg->tport),
+		nexus->tvn_se_sess->se_node_acl->initiatorname);
+
+	/*
+	 * Release the SCSI I_T Nexus to the emulated xen-pvscsi Target Port
+	 */
+	target_remove_session(sess);
+	tpg->tpg_nexus = NULL;
+	mutex_unlock(&tpg->tv_tpg_mutex);
+
+	kfree(nexus);
+	return 0;
+}
+
+/*
+ * configfs show handler for "nexus": print the initiator name of the tpg's
+ * nexus followed by a newline, or return -ENODEV if there is no nexus.
+ */
+static ssize_t scsiback_tpg_nexus_show(struct config_item *item, char *page)
+{
+	struct scsiback_tpg *tpg = container_of(to_tpg(item),
+				struct scsiback_tpg, se_tpg);
+	struct scsiback_nexus *nexus;
+	ssize_t ret = -ENODEV;
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+	nexus = tpg->tpg_nexus;
+	if (nexus)
+		ret = snprintf(page, PAGE_SIZE, "%s\n",
+				nexus->tvn_se_sess->se_node_acl->initiatorname);
+	mutex_unlock(&tpg->tv_tpg_mutex);
+
+	return ret;
+}
+
+/*
+ * configfs store handler for "nexus".
+ *
+ * Input starting with "NULL" drops the tpg's nexus.  Any other input is
+ * taken as an initiator port name: it must contain "naa.", "fc." or "iqn."
+ * and that marker must agree with the protocol of the tpg's tport.  One
+ * trailing newline is removed and the nexus is created for the name (for
+ * "fc." without its first three characters).
+ *
+ * Returns @count on success, -EINVAL for a name that is too long, has no
+ * known marker or does not fit the tport's protocol, or the error of
+ * scsiback_drop_nexus() / scsiback_make_nexus().
+ */
+static ssize_t scsiback_tpg_nexus_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct se_portal_group *se_tpg = to_tpg(item);
+	struct scsiback_tpg *tpg = container_of(se_tpg,
+				struct scsiback_tpg, se_tpg);
+	struct scsiback_tport *tport_wwn = tpg->tport;
+	unsigned char i_port[VSCSI_NAMELEN], *ptr, *port_ptr;
+	int ret;
+	/*
+	 * Shutdown the active I_T nexus if 'NULL' is passed.
+	 */
+	if (!strncmp(page, "NULL", 4)) {
+		ret = scsiback_drop_nexus(tpg);
+		return (!ret) ? count : ret;
+	}
+	/*
+	 * Otherwise make sure the passed virtual Initiator port WWN matches
+	 * the fabric protocol_id set in scsiback_make_tport(), and call
+	 * scsiback_make_nexus().
+	 */
+	if (strlen(page) >= VSCSI_NAMELEN) {
+		pr_err("Emulated NAA Sas Address: %s, exceeds max: %d\n",
+			page, VSCSI_NAMELEN);
+		return -EINVAL;
+	}
+	/* Work on a private copy so the newline can be stripped. */
+	snprintf(&i_port[0], VSCSI_NAMELEN, "%s", page);
+
+	/* NOTE(review): strstr() also matches a marker that is not a prefix. */
+	ptr = strstr(i_port, "naa.");
+	if (ptr) {
+		if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_SAS) {
+			pr_err("Passed SAS Initiator Port %s does not match target port protoid: %s\n",
+				i_port, scsiback_dump_proto_id(tport_wwn));
+			return -EINVAL;
+		}
+		port_ptr = &i_port[0];
+		goto check_newline;
+	}
+	ptr = strstr(i_port, "fc.");
+	if (ptr) {
+		if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_FCP) {
+			pr_err("Passed FCP Initiator Port %s does not match target port protoid: %s\n",
+				i_port, scsiback_dump_proto_id(tport_wwn));
+			return -EINVAL;
+		}
+		port_ptr = &i_port[3]; /* Skip over "fc." */
+		goto check_newline;
+	}
+	ptr = strstr(i_port, "iqn.");
+	if (ptr) {
+		if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_ISCSI) {
+			pr_err("Passed iSCSI Initiator Port %s does not match target port protoid: %s\n",
+				i_port, scsiback_dump_proto_id(tport_wwn));
+			return -EINVAL;
+		}
+		port_ptr = &i_port[0];
+		goto check_newline;
+	}
+	pr_err("Unable to locate prefix for emulated Initiator Port: %s\n",
+		i_port);
+	return -EINVAL;
+	/*
+	 * Clear any trailing newline for the NAA WWN
+	 */
+check_newline:
+	/* i_port is not empty here: one of the markers was found in it. */
+	if (i_port[strlen(i_port) - 1] == '\n')
+		i_port[strlen(i_port) - 1] = '\0';
+
+	ret = scsiback_make_nexus(tpg, port_ptr);
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+CONFIGFS_ATTR(scsiback_tpg_, nexus);	/* "nexus": show + store handlers */
+
+static struct configfs_attribute *scsiback_tpg_attrs[] = {
+	&scsiback_tpg_attr_nexus,
+	NULL,	/* end of list */
+};
+
+/* configfs "version" read: fabric version plus kernel sysname/machine/release. */
+static ssize_t scsiback_wwn_version_show(struct config_item *item, char *page)
+{
+	return sprintf(page,
+		"xen-pvscsi fabric module %s on %s/%s on " UTS_RELEASE "\n",
+		VSCSI_VERSION, utsname()->sysname, utsname()->machine);
+}
+
+CONFIGFS_ATTR_RO(scsiback_wwn_, version);	/* "version": show handler only */
+
+static struct configfs_attribute *scsiback_wwn_attrs[] = {
+	&scsiback_wwn_attr_version,
+	NULL,	/* end of list */
+};
+
+static char *scsiback_get_fabric_name(void)
+{
+	return "xen-pvscsi";	/* same string as scsiback_ops.name */
+}
+
+/* fabric_post_link hook: count one more port link on this TPG; returns 0. */
+static int scsiback_port_link(struct se_portal_group *se_tpg,
+			       struct se_lun *lun)
+{
+	struct scsiback_tpg *xen_tpg;
+
+	xen_tpg = container_of(se_tpg, struct scsiback_tpg, se_tpg);
+	mutex_lock(&xen_tpg->tv_tpg_mutex);
+	xen_tpg->tv_tpg_port_count += 1;
+	mutex_unlock(&xen_tpg->tv_tpg_mutex);
+	return 0;
+}
+
+static void
+scsiback_port_unlink(struct se_portal_group *se_tpg, struct se_lun *lun)
+{
+	struct scsiback_tpg *xen_tpg =
+		container_of(se_tpg, struct scsiback_tpg, se_tpg);
+
+	mutex_lock(&xen_tpg->tv_tpg_mutex);
+	xen_tpg->tv_tpg_port_count -= 1;	/* fabric_pre_unlink: one link fewer */
+	mutex_unlock(&xen_tpg->tv_tpg_mutex);
+}
+
+/* fabric_make_tpg hook: build and register the TPG named "tpgt_<tag>". */
+static struct se_portal_group *
+scsiback_make_tpg(struct se_wwn *wwn, const char *name)
+{
+	struct scsiback_tport *tport;
+	struct scsiback_tpg *new_tpg;
+	u16 tag;
+	int err;
+
+	/* The name must start with "tpgt_"; the rest is parsed as a base-10 u16. */
+	if (strncmp(name, "tpgt_", 5) != 0)
+		return ERR_PTR(-EINVAL);
+	err = kstrtou16(name + 5, 10, &tag);
+	if (err)
+		return ERR_PTR(err);
+
+	new_tpg = kzalloc(sizeof(*new_tpg), GFP_KERNEL);
+	if (!new_tpg)
+		return ERR_PTR(-ENOMEM);
+
+	tport = container_of(wwn, struct scsiback_tport, tport_wwn);
+	new_tpg->tport = tport;
+	new_tpg->tport_tpgt = tag;
+	mutex_init(&new_tpg->tv_tpg_mutex);
+	INIT_LIST_HEAD(&new_tpg->tv_tpg_list);
+	INIT_LIST_HEAD(&new_tpg->info_list);
+
+	err = core_tpg_register(wwn, &new_tpg->se_tpg, tport->tport_proto_id);
+	if (err < 0) {
+		kfree(new_tpg);
+		return NULL;
+	}
+	mutex_lock(&scsiback_mutex);
+	list_add_tail(&new_tpg->tv_tpg_list, &scsiback_list);
+	mutex_unlock(&scsiback_mutex);
+	return &new_tpg->se_tpg;
+}
+
+static void scsiback_drop_tpg(struct se_portal_group *se_tpg)
+{
+	struct scsiback_tpg *tpg = container_of(se_tpg,
+				struct scsiback_tpg, se_tpg);
+
+	mutex_lock(&scsiback_mutex);
+	list_del(&tpg->tv_tpg_list);	/* leave scsiback_list before teardown */
+	mutex_unlock(&scsiback_mutex);
+	/*
+	 * Release the virtual I_T Nexus for this TPG; the result is ignored.
+	 */
+	scsiback_drop_nexus(tpg);
+	/*
+	 * Deregister the se_tpg from TCM, then free its containing scsiback_tpg.
+	 */
+	core_tpg_deregister(se_tpg);
+	kfree(tpg);
+}
+
+static int scsiback_check_true(struct se_portal_group *se_tpg)
+{
+	return 1;	/* used for tpg_check_demo_mode and tpg_check_demo_mode_cache */
+}
+
+static int scsiback_check_false(struct se_portal_group *se_tpg)
+{
+	return 0;	/* used for both *_mode_write_protect callbacks */
+}
+
+static const struct target_core_fabric_ops scsiback_ops = {
+	.module				= THIS_MODULE,
+	.name				= "xen-pvscsi",
+	.get_fabric_name		= scsiback_get_fabric_name,
+	.tpg_get_wwn			= scsiback_get_fabric_wwn,
+	.tpg_get_tag			= scsiback_get_tag,
+	.tpg_check_demo_mode		= scsiback_check_true,
+	.tpg_check_demo_mode_cache	= scsiback_check_true,
+	.tpg_check_demo_mode_write_protect = scsiback_check_false,
+	.tpg_check_prod_mode_write_protect = scsiback_check_false,
+	.tpg_get_inst_index		= scsiback_tpg_get_inst_index,
+	.check_stop_free		= scsiback_check_stop_free,
+	.release_cmd			= scsiback_release_cmd,
+	.sess_get_index			= scsiback_sess_get_index,
+	.sess_get_initiator_sid		= NULL,
+	.write_pending			= scsiback_write_pending,
+	.write_pending_status		= scsiback_write_pending_status,
+	.set_default_node_attributes	= scsiback_set_default_node_attrs,
+	.get_cmd_state			= scsiback_get_cmd_state,
+	.queue_data_in			= scsiback_queue_data_in,
+	.queue_status			= scsiback_queue_status,
+	.queue_tm_rsp			= scsiback_queue_tm_rsp,
+	.aborted_task			= scsiback_aborted_task,
+	/*
+	 * configfs callbacks used by target_core_fabric_configfs.c
+	 */
+	.fabric_make_wwn		= scsiback_make_tport,
+	.fabric_drop_wwn		= scsiback_drop_tport,
+	.fabric_make_tpg		= scsiback_make_tpg,
+	.fabric_drop_tpg		= scsiback_drop_tpg,
+	.fabric_post_link		= scsiback_port_link,
+	.fabric_pre_unlink		= scsiback_port_unlink,
+
+	.tfc_wwn_attrs			= scsiback_wwn_attrs,
+	.tfc_tpg_base_attrs		= scsiback_tpg_attrs,
+	.tfc_tpg_param_attrs		= scsiback_param_attrs,
+};
+
+static const struct xenbus_device_id scsiback_ids[] = {
+	{ "vscsi" },	/* same type as MODULE_ALIAS("xen-backend:vscsi") */
+	{ "" }		/* empty entry ends the table */
+};
+
+static struct xenbus_driver scsiback_driver = {
+	.ids			= scsiback_ids,
+	.probe			= scsiback_probe,
+	.remove			= scsiback_remove,
+	.otherend_changed	= scsiback_frontend_changed
+};
+
+/*
+ * Module init: Xen domains only.  Registers the xenbus backend driver first
+ * and the target fabric template second; if the template fails to register
+ * the xenbus driver is unregistered again.  Any failure after the domain
+ * check is logged with pr_err() and its error code is returned.
+ */
+static int __init scsiback_init(void)
+{
+	int err;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	pr_debug("xen-pvscsi: fabric module %s on %s/%s on "UTS_RELEASE"\n",
+		 VSCSI_VERSION, utsname()->sysname, utsname()->machine);
+
+	err = xenbus_register_backend(&scsiback_driver);
+	if (!err) {
+		err = target_register_template(&scsiback_ops);
+		if (!err)
+			return 0;
+		xenbus_unregister_driver(&scsiback_driver);
+	}
+	pr_err("%s: error %d\n", __func__, err);
+	return err;
+}
+
+static void __exit scsiback_exit(void)
+{
+	struct page *page;
+
+	while (free_pages_num) {	/* loop until the counter reaches zero */
+		if (get_free_page(&page))
+			BUG();	/* failed although free_pages_num was non-zero */
+		gnttab_free_pages(1, &page);	/* released one page at a time */
+	}
+	target_unregister_template(&scsiback_ops);
+	xenbus_unregister_driver(&scsiback_driver);
+}
+
+module_init(scsiback_init);
+module_exit(scsiback_exit);
+
+MODULE_DESCRIPTION("Xen SCSI backend driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("xen-backend:vscsi");
+MODULE_AUTHOR("Juergen Gross <jgross@suse.com>");
diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c
new file mode 100644
index 000000000..55988b841
--- /dev/null
+++ b/drivers/xen/xen-selfballoon.c
@@ -0,0 +1,579 @@
+// SPDX-License-Identifier: GPL-2.0
+/******************************************************************************
+ * Xen selfballoon driver (and optional frontswap self-shrinking driver)
+ *
+ * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
+ *
+ * This code complements the cleancache and frontswap patchsets to optimize
+ * support for Xen Transcendent Memory ("tmem").  The policy it implements
+ * is rudimentary and will likely improve over time, but it does work well
+ * enough today.
+ *
+ * Two functionalities are implemented here which both use "control theory"
+ * (feedback) to optimize memory utilization. In a virtualized environment
+ * such as Xen, RAM is often a scarce resource and we would like to ensure
+ * that each of a possibly large number of virtual machines is using RAM
+ * efficiently, i.e. using as little as possible when under light load
+ * and obtaining as much as possible when memory demands are high.
+ * Since RAM needs vary highly dynamically and sometimes dramatically,
+ * "hysteresis" is used, that is, memory target is determined not just
+ * on current data but also on past data stored in the system.
+ *
+ * "Selfballooning" creates memory pressure by managing the Xen balloon
+ * driver to decrease and increase available kernel memory, driven
+ * largely by the target value of "Committed_AS" (see /proc/meminfo).
+ * Since Committed_AS does not account for clean mapped pages (i.e. pages
+ * in RAM that are identical to pages on disk), selfballooning has the
+ * effect of pushing less frequently used clean pagecache pages out of
+ * kernel RAM and, presumably using cleancache, into Xen tmem where
+ * Xen can more efficiently optimize RAM utilization for such pages.
+ *
+ * When kernel memory demand unexpectedly increases faster than Xen, via
+ * the selfballoon driver, is able to (or chooses to) provide usable RAM,
+ * the kernel may invoke swapping.  In most cases, frontswap is able
+ * to absorb this swapping into Xen tmem.  However, due to the fact
+ * that the kernel swap subsystem assumes swapping occurs to a disk,
+ * swapped pages may sit on the disk for a very long time; even if
+ * the kernel knows the page will never be used again.  This is because
+ * the disk space costs very little and can be overwritten when
+ * necessary.  When such stale pages are in frontswap, however, they
+ * are taking up valuable real estate.  "Frontswap selfshrinking" works
+ * to resolve this:  When frontswap activity is otherwise stable
+ * and the guest kernel is not under memory pressure, the "frontswap
+ * selfshrinking" accounts for this by providing pressure to remove some
+ * pages from frontswap and return them to kernel memory.
+ *
+ * For both "selfballooning" and "frontswap-selfshrinking", a worker
+ * thread is used and sysfs tunables are provided to adjust the frequency
+ * and rate of adjustments to achieve the goal, as well as to disable one
+ * or both functions independently.
+ *
+ * While some argue that this functionality can and should be implemented
+ * in userspace, it has been observed that bad things happen (e.g. OOMs).
+ *
+ * System configuration note: Selfballooning should not be enabled on
+ * systems without a sufficiently large swap device configured; for best
+ * results, it is recommended that total swap be increased by the size
+ * of the guest memory. Note that selfballooning should be disabled by default
+ * if frontswap is not configured.  Similarly selfballooning should be enabled
+ * by default if frontswap is configured and can be disabled with the
+ * "tmem.selfballooning=0" kernel boot option.  Finally, when frontswap is
+ * configured, frontswap-selfshrinking can be disabled with the
+ * "tmem.selfshrink=0" kernel boot option.
+ *
+ * Selfballooning is disallowed in domain0 and force-disabled.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/bootmem.h>
+#include <linux/swap.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/workqueue.h>
+#include <linux/device.h>
+#include <xen/balloon.h>
+#include <xen/tmem.h>
+#include <xen/xen.h>
+
+/* Enable/disable with sysfs. */
+static int xen_selfballooning_enabled __read_mostly;
+
+/*
+ * Controls rate at which memory target (this iteration) approaches
+ * ultimate goal when memory need is increasing (up-hysteresis) or
+ * decreasing (down-hysteresis). Higher values of hysteresis cause
+ * slower increases/decreases. The default values for the various
+ * parameters were deemed reasonable by experimentation, may be
+ * workload-dependent, and can all be adjusted via sysfs.
+ */
+static unsigned int selfballoon_downhysteresis __read_mostly = 8;
+static unsigned int selfballoon_uphysteresis __read_mostly = 1;
+
+/* In seconds (scaled by HZ when queued); interval between worker runs. */
+static unsigned int selfballoon_interval __read_mostly = 5;
+
+/*
+ * Minimum usable RAM in MB for selfballooning target for balloon.
+ * If non-zero, it is added to totalreserve_pages and self-ballooning
+ * will not balloon below the sum.  If zero, a piecewise linear function
+ * is calculated as a minimum and added to totalreserve_pages.  Note that
+ * setting this value indiscriminately may cause OOMs and crashes.
+ */
+static unsigned int selfballoon_min_usable_mb;
+
+/*
+ * Amount of RAM in MB to add to the target number of pages.
+ * Can be used to reserve some more room for caches and the like.
+ */
+static unsigned int selfballoon_reserved_mb;
+
+static void selfballoon_process(struct work_struct *work);
+static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process);
+
+#ifdef CONFIG_FRONTSWAP
+#include <linux/frontswap.h>
+
+/* Enable/disable with sysfs. */
+static bool frontswap_selfshrinking __read_mostly;
+
+/*
+ * The default values for the following parameters were deemed reasonable
+ * by experimentation, may be workload-dependent, and can all be
+ * adjusted via sysfs.
+ */
+
+/* Control rate for frontswap shrinking. Higher hysteresis is slower. */
+static unsigned int frontswap_hysteresis __read_mostly = 20;
+
+/*
+ * Number of selfballoon worker invocations to wait before observing that
+ * frontswap selfshrinking should commence. Note that selfshrinking does
+ * not use a separate worker thread.
+ */
+static unsigned int frontswap_inertia __read_mostly = 3;
+
+/* Countdown to next invocation of frontswap_shrink() */
+static unsigned long frontswap_inertia_counter;
+
+/*
+ * Invoked by the selfballoon worker thread, uses current number of pages
+ * in frontswap (frontswap_curr_pages()), previous status, and control
+ * values (hysteresis and inertia) to determine if frontswap should be
+ * shrunk and what the new frontswap size should be.  Note that
+ * frontswap_shrink is essentially a partial swapoff that immediately
+ * transfers pages from the "swap device" (frontswap) back into kernel
+ * RAM; despite the name, frontswap "shrinking" is very different from
+ * the "shrinker" interface used by the kernel MM subsystem to reclaim
+ * memory.
+ */
+static void frontswap_selfshrink(void)
+{
+	static unsigned long cur_frontswap_pages;
+	unsigned long prev_pages = cur_frontswap_pages;
+	unsigned long target;
+
+	cur_frontswap_pages = frontswap_curr_pages();
+	/* Empty or growing frontswap: restart the inertia countdown. */
+	if (cur_frontswap_pages == 0 || cur_frontswap_pages > prev_pages) {
+		frontswap_inertia_counter = frontswap_inertia;
+		return;
+	}
+	/* Still counting down: shrink only once the counter reaches zero. */
+	if (frontswap_inertia_counter && --frontswap_inertia_counter)
+		return;
+
+	target = 0;
+	if (cur_frontswap_pages > frontswap_hysteresis)
+		target = cur_frontswap_pages -
+			 cur_frontswap_pages / frontswap_hysteresis;
+	frontswap_shrink(target);
+	frontswap_inertia_counter = frontswap_inertia;
+}
+
+#endif /* CONFIG_FRONTSWAP */
+
+#define MB2PAGES(mb)	((mb) << (20 - PAGE_SHIFT))	/* MiB -> pages */
+#define PAGES2MB(pages) ((pages) >> (20 - PAGE_SHIFT))	/* pages -> MiB */
+
+/*
+ * Worker: derive a goal from committed memory plus reserves, step the current
+ * size toward it by the up/down hysteresis, clamp to a floor, set the target.
+ */
+static void selfballoon_process(struct work_struct *work)
+{
+	unsigned long cur_pages, goal_pages, tgt_pages, floor_pages;
+	unsigned long useful_pages;
+	bool reset_timer = false;
+
+	if (xen_selfballooning_enabled) {
+		cur_pages = totalram_pages;
+		tgt_pages = cur_pages; /* default is no change */
+		goal_pages = vm_memory_committed() +
+				totalreserve_pages +
+				MB2PAGES(selfballoon_reserved_mb);
+#ifdef CONFIG_FRONTSWAP
+		/* allow space for frontswap pages to be repatriated */
+		if (frontswap_selfshrinking)
+			goal_pages += frontswap_curr_pages();
+#endif
+		if (cur_pages > goal_pages)	/* shrink by 1/downhysteresis of the gap */
+			tgt_pages = cur_pages -
+				((cur_pages - goal_pages) /
+				  selfballoon_downhysteresis);
+		else if (cur_pages < goal_pages)	/* grow by 1/uphysteresis of the gap */
+			tgt_pages = cur_pages +
+				((goal_pages - cur_pages) /
+				  selfballoon_uphysteresis);
+		/* else if cur_pages == goal_pages, no change */
+		useful_pages = max_pfn - totalreserve_pages;
+		if (selfballoon_min_usable_mb != 0)	/* explicit floor from sysfs */
+			floor_pages = totalreserve_pages +
+					MB2PAGES(selfballoon_min_usable_mb);
+		/* piecewise linear function ending in ~3% slope */
+		else if (useful_pages < MB2PAGES(16))
+			floor_pages = max_pfn; /* not worth ballooning */
+		else if (useful_pages < MB2PAGES(64))
+			floor_pages = totalreserve_pages + MB2PAGES(16) +
+					((useful_pages - MB2PAGES(16)) >> 1);
+		else if (useful_pages < MB2PAGES(512))
+			floor_pages = totalreserve_pages + MB2PAGES(40) +
+					((useful_pages - MB2PAGES(40)) >> 3);
+		else /* useful_pages >= MB2PAGES(512) */
+			floor_pages = totalreserve_pages + MB2PAGES(99) +
+					((useful_pages - MB2PAGES(99)) >> 5);
+		if (tgt_pages < floor_pages)	/* never target below the floor */
+			tgt_pages = floor_pages;
+		balloon_set_new_target(tgt_pages +
+			balloon_stats.current_pages - totalram_pages);	/* offset by (current_pages - totalram_pages) */
+		reset_timer = true;	/* keep the worker running */
+	}
+#ifdef CONFIG_FRONTSWAP
+	if (frontswap_selfshrinking) {
+		frontswap_selfshrink();
+		reset_timer = true;
+	}
+#endif
+	if (reset_timer)	/* re-queue only while a feature is enabled */
+		schedule_delayed_work(&selfballoon_worker,
+			selfballoon_interval * HZ);
+}
+
+#ifdef CONFIG_SYSFS
+
+#include <linux/capability.h>
+/* Generates show_<name>(): sprintf()s the given args into the sysfs buffer. */
+#define SELFBALLOON_SHOW(name, format, args...)				\
+	static ssize_t show_##name(struct device *dev,	\
+					  struct device_attribute *attr, \
+					  char *buf) \
+	{ \
+		return sprintf(buf, format, ##args); \
+	}
+
+SELFBALLOON_SHOW(selfballooning, "%d\n", xen_selfballooning_enabled);
+
+/*
+ * sysfs store for "selfballooning": accepts only 0 or 1 (CAP_SYS_ADMIN
+ * required) and queues the worker on a disabled -> enabled transition.
+ */
+static ssize_t store_selfballooning(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	bool prev = xen_selfballooning_enabled;
+	unsigned long val;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	err = kstrtoul(buf, 10, &val);
+	if (err)
+		return err;
+	if (val > 1)
+		return -EINVAL;
+
+	xen_selfballooning_enabled = val;
+	if (xen_selfballooning_enabled && !prev)
+		schedule_delayed_work(&selfballoon_worker,
+			selfballoon_interval * HZ);
+	return count;
+}
+
+static DEVICE_ATTR(selfballooning, S_IRUGO | S_IWUSR,
+		   show_selfballooning, store_selfballooning);
+
+SELFBALLOON_SHOW(selfballoon_interval, "%u\n", selfballoon_interval);
+
+/* sysfs store: worker interval in seconds; needs CAP_SYS_ADMIN, rejects 0. */
+static ssize_t store_selfballoon_interval(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	unsigned int val;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/* Parse at the variable's width: overflow is -ERANGE, not truncation. */
+	err = kstrtouint(buf, 10, &val);
+	if (err)
+		return err;
+	if (val == 0)
+		return -EINVAL;
+	selfballoon_interval = val;
+	return count;
+}
+
+static DEVICE_ATTR(selfballoon_interval, S_IRUGO | S_IWUSR,
+		   show_selfballoon_interval, store_selfballoon_interval);
+
+SELFBALLOON_SHOW(selfballoon_downhys, "%u\n", selfballoon_downhysteresis);
+
+/* sysfs store: down-hysteresis divisor; needs CAP_SYS_ADMIN, rejects 0. */
+static ssize_t store_selfballoon_downhys(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	unsigned int val;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/* A value truncated to 0 would divide by zero in selfballoon_process(). */
+	err = kstrtouint(buf, 10, &val);
+	if (err)
+		return err;
+	if (val == 0)
+		return -EINVAL;
+	selfballoon_downhysteresis = val;
+	return count;
+}
+
+static DEVICE_ATTR(selfballoon_downhysteresis, S_IRUGO | S_IWUSR,
+		   show_selfballoon_downhys, store_selfballoon_downhys);
+
+
+SELFBALLOON_SHOW(selfballoon_uphys, "%u\n", selfballoon_uphysteresis);
+
+/* sysfs store: up-hysteresis divisor; needs CAP_SYS_ADMIN, rejects 0. */
+static ssize_t store_selfballoon_uphys(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	unsigned int val;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/* A value truncated to 0 would divide by zero in selfballoon_process(). */
+	err = kstrtouint(buf, 10, &val);
+	if (err)
+		return err;
+	if (val == 0)
+		return -EINVAL;
+	selfballoon_uphysteresis = val;
+	return count;
+}
+
+static DEVICE_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR,
+		   show_selfballoon_uphys, store_selfballoon_uphys);
+
+SELFBALLOON_SHOW(selfballoon_min_usable_mb, "%u\n", selfballoon_min_usable_mb);
+
+/*
+ * sysfs store: min usable RAM in MB.  NOTE(review): 0 (auto floor) is refused.
+ */
+static ssize_t store_selfballoon_min_usable_mb(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	unsigned int val;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	err = kstrtouint(buf, 10, &val);	/* -ERANGE instead of truncation */
+	if (err)
+		return err;
+	if (val == 0)
+		return -EINVAL;
+	selfballoon_min_usable_mb = val;
+	return count;
+}
+
+static DEVICE_ATTR(selfballoon_min_usable_mb, S_IRUGO | S_IWUSR,
+		   show_selfballoon_min_usable_mb,
+		   store_selfballoon_min_usable_mb);
+
+SELFBALLOON_SHOW(selfballoon_reserved_mb, "%u\n", selfballoon_reserved_mb);
+
+/*
+ * sysfs store: extra MB added to the balloon goal (CAP_SYS_ADMIN, rejects 0).
+ */
+static ssize_t store_selfballoon_reserved_mb(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	unsigned int val;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	err = kstrtouint(buf, 10, &val);	/* -ERANGE instead of truncation */
+	if (err)
+		return err;
+	if (val == 0)
+		return -EINVAL;
+	selfballoon_reserved_mb = val;
+	return count;
+}
+
+static DEVICE_ATTR(selfballoon_reserved_mb, S_IRUGO | S_IWUSR,
+		   show_selfballoon_reserved_mb,
+		   store_selfballoon_reserved_mb);
+
+
+#ifdef CONFIG_FRONTSWAP
+SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking);
+
+/*
+ * sysfs store for "frontswap_selfshrinking" (0 or 1).  Queues the worker only
+ * when shrinking is switched on while selfballooning is disabled.
+ */
+static ssize_t store_frontswap_selfshrinking(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	bool prev = frontswap_selfshrinking;
+	unsigned long val;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	err = kstrtoul(buf, 10, &val);
+	if (err)
+		return err;
+	if (val > 1)
+		return -EINVAL;
+	frontswap_selfshrinking = val;
+	if (frontswap_selfshrinking && !prev && !xen_selfballooning_enabled)
+		schedule_delayed_work(&selfballoon_worker,
+			selfballoon_interval * HZ);
+	return count;
+}
+
+static DEVICE_ATTR(frontswap_selfshrinking, S_IRUGO | S_IWUSR,
+		   show_frontswap_selfshrinking, store_frontswap_selfshrinking);
+
+SELFBALLOON_SHOW(frontswap_inertia, "%u\n", frontswap_inertia);
+
+/* sysfs store: inertia (worker runs before shrinking); also resets countdown. */
+static ssize_t store_frontswap_inertia(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	unsigned int val;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/* Parse as unsigned int so setting and countdown get the same value. */
+	err = kstrtouint(buf, 10, &val);
+	if (err)
+		return err;
+	if (val == 0)
+		return -EINVAL;
+	frontswap_inertia = val;
+	frontswap_inertia_counter = val;
+	return count;
+}
+
+static DEVICE_ATTR(frontswap_inertia, S_IRUGO | S_IWUSR,
+		   show_frontswap_inertia, store_frontswap_inertia);
+
+SELFBALLOON_SHOW(frontswap_hysteresis, "%u\n", frontswap_hysteresis);
+
+/* sysfs store: frontswap shrink divisor; needs CAP_SYS_ADMIN, rejects 0. */
+static ssize_t store_frontswap_hysteresis(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	unsigned int val;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/* A value truncated to 0 would divide by zero in frontswap_selfshrink(). */
+	err = kstrtouint(buf, 10, &val);
+	if (err)
+		return err;
+	if (val == 0)
+		return -EINVAL;
+	frontswap_hysteresis = val;
+	return count;
+}
+
+static DEVICE_ATTR(frontswap_hysteresis, S_IRUGO | S_IWUSR,
+		   show_frontswap_hysteresis, store_frontswap_hysteresis);
+
+#endif /* CONFIG_FRONTSWAP */
+
+static struct attribute *selfballoon_attrs[] = {
+	&dev_attr_selfballooning.attr,
+	&dev_attr_selfballoon_interval.attr,
+	&dev_attr_selfballoon_downhysteresis.attr,
+	&dev_attr_selfballoon_uphysteresis.attr,
+	&dev_attr_selfballoon_min_usable_mb.attr,
+	&dev_attr_selfballoon_reserved_mb.attr,
+#ifdef CONFIG_FRONTSWAP
+	&dev_attr_frontswap_selfshrinking.attr,
+	&dev_attr_frontswap_hysteresis.attr,
+	&dev_attr_frontswap_inertia.attr,
+#endif
+	NULL	/* end of list */
+};
+
+static const struct attribute_group selfballoon_group = {
+	.name = "selfballoon",	/* group passed to sysfs_create_group() */
+	.attrs = selfballoon_attrs
+};
+#endif
+
+/* Create the "selfballoon" sysfs group under @dev; -1 if built without sysfs. */
+int register_xen_selfballooning(struct device *dev)
+{
+#ifdef CONFIG_SYSFS
+	return sysfs_create_group(&dev->kobj, &selfballoon_group);
+#else
+	return -1;
+#endif
+}
+EXPORT_SYMBOL(register_xen_selfballooning);
+
+int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink)
+{
+	bool enable = false;
+	unsigned long reserve_pages;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	if (xen_initial_domain()) {
+		pr_info("Xen selfballooning driver disabled for domain0\n");
+		return -ENODEV;
+	}
+
+	xen_selfballooning_enabled = tmem_enabled && use_selfballooning;
+	if (xen_selfballooning_enabled) {
+		pr_info("Initializing Xen selfballooning driver\n");
+		enable = true;
+	}
+#ifdef CONFIG_FRONTSWAP
+	frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink;
+	if (frontswap_selfshrinking) {
+		pr_info("Initializing frontswap selfshrinking driver\n");
+		enable = true;
+	}
+#endif
+	if (!enable)
+		return -ENODEV;	/* neither feature ended up enabled */
+
+	/*
+	 * Give selfballoon_reserved_mb a default value (10% of total RAM pages)
+	 * to make selfballoon not so aggressive.
+	 *
+	 * There are mainly two reasons:
+	 * 1) The original goal_page didn't consider some pages used by kernel
+	 *    space, like slab pages and memory used by device drivers.
+	 *
+	 * 2) The balloon driver may not give back memory to guest OS fast
+	 *    enough when the workload suddenly acquires a lot of physical memory.
+	 *
+	 * In both cases, the guest OS will suffer from memory pressure and
+	 * OOM killer may be triggered.
+	 * By reserving an extra 10% of total RAM pages, we can keep the system
+	 * much more reliable and make it respond faster in some cases.
+	 */
+	if (!selfballoon_reserved_mb) {	/* only when no value has been set yet */
+		reserve_pages = totalram_pages / 10;
+		selfballoon_reserved_mb = PAGES2MB(reserve_pages);
+	}
+	schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ);
+
+	return 0;
+}
+EXPORT_SYMBOL(xen_selfballoon_init);
diff --git a/drivers/xen/xen-stub.c b/drivers/xen/xen-stub.c
new file mode 100644
index 000000000..bbef194c5
--- /dev/null
+++ b/drivers/xen/xen-stub.c
@@ -0,0 +1,100 @@
+/*
+ * xen-stub.c - stub drivers to reserve space for Xen
+ *
+ * Copyright (C) 2012 Intel Corporation
+ *    Author: Liu Jinsong <jinsong.liu@intel.com>
+ *    Author: Jiang Yunhong <yunhong.jiang@intel.com>
+ *
+ * Copyright (C) 2012 Oracle Inc
+ *    Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/acpi.h>
+#include <xen/acpi.h>
+
+#ifdef CONFIG_ACPI
+
+/*--------------------------------------------
+	stub driver for Xen memory hotplug
+--------------------------------------------*/
+
+static const struct acpi_device_id memory_device_ids[] = {
+	{ACPI_MEMORY_DEVICE_HID, 0},
+	{"", 0},	/* empty entry ends the table */
+};
+
+static struct acpi_driver xen_stub_memory_device_driver = {
+	/* same name as the native memory driver, to block it from loading */
+	.name = "acpi_memhotplug",
+	.class = ACPI_MEMORY_DEVICE_CLASS,
+	.ids = memory_device_ids,
+};
+
+int xen_stub_memory_device_init(void)
+{
+	if (!xen_initial_domain())
+		return -ENODEV;	/* the stub is only registered in the initial domain */
+
+	/* just reserve the slot for Xen and keep the native driver out */
+	return acpi_bus_register_driver(&xen_stub_memory_device_driver);
+}
+EXPORT_SYMBOL_GPL(xen_stub_memory_device_init);
+subsys_initcall(xen_stub_memory_device_init);
+
+void xen_stub_memory_device_exit(void)
+{
+	acpi_bus_unregister_driver(&xen_stub_memory_device_driver);	/* undo init */
+}
+EXPORT_SYMBOL_GPL(xen_stub_memory_device_exit);
+
+
+/*--------------------------------------------
+	stub driver for Xen cpu hotplug
+--------------------------------------------*/
+
+static const struct acpi_device_id processor_device_ids[] = {
+	{ACPI_PROCESSOR_OBJECT_HID, 0},
+	{ACPI_PROCESSOR_DEVICE_HID, 0},
+	{"", 0},	/* empty entry ends the table */
+};
+
+static struct acpi_driver xen_stub_processor_driver = {
+	/* same name as the native processor driver, to block it from loading */
+	.name = "processor",
+	.class = ACPI_PROCESSOR_CLASS,
+	.ids = processor_device_ids,
+};
+
+int xen_stub_processor_init(void)
+{
+	if (!xen_initial_domain())
+		return -ENODEV;	/* the stub is only registered in the initial domain */
+
+	/* just reserve the slot for Xen and keep the native driver out */
+	return acpi_bus_register_driver(&xen_stub_processor_driver);
+}
+EXPORT_SYMBOL_GPL(xen_stub_processor_init);
+subsys_initcall(xen_stub_processor_init);
+
+void xen_stub_processor_exit(void)
+{
+	acpi_bus_unregister_driver(&xen_stub_processor_driver);	/* undo init */
+}
+EXPORT_SYMBOL_GPL(xen_stub_processor_exit);
+
+#endif
diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile
new file mode 100644
index 000000000..0c7532110
--- /dev/null
+++ b/drivers/xen/xenbus/Makefile
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y	+= xenbus.o
+obj-y	+= xenbus_dev_frontend.o
+
+xenbus-objs =
+xenbus-objs += xenbus_client.o
+xenbus-objs += xenbus_comms.o
+xenbus-objs += xenbus_xs.o
+xenbus-objs += xenbus_probe.o
+
+xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
+xenbus-objs += $(xenbus-be-objs-y)
+
+obj-$(CONFIG_XEN_BACKEND) += xenbus_dev_backend.o
+obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o
diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h
new file mode 100644
index 000000000..88516a8a9
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus.h
@@ -0,0 +1,141 @@
+/*
+ * Private include for xenbus communications.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 XenSource Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _XENBUS_XENBUS_H
+#define _XENBUS_XENBUS_H
+
+#include <linux/mutex.h>
+#include <linux/uio.h>
+#include <xen/xenbus.h>
+
+#define XEN_BUS_ID_SIZE			20
+
+/*
+ * Description of one xenbus bus flavour: a root string, a level count, a set
+ * of per-bus callbacks and the embedded generic struct bus_type.
+ *
+ * NOTE(review): the exact meaning of @root and @levels (presumably the
+ * xenstore root directory and the directory depth of a device node) is not
+ * visible in this header -- confirm against the frontend/backend probe code.
+ */
+struct xen_bus_type {
+	char *root;
+	unsigned int levels;
+	/* Fill bus_id (at most XEN_BUS_ID_SIZE bytes) from a node name. */
+	int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename);
+	int (*probe)(struct xen_bus_type *bus, const char *type,
+		     const char *dir);
+	/* Watch hooks for the other end; same signatures as xenbus_watch's. */
+	bool (*otherend_will_handle)(struct xenbus_watch *watch,
+				     const char *path, const char *token);
+	void (*otherend_changed)(struct xenbus_watch *watch, const char *path,
+				 const char *token);
+	struct bus_type bus;
+};
+
+/*
+ * Kind of xenstore connection in use; this is the type of the global
+ * xen_store_domain_type declared below.
+ * NOTE(review): the precise meaning of each value is defined by the code
+ * that sets xen_store_domain_type, which is not part of this header.
+ */
+enum xenstore_init {
+	XS_UNKNOWN,
+	XS_PV,
+	XS_HVM,
+	XS_LOCAL,
+};
+
+/*
+ * One watch event, linkable on a list and handed to xs_watch_msg().
+ * @body is a flexible array member, so the structure is allocated together
+ * with its payload; presumably @path and @token point into @body and @len is
+ * the payload length -- confirm in the code that fills the event in.
+ */
+struct xs_watch_event {
+	struct list_head list;
+	unsigned int len;
+	struct xenbus_watch *handle;
+	const char *path;
+	const char *token;
+	char body[];
+};
+
+/* Processing state of a request; stored in struct xb_req_data's @state. */
+enum xb_req_state {
+	xb_req_state_queued,
+	xb_req_state_wait_reply,
+	xb_req_state_got_reply,
+	xb_req_state_aborted
+};
+
+/*
+ * Bookkeeping for a single xenstore request: list linkage, a wait queue, the
+ * message header, the payload (either @body or the @vec/@num_vecs kvec
+ * array), an error code, the current state and a completion callback.
+ *
+ * NOTE(review): which of the lists declared below (xs_reply_list,
+ * xb_write_list) a request sits on in each state, and what @par carries for
+ * user requests, is decided by code outside this header -- confirm there.
+ */
+struct xb_req_data {
+	struct list_head list;
+	wait_queue_head_t wq;
+	struct xsd_sockmsg msg;
+	uint32_t caller_req_id;
+	enum xsd_sockmsg_type type;
+	char *body;
+	const struct kvec *vec;
+	int num_vecs;
+	int err;
+	enum xb_req_state state;
+	bool user_req;
+	/* Callback taking the request itself as its only argument. */
+	void (*cb)(struct xb_req_data *);
+	void *par;
+};
+
+/* Shared state; only declared here, the definitions live in the .c files. */
+extern enum xenstore_init xen_store_domain_type;
+extern const struct attribute_group *xenbus_dev_groups[];
+extern struct mutex xs_response_mutex;
+extern struct list_head xs_reply_list;
+extern struct list_head xb_write_list;
+extern wait_queue_head_t xb_waitq;
+extern struct mutex xb_write_mutex;
+
+/* Initialisation/teardown and message-level entry points. */
+int xs_init(void);
+int xb_init_comms(void);
+void xb_deinit_comms(void);
+int xs_watch_msg(struct xs_watch_event *event);
+void xs_request_exit(struct xb_req_data *req);
+
+/* Driver-model helpers operating on a struct xen_bus_type or its devices. */
+int xenbus_match(struct device *_dev, struct device_driver *_drv);
+int xenbus_dev_probe(struct device *_dev);
+int xenbus_dev_remove(struct device *_dev);
+int xenbus_register_driver_common(struct xenbus_driver *drv,
+				  struct xen_bus_type *bus,
+				  struct module *owner,
+				  const char *mod_name);
+int xenbus_probe_node(struct xen_bus_type *bus,
+		      const char *type,
+		      const char *nodename);
+int xenbus_probe_devices(struct xen_bus_type *bus);
+
+void xenbus_dev_changed(const char *node, struct xen_bus_type *bus);
+
+void xenbus_dev_shutdown(struct device *_dev);
+
+/* Power-management callbacks. */
+int xenbus_dev_suspend(struct device *dev);
+int xenbus_dev_resume(struct device *dev);
+int xenbus_dev_cancel(struct device *dev);
+
+/* Helpers concerning the other end of a device. */
+void xenbus_otherend_changed(struct xenbus_watch *watch,
+			     const char *path, const char *token,
+			     int ignore_on_shutdown);
+
+int xenbus_read_otherend_details(struct xenbus_device *xendev,
+				 char *id_node, char *path_node);
+
+/* Selects the ring map/unmap implementation; see xenbus_client.c. */
+void xenbus_ring_ops_init(void);
+
+int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void *par);
+void xenbus_dev_queue_reply(struct xb_req_data *req);
+
+extern unsigned int xb_dev_generation_id;
+
+#endif
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
new file mode 100644
index 000000000..6dde323da
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -0,0 +1,956 @@
+/******************************************************************************
+ * Client-facing interface for the Xenbus driver.  In other words, the
+ * interface between the Xenbus and the device-specific code, be it the
+ * frontend or the backend of that driver.
+ *
+ * Copyright (C) 2005 XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+#include <xen/balloon.h>
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+#include <xen/xen.h>
+#include <xen/features.h>
+
+#include "xenbus.h"
+
+/* Number of pages needed to hold _grants grants, rounded up. */
+#define XENBUS_PAGES(_grants)	(DIV_ROUND_UP(_grants, XEN_PFN_PER_PAGE))
+
+/* Page count corresponding to a ring of XENBUS_MAX_RING_GRANTS grants. */
+#define XENBUS_MAX_RING_PAGES	(XENBUS_PAGES(XENBUS_MAX_RING_GRANTS))
+
+/*
+ * Bookkeeping for one ring mapping created by the ring_ops map callbacks.
+ * The pv member is filled in by the PV implementation, the hvm member by the
+ * HVM one; @handles/@nr_handles record the grant handles of the mapping so
+ * that it can be unmapped later.
+ */
+struct xenbus_map_node {
+	struct list_head next;
+	union {
+		struct {
+			struct vm_struct *area;
+		} pv;
+		struct {
+			struct page *pages[XENBUS_MAX_RING_PAGES];
+			unsigned long addrs[XENBUS_MAX_RING_GRANTS];
+			void *addr;
+		} hvm;
+	};
+	grant_handle_t handles[XENBUS_MAX_RING_GRANTS];
+	unsigned int   nr_handles;
+};
+
+/*
+ * List of all live xenbus_map_node entries; additions and removals are done
+ * with xenbus_valloc_lock held.
+ */
+static DEFINE_SPINLOCK(xenbus_valloc_lock);
+static LIST_HEAD(xenbus_valloc_pages);
+
+/*
+ * Map/unmap implementation used by xenbus_map_ring_valloc() and
+ * xenbus_unmap_ring_vfree().
+ */
+struct xenbus_ring_ops {
+	int (*map)(struct xenbus_device *dev,
+		   grant_ref_t *gnt_refs, unsigned int nr_grefs,
+		   void **vaddr);
+	int (*unmap)(struct xenbus_device *dev, void *vaddr);
+};
+
+/* Active implementation; assigned once by xenbus_ring_ops_init(). */
+static const struct xenbus_ring_ops *ring_ops __read_mostly;
+
+/*
+ * Return a printable name for @state, or "INVALID" when @state lies beyond
+ * the table of known states.
+ */
+const char *xenbus_strstate(enum xenbus_state state)
+{
+	static const char *const state_names[] = {
+		[XenbusStateUnknown]       = "Unknown",
+		[XenbusStateInitialising]  = "Initialising",
+		[XenbusStateInitWait]      = "InitWait",
+		[XenbusStateInitialised]   = "Initialised",
+		[XenbusStateConnected]     = "Connected",
+		[XenbusStateClosing]       = "Closing",
+		[XenbusStateClosed]        = "Closed",
+		[XenbusStateReconfiguring] = "Reconfiguring",
+		[XenbusStateReconfigured]  = "Reconfigured",
+	};
+
+	if (state >= ARRAY_SIZE(state_names))
+		return "INVALID";
+
+	return state_names[state];
+}
+EXPORT_SYMBOL_GPL(xenbus_strstate);
+
+/**
+ * xenbus_watch_path - register a watch
+ * @dev: xenbus device
+ * @path: path to watch
+ * @watch: watch to register
+ * @will_handle: hook to store in @watch->will_handle
+ * @callback: callback to register
+ *
+ * Fill in @watch with @path, @will_handle and @callback and register it.
+ * Return 0 on success, or -errno on error.  On success, the given @path is
+ * saved as @watch->node, and remains the caller's to free.  On error,
+ * @watch->node, @watch->will_handle and @watch->callback are reset to NULL
+ * and the failure is reported through xenbus_dev_fatal().
+ */
+int xenbus_watch_path(struct xenbus_device *dev, const char *path,
+		      struct xenbus_watch *watch,
+		      bool (*will_handle)(struct xenbus_watch *,
+					  const char *, const char *),
+		      void (*callback)(struct xenbus_watch *,
+				       const char *, const char *))
+{
+	int rc;
+
+	watch->node = path;
+	watch->will_handle = will_handle;
+	watch->callback = callback;
+
+	rc = register_xenbus_watch(watch);
+	if (!rc)
+		return 0;
+
+	/* Registration failed: leave the watch structure empty again. */
+	watch->node = NULL;
+	watch->will_handle = NULL;
+	watch->callback = NULL;
+	xenbus_dev_fatal(dev, rc, "adding watch on %s", path);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(xenbus_watch_path);
+
+
+/**
+ * xenbus_watch_pathfmt - register a watch on a sprintf-formatted path
+ * @dev: xenbus device
+ * @watch: watch to register
+ * @will_handle: hook to store in @watch->will_handle
+ * @callback: callback to register
+ * @pathfmt: format of path to watch
+ *
+ * Format the path from @pathfmt and the arguments following it into a newly
+ * allocated string and register @watch on it via xenbus_watch_path().
+ * Return 0 on success, or -errno on error.  On success, the formatted path
+ * is saved as @watch->node, and becomes the caller's to kfree().  On error
+ * the path (if it was allocated) is freed here, so the caller has nothing to
+ * free, and the failure is reported through xenbus_dev_fatal().
+ */
+int xenbus_watch_pathfmt(struct xenbus_device *dev,
+			 struct xenbus_watch *watch,
+			 bool (*will_handle)(struct xenbus_watch *,
+					const char *, const char *),
+			 void (*callback)(struct xenbus_watch *,
+					  const char *, const char *),
+			 const char *pathfmt, ...)
+{
+	va_list args;
+	char *path;
+	int rc;
+
+	va_start(args, pathfmt);
+	path = kvasprintf(GFP_NOIO | __GFP_HIGH, pathfmt, args);
+	va_end(args);
+
+	if (path == NULL) {
+		xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
+		return -ENOMEM;
+	}
+
+	rc = xenbus_watch_path(dev, path, watch, will_handle, callback);
+	if (rc != 0)
+		kfree(path);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
+
+static void xenbus_switch_fatal(struct xenbus_device *, int, int,
+				const char *, ...);
+
+/*
+ * Write @state to the "state" node of @dev inside a xenstore transaction and
+ * cache it in dev->state.
+ *
+ * @depth is 0 for a normal call and non-zero when called back from
+ * xenbus_switch_fatal(); it is only passed on to xenbus_switch_fatal(), which
+ * uses it to avoid recursing a second time.
+ *
+ * Always returns 0; failures are reported through xenbus_switch_fatal().
+ */
+static int
+__xenbus_switch_state(struct xenbus_device *dev,
+		      enum xenbus_state state, int depth)
+{
+	/* We check whether the state is currently set to the given value, and
+	   if not, then the state is set.  We don't want to unconditionally
+	   write the given state, because we don't want to fire watches
+	   unnecessarily.  Furthermore, if the node has gone, we don't write
+	   to it, as the device will be tearing down, and we don't want to
+	   resurrect that directory.
+
+	   Note that, because of this cached value of our state, this
+	   function will not take a caller's Xenstore transaction
+	   (something it was trying to in the past) because dev->state
+	   would not get reset if the transaction was aborted.
+	 */
+
+	struct xenbus_transaction xbt;
+	int current_state;
+	int err, abort;
+
+	/* Nothing to do if the cached state already matches. */
+	if (state == dev->state)
+		return 0;
+
+again:
+	/* Assume the transaction must be aborted until the write succeeded. */
+	abort = 1;
+
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		xenbus_switch_fatal(dev, depth, err, "starting transaction");
+		return 0;
+	}
+
+	/*
+	 * The value read is not used; the read only checks that the state
+	 * node can still be read.  If not, abort without reporting an error.
+	 */
+	err = xenbus_scanf(xbt, dev->nodename, "state", "%d", &current_state);
+	if (err != 1)
+		goto abort;
+
+	err = xenbus_printf(xbt, dev->nodename, "state", "%d", state);
+	if (err) {
+		xenbus_switch_fatal(dev, depth, err, "writing new state");
+		goto abort;
+	}
+
+	abort = 0;
+abort:
+	err = xenbus_transaction_end(xbt, abort);
+	if (err) {
+		/* A commit that raced with another writer is retried. */
+		if (err == -EAGAIN && !abort)
+			goto again;
+		xenbus_switch_fatal(dev, depth, err, "ending transaction");
+	} else
+		/*
+		 * NOTE(review): this branch is also taken when the transaction
+		 * was aborted successfully (abort == 1), i.e. dev->state is
+		 * updated although nothing was written -- confirm intended.
+		 */
+		dev->state = state;
+
+	return 0;
+}
+
+/**
+ * xenbus_switch_state
+ * @dev: xenbus device
+ * @state: new state
+ *
+ * Advertise in the store a change of the given driver to the given @state.
+ * The return value is whatever __xenbus_switch_state() returns, which in its
+ * current form is always 0: errors are not propagated to the caller but are
+ * reported via xenbus_switch_fatal(), which saves the error in the store and
+ * attempts a switch to XenbusStateClosing.
+ */
+int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
+{
+	return __xenbus_switch_state(dev, state, 0);
+}
+
+EXPORT_SYMBOL_GPL(xenbus_switch_state);
+
+/*
+ * Switch @dev to XenbusStateClosed and signal the dev->down completion so
+ * that a waiter on it can proceed.  Always returns 0.
+ */
+int xenbus_frontend_closed(struct xenbus_device *dev)
+{
+	xenbus_switch_state(dev, XenbusStateClosed);
+	complete(&dev->down);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_frontend_closed);
+
+/*
+ * Common backend of the xenbus_dev_error()/xenbus_dev_fatal() family.
+ *
+ * Builds the string "<-err> <formatted message>", logs it with dev_err() and
+ * writes it to the "error" key below "error/<nodename>" in the store.  If
+ * the message buffer cannot be allocated nothing is reported at all; if the
+ * store write cannot be done a second dev_err() says so.
+ */
+static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
+				const char *fmt, va_list ap)
+{
+#define PRINTF_BUFFER_SIZE 4096
+	char *msg;
+	char *error_path;
+	unsigned int prefix_len;
+	bool stored = false;
+
+	msg = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
+	if (msg == NULL)
+		return;
+
+	/* Numeric error first, then the caller's formatted text. */
+	prefix_len = sprintf(msg, "%i ", -err);
+	vsnprintf(msg + prefix_len, PRINTF_BUFFER_SIZE - prefix_len, fmt, ap);
+
+	dev_err(&dev->dev, "%s\n", msg);
+
+	error_path = kasprintf(GFP_KERNEL, "error/%s", dev->nodename);
+	if (error_path != NULL)
+		stored = !xenbus_write(XBT_NIL, error_path, "error", msg);
+
+	if (!stored)
+		dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
+			dev->nodename, msg);
+
+	kfree(error_path);
+	kfree(msg);
+}
+
+/**
+ * xenbus_dev_error
+ * @dev: xenbus device
+ * @err: error to report
+ * @fmt: error message format
+ *
+ * Report the given negative errno into the store, along with the given
+ * formatted message.  The device state is left unchanged; use
+ * xenbus_dev_fatal() to also start closing the device.
+ */
+void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	xenbus_va_dev_error(dev, err, fmt, ap);
+	va_end(ap);
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_error);
+
+/**
+ * xenbus_dev_fatal
+ * @dev: xenbus device
+ * @err: error to report
+ * @fmt: error message format
+ *
+ * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
+ * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
+ * closedown of this driver and its peer.
+ */
+
+void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...)
+{
+	va_list ap;
+
+	/* Report first, then request the state change. */
+	va_start(ap, fmt);
+	xenbus_va_dev_error(dev, err, fmt, ap);
+	va_end(ap);
+
+	xenbus_switch_state(dev, XenbusStateClosing);
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
+
+/**
+ * Equivalent to xenbus_dev_fatal(dev, err, fmt, args), but helps
+ * avoiding recursion within xenbus_switch_state.
+ *
+ * @depth is the nesting level handed in by __xenbus_switch_state(): only at
+ * depth 0 is the switch to XenbusStateClosing attempted (with depth 1), so a
+ * failure of that nested switch is reported but does not trigger another
+ * state change.
+ */
+static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err,
+				const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	xenbus_va_dev_error(dev, err, fmt, ap);
+	va_end(ap);
+
+	if (!depth)
+		__xenbus_switch_state(dev, XenbusStateClosing, 1);
+}
+
+/**
+ * xenbus_grant_ring
+ * @dev: xenbus device
+ * @vaddr: starting virtual address of the ring
+ * @nr_pages: number of XEN_PAGE_SIZE sized chunks to be granted
+ * @grefs: grant reference array to be filled in
+ *
+ * Grant the peer of @dev access to @nr_pages consecutive chunks of
+ * XEN_PAGE_SIZE bytes starting at @vaddr, which may be a vmalloc or a
+ * directly mapped address, and store one grant reference per chunk in
+ * @grefs.  Return 0 on success, or -errno on error.  On error, the failure
+ * is reported through xenbus_dev_fatal().
+ */
+int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr,
+		      unsigned int nr_pages, grant_ref_t *grefs)
+{
+	grant_ref_t gref_head;
+	unsigned int idx;
+	int rc;
+
+	/* Reserve all references up front so the loop cannot run short. */
+	rc = gnttab_alloc_grant_references(nr_pages, &gref_head);
+	if (rc) {
+		xenbus_dev_fatal(dev, rc, "granting access to ring page");
+		return rc;
+	}
+
+	for (idx = 0; idx < nr_pages; idx++, vaddr += XEN_PAGE_SIZE) {
+		unsigned long gfn = is_vmalloc_addr(vaddr) ?
+				    pfn_to_gfn(vmalloc_to_pfn(vaddr)) :
+				    virt_to_gfn(vaddr);
+
+		grefs[idx] = gnttab_claim_grant_reference(&gref_head);
+		gnttab_grant_foreign_access_ref(grefs[idx], dev->otherend_id,
+						gfn, 0);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_grant_ring);
+
+
+/**
+ * Allocate an unbound event channel that the other end of @dev may bind to,
+ * and store the new local port in *port.  Return 0 on success, or -errno on
+ * error.  On error *port is left untouched and the failure is reported
+ * through xenbus_dev_fatal().
+ */
+int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
+{
+	struct evtchn_alloc_unbound op;
+	int rc;
+
+	op.dom = DOMID_SELF;
+	op.remote_dom = dev->otherend_id;
+
+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &op);
+	if (rc) {
+		xenbus_dev_fatal(dev, rc, "allocating event channel");
+		return rc;
+	}
+
+	*port = op.port;
+	return rc;
+}
+EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
+
+
+/**
+ * Close the event channel @port.  Returns 0 on success or -errno on error;
+ * an error is additionally reported through xenbus_dev_error().
+ */
+int xenbus_free_evtchn(struct xenbus_device *dev, int port)
+{
+	struct evtchn_close op;
+	int rc;
+
+	op.port = port;
+
+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_close, &op);
+	if (!rc)
+		return 0;
+
+	xenbus_dev_error(dev, rc, "freeing event channel %d", port);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
+
+
+/**
+ * xenbus_map_ring_valloc
+ * @dev: xenbus device
+ * @gnt_refs: grant reference array
+ * @nr_grefs: number of grant references
+ * @vaddr: pointer to address to be filled out by mapping
+ *
+ * Map @nr_grefs pages of memory into this domain from another
+ * domain's grant table, using the active ring_ops implementation to
+ * allocate the virtual address space, and set *vaddr to the resulting
+ * address.  Returns 0 on success, and GNTST_*
+ * (see xen/include/interface/grant_table.h) or -ENOMEM / -EINVAL on
+ * error.  A positive result from the implementation is turned into
+ * GNTST_general_error.
+ */
+int xenbus_map_ring_valloc(struct xenbus_device *dev, grant_ref_t *gnt_refs,
+			   unsigned int nr_grefs, void **vaddr)
+{
+	int rc = ring_ops->map(dev, gnt_refs, nr_grefs, vaddr);
+
+	/* Some hypervisors are buggy and can return 1. */
+	return (rc > 0) ? GNTST_general_error : rc;
+}
+EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
+
+/* N.B. sizeof(phys_addr_t) doesn't always equal to sizeof(unsigned
+ * long), e.g. 32-on-64.  Caller is responsible for preparing the
+ * right array to feed into this function.
+ *
+ * Map @nr_grefs grants from the other end of @dev at the addresses given in
+ * @addrs and store the resulting handles in @handles.
+ *
+ * Returns GNTST_okay on success, -EINVAL if @nr_grefs exceeds
+ * XENBUS_MAX_RING_GRANTS, or the status of the first grant that failed to
+ * map.  On a mapping failure every grant of the batch that did get mapped is
+ * unmapped again; *@leaked is set to true if any of those unmaps failed and
+ * to false in every other case, so it is always valid on return.
+ */
+static int __xenbus_map_ring(struct xenbus_device *dev,
+			     grant_ref_t *gnt_refs,
+			     unsigned int nr_grefs,
+			     grant_handle_t *handles,
+			     phys_addr_t *addrs,
+			     unsigned int flags,
+			     bool *leaked)
+{
+	struct gnttab_map_grant_ref map[XENBUS_MAX_RING_GRANTS];
+	struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS];
+	int i, j;
+	int err = GNTST_okay;
+
+	/* Give the caller a defined value on every return path. */
+	*leaked = false;
+
+	if (nr_grefs > XENBUS_MAX_RING_GRANTS)
+		return -EINVAL;
+
+	for (i = 0; i < nr_grefs; i++) {
+		memset(&map[i], 0, sizeof(map[i]));
+		gnttab_set_map_op(&map[i], addrs[i], flags, gnt_refs[i],
+				  dev->otherend_id);
+		handles[i] = INVALID_GRANT_HANDLE;
+	}
+
+	gnttab_batch_map(map, i);
+
+	/*
+	 * Look at every entry, not just those up to the first failure: each
+	 * entry of the batch carries its own status, so entries after a
+	 * failed one may have been mapped and must have their handle
+	 * recorded, otherwise they could never be unmapped below.  Only the
+	 * first failure is reported and returned.
+	 */
+	for (i = 0; i < nr_grefs; i++) {
+		if (map[i].status == GNTST_okay) {
+			handles[i] = map[i].handle;
+			continue;
+		}
+
+		if (err != GNTST_okay)
+			continue;
+
+		err = map[i].status;
+		xenbus_dev_fatal(dev, map[i].status,
+				 "mapping in shared page %d from domain %d",
+				 gnt_refs[i], dev->otherend_id);
+	}
+
+	if (err == GNTST_okay)
+		return GNTST_okay;
+
+	/* Roll back: unmap everything that was mapped successfully. */
+	for (i = j = 0; i < nr_grefs; i++) {
+		if (handles[i] != INVALID_GRANT_HANDLE) {
+			memset(&unmap[j], 0, sizeof(unmap[j]));
+			gnttab_set_unmap_op(&unmap[j], (phys_addr_t)addrs[i],
+					    GNTMAP_host_map, handles[i]);
+			j++;
+		}
+	}
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, j))
+		BUG();
+
+	for (i = 0; i < j; i++) {
+		if (unmap[i].status != GNTST_okay) {
+			*leaked = true;
+			break;
+		}
+	}
+
+	return err;
+}
+
+/*
+ * Accumulator passed to xenbus_map_ring_setup_grant_hvm(): @idx is the next
+ * free slot, and both arrays receive the address of each grant.
+ */
+struct map_ring_valloc_hvm
+{
+	unsigned int idx;
+
+	/*
+	 * Why do we need two arrays?  See the comment on __xenbus_map_ring():
+	 * phys_addr_t and unsigned long need not have the same size, and the
+	 * map path takes phys_addr_t while xenbus_unmap_ring() takes
+	 * unsigned long.
+	 */
+	phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS];
+	unsigned long addrs[XENBUS_MAX_RING_GRANTS];
+};
+
+/*
+ * gnttab_foreach_grant() callback: record the virtual address belonging to
+ * @gfn in the next slot of the struct map_ring_valloc_hvm passed as @data.
+ * @goffset and @len are not used.
+ */
+static void xenbus_map_ring_setup_grant_hvm(unsigned long gfn,
+					    unsigned int goffset,
+					    unsigned int len,
+					    void *data)
+{
+	struct map_ring_valloc_hvm *ctx = data;
+	unsigned int slot = ctx->idx++;
+	unsigned long va = (unsigned long)gfn_to_virt(gfn);
+
+	ctx->phys_addrs[slot] = va;
+	ctx->addrs[slot] = va;
+}
+
+/*
+ * ring_ops map implementation used when ring_ops_hvm is selected.
+ *
+ * Allocates ballooned pages, maps the @nr_grefs grants in @gnt_ref onto
+ * them, vmap()s the pages into one contiguous range and records the mapping
+ * on xenbus_valloc_pages so that xenbus_unmap_ring_vfree_hvm() can find it.
+ *
+ * On success returns 0 and stores the mapped address in *@vaddr.  On failure
+ * *@vaddr is NULL and an error is returned: -EINVAL for too many grants,
+ * -ENOMEM for allocation/vmap failures, or whatever
+ * alloc_xenballooned_pages() / __xenbus_map_ring() reported.
+ *
+ * Whenever grants could not be unmapped again the ballooned pages are
+ * deliberately not freed, since they may still be mapped.
+ */
+static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
+				      grant_ref_t *gnt_ref,
+				      unsigned int nr_grefs,
+				      void **vaddr)
+{
+	struct xenbus_map_node *node;
+	int err;
+	void *addr;
+	bool leaked = false;
+	struct map_ring_valloc_hvm info = {
+		.idx = 0,
+	};
+	unsigned int nr_pages = XENBUS_PAGES(nr_grefs);
+
+	if (nr_grefs > XENBUS_MAX_RING_GRANTS)
+		return -EINVAL;
+
+	*vaddr = NULL;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
+	err = alloc_xenballooned_pages(nr_pages, node->hvm.pages);
+	if (err)
+		goto out_err;
+
+	/* Collect the per-grant addresses of the ballooned pages. */
+	gnttab_foreach_grant(node->hvm.pages, nr_grefs,
+			     xenbus_map_ring_setup_grant_hvm,
+			     &info);
+
+	err = __xenbus_map_ring(dev, gnt_ref, nr_grefs, node->handles,
+				info.phys_addrs, GNTMAP_host_map, &leaked);
+	node->nr_handles = nr_grefs;
+
+	if (err)
+		goto out_free_ballooned_pages;
+
+	addr = vmap(node->hvm.pages, nr_pages, VM_MAP | VM_IOREMAP,
+		    PAGE_KERNEL);
+	if (!addr) {
+		err = -ENOMEM;
+		goto out_xenbus_unmap_ring;
+	}
+
+	node->hvm.addr = addr;
+
+	spin_lock(&xenbus_valloc_lock);
+	list_add(&node->next, &xenbus_valloc_pages);
+	spin_unlock(&xenbus_valloc_lock);
+
+	*vaddr = addr;
+	return 0;
+
+ out_xenbus_unmap_ring:
+	/*
+	 * All grants are mapped at this point.  If undoing that fails the
+	 * pages are still granted to us and must not go back to the
+	 * balloon, so treat them as leaked.
+	 */
+	if (xenbus_unmap_ring(dev, node->handles, nr_grefs, info.addrs)) {
+		leaked = true;
+		pr_alert("leaking %u ballooned page(s): grant unmap failed\n",
+			 nr_pages);
+	}
+ out_free_ballooned_pages:
+	if (!leaked)
+		free_xenballooned_pages(nr_pages, node->hvm.pages);
+ out_err:
+	kfree(node);
+	return err;
+}
+
+
+/**
+ * xenbus_map_ring
+ * @dev: xenbus device
+ * @gnt_refs: grant reference array
+ * @nr_grefs: number of grant reference
+ * @handles: pointer to grant handle to be filled
+ * @vaddrs: addresses to be mapped to
+ * @leaked: fail to clean up a failed map, caller should not free vaddr
+ *
+ * Map pages of memory into this domain from another domain's grant table.
+ * xenbus_map_ring does not allocate the virtual address space (you must do
+ * this yourself!). It only maps in the pages to the specified address.
+ * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
+ * or -ENOMEM / -EINVAL on error. If a mapping error is returned, device will
+ * switch to XenbusStateClosing and the first error message will be saved in
+ * XenStore.
+ * Furthermore, if we fail to map the ring, caller should check @leaked.
+ * If @leaked is true it means xenbus_map_ring failed to clean up, and the
+ * caller should not free the address space of @vaddrs.  @leaked is written
+ * on every return path, including the -EINVAL one.
+ */
+int xenbus_map_ring(struct xenbus_device *dev, grant_ref_t *gnt_refs,
+		    unsigned int nr_grefs, grant_handle_t *handles,
+		    unsigned long *vaddrs, bool *leaked)
+{
+	phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS];
+	int i;
+
+	/* Nothing has been mapped yet, hence nothing can have leaked. */
+	*leaked = false;
+
+	if (nr_grefs > XENBUS_MAX_RING_GRANTS)
+		return -EINVAL;
+
+	/* Widen each address to phys_addr_t as __xenbus_map_ring() expects. */
+	for (i = 0; i < nr_grefs; i++)
+		phys_addrs[i] = vaddrs[i];
+
+	return __xenbus_map_ring(dev, gnt_refs, nr_grefs, handles,
+				 phys_addrs, GNTMAP_host_map, leaked);
+}
+EXPORT_SYMBOL_GPL(xenbus_map_ring);
+
+
+/**
+ * xenbus_unmap_ring_vfree
+ * @dev: xenbus device
+ * @vaddr: addr to unmap
+ *
+ * Based on Rusty Russell's skeleton driver's unmap_page.
+ * Unmap memory in this domain that was imported from another domain.
+ * Use xenbus_unmap_ring_vfree if you mapped in your memory with
+ * xenbus_map_ring_valloc (it will free the virtual address space).
+ * The work is delegated to the active ring_ops implementation.
+ * Returns 0 on success and returns GNTST_* on error
+ * (see xen/include/interface/grant_table.h).
+ */
+int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
+{
+	return ring_ops->unmap(dev, vaddr);
+}
+EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
+
+#ifdef CONFIG_XEN_PV
+/*
+ * ring_ops map implementation used when ring_ops_pv is selected.
+ *
+ * Reserves a VM area of @nr_grefs * XEN_PAGE_SIZE bytes, maps the grants by
+ * handing the machine addresses of the area's PTEs to __xenbus_map_ring()
+ * (GNTMAP_contains_pte), and records the mapping on xenbus_valloc_pages.
+ *
+ * Returns 0 and sets *@vaddr on success.  On failure *@vaddr is NULL and
+ * -EINVAL, -ENOMEM or the __xenbus_map_ring() error is returned; the VM area
+ * is kept (and an alert printed) if the failed mapping could not be undone.
+ */
+static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
+				     grant_ref_t *gnt_refs,
+				     unsigned int nr_grefs,
+				     void **vaddr)
+{
+	pte_t *ptes[XENBUS_MAX_RING_GRANTS];
+	phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS];
+	struct xenbus_map_node *node;
+	struct vm_struct *area;
+	bool leaked;
+	int rc;
+	int n;
+
+	*vaddr = NULL;
+
+	if (nr_grefs > XENBUS_MAX_RING_GRANTS)
+		return -EINVAL;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
+	area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, ptes);
+	if (!area) {
+		rc = -ENOMEM;
+		goto out_free_node;
+	}
+
+	for (n = 0; n < nr_grefs; n++)
+		phys_addrs[n] = arbitrary_virt_to_machine(ptes[n]).maddr;
+
+	rc = __xenbus_map_ring(dev, gnt_refs, nr_grefs, node->handles,
+			       phys_addrs,
+			       GNTMAP_host_map | GNTMAP_contains_pte,
+			       &leaked);
+	if (rc) {
+		/* Only release the area if nothing is still mapped in it. */
+		if (leaked)
+			pr_alert("leaking VM area %p size %u page(s)", area, nr_grefs);
+		else
+			free_vm_area(area);
+		goto out_free_node;
+	}
+
+	node->nr_handles = nr_grefs;
+	node->pv.area = area;
+
+	spin_lock(&xenbus_valloc_lock);
+	list_add(&node->next, &xenbus_valloc_pages);
+	spin_unlock(&xenbus_valloc_lock);
+
+	*vaddr = area->addr;
+	return 0;
+
+out_free_node:
+	kfree(node);
+	return rc;
+}
+
+/*
+ * ring_ops unmap implementation used when ring_ops_pv is selected.
+ *
+ * Looks up the mapping whose VM area starts at @vaddr, unmaps all of its
+ * grants and frees the VM area.  Returns GNTST_okay on success,
+ * GNTST_bad_virt_addr if @vaddr is not a known mapping, or the status of the
+ * first grant that failed to unmap; in the latter case the VM area is kept
+ * and an alert is printed.  The bookkeeping node is freed in all cases once
+ * it has been found.
+ */
+static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr)
+{
+	struct xenbus_map_node *node;
+	struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS];
+	unsigned int level;
+	int i;
+	bool leaked = false;
+	int err;
+
+	/* Find the node for @vaddr and take it off the list under the lock. */
+	spin_lock(&xenbus_valloc_lock);
+	list_for_each_entry(node, &xenbus_valloc_pages, next) {
+		if (node->pv.area->addr == vaddr) {
+			list_del(&node->next);
+			goto found;
+		}
+	}
+	node = NULL;
+ found:
+	spin_unlock(&xenbus_valloc_lock);
+
+	if (!node) {
+		xenbus_dev_error(dev, -ENOENT,
+				 "can't find mapped virtual address %p", vaddr);
+		return GNTST_bad_virt_addr;
+	}
+
+	/*
+	 * Build one unmap request per grant; host_addr is the machine
+	 * address of the PTE that maps the corresponding chunk.
+	 */
+	for (i = 0; i < node->nr_handles; i++) {
+		unsigned long addr;
+
+		memset(&unmap[i], 0, sizeof(unmap[i]));
+		addr = (unsigned long)vaddr + (XEN_PAGE_SIZE * i);
+		unmap[i].host_addr = arbitrary_virt_to_machine(
+			lookup_address(addr, &level)).maddr;
+		unmap[i].dev_bus_addr = 0;
+		unmap[i].handle = node->handles[i];
+	}
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, i))
+		BUG();
+
+	/* Report and return the first per-grant failure, if any. */
+	err = GNTST_okay;
+	leaked = false;
+	for (i = 0; i < node->nr_handles; i++) {
+		if (unmap[i].status != GNTST_okay) {
+			leaked = true;
+			xenbus_dev_error(dev, unmap[i].status,
+					 "unmapping page at handle %d error %d",
+					 node->handles[i], unmap[i].status);
+			err = unmap[i].status;
+			break;
+		}
+	}
+
+	/* The area may only be released if nothing is still mapped in it. */
+	if (!leaked)
+		free_vm_area(node->pv.area);
+	else
+		pr_alert("leaking VM area %p size %u page(s)",
+			 node->pv.area, node->nr_handles);
+
+	kfree(node);
+	return err;
+}
+
+/*
+ * Ring operations chosen by xenbus_ring_ops_init() when
+ * XENFEAT_auto_translated_physmap is not set.
+ */
+static const struct xenbus_ring_ops ring_ops_pv = {
+	.map = xenbus_map_ring_valloc_pv,
+	.unmap = xenbus_unmap_ring_vfree_pv,
+};
+#endif
+
+/*
+ * Accumulator passed to xenbus_unmap_ring_setup_grant_hvm(): @idx is the
+ * next free slot of @addrs, which collects one address per grant.
+ */
+struct unmap_ring_vfree_hvm
+{
+	unsigned int idx;
+	unsigned long addrs[XENBUS_MAX_RING_GRANTS];
+};
+
+/*
+ * gnttab_foreach_grant() callback: append the virtual address belonging to
+ * @gfn to the struct unmap_ring_vfree_hvm passed as @data.  @goffset and
+ * @len are not used.
+ */
+static void xenbus_unmap_ring_setup_grant_hvm(unsigned long gfn,
+					      unsigned int goffset,
+					      unsigned int len,
+					      void *data)
+{
+	struct unmap_ring_vfree_hvm *ctx = data;
+	unsigned int slot = ctx->idx++;
+
+	ctx->addrs[slot] = (unsigned long)gfn_to_virt(gfn);
+}
+
+/*
+ * ring_ops unmap implementation used when ring_ops_hvm is selected.
+ *
+ * Looks up the mapping that was vmap()ed at @vaddr, unmaps its grants and,
+ * if that worked, removes the vmap and returns the pages to the balloon.
+ * Returns 0 on success, GNTST_bad_virt_addr if @vaddr is not a known
+ * mapping, or the xenbus_unmap_ring() error; in the latter case the pages
+ * and the vmap are left alone and a warning is issued.
+ */
+static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)
+{
+	struct unmap_ring_vfree_hvm info = {
+		.idx = 0,
+	};
+	struct xenbus_map_node *pos;
+	struct xenbus_map_node *node = NULL;
+	unsigned int nr_pages;
+	int rv;
+
+	/* Detach the matching node, if any, while holding the list lock. */
+	spin_lock(&xenbus_valloc_lock);
+	list_for_each_entry(pos, &xenbus_valloc_pages, next) {
+		if (pos->hvm.addr == vaddr) {
+			list_del(&pos->next);
+			node = pos;
+			break;
+		}
+	}
+	spin_unlock(&xenbus_valloc_lock);
+
+	if (node == NULL) {
+		xenbus_dev_error(dev, -ENOENT,
+				 "can't find mapped virtual address %p", vaddr);
+		return GNTST_bad_virt_addr;
+	}
+
+	nr_pages = XENBUS_PAGES(node->nr_handles);
+
+	/* Recompute the per-grant addresses that were used for mapping. */
+	gnttab_foreach_grant(node->hvm.pages, node->nr_handles,
+			     xenbus_unmap_ring_setup_grant_hvm,
+			     &info);
+
+	rv = xenbus_unmap_ring(dev, node->handles, node->nr_handles,
+			       info.addrs);
+	if (rv) {
+		WARN(1, "Leaking %p, size %u page(s)\n", vaddr, nr_pages);
+	} else {
+		vunmap(vaddr);
+		free_xenballooned_pages(nr_pages, node->hvm.pages);
+	}
+
+	kfree(node);
+	return rv;
+}
+
+/**
+ * xenbus_unmap_ring
+ * @dev: xenbus device
+ * @handles: grant handle array
+ * @nr_handles: number of handles in the array
+ * @vaddrs: addresses to unmap
+ *
+ * Unmap memory in this domain that was imported from another domain.
+ * All @nr_handles grants are submitted in one batch.  Returns 0 on success,
+ * -EINVAL if @nr_handles exceeds XENBUS_MAX_RING_GRANTS, and otherwise the
+ * GNTST_* status (see xen/include/interface/grant_table.h) of the first
+ * grant that failed to unmap, which is also reported via xenbus_dev_error().
+ */
+int xenbus_unmap_ring(struct xenbus_device *dev,
+		      grant_handle_t *handles, unsigned int nr_handles,
+		      unsigned long *vaddrs)
+{
+	struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS];
+	int i;
+
+	if (nr_handles > XENBUS_MAX_RING_GRANTS)
+		return -EINVAL;
+
+	for (i = 0; i < nr_handles; i++)
+		gnttab_set_unmap_op(&unmap[i], vaddrs[i],
+				    GNTMAP_host_map, handles[i]);
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap,
+				      nr_handles))
+		BUG();
+
+	for (i = 0; i < nr_handles; i++) {
+		if (unmap[i].status == GNTST_okay)
+			continue;
+
+		xenbus_dev_error(dev, unmap[i].status,
+				 "unmapping page at handle %d error %d",
+				 handles[i], unmap[i].status);
+		return unmap[i].status;
+	}
+
+	return GNTST_okay;
+}
+EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
+
+
+/**
+ * xenbus_read_driver_state
+ * @path: path for driver
+ *
+ * Read the "state" node below @path outside of any transaction and return
+ * its value, or XenbusStateUnknown if no state can be read.
+ */
+enum xenbus_state xenbus_read_driver_state(const char *path)
+{
+	enum xenbus_state state;
+
+	if (xenbus_gather(XBT_NIL, path, "state", "%d", &state, NULL))
+		return XenbusStateUnknown;
+
+	return state;
+}
+EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
+
+/*
+ * Ring operations chosen by xenbus_ring_ops_init() whenever the PV
+ * implementation is not selected (or not built).
+ */
+static const struct xenbus_ring_ops ring_ops_hvm = {
+	.map = xenbus_map_ring_valloc_hvm,
+	.unmap = xenbus_unmap_ring_vfree_hvm,
+};
+
+/*
+ * Select the ring map/unmap implementation.  With CONFIG_XEN_PV built in
+ * and XENFEAT_auto_translated_physmap not set, the PV operations are used;
+ * in every other case the HVM operations are.
+ */
+void __init xenbus_ring_ops_init(void)
+{
+#ifdef CONFIG_XEN_PV
+	if (!xen_feature(XENFEAT_auto_translated_physmap))
+		ring_ops = &ring_ops_pv;
+	else
+#endif
+		/* Also the unconditional choice when CONFIG_XEN_PV is off. */
+		ring_ops = &ring_ops_hvm;
+}
diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c
new file mode 100644
index 000000000..e5fda0256
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_comms.c
@@ -0,0 +1,484 @@
+/******************************************************************************
+ * xenbus_comms.c
+ *
+ * Low level code to talk to Xen Store: ring buffer and event channel.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/wait.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <xen/xenbus.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/events.h>
+#include <xen/page.h>
+#include "xenbus.h"
+
+/* A list of replies. Currently only one will ever be outstanding. */
+LIST_HEAD(xs_reply_list);
+
+/* A list of write requests. */
+LIST_HEAD(xb_write_list);
+DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
+DEFINE_MUTEX(xb_write_mutex);
+
+/* Protect xenbus reader thread against save/restore. */
+DEFINE_MUTEX(xs_response_mutex);
+
+static int xenbus_irq;
+static struct task_struct *xenbus_task;
+
+/* Interrupt handler for the store event channel: wake xb_waitq sleepers. */
+static irqreturn_t wake_waiting(int irq, void *unused)
+{
+	wake_up(&xb_waitq);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Sanity-check a consumer/producer index pair: returns non-zero when the
+ * distance from @cons to @prod does not exceed the ring size.
+ */
+static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
+{
+	XENSTORE_RING_IDX in_flight = prod - cons;
+
+	return in_flight <= XENSTORE_RING_SIZE;
+}
+
+/*
+ * Locate the next contiguous writable region of a ring.
+ *
+ * Returns a pointer into @buf at the masked producer position and stores
+ * in @len the smaller of the distance to the end of the buffer and the
+ * free space left in the ring.
+ */
+static void *get_output_chunk(XENSTORE_RING_IDX cons,
+			      XENSTORE_RING_IDX prod,
+			      char *buf, uint32_t *len)
+{
+	uint32_t offset = MASK_XENSTORE_IDX(prod);
+	uint32_t to_wrap = XENSTORE_RING_SIZE - offset;
+	uint32_t unused = XENSTORE_RING_SIZE - (prod - cons);
+
+	*len = (unused < to_wrap) ? unused : to_wrap;
+	return buf + offset;
+}
+
+/*
+ * Locate the next contiguous readable region of a ring.
+ *
+ * Returns a pointer into @buf at the masked consumer position and stores
+ * in @len the smaller of the distance to the end of the buffer and the
+ * amount of data pending between @cons and @prod.
+ */
+static const void *get_input_chunk(XENSTORE_RING_IDX cons,
+				   XENSTORE_RING_IDX prod,
+				   const char *buf, uint32_t *len)
+{
+	uint32_t offset = MASK_XENSTORE_IDX(cons);
+	uint32_t to_wrap = XENSTORE_RING_SIZE - offset;
+	uint32_t pending = prod - cons;
+
+	*len = (pending < to_wrap) ? pending : to_wrap;
+	return buf + offset;
+}
+
+/*
+ * Non-zero when a write can make progress: the request ring is not
+ * completely full and at least one request is queued on xb_write_list.
+ */
+static int xb_data_to_write(void)
+{
+	struct xenstore_domain_interface *intf = xen_store_interface;
+
+	if (intf->req_prod - intf->req_cons == XENSTORE_RING_SIZE)
+		return 0;
+
+	return !list_empty(&xb_write_list);
+}
+
+/**
+ * xb_write - low level write
+ * @data: buffer to send
+ * @len: length of buffer
+ *
+ * Copies @data into the request ring in as many contiguous chunks as are
+ * needed.  Stops early, returning the bytes copied so far, as soon as
+ * xb_data_to_write() reports that nothing more can be written.
+ *
+ * Returns number of bytes written or -err (-EIO when the ring indexes
+ * fail check_indexes(), in which case both indexes are reset to 0).
+ */
+static int xb_write(const void *data, unsigned int len)
+{
+	struct xenstore_domain_interface *intf = xen_store_interface;
+	XENSTORE_RING_IDX cons, prod;
+	unsigned int bytes = 0;
+
+	while (len != 0) {
+		void *dst;
+		unsigned int avail;
+
+		/* Read indexes, then verify. */
+		cons = intf->req_cons;
+		prod = intf->req_prod;
+		if (!check_indexes(cons, prod)) {
+			intf->req_cons = intf->req_prod = 0;
+			return -EIO;
+		}
+		/* Ring full or nothing queued: report the progress so far. */
+		if (!xb_data_to_write())
+			return bytes;
+
+		/* Must write data /after/ reading the consumer index. */
+		virt_mb();
+
+		dst = get_output_chunk(cons, prod, intf->req, &avail);
+		if (avail == 0)
+			continue;
+		/* Never copy more than is left to send. */
+		if (avail > len)
+			avail = len;
+
+		memcpy(dst, data, avail);
+		data += avail;
+		len -= avail;
+		bytes += avail;
+
+		/* Other side must not see new producer until data is there. */
+		virt_wmb();
+		intf->req_prod += avail;
+
+		/* Implies mb(): other side will see the updated producer. */
+		if (prod <= intf->req_cons)
+			notify_remote_via_evtchn(xen_store_evtchn);
+	}
+
+	return bytes;
+}
+
+/* Non-zero when the response ring's consumer and producer indexes differ. */
+static int xb_data_to_read(void)
+{
+	struct xenstore_domain_interface *intf = xen_store_interface;
+	return (intf->rsp_cons != intf->rsp_prod);
+}
+
+/*
+ * xb_read - low level read
+ * @data: destination buffer
+ * @len: number of bytes wanted
+ *
+ * Copies up to @len bytes out of the response ring, stopping early when
+ * the ring is empty.
+ *
+ * Returns the number of bytes read, or -EIO if the ring indexes fail
+ * check_indexes() (both indexes are then reset to 0).
+ */
+static int xb_read(void *data, unsigned int len)
+{
+	struct xenstore_domain_interface *intf = xen_store_interface;
+	XENSTORE_RING_IDX cons, prod;
+	unsigned int bytes = 0;
+
+	while (len != 0) {
+		unsigned int avail;
+		const char *src;
+
+		/* Read indexes, then verify. */
+		cons = intf->rsp_cons;
+		prod = intf->rsp_prod;
+		/* Ring drained: hand back whatever has been read so far. */
+		if (cons == prod)
+			return bytes;
+
+		if (!check_indexes(cons, prod)) {
+			intf->rsp_cons = intf->rsp_prod = 0;
+			return -EIO;
+		}
+
+		src = get_input_chunk(cons, prod, intf->rsp, &avail);
+		if (avail == 0)
+			continue;
+		/* Never copy more than the caller asked for. */
+		if (avail > len)
+			avail = len;
+
+		/* Must read data /after/ reading the producer index. */
+		virt_rmb();
+
+		memcpy(data, src, avail);
+		data += avail;
+		len -= avail;
+		bytes += avail;
+
+		/* Other side must not see free space until we've copied out */
+		virt_mb();
+		intf->rsp_cons += avail;
+
+		/* Implies mb(): other side will see the updated consumer. */
+		if (intf->rsp_prod - cons >= XENSTORE_RING_SIZE)
+			notify_remote_via_evtchn(xen_store_evtchn);
+	}
+
+	return bytes;
+}
+
+/*
+ * Read one incoming message from the response ring and dispatch it.
+ *
+ * The function is resumable: progress is kept in the static 'state', so
+ * a call that finds only part of the header or body returns 0 and the
+ * next call continues where it left off.  xs_response_mutex is taken when
+ * a new message is started and released once the message has been
+ * dispatched or has failed.
+ *
+ * Watch events are handed to xs_watch_msg(); any other message is matched
+ * by req_id against xs_reply_list and delivered through the request's
+ * callback.
+ *
+ * Returns 0 on success or when more data is needed, -ENOMEM if the body
+ * buffer could not be allocated (the mutex stays held and the allocation
+ * is retried by the next call), or another negative error.
+ */
+static int process_msg(void)
+{
+	static struct {
+		struct xsd_sockmsg msg;
+		char *body;
+		union {
+			void *alloc;
+			struct xs_watch_event *watch;
+		};
+		bool in_msg;
+		bool in_hdr;
+		unsigned int read;
+	} state;
+	struct xb_req_data *req;
+	int err;
+	unsigned int len;
+
+	if (!state.in_msg) {
+		state.in_msg = true;
+		state.in_hdr = true;
+		state.read = 0;
+
+		/*
+		 * We must disallow save/restore while reading a message.
+		 * A partial read across s/r leaves us out of sync with
+		 * xenstored.
+		 * xs_response_mutex is locked as long as we are processing one
+		 * message. state.in_msg will be true as long as we are holding
+		 * the lock here.
+		 */
+		mutex_lock(&xs_response_mutex);
+
+		if (!xb_data_to_read()) {
+			/* We raced with save/restore: pending data 'gone'. */
+			mutex_unlock(&xs_response_mutex);
+			state.in_msg = false;
+			return 0;
+		}
+	}
+
+	if (state.in_hdr) {
+		/*
+		 * The header read is skipped when it is already complete,
+		 * which is the case when retrying after a failed kmalloc().
+		 */
+		if (state.read != sizeof(state.msg)) {
+			err = xb_read((void *)&state.msg + state.read,
+				      sizeof(state.msg) - state.read);
+			if (err < 0)
+				goto out;
+			state.read += err;
+			if (state.read != sizeof(state.msg))
+				return 0;
+			if (state.msg.len > XENSTORE_PAYLOAD_MAX) {
+				err = -EINVAL;
+				goto out;
+			}
+		}
+
+		/* One extra byte for the '\0' appended after the body. */
+		len = state.msg.len + 1;
+		if (state.msg.type == XS_WATCH_EVENT)
+			len += sizeof(*state.watch);
+
+		state.alloc = kmalloc(len, GFP_NOIO | __GFP_HIGH);
+		if (!state.alloc)
+			return -ENOMEM;
+
+		if (state.msg.type == XS_WATCH_EVENT)
+			state.body = state.watch->body;
+		else
+			state.body = state.alloc;
+		state.in_hdr = false;
+		state.read = 0;
+	}
+
+	err = xb_read(state.body + state.read, state.msg.len - state.read);
+	if (err < 0)
+		goto out;
+
+	state.read += err;
+	if (state.read != state.msg.len)
+		return 0;
+
+	state.body[state.msg.len] = '\0';
+
+	if (state.msg.type == XS_WATCH_EVENT) {
+		state.watch->len = state.msg.len;
+		err = xs_watch_msg(state.watch);
+	} else {
+		/* Find, and unlink, the request this reply belongs to. */
+		err = -ENOENT;
+		mutex_lock(&xb_write_mutex);
+		list_for_each_entry(req, &xs_reply_list, list) {
+			if (req->msg.req_id == state.msg.req_id) {
+				list_del(&req->list);
+				err = 0;
+				break;
+			}
+		}
+		mutex_unlock(&xb_write_mutex);
+		if (err)
+			goto out;
+
+		if (req->state == xb_req_state_wait_reply) {
+			req->msg.req_id = req->caller_req_id;
+			req->msg.type = state.msg.type;
+			req->msg.len = state.msg.len;
+			req->body = state.body;
+			/* write body, then update state */
+			virt_wmb();
+			req->state = xb_req_state_got_reply;
+			req->cb(req);
+		} else
+			kfree(req);
+	}
+
+	mutex_unlock(&xs_response_mutex);
+
+	/* The buffer is not freed here, only forgotten. */
+	state.in_msg = false;
+	state.alloc = NULL;
+	return err;
+
+ out:
+	mutex_unlock(&xs_response_mutex);
+	state.in_msg = false;
+	kfree(state.alloc);
+	state.alloc = NULL;
+	return err;
+}
+
+/*
+ * Push the request at the head of xb_write_list into the request ring.
+ *
+ * Progress (current request, vector index and byte offset) is kept in the
+ * static 'state' so that a request which does not fit in the ring is
+ * continued by a later call.  Index -1 stands for the message header,
+ * indexes 0..num_vecs-1 for the payload vectors.  A fully written request
+ * is moved to xs_reply_list to wait for its reply.
+ *
+ * Returns 0 normally (including when an aborted request is dropped), or
+ * the negative error from xb_write() after the request has been failed.
+ */
+static int process_writes(void)
+{
+	static struct {
+		struct xb_req_data *req;
+		int idx;
+		unsigned int written;
+	} state;
+	void *base;
+	unsigned int len;
+	int err = 0;
+
+	if (!xb_data_to_write())
+		return 0;
+
+	mutex_lock(&xb_write_mutex);
+
+	/* No request in progress: start on the first queued one. */
+	if (!state.req) {
+		state.req = list_first_entry(&xb_write_list,
+					     struct xb_req_data, list);
+		state.idx = -1;
+		state.written = 0;
+	}
+
+	if (state.req->state == xb_req_state_aborted)
+		goto out_err;
+
+	while (state.idx < state.req->num_vecs) {
+		if (state.idx < 0) {
+			base = &state.req->msg;
+			len = sizeof(state.req->msg);
+		} else {
+			base = state.req->vec[state.idx].iov_base;
+			len = state.req->vec[state.idx].iov_len;
+		}
+		err = xb_write(base + state.written, len - state.written);
+		if (err < 0)
+			goto out_err;
+		state.written += err;
+		/* Short write: keep the state and resume on a later call. */
+		if (state.written != len)
+			goto out;
+
+		state.idx++;
+		state.written = 0;
+	}
+
+	list_del(&state.req->list);
+	state.req->state = xb_req_state_wait_reply;
+	list_add_tail(&state.req->list, &xs_reply_list);
+	state.req = NULL;
+
+ out:
+	mutex_unlock(&xb_write_mutex);
+
+	return 0;
+
+ out_err:
+	state.req->msg.type = XS_ERROR;
+	state.req->err = err;
+	list_del(&state.req->list);
+	if (state.req->state == xb_req_state_aborted)
+		kfree(state.req);
+	else {
+		/* write err, then update state */
+		virt_wmb();
+		state.req->state = xb_req_state_got_reply;
+		wake_up(&state.req->wq);
+	}
+
+	mutex_unlock(&xb_write_mutex);
+
+	state.req = NULL;
+
+	return err;
+}
+
+/* Wake-up condition for xenbus_thread(): something to read or to write. */
+static int xb_thread_work(void)
+{
+	return xb_data_to_read() || xb_data_to_write();
+}
+
+/*
+ * Body of the "xenbus" kernel thread: sleep until xb_thread_work() reports
+ * ring work, then call process_msg() followed by process_writes().
+ */
+static int xenbus_thread(void *unused)
+{
+	int rc;
+
+	for (;;) {
+		if (kthread_should_stop())
+			break;
+
+		/* Interrupted sleep: go back and re-check for a stop request. */
+		if (wait_event_interruptible(xb_waitq, xb_thread_work()) != 0)
+			continue;
+
+		rc = process_msg();
+		switch (rc) {
+		case 0:
+			break;
+		case -ENOMEM:
+			/* Yield the CPU; the allocation is retried later. */
+			schedule();
+			break;
+		default:
+			pr_warn_ratelimited("error %d while reading message\n",
+					    rc);
+			break;
+		}
+
+		rc = process_writes();
+		if (rc != 0)
+			pr_warn_ratelimited("error %d while writing message\n",
+					    rc);
+	}
+
+	xenbus_task = NULL;
+	return 0;
+}
+
+/**
+ * xb_init_comms - Set up interrupt handler off store event channel.
+ *
+ * Warns if either xenstore ring is not quiescent (fixing up the response
+ * ring unless reset_devices is set), then either rebinds the existing irq
+ * to xen_store_evtchn or, on first use, binds a new irq handler and starts
+ * the "xenbus" kernel thread.
+ *
+ * Returns 0 on success or a negative error from the irq binding or from
+ * kthread_run().
+ */
+int xb_init_comms(void)
+{
+	struct xenstore_domain_interface *intf = xen_store_interface;
+
+	if (intf->req_prod != intf->req_cons)
+		pr_err("request ring is not quiescent (%08x:%08x)!\n",
+		       intf->req_cons, intf->req_prod);
+
+	if (intf->rsp_prod != intf->rsp_cons) {
+		pr_warn("response ring is not quiescent (%08x:%08x): fixing up\n",
+			intf->rsp_cons, intf->rsp_prod);
+		/* breaks kdump */
+		if (!reset_devices)
+			intf->rsp_cons = intf->rsp_prod;
+	}
+
+	if (xenbus_irq) {
+		/* Already have an irq; assume we're resuming */
+		rebind_evtchn_irq(xen_store_evtchn, xenbus_irq);
+	} else {
+		int err;
+
+		err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
+						0, "xenbus", &xb_waitq);
+		if (err < 0) {
+			pr_err("request irq failed %i\n", err);
+			return err;
+		}
+
+		xenbus_irq = err;
+
+		if (!xenbus_task) {
+			struct task_struct *task;
+
+			task = kthread_run(xenbus_thread, NULL, "xenbus");
+			/*
+			 * Only publish a valid task: an ERR_PTR stored in
+			 * xenbus_task would look non-NULL and stop every
+			 * later call from starting the thread.
+			 */
+			if (IS_ERR(task))
+				return PTR_ERR(task);
+			xenbus_task = task;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Release the irq handler bound by xb_init_comms() and clear xenbus_irq so
+ * that the next xb_init_comms() binds a fresh one.
+ */
+void xb_deinit_comms(void)
+{
+	unbind_from_irqhandler(xenbus_irq, &xb_waitq);
+	xenbus_irq = 0;
+}
diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c
new file mode 100644
index 000000000..edba5fecd
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_dev_backend.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+#include <linux/capability.h>
+
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/xenbus.h>
+#include <xen/xenbus_dev.h>
+#include <xen/grant_table.h>
+#include <xen/events.h>
+#include <asm/xen/hypervisor.h>
+
+#include "xenbus.h"
+
+/* Open handler: restricted to CAP_SYS_ADMIN; the file is not seekable. */
+static int xenbus_backend_open(struct inode *inode, struct file *filp)
+{
+	if (capable(CAP_SYS_ADMIN))
+		return nonseekable_open(inode, filp);
+
+	return -EPERM;
+}
+
+/*
+ * Set up the xenstore connection for a xenstored running in domain @domid:
+ * grant that domain access to the xenstore interface page and allocate an
+ * unbound event channel for it.
+ *
+ * Returns the new event channel port on success, -EEXIST if xenstored is
+ * already marked ready, or the error from the event channel hypercall.
+ */
+static long xenbus_alloc(domid_t domid)
+{
+	struct evtchn_alloc_unbound arg;
+	int err = -EEXIST;
+
+	xs_suspend();
+
+	/* If xenstored_ready is nonzero, that means we have already talked to
+	 * xenstore and set up watches. These watches will be restored by
+	 * xs_resume, but that requires communication over the port established
+	 * below that is not visible to anyone until the ioctl returns.
+	 *
+	 * This can be resolved by splitting the ioctl into two parts
+	 * (postponing the resume until xenstored is active) but this is
+	 * unnecessarily complex for the intended use where xenstored is only
+	 * started once - so return -EEXIST if it's already running.
+	 */
+	if (xenstored_ready)
+		goto out_err;
+
+	gnttab_grant_foreign_access_ref(GNTTAB_RESERVED_XENSTORE, domid,
+			virt_to_gfn(xen_store_interface), 0 /* writable */);
+
+	arg.dom = DOMID_SELF;
+	arg.remote_dom = domid;
+
+	err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &arg);
+	if (err)
+		goto out_err;
+
+	/* Drop the irq bound to the previous event channel, if any. */
+	if (xen_store_evtchn > 0)
+		xb_deinit_comms();
+
+	xen_store_evtchn = arg.port;
+
+	xs_resume();
+
+	return arg.port;
+
+ out_err:
+	xs_suspend_cancel();
+	return err;
+}
+
+/*
+ * ioctl handler (CAP_SYS_ADMIN only).
+ *
+ * IOCTL_XENBUS_BACKEND_EVTCHN returns the current store event channel, or
+ * -ENODEV if there is none; IOCTL_XENBUS_BACKEND_SETUP passes @data to
+ * xenbus_alloc().  Any other command yields -ENOTTY.
+ */
+static long xenbus_backend_ioctl(struct file *file, unsigned int cmd,
+				 unsigned long data)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (cmd == IOCTL_XENBUS_BACKEND_EVTCHN)
+		return (xen_store_evtchn > 0) ? xen_store_evtchn : -ENODEV;
+
+	if (cmd == IOCTL_XENBUS_BACKEND_SETUP)
+		return xenbus_alloc(data);
+
+	return -ENOTTY;
+}
+
+/*
+ * mmap handler (CAP_SYS_ADMIN only): map the xenstore interface page into
+ * the caller.  Only a mapping at page offset 0 of at most PAGE_SIZE bytes
+ * is accepted.
+ */
+static int xenbus_backend_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	size_t span = vma->vm_end - vma->vm_start;
+	int rc;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (vma->vm_pgoff != 0 || span > PAGE_SIZE)
+		return -EINVAL;
+
+	rc = remap_pfn_range(vma, vma->vm_start,
+			     virt_to_pfn(xen_store_interface),
+			     span, vma->vm_page_prot);
+
+	return rc ? -EAGAIN : 0;
+}
+
+/* File operations of the xenbus backend misc device. */
+static const struct file_operations xenbus_backend_fops = {
+	.open = xenbus_backend_open,
+	.mmap = xenbus_backend_mmap,
+	.unlocked_ioctl = xenbus_backend_ioctl,
+};
+
+/* Misc device "xen/xenbus_backend", registered with a dynamic minor. */
+static struct miscdevice xenbus_backend_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "xen/xenbus_backend",
+	.fops = &xenbus_backend_fops,
+};
+
+/*
+ * Register the backend misc device.  Does nothing and returns -ENODEV
+ * unless running in the initial domain.
+ */
+static int __init xenbus_backend_init(void)
+{
+	int rc;
+
+	if (!xen_initial_domain())
+		return -ENODEV;
+
+	rc = misc_register(&xenbus_backend_dev);
+	if (rc == 0)
+		return 0;
+
+	pr_err("Could not register xenbus backend device\n");
+	return rc;
+}
+device_initcall(xenbus_backend_init);
diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c
new file mode 100644
index 000000000..454c6826a
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_dev_frontend.c
@@ -0,0 +1,727 @@
+/*
+ * Driver giving user-space access to the kernel's xenbus connection
+ * to xenstore.
+ *
+ * Copyright (c) 2005, Christian Limpach
+ * Copyright (c) 2005, Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Changes:
+ * 2008-10-07  Alex Zeffertt    Replaced /proc/xen/xenbus with xenfs filesystem
+ *                              and /proc/xen compatibility mount point.
+ *                              Turned xenfs into a loadable module.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/uio.h>
+#include <linux/notifier.h>
+#include <linux/wait.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/uaccess.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/miscdevice.h>
+#include <linux/workqueue.h>
+
+#include <xen/xenbus.h>
+#include <xen/xen.h>
+#include <asm/xen/hypervisor.h>
+
+#include "xenbus.h"
+
+unsigned int xb_dev_generation_id;
+
+/*
+ * An element of a list of outstanding transactions, for which we're
+ * still waiting a reply.
+ */
+struct xenbus_transaction_holder {
+	/* Entry in xenbus_file_priv.transactions. */
+	struct list_head list;
+	struct xenbus_transaction handle;
+	/* Value of xb_dev_generation_id when the holder was created. */
+	unsigned int generation_id;
+};
+
+/*
+ * A buffer of data on the queue.
+ */
+struct read_buffer {
+	struct list_head list;
+	/* Number of bytes of msg[] already copied out to the reader. */
+	unsigned int cons;
+	/* Total number of bytes stored in msg[]. */
+	unsigned int len;
+	char msg[];
+};
+
+/* Per-open-file state of the xenbus device. */
+struct xenbus_file_priv {
+	/*
+	 * msgbuffer_mutex is held while partial requests are built up
+	 * and complete requests are acted on.  It therefore protects
+	 * the "transactions" and "watches" lists, and the partial
+	 * request length and buffer.
+	 *
+	 * reply_mutex protects the reply being built up to return to
+	 * usermode.  It nests inside msgbuffer_mutex but may be held
+	 * alone during a watch callback.
+	 */
+	struct mutex msgbuffer_mutex;
+
+	/* In-progress transactions */
+	struct list_head transactions;
+
+	/* Active watches. */
+	struct list_head watches;
+
+	/* Partial request. */
+	unsigned int len;
+	union {
+		struct xsd_sockmsg msg;
+		char buffer[XENSTORE_PAYLOAD_MAX];
+	} u;
+
+	/* Response queue. */
+	struct mutex reply_mutex;
+	struct list_head read_buffers;
+	wait_queue_head_t read_waitq;
+
+	/* Reference count; the final put runs xenbus_file_free(). */
+	struct kref kref;
+
+	/* Work item that runs xenbus_worker() to tear this structure down. */
+	struct work_struct wq;
+};
+
+/*
+ * Read out any raw xenbus messages queued up.
+ *
+ * Blocks until at least one reply buffer is queued, unless the file was
+ * opened O_NONBLOCK, in which case -EAGAIN is returned.  Copies as much
+ * queued data as fits in @len bytes, freeing buffers as they are fully
+ * consumed.
+ *
+ * Returns the number of bytes copied (0 for a zero-length read), -EFAULT
+ * if nothing could be copied to @ubuf, or the error from an interrupted
+ * wait.
+ */
+static ssize_t xenbus_file_read(struct file *filp,
+			       char __user *ubuf,
+			       size_t len, loff_t *ppos)
+{
+	struct xenbus_file_priv *u = filp->private_data;
+	struct read_buffer *rb;
+	/*
+	 * Signed and as wide as the return type: with a plain 'unsigned'
+	 * the -EFAULT below would be returned as a large positive count.
+	 */
+	ssize_t i;
+	int ret;
+
+	/*
+	 * A zero-length read can never copy anything; without this check
+	 * it would loop on "again" forever once data is queued.
+	 */
+	if (len == 0)
+		return 0;
+
+	mutex_lock(&u->reply_mutex);
+again:
+	while (list_empty(&u->read_buffers)) {
+		mutex_unlock(&u->reply_mutex);
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		ret = wait_event_interruptible(u->read_waitq,
+					       !list_empty(&u->read_buffers));
+		if (ret)
+			return ret;
+		mutex_lock(&u->reply_mutex);
+	}
+
+	rb = list_entry(u->read_buffers.next, struct read_buffer, list);
+	i = 0;
+	while (i < len) {
+		size_t sz = min_t(size_t, len - i, rb->len - rb->cons);
+
+		/* copy_to_user() returns the number of bytes NOT copied. */
+		ret = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz);
+
+		i += sz - ret;
+		rb->cons += sz - ret;
+
+		if (ret != 0) {
+			if (i == 0)
+				i = -EFAULT;
+			goto out;
+		}
+
+		/* Clear out buffer if it has been consumed */
+		if (rb->cons == rb->len) {
+			list_del(&rb->list);
+			kfree(rb);
+			if (list_empty(&u->read_buffers))
+				break;
+			rb = list_entry(u->read_buffers.next,
+					struct read_buffer, list);
+		}
+	}
+	if (i == 0)
+		goto again;
+
+out:
+	mutex_unlock(&u->reply_mutex);
+	return i;
+}
+
+/*
+ * Append a copy of @data to @queue as a new read_buffer.
+ *
+ * The caller must hold whatever lock protects @queue, unless the queue
+ * is private to the caller.  (Callers commonly build several buffers on
+ * a temporary local list and splice it onto the shared list, under lock,
+ * once all the buffers have been successfully allocated.)
+ *
+ * A zero @len is a no-op.  Returns 0 on success, -EINVAL if @len exceeds
+ * XENSTORE_PAYLOAD_MAX, or -ENOMEM if the buffer cannot be allocated.
+ */
+static int queue_reply(struct list_head *queue, const void *data, size_t len)
+{
+	struct read_buffer *buf;
+
+	if (!len)
+		return 0;
+	if (len > XENSTORE_PAYLOAD_MAX)
+		return -EINVAL;
+
+	buf = kmalloc(sizeof(*buf) + len, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	memcpy(buf->msg, data, len);
+	buf->len = len;
+	buf->cons = 0;
+
+	list_add_tail(&buf->list, queue);
+	return 0;
+}
+
+/*
+ * Unlink and free every read_buffer on @list, leaving the list empty.
+ * Caller must have sole reference to list.
+ */
+static void queue_cleanup(struct list_head *list)
+{
+	struct read_buffer *rb, *next;
+
+	list_for_each_entry_safe(rb, next, list, list) {
+		list_del(&rb->list);
+		kfree(rb);
+	}
+}
+
+/* Ties a kernel xenbus watch to the open file that requested it. */
+struct watch_adapter {
+	/* Entry in xenbus_file_priv.watches. */
+	struct list_head list;
+	struct xenbus_watch watch;
+	/* The file that registered the watch and receives its events. */
+	struct xenbus_file_priv *dev_data;
+	/* Private copy of the token supplied with the watch request. */
+	char *token;
+};
+
+/* Free a watch adapter together with its node and token strings. */
+static void free_watch_adapter(struct watch_adapter *watch)
+{
+	kfree(watch->watch.node);
+	kfree(watch->token);
+	kfree(watch);
+}
+
+/*
+ * Allocate a watch adapter holding private copies of @path and @token.
+ * Returns NULL if any of the allocations fails.
+ */
+static struct watch_adapter *alloc_watch_adapter(const char *path,
+						 const char *token)
+{
+	struct watch_adapter *adap = kzalloc(sizeof(*adap), GFP_KERNEL);
+
+	if (!adap)
+		return NULL;
+
+	adap->watch.node = kstrdup(path, GFP_KERNEL);
+	if (adap->watch.node)
+		adap->token = kstrdup(token, GFP_KERNEL);
+
+	if (adap->watch.node && adap->token)
+		return adap;
+
+	/* The adapter was zeroed, so unset strings are safe to free. */
+	free_watch_adapter(adap);
+	return NULL;
+}
+
+/*
+ * Watch callback: queue an XS_WATCH_EVENT message for the file that owns
+ * @watch.
+ *
+ * The message body is @path followed by the token stored in the adapter
+ * (the @token argument itself is not used).  Header and body are staged
+ * on a local list and only spliced onto the file's read queue once every
+ * allocation has succeeded; on failure the event is dropped.
+ */
+static void watch_fired(struct xenbus_watch *watch,
+			const char *path,
+			const char *token)
+{
+	struct watch_adapter *adap;
+	/*
+	 * Zero the whole header: it is queued for copy to user space, so
+	 * req_id and tx_id must not carry stale stack contents.
+	 */
+	struct xsd_sockmsg hdr = { .type = XS_WATCH_EVENT };
+	const char *token_caller;
+	int path_len, tok_len, body_len;
+	int ret;
+	LIST_HEAD(staging_q);
+
+	adap = container_of(watch, struct watch_adapter, watch);
+
+	token_caller = adap->token;
+
+	/* Both strings are sent with their terminating NUL. */
+	path_len = strlen(path) + 1;
+	tok_len = strlen(token_caller) + 1;
+	body_len = path_len + tok_len;
+
+	hdr.len = body_len;
+
+	mutex_lock(&adap->dev_data->reply_mutex);
+
+	ret = queue_reply(&staging_q, &hdr, sizeof(hdr));
+	if (!ret)
+		ret = queue_reply(&staging_q, path, path_len);
+	if (!ret)
+		ret = queue_reply(&staging_q, token_caller, tok_len);
+
+	if (!ret) {
+		/* success: pass reply list onto watcher */
+		list_splice_tail(&staging_q, &adap->dev_data->read_buffers);
+		wake_up(&adap->dev_data->read_waitq);
+	} else
+		queue_cleanup(&staging_q);
+
+	mutex_unlock(&adap->dev_data->reply_mutex);
+}
+
+/*
+ * Deferred teardown of a file's private data: end every outstanding
+ * transaction, unregister every watch, drop all queued replies and free
+ * the structure itself.
+ */
+static void xenbus_worker(struct work_struct *wq)
+{
+	struct xenbus_file_priv *u =
+		container_of(wq, struct xenbus_file_priv, wq);
+	struct xenbus_transaction_holder *trans;
+	struct watch_adapter *adap;
+
+	/*
+	 * No need for locking here because there are no other users,
+	 * by definition.
+	 */
+
+	while (!list_empty(&u->transactions)) {
+		trans = list_first_entry(&u->transactions,
+					 struct xenbus_transaction_holder,
+					 list);
+		xenbus_transaction_end(trans->handle, 1);
+		list_del(&trans->list);
+		kfree(trans);
+	}
+
+	while (!list_empty(&u->watches)) {
+		adap = list_first_entry(&u->watches,
+					struct watch_adapter, list);
+		unregister_xenbus_watch(&adap->watch);
+		list_del(&adap->list);
+		free_watch_adapter(adap);
+	}
+
+	queue_cleanup(&u->read_buffers);
+	kfree(u);
+}
+
+/*
+ * kref release callback for xenbus_file_priv: hand the actual teardown
+ * to xenbus_worker() via the system workqueue.
+ */
+static void xenbus_file_free(struct kref *kref)
+{
+	struct xenbus_file_priv *u;
+
+	/*
+	 * We might be called in xenbus_thread().
+	 * Use workqueue to avoid deadlock.
+	 */
+	u = container_of(kref, struct xenbus_file_priv, kref);
+	schedule_work(&u->wq);
+}
+
+/*
+ * Look up the holder on @u's transaction list whose handle id equals
+ * @tx_id.  Returns the first match, or NULL if there is none.
+ */
+static struct xenbus_transaction_holder *xenbus_get_transaction(
+	struct xenbus_file_priv *u, uint32_t tx_id)
+{
+	struct xenbus_transaction_holder *cur;
+	struct xenbus_transaction_holder *found = NULL;
+
+	list_for_each_entry(cur, &u->transactions, list) {
+		if (cur->handle.id != tx_id)
+			continue;
+		found = cur;
+		break;
+	}
+
+	return found;
+}
+
+/*
+ * Deliver the reply for a request that was sent on behalf of a file.
+ *
+ * Updates the file's transaction list (recording the id of a newly
+ * started transaction, or dropping the holder of an ended or failed
+ * one), queues the reply header and body on the file's read queue, then
+ * frees the request and drops a reference on the file's private data.
+ */
+void xenbus_dev_queue_reply(struct xb_req_data *req)
+{
+	struct xenbus_file_priv *u = req->par;
+	struct xenbus_transaction_holder *trans = NULL;
+	int rc;
+	LIST_HEAD(staging_q);
+
+	xs_request_exit(req);
+
+	mutex_lock(&u->msgbuffer_mutex);
+
+	if (req->type == XS_TRANSACTION_START) {
+		/* A holder for a not yet started transaction has id 0. */
+		trans = xenbus_get_transaction(u, 0);
+		if (WARN_ON(!trans))
+			goto out;
+		if (req->msg.type == XS_ERROR) {
+			list_del(&trans->list);
+			kfree(trans);
+		} else {
+			/* The reply body is the new id as a decimal string. */
+			rc = kstrtou32(req->body, 10, &trans->handle.id);
+			if (WARN_ON(rc))
+				goto out;
+		}
+	} else if (req->type == XS_TRANSACTION_END) {
+		trans = xenbus_get_transaction(u, req->msg.tx_id);
+		if (WARN_ON(!trans))
+			goto out;
+		list_del(&trans->list);
+		kfree(trans);
+	}
+
+	mutex_unlock(&u->msgbuffer_mutex);
+
+	/* Stage header and body, then publish both or neither. */
+	mutex_lock(&u->reply_mutex);
+	rc = queue_reply(&staging_q, &req->msg, sizeof(req->msg));
+	if (!rc)
+		rc = queue_reply(&staging_q, req->body, req->msg.len);
+	if (!rc) {
+		list_splice_tail(&staging_q, &u->read_buffers);
+		wake_up(&u->read_waitq);
+	} else {
+		queue_cleanup(&staging_q);
+	}
+	mutex_unlock(&u->reply_mutex);
+
+	kfree(req->body);
+	kfree(req);
+
+	kref_put(&u->kref, xenbus_file_free);
+
+	return;
+
+ out:
+	/*
+	 * NOTE(review): this path returns without freeing req or req->body
+	 * and without the kref_put() done on the normal path -- confirm
+	 * whether that is intended.
+	 */
+	mutex_unlock(&u->msgbuffer_mutex);
+}
+
+/*
+ * Queue a locally generated reply of type @msg_type with body @reply.
+ *
+ * The header is copied from the request currently held in @u's buffer,
+ * with type and length replaced.  @reply, including its terminating NUL,
+ * must fit in 16 bytes or -E2BIG is returned.  When the reply has been
+ * queued, one reference on @u is dropped.
+ */
+static int xenbus_command_reply(struct xenbus_file_priv *u,
+				unsigned int msg_type, const char *reply)
+{
+	struct {
+		struct xsd_sockmsg hdr;
+		char body[16];
+	} out;
+	int err;
+
+	out.hdr = u->u.msg;
+	out.hdr.type = msg_type;
+	out.hdr.len = strlen(reply) + 1;
+	if (out.hdr.len > sizeof(out.body))
+		return -E2BIG;
+	memcpy(&out.body, reply, out.hdr.len);
+
+	mutex_lock(&u->reply_mutex);
+	err = queue_reply(&u->read_buffers, &out,
+			  sizeof(out.hdr) + out.hdr.len);
+	wake_up(&u->read_waitq);
+	mutex_unlock(&u->reply_mutex);
+
+	if (err)
+		return err;
+
+	kref_put(&u->kref, xenbus_file_free);
+	return 0;
+}
+
+/*
+ * Handle a complete non-watch message written by user space.
+ *
+ * For XS_TRANSACTION_START a holder is added to the file's transaction
+ * list before the request is sent.  A message naming an unknown non-zero
+ * transaction gets an "ENOENT" error reply, and an XS_TRANSACTION_END
+ * whose body is not exactly "T" or "F" gets "EINVAL".  Ending a
+ * transaction whose generation id differs from xb_dev_generation_id is
+ * answered locally ("EAGAIN" for "T", "OK" otherwise) and its holder is
+ * dropped.  Everything else is forwarded with
+ * xenbus_dev_request_and_reply().
+ */
+static int xenbus_write_transaction(unsigned msg_type,
+				    struct xenbus_file_priv *u)
+{
+	int rc;
+	struct xenbus_transaction_holder *trans = NULL;
+	struct {
+		struct xsd_sockmsg hdr;
+		char body[];
+	} *msg = (void *)u->u.buffer;
+
+	if (msg_type == XS_TRANSACTION_START) {
+		/* kzalloc() leaves handle.id at 0 until the reply arrives. */
+		trans = kzalloc(sizeof(*trans), GFP_KERNEL);
+		if (!trans) {
+			rc = -ENOMEM;
+			goto out;
+		}
+		trans->generation_id = xb_dev_generation_id;
+		list_add(&trans->list, &u->transactions);
+	} else if (msg->hdr.tx_id != 0 &&
+		   !xenbus_get_transaction(u, msg->hdr.tx_id))
+		return xenbus_command_reply(u, XS_ERROR, "ENOENT");
+	else if (msg_type == XS_TRANSACTION_END &&
+		 !(msg->hdr.len == 2 &&
+		   (!strcmp(msg->body, "T") || !strcmp(msg->body, "F"))))
+		return xenbus_command_reply(u, XS_ERROR, "EINVAL");
+	else if (msg_type == XS_TRANSACTION_END) {
+		trans = xenbus_get_transaction(u, msg->hdr.tx_id);
+		if (trans && trans->generation_id != xb_dev_generation_id) {
+			list_del(&trans->list);
+			kfree(trans);
+			if (!strcmp(msg->body, "T"))
+				return xenbus_command_reply(u, XS_ERROR,
+							    "EAGAIN");
+			else
+				return xenbus_command_reply(u,
+							    XS_TRANSACTION_END,
+							    "OK");
+		}
+	}
+
+	rc = xenbus_dev_request_and_reply(&msg->hdr, u);
+	/* The request was not sent: forget the holder looked up or added. */
+	if (rc && trans) {
+		list_del(&trans->list);
+		kfree(trans);
+	}
+
+out:
+	return rc;
+}
+
+/*
+ * Handle an XS_WATCH or XS_UNWATCH message written by user space.
+ *
+ * The payload must hold two NUL-terminated strings, path then token;
+ * otherwise an "EINVAL" error reply is queued.  XS_WATCH registers a new
+ * watch and links it on the file's watch list; any other @msg_type
+ * unregisters and frees the first watch whose token and path both match.
+ * On success an "OK" reply is queued.
+ */
+static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)
+{
+	struct watch_adapter *watch;
+	char *path, *token;
+	int err, rc;
+	LIST_HEAD(staging_q);
+
+	/* The path string starts right after the message header. */
+	path = u->u.buffer + sizeof(u->u.msg);
+	token = memchr(path, 0, u->u.msg.len);
+	if (token == NULL) {
+		rc = xenbus_command_reply(u, XS_ERROR, "EINVAL");
+		goto out;
+	}
+	token++;
+	/* The token must be terminated within the payload as well. */
+	if (memchr(token, 0, u->u.msg.len - (token - path)) == NULL) {
+		rc = xenbus_command_reply(u, XS_ERROR, "EINVAL");
+		goto out;
+	}
+
+	if (msg_type == XS_WATCH) {
+		watch = alloc_watch_adapter(path, token);
+		if (watch == NULL) {
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		watch->watch.callback = watch_fired;
+		watch->dev_data = u;
+
+		err = register_xenbus_watch(&watch->watch);
+		if (err) {
+			free_watch_adapter(watch);
+			rc = err;
+			goto out;
+		}
+		list_add(&watch->list, &u->watches);
+	} else {
+		list_for_each_entry(watch, &u->watches, list) {
+			if (!strcmp(watch->token, token) &&
+			    !strcmp(watch->watch.node, path)) {
+				unregister_xenbus_watch(&watch->watch);
+				list_del(&watch->list);
+				free_watch_adapter(watch);
+				break;
+			}
+		}
+	}
+
+	/* Success.  Synthesize a reply to say all is OK. */
+	rc = xenbus_command_reply(u, msg_type, "OK");
+
+out:
+	return rc;
+}
+
+/*
+ * Accept raw xenbus messages from user space.
+ *
+ * Data is accumulated in the per-file buffer until a complete message
+ * (header plus msg.len bytes of payload) is present; the message is then
+ * dispatched to xenbus_write_watch() or xenbus_write_transaction().
+ *
+ * Returns the number of bytes accepted, -EINVAL if the data does not fit
+ * in the buffer, -EFAULT if the copy from user space fails, -E2BIG if
+ * the header announces a message larger than the buffer, or the error
+ * from the message handler.
+ */
+static ssize_t xenbus_file_write(struct file *filp,
+				const char __user *ubuf,
+				size_t len, loff_t *ppos)
+{
+	struct xenbus_file_priv *u = filp->private_data;
+	uint32_t msg_type;
+	int rc = len;
+	int ret;
+	LIST_HEAD(staging_q);
+
+	/*
+	 * We're expecting usermode to be writing properly formed
+	 * xenbus messages.  If they write an incomplete message we
+	 * buffer it up.  Once it is complete, we act on it.
+	 */
+
+	/*
+	 * Make sure concurrent writers can't stomp all over each
+	 * other's messages and make a mess of our partial message
+	 * buffer.  We don't make any attempt to stop multiple
+	 * writers from making a mess of each other's incomplete
+	 * messages; we're just trying to guarantee our own internal
+	 * consistency and make sure that single writes are handled
+	 * atomically.
+	 */
+	mutex_lock(&u->msgbuffer_mutex);
+
+	/* Get this out of the way early to avoid confusion */
+	if (len == 0)
+		goto out;
+
+	/* Can't write a xenbus message larger than we can buffer */
+	if (len > sizeof(u->u.buffer) - u->len) {
+		/* On error, dump existing buffer */
+		u->len = 0;
+		rc = -EINVAL;
+		goto out;
+	}
+
+	ret = copy_from_user(u->u.buffer + u->len, ubuf, len);
+
+	if (ret != 0) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	/* Deal with a partial copy (ret is 0 here, so len is unchanged). */
+	len -= ret;
+	rc = len;
+
+	u->len += len;
+
+	/* Return if we haven't got a full message yet */
+	if (u->len < sizeof(u->u.msg))
+		goto out;	/* not even the header yet */
+
+	/* If we're expecting a message that's larger than we can
+	   possibly send, dump what we have and return an error. */
+	if ((sizeof(u->u.msg) + u->u.msg.len) > sizeof(u->u.buffer)) {
+		rc = -E2BIG;
+		u->len = 0;
+		goto out;
+	}
+
+	if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
+		goto out;	/* incomplete data portion */
+
+	/*
+	 * OK, now we have a complete message.  Do something with it.
+	 */
+
+	/*
+	 * Reference held for the reply; it is dropped when the reply is
+	 * queued, or just below if the handler fails.
+	 */
+	kref_get(&u->kref);
+
+	msg_type = u->u.msg.type;
+
+	switch (msg_type) {
+	case XS_WATCH:
+	case XS_UNWATCH:
+		/* (Un)Ask for some path to be watched for changes */
+		ret = xenbus_write_watch(msg_type, u);
+		break;
+
+	default:
+		/* Send out a transaction */
+		ret = xenbus_write_transaction(msg_type, u);
+		break;
+	}
+	if (ret != 0) {
+		rc = ret;
+		kref_put(&u->kref, xenbus_file_free);
+	}
+
+	/* Buffered message consumed */
+	u->len = 0;
+
+ out:
+	mutex_unlock(&u->msgbuffer_mutex);
+	return rc;
+}
+
+/*
+ * Open handler: allocate and initialise the per-file state and store it
+ * in filp->private_data.
+ *
+ * Returns -ENOENT if no store event channel is set up, or -ENOMEM if the
+ * state cannot be allocated.
+ */
+static int xenbus_file_open(struct inode *inode, struct file *filp)
+{
+	struct xenbus_file_priv *u;
+
+	if (xen_store_evtchn == 0)
+		return -ENOENT;
+
+	stream_open(inode, filp);
+
+	u = kzalloc(sizeof(*u), GFP_KERNEL);
+	if (u == NULL)
+		return -ENOMEM;
+
+	/* This initial reference is dropped by xenbus_file_release(). */
+	kref_init(&u->kref);
+
+	INIT_LIST_HEAD(&u->transactions);
+	INIT_LIST_HEAD(&u->watches);
+	INIT_LIST_HEAD(&u->read_buffers);
+	init_waitqueue_head(&u->read_waitq);
+	INIT_WORK(&u->wq, xenbus_worker);
+
+	mutex_init(&u->reply_mutex);
+	mutex_init(&u->msgbuffer_mutex);
+
+	filp->private_data = u;
+
+	return 0;
+}
+
+/*
+ * Release handler: drop the file's reference on its private data.  The
+ * data is torn down once the last reference is gone.
+ */
+static int xenbus_file_release(struct inode *inode, struct file *filp)
+{
+	struct xenbus_file_priv *u = filp->private_data;
+
+	kref_put(&u->kref, xenbus_file_free);
+
+	return 0;
+}
+
+/* Poll handler: the file is readable whenever a reply buffer is queued. */
+static __poll_t xenbus_file_poll(struct file *file, poll_table *wait)
+{
+	struct xenbus_file_priv *u = file->private_data;
+	__poll_t mask = 0;
+
+	poll_wait(file, &u->read_waitq, wait);
+	if (!list_empty(&u->read_buffers))
+		mask = EPOLLIN | EPOLLRDNORM;
+
+	return mask;
+}
+
+/* File operations giving user space access to the xenbus connection. */
+const struct file_operations xen_xenbus_fops = {
+	.read = xenbus_file_read,
+	.write = xenbus_file_write,
+	.open = xenbus_file_open,
+	.release = xenbus_file_release,
+	.poll = xenbus_file_poll,
+	.llseek = no_llseek,
+};
+EXPORT_SYMBOL_GPL(xen_xenbus_fops);
+
+/* Misc device "xen/xenbus", registered with a dynamic minor. */
+static struct miscdevice xenbus_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "xen/xenbus",
+	.fops = &xen_xenbus_fops,
+};
+
+/*
+ * Register the frontend misc device.  Does nothing and returns -ENODEV
+ * when not running in a Xen domain.
+ */
+static int __init xenbus_init(void)
+{
+	int rc;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	rc = misc_register(&xenbus_dev);
+	if (rc == 0)
+		return 0;
+
+	pr_err("Could not register xenbus frontend device\n");
+	return rc;
+}
+device_initcall(xenbus_init);
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
new file mode 100644
index 000000000..b911a91bc
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -0,0 +1,955 @@
+/******************************************************************************
+ * Talks to Xen Store to figure out what devices we have.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
+ * Copyright (C) 2005, 2006 XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* Prefix all pr_*() output from this file with KBUILD_MODNAME. */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+/*
+ * Debug trace helper: emits "xenbus_probe (<function>:<line>) " followed
+ * by the formatted message, a full stop and a newline, via pr_debug().
+ * Callers therefore pass format strings without a trailing newline.
+ */
+#define DPRINTK(fmt, args...)				\
+	pr_debug("xenbus_probe (%s:%d) " fmt ".\n",	\
+		 __func__, __LINE__, ##args)
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/xen-ops.h>
+#include <xen/page.h>
+
+#include <xen/hvm.h>
+
+#include "xenbus.h"
+
+
+/*
+ * Event channel used for xenstore.  Initialised from xenbus_init() and
+ * refreshed by xenbus_resume_cb().
+ */
+int xen_store_evtchn;
+EXPORT_SYMBOL_GPL(xen_store_evtchn);
+
+/* Pointer to the xenstore interface page, set up by xenbus_init(). */
+struct xenstore_domain_interface *xen_store_interface;
+EXPORT_SYMBOL_GPL(xen_store_interface);
+
+/* How xenstore is reached; XS_UNKNOWN until xenbus_init() has decided. */
+enum xenstore_init xen_store_domain_type;
+EXPORT_SYMBOL_GPL(xen_store_domain_type);
+
+/* Frame number of the xenstore page. */
+static unsigned long xen_store_gfn;
+
+/* Notifiers run by xenbus_probe() once xenstore is up. */
+static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
+
+/*
+ * Return the first entry of the id table @arr whose devicetype equals the
+ * devicetype of @dev, or NULL if there is none.  The table ends at the
+ * first entry with an empty devicetype string.
+ */
+static const struct xenbus_device_id *
+match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
+{
+	const struct xenbus_device_id *id = arr;
+
+	while (id->devicetype[0] != '\0') {
+		if (strcmp(id->devicetype, dev->devicetype) == 0)
+			return id;
+		id++;
+	}
+	return NULL;
+}
+
+/*
+ * Bus match callback: returns 1 if the driver's id table contains the
+ * device's type, 0 otherwise (including when the driver has no id table).
+ */
+int xenbus_match(struct device *_dev, struct device_driver *_drv)
+{
+	const struct xenbus_device_id *ids = to_xenbus_driver(_drv)->ids;
+
+	if (ids == NULL)
+		return 0;
+	if (match_device(ids, to_xenbus_device(_dev)) == NULL)
+		return 0;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(xenbus_match);
+
+
+/* Free the recorded other-end path and clear the pointer. */
+static void free_otherend_details(struct xenbus_device *dev)
+{
+	kfree(dev->otherend);
+	dev->otherend = NULL;
+}
+
+
+/* Unregister and free the other-end watch, if one is installed. */
+static void free_otherend_watch(struct xenbus_device *dev)
+{
+	struct xenbus_watch *watch = &dev->otherend_watch;
+
+	/* A NULL node means no watch is currently registered. */
+	if (!watch->node)
+		return;
+
+	unregister_xenbus_watch(watch);
+	kfree(watch->node);
+	watch->node = NULL;
+}
+
+
+/*
+ * Discard any previously recorded other-end watch and path, then have the
+ * driver's read_otherend_details() hook look the details up again.
+ * Returns that hook's result.
+ */
+static int talk_to_otherend(struct xenbus_device *dev)
+{
+	struct xenbus_driver *xdrv = to_xenbus_driver(dev->dev.driver);
+
+	free_otherend_watch(dev);
+	free_otherend_details(dev);
+	return xdrv->read_otherend_details(dev);
+}
+
+
+
+/*
+ * Install the device's other-end watch on "<otherend>/state", using the
+ * will_handle and changed handlers supplied by the device's bus type.
+ */
+static int watch_otherend(struct xenbus_device *dev)
+{
+	struct xen_bus_type *xbus;
+
+	xbus = container_of(dev->dev.bus, struct xen_bus_type, bus);
+	return xenbus_watch_pathfmt(dev, &dev->otherend_watch,
+				    xbus->otherend_will_handle,
+				    xbus->otherend_changed,
+				    "%s/%s", dev->otherend, "state");
+}
+
+
+/*
+ * Read the other end's id (key @id_node, parsed with "%i") and xenstore
+ * path (key @path_node) from below the device's own node into
+ * xendev->otherend_id and xendev->otherend.
+ *
+ * Returns 0 on success.  A failing xenbus_gather() is reported through
+ * xenbus_dev_fatal() and its error returned.  An empty path, or one that
+ * does not exist in xenstore, is reported the same way; the path is then
+ * freed again and -ENOENT returned.
+ */
+int xenbus_read_otherend_details(struct xenbus_device *xendev,
+				 char *id_node, char *path_node)
+{
+	int err;
+
+	err = xenbus_gather(XBT_NIL, xendev->nodename,
+			    id_node, "%i", &xendev->otherend_id,
+			    path_node, NULL, &xendev->otherend,
+			    NULL);
+	if (err) {
+		xenbus_dev_fatal(xendev, err,
+				 "reading other end details from %s",
+				 xendev->nodename);
+		return err;
+	}
+
+	if (xendev->otherend[0] != '\0' &&
+	    xenbus_exists(XBT_NIL, xendev->otherend, ""))
+		return 0;
+
+	xenbus_dev_fatal(xendev, -ENOENT,
+			 "unable to read other end from %s.  "
+			 "missing or inaccessible.",
+			 xendev->nodename);
+	free_otherend_details(xendev);
+	return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(xenbus_read_otherend_details);
+
+/*
+ * Common handler for a firing other-end watch.
+ *
+ * @watch must be the otherend_watch embedded in a xenbus_device.  Events
+ * whose @path does not start with the currently recorded other-end path
+ * (or that arrive while none is recorded) are ignored.  Otherwise the
+ * other end's state is read and handed to the driver's optional
+ * otherend_changed() hook.
+ *
+ * Once system_state is past SYSTEM_RUNNING the driver is not called; the
+ * only action left is xenbus_frontend_closed(), and only when
+ * @ignore_on_shutdown is non-zero and the other end is Closing.
+ */
+void xenbus_otherend_changed(struct xenbus_watch *watch,
+			     const char *path, const char *token,
+			     int ignore_on_shutdown)
+{
+	struct xenbus_device *dev =
+		container_of(watch, struct xenbus_device, otherend_watch);
+	struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
+	enum xenbus_state state;
+
+	/* Protect us against watches firing on old details when the otherend
+	   details change, say immediately after a resume. */
+	if (!dev->otherend ||
+	    strncmp(dev->otherend, path, strlen(dev->otherend))) {
+		dev_dbg(&dev->dev, "Ignoring watch at %s\n", path);
+		return;
+	}
+
+	state = xenbus_read_driver_state(dev->otherend);
+
+	dev_dbg(&dev->dev, "state is %d, (%s), %s, %s\n",
+		state, xenbus_strstate(state), dev->otherend_watch.node, path);
+
+	/*
+	 * Ignore xenbus transitions during shutdown. This prevents us doing
+	 * work that can fail e.g., when the rootfs is gone.
+	 */
+	if (system_state > SYSTEM_RUNNING) {
+		if (ignore_on_shutdown && (state == XenbusStateClosing))
+			xenbus_frontend_closed(dev);
+		return;
+	}
+
+	if (drv->otherend_changed)
+		drv->otherend_changed(dev, state);
+}
+EXPORT_SYMBOL_GPL(xenbus_otherend_changed);
+
+/*
+ * Bus probe callback.
+ *
+ * Requires the driver to have a probe() hook and an id-table entry
+ * matching the device, re-reads the other-end details, calls the driver's
+ * probe() and finally installs the other-end state watch.
+ *
+ * Failures that jump to "fail" are recorded with xenbus_dev_error() and
+ * switch the device to Closed.  Failures of talk_to_otherend() and
+ * watch_otherend() are only warned about and returned as they are.
+ */
+int xenbus_dev_probe(struct device *_dev)
+{
+	struct xenbus_device *dev = to_xenbus_device(_dev);
+	struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
+	const struct xenbus_device_id *id;
+	int err;
+
+	DPRINTK("%s", dev->nodename);
+
+	if (!drv->probe) {
+		err = -ENODEV;
+		goto fail;
+	}
+
+	id = match_device(drv->ids, dev);
+	if (!id) {
+		err = -ENODEV;
+		goto fail;
+	}
+
+	err = talk_to_otherend(dev);
+	if (err) {
+		dev_warn(&dev->dev, "talk_to_otherend on %s failed.\n",
+			 dev->nodename);
+		return err;
+	}
+
+	err = drv->probe(dev, id);
+	if (err)
+		goto fail;
+
+	/* NOTE(review): a failure here returns without undoing probe(). */
+	err = watch_otherend(dev);
+	if (err) {
+		dev_warn(&dev->dev, "watch_otherend on %s failed.\n",
+		       dev->nodename);
+		return err;
+	}
+
+	return 0;
+fail:
+	xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
+	xenbus_switch_state(dev, XenbusStateClosed);
+	return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_probe);
+
+/*
+ * Bus remove callback: drop the other-end watch, let the driver's
+ * optional remove() hook run, free the other-end details and switch the
+ * device to Closed.  Always returns 0.
+ */
+int xenbus_dev_remove(struct device *_dev)
+{
+	struct xenbus_device *dev = to_xenbus_device(_dev);
+	struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
+
+	DPRINTK("%s", dev->nodename);
+
+	/* The watch goes first, before the driver's remove() runs. */
+	free_otherend_watch(dev);
+
+	if (drv->remove)
+		drv->remove(dev);
+
+	free_otherend_details(dev);
+
+	xenbus_switch_state(dev, XenbusStateClosed);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_remove);
+
+/*
+ * Bus shutdown callback.  A device in the Connected state is switched to
+ * Closing and given up to 5*HZ jiffies for dev->down to complete; a
+ * device in any other state is only logged and left alone.  A device
+ * reference is held for the duration of the call.
+ */
+void xenbus_dev_shutdown(struct device *_dev)
+{
+	struct xenbus_device *dev = to_xenbus_device(_dev);
+	unsigned long remaining;
+
+	DPRINTK("%s", dev->nodename);
+
+	get_device(&dev->dev);
+	if (dev->state == XenbusStateConnected) {
+		xenbus_switch_state(dev, XenbusStateClosing);
+		remaining = wait_for_completion_timeout(&dev->down, 5*HZ);
+		if (!remaining)
+			pr_info("%s: %s timeout closing device\n",
+				__func__, dev->nodename);
+	} else {
+		pr_info("%s: %s: %s != Connected, skipping\n",
+			__func__, dev->nodename, xenbus_strstate(dev->state));
+	}
+	put_device(&dev->dev);
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_shutdown);
+
+/*
+ * Fill in the embedded struct device_driver of @drv and register it on
+ * @bus.  A driver without an explicit name is named after the devicetype
+ * of the first entry of its id table.  Returns driver_register()'s result.
+ */
+int xenbus_register_driver_common(struct xenbus_driver *drv,
+				  struct xen_bus_type *bus,
+				  struct module *owner, const char *mod_name)
+{
+	struct device_driver *driver = &drv->driver;
+
+	if (drv->name)
+		driver->name = drv->name;
+	else
+		driver->name = drv->ids[0].devicetype;
+
+	driver->bus = &bus->bus;
+	driver->owner = owner;
+	driver->mod_name = mod_name;
+
+	return driver_register(driver);
+}
+EXPORT_SYMBOL_GPL(xenbus_register_driver_common);
+
+/* Unregister the struct device_driver embedded in @drv. */
+void xenbus_unregister_driver(struct xenbus_driver *drv)
+{
+	driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(xenbus_unregister_driver);
+
+/* Argument block for the bus_for_each_dev() callbacks in this file. */
+struct xb_find_info {
+	struct xenbus_device *dev;	/* out: device found, NULL if none */
+	const char *nodename;		/* in: xenstore path searched for */
+};
+
+/*
+ * bus_for_each_dev() callback: stop (return 1) at the device whose
+ * nodename equals info->nodename, taking a reference on it and recording
+ * it in info->dev.  Returns 0 to keep iterating.
+ */
+static int cmp_dev(struct device *dev, void *data)
+{
+	struct xb_find_info *info = data;
+	struct xenbus_device *xendev = to_xenbus_device(dev);
+
+	if (strcmp(xendev->nodename, info->nodename) != 0)
+		return 0;
+
+	info->dev = xendev;
+	get_device(dev);
+	return 1;
+}
+
+/*
+ * Look up the device on @bus whose xenstore node is @nodename.  Returns
+ * NULL if there is none; otherwise a reference has been taken on the
+ * device (by cmp_dev()) which the caller must drop with put_device().
+ */
+static struct xenbus_device *xenbus_device_find(const char *nodename,
+						struct bus_type *bus)
+{
+	struct xb_find_info info;
+
+	info.dev = NULL;
+	info.nodename = nodename;
+	bus_for_each_dev(bus, NULL, &info, cmp_dev);
+
+	return info.dev;
+}
+
+/*
+ * bus_for_each_dev() callback: stop (return 1) at the first device whose
+ * nodename is info->nodename itself or lies below it ("<nodename>/..."),
+ * taking a reference on it and recording it in info->dev.
+ */
+static int cleanup_dev(struct device *dev, void *data)
+{
+	struct xenbus_device *xendev = to_xenbus_device(dev);
+	struct xb_find_info *info = data;
+	int len = strlen(info->nodename);
+	const char *name = xendev->nodename;
+
+	DPRINTK("%s", info->nodename);
+
+	/* Not even a common prefix: unrelated device. */
+	if (strncmp(name, info->nodename, len) != 0)
+		return 0;
+
+	/* A longer name must continue with '/' to be a real subdirectory. */
+	if (strlen(name) > len && name[len] != '/')
+		return 0;
+
+	info->dev = xendev;
+	get_device(dev);
+	return 1;
+}
+
+/*
+ * Unregister every device on @bus whose node is @path or below it.  The
+ * bus is rescanned from the start after each removal until no matching
+ * device is left.
+ */
+static void xenbus_cleanup_devices(const char *path, struct bus_type *bus)
+{
+	struct xb_find_info info = { .nodename = path };
+
+	for (;;) {
+		info.dev = NULL;
+		bus_for_each_dev(bus, NULL, &info, cleanup_dev);
+		if (!info.dev)
+			break;
+
+		device_unregister(&info.dev->dev);
+		/* Drop the reference taken by cleanup_dev(). */
+		put_device(&info.dev->dev);
+	}
+}
+
+/* Device release callback: free the xenbus_device containing @dev. */
+static void xenbus_dev_release(struct device *dev)
+{
+	if (!dev)
+		return;
+
+	kfree(to_xenbus_device(dev));
+}
+
+/* Read-only "nodename" attribute: the device's xenstore node name. */
+static ssize_t nodename_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct xenbus_device *xdev = to_xenbus_device(dev);
+
+	return sprintf(buf, "%s\n", xdev->nodename);
+}
+static DEVICE_ATTR_RO(nodename);
+
+/* Read-only "devtype" attribute: the device's type string. */
+static ssize_t devtype_show(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct xenbus_device *xdev = to_xenbus_device(dev);
+
+	return sprintf(buf, "%s\n", xdev->devicetype);
+}
+static DEVICE_ATTR_RO(devtype);
+
+/* Read-only "modalias" attribute: "<bus name>:<device type>". */
+static ssize_t modalias_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct xenbus_device *xdev = to_xenbus_device(dev);
+
+	return sprintf(buf, "%s:%s\n", dev->bus->name, xdev->devicetype);
+}
+static DEVICE_ATTR_RO(modalias);
+
+/* Read-only "state" attribute: the device's xenbus state as a string. */
+static ssize_t state_show(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct xenbus_device *xdev = to_xenbus_device(dev);
+
+	return sprintf(buf, "%s\n", xenbus_strstate(xdev->state));
+}
+static DEVICE_ATTR_RO(state);
+
+/* The four read-only device attributes defined above, NULL-terminated. */
+static struct attribute *xenbus_dev_attrs[] = {
+	&dev_attr_nodename.attr,
+	&dev_attr_devtype.attr,
+	&dev_attr_modalias.attr,
+	&dev_attr_state.attr,
+	NULL,
+};
+
+static const struct attribute_group xenbus_dev_group = {
+	.attrs = xenbus_dev_attrs,
+};
+
+/* Exported attribute group list, for use as a bus type's dev_groups. */
+const struct attribute_group *xenbus_dev_groups[] = {
+	&xenbus_dev_group,
+	NULL,
+};
+EXPORT_SYMBOL_GPL(xenbus_dev_groups);
+
+/*
+ * Create and register a xenbus_device for the xenstore node @nodename of
+ * type @type on @bus.
+ *
+ * Nodes whose state is not Initialising are silently skipped (return 0).
+ * The device structure and copies of both strings share one allocation.
+ * Returns 0 on success, -ENOMEM, or the error reported by
+ * bus->get_bus_id() or device_register().
+ */
+int xenbus_probe_node(struct xen_bus_type *bus,
+		      const char *type,
+		      const char *nodename)
+{
+	char devname[XEN_BUS_ID_SIZE];
+	int err;
+	struct xenbus_device *xendev;
+	size_t stringlen;
+	char *tmpstring;
+
+	enum xenbus_state state = xenbus_read_driver_state(nodename);
+
+	if (state != XenbusStateInitialising) {
+		/* Device is not new, so ignore it.  This can happen if a
+		   device is going away after switching to Closed.  */
+		return 0;
+	}
+
+	/* Room for both strings including their terminating NULs. */
+	stringlen = strlen(nodename) + 1 + strlen(type) + 1;
+	xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL);
+	if (!xendev)
+		return -ENOMEM;
+
+	xendev->state = XenbusStateInitialising;
+
+	/* Copy the strings into the extra space. */
+
+	tmpstring = (char *)(xendev + 1);
+	strcpy(tmpstring, nodename);
+	xendev->nodename = tmpstring;
+
+	tmpstring += strlen(tmpstring) + 1;
+	strcpy(tmpstring, type);
+	xendev->devicetype = tmpstring;
+	init_completion(&xendev->down);
+
+	xendev->dev.bus = &bus->bus;
+	xendev->dev.release = xenbus_dev_release;
+
+	err = bus->get_bus_id(devname, xendev->nodename);
+	if (err)
+		goto fail;
+
+	dev_set_name(&xendev->dev, "%s", devname);
+
+	/* Register with generic device framework. */
+	err = device_register(&xendev->dev);
+	if (err) {
+		/*
+		 * The device is handed back via put_device(); xendev is
+		 * cleared so that the kfree() at "fail" becomes a no-op.
+		 */
+		put_device(&xendev->dev);
+		xendev = NULL;
+		goto fail;
+	}
+
+	return 0;
+fail:
+	kfree(xendev);
+	return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_probe_node);
+
+/*
+ * Probe every entry listed in xenstore under "<bus->root>/<type>" through
+ * bus->probe(), stopping at the first error.  Returns 0, the error from
+ * xenbus_directory(), or the first probe error.
+ */
+static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
+{
+	unsigned int count = 0;
+	char **entries;
+	int err = 0;
+	int i;
+
+	entries = xenbus_directory(XBT_NIL, bus->root, type, &count);
+	if (IS_ERR(entries))
+		return PTR_ERR(entries);
+
+	for (i = 0; i < count && !err; i++)
+		err = bus->probe(bus, type, entries[i]);
+
+	kfree(entries);
+	return err;
+}
+
+/*
+ * Enumerate the device types listed directly under bus->root and probe
+ * each of them, stopping at the first error.  Returns 0, the error from
+ * xenbus_directory(), or the first probe error.
+ */
+int xenbus_probe_devices(struct xen_bus_type *bus)
+{
+	unsigned int idx, count;
+	char **types;
+	int err = 0;
+
+	types = xenbus_directory(XBT_NIL, bus->root, "", &count);
+	if (IS_ERR(types))
+		return PTR_ERR(types);
+
+	for (idx = 0; idx < count && !err; idx++)
+		err = xenbus_probe_device_type(bus, types[idx]);
+
+	kfree(types);
+	return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_probe_devices);
+
+/* Count the occurrences of @c in the NUL-terminated string @str. */
+static unsigned int char_count(const char *str, char c)
+{
+	unsigned int n = 0;
+	unsigned int pos;
+
+	for (pos = 0; str[pos] != '\0'; pos++) {
+		if (str[pos] == c)
+			n++;
+	}
+
+	return n;
+}
+
+/*
+ * Return the offset in @str of the separator @c that is preceded by
+ * exactly @len other occurrences of @c.  If the string ends after exactly
+ * @len separators, its length is returned instead; with fewer separators
+ * the result is -ERANGE.
+ */
+static int strsep_len(const char *str, char c, unsigned int len)
+{
+	unsigned int pos = 0;
+
+	while (str[pos] != '\0') {
+		if (str[pos] == c) {
+			if (!len)
+				return pos;
+			len--;
+		}
+		pos++;
+	}
+
+	if (len)
+		return -ERANGE;
+	return pos;
+}
+
+/*
+ * React to a change of the xenstore node @node on @bus.
+ *
+ * Paths with fewer than two '/' are ignored.  If @node no longer exists,
+ * every device at or below it is unregistered.  Otherwise the device type
+ * (second path component) and the device's own node (the prefix of @node
+ * selected through bus->levels) are derived, and that node is probed
+ * unless a device for it is already registered.
+ */
+void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
+{
+	int exists, rootlen;
+	struct xenbus_device *dev;
+	char type[XEN_BUS_ID_SIZE];
+	const char *p, *root;
+
+	if (char_count(node, '/') < 2)
+		return;
+
+	exists = xenbus_exists(XBT_NIL, node, "");
+	if (!exists) {
+		xenbus_cleanup_devices(node, &bus->bus);
+		return;
+	}
+
+	/* backend/<type>/... or device/<type>/... */
+	/* strchr() cannot fail here: two '/' were counted above. */
+	p = strchr(node, '/') + 1;
+	snprintf(type, XEN_BUS_ID_SIZE, "%.*s", (int)strcspn(p, "/"), p);
+	type[XEN_BUS_ID_SIZE-1] = '\0';
+
+	rootlen = strsep_len(node, '/', bus->levels);
+	if (rootlen < 0)
+		return;
+	root = kasprintf(GFP_KERNEL, "%.*s", rootlen, node);
+	if (!root)
+		return;
+
+	dev = xenbus_device_find(root, &bus->bus);
+	if (!dev)
+		xenbus_probe_node(bus, type, root);
+	else
+		put_device(&dev->dev);	/* only needed to know it exists */
+
+	kfree(root);
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_changed);
+
+/*
+ * Suspend callback: forward to the bound driver's optional suspend() hook.
+ * A failure of the hook is only logged; the function always returns 0.
+ */
+int xenbus_dev_suspend(struct device *dev)
+{
+	struct xenbus_device *xdev =
+		container_of(dev, struct xenbus_device, dev);
+	struct xenbus_driver *drv;
+	int err;
+
+	DPRINTK("%s", xdev->nodename);
+
+	if (!dev->driver)
+		return 0;
+
+	drv = to_xenbus_driver(dev->driver);
+	if (!drv->suspend)
+		return 0;
+
+	err = drv->suspend(xdev);
+	if (err)
+		pr_warn("suspend %s failed: %i\n", dev_name(dev), err);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_suspend);
+
+/*
+ * Resume callback.  For a device with a bound driver: re-read the
+ * other-end details, reset the local state to Initialising, call the
+ * driver's optional resume() hook and re-install the other-end watch.
+ * The first failing step is logged and its error returned; devices
+ * without a driver are skipped with 0.
+ */
+int xenbus_dev_resume(struct device *dev)
+{
+	int err;
+	struct xenbus_driver *drv;
+	struct xenbus_device *xdev
+		= container_of(dev, struct xenbus_device, dev);
+
+	DPRINTK("%s", xdev->nodename);
+
+	if (dev->driver == NULL)
+		return 0;
+	drv = to_xenbus_driver(dev->driver);
+	err = talk_to_otherend(xdev);
+	if (err) {
+		pr_warn("resume (talk_to_otherend) %s failed: %i\n",
+			dev_name(dev), err);
+		return err;
+	}
+
+	/* Only the cached state is reset; nothing is written to xenstore. */
+	xdev->state = XenbusStateInitialising;
+
+	if (drv->resume) {
+		err = drv->resume(xdev);
+		if (err) {
+			pr_warn("resume %s failed: %i\n", dev_name(dev), err);
+			return err;
+		}
+	}
+
+	err = watch_otherend(xdev);
+	if (err) {
+		pr_warn("resume (watch_otherend) %s failed: %d.\n",
+			dev_name(dev), err);
+		return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_resume);
+
+/* Cancel callback: emits a debug trace only and returns 0. */
+int xenbus_dev_cancel(struct device *dev)
+{
+	/* Do nothing */
+	DPRINTK("cancel");
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_cancel);
+
+/*
+ * A flag to determine if xenstored is 'ready' (i.e. has started).  Set to
+ * 1 by xenbus_init() for a PV domain that has a store event channel in
+ * its start info, and by xenbus_probe().
+ */
+int xenstored_ready;
+
+
+/*
+ * Ask for @nb to be called once xenstore is ready.  If it already is, @nb
+ * is invoked right away (it is not added to the chain) and its return
+ * value is passed back; otherwise @nb is queued and 0 is returned.
+ */
+int register_xenstore_notifier(struct notifier_block *nb)
+{
+	if (xenstored_ready > 0)
+		return nb->notifier_call(nb, 0, NULL);
+
+	blocking_notifier_chain_register(&xenstore_chain, nb);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_xenstore_notifier);
+
+/* Remove @nb from the xenstore notifier chain. */
+void unregister_xenstore_notifier(struct notifier_block *nb)
+{
+	blocking_notifier_chain_unregister(&xenstore_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
+
+/*
+ * Mark xenstore as ready, run the xs_init() that xenbus_init() skipped
+ * for XS_HVM (its result is not checked here), and call everyone on the
+ * xenstore notifier chain.
+ */
+static void xenbus_probe(void)
+{
+	xenstored_ready = 1;
+
+	/*
+	 * In the HVM case, xenbus_init() deferred its call to
+	 * xs_init() in case callbacks were not operational yet.
+	 * So do it now.
+	 */
+	if (xen_store_domain_type == XS_HVM)
+		xs_init();
+
+	/* Notify others that xenstore is up */
+	blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
+}
+
+/*
+ * Returns true when XenStore init must be deferred in order to
+ * allow the PCI platform device to be initialised, before we
+ * can actually have event channel interrupts working.
+ *
+ * That is the case only for XS_HVM without a vector callback; kernels
+ * built without CONFIG_XEN_PVHVM never defer.
+ */
+static bool xs_hvm_defer_init_for_callback(void)
+{
+#ifdef CONFIG_XEN_PVHVM
+	return xen_store_domain_type == XS_HVM &&
+		!xen_have_vector_callback;
+#else
+	return false;
+#endif
+}
+
+/*
+ * Thread function: sleep once on xb_waitq, then run xenbus_probe() and
+ * exit with 0.  Started by xenbus_probe_initcall() for XS_LOCAL.
+ */
+static int xenbus_probe_thread(void *unused)
+{
+	DEFINE_WAIT(w);
+
+	/*
+	 * We actually just want to wait for *any* trigger of xb_waitq,
+	 * and run xenbus_probe() the moment it occurs.
+	 */
+	prepare_to_wait(&xb_waitq, &w, TASK_INTERRUPTIBLE);
+	schedule();
+	finish_wait(&xb_waitq, &w);
+
+	DPRINTK("probing");
+	xenbus_probe();
+	return 0;
+}
+
+/*
+ * device_initcall: run xenbus_probe() now where that is possible, or
+ * start the thread that will run it later (XS_LOCAL).  A deferred XS_HVM
+ * probe is left to xen_set_callback_via().  Returns 0, or the error from
+ * kthread_run().
+ */
+static int __init xenbus_probe_initcall(void)
+{
+	/*
+	 * Probe XenBus here in the XS_PV case, and also XS_HVM unless we
+	 * need to wait for the platform PCI device to come up.
+	 */
+	if (xen_store_domain_type == XS_PV ||
+	    (xen_store_domain_type == XS_HVM &&
+	     !xs_hvm_defer_init_for_callback()))
+		xenbus_probe();
+
+	/*
+	 * For XS_LOCAL, spawn a thread which will wait for xenstored
+	 * or a xenstore-stubdom to be started, then probe. It will be
+	 * triggered when communication starts happening, by waiting
+	 * on xb_waitq.
+	 */
+	if (xen_store_domain_type == XS_LOCAL) {
+		struct task_struct *probe_task;
+
+		probe_task = kthread_run(xenbus_probe_thread, NULL,
+					 "xenbus_probe");
+		if (IS_ERR(probe_task))
+			return PTR_ERR(probe_task);
+	}
+	return 0;
+}
+device_initcall(xenbus_probe_initcall);
+
+/*
+ * Set HVM_PARAM_CALLBACK_IRQ of the calling domain to @via.
+ *
+ * Returns the hypercall's result.  On success, a xenbus_probe() that was
+ * deferred waiting for the callback is run from here.
+ */
+int xen_set_callback_via(uint64_t via)
+{
+	struct xen_hvm_param a;
+	int ret;
+
+	a.domid = DOMID_SELF;
+	a.index = HVM_PARAM_CALLBACK_IRQ;
+	a.value = via;
+
+	ret = HYPERVISOR_hvm_op(HVMOP_set_param, &a);
+	if (ret)
+		return ret;
+
+	/*
+	 * If xenbus_probe_initcall() deferred the xenbus_probe()
+	 * due to the callback not functioning yet, we can do it now.
+	 */
+	if (!xenstored_ready && xs_hvm_defer_init_for_callback())
+		xenbus_probe();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(xen_set_callback_via);
+
+/* Set up event channel for xenstored which is run as a local process
+ * (this is normally used only in dom0)
+ *
+ * Allocates a zeroed page for xenstore (recorded in xen_store_gfn) and an
+ * unbound event channel (recorded in xen_store_evtchn).  Returns 0 on
+ * success, -ENOMEM if no page is available, or -ENOSYS from the event
+ * channel hypercall; in both error cases nothing stays allocated.
+ */
+static int __init xenstored_local_init(void)
+{
+	int err = -ENOMEM;
+	unsigned long page = 0;
+	struct evtchn_alloc_unbound alloc_unbound;
+
+	/* Allocate Xenstore page */
+	page = get_zeroed_page(GFP_KERNEL);
+	if (!page)
+		goto out_err;
+
+	xen_store_gfn = virt_to_gfn((void *)page);
+
+	/* Next allocate a local port which xenstored can bind to */
+	alloc_unbound.dom        = DOMID_SELF;
+	alloc_unbound.remote_dom = DOMID_SELF;
+
+	err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+					  &alloc_unbound);
+	/* -ENOSYS goes back to the caller; any other error hits BUG_ON(). */
+	if (err == -ENOSYS)
+		goto out_err;
+
+	BUG_ON(err);
+	xen_store_evtchn = alloc_unbound.port;
+
+	return 0;
+
+ out_err:
+	if (page != 0)
+		free_page(page);
+	return err;
+}
+
+/*
+ * Resume notifier: refresh xen_store_evtchn, from HVM_PARAM_STORE_EVTCHN
+ * in an HVM domain and from the start info otherwise.  Returns the result
+ * of hvm_get_parameter() (0 in the non-HVM case).
+ *
+ * NOTE(review): the warning is also printed, with error 0, when the
+ * parameter was read successfully but is 0.
+ */
+static int xenbus_resume_cb(struct notifier_block *nb,
+			    unsigned long action, void *data)
+{
+	int err = 0;
+
+	if (xen_hvm_domain()) {
+		uint64_t v = 0;
+
+		err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
+		if (!err && v)
+			xen_store_evtchn = v;
+		else
+			pr_warn("Cannot update xenstore event channel: %d\n",
+				err);
+	} else
+		xen_store_evtchn = xen_start_info->store_evtchn;
+
+	return err;
+}
+
+/* Registered by xenbus_init() via xen_resume_notifier_register(). */
+static struct notifier_block xenbus_resume_nb = {
+	.notifier_call = xenbus_resume_cb,
+};
+
+/*
+ * postcore_initcall: decide how xenstore is reached and set up
+ * xen_store_evtchn, xen_store_gfn and xen_store_interface accordingly.
+ *
+ * xs_init() is called from here except for XS_HVM, where it is left to
+ * xenbus_probe().  Returns -ENODEV outside a Xen domain.  On any later
+ * error the domain type is reset to XS_UNKNOWN and the error returned.
+ */
+static int __init xenbus_init(void)
+{
+	int err;
+	uint64_t v = 0;
+	xen_store_domain_type = XS_UNKNOWN;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	xenbus_ring_ops_init();
+
+	/* Later tests override earlier ones, so their order matters. */
+	if (xen_pv_domain())
+		xen_store_domain_type = XS_PV;
+	if (xen_hvm_domain())
+		xen_store_domain_type = XS_HVM;
+	if (xen_hvm_domain() && xen_initial_domain())
+		xen_store_domain_type = XS_LOCAL;
+	if (xen_pv_domain() && !xen_start_info->store_evtchn)
+		xen_store_domain_type = XS_LOCAL;
+	if (xen_pv_domain() && xen_start_info->store_evtchn)
+		xenstored_ready = 1;
+
+	switch (xen_store_domain_type) {
+	case XS_LOCAL:
+		err = xenstored_local_init();
+		if (err)
+			goto out_error;
+		xen_store_interface = gfn_to_virt(xen_store_gfn);
+		break;
+	case XS_PV:
+		xen_store_evtchn = xen_start_info->store_evtchn;
+		xen_store_gfn = xen_start_info->store_mfn;
+		xen_store_interface = gfn_to_virt(xen_store_gfn);
+		break;
+	case XS_HVM:
+		err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
+		if (err)
+			goto out_error;
+		xen_store_evtchn = (int)v;
+		err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
+		if (err)
+			goto out_error;
+		/*
+		 * Uninitialized hvm_params are zero and return no error.
+		 * Although it is theoretically possible to have
+		 * HVM_PARAM_STORE_PFN set to zero on purpose, in reality it is
+		 * not zero when valid. If zero, it means that Xenstore hasn't
+		 * been properly initialized. Instead of attempting to map a
+		 * wrong guest physical address return error.
+		 *
+		 * Also recognize all bits set as an invalid value.
+		 */
+		if (!v || !~v) {
+			err = -ENOENT;
+			goto out_error;
+		}
+		/* Avoid truncation on 32-bit. */
+#if BITS_PER_LONG == 32
+		if (v > ULONG_MAX) {
+			pr_err("%s: cannot handle HVM_PARAM_STORE_PFN=%llx > ULONG_MAX\n",
+			       __func__, v);
+			err = -EINVAL;
+			goto out_error;
+		}
+#endif
+		xen_store_gfn = (unsigned long)v;
+		xen_store_interface =
+			xen_remap(xen_store_gfn << XEN_PAGE_SHIFT,
+				  XEN_PAGE_SIZE);
+		break;
+	default:
+		pr_warn("Xenstore state unknown\n");
+		break;
+	}
+
+	/*
+	 * HVM domains may not have a functional callback yet. In that
+	 * case let xs_init() be called from xenbus_probe(), which will
+	 * get invoked at an appropriate time.
+	 */
+	if (xen_store_domain_type != XS_HVM) {
+		err = xs_init();
+		if (err) {
+			pr_warn("Error initializing xenstore comms: %i\n", err);
+			goto out_error;
+		}
+	}
+
+	/* The resume notifier is registered for XS_PV and XS_HVM only. */
+	if ((xen_store_domain_type != XS_LOCAL) &&
+	    (xen_store_domain_type != XS_UNKNOWN))
+		xen_resume_notifier_register(&xenbus_resume_nb);
+
+#ifdef CONFIG_XEN_COMPAT_XENFS
+	/*
+	 * Create xenfs mountpoint in /proc for compatibility with
+	 * utilities that expect to find "xenbus" under "/proc/xen".
+	 */
+	proc_create_mount_point("xen");
+#endif
+	return 0;
+
+out_error:
+	xen_store_domain_type = XS_UNKNOWN;
+	return err;
+}
+
+postcore_initcall(xenbus_init);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c
new file mode 100644
index 000000000..4bb603051
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe_backend.c
@@ -0,0 +1,276 @@
+/******************************************************************************
+ * Talks to Xen Store to figure out what devices we have (backend half).
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
+ * Copyright (C) 2005, 2006 XenSource Ltd
+ * Copyright (C) 2007 Solarflare Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* Prefix all pr_*() output from this file with KBUILD_MODNAME. */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+/*
+ * Debug trace helper: emits "(<function>:<line>) ", the formatted message
+ * and a newline via pr_debug().  Callers pass format strings without a
+ * trailing newline.
+ */
+#define DPRINTK(fmt, ...)				\
+	pr_debug("(%s:%d) " fmt "\n",			\
+		 __func__, __LINE__, ##__VA_ARGS__)
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+#include <linux/export.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/features.h>
+
+#include "xenbus.h"
+
+/*
+ * backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id>
+ *
+ * Build the bus id for the backend node @nodename into @bus_id.
+ *
+ * Returns 0 on success, -EINVAL if @nodename has no non-empty <type>
+ * component followed by '/', the error from xenbus_gather(), -ERANGE if
+ * the "frontend" path read from xenstore is empty, -ENOENT if that path
+ * does not exist, or -ENOSPC if the id does not fit in XEN_BUS_ID_SIZE.
+ */
+static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
+{
+	int domid, err;
+	const char *devid, *type, *frontend;
+	unsigned int typelen;
+
+	type = strchr(nodename, '/');
+	if (!type)
+		return -EINVAL;
+	type++;
+	typelen = strcspn(type, "/");
+	if (!typelen || type[typelen] != '/')
+		return -EINVAL;
+
+	/* A '/' is known to exist, so strrchr() cannot return NULL. */
+	devid = strrchr(nodename, '/') + 1;
+
+	err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid,
+			    "frontend", NULL, &frontend,
+			    NULL);
+	if (err)
+		return err;
+	if (strlen(frontend) == 0)
+		err = -ERANGE;
+	if (!err && !xenbus_exists(XBT_NIL, frontend, ""))
+		err = -ENOENT;
+	/* The frontend path was only needed for the two checks above. */
+	kfree(frontend);
+
+	if (err)
+		return err;
+
+	if (snprintf(bus_id, XEN_BUS_ID_SIZE, "%.*s-%i-%s",
+		     typelen, type, domid, devid) >= XEN_BUS_ID_SIZE)
+		return -ENOSPC;
+	return 0;
+}
+
+/*
+ * Bus uevent callback: export MODALIAS, XENBUS_TYPE, XENBUS_PATH and
+ * XENBUS_BASE_PATH, then let the bound driver's optional uevent() hook
+ * add its own variables.  Returns -ENODEV without a device and -ENOMEM
+ * as soon as one of the variables cannot be added.
+ */
+static int xenbus_uevent_backend(struct device *dev,
+				 struct kobj_uevent_env *env)
+{
+	struct xenbus_device *xdev;
+	struct xenbus_driver *drv;
+	struct xen_bus_type *bus;
+
+	DPRINTK("");
+
+	if (!dev)
+		return -ENODEV;
+
+	xdev = to_xenbus_device(dev);
+	bus = container_of(xdev->dev.bus, struct xen_bus_type, bus);
+
+	/* stuff we want to pass to /sbin/hotplug */
+	if (add_uevent_var(env, "MODALIAS=xen-backend:%s", xdev->devicetype) ||
+	    add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype) ||
+	    add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename) ||
+	    add_uevent_var(env, "XENBUS_BASE_PATH=%s", bus->root))
+		return -ENOMEM;
+
+	if (!dev->driver)
+		return 0;
+
+	drv = to_xenbus_driver(dev->driver);
+	if (drv && drv->uevent)
+		return drv->uevent(xdev, env);
+
+	return 0;
+}
+
+/*
+ * backend/<typename>/<frontend-uuid>/<name>
+ *
+ * Probe the single backend node "<dir>/<name>" of type @type on @bus.
+ * Returns -ENOMEM if the node name cannot be built, otherwise the result
+ * of xenbus_probe_node().
+ */
+static int xenbus_probe_backend_unit(struct xen_bus_type *bus,
+				     const char *dir,
+				     const char *type,
+				     const char *name)
+{
+	char *nodename;
+	int err;
+
+	nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
+	if (!nodename)
+		return -ENOMEM;
+
+	/* No "\n" here: DPRINTK() appends the newline itself. */
+	DPRINTK("%s", nodename);
+
+	err = xenbus_probe_node(bus, type, nodename);
+	kfree(nodename);
+	return err;
+}
+
+/*
+ * backend/<typename>/<frontend-domid>
+ *
+ * Probe every unit listed under "<bus->root>/<type>/<domid>", stopping at
+ * the first error.  Returns 0, -ENOMEM, the error from
+ * xenbus_directory(), or the first probe error.
+ */
+static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type,
+				const char *domid)
+{
+	unsigned int idx, count = 0;
+	char **units;
+	char *path;
+	int err = 0;
+
+	DPRINTK("");
+
+	path = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, domid);
+	if (!path)
+		return -ENOMEM;
+
+	units = xenbus_directory(XBT_NIL, path, "", &count);
+	if (IS_ERR(units)) {
+		err = PTR_ERR(units);
+		goto out_free_path;
+	}
+
+	for (idx = 0; idx < count; idx++) {
+		err = xenbus_probe_backend_unit(bus, path, type, units[idx]);
+		if (err)
+			break;
+	}
+	kfree(units);
+
+out_free_path:
+	kfree(path);
+	return err;
+}
+
+/*
+ * will_handle hook of the backend bus: true only while the watch has no
+ * pending events (nr_pending == 0).  Presumably this keeps further events
+ * from piling up behind an unprocessed one -- TODO confirm against the
+ * code that calls the hook.
+ */
+static bool frontend_will_handle(struct xenbus_watch *watch,
+				 const char *path, const char *token)
+{
+	return !watch->nr_pending;
+}
+
+/*
+ * Other-end watch handler of the backend bus: forwards to the common
+ * handler with ignore_on_shutdown set to 0.
+ */
+static void frontend_changed(struct xenbus_watch *watch,
+			     const char *path, const char *token)
+{
+	xenbus_otherend_changed(watch, path, token, 0);
+}
+
+/*
+ * The "xen-backend" bus: xenstore root "backend", backend-specific naming,
+ * probing and watch handlers, and the common xenbus device callbacks.
+ */
+static struct xen_bus_type xenbus_backend = {
+	.root = "backend",
+	.levels = 3,		/* backend/type/<frontend>/<id> */
+	.get_bus_id = backend_bus_id,
+	.probe = xenbus_probe_backend,
+	.otherend_will_handle = frontend_will_handle,
+	.otherend_changed = frontend_changed,
+	.bus = {
+		.name		= "xen-backend",
+		.match		= xenbus_match,
+		.uevent		= xenbus_uevent_backend,
+		.probe		= xenbus_dev_probe,
+		.remove		= xenbus_dev_remove,
+		.shutdown	= xenbus_dev_shutdown,
+		.dev_groups	= xenbus_dev_groups,
+	},
+};
+
+/* Callback of be_watch: hand the changed path to xenbus_dev_changed(). */
+static void backend_changed(struct xenbus_watch *watch,
+			    const char *path, const char *token)
+{
+	DPRINTK("");
+
+	xenbus_dev_changed(path, &xenbus_backend);
+}
+
+/* Watch on the "backend" node, registered by backend_probe_and_watch(). */
+static struct xenbus_watch be_watch = {
+	.node = "backend",
+	.callback = backend_changed,
+};
+
+/*
+ * read_otherend_details hook for backend drivers: the other end is
+ * described by the "frontend-id" and "frontend" keys.
+ */
+static int read_frontend_details(struct xenbus_device *xendev)
+{
+	return xenbus_read_otherend_details(xendev, "frontend-id", "frontend");
+}
+
+/*
+ * Return 1 if the "online" key below the device's node reads as non-zero,
+ * else 0.  0 is also the default value handed to xenbus_read_unsigned().
+ */
+int xenbus_dev_is_online(struct xenbus_device *dev)
+{
+	return xenbus_read_unsigned(dev->nodename, "online", 0) != 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
+
+/*
+ * Register @drv as a driver on the xen-backend bus.  The driver's
+ * read_otherend_details hook is set to read_frontend_details() first.
+ * Returns the result of xenbus_register_driver_common().
+ */
+int __xenbus_register_backend(struct xenbus_driver *drv, struct module *owner,
+			      const char *mod_name)
+{
+	drv->read_otherend_details = read_frontend_details;
+
+	return xenbus_register_driver_common(drv, &xenbus_backend,
+					     owner, mod_name);
+}
+EXPORT_SYMBOL_GPL(__xenbus_register_backend);
+
+/*
+ * Xenstore notifier callback.  The results of both calls are ignored and
+ * NOTIFY_DONE is always returned.
+ */
+static int backend_probe_and_watch(struct notifier_block *notifier,
+				   unsigned long event,
+				   void *data)
+{
+	/* Enumerate devices in xenstore and watch for changes. */
+	xenbus_probe_devices(&xenbus_backend);
+	register_xenbus_watch(&be_watch);
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * subsys_initcall: register the xen-backend bus and, if that worked, ask
+ * to be notified once xenstore is available.  Returns bus_register()'s
+ * result; the notifier registration result is not checked.
+ */
+static int __init xenbus_probe_backend_init(void)
+{
+	static struct notifier_block xenstore_notifier = {
+		.notifier_call = backend_probe_and_watch
+	};
+	int rc;
+
+	DPRINTK("");
+
+	/* Register ourselves with the kernel bus subsystem */
+	rc = bus_register(&xenbus_backend.bus);
+	if (rc == 0)
+		register_xenstore_notifier(&xenstore_notifier);
+
+	return rc;
+}
+subsys_initcall(xenbus_probe_backend_init);
diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c
new file mode 100644
index 000000000..07896f4b2
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe_frontend.c
@@ -0,0 +1,502 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#define DPRINTK(fmt, ...)				\
+	pr_debug("(%s:%d) " fmt "\n",			\
+		 __func__, __LINE__, ##__VA_ARGS__)
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+#include <linux/module.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/page.h>
+#include <xen/xen.h>
+
+#include <xen/platform_pci.h>
+
+#include "xenbus.h"
+
+
+
+/*
+ * device/<type>/<id> => <type>-<id>
+ *
+ * Build the bus id for a frontend node name: drop everything up to and
+ * including the first '/', then turn the next '/' into '-'.
+ *
+ * @bus_id:   output buffer of XEN_BUS_ID_SIZE bytes.
+ * @nodename: xenstore node name of the device.
+ *
+ * Returns 0 on success, -EINVAL if @nodename has no '/', if the remainder
+ * does not fit into @bus_id, or if the remainder has no second '/'.
+ */
+static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
+{
+	const char *rel;
+	char *sep;
+
+	/*
+	 * Keep @nodename intact so the warning below can print the full
+	 * name rather than NULL or only the part after the first slash.
+	 */
+	rel = strchr(nodename, '/');
+	if (!rel || strlen(rel + 1) >= XEN_BUS_ID_SIZE) {
+		pr_warn("bad frontend %s\n", nodename);
+		return -EINVAL;
+	}
+
+	strlcpy(bus_id, rel + 1, XEN_BUS_ID_SIZE);
+	sep = strchr(bus_id, '/');
+	if (!sep) {
+		pr_warn("bus_id %s no slash\n", bus_id);
+		return -EINVAL;
+	}
+	*sep = '-';
+	return 0;
+}
+
+/*
+ * device/<typename>/<name>
+ *
+ * Probe one frontend device.  The node name is built as
+ * "<bus root>/<type>/<name>" and handed to xenbus_probe_node().
+ * Entries whose type starts with "console" and whose name starts with "0"
+ * are skipped and reported as success.
+ *
+ * Returns 0, -ENOMEM, or the xenbus_probe_node() result.
+ */
+static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type,
+				 const char *name)
+{
+	char *path;
+	int rc;
+
+	/* ignore console/0 */
+	if (strncmp(type, "console", 7) == 0 && strncmp(name, "0", 1) == 0) {
+		DPRINTK("Ignoring buggy device entry console/0");
+		return 0;
+	}
+
+	path = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name);
+	if (path == NULL)
+		return -ENOMEM;
+
+	DPRINTK("%s", path);
+
+	rc = xenbus_probe_node(bus, type, path);
+	kfree(path);
+
+	return rc;
+}
+
+/*
+ * Bus uevent hook: add "MODALIAS=xen:<devicetype>" to the environment.
+ * Returns 0, or -ENOMEM if the variable could not be added.
+ */
+static int xenbus_uevent_frontend(struct device *_dev,
+				  struct kobj_uevent_env *env)
+{
+	struct xenbus_device *xdev = to_xenbus_device(_dev);
+	int failed;
+
+	failed = add_uevent_var(env, "MODALIAS=xen:%s", xdev->devicetype);
+
+	return failed ? -ENOMEM : 0;
+}
+
+
+/*
+ * otherend_changed hook of the frontend bus: forwards the watch event to
+ * xenbus_otherend_changed() with the last argument set to 1 (its meaning is
+ * defined by that helper, which is not part of this file).
+ */
+static void backend_changed(struct xenbus_watch *watch,
+			    const char *path, const char *token)
+{
+	xenbus_otherend_changed(watch, path, token, 1);
+}
+
+/*
+ * Work handler for xenbus_device.work: performs the resume that
+ * xenbus_frontend_dev_resume() deferred via schedule_work().
+ * The result of xenbus_dev_resume() is discarded.
+ */
+static void xenbus_frontend_delayed_resume(struct work_struct *w)
+{
+	struct xenbus_device *xdev = container_of(w, struct xenbus_device, work);
+
+	xenbus_dev_resume(&xdev->dev);
+}
+
+/*
+ * PM resume hook for frontend devices.
+ *
+ * With a remote xenstore the device is resumed right away and the result of
+ * xenbus_dev_resume() is returned.  With xenstored in this domain the
+ * resume is pushed to the device's work item and 0 is returned.
+ */
+static int xenbus_frontend_dev_resume(struct device *dev)
+{
+	struct xenbus_device *xdev;
+
+	if (xen_store_domain_type != XS_LOCAL)
+		return xenbus_dev_resume(dev);
+
+	/*
+	 * If xenstored is running in this domain, we cannot access the backend
+	 * state at the moment, so we need to defer xenbus_dev_resume
+	 */
+	xdev = to_xenbus_device(dev);
+	schedule_work(&xdev->work);
+
+	return 0;
+}
+
+/*
+ * Bus probe hook for frontend devices.  When xenstored runs locally the
+ * deferred-resume work item is initialised before the common probe runs.
+ * Returns the xenbus_dev_probe() result.
+ */
+static int xenbus_frontend_dev_probe(struct device *dev)
+{
+	struct xenbus_device *xdev = to_xenbus_device(dev);
+
+	if (xen_store_domain_type == XS_LOCAL)
+		INIT_WORK(&xdev->work, xenbus_frontend_delayed_resume);
+
+	return xenbus_dev_probe(dev);
+}
+
+/*
+ * Power-management callbacks of the frontend bus.  Only .resume goes
+ * through the frontend-specific wrapper (which may defer the work);
+ * .restore calls xenbus_dev_resume() directly.
+ */
+static const struct dev_pm_ops xenbus_pm_ops = {
+	.suspend	= xenbus_dev_suspend,
+	.resume		= xenbus_frontend_dev_resume,
+	.freeze		= xenbus_dev_suspend,
+	.thaw		= xenbus_dev_cancel,
+	.restore	= xenbus_dev_resume,
+};
+
+/*
+ * Description of the frontend bus: devices live below "device" in
+ * xenstore, two path levels deep, and show up on the driver-core bus "xen".
+ */
+static struct xen_bus_type xenbus_frontend = {
+	.root = "device",
+	.levels = 2,		/* device/type/<id> */
+	.get_bus_id = frontend_bus_id,
+	.probe = xenbus_probe_frontend,
+	.otherend_changed = backend_changed,
+	.bus = {
+		.name		= "xen",
+		.match		= xenbus_match,
+		.uevent		= xenbus_uevent_frontend,
+		.probe		= xenbus_frontend_dev_probe,
+		.remove		= xenbus_dev_remove,
+		.shutdown	= xenbus_dev_shutdown,
+		.dev_groups	= xenbus_dev_groups,
+
+		.pm		= &xenbus_pm_ops,
+	},
+};
+
+/*
+ * Watch callback for fe_watch: pass the changed xenstore path on to
+ * xenbus_dev_changed() for the frontend bus.  @watch and @token are unused.
+ */
+static void frontend_changed(struct xenbus_watch *watch,
+			     const char *path, const char *token)
+{
+	DPRINTK("");
+
+	xenbus_dev_changed(path, &xenbus_frontend);
+}
+
+
+/*
+ * We watch for devices appearing and vanishing: a watch on the "device"
+ * xenstore node, handled by frontend_changed().
+ */
+static struct xenbus_watch fe_watch = {
+	.node = "device",
+	.callback = frontend_changed,
+};
+
+/*
+ * read_otherend_details hook installed for frontend drivers: the other end
+ * is looked up through the "backend-id" and "backend" nodes.
+ * Returns whatever xenbus_read_otherend_details() returns.
+ */
+static int read_backend_details(struct xenbus_device *xendev)
+{
+	return xenbus_read_otherend_details(xendev, "backend-id", "backend");
+}
+
+/*
+ * bus_for_each_dev() helper: decide whether @dev is still on its way to
+ * the Connected state.
+ *
+ * @dev:  device being inspected.
+ * @data: optional struct device_driver; when set, devices bound to any
+ *        other driver are ignored.
+ * @ignore_nonessential: when true, nodes whose name starts with
+ *        "device/vkbd" or "device/vfb" are never reported.
+ *
+ * Returns non-zero if the device's state is below Connected, or if it is
+ * Connected but the driver's is_ready() hook says it is not ready yet;
+ * 0 otherwise.
+ */
+static int is_device_connecting(struct device *dev, void *data, bool ignore_nonessential)
+{
+	struct xenbus_device *xendev = to_xenbus_device(dev);
+	struct device_driver *wanted = data;
+	struct xenbus_driver *xendrv;
+
+	/*
+	 * A device with no driver will never connect. We care only about
+	 * devices which should currently be in the process of connecting.
+	 */
+	if (dev->driver == NULL)
+		return 0;
+
+	/* Is this search limited to a particular driver? */
+	if (wanted != NULL && dev->driver != wanted)
+		return 0;
+
+	/*
+	 * With older QEMU, for PVonHVM guests the guest config files
+	 * could contain: vfb = [ 'vnc=1, vnclisten=0.0.0.0']
+	 * which is nonsensical as there is no PV FB (there can be
+	 * a PVKB) running as HVM guest.
+	 */
+	if (ignore_nonessential) {
+		if (!strncmp(xendev->nodename, "device/vkbd", 11))
+			return 0;
+		if (!strncmp(xendev->nodename, "device/vfb", 10))
+			return 0;
+	}
+
+	xendrv = to_xenbus_driver(dev->driver);
+
+	if (xendev->state < XenbusStateConnected)
+		return 1;
+	if (xendev->state != XenbusStateConnected)
+		return 0;
+
+	/* Connected: still "connecting" while the driver says not ready. */
+	return xendrv->is_ready && !xendrv->is_ready(xendev);
+}
+/* bus_for_each_dev() callback: like is_device_connecting(), skipping vkbd/vfb nodes. */
+static int essential_device_connecting(struct device *dev, void *data)
+{
+	return is_device_connecting(dev, data, true /* ignore PV[KB+FB] */);
+}
+/*
+ * bus_for_each_dev() callback: is_device_connecting() with no node names
+ * filtered out, i.e. every device on the bus is considered.
+ */
+static int non_essential_device_connecting(struct device *dev, void *data)
+{
+	return is_device_connecting(dev, data, false);
+}
+
+/*
+ * Walk the frontend bus with essential_device_connecting(); @drv (may be
+ * NULL) is passed through as the callback's data.  Returns the
+ * bus_for_each_dev() result, used by the caller as a truth value.
+ */
+static int exists_essential_connecting_device(struct device_driver *drv)
+{
+	return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+				essential_device_connecting);
+}
+/*
+ * Walk the frontend bus with non_essential_device_connecting(); @drv (may
+ * be NULL) is passed through as the callback's data.  Returns the
+ * bus_for_each_dev() result, used by the caller as a truth value.
+ */
+static int exists_non_essential_connecting_device(struct device_driver *drv)
+{
+	return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+				non_essential_device_connecting);
+}
+
+/*
+ * bus_for_each_dev() callback that logs devices which did not come up:
+ * an info line for devices without a driver, a warning (with local and
+ * remote state) for devices whose state is below Connected.
+ * @data optionally restricts the report to one driver.  Always returns 0
+ * so the walk continues.
+ */
+static int print_device_status(struct device *dev, void *data)
+{
+	struct xenbus_device *xendev = to_xenbus_device(dev);
+	struct device_driver *wanted = data;
+	enum xenbus_state rstate = XenbusStateUnknown;
+
+	/* Is this operation limited to a particular driver? */
+	if (wanted && dev->driver != wanted)
+		return 0;
+
+	if (!dev->driver) {
+		/* Information only: is this too noisy? */
+		pr_info("Device with no driver: %s\n", xendev->nodename);
+		return 0;
+	}
+
+	if (xendev->state >= XenbusStateConnected)
+		return 0;
+
+	if (xendev->otherend)
+		rstate = xenbus_read_driver_state(xendev->otherend);
+	pr_warn("Timeout connecting to device: %s (local state %d, remote state %d)\n",
+		xendev->nodename, xendev->state, rstate);
+
+	return 0;
+}
+
+/* We only wait for device setup after most initcalls have run. */
+static int ready_to_wait_for_devices;
+
+/*
+ * One iteration of the device wait loop.
+ *
+ * @start:          jiffies value taken when waiting began.
+ * @max_delay:      number of seconds after which to give up.
+ * @seconds_waited: running count of seconds already accounted for; it is
+ *                  advanced in steps of 5 as time passes.
+ *
+ * Each time another 5 seconds have elapsed a progress message with the
+ * remaining time is printed.  Returns true once *@seconds_waited equals
+ * @max_delay; otherwise sleeps for HZ/10 and returns false.
+ */
+static bool wait_loop(unsigned long start, unsigned int max_delay,
+		     unsigned int *seconds_waited)
+{
+	unsigned int waited = *seconds_waited;
+
+	if (time_after(jiffies, start + (waited + 5) * HZ)) {
+		if (waited == 0)
+			pr_warn("Waiting for devices to initialise: ");
+		waited += 5;
+		*seconds_waited = waited;
+		pr_cont("%us...", max_delay - waited);
+		if (waited == max_delay) {
+			pr_cont("\n");
+			return true;
+		}
+	}
+
+	schedule_timeout_interruptible(HZ/10);
+
+	return false;
+}
+/*
+ * Wait, with a timeout, for all devices currently configured.  We need
+ * to do this to guarantee that the filesystems and / or network devices
+ * needed for boot are available, before we can allow the boot to proceed.
+ *
+ * The wait has two phases that share one start time and one elapsed-time
+ * counter: up to 30 seconds for every device, then - vkbd/vfb excluded -
+ * until 270 seconds have passed since the start.
+ *
+ * This needs to be on a late_initcall, to happen after the frontend device
+ * drivers have been initialised, but before the root fs is mounted.
+ *
+ * A possible improvement here would be to have the tools add a per-device
+ * flag to the store entry, indicating whether it is needed at boot time.
+ * This would allow people who knew what they were doing to accelerate their
+ * boot slightly, but of course needs tools or manual intervention to set up
+ * those flags correctly.
+ *
+ * @xendrv: restrict waiting and reporting to this driver, or NULL for all.
+ */
+static void wait_for_devices(struct xenbus_driver *xendrv)
+{
+	unsigned long start = jiffies;
+	struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
+	unsigned int seconds_waited = 0;
+
+	if (!ready_to_wait_for_devices || !xen_domain())
+		return;
+
+	/* Phase 1: every device, PVKB and PVFB included. */
+	while (exists_non_essential_connecting_device(drv)) {
+		if (wait_loop(start, 30, &seconds_waited))
+			break;
+	}
+
+	/* Phase 2: skips the PVKB and PVFB check. */
+	while (exists_essential_connecting_device(drv)) {
+		if (wait_loop(start, 270, &seconds_waited))
+			break;
+	}
+
+	if (seconds_waited)
+		printk("\n");
+
+	bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+			 print_device_status);
+}
+
+/*
+ * Register @drv on the frontend bus.  The driver's read_otherend_details
+ * hook is pointed at read_backend_details() first.  After a successful
+ * registration the call waits for the driver's devices via
+ * wait_for_devices().
+ *
+ * Returns 0 or the xenbus_register_driver_common() error.
+ */
+int __xenbus_register_frontend(struct xenbus_driver *drv, struct module *owner,
+			       const char *mod_name)
+{
+	int err;
+
+	drv->read_otherend_details = read_backend_details;
+
+	err = xenbus_register_driver_common(drv, &xenbus_frontend,
+					    owner, mod_name);
+
+	/* If this driver is loaded as a module wait for devices to attach. */
+	if (!err)
+		wait_for_devices(drv);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
+
+/* Waiters for a backend state change during the frontend reset sequence. */
+static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq);
+/* Last backend state read by xenbus_reset_backend_state_changed(). */
+static int backend_state;
+
+/*
+ * Watch callback used while resetting a frontend: re-read the watched path
+ * into backend_state (XenbusStateUnknown if it cannot be parsed as one
+ * integer), log it and wake up anyone sleeping on backend_state_wq.
+ */
+static void xenbus_reset_backend_state_changed(struct xenbus_watch *w,
+					const char *path, const char *token)
+{
+	if (xenbus_scanf(XBT_NIL, path, "", "%i",
+			 &backend_state) != 1)
+		backend_state = XenbusStateUnknown;
+	printk(KERN_DEBUG "XENBUS: backend %s %s\n",
+	       path, xenbus_strstate(backend_state));
+	wake_up(&backend_state_wq);
+}
+
+/*
+ * Sleep (interruptibly, at most 5 seconds) until backend_state equals
+ * @expected.  A timeout or an interrupted wait is only logged; @be is used
+ * for that message alone.
+ */
+static void xenbus_reset_wait_for_backend(char *be, int expected)
+{
+	long remaining;
+
+	remaining = wait_event_interruptible_timeout(backend_state_wq,
+			backend_state == expected, 5 * HZ);
+	if (remaining <= 0)
+		pr_info("backend %s timed out\n", be);
+}
+
+/*
+ * Reset frontend if it is in Connected or Closed state.
+ * Wait for backend to catch up.
+ * State Connected happens during kdump, Closed after kexec.
+ *
+ * @fe:       xenstore path of the frontend device node.
+ * @be:       xenstore path of the matching backend node.
+ * @be_state: backend state as read by the caller; it selects the point at
+ *            which the Closing -> Closed -> Initialising walk is entered.
+ *
+ * A temporary watch on "<be>/state" feeds backend_state so that each step
+ * can wait for the backend to follow.  Nothing is done if the watch cannot
+ * be set up.
+ */
+static void xenbus_reset_frontend(char *fe, char *be, int be_state)
+{
+	struct xenbus_watch be_watch;
+	int err;
+
+	printk(KERN_DEBUG "XENBUS: backend %s %s\n",
+			be, xenbus_strstate(be_state));
+
+	memset(&be_watch, 0, sizeof(be_watch));
+	be_watch.node = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/state", be);
+	if (!be_watch.node)
+		return;
+
+	be_watch.callback = xenbus_reset_backend_state_changed;
+	backend_state = XenbusStateUnknown;
+
+	pr_info("triggering reconnect on %s\n", be);
+	err = register_xenbus_watch(&be_watch);
+	if (err) {
+		/*
+		 * A failed registration leaves the watch off the watch list,
+		 * so unregistering it later would trip the BUG_ON() in
+		 * unregister_xenbus_watch(), and no state change would be
+		 * reported anyway.  Give up on this device instead.
+		 */
+		pr_warn("failed to watch %s: %d\n", be_watch.node, err);
+		kfree(be_watch.node);
+		return;
+	}
+
+	/* fall through to forward backend to state XenbusStateInitialising */
+	switch (be_state) {
+	case XenbusStateConnected:
+		xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosing);
+		xenbus_reset_wait_for_backend(be, XenbusStateClosing);
+		/* fall through */
+
+	case XenbusStateClosing:
+		xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosed);
+		xenbus_reset_wait_for_backend(be, XenbusStateClosed);
+		/* fall through */
+
+	case XenbusStateClosed:
+		xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateInitialising);
+		xenbus_reset_wait_for_backend(be, XenbusStateInitWait);
+	}
+
+	unregister_xenbus_watch(&be_watch);
+	pr_info("reconnect done on %s\n", be);
+	kfree(be_watch.node);
+}
+
+/*
+ * Inspect the frontend "device/<class>/<dev>" and, if its state is
+ * Connected or Closed, reset it against its backend.  Any failure to read
+ * the required nodes silently ends the check.
+ */
+static void xenbus_check_frontend(char *class, char *dev)
+{
+	int be_state, fe_state;
+	char *backend, *frontend;
+
+	frontend = kasprintf(GFP_NOIO | __GFP_HIGH, "device/%s/%s", class, dev);
+	if (!frontend)
+		return;
+
+	if (xenbus_scanf(XBT_NIL, frontend, "state", "%i", &fe_state) != 1)
+		goto out;
+
+	/* Only Connected and Closed frontends need to be reset. */
+	if (fe_state != XenbusStateConnected && fe_state != XenbusStateClosed)
+		goto out;
+
+	printk(KERN_DEBUG "XENBUS: frontend %s %s\n",
+			frontend, xenbus_strstate(fe_state));
+
+	backend = xenbus_read(XBT_NIL, frontend, "backend", NULL);
+	if (IS_ERR_OR_NULL(backend))
+		goto out;
+
+	if (xenbus_scanf(XBT_NIL, backend, "state", "%i", &be_state) == 1)
+		xenbus_reset_frontend(frontend, backend, be_state);
+	kfree(backend);
+out:
+	kfree(frontend);
+}
+
+/*
+ * Walk device/<class>/<dev> in xenstore and run xenbus_check_frontend()
+ * on every entry.  Classes whose directory cannot be listed are skipped;
+ * if "device" itself cannot be listed nothing is done.
+ */
+static void xenbus_reset_state(void)
+{
+	char **devclass, **dev;
+	/* xenbus_directory() reports its count through an unsigned int. */
+	unsigned int devclass_n, dev_n;
+	unsigned int i, j;
+
+	devclass = xenbus_directory(XBT_NIL, "device", "", &devclass_n);
+	if (IS_ERR(devclass))
+		return;
+
+	for (i = 0; i < devclass_n; i++) {
+		dev = xenbus_directory(XBT_NIL, "device", devclass[i], &dev_n);
+		if (IS_ERR(dev))
+			continue;
+		for (j = 0; j < dev_n; j++)
+			xenbus_check_frontend(devclass[i], dev[j]);
+		kfree(dev);
+	}
+	kfree(devclass);
+}
+
+/*
+ * Xenstore notifier callback: in an HVM domain first reset frontends left
+ * in Connected or Closed state, then probe the devices listed in xenstore
+ * and register fe_watch to follow later changes.
+ * None of the arguments are used; the result of register_xenbus_watch()
+ * is not checked.  Always returns NOTIFY_DONE.
+ */
+static int frontend_probe_and_watch(struct notifier_block *notifier,
+				   unsigned long event,
+				   void *data)
+{
+	/* reset devices in Connected or Closed state */
+	if (xen_hvm_domain())
+		xenbus_reset_state();
+	/* Enumerate devices in xenstore and watch for changes. */
+	xenbus_probe_devices(&xenbus_frontend);
+	register_xenbus_watch(&fe_watch);
+
+	return NOTIFY_DONE;
+}
+
+
+/*
+ * subsys initcall: register the frontend bus with the driver core and, if
+ * that worked, hook frontend_probe_and_watch() into the xenstore notifier
+ * chain.  Returns 0 or the bus_register() error.
+ */
+static int __init xenbus_probe_frontend_init(void)
+{
+	static struct notifier_block xenstore_notifier = {
+		.notifier_call = frontend_probe_and_watch
+	};
+	int rc;
+
+	DPRINTK("");
+
+	/* Register ourselves with the kernel bus subsystem */
+	rc = bus_register(&xenbus_frontend.bus);
+	if (!rc)
+		register_xenstore_notifier(&xenstore_notifier);
+
+	return rc;
+}
+subsys_initcall(xenbus_probe_frontend_init);
+
+#ifndef MODULE
+/*
+ * late initcall (built-in only): from now on wait_for_devices() is allowed
+ * to block, and a first wait covering all drivers is done right here.
+ * Returns -ENODEV when there are no PV devices, 0 otherwise.
+ */
+static int __init boot_wait_for_devices(void)
+{
+	if (xen_has_pv_devices()) {
+		ready_to_wait_for_devices = 1;
+		wait_for_devices(NULL);
+		return 0;
+	}
+
+	return -ENODEV;
+}
+
+late_initcall(boot_wait_for_devices);
+#endif
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
new file mode 100644
index 000000000..12e02eb01
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -0,0 +1,956 @@
+/******************************************************************************
+ * xenbus_xs.c
+ *
+ * This is the kernel equivalent of the "xs" library.  We don't need everything
+ * and we use xenbus_comms for communication.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/unistd.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/fcntl.h>
+#include <linux/kthread.h>
+#include <linux/reboot.h>
+#include <linux/rwsem.h>
+#include <linux/mutex.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/xen.h>
+#include "xenbus.h"
+
+/*
+ * Framework to protect suspend/resume handling against normal Xenstore
+ * message handling:
+ * During suspend/resume there must be no open transaction and no pending
+ * Xenstore request.
+ * New watch events happening in this time can be ignored by firing all watches
+ * after resume.
+ */
+
+/* Lock protecting enter/exit critical region. */
+static DEFINE_SPINLOCK(xs_state_lock);
+/* Number of users in critical region (protected by xs_state_lock). */
+static unsigned int xs_state_users;
+/* Suspend handler waiting or already active (protected by xs_state_lock)? */
+static int xs_suspend_active;
+/* Unique Xenstore request id (protected by xs_state_lock). */
+static uint32_t xs_request_id;
+
+/* Wait queue for all callers waiting for critical region to become usable. */
+static DECLARE_WAIT_QUEUE_HEAD(xs_state_enter_wq);
+/* Wait queue for suspend handling waiting for critical region being empty. */
+static DECLARE_WAIT_QUEUE_HEAD(xs_state_exit_wq);
+
+/* List of registered watches, and a lock to protect it. */
+static LIST_HEAD(watches);
+static DEFINE_SPINLOCK(watches_lock);
+
+/* List of pending watch callback events, and a lock to protect it. */
+static LIST_HEAD(watch_events);
+static DEFINE_SPINLOCK(watch_events_lock);
+
+/* Protect watch (de)register against save/restore. */
+static DECLARE_RWSEM(xs_watch_rwsem);
+
+/*
+ * Details of the xenwatch callback kernel thread. The thread waits on the
+ * watch_events_waitq for work to do (queued on watch_events list). When it
+ * wakes up it acquires the xenwatch_mutex before reading the list and
+ * carrying out work.
+ */
+static pid_t xenwatch_pid;
+static DEFINE_MUTEX(xenwatch_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
+
+/*
+ * Announce a suspend and wait for the critical region to drain:
+ * xs_suspend_active is raised under xs_state_lock, then the caller sleeps
+ * on xs_state_exit_wq until xs_state_users has dropped to zero.
+ */
+static void xs_suspend_enter(void)
+{
+	spin_lock(&xs_state_lock);
+	xs_suspend_active++;
+	spin_unlock(&xs_state_lock);
+	wait_event(xs_state_exit_wq, xs_state_users == 0);
+}
+
+/*
+ * Counterpart of xs_suspend_enter(): bump xb_dev_generation_id, lower
+ * xs_suspend_active under xs_state_lock and wake every caller blocked on
+ * xs_state_enter_wq.
+ */
+static void xs_suspend_exit(void)
+{
+	xb_dev_generation_id++;
+	spin_lock(&xs_state_lock);
+	xs_suspend_active--;
+	spin_unlock(&xs_state_lock);
+	wake_up_all(&xs_state_enter_wq);
+}
+
+/*
+ * Enter the critical region for one request and hand out its request id.
+ *
+ * The request type is latched into req->type first.  While a suspend is
+ * active and nobody is inside the region, the caller waits on
+ * xs_state_enter_wq.  A transaction start that is not a user request
+ * counts twice in xs_state_users; the extra count is dropped again in
+ * xs_request_exit().
+ *
+ * Returns the next value of xs_request_id.
+ */
+static uint32_t xs_request_enter(struct xb_req_data *req)
+{
+	uint32_t rq_id;
+
+	req->type = req->msg.type;
+
+	spin_lock(&xs_state_lock);
+
+	/* New entries are held off only while the region is empty. */
+	while (!xs_state_users && xs_suspend_active) {
+		spin_unlock(&xs_state_lock);
+		wait_event(xs_state_enter_wq, xs_suspend_active == 0);
+		spin_lock(&xs_state_lock);
+	}
+
+	if (req->type == XS_TRANSACTION_START && !req->user_req)
+		xs_state_users++;
+	xs_state_users++;
+	rq_id = xs_request_id++;
+
+	spin_unlock(&xs_state_lock);
+
+	return rq_id;
+}
+
+/*
+ * Leave the critical region for @req.
+ *
+ * One user count is always dropped.  A second one is dropped when a
+ * transaction start came back as XS_ERROR, or when a non-user transaction
+ * end completed - except if that end failed with "ENOENT", which only
+ * triggers a one-time warning.
+ *
+ * The suspend waiter is woken when a suspend is pending and no users
+ * remain; note that this last check reads both counters after
+ * xs_state_lock has been released.
+ */
+void xs_request_exit(struct xb_req_data *req)
+{
+	spin_lock(&xs_state_lock);
+	xs_state_users--;
+	if ((req->type == XS_TRANSACTION_START && req->msg.type == XS_ERROR) ||
+	    (req->type == XS_TRANSACTION_END && !req->user_req &&
+	     !WARN_ON_ONCE(req->msg.type == XS_ERROR &&
+			   !strcmp(req->body, "ENOENT"))))
+		xs_state_users--;
+	spin_unlock(&xs_state_lock);
+
+	if (xs_suspend_active && !xs_state_users)
+		wake_up(&xs_state_exit_wq);
+}
+
+/*
+ * Translate an error string from xenstore into an errno value by looking
+ * it up in xsd_errors[].  The result is positive; an unknown string is
+ * logged and mapped to EINVAL.
+ */
+static int get_error(const char *errorstring)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(xsd_errors); idx++) {
+		if (!strcmp(errorstring, xsd_errors[idx].errstring))
+			return xsd_errors[idx].errnum;
+	}
+
+	pr_warn("xen store gave: unknown error %s\n", errorstring);
+	return EINVAL;
+}
+
+/*
+ * Tell whether talking to xenstore still makes sense.
+ *
+ * A PV or HVM (remote) store is always considered usable.  A local store
+ * is usable unless the system is powering off, restarting or halting.
+ * Any other store type yields false.
+ */
+static bool xenbus_ok(void)
+{
+	switch (xen_store_domain_type) {
+	case XS_PV:
+	case XS_HVM:
+		/* FIXME: Could check that the remote domain is alive,
+		 * but it is normally initial domain. */
+		return true;
+	case XS_LOCAL:
+		break;
+	default:
+		return false;
+	}
+
+	/* Local xenstored: gone once the system is going down. */
+	switch (system_state) {
+	case SYSTEM_POWER_OFF:
+	case SYSTEM_RESTART:
+	case SYSTEM_HALT:
+		return false;
+	default:
+		return true;
+	}
+}
+
+/*
+ * Wait condition for read_reply(): true once the reply has arrived or
+ * xenstore is no longer usable (see xenbus_ok()).
+ */
+static bool test_reply(struct xb_req_data *req)
+{
+	if (req->state == xb_req_state_got_reply || !xenbus_ok()) {
+		/* read req->state before all other fields */
+		virt_rmb();
+		return true;
+	}
+
+	/* Make sure to reread req->state each time. */
+	barrier();
+
+	return false;
+}
+
+/*
+ * Block until @req has been answered.
+ *
+ * Returns req->body on success, ERR_PTR(-EIO) when xenstore is no longer
+ * usable, or ERR_PTR(req->err) when an error was recorded in the request.
+ */
+static void *read_reply(struct xb_req_data *req)
+{
+	do {
+		wait_event(req->wq, test_reply(req));
+
+		if (!xenbus_ok())
+			/*
+			 * If we are in the process of being shut-down there is
+			 * no point of trying to contact XenBus - it is either
+			 * killed (xenstored application) or the other domain
+			 * has been killed or is unreachable.
+			 */
+			return ERR_PTR(-EIO);
+		if (req->err)
+			return ERR_PTR(req->err);
+
+	} while (req->state != xb_req_state_got_reply);
+
+	return req->body;
+}
+
+/*
+ * Queue @req for transmission.
+ *
+ * The message header is copied into the request, the request is marked
+ * queued, and the caller's req_id is swapped for a fresh one from
+ * xs_request_enter() (which may block during suspend).  The request is
+ * then appended to xb_write_list; xb_waitq is woken only when the list
+ * holds exactly this one entry afterwards.
+ */
+static void xs_send(struct xb_req_data *req, struct xsd_sockmsg *msg)
+{
+	bool notify;
+
+	req->msg = *msg;
+	req->err = 0;
+	req->state = xb_req_state_queued;
+	init_waitqueue_head(&req->wq);
+
+	/* Save the caller req_id and restore it later in the reply */
+	req->caller_req_id = req->msg.req_id;
+	req->msg.req_id = xs_request_enter(req);
+
+	mutex_lock(&xb_write_mutex);
+	list_add_tail(&req->list, &xb_write_list);
+	notify = list_is_singular(&xb_write_list);
+	mutex_unlock(&xb_write_mutex);
+
+	if (notify)
+		wake_up(&xb_waitq);
+}
+
+/*
+ * Wait for the reply to @req, leave the critical region and copy the
+ * reply's type and length into @msg.
+ *
+ * If the request is still queued or waiting for its reply (the wait ended
+ * early), it is only marked aborted here - presumably whoever still holds
+ * it frees it later (not visible in this function).  Otherwise the
+ * request structure is freed.
+ *
+ * Returns the reply body or an ERR_PTR() from read_reply().
+ */
+static void *xs_wait_for_reply(struct xb_req_data *req, struct xsd_sockmsg *msg)
+{
+	void *ret;
+
+	ret = read_reply(req);
+
+	xs_request_exit(req);
+
+	msg->type = req->msg.type;
+	msg->len = req->msg.len;
+
+	mutex_lock(&xb_write_mutex);
+	if (req->state == xb_req_state_queued ||
+	    req->state == xb_req_state_wait_reply)
+		req->state = xb_req_state_aborted;
+	else
+		kfree(req);
+	mutex_unlock(&xb_write_mutex);
+
+	return ret;
+}
+
+/* Completion callback for in-kernel requests: wake the waiter in read_reply(). */
+static void xs_wake_up(struct xb_req_data *req)
+{
+	wake_up(&req->wq);
+}
+
+/*
+ * Queue a request on behalf of a user (req->user_req is set).
+ *
+ * @msg: message header; the payload of msg->len bytes is expected to
+ *       follow it directly in memory.
+ * @par: opaque value stored in the request for the reply callback
+ *       xenbus_dev_queue_reply().
+ *
+ * The request and its single kvec are allocated in one block.  The call
+ * does not wait for the reply.  Returns 0 once queued, -ENOMEM otherwise.
+ */
+int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void *par)
+{
+	struct xb_req_data *req;
+	struct kvec *payload;
+
+	req = kmalloc(sizeof(*req) + sizeof(*payload), GFP_KERNEL);
+	if (req == NULL)
+		return -ENOMEM;
+
+	/* The kvec lives right behind the request structure. */
+	payload = (struct kvec *)(req + 1);
+	payload->iov_base = msg + 1;
+	payload->iov_len = msg->len;
+
+	req->vec = payload;
+	req->num_vecs = 1;
+	req->user_req = true;
+	req->cb = xenbus_dev_queue_reply;
+	req->par = par;
+
+	xs_send(req, msg);
+
+	return 0;
+}
+EXPORT_SYMBOL(xenbus_dev_request_and_reply);
+
+/*
+ * Send message to xs, get kmalloc'ed reply.  ERR_PTR() on error.
+ *
+ * @t:        transaction the request belongs to.
+ * @type:     request type; the reply must carry the same type.
+ * @iovec:    payload fragments.
+ * @num_vecs: number of entries in @iovec.
+ * @len:      if non-NULL, receives the reply length.
+ *
+ * An XS_ERROR reply is turned into the matching negative errno; a reply
+ * of any other unexpected type gives -EINVAL.
+ */
+static void *xs_talkv(struct xenbus_transaction t,
+		      enum xsd_sockmsg_type type,
+		      const struct kvec *iovec,
+		      unsigned int num_vecs,
+		      unsigned int *len)
+{
+	struct xb_req_data *req;
+	struct xsd_sockmsg msg;
+	unsigned int idx;
+	void *reply;
+
+	req = kmalloc(sizeof(*req), GFP_NOIO | __GFP_HIGH);
+	if (req == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	req->vec = iovec;
+	req->num_vecs = num_vecs;
+	req->cb = xs_wake_up;
+	req->user_req = false;
+
+	msg.req_id = 0;
+	msg.tx_id = t.id;
+	msg.type = type;
+	msg.len = 0;
+	for (idx = 0; idx < num_vecs; idx++)
+		msg.len += iovec[idx].iov_len;
+
+	xs_send(req, &msg);
+
+	reply = xs_wait_for_reply(req, &msg);
+	if (len != NULL)
+		*len = msg.len;
+
+	if (IS_ERR(reply))
+		return reply;
+
+	if (msg.type == XS_ERROR) {
+		int errnum = get_error(reply);
+
+		kfree(reply);
+		return ERR_PTR(-errnum);
+	}
+
+	if (msg.type != type) {
+		pr_warn_ratelimited("unexpected type [%d], expected [%d]\n",
+				    msg.type, type);
+		kfree(reply);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return reply;
+}
+
+/*
+ * Simplified version of xs_talkv: the payload is one NUL-terminated
+ * string, sent including its terminator.
+ */
+static void *xs_single(struct xenbus_transaction t,
+		       enum xsd_sockmsg_type type,
+		       const char *string,
+		       unsigned int *len)
+{
+	struct kvec payload = {
+		.iov_base = (void *)string,
+		.iov_len = strlen(string) + 1,
+	};
+
+	return xs_talkv(t, type, &payload, 1, len);
+}
+
+/*
+ * Many commands only need an ack, don't care what it says: free a good
+ * reply and return 0, or return the errno carried by an ERR_PTR() reply.
+ */
+static int xs_error(char *reply)
+{
+	if (!IS_ERR(reply)) {
+		kfree(reply);
+		return 0;
+	}
+
+	return PTR_ERR(reply);
+}
+
+/*
+ * Count the NUL-terminated strings packed back to back in the first
+ * @len bytes of @strings.
+ */
+static unsigned int count_strings(const char *strings, unsigned int len)
+{
+	const char *end = strings + len;
+	const char *cur = strings;
+	unsigned int total = 0;
+
+	while (cur < end) {
+		total++;
+		cur += strlen(cur) + 1;
+	}
+
+	return total;
+}
+
+/*
+ * Return the path to dir with /name appended (just dir when name is
+ * empty).  Buffer must be kfree()'ed.  ERR_PTR(-ENOMEM) on allocation
+ * failure.
+ */
+static char *join(const char *dir, const char *name)
+{
+	char *path;
+
+	if (name[0] == '\0')
+		path = kasprintf(GFP_NOIO | __GFP_HIGH, "%s", dir);
+	else
+		path = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/%s", dir, name);
+
+	if (path == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	return path;
+}
+
+/*
+ * Turn a buffer of packed NUL-terminated strings into a pointer array.
+ *
+ * @strings: kmalloc'ed buffer; always freed by this function.
+ * @len:     number of bytes in @strings.
+ * @num:     receives the number of strings.
+ *
+ * The result is a single allocation holding the pointers followed by a
+ * copy of the string data, so one kfree() releases everything.
+ * Returns ERR_PTR(-ENOMEM) if that allocation fails.
+ */
+static char **split(char *strings, unsigned int len, unsigned int *num)
+{
+	unsigned int count = count_strings(strings, len);
+	char **vec, *data, *cur;
+
+	*num = count;
+
+	/* Transfer to one big alloc for easy freeing. */
+	vec = kmalloc(count * sizeof(char *) + len, GFP_NOIO | __GFP_HIGH);
+	if (vec == NULL) {
+		kfree(strings);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	data = (char *)&vec[count];
+	memcpy(data, strings, len);
+	kfree(strings);
+
+	/* Point each slot at its string inside the copied data. */
+	count = 0;
+	for (cur = data; cur < data + len; cur += strlen(cur) + 1)
+		vec[count++] = cur;
+	*num = count;
+
+	return vec;
+}
+
+/*
+ * List the entries of the xenstore directory @dir/@node.
+ *
+ * @num receives the number of entries.  Returns a kmalloc'ed array of
+ * strings that is released with a single kfree(), or an ERR_PTR().
+ */
+char **xenbus_directory(struct xenbus_transaction t,
+			const char *dir, const char *node, unsigned int *num)
+{
+	unsigned int reply_len;
+	char *full_path, *reply;
+
+	full_path = join(dir, node);
+	if (IS_ERR(full_path))
+		return (char **)full_path;
+
+	reply = xs_single(t, XS_DIRECTORY, full_path, &reply_len);
+	kfree(full_path);
+
+	if (IS_ERR(reply))
+		return (char **)reply;
+
+	return split(reply, reply_len, num);
+}
+EXPORT_SYMBOL_GPL(xenbus_directory);
+
+/*
+ * Check if a path exists. Return 1 if it does, 0 if listing @dir/@node
+ * fails for any reason.
+ */
+int xenbus_exists(struct xenbus_transaction t,
+		  const char *dir, const char *node)
+{
+	char **d;
+	/* xenbus_directory() reports its count through an unsigned int. */
+	unsigned int dir_n;
+
+	d = xenbus_directory(t, dir, node, &dir_n);
+	if (IS_ERR(d))
+		return 0;
+	kfree(d);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(xenbus_exists);
+
+/* Get the value of a single file.
+ * Returns a kmalloced value: call kfree() on it after use.
+ * len indicates length in bytes.
+ * On failure an ERR_PTR() is returned instead.
+ */
+void *xenbus_read(struct xenbus_transaction t,
+		  const char *dir, const char *node, unsigned int *len)
+{
+	char *full_path = join(dir, node);
+	void *value;
+
+	if (IS_ERR(full_path))
+		return (void *)full_path;
+
+	value = xs_single(t, XS_READ, full_path, len);
+	kfree(full_path);
+
+	return value;
+}
+EXPORT_SYMBOL_GPL(xenbus_read);
+
+/* Write the value of a single file.
+ * The path is sent with its terminating NUL, the value without one.
+ * Returns -err on failure.
+ */
+int xenbus_write(struct xenbus_transaction t,
+		 const char *dir, const char *node, const char *string)
+{
+	struct kvec parts[2];
+	char *full_path;
+	int err;
+
+	full_path = join(dir, node);
+	if (IS_ERR(full_path))
+		return PTR_ERR(full_path);
+
+	parts[0].iov_base = full_path;
+	parts[0].iov_len = strlen(full_path) + 1;
+	parts[1].iov_base = (void *)string;
+	parts[1].iov_len = strlen(string);
+
+	err = xs_error(xs_talkv(t, XS_WRITE, parts, ARRAY_SIZE(parts), NULL));
+	kfree(full_path);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_write);
+
+/* Create a new directory at @dir/@node.  Returns 0 or a negative errno. */
+int xenbus_mkdir(struct xenbus_transaction t,
+		 const char *dir, const char *node)
+{
+	char *full_path = join(dir, node);
+	int err;
+
+	if (IS_ERR(full_path))
+		return PTR_ERR(full_path);
+
+	err = xs_error(xs_single(t, XS_MKDIR, full_path, NULL));
+	kfree(full_path);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_mkdir);
+
+/*
+ * Destroy a file or directory (directories must be empty).
+ * Returns 0 or a negative errno.
+ */
+int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node)
+{
+	char *full_path = join(dir, node);
+	int err;
+
+	if (IS_ERR(full_path))
+		return PTR_ERR(full_path);
+
+	err = xs_error(xs_single(t, XS_RM, full_path, NULL));
+	kfree(full_path);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_rm);
+
+/* Start a transaction: changes by others will not be seen during this
+ * transaction, and changes will not be visible to others until end.
+ * On success the id parsed from the reply is stored in @t and 0 is
+ * returned; otherwise a negative errno.
+ */
+int xenbus_transaction_start(struct xenbus_transaction *t)
+{
+	char *reply = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL);
+
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	t->id = simple_strtoul(reply, NULL, 0);
+	kfree(reply);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_transaction_start);
+
+/* End a transaction.
+ * If abort is true, transaction is discarded instead of committed
+ * ("F" is sent rather than "T").
+ * Returns 0 or a negative errno.
+ */
+int xenbus_transaction_end(struct xenbus_transaction t, int abort)
+{
+	const char *verdict = abort ? "F" : "T";
+
+	return xs_error(xs_single(t, XS_TRANSACTION_END, verdict, NULL));
+}
+EXPORT_SYMBOL_GPL(xenbus_transaction_end);
+
+/*
+ * Single read and scanf: returns -errno or num scanned.
+ * A value that is read but yields no conversion at all gives -ERANGE.
+ */
+int xenbus_scanf(struct xenbus_transaction t,
+		 const char *dir, const char *node, const char *fmt, ...)
+{
+	va_list args;
+	char *value;
+	int matched;
+
+	value = xenbus_read(t, dir, node, NULL);
+	if (IS_ERR(value))
+		return PTR_ERR(value);
+
+	va_start(args, fmt);
+	matched = vsscanf(value, fmt, args);
+	va_end(args);
+	kfree(value);
+
+	/* Distinctive errno. */
+	return matched == 0 ? -ERANGE : matched;
+}
+EXPORT_SYMBOL_GPL(xenbus_scanf);
+
+/*
+ * Read an (optional) unsigned value from @dir/@node outside of any
+ * transaction; @default_val is returned when the node cannot be read or
+ * parsed.
+ */
+unsigned int xenbus_read_unsigned(const char *dir, const char *node,
+				  unsigned int default_val)
+{
+	unsigned int parsed;
+
+	if (xenbus_scanf(XBT_NIL, dir, node, "%u", &parsed) <= 0)
+		return default_val;
+
+	return parsed;
+}
+EXPORT_SYMBOL_GPL(xenbus_read_unsigned);
+
+/*
+ * Single printf and write: returns -errno or 0.
+ * The formatted text lives in a temporary buffer for the duration of the
+ * write only.
+ */
+int xenbus_printf(struct xenbus_transaction t,
+		  const char *dir, const char *node, const char *fmt, ...)
+{
+	va_list args;
+	char *text;
+	int err;
+
+	va_start(args, fmt);
+	text = kvasprintf(GFP_NOIO | __GFP_HIGH, fmt, args);
+	va_end(args);
+
+	if (text == NULL)
+		return -ENOMEM;
+
+	err = xenbus_write(t, dir, node, text);
+	kfree(text);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_printf);
+
+/*
+ * Takes tuples of names, scanf-style args, and void **, NULL terminated.
+ *
+ * Each tuple is (name, fmt, result).  The node @dir/name is read; with a
+ * non-NULL fmt the value is parsed into *result via sscanf() and the
+ * buffer is freed, with a NULL fmt the kmalloc'ed string itself is stored
+ * in *(char **)result and ownership passes to the caller.
+ *
+ * Processing stops at the first failure: a read error is returned as is,
+ * a value that matches nothing gives -EINVAL.  Strings already handed out
+ * for earlier tuples are not freed here.  Returns 0 on success.
+ */
+int xenbus_gather(struct xenbus_transaction t, const char *dir, ...)
+{
+	va_list ap;
+	const char *name;
+	int ret = 0;
+
+	va_start(ap, dir);
+	while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
+		const char *fmt = va_arg(ap, char *);
+		void *result = va_arg(ap, void *);
+		char *p;
+
+		p = xenbus_read(t, dir, name, NULL);
+		if (IS_ERR(p)) {
+			ret = PTR_ERR(p);
+			break;
+		}
+		if (fmt) {
+			if (sscanf(p, fmt, result) == 0)
+				ret = -EINVAL;
+			kfree(p);
+		} else
+			*(char **)result = p;
+	}
+	va_end(ap);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(xenbus_gather);
+
+/*
+ * Send an XS_WATCH request for @path, identified by @token.  Both strings
+ * go out with their terminating NUL.  Returns 0 or a negative errno.
+ */
+static int xs_watch(const char *path, const char *token)
+{
+	struct kvec iov[] = {
+		{ .iov_base = (void *)path,  .iov_len = strlen(path) + 1 },
+		{ .iov_base = (void *)token, .iov_len = strlen(token) + 1 },
+	};
+
+	return xs_error(xs_talkv(XBT_NIL, XS_WATCH, iov,
+				 ARRAY_SIZE(iov), NULL));
+}
+
+/*
+ * Send an XS_UNWATCH request for @path / @token.  Both strings go out with
+ * their terminating NUL.  Returns 0 or a negative errno.
+ */
+static int xs_unwatch(const char *path, const char *token)
+{
+	struct kvec iov[] = {
+		{ .iov_base = (char *)path,  .iov_len = strlen(path) + 1 },
+		{ .iov_base = (char *)token, .iov_len = strlen(token) + 1 },
+	};
+
+	return xs_error(xs_talkv(XBT_NIL, XS_UNWATCH, iov,
+				 ARRAY_SIZE(iov), NULL));
+}
+
+/*
+ * Map a watch token back to its registered watch.  The token is the watch
+ * pointer printed in hex; it is only returned if that very pointer is on
+ * the watches list, NULL otherwise.
+ */
+static struct xenbus_watch *find_watch(const char *token)
+{
+	struct xenbus_watch *wanted = (void *)simple_strtoul(token, NULL, 16);
+	struct xenbus_watch *cur;
+
+	list_for_each_entry(cur, &watches, list) {
+		if (cur == wanted)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Handle an incoming watch event.  This function takes ownership of
+ * @event: it is either queued on watch_events or freed.
+ *
+ * The body must consist of exactly two strings, path then token;
+ * otherwise -EINVAL is returned.  The event is queued (and
+ * watch_events_waitq woken) only if the token maps to a registered watch
+ * and that watch's optional will_handle() hook accepts it.
+ * Returns 0 in all other cases.
+ */
+int xs_watch_msg(struct xs_watch_event *event)
+{
+	if (count_strings(event->body, event->len) != 2) {
+		kfree(event);
+		return -EINVAL;
+	}
+	event->path = (const char *)event->body;
+	/* The token starts right after the path's terminating NUL. */
+	event->token = (const char *)strchr(event->body, '\0') + 1;
+
+	spin_lock(&watches_lock);
+	event->handle = find_watch(event->token);
+	if (event->handle != NULL &&
+			(!event->handle->will_handle ||
+			 event->handle->will_handle(event->handle,
+				 event->path, event->token))) {
+		spin_lock(&watch_events_lock);
+		list_add_tail(&event->list, &watch_events);
+		event->handle->nr_pending++;
+		wake_up(&watch_events_waitq);
+		spin_unlock(&watch_events_lock);
+	} else
+		kfree(event);
+	spin_unlock(&watches_lock);
+
+	return 0;
+}
+
+/*
+ * Certain older XenBus toolstacks cannot handle reads of values that are
+ * not populated.  Some Xen 3.4 installations are affected, so if we are
+ * running on anything older than Xen 4 do not attempt to read
+ * control/platform-feature-xs_reset_watches.
+ *
+ * Returns true when the quirk applies.  Only x86 is checked; elsewhere
+ * the result is always false.
+ */
+static bool xen_strict_xenbus_quirk(void)
+{
+#ifdef CONFIG_X86
+	uint32_t eax, ebx, ecx, edx;
+	uint32_t base = xen_cpuid_base();
+
+	cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+	/* The upper half of EAX is what gets compared against 4. */
+	return (eax >> 16) < 4;
+#else
+	return false;
+#endif
+}
+/*
+ * Issue XS_RESET_WATCHES, if applicable.
+ *
+ * Nothing is sent unless this is an HVM domain other than the initial
+ * domain, the strict-xenbus quirk does not apply, and the node
+ * control/platform-feature-xs_reset_watches reads as non-zero.
+ * A failure other than -EEXIST is logged; nothing is returned.
+ */
+static void xs_reset_watches(void)
+{
+	int rc;
+
+	if (!xen_hvm_domain() || xen_initial_domain() ||
+	    xen_strict_xenbus_quirk())
+		return;
+
+	if (!xenbus_read_unsigned("control",
+				  "platform-feature-xs_reset_watches", 0))
+		return;
+
+	rc = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL));
+	if (rc == 0 || rc == -EEXIST)
+		return;
+
+	pr_warn("xs_reset_watches failed: %d\n", rc);
+}
+
+/*
+ * Register callback to watch this node.
+ *
+ * Adds @watch to the local watches list and sends a watch request for
+ * watch->node, using the address of @watch printed in hex as the token.
+ * If the request fails, the watch is taken off the list again.
+ *
+ * Returns 0 on success or the error reported by xs_watch().
+ */
+int register_xenbus_watch(struct xenbus_watch *watch)
+{
+	/* Pointer in ascii is the token. */
+	char token[sizeof(watch) * 2 + 1];
+	int err;
+
+	sprintf(token, "%lX", (long)watch);
+
+	watch->nr_pending = 0;
+
+	/* Read side of the rwsem that xs_suspend() takes for writing. */
+	down_read(&xs_watch_rwsem);
+
+	spin_lock(&watches_lock);
+	/* Registering a watch that is already on the list is a fatal bug. */
+	BUG_ON(find_watch(token));
+	list_add(&watch->list, &watches);
+	spin_unlock(&watches_lock);
+
+	err = xs_watch(watch->node, token);
+
+	if (err) {
+		/* The request failed: undo the list insertion. */
+		spin_lock(&watches_lock);
+		list_del(&watch->list);
+		spin_unlock(&watches_lock);
+	}
+
+	up_read(&xs_watch_rwsem);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(register_xenbus_watch);
+
+/*
+ * Remove @watch from the local watches list, send an unwatch request for
+ * it and discard any of its events still queued on watch_events.
+ *
+ * A failing unwatch request is only logged.  Unless called from the
+ * xenwatch thread itself, xenwatch_mutex is taken while the queued events
+ * are cancelled.
+ */
+void unregister_xenbus_watch(struct xenbus_watch *watch)
+{
+	struct xs_watch_event *event, *tmp;
+	/* Same token format as used when the watch was registered. */
+	char token[sizeof(watch) * 2 + 1];
+	int err;
+
+	sprintf(token, "%lX", (long)watch);
+
+	down_read(&xs_watch_rwsem);
+
+	spin_lock(&watches_lock);
+	/* Unregistering a watch that is not on the list is a fatal bug. */
+	BUG_ON(!find_watch(token));
+	list_del(&watch->list);
+	spin_unlock(&watches_lock);
+
+	err = xs_unwatch(watch->node, token);
+	if (err)
+		pr_warn("Failed to release watch %s: %i\n", watch->node, err);
+
+	up_read(&xs_watch_rwsem);
+
+	/* Make sure there are no callbacks running currently (unless
+	   it's us) */
+	if (current->pid != xenwatch_pid)
+		mutex_lock(&xenwatch_mutex);
+
+	/* Cancel pending watch events. */
+	spin_lock(&watch_events_lock);
+	if (watch->nr_pending) {
+		/* _safe variant: entries are deleted while walking the list. */
+		list_for_each_entry_safe(event, tmp, &watch_events, list) {
+			if (event->handle != watch)
+				continue;
+			list_del(&event->list);
+			kfree(event);
+		}
+		watch->nr_pending = 0;
+	}
+	spin_unlock(&watch_events_lock);
+
+	if (current->pid != xenwatch_pid)
+		mutex_unlock(&xenwatch_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
+
+/*
+ * Prepare for suspend: call xs_suspend_enter(), then take xs_watch_rwsem
+ * for writing and xs_response_mutex.  Both are still held on return and
+ * are released by xs_resume() or xs_suspend_cancel().
+ */
+void xs_suspend(void)
+{
+	xs_suspend_enter();
+
+	down_write(&xs_watch_rwsem);
+	mutex_lock(&xs_response_mutex);
+}
+
+/*
+ * Undo xs_suspend() after a resume: re-run xb_init_comms(), release
+ * xs_response_mutex, call xs_suspend_exit() and re-send a watch request
+ * for every watch on the watches list before dropping xs_watch_rwsem.
+ *
+ * The results of xb_init_comms() and of the individual xs_watch() calls
+ * are not checked.
+ */
+void xs_resume(void)
+{
+	struct xenbus_watch *watch;
+	/* Token is the watch address in hex, as in register_xenbus_watch(). */
+	char token[sizeof(watch) * 2 + 1];
+
+	xb_init_comms();
+
+	mutex_unlock(&xs_response_mutex);
+
+	xs_suspend_exit();
+
+	/* No need for watches_lock: the xs_watch_rwsem is sufficient. */
+	list_for_each_entry(watch, &watches, list) {
+		sprintf(token, "%lX", (long)watch);
+		xs_watch(watch->node, token);
+	}
+
+	up_write(&xs_watch_rwsem);
+}
+
+/*
+ * Back out of xs_suspend(): release xs_response_mutex and
+ * xs_watch_rwsem, then call xs_suspend_exit().  Unlike xs_resume(), no
+ * communication re-initialisation or watch re-registration is done.
+ */
+void xs_suspend_cancel(void)
+{
+	mutex_unlock(&xs_response_mutex);
+	up_write(&xs_watch_rwsem);
+
+	xs_suspend_exit();
+}
+
+/*
+ * Kernel thread body that delivers queued watch events.
+ *
+ * Records its pid in xenwatch_pid, then loops: sleep until watch_events
+ * is non-empty, dequeue the first event and invoke the callback of the
+ * watch it belongs to.  xenwatch_mutex is held around the dequeue and the
+ * callback.  The loop ends once kthread_should_stop() is true.
+ */
+static int xenwatch_thread(void *unused)
+{
+	struct xs_watch_event *event;
+
+	/* unregister_xenbus_watch() compares against this pid. */
+	xenwatch_pid = current->pid;
+
+	for (;;) {
+		wait_event_interruptible(watch_events_waitq,
+					 !list_empty(&watch_events));
+
+		if (kthread_should_stop())
+			break;
+
+		mutex_lock(&xenwatch_mutex);
+
+		/* The list may be empty again by now, hence _or_null. */
+		spin_lock(&watch_events_lock);
+		event = list_first_entry_or_null(&watch_events,
+				struct xs_watch_event, list);
+		if (event) {
+			list_del(&event->list);
+			event->handle->nr_pending--;
+		}
+		spin_unlock(&watch_events_lock);
+
+		/* The callback runs without watch_events_lock held. */
+		if (event) {
+			event->handle->callback(event->handle, event->path,
+						event->token);
+			kfree(event);
+		}
+
+		mutex_unlock(&xenwatch_mutex);
+	}
+
+	return 0;
+}
+
+/*
+ * Wake up all threads waiting for a xenstore reply. In case of shutdown all
+ * pending replies will be marked as "aborted" in order to let the waiters
+ * return in spite of xenstore possibly no longer being able to reply. This
+ * will avoid blocking shutdown by a thread waiting for xenstore but being
+ * necessary for shutdown processing to proceed.
+ */
+static int xs_reboot_notify(struct notifier_block *nb,
+			    unsigned long code, void *unused)
+{
+	struct xb_req_data *pending;
+
+	mutex_lock(&xb_write_mutex);
+
+	/* Wake the waiter of every request on the reply list ... */
+	list_for_each_entry(pending, &xs_reply_list, list) {
+		wake_up(&pending->wq);
+	}
+
+	/* ... and of every request on the write list. */
+	list_for_each_entry(pending, &xb_write_list, list) {
+		wake_up(&pending->wq);
+	}
+
+	mutex_unlock(&xb_write_mutex);
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block registered by xs_init() via register_reboot_notifier(). */
+static struct notifier_block xs_reboot_nb = {
+	.notifier_call = xs_reboot_notify,
+};
+
+/*
+ * Bring up the xenstore client: register the reboot notifier, initialise
+ * communication via xb_init_comms(), start the "xenwatch" thread and
+ * finally reset stale watches (relevant for a kexec boot).
+ *
+ * Returns 0 on success or a negative error from xb_init_comms() or
+ * kthread_run().  The reboot notifier stays registered on failure.
+ */
+int xs_init(void)
+{
+	struct task_struct *watcher;
+	int rc;
+
+	register_reboot_notifier(&xs_reboot_nb);
+
+	/* Initialize the shared memory rings to talk to xenstored */
+	rc = xb_init_comms();
+	if (rc != 0)
+		return rc;
+
+	watcher = kthread_run(xenwatch_thread, NULL, "xenwatch");
+	if (IS_ERR(watcher))
+		return PTR_ERR(watcher);
+
+	/* shutdown watches for kexec boot */
+	xs_reset_watches();
+	return 0;
+}
diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile
new file mode 100644
index 000000000..1a83010dd
--- /dev/null
+++ b/drivers/xen/xenfs/Makefile
@@ -0,0 +1,5 @@
+# xenfs.o always contains super.o; xenstored.o and xensyms.o are added
+# only when CONFIG_XEN_DOM0 and CONFIG_XEN_SYMS respectively are set.
+obj-$(CONFIG_XENFS) += xenfs.o
+
+xenfs-y			  = super.o
+xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
+xenfs-$(CONFIG_XEN_SYMS) += xensyms.o
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
new file mode 100644
index 000000000..71ddfb4cf
--- /dev/null
+++ b/drivers/xen/xenfs/super.c
@@ -0,0 +1,101 @@
+/*
+ *  xenfs.c - a filesystem for passing info between a domain and
+ *  the hypervisor.
+ *
+ * 2008-10-07  Alex Zeffertt    Replaced /proc/xen/xenbus with xenfs filesystem
+ *                              and /proc/xen compatibility mount point.
+ *                              Turned xenfs into a loadable module.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/magic.h>
+
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+
+#include "xenfs.h"
+#include "../privcmd.h"
+
+#include <asm/xen/hypervisor.h>
+
+MODULE_DESCRIPTION("Xen filesystem");
+MODULE_LICENSE("GPL");
+
+/*
+ * read() handler of the "capabilities" file: yields "control_d\n" in the
+ * initial domain and an empty file in any other domain.
+ */
+static ssize_t capabilities_read(struct file *file, char __user *buf,
+				 size_t size, loff_t *off)
+{
+	const char *caps = xen_initial_domain() ? "control_d\n" : "";
+
+	return simple_read_from_buffer(buf, size, off, caps, strlen(caps));
+}
+
+/* File operations of the read-only "capabilities" file. */
+static const struct file_operations capabilities_file_ops = {
+	.read = capabilities_read,
+	.llseek = default_llseek,
+};
+
+/*
+ * Populate the xenfs superblock through simple_fill_super().
+ *
+ * Two file tables exist: the initial domain gets xenfs_init_files, which
+ * adds xsd_kva, xsd_port and (with CONFIG_XEN_SYMS) xensyms to the three
+ * files every domain sees.  @data and @silent are not used.
+ */
+static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	/*
+	 * Entries start at index 2, leaving slots 0 and 1 empty.
+	 * NOTE(review): presumably those indexes are reserved by
+	 * simple_fill_super() - confirm against its implementation.
+	 */
+	static const struct tree_descr xenfs_files[] = {
+		[2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR },
+		{ "capabilities", &capabilities_file_ops, S_IRUGO },
+		{ "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },
+		{""},
+	};
+
+	/* Table used in the initial domain; a superset of xenfs_files. */
+	static const struct tree_descr xenfs_init_files[] = {
+		[2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR },
+		{ "capabilities", &capabilities_file_ops, S_IRUGO },
+		{ "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },
+		{ "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR},
+		{ "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR},
+#ifdef CONFIG_XEN_SYMS
+		{ "xensyms", &xensyms_ops, S_IRUSR},
+#endif
+		{""},
+	};
+
+	return simple_fill_super(sb, XENFS_SUPER_MAGIC,
+			xen_initial_domain() ? xenfs_init_files : xenfs_files);
+}
+
+/*
+ * .mount callback of xenfs: hands over to mount_single() with
+ * xenfs_fill_super() as the fill callback.  @dev_name is not used.
+ */
+static struct dentry *xenfs_mount(struct file_system_type *fs_type,
+				  int flags, const char *dev_name,
+				  void *data)
+{
+	return mount_single(fs_type, flags, data, xenfs_fill_super);
+}
+
+/* Filesystem type registered by xenfs_init() under the name "xenfs". */
+static struct file_system_type xenfs_type = {
+	.owner =	THIS_MODULE,
+	.name =		"xenfs",
+	.mount =	xenfs_mount,
+	.kill_sb =	kill_litter_super,
+};
+MODULE_ALIAS_FS("xenfs");
+
+/*
+ * Module init: register the xenfs filesystem type when running in a Xen
+ * domain.  Outside of Xen this is a successful no-op.
+ */
+static int __init xenfs_init(void)
+{
+	if (!xen_domain())
+		return 0;
+
+	return register_filesystem(&xenfs_type);
+}
+
+/*
+ * Module exit: unregister the xenfs filesystem type.  Mirrors
+ * xenfs_init(), so nothing is done outside of a Xen domain.
+ */
+static void __exit xenfs_exit(void)
+{
+	if (!xen_domain())
+		return;
+
+	unregister_filesystem(&xenfs_type);
+}
+
+module_init(xenfs_init);
+module_exit(xenfs_exit);
+
diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h
new file mode 100644
index 000000000..cfe4874b8
--- /dev/null
+++ b/drivers/xen/xenfs/xenfs.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XENFS_XENBUS_H
+#define _XENFS_XENBUS_H
+
+/*
+ * File operations referenced by the xenfs file tables in super.c.
+ * xsd_kva_file_ops and xsd_port_file_ops are defined in xenstored.c,
+ * xensyms_ops in xensyms.c.
+ */
+extern const struct file_operations xsd_kva_file_ops;
+extern const struct file_operations xsd_port_file_ops;
+extern const struct file_operations xensyms_ops;
+
+#endif	/* _XENFS_XENBUS_H */
diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c
new file mode 100644
index 000000000..f59235f9f
--- /dev/null
+++ b/drivers/xen/xenfs/xenstored.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+
+#include <xen/page.h>
+#include <xen/xenbus.h>
+
+#include "xenfs.h"
+
+/*
+ * Shared read() handler: file->private_data holds a NUL-terminated
+ * string prepared by the open handler, which is served as file content.
+ */
+static ssize_t xsd_read(struct file *file, char __user *buf,
+			    size_t size, loff_t *off)
+{
+	const char *text = file->private_data;
+	size_t len = strlen(text);
+
+	return simple_read_from_buffer(buf, size, off, text, len);
+}
+
+/* Shared release() handler: frees the string allocated by the open handler. */
+static int xsd_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+/*
+ * open() handler of "xsd_kva": formats xen_store_interface with "0x%p"
+ * into a freshly allocated string kept in file->private_data.
+ * Returns -ENOMEM if the allocation fails.
+ */
+static int xsd_kva_open(struct inode *inode, struct file *file)
+{
+	char *text = kasprintf(GFP_KERNEL, "0x%p", xen_store_interface);
+
+	file->private_data = text;
+
+	return text ? 0 : -ENOMEM;
+}
+
+/*
+ * mmap() handler of "xsd_kva": maps the page behind xen_store_interface
+ * into the caller's VMA.
+ *
+ * Only mappings of at most one page starting at page offset 0 are
+ * accepted (-EINVAL otherwise).  A remap_pfn_range() failure is reported
+ * as -EAGAIN.
+ */
+static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	size_t len = vma->vm_end - vma->vm_start;
+	int rc;
+
+	if (len > PAGE_SIZE || vma->vm_pgoff != 0)
+		return -EINVAL;
+
+	rc = remap_pfn_range(vma, vma->vm_start,
+			     virt_to_pfn(xen_store_interface),
+			     len, vma->vm_page_prot);
+
+	return rc ? -EAGAIN : 0;
+}
+
+/* File operations of the "xsd_kva" file (readable and mappable). */
+const struct file_operations xsd_kva_file_ops = {
+	.open = xsd_kva_open,
+	.mmap = xsd_kva_mmap,
+	.read = xsd_read,
+	.release = xsd_release,
+};
+
+/*
+ * open() handler of "xsd_port": formats xen_store_evtchn as a decimal
+ * number into a freshly allocated string kept in file->private_data.
+ * Returns -ENOMEM if the allocation fails.
+ */
+static int xsd_port_open(struct inode *inode, struct file *file)
+{
+	char *text = kasprintf(GFP_KERNEL, "%d", xen_store_evtchn);
+
+	file->private_data = text;
+
+	return text ? 0 : -ENOMEM;
+}
+
+/* File operations of the "xsd_port" file (read-only content). */
+const struct file_operations xsd_port_file_ops = {
+	.open = xsd_port_open,
+	.read = xsd_read,
+	.release = xsd_release,
+};
diff --git a/drivers/xen/xenfs/xensyms.c b/drivers/xen/xenfs/xensyms.c
new file mode 100644
index 000000000..c6c73a33c
--- /dev/null
+++ b/drivers/xen/xenfs/xensyms.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+#include <xen/xen-ops.h>
+#include "xenfs.h"
+
+
+#define XEN_KSYM_NAME_LEN 127 /* Hypervisor may have different name length */
+
+/* Per-open state of the xensyms file, stored as the seq_file private data. */
+struct xensyms {
+	struct xen_platform_op op;	/* request passed to HYPERVISOR_platform_op() */
+	char *name;			/* buffer registered as the symbol name guest handle */
+	uint32_t namelen;		/* size of the name buffer in bytes */
+};
+
+/*
+ * Fetch the next symbol from the hypervisor into @xs.
+ *
+ * The name buffer is cleared and the platform op in @xs is issued for
+ * the symbol number currently stored in xs->op.u.symdata.symnum.  If the
+ * hypervisor reports a name longer than the buffer, a larger buffer is
+ * allocated and the request is repeated for the same symbol.
+ *
+ * The old buffer is released only after the larger one has been
+ * allocated.  On allocation failure @xs therefore keeps a valid buffer
+ * with a matching length and guest handle, and a later call can be made
+ * safely.
+ *
+ * Returns 0 when a symbol was fetched, 1 when the symbol number did not
+ * advance (end of symbols), -ENOMEM if the buffer could not be grown, or
+ * the negative value returned by HYPERVISOR_platform_op().
+ */
+static int xensyms_next_sym(struct xensyms *xs)
+{
+	int ret;
+	struct xenpf_symdata *symdata = &xs->op.u.symdata;
+	uint64_t symnum;
+
+	memset(xs->name, 0, xs->namelen);
+	symdata->namelen = xs->namelen;
+
+	/* Remembered to detect whether the hypervisor advanced symnum. */
+	symnum = symdata->symnum;
+
+	ret = HYPERVISOR_platform_op(&xs->op);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * If hypervisor's symbol didn't fit into the buffer then allocate
+	 * a larger buffer and try again.
+	 */
+	if (unlikely(symdata->namelen > xs->namelen)) {
+		uint32_t newlen = symdata->namelen;
+		char *newname = kzalloc(newlen, GFP_KERNEL);
+
+		/* Keep the old buffer, length and handle on failure. */
+		if (!newname)
+			return -ENOMEM;
+
+		kfree(xs->name);
+		xs->name = newname;
+		xs->namelen = newlen;
+
+		set_xen_guest_handle(symdata->name, xs->name);
+		symdata->symnum--; /* Rewind */
+
+		ret = HYPERVISOR_platform_op(&xs->op);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (symdata->symnum == symnum)
+		/* End of symbols */
+		return 1;
+
+	return 0;
+}
+
+/*
+ * seq_file .start: fetch the symbol numbered *pos.
+ * Returns the per-open state as iterator token, or NULL when
+ * xensyms_next_sym() reports end of symbols or an error.
+ */
+static void *xensyms_start(struct seq_file *m, loff_t *pos)
+{
+	struct xensyms *xs = m->private;
+
+	xs->op.u.symdata.symnum = *pos;
+
+	return xensyms_next_sym(xs) ? NULL : xs;
+}
+
+/*
+ * seq_file .next: advance *pos by one and fetch that symbol.
+ * Returns @p unchanged, or NULL when xensyms_next_sym() reports end of
+ * symbols or an error.
+ */
+static void *xensyms_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct xensyms *xs = m->private;
+
+	*pos += 1;
+	xs->op.u.symdata.symnum = *pos;
+
+	return xensyms_next_sym(xs) == 0 ? p : NULL;
+}
+
+/*
+ * seq_file .show: print the most recently fetched symbol as
+ * "<16-digit hex address> <type char> <name>".
+ */
+static int xensyms_show(struct seq_file *m, void *p)
+{
+	const struct xensyms *xs = m->private;
+	const struct xenpf_symdata *sym = &xs->op.u.symdata;
+
+	seq_printf(m, "%016llx %c %s\n", sym->address, sym->type, xs->name);
+
+	return 0;
+}
+
+/* seq_file .stop: nothing to undo, .start acquires no resources. */
+static void xensyms_stop(struct seq_file *m, void *p)
+{
+}
+
+/* seq_file iterator over hypervisor symbols, installed by xensyms_open(). */
+static const struct seq_operations xensyms_seq_ops = {
+	.start = xensyms_start,
+	.next = xensyms_next,
+	.show = xensyms_show,
+	.stop = xensyms_stop,
+};
+
+/*
+ * open() handler of "xensyms": set up the seq_file with a private
+ * struct xensyms, allocate the initial name buffer of
+ * XEN_KSYM_NAME_LEN + 1 bytes and prepare the XENPF_get_symbol request.
+ *
+ * Returns 0 on success, the error from seq_open_private(), or -ENOMEM
+ * if the name buffer cannot be allocated.
+ */
+static int xensyms_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	struct xensyms *xs;
+	char *namebuf;
+	int ret;
+
+	ret = seq_open_private(file, &xensyms_seq_ops, sizeof(*xs));
+	if (ret)
+		return ret;
+
+	namebuf = kzalloc(XEN_KSYM_NAME_LEN + 1, GFP_KERNEL);
+	if (!namebuf) {
+		seq_release_private(inode, file);
+		return -ENOMEM;
+	}
+
+	seq = file->private_data;
+	xs = seq->private;
+
+	xs->name = namebuf;
+	xs->namelen = XEN_KSYM_NAME_LEN + 1;
+
+	/* The hypervisor writes symbol names straight into xs->name. */
+	set_xen_guest_handle(xs->op.u.symdata.name, xs->name);
+	xs->op.cmd = XENPF_get_symbol;
+	xs->op.u.symdata.namelen = xs->namelen;
+
+	return 0;
+}
+
+/*
+ * release() handler of "xensyms": free the name buffer, then let
+ * seq_release_private() dispose of the seq_file and its private data.
+ */
+static int xensyms_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct xensyms *state = seq->private;
+
+	kfree(state->name);
+
+	return seq_release_private(inode, file);
+}
+
+/* File operations of the "xensyms" file; reading goes through seq_file. */
+const struct file_operations xensyms_ops = {
+	.open = xensyms_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = xensyms_release
+};
diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c
new file mode 100644
index 000000000..e7df65d32
--- /dev/null
+++ b/drivers/xen/xlate_mmu.c
@@ -0,0 +1,265 @@
+/*
+ * MMU operations common to all auto-translated physmap guests.
+ *
+ * Copyright (C) 2015 Citrix Systems R&D Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
+#include <xen/xen-ops.h>
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/memory.h>
+#include <xen/balloon.h>
+
+typedef void (*xen_gfn_fn_t)(unsigned long gfn, void *data);
+
+/*
+ * Call @fn(gfn, @data) for each of the first @nr_gfn Xen-sized frames
+ * backing @pages.
+ *
+ * Every entry of @pages contributes XEN_PFN_PER_PAGE consecutive Xen
+ * PFNs; each one is translated with pfn_to_gfn() before being passed on.
+ */
+static void xen_for_each_gfn(struct page **pages, unsigned nr_gfn,
+			     xen_gfn_fn_t fn, void *data)
+{
+	unsigned long xen_pfn = 0;
+	int idx = 0;
+
+	while (idx < nr_gfn) {
+		/* Entering a new kernel page: restart at its first Xen PFN. */
+		if (!(idx % XEN_PFN_PER_PAGE))
+			xen_pfn = page_to_xen_pfn(pages[idx / XEN_PFN_PER_PAGE]);
+
+		fn(pfn_to_gfn(xen_pfn), data);
+
+		xen_pfn++;
+		idx++;
+	}
+}
+
+/*
+ * State shared between xen_xlate_remap_gfn_array() and its per-PTE
+ * callback remap_pte_fn().
+ */
+struct remap_data {
+	xen_pfn_t *fgfn; /* foreign domain's gfn */
+	int nr_fgfn; /* Number of foreign gfn left to map */
+	pgprot_t prot; /* protection bits used when building the PTE */
+	domid_t  domid; /* foreign domain the gfns belong to */
+	struct vm_area_struct *vma; /* VMA whose mm receives the PTEs */
+	int index; /* index of the next entry of pages[] to use */
+	struct page **pages; /* local pages that back the mapping */
+	struct xen_remap_gfn_info *info; /* not referenced by the code in this file section */
+	int *err_ptr; /* next slot for a per-gfn error status */
+	int mapped; /* number of gfns mapped without error so far */
+
+	/* Hypercall parameters */
+	int h_errs[XEN_PFN_PER_PAGE];
+	xen_ulong_t h_idxs[XEN_PFN_PER_PAGE];
+	xen_pfn_t h_gpfns[XEN_PFN_PER_PAGE];
+
+	int h_iter;	/* Iterator */
+};
+
+/*
+ * xen_for_each_gfn() callback: fill the next slot of the hypercall
+ * parameter arrays with the next foreign gfn (index), the local @gfn
+ * (destination) and a cleared error, then advance both cursors.
+ */
+static void setup_hparams(unsigned long gfn, void *data)
+{
+	struct remap_data *rd = data;
+	int slot = rd->h_iter++;
+
+	rd->h_idxs[slot] = *rd->fgfn++;
+	rd->h_gpfns[slot] = gfn;
+	rd->h_errs[slot] = 0;
+}
+
+/*
+ * apply_to_page_range() callback used by xen_xlate_remap_gfn_array().
+ *
+ * Takes the next local page from info->pages, asks the hypervisor via
+ * XENMEM_add_to_physmap_range to place up to XEN_PFN_PER_PAGE foreign
+ * gfns behind it, records one error status per gfn in info->err_ptr and
+ * counts the successful ones in info->mapped.  The PTE at @addr is
+ * installed only when the hypercall itself returned 0.
+ *
+ * Always returns 0; failures are reported solely through info->err_ptr.
+ */
+static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
+			void *data)
+{
+	struct remap_data *info = data;
+	struct page *page = info->pages[info->index++];
+	pte_t pte = pte_mkspecial(pfn_pte(page_to_pfn(page), info->prot));
+	int rc, nr_gfn;
+	uint32_t i;
+	struct xen_add_to_physmap_range xatp = {
+		.domid = DOMID_SELF,
+		.foreign_domid = info->domid,
+		.space = XENMAPSPACE_gmfn_foreign,
+	};
+
+	/* Handle at most one kernel page worth of gfns per invocation. */
+	nr_gfn = min_t(typeof(info->nr_fgfn), XEN_PFN_PER_PAGE, info->nr_fgfn);
+	info->nr_fgfn -= nr_gfn;
+
+	/* Fill h_idxs/h_gpfns/h_errs for the gfns backed by this page. */
+	info->h_iter = 0;
+	xen_for_each_gfn(&page, nr_gfn, setup_hparams, info);
+	BUG_ON(info->h_iter != nr_gfn);
+
+	set_xen_guest_handle(xatp.idxs, info->h_idxs);
+	set_xen_guest_handle(xatp.gpfns, info->h_gpfns);
+	set_xen_guest_handle(xatp.errs, info->h_errs);
+	xatp.size = nr_gfn;
+
+	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
+
+	/* info->err_ptr expects one error status per Xen PFN */
+	for (i = 0; i < nr_gfn; i++) {
+		/* A failed hypercall marks every gfn of this page as failed. */
+		int err = (rc < 0) ? rc : info->h_errs[i];
+
+		*(info->err_ptr++) = err;
+		if (!err)
+			info->mapped++;
+	}
+
+	/*
+	 * Note: in most cases the hypercall returns 0 even if not all of
+	 * the foreign gfns were mapped. We still have to update the pte,
+	 * as userspace may decide to continue.
+	 */
+	if (!rc)
+		set_pte_at(info->vma->vm_mm, addr, ptep, pte);
+
+	return 0;
+}
+
+/*
+ * Map @nr foreign gfns of domain @domid into @vma starting at @addr,
+ * backed by the local @pages.
+ *
+ * One status per gfn is written to @err_ptr.  The VMA must have both
+ * VM_PFNMAP and VM_IO set, otherwise this BUGs.
+ *
+ * Returns the number of gfns mapped without error, or the negative error
+ * from apply_to_page_range().
+ */
+int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
+			      unsigned long addr,
+			      xen_pfn_t *gfn, int nr,
+			      int *err_ptr, pgprot_t prot,
+			      unsigned domid,
+			      struct page **pages)
+{
+	struct remap_data data = {
+		.fgfn    = gfn,
+		.nr_fgfn = nr,
+		.prot    = prot,
+		.domid   = domid,
+		.vma     = vma,
+		.pages   = pages,
+		.index   = 0,
+		.err_ptr = err_ptr,
+		.mapped  = 0,
+	};
+	/* Bytes of address space needed to hold @nr Xen-sized frames. */
+	unsigned long range = DIV_ROUND_UP(nr, XEN_PFN_PER_PAGE) << PAGE_SHIFT;
+	int err;
+
+	/* Kept here for the purpose of making sure code doesn't break
+	   x86 PVOPS */
+	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
+
+	err = apply_to_page_range(vma->vm_mm, addr, range,
+				  remap_pte_fn, &data);
+	if (err < 0)
+		return err;
+
+	return data.mapped;
+}
+EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_array);
+
+/*
+ * xen_for_each_gfn() callback: remove @gfn from this domain's physmap.
+ * The hypercall result is deliberately ignored; @data is unused.
+ */
+static void unmap_gfn(unsigned long gfn, void *data)
+{
+	struct xen_remove_from_physmap xrp = {
+		.domid = DOMID_SELF,
+		.gpfn = gfn,
+	};
+
+	(void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
+}
+
+/*
+ * Remove the first @nr Xen-sized frames backing @pages from this
+ * domain's physmap.  @vma is not used.  Always returns 0; failures of
+ * the individual hypercalls are ignored by unmap_gfn().
+ */
+int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
+			      int nr, struct page **pages)
+{
+	xen_for_each_gfn(pages, nr, unmap_gfn, NULL);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xen_xlate_unmap_gfn_range);
+
+/* Cursor used by setup_balloon_gfn() to collect gfns into an array. */
+struct map_balloon_pages {
+	xen_pfn_t *pfns;	/* output array the gfns are stored in */
+	unsigned int idx;	/* next free slot in pfns[] */
+};
+
+/* xen_for_each_gfn() callback: append @gfn to the array described by @data. */
+static void setup_balloon_gfn(unsigned long gfn, void *data)
+{
+	struct map_balloon_pages *mbp = data;
+	unsigned int slot = mbp->idx;
+
+	mbp->pfns[slot] = gfn;
+	mbp->idx = slot + 1;
+}
+
+/**
+ * xen_xlate_map_ballooned_pages - map a new set of ballooned pages
+ * @gfns: returns the array of corresponding GFNs
+ * @virt: returns the virtual address of the mapped region
+ * @nr_grant_frames: number of GFNs
+ * @return 0 on success, error otherwise
+ *
+ * This allocates a set of ballooned pages and maps them into the
+ * kernel's address space.  On success the caller owns the GFN array
+ * returned through @gfns; on failure everything allocated here is
+ * released again and the output parameters are left untouched.
+ */
+int __init xen_xlate_map_ballooned_pages(xen_pfn_t **gfns, void **virt,
+					 unsigned long nr_grant_frames)
+{
+	struct map_balloon_pages data;
+	struct page **pages;
+	unsigned long nr_pages;
+	xen_pfn_t *pfns;
+	void *vaddr;
+	int rc;
+
+	BUG_ON(nr_grant_frames == 0);
+	nr_pages = DIV_ROUND_UP(nr_grant_frames, XEN_PFN_PER_PAGE);
+
+	pages = kcalloc(nr_pages, sizeof(pages[0]), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	pfns = kcalloc(nr_grant_frames, sizeof(pfns[0]), GFP_KERNEL);
+	if (!pfns) {
+		rc = -ENOMEM;
+		goto out_free_pages;
+	}
+
+	rc = alloc_xenballooned_pages(nr_pages, pages);
+	if (rc) {
+		pr_warn("%s Couldn't balloon alloc %ld pages rc:%d\n", __func__,
+			nr_pages, rc);
+		goto out_free_pfns;
+	}
+
+	/* Record the gfn of every Xen-sized frame in the ballooned pages. */
+	data.pfns = pfns;
+	data.idx = 0;
+	xen_for_each_gfn(pages, nr_grant_frames, setup_balloon_gfn, &data);
+
+	vaddr = vmap(pages, nr_pages, 0, PAGE_KERNEL);
+	if (!vaddr) {
+		pr_warn("%s Couldn't map %ld pages rc:%d\n", __func__,
+			nr_pages, rc);
+		free_xenballooned_pages(nr_pages, pages);
+		rc = -ENOMEM;
+		goto out_free_pfns;
+	}
+
+	/* The page array was only needed to set up the mapping. */
+	kfree(pages);
+
+	*gfns = pfns;
+	*virt = vaddr;
+
+	return 0;
+
+out_free_pfns:
+	kfree(pfns);
+out_free_pages:
+	kfree(pages);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(xen_xlate_map_ballooned_pages);
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-06 01:02:30 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-06 01:02:30 +0000
commit	76cb841cb886eef6b3bee341a2266c76578724ad (patch)
tree	f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /drivers/xen
parent	Initial commit. (diff)
download	linux-76cb841cb886eef6b3bee341a2266c76578724ad.tar.xz linux-76cb841cb886eef6b3bee341a2266c76578724ad.zip