author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-05-06 01:02:30 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-05-06 01:02:30 +0000
commit     76cb841cb886eef6b3bee341a2266c76578724ad (patch)
tree       f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /drivers/gpu/drm/amd/amdkfd
parent     Initial commit. (diff)
download   linux-76cb841cb886eef6b3bee341a2266c76578724ad.tar.xz
           linux-76cb841cb886eef6b3bee341a2266c76578724ad.zip
Adding upstream version 4.19.249. (tag: upstream/4.19.249)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
55 files changed, 24022 insertions, 0 deletions
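
The largest conceptual piece in the diff below is the event interrupt path: cik_event_interrupt.c registers a struct kfd_event_interrupt_class whose interrupt_isr callback filters IH ring entries in interrupt context and whose interrupt_wq callback does the actual event signalling from a work queue. The following minimal sketch shows how a dispatcher might drive such a class, assuming the device_info->event_interrupt_class pointer this patch introduces; the helper name example_dispatch and the fixed buffer size are hypothetical, and the real dispatch code lives in kfd_interrupt.c within this patch.

/*
 * Illustrative sketch only (not part of the patch): the two-stage interrupt
 * handling pattern used by event_interrupt_class_cik.  interrupt_isr runs as
 * a cheap filter in interrupt context; entries it accepts are later handed
 * to interrupt_wq from a work queue.  example_dispatch() is hypothetical.
 */
static void example_dispatch(struct kfd_dev *dev, const uint32_t *ih_ring_entry)
{
	const struct kfd_event_interrupt_class *cls =
		dev->device_info->event_interrupt_class;
	uint32_t patched_ihre[4];	/* CIK IH entries are 4 dwords */
	bool patched = false;

	/* Stage 1: decide in the top half whether this entry concerns KFD. */
	if (!cls->interrupt_isr(dev, ih_ring_entry, patched_ihre, &patched))
		return;

	/* Stage 2: process the (possibly patched) entry off the fast path. */
	cls->interrupt_wq(dev, patched ? patched_ihre : ih_ring_entry);
}

In the actual driver the accepted entry is copied into a per-device ring and the work-queue stage runs asynchronously, but the ISR/WQ split shown above is the contract the CIK handlers in this patch implement.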
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig new file mode 100644 index 000000000..3858820a0 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/Kconfig @@ -0,0 +1,11 @@ +# +# Heterogenous system architecture configuration +# + +config HSA_AMD + tristate "HSA kernel driver for AMD GPU devices" + depends on DRM_AMDGPU && X86_64 + imply AMD_IOMMU_V2 + select MMU_NOTIFIER + help + Enable this if you want to use HSA features on AMD GPU devices. diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile new file mode 100644 index 000000000..ffd096fff --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -0,0 +1,48 @@ +# +# Copyright 2017 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# +# Makefile for Heterogenous System Architecture support for AMD GPU devices +# + +ccflags-y := -Idrivers/gpu/drm/amd/include/ \ + -Idrivers/gpu/drm/amd/include/asic_reg + +amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ + kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ + kfd_process.o kfd_queue.o kfd_mqd_manager.o \ + kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ + kfd_mqd_manager_v9.o \ + kfd_kernel_queue.o kfd_kernel_queue_cik.o \ + kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \ + kfd_packet_manager.o kfd_process_queue_manager.o \ + kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \ + kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \ + kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ + kfd_int_process_v9.o kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o + +ifneq ($(CONFIG_AMD_IOMMU_V2),) +amdkfd-y += kfd_iommu.o +endif + +amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o + +obj-$(CONFIG_HSA_AMD) += amdkfd.o diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c new file mode 100644 index 000000000..5d2475d53 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -0,0 +1,124 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "kfd_priv.h" +#include "kfd_events.h" +#include "cik_int.h" + +static bool cik_event_interrupt_isr(struct kfd_dev *dev, + const uint32_t *ih_ring_entry, + uint32_t *patched_ihre, + bool *patched_flag) +{ + const struct cik_ih_ring_entry *ihre = + (const struct cik_ih_ring_entry *)ih_ring_entry; + const struct kfd2kgd_calls *f2g = dev->kfd2kgd; + unsigned int vmid, pasid; + + /* This workaround is due to HW/FW limitation on Hawaii that + * VMID and PASID are not written into ih_ring_entry + */ + if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || + ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && + dev->device_info->asic_family == CHIP_HAWAII) { + struct cik_ih_ring_entry *tmp_ihre = + (struct cik_ih_ring_entry *)patched_ihre; + + *patched_flag = true; + *tmp_ihre = *ihre; + + vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd); + pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid); + + tmp_ihre->ring_id &= 0x000000ff; + tmp_ihre->ring_id |= vmid << 8; + tmp_ihre->ring_id |= pasid << 16; + + return (pasid != 0) && + vmid >= dev->vm_info.first_vmid_kfd && + vmid <= dev->vm_info.last_vmid_kfd; + } + + /* Only handle interrupts from KFD VMIDs */ + vmid = (ihre->ring_id & 0x0000ff00) >> 8; + if (vmid < dev->vm_info.first_vmid_kfd || + vmid > dev->vm_info.last_vmid_kfd) + return 0; + + /* If there is no valid PASID, it's likely a firmware bug */ + pasid = (ihre->ring_id & 0xffff0000) >> 16; + if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt")) + return 0; + + /* Interrupt types we care about: various signals and faults. + * They will be forwarded to a work queue (see below). 
+ */ + return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || + ihre->source_id == CIK_INTSRC_SDMA_TRAP || + ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || + ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE || + ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || + ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT; +} + +static void cik_event_interrupt_wq(struct kfd_dev *dev, + const uint32_t *ih_ring_entry) +{ + const struct cik_ih_ring_entry *ihre = + (const struct cik_ih_ring_entry *)ih_ring_entry; + uint32_t context_id = ihre->data & 0xfffffff; + unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8; + unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16; + + if (pasid == 0) + return; + + if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE) + kfd_signal_event_interrupt(pasid, context_id, 28); + else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP) + kfd_signal_event_interrupt(pasid, context_id, 28); + else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG) + kfd_signal_event_interrupt(pasid, context_id & 0xff, 8); + else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) + kfd_signal_hw_exception_event(pasid); + else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || + ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { + struct kfd_vm_fault_info info; + + kfd_process_vm_fault(dev->dqm, pasid); + + memset(&info, 0, sizeof(info)); + dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info); + if (!info.page_addr && !info.status) + return; + + if (info.vmid == vmid) + kfd_signal_vm_fault_event(dev, pasid, &info); + else + kfd_signal_vm_fault_event(dev, pasid, NULL); + } +} + +const struct kfd_event_interrupt_class event_interrupt_class_cik = { + .interrupt_isr = cik_event_interrupt_isr, + .interrupt_wq = cik_event_interrupt_wq, +}; diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h new file mode 100644 index 000000000..76f8677a7 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h @@ -0,0 +1,43 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef CIK_INT_H_INCLUDED +#define CIK_INT_H_INCLUDED + +#include <linux/types.h> + +struct cik_ih_ring_entry { + uint32_t source_id; + uint32_t data; + uint32_t ring_id; + uint32_t reserved; +}; + +#define CIK_INTSRC_CP_END_OF_PIPE 0xB5 +#define CIK_INTSRC_CP_BAD_OPCODE 0xB7 +#define CIK_INTSRC_SDMA_TRAP 0xE0 +#define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF +#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92 +#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93 + +#endif + diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h new file mode 100644 index 000000000..37ce6dd65 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h @@ -0,0 +1,73 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef CIK_REGS_H +#define CIK_REGS_H + +/* if PTR32, these are the bases for scratch and lds */ +#define PRIVATE_BASE(x) ((x) << 0) /* scratch */ +#define SHARED_BASE(x) ((x) << 16) /* LDS */ +#define PTR32 (1 << 0) +#define ALIGNMENT_MODE(x) ((x) << 2) +#define SH_MEM_ALIGNMENT_MODE_UNALIGNED 3 +#define DEFAULT_MTYPE(x) ((x) << 4) +#define APE1_MTYPE(x) ((x) << 7) + +/* valid for both DEFAULT_MTYPE and APE1_MTYPE */ +#define MTYPE_CACHED_NV 0 +#define MTYPE_CACHED 1 +#define MTYPE_NONCACHED 3 + +#define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) +#define PRELOAD_REQ (1 << 0) + +#define MQD_CONTROL_PRIV_STATE_EN (1U << 8) + +#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20) + +#define IB_ATC_EN (1U << 23) + +#define QUANTUM_EN 1U +#define QUANTUM_SCALE_1MS (1U << 4) +#define QUANTUM_DURATION(x) ((x) << 8) + +#define RPTR_BLOCK_SIZE(x) ((x) << 8) +#define MIN_AVAIL_SIZE(x) ((x) << 20) +#define DEFAULT_RPTR_BLOCK_SIZE RPTR_BLOCK_SIZE(5) +#define DEFAULT_MIN_AVAIL_SIZE MIN_AVAIL_SIZE(3) + +#define PQ_ATC_EN (1 << 23) +#define NO_UPDATE_RPTR (1 << 27) + +#define DOORBELL_OFFSET(x) ((x) << 2) +#define DOORBELL_EN (1 << 30) + +#define PRIV_STATE (1 << 30) +#define KMD_QUEUE (1 << 31) + +#define AQL_ENABLE 1 + +#define GRBM_GFX_INDEX 0x30800 + +#define ATC_VMID_PASID_MAPPING_VALID (1U << 31) + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h new file mode 100644 index 000000000..3621efbd5 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -0,0 +1,568 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +static const uint32_t cwsr_trap_gfx8_hex[] = { + 0xbf820001, 0xbf82012b, + 0xb8f4f802, 0x89748674, + 0xb8f5f803, 0x8675ff75, + 0x00000400, 0xbf850017, + 0xc00a1e37, 0x00000000, + 0xbf8c007f, 0x87777978, + 0xbf840005, 0x8f728374, + 0xb972e0c2, 0xbf800002, + 0xb9740002, 0xbe801d78, + 0xb8f5f803, 0x8675ff75, + 0x000001ff, 0xbf850002, + 0x80708470, 0x82718071, + 0x8671ff71, 0x0000ffff, + 0x8f728374, 0xb972e0c2, + 0xbf800002, 0xb9740002, + 0xbe801f70, 0xb8f5f803, + 0x8675ff75, 0x00000100, + 0xbf840006, 0xbefa0080, + 0xb97a0203, 0x8671ff71, + 0x0000ffff, 0x80f08870, + 0x82f18071, 0xbefa0080, + 0xb97a0283, 0xbef60068, + 0xbef70069, 0xb8fa1c07, + 0x8e7a9c7a, 0x87717a71, + 0xb8fa03c7, 0x8e7a9b7a, + 0x87717a71, 0xb8faf807, + 0x867aff7a, 0x00007fff, + 0xb97af807, 0xbef2007e, + 0xbef3007f, 0xbefe0180, + 0xbf900004, 0x877a8474, + 0xb97af802, 0xbf8e0002, + 0xbf88fffe, 0xbef8007e, + 0x8679ff7f, 0x0000ffff, + 0x8779ff79, 0x00040000, + 0xbefa0080, 0xbefb00ff, + 0x00807fac, 0x867aff7f, + 0x08000000, 0x8f7a837a, + 0x877b7a7b, 0x867aff7f, + 0x70000000, 0x8f7a817a, + 0x877b7a7b, 0xbeef007c, + 0xbeee0080, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x806e7a6e, + 0xbefa0084, 0xbefa00ff, + 0x01000000, 0xbefe007c, + 0xbefc006e, 0xc0611bfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611c3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611c7c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611cbc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611cfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611d3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xb8f5f803, + 0xbefe007c, 0xbefc006e, + 0xc0611d7c, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0611dbc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0611dfc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xb8eff801, 0xbefe007c, + 0xbefc006e, 0xc0611bfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611b3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611b7c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0x867aff7f, + 0x04000000, 0xbef30080, + 0x8773737a, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8f51605, 0x80758175, + 0x8e758475, 0x8e7a8275, + 0xbefa00ff, 0x01000000, + 0xbef60178, 0x80786e78, + 0x82798079, 0xbefc0080, + 0xbe802b00, 0xbe822b02, + 
0xbe842b04, 0xbe862b06, + 0xbe882b08, 0xbe8a2b0a, + 0xbe8c2b0c, 0xbe8e2b0e, + 0xc06b003c, 0x00000000, + 0xc06b013c, 0x00000010, + 0xc06b023c, 0x00000020, + 0xc06b033c, 0x00000030, + 0x8078c078, 0x82798079, + 0x807c907c, 0xbf0a757c, + 0xbf85ffeb, 0xbef80176, + 0xbeee0080, 0xbefe00c1, + 0xbeff00c1, 0xbefa00ff, + 0x01000000, 0xe0724000, + 0x6e1e0000, 0xe0724100, + 0x6e1e0100, 0xe0724200, + 0x6e1e0200, 0xe0724300, + 0x6e1e0300, 0xbefe00c1, + 0xbeff00c1, 0xb8f54306, + 0x8675c175, 0xbf84002c, + 0xbf8a0000, 0x867aff73, + 0x04000000, 0xbf840028, + 0x8e758675, 0x8e758275, + 0xbefa0075, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x806e7a6e, + 0x806eff6e, 0x00000080, + 0xbefa00ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0xd1060002, + 0x00011103, 0x7e0602ff, + 0x00000200, 0xbefc00ff, + 0x00010000, 0xbe80007b, + 0x867bff7b, 0xff7fffff, + 0x877bff7b, 0x00058000, + 0xd8ec0000, 0x00000002, + 0xbf8c007f, 0xe0765000, + 0x6e1e0002, 0x32040702, + 0xd0c9006a, 0x0000eb02, + 0xbf87fff7, 0xbefb0000, + 0xbeee00ff, 0x00000400, + 0xbefe00c1, 0xbeff00c1, + 0xb8f52a05, 0x80758175, + 0x8e758275, 0x8e7a8875, + 0xbefa00ff, 0x01000000, + 0xbefc0084, 0xbf0a757c, + 0xbf840015, 0xbf11017c, + 0x8075ff75, 0x00001000, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0xe0724000, 0x6e1e0000, + 0xe0724100, 0x6e1e0100, + 0xe0724200, 0x6e1e0200, + 0xe0724300, 0x6e1e0300, + 0x807c847c, 0x806eff6e, + 0x00000400, 0xbf0a757c, + 0xbf85ffef, 0xbf9c0000, + 0xbf8200cd, 0xbef8007e, + 0x8679ff7f, 0x0000ffff, + 0x8779ff79, 0x00040000, + 0xbefa0080, 0xbefb00ff, + 0x00807fac, 0x8676ff7f, + 0x08000000, 0x8f768376, + 0x877b767b, 0x8676ff7f, + 0x70000000, 0x8f768176, + 0x877b767b, 0x8676ff7f, + 0x04000000, 0xbf84001e, + 0xbefe00c1, 0xbeff00c1, + 0xb8f34306, 0x8673c173, + 0xbf840019, 0x8e738673, + 0x8e738273, 0xbefa0073, + 0xb8f22a05, 0x80728172, + 0x8e728a72, 0xb8f61605, + 0x80768176, 0x8e768676, + 0x80727672, 0x8072ff72, + 0x00000080, 0xbefa00ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x721e0000, + 0xe0510100, 0x721e0000, + 0x807cff7c, 0x00000200, + 0x8072ff72, 0x00000200, + 0xbf0a737c, 0xbf85fff6, + 0xbef20080, 0xbefe00c1, + 0xbeff00c1, 0xb8f32a05, + 0x80738173, 0x8e738273, + 0x8e7a8873, 0xbefa00ff, + 0x01000000, 0xbef60072, + 0x8072ff72, 0x00000400, + 0xbefc0084, 0xbf11087c, + 0x8073ff73, 0x00008000, + 0xe0524000, 0x721e0000, + 0xe0524100, 0x721e0100, + 0xe0524200, 0x721e0200, + 0xe0524300, 0x721e0300, + 0xbf8c0f70, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0x807c847c, + 0x8072ff72, 0x00000400, + 0xbf0a737c, 0xbf85ffee, + 0xbf9c0000, 0xe0524000, + 0x761e0000, 0xe0524100, + 0x761e0100, 0xe0524200, + 0x761e0200, 0xe0524300, + 0x761e0300, 0xb8f22a05, + 0x80728172, 0x8e728a72, + 0xb8f61605, 0x80768176, + 0x8e768676, 0x80727672, + 0x80f2c072, 0xb8f31605, + 0x80738173, 0x8e738473, + 0x8e7a8273, 0xbefa00ff, + 0x01000000, 0xbefc0073, + 0xc031003c, 0x00000072, + 0x80f2c072, 0xbf8c007f, + 0x80fc907c, 0xbe802d00, + 0xbe822d02, 0xbe842d04, + 0xbe862d06, 0xbe882d08, + 0xbe8a2d0a, 0xbe8c2d0c, + 0xbe8e2d0e, 0xbf06807c, + 0xbf84fff1, 0xb8f22a05, + 0x80728172, 0x8e728a72, + 0xb8f61605, 0x80768176, + 0x8e768676, 0x80727672, + 0xbefa0084, 0xbefa00ff, + 0x01000000, 0xc0211cfc, + 0x00000072, 0x80728472, + 0xc0211c3c, 0x00000072, + 0x80728472, 0xc0211c7c, + 0x00000072, 0x80728472, + 0xc0211bbc, 0x00000072, + 0x80728472, 0xc0211bfc, + 0x00000072, 0x80728472, + 0xc0211d3c, 0x00000072, + 0x80728472, 0xc0211d7c, + 0x00000072, 0x80728472, + 0xc0211a3c, 0x00000072, + 0x80728472, 
0xc0211a7c, + 0x00000072, 0x80728472, + 0xc0211dfc, 0x00000072, + 0x80728472, 0xc0211b3c, + 0x00000072, 0x80728472, + 0xc0211b7c, 0x00000072, + 0x80728472, 0xbf8c007f, + 0xbefc0073, 0xbefe006e, + 0xbeff006f, 0x867375ff, + 0x000003ff, 0xb9734803, + 0x867375ff, 0xfffff800, + 0x8f738b73, 0xb973a2c3, + 0xb977f801, 0x8673ff71, + 0xf0000000, 0x8f739c73, + 0x8e739073, 0xbef60080, + 0x87767376, 0x8673ff71, + 0x08000000, 0x8f739b73, + 0x8e738f73, 0x87767376, + 0x8673ff74, 0x00800000, + 0x8f739773, 0xb976f807, + 0x8671ff71, 0x0000ffff, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f768374, 0xb976e0c2, + 0xbf800002, 0xb9740002, + 0xbf8a0000, 0x95807370, + 0xbf810000, 0x00000000, +}; + + +static const uint32_t cwsr_trap_gfx9_hex[] = { + 0xbf820001, 0xbf82015d, + 0xb8f8f802, 0x89788678, + 0xb8f1f803, 0x866eff71, + 0x00000400, 0xbf850037, + 0x866eff71, 0x00000800, + 0xbf850003, 0x866eff71, + 0x00000100, 0xbf840008, + 0x866eff78, 0x00002000, + 0xbf840001, 0xbf810000, + 0x8778ff78, 0x00002000, + 0x80ec886c, 0x82ed806d, + 0xb8eef807, 0x866fff6e, + 0x001f8000, 0x8e6f8b6f, + 0x8977ff77, 0xfc000000, + 0x87776f77, 0x896eff6e, + 0x001f8000, 0xb96ef807, + 0xb8f0f812, 0xb8f1f813, + 0x8ef08870, 0xc0071bb8, + 0x00000000, 0xbf8cc07f, + 0xc0071c38, 0x00000008, + 0xbf8cc07f, 0x86ee6e6e, + 0xbf840001, 0xbe801d6e, + 0xb8f1f803, 0x8671ff71, + 0x000001ff, 0xbf850002, + 0x806c846c, 0x826d806d, + 0x866dff6d, 0x0000ffff, + 0x8f6e8b77, 0x866eff6e, + 0x001f8000, 0xb96ef807, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f6e8378, 0xb96ee0c2, + 0xbf800002, 0xb9780002, + 0xbe801f6c, 0x866dff6d, + 0x0000ffff, 0xbef00080, + 0xb9700283, 0xb8f02407, + 0x8e709c70, 0x876d706d, + 0xb8f003c7, 0x8e709b70, + 0x876d706d, 0xb8f0f807, + 0x8670ff70, 0x00007fff, + 0xb970f807, 0xbeee007e, + 0xbeef007f, 0xbefe0180, + 0xbf900004, 0x87708478, + 0xb970f802, 0xbf8e0002, + 0xbf88fffe, 0xb8f02a05, + 0x80708170, 0x8e708a70, + 0xb8f11605, 0x80718171, + 0x8e718671, 0x80707170, + 0x80707e70, 0x8271807f, + 0x8671ff71, 0x0000ffff, + 0xc0471cb8, 0x00000040, + 0xbf8cc07f, 0xc04b1d38, + 0x00000048, 0xbf8cc07f, + 0xc0431e78, 0x00000058, + 0xbf8cc07f, 0xc0471eb8, + 0x0000005c, 0xbf8cc07f, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0x8670ff7f, 0x08000000, + 0x8f708370, 0x87777077, + 0x8670ff7f, 0x70000000, + 0x8f708170, 0x87777077, + 0xbefb007c, 0xbefa0080, + 0xb8fa2a05, 0x807a817a, + 0x8e7a8a7a, 0xb8f01605, + 0x80708170, 0x8e708670, + 0x807a707a, 0xbef60084, + 0xbef600ff, 0x01000000, + 0xbefe007c, 0xbefc007a, + 0xc0611efa, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611b3a, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611b7a, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611bba, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611bfa, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611e3a, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xb8f1f803, 0xbefe007c, + 0xbefc007a, 0xc0611c7a, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611a3a, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611a7a, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xb8fbf801, 0xbefe007c, + 0xbefc007a, 0xc0611efa, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0x8670ff7f, 0x04000000, + 0xbeef0080, 0x876f6f70, + 0xb8fa2a05, 0x807a817a, + 0x8e7a8a7a, 0xb8f11605, + 0x80718171, 0x8e718471, + 
0x8e768271, 0xbef600ff, + 0x01000000, 0xbef20174, + 0x80747a74, 0x82758075, + 0xbefc0080, 0xbf800000, + 0xbe802b00, 0xbe822b02, + 0xbe842b04, 0xbe862b06, + 0xbe882b08, 0xbe8a2b0a, + 0xbe8c2b0c, 0xbe8e2b0e, + 0xc06b003a, 0x00000000, + 0xbf8cc07f, 0xc06b013a, + 0x00000010, 0xbf8cc07f, + 0xc06b023a, 0x00000020, + 0xbf8cc07f, 0xc06b033a, + 0x00000030, 0xbf8cc07f, + 0x8074c074, 0x82758075, + 0x807c907c, 0xbf0a717c, + 0xbf85ffe7, 0xbef40172, + 0xbefa0080, 0xbefe00c1, + 0xbeff00c1, 0xbee80080, + 0xbee90080, 0xbef600ff, + 0x01000000, 0xe0724000, + 0x7a1d0000, 0xe0724100, + 0x7a1d0100, 0xe0724200, + 0x7a1d0200, 0xe0724300, + 0x7a1d0300, 0xbefe00c1, + 0xbeff00c1, 0xb8f14306, + 0x8671c171, 0xbf84002c, + 0xbf8a0000, 0x8670ff6f, + 0x04000000, 0xbf840028, + 0x8e718671, 0x8e718271, + 0xbef60071, 0xb8fa2a05, + 0x807a817a, 0x8e7a8a7a, + 0xb8f01605, 0x80708170, + 0x8e708670, 0x807a707a, + 0x807aff7a, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0xd1060002, + 0x00011103, 0x7e0602ff, + 0x00000200, 0xbefc00ff, + 0x00010000, 0xbe800077, + 0x8677ff77, 0xff7fffff, + 0x8777ff77, 0x00058000, + 0xd8ec0000, 0x00000002, + 0xbf8cc07f, 0xe0765000, + 0x7a1d0002, 0x68040702, + 0xd0c9006a, 0x0000e302, + 0xbf87fff7, 0xbef70000, + 0xbefa00ff, 0x00000400, + 0xbefe00c1, 0xbeff00c1, + 0xb8f12a05, 0x80718171, + 0x8e718271, 0x8e768871, + 0xbef600ff, 0x01000000, + 0xbefc0084, 0xbf0a717c, + 0xbf840015, 0xbf11017c, + 0x8071ff71, 0x00001000, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0xe0724000, 0x7a1d0000, + 0xe0724100, 0x7a1d0100, + 0xe0724200, 0x7a1d0200, + 0xe0724300, 0x7a1d0300, + 0x807c847c, 0x807aff7a, + 0x00000400, 0xbf0a717c, + 0xbf85ffef, 0xbf9c0000, + 0xbf8200dc, 0xbef4007e, + 0x8675ff7f, 0x0000ffff, + 0x8775ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x00807fac, 0x866eff7f, + 0x08000000, 0x8f6e836e, + 0x87776e77, 0x866eff7f, + 0x70000000, 0x8f6e816e, + 0x87776e77, 0x866eff7f, + 0x04000000, 0xbf84001e, + 0xbefe00c1, 0xbeff00c1, + 0xb8ef4306, 0x866fc16f, + 0xbf840019, 0x8e6f866f, + 0x8e6f826f, 0xbef6006f, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x781d0000, + 0xe0510100, 0x781d0000, + 0x807cff7c, 0x00000200, + 0x8078ff78, 0x00000200, + 0xbf0a6f7c, 0xbf85fff6, + 0xbef80080, 0xbefe00c1, + 0xbeff00c1, 0xb8ef2a05, + 0x806f816f, 0x8e6f826f, + 0x8e76886f, 0xbef600ff, + 0x01000000, 0xbeee0078, + 0x8078ff78, 0x00000400, + 0xbefc0084, 0xbf11087c, + 0x806fff6f, 0x00008000, + 0xe0524000, 0x781d0000, + 0xe0524100, 0x781d0100, + 0xe0524200, 0x781d0200, + 0xe0524300, 0x781d0300, + 0xbf8c0f70, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffee, + 0xbf9c0000, 0xe0524000, + 0x6e1d0000, 0xe0524100, + 0x6e1d0100, 0xe0524200, + 0x6e1d0200, 0xe0524300, + 0x6e1d0300, 0xb8f82a05, + 0x80788178, 0x8e788a78, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0x80f8c078, 0xb8ef1605, + 0x806f816f, 0x8e6f846f, + 0x8e76826f, 0xbef600ff, + 0x01000000, 0xbefc006f, + 0xc031003a, 0x00000078, + 0x80f8c078, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe802d00, 0xbe822d02, + 0xbe842d04, 0xbe862d06, + 0xbe882d08, 0xbe8a2d0a, + 0xbe8c2d0c, 0xbe8e2d0e, + 0xbf06807c, 0xbf84fff0, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0xbef60084, + 0xbef600ff, 0x01000000, + 0xc0211bfa, 0x00000078, + 0x80788478, 0xc0211b3a, + 0x00000078, 0x80788478, + 0xc0211b7a, 
0x00000078, + 0x80788478, 0xc0211eba, + 0x00000078, 0x80788478, + 0xc0211efa, 0x00000078, + 0x80788478, 0xc0211c3a, + 0x00000078, 0x80788478, + 0xc0211c7a, 0x00000078, + 0x80788478, 0xc0211a3a, + 0x00000078, 0x80788478, + 0xc0211a7a, 0x00000078, + 0x80788478, 0xc0211cfa, + 0x00000078, 0x80788478, + 0xbf8cc07f, 0xbefc006f, + 0xbefe007a, 0xbeff007b, + 0x866f71ff, 0x000003ff, + 0xb96f4803, 0x866f71ff, + 0xfffff800, 0x8f6f8b6f, + 0xb96fa2c3, 0xb973f801, + 0xb8ee2a05, 0x806e816e, + 0x8e6e8a6e, 0xb8ef1605, + 0x806f816f, 0x8e6f866f, + 0x806e6f6e, 0x806e746e, + 0x826f8075, 0x866fff6f, + 0x0000ffff, 0xc0071cb7, + 0x00000040, 0xc00b1d37, + 0x00000048, 0xc0031e77, + 0x00000058, 0xc0071eb7, + 0x0000005c, 0xbf8cc07f, + 0x866fff6d, 0xf0000000, + 0x8f6f9c6f, 0x8e6f906f, + 0xbeee0080, 0x876e6f6e, + 0x866fff6d, 0x08000000, + 0x8f6f9b6f, 0x8e6f8f6f, + 0x876e6f6e, 0x866fff70, + 0x00800000, 0x8f6f976f, + 0xb96ef807, 0x866dff6d, + 0x0000ffff, 0x86fe7e7e, + 0x86ea6a6a, 0x8f6e8370, + 0xb96ee0c2, 0xbf800002, + 0xb9700002, 0xbf8a0000, + 0x95806f6c, 0xbf810000, +}; diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm new file mode 100644 index 000000000..abe1a5da2 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm @@ -0,0 +1,1148 @@ +/* + * Copyright 2015-2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* To compile this assembly code: + * PROJECT=vi ./sp3 cwsr_trap_handler_gfx8.asm -hex tmp.hex + */ + +/* HW (VI) source code for CWSR trap handler */ +/* Version 18 + multiple trap handler */ + +// this performance-optimal version was originally from Seven Xu at SRDC + +// Revison #18 --... +/* Rev History +** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) +** #4. SR Memory Layout: +** 1. VGPR-SGPR-HWREG-{LDS} +** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. +** #5. Update: 1. Accurate g8sr_ts_save_d timestamp +** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) +** #7. Update: 1. don't barrier if noLDS +** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version +** 2. Fix SQ issue by s_sleep 2 +** #9. Update: 1. 
Fix scc restore failed issue, restore wave_status at last +** 2. optimize s_buffer save by burst 16sgprs... +** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. +** #11. Update 1. Add 2 more timestamp for debug version +** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance +** #13. Integ 1. Always use MUBUF for PV trap shader... +** #14. Update 1. s_buffer_store soft clause... +** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. +** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree +** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] +** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... +** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 +** 2. FUNC - Handle non-CWSR traps +*/ + +var G8SR_WDMEM_HWREG_OFFSET = 0 +var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes + +// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. + +var G8SR_DEBUG_TIMESTAMP = 0 +var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset +var s_g8sr_ts_save_s = s[34:35] // save start +var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi +var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ +var s_g8sr_ts_save_d = s[40:41] // save end +var s_g8sr_ts_restore_s = s[42:43] // restore start +var s_g8sr_ts_restore_d = s[44:45] // restore end + +var G8SR_VGPR_SR_IN_DWX4 = 0 +var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes +var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 + + +/*************************************************************************/ +/* control on how to run the shader */ +/*************************************************************************/ +//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) +var EMU_RUN_HACK = 0 +var EMU_RUN_HACK_RESTORE_NORMAL = 0 +var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 +var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 +var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var SAVE_LDS = 1 +var WG_BASE_ADDR_LO = 0x9000a000 +var WG_BASE_ADDR_HI = 0x0 +var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem +var CTX_SAVE_CONTROL = 0x0 +var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL +var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) +var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write +var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes +var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing + +/**************************************************************************/ +/* variables */ +/**************************************************************************/ +var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 
+var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 +var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1 +var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 +var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT = 0 +var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE = 1 +var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT = 3 +var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE = 29 + +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits + +var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 +var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask +var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 +var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 +var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 + +var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME +var SQ_WAVE_IB_STS_RCNT_SIZE = 4 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME +var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME + +var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 +var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 + + +/* Save */ +var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes +var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE + +var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_SAVE_SPI_INIT_ATC_SHIFT = 27 +var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 +var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used +var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME + +var s_save_spi_init_lo = exec_lo +var s_save_spi_init_hi = exec_hi + + //tba_lo and tba_hi need to be saved/restored +var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} +var s_save_pc_hi = ttmp1 +var s_save_exec_lo = ttmp2 +var s_save_exec_hi = ttmp3 +var s_save_status = ttmp4 +var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine +var s_save_xnack_mask_lo = ttmp6 +var s_save_xnack_mask_hi = ttmp7 +var s_save_buf_rsrc0 = ttmp8 +var s_save_buf_rsrc1 = ttmp9 +var s_save_buf_rsrc2 = ttmp10 +var s_save_buf_rsrc3 = ttmp11 + +var s_save_mem_offset = tma_lo +var s_save_alloc_size = s_save_trapsts //conflict +var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) +var s_save_m0 = tma_hi + +/* Restore */ +var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE +var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC + +var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 +var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 +var 
S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT +var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK +var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT +var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK + +var s_restore_spi_init_lo = exec_lo +var s_restore_spi_init_hi = exec_hi + +var s_restore_mem_offset = ttmp2 +var s_restore_alloc_size = ttmp3 +var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored +var s_restore_mem_offset_save = s_restore_tmp //no conflict + +var s_restore_m0 = s_restore_alloc_size //no conflict + +var s_restore_mode = ttmp7 + +var s_restore_pc_lo = ttmp0 +var s_restore_pc_hi = ttmp1 +var s_restore_exec_lo = tma_lo //no conflict +var s_restore_exec_hi = tma_hi //no conflict +var s_restore_status = ttmp4 +var s_restore_trapsts = ttmp5 +var s_restore_xnack_mask_lo = xnack_mask_lo +var s_restore_xnack_mask_hi = xnack_mask_hi +var s_restore_buf_rsrc0 = ttmp8 +var s_restore_buf_rsrc1 = ttmp9 +var s_restore_buf_rsrc2 = ttmp10 +var s_restore_buf_rsrc3 = ttmp11 + +/**************************************************************************/ +/* trap handler entry points */ +/**************************************************************************/ +/* Shader Main*/ + +shader main + asic(VI) + type(CS) + + + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore + //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC + s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. + s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE + //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE + s_branch L_SKIP_RESTORE //NOT restore, SAVE actually + else + s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save + end + +L_JUMP_TO_RESTORE: + s_branch L_RESTORE //restore + +L_SKIP_RESTORE: + + s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save + s_cbranch_scc1 L_SAVE //this is the operation for save + + // ********* Handle non-CWSR traps ******************* +if (!EMU_RUN_HACK) + /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ + s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0 + s_waitcnt lgkmcnt(0) + s_or_b32 ttmp7, ttmp8, ttmp9 + s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set + set_status_without_spi_prio(s_save_status, ttmp2) //restore HW status(SCC) + s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler + +L_NO_NEXT_TRAP: + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception + s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. 
+ s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 + s_addc_u32 ttmp1, ttmp1, 0 +L_EXCP_CASE: + s_and_b32 ttmp1, ttmp1, 0xFFFF + set_status_without_spi_prio(s_save_status, ttmp2) //restore HW status(SCC) + s_rfe_b64 [ttmp0, ttmp1] +end + // ********* End handling of non-CWSR traps ******************* + +/**************************************************************************/ +/* save routine */ +/**************************************************************************/ + +L_SAVE: + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_s + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? +end + + //check whether there is mem_viol + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK + s_cbranch_scc0 L_NO_PC_REWIND + + //if so, need rewind PC assuming GDS operation gets NACKed + s_mov_b32 s_save_tmp, 0 //clear mem_viol bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 + s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc + +L_NO_PC_REWIND: + s_mov_b32 s_save_tmp, 0 //clear saveCtx bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit + + s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK + s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG + + s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp + + /* inform SPI the readiness and wait for SPI's go signal */ + s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI + s_mov_b32 s_save_exec_hi, exec_hi + s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_sq_save_msg + s_waitcnt lgkmcnt(0) +end + + if (EMU_RUN_HACK) + + else + s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC + end + + // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for. 
+ s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp + + L_SLEEP: + s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 + + if (EMU_RUN_HACK) + + else + s_cbranch_execz L_SLEEP + end + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_spi_wrexec + s_waitcnt lgkmcnt(0) +end + + /* setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_save_tmp, v9, 0 + s_lshr_b32 s_save_tmp, s_save_tmp, 6 + s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + + + s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo + s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited + s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE + + //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) + s_mov_b32 s_save_m0, m0 //save M0 + + /* global mem offset */ + s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 + + + + + /* save HW registers */ + ////////////////////////////// + + L_SAVE_HWREG: + // HWREG SR memory offset : size(VGPR)+size(SGPR) + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + + + s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 + + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO + s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI + end + + write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC + write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC + write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS + + //s_save_trapsts conflicts with s_save_alloc_size + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS + + write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO + write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI + + //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 + s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO + write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI + + + + /* the first wave in the threadgroup */ + // save fist_wave bits in tba_hi unused bit.26 + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit + //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26] + s_mov_b32 s_save_exec_hi, 0x0 + s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] + + + /* save SGPRs */ + // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... + ////////////////////////////// + + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + // TODO, change RSRC word to rearrange memory layout for SGPRS + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) + end + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 + //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 + s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 + s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset + s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 + + s_mov_b32 m0, 0x0 //SGPR initial index value =0 + L_SAVE_SGPR_LOOP: + // SGPR is allocated in 16 SGPR granularity + s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] + s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] + s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] + s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] + s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] + s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] + s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] + s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] + + write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 + s_add_u32 m0, m0, 16 //next sgpr index + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? + // restore s_save_buf_rsrc0,1 + //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo + s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo + + + + + /* save first 4 VGPR, then LDS save could use */ + // each wave will alloc 4 vgprs at least... + ///////////////////////////////////////////////////////////////////////////////////// + + s_mov_b32 s_save_mem_offset, 0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 +end + + + + /* save LDS */ + ////////////////////////////// + + L_SAVE_LDS: + + // Change EXEC to all threads... + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE + + s_barrier //LDS is used? 
wait for other waves in the same TG + //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here + s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here + s_cbranch_scc0 L_SAVE_LDS_DONE + + // first wave do LDS save; + + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() + + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + +var LDS_DMA_ENABLE = 0 +var UNROLL = 0 +if UNROLL==0 && LDS_DMA_ENABLE==1 + s_mov_b32 s3, 256*2 + s_nop 0 + s_nop 0 + s_nop 0 + L_SAVE_LDS_LOOP: + //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? + if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + end + + s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? + +elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss + // store from higest LDS address to lowest + s_mov_b32 s3, 256*2 + s_sub_u32 m0, s_save_alloc_size, s3 + s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 + s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... + s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest + s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction + s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc + s_nop 0 + s_nop 0 + s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes + s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved + s_add_u32 s0, s0,s_save_alloc_size + s_addc_u32 s1, s1, 0 + s_setpc_b64 s[0:1] + + + for var i =0; i< 128; i++ + // be careful to make here a 64Byte aligned address, which could improve performance... + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + + if i!=127 + s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. 
pack more LDS_DMA inst to one Cacheline + s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 + end + end + +else // BUFFER_STORE + v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 + v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid + v_mul_i32_i24 v2, v3, 8 // tid*8 + v_mov_b32 v3, 256*2 + s_mov_b32 m0, 0x10000 + s_mov_b32 s0, s_save_buf_rsrc3 + s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT + +L_SAVE_LDS_LOOP_VECTOR: + ds_read_b64 v[0:1], v2 //x =LDS[a], byte address + s_waitcnt lgkmcnt(0) + buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 +// s_waitcnt vmcnt(0) + v_add_u32 v2, vcc[0:1], v2, v3 + v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size + s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR + + // restore rsrc3 + s_mov_b32 s_save_buf_rsrc3, s0 + +end + +L_SAVE_LDS_DONE: + + + /* save VGPRs - set the Rest VGPRs */ + ////////////////////////////////////////////////////////////////////////////////////// + L_SAVE_VGPR: + // VGPR SR memory offset: 0 + // TODO rearrange the RSRC words to use swizzle for VGPR save... + + s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, 4 // skip first 4 VGPRs + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs + + s_set_gpr_idx_on m0, 0x1 // This will change M0 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 +L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 // v0 = v[0+m0] + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + s_add_u32 m0, m0, 4 + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? 
+ s_set_gpr_idx_off +L_SAVE_VGPR_LOOP_END: + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + // VGPR store using dw burst + s_mov_b32 m0, 0x4 //VGPR initial index value =0 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_END + + + s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later + + L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 //v0 = v[0+m0] + v_mov_b32 v1, v1 //v0 = v[0+m0] + v_mov_b32 v2, v2 //v0 = v[0+m0] + v_mov_b32 v3, v3 //v0 = v[0+m0] + + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + end + + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? + s_set_gpr_idx_off +end + +L_SAVE_VGPR_END: + + + + + + + /* S_PGM_END_SAVED */ //FIXME graphics ONLY + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + s_rfe_b64 s_save_pc_lo //Return to the main shader program + else + end + +// Save Done timestamp +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_d + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? + // Need reset rsrc2?? + s_mov_b32 m0, s_save_mem_offset + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 +end + + + s_branch L_END_PGM + + + +/**************************************************************************/ +/* restore routine */ +/**************************************************************************/ + +L_RESTORE: + /* Setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_restore_tmp, v9, 0 + s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 + s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE + s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL + else + end + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_s + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? + // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... + s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] + s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. 
+end + + + + s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo + s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) + s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE + + /* global mem offset */ +// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 + + /* the first wave in the threadgroup */ + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK + s_cbranch_scc0 L_RESTORE_VGPR + + /* restore LDS */ + ////////////////////////////// + L_RESTORE_LDS: + + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? + + + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + L_RESTORE_LDS_LOOP: + if (SAVE_LDS) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW + end + s_add_u32 m0, m0, 256*2 // 128 DW + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? 
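+	// Note (added for readability): once the LDS copy above finishes, the first
+	// wave simply falls through into the VGPR restore below; non-first waves
+	// already branched to L_RESTORE_VGPR before this LDS block.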
+ + + /* restore VGPRs */ + ////////////////////////////// + L_RESTORE_VGPR: + // VGPR SR memory offset : 0 + s_mov_b32 s_restore_mem_offset, 0x0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + +if G8SR_VGPR_SR_IN_DWX4 + get_vgpr_size_bytes(s_restore_mem_offset) + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, s_restore_alloc_size + s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0 + +L_RESTORE_VGPR_LOOP: + buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + s_waitcnt vmcnt(0) + s_sub_u32 m0, m0, 4 + v_mov_b32 v0, v0 // v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_cmp_eq_u32 m0, 0x8000 + s_cbranch_scc0 L_RESTORE_VGPR_LOOP + s_set_gpr_idx_off + + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes + +else + // VGPR load using dw burst + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_mov_b32 m0, 4 //VGPR initial index value = 1 + s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later + + L_RESTORE_VGPR_LOOP: + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 + end + s_waitcnt vmcnt(0) //ensure data ready + v_mov_b32 v0, v0 //v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
+ s_set_gpr_idx_off + /* VGPR restore on v0 */ + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 + end + +end + + /* restore SGPRs */ + ////////////////////////////// + + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group + // TODO, change RSRC word to rearrange memory layout for SGPRS + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) + end + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + /* If 112 SGPRs ar allocated, 4 sgprs are not used TBA(108,109),TMA(110,111), + However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG + */ + s_mov_b32 m0, s_restore_alloc_size + + L_RESTORE_SGPR_LOOP: + read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made + s_waitcnt lgkmcnt(0) //ensure data ready + + s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] + + s_movreld_b64 s0, s0 //s[0+m0] = s0 + s_movreld_b64 s2, s2 + s_movreld_b64 s4, s4 + s_movreld_b64 s6, s6 + s_movreld_b64 s8, s8 + s_movreld_b64 s10, s10 + s_movreld_b64 s12, s12 + s_movreld_b64 s14, s14 + + s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? + + /* restore HW registers */ + ////////////////////////////// + L_RESTORE_HWREG: + + +if G8SR_DEBUG_TIMESTAMP + s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo + s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi +end + + // HWREG SR memory offset : size(VGPR)+size(SGPR) + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + + + s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 + read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC + read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC + read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS + read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS + read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO + read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI + read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE + read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO + read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI + + s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + + //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + + s_mov_b32 m0, s_restore_m0 + s_mov_b32 exec_lo, s_restore_exec_lo + s_mov_b32 exec_hi, s_restore_exec_hi + + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 + //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore + s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode + //reuse s_restore_m0 as a temp register + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT + s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT + s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp + + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect 
STATUS + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + set_status_without_spi_prio(s_restore_status, s_restore_tmp) // SCC is included, which is changed by previous salu + + s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_d + s_waitcnt lgkmcnt(0) +end + +// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution + s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc + + +/**************************************************************************/ +/* the END */ +/**************************************************************************/ +L_END_PGM: + s_endpgm + +end + + +/**************************************************************************/ +/* the helper functions */ +/**************************************************************************/ + +//Only for save hwreg to mem +function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) + s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on + s_mov_b32 m0, s_mem_offset + s_buffer_store_dword s, s_rsrc, m0 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 + s_mov_b32 m0, exec_lo +end + + +// HWREG are saved before SGPRs, so all HWREG could be use. +function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) + + s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 + s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 + s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 + s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 + s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 + s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc +end + + +function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 +end + +function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 + s_sub_u32 s_mem_offset, s_mem_offset, 4*16 +end + + + +function get_lds_size_bytes(s_lds_size_byte) + // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW + s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size + s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW +end + +function get_vgpr_size_bytes(s_vgpr_size_byte) + s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible +end + +function get_sgpr_size_bytes(s_sgpr_size_byte) + s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 + s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) +end + +function get_hwreg_size_bytes + return 128 //HWREG size 128 bytes +end + +function set_status_without_spi_prio(status, tmp) + // Do not restore STATUS.SPI_PRIO since scheduler may have raised it. 
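+	// The two s_setreg_b32 writes below update STATUS in two pieces so that the
+	// SPI_PRIO field (bits 2:1) is skipped: first the bits above SPI_PRIO, then
+	// bit 0 below it, with an s_nop in between to avoid the setreg hazard.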
+ s_lshr_b32 tmp, status, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT + s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE), tmp + s_nop 0x2 // avoid S_SETREG => S_SETREG hazard + s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE), status +end diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm new file mode 100644 index 000000000..0bb9c577b --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -0,0 +1,1226 @@ +/* + * Copyright 2016 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* To compile this assembly code: + * PROJECT=greenland ./sp3 cwsr_trap_handler_gfx9.asm -hex tmp.hex + */ + +/* HW (GFX9) source code for CWSR trap handler */ +/* Version 18 + multiple trap handler */ + +// this performance-optimal version was originally from Seven Xu at SRDC + +// Revison #18 --... +/* Rev History +** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) +** #4. SR Memory Layout: +** 1. VGPR-SGPR-HWREG-{LDS} +** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. +** #5. Update: 1. Accurate g8sr_ts_save_d timestamp +** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) +** #7. Update: 1. don't barrier if noLDS +** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version +** 2. Fix SQ issue by s_sleep 2 +** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last +** 2. optimize s_buffer save by burst 16sgprs... +** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. +** #11. Update 1. Add 2 more timestamp for debug version +** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance +** #13. Integ 1. Always use MUBUF for PV trap shader... +** #14. Update 1. s_buffer_store soft clause... +** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. +** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree +** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] +** 2. 
PERF - Save LDS before save VGPR to cover LDS save long latency... +** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 +** 2. FUNC - Handle non-CWSR traps +*/ + +var G8SR_WDMEM_HWREG_OFFSET = 0 +var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes + +// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. + +var G8SR_DEBUG_TIMESTAMP = 0 +var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset +var s_g8sr_ts_save_s = s[34:35] // save start +var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi +var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ +var s_g8sr_ts_save_d = s[40:41] // save end +var s_g8sr_ts_restore_s = s[42:43] // restore start +var s_g8sr_ts_restore_d = s[44:45] // restore end + +var G8SR_VGPR_SR_IN_DWX4 = 0 +var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes +var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 + + +/*************************************************************************/ +/* control on how to run the shader */ +/*************************************************************************/ +//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) +var EMU_RUN_HACK = 0 +var EMU_RUN_HACK_RESTORE_NORMAL = 0 +var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 +var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 +var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var SAVE_LDS = 1 +var WG_BASE_ADDR_LO = 0x9000a000 +var WG_BASE_ADDR_HI = 0x0 +var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem +var CTX_SAVE_CONTROL = 0x0 +var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL +var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) +var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write +var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes +var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing +var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency + +/**************************************************************************/ +/* variables */ +/**************************************************************************/ +var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 +var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 +var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1 +var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 +var SQ_WAVE_STATUS_HALT_MASK = 0x2000 +var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT = 0 +var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE = 1 +var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT = 3 +var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE = 29 + +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits + +var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 +var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask +var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 +var 
SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 +var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 +var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 + +var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME +var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x1F8000 +var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME + +var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 +var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 + +var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 26 // bits [31:26] unused by SPI debug data +var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0xFC000000 + +/* Save */ +var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes +var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE + +var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_SAVE_SPI_INIT_ATC_SHIFT = 27 +var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 +var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used +var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME + +var s_save_spi_init_lo = exec_lo +var s_save_spi_init_hi = exec_hi + +var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} +var s_save_pc_hi = ttmp1 +var s_save_exec_lo = ttmp2 +var s_save_exec_hi = ttmp3 +var s_save_tmp = ttmp4 +var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine +var s_save_xnack_mask_lo = ttmp6 +var s_save_xnack_mask_hi = ttmp7 +var s_save_buf_rsrc0 = ttmp8 +var s_save_buf_rsrc1 = ttmp9 +var s_save_buf_rsrc2 = ttmp10 +var s_save_buf_rsrc3 = ttmp11 +var s_save_status = ttmp12 +var s_save_mem_offset = ttmp14 +var s_save_alloc_size = s_save_trapsts //conflict +var s_save_m0 = ttmp15 +var s_save_ttmps_lo = s_save_tmp //no conflict +var s_save_ttmps_hi = s_save_trapsts //no conflict + +/* Restore */ +var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE +var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC + +var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 +var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 +var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT +var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK +var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT +var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK + +var s_restore_spi_init_lo = exec_lo +var s_restore_spi_init_hi = exec_hi + +var s_restore_mem_offset = ttmp12 +var s_restore_alloc_size = ttmp3 +var s_restore_tmp = ttmp2 +var s_restore_mem_offset_save = s_restore_tmp //no conflict + +var s_restore_m0 = s_restore_alloc_size //no conflict + +var s_restore_mode = ttmp7 + +var 
s_restore_pc_lo = ttmp0 +var s_restore_pc_hi = ttmp1 +var s_restore_exec_lo = ttmp14 +var s_restore_exec_hi = ttmp15 +var s_restore_status = ttmp4 +var s_restore_trapsts = ttmp5 +var s_restore_xnack_mask_lo = xnack_mask_lo +var s_restore_xnack_mask_hi = xnack_mask_hi +var s_restore_buf_rsrc0 = ttmp8 +var s_restore_buf_rsrc1 = ttmp9 +var s_restore_buf_rsrc2 = ttmp10 +var s_restore_buf_rsrc3 = ttmp11 +var s_restore_ttmps_lo = s_restore_tmp //no conflict +var s_restore_ttmps_hi = s_restore_alloc_size //no conflict + +/**************************************************************************/ +/* trap handler entry points */ +/**************************************************************************/ +/* Shader Main*/ + +shader main + asic(GFX9) + type(CS) + + + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore + //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC + s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. + s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE + //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE + s_branch L_SKIP_RESTORE //NOT restore, SAVE actually + else + s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save + end + +L_JUMP_TO_RESTORE: + s_branch L_RESTORE //restore + +L_SKIP_RESTORE: + + s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save + s_cbranch_scc1 L_SAVE //this is the operation for save + + // ********* Handle non-CWSR traps ******************* +if (!EMU_RUN_HACK) + // Illegal instruction is a non-maskable exception which blocks context save. + // Halt the wavefront and return from the trap. + s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK + s_cbranch_scc1 L_HALT_WAVE + + // If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA. + // Instead, halt the wavefront and return from the trap. + s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK + s_cbranch_scc0 L_FETCH_2ND_TRAP + +L_HALT_WAVE: + // If STATUS.HALT is set then this fault must come from SQC instruction fetch. + // We cannot prevent further faults so just terminate the wavefront. + s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK + s_cbranch_scc0 L_NOT_ALREADY_HALTED + s_endpgm +L_NOT_ALREADY_HALTED: + s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK + + // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set. + // Rewind the PC to prevent this from occurring. The debugger compensates for this. + s_sub_u32 ttmp0, ttmp0, 0x8 + s_subb_u32 ttmp1, ttmp1, 0x0 + +L_FETCH_2ND_TRAP: + // Preserve and clear scalar XNACK state before issuing scalar reads. + // Save IB_STS.FIRST_REPLAY[15] and IB_STS.RCNT[20:16] into unused space ttmp11[31:26]. 
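+	// These bits are stashed in ttmp11 and cleared here so the scalar loads from
+	// the TMA below start with clean replay state; they are moved back into
+	// IB_STS at L_EXCP_CASE before returning to the program.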
+ s_getreg_b32 ttmp2, hwreg(HW_REG_IB_STS) + s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) + s_andn2_b32 ttmp11, ttmp11, TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK + s_or_b32 ttmp11, ttmp11, ttmp3 + + s_andn2_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 + + // Read second-level TBA/TMA from first-level TMA and jump if available. + // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) + // ttmp12 holds SQ_WAVE_STATUS + s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO) + s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI) + s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 + s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA + s_waitcnt lgkmcnt(0) + s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA + s_waitcnt lgkmcnt(0) + s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] + s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set + s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler + +L_NO_NEXT_TRAP: + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception + s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. + s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 + s_addc_u32 ttmp1, ttmp1, 0 +L_EXCP_CASE: + s_and_b32 ttmp1, ttmp1, 0xFFFF + + // Restore SQ_WAVE_IB_STS. + s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 + + // Restore SQ_WAVE_STATUS. + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + set_status_without_spi_prio(s_save_status, ttmp2) + + s_rfe_b64 [ttmp0, ttmp1] +end + // ********* End handling of non-CWSR traps ******************* + +/**************************************************************************/ +/* save routine */ +/**************************************************************************/ + +L_SAVE: + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_s + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? 
+end + + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + + s_mov_b32 s_save_tmp, 0 //clear saveCtx bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit + + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG + + s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp + + /* inform SPI the readiness and wait for SPI's go signal */ + s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI + s_mov_b32 s_save_exec_hi, exec_hi + s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_sq_save_msg + s_waitcnt lgkmcnt(0) +end + + if (EMU_RUN_HACK) + + else + s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC + end + + // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for. + s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp + + L_SLEEP: + s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 + + if (EMU_RUN_HACK) + + else + s_cbranch_execz L_SLEEP + end + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_spi_wrexec + s_waitcnt lgkmcnt(0) +end + + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_save_tmp, v9, 0 + s_lshr_b32 s_save_tmp, s_save_tmp, 6 + s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + + // Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic + // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 + get_vgpr_size_bytes(s_save_ttmps_lo) + get_sgpr_size_bytes(s_save_ttmps_hi) + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo + s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0 + s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF + s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1 + ack_sqc_store_workaround() + s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1 + ack_sqc_store_workaround() + s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1 + ack_sqc_store_workaround() + s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1 + ack_sqc_store_workaround() + + 
/* setup Resource Contants */ + s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo + s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited + s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE + + //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) + s_mov_b32 s_save_m0, m0 //save M0 + + /* global mem offset */ + s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 + + + + + /* save HW registers */ + ////////////////////////////// + + L_SAVE_HWREG: + // HWREG SR memory offset : size(VGPR)+size(SGPR) + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + + + s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 + + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + end + + write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC + write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC + write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS + + //s_save_trapsts conflicts with s_save_alloc_size + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS + + write_hwreg_to_mem(xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO + write_hwreg_to_mem(xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI + + //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 + s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) + + + + /* the first wave in the threadgroup */ + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit + s_mov_b32 s_save_exec_hi, 0x0 + s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] + + + /* save SGPRs */ + // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... 
+ ////////////////////////////// + + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + // TODO, change RSRC word to rearrange memory layout for SGPRS + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) + end + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 + //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 + s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 + s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset + s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 + + s_mov_b32 m0, 0x0 //SGPR initial index value =0 + s_nop 0x0 //Manually inserted wait states + L_SAVE_SGPR_LOOP: + // SGPR is allocated in 16 SGPR granularity + s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] + s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] + s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] + s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] + s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] + s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] + s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] + s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] + + write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 + s_add_u32 m0, m0, 16 //next sgpr index + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? + // restore s_save_buf_rsrc0,1 + //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo + s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo + + + + + /* save first 4 VGPR, then LDS save could use */ + // each wave will alloc 4 vgprs at least... + ///////////////////////////////////////////////////////////////////////////////////// + + s_mov_b32 s_save_mem_offset, 0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + s_mov_b32 xnack_mask_lo, 0x0 + s_mov_b32 xnack_mask_hi, 0x0 + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 +end + + + + /* save LDS */ + ////////////////////////////// + + L_SAVE_LDS: + + // Change EXEC to all threads... + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE + + s_barrier //LDS is used? wait for other waves in the same TG + s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here + s_cbranch_scc0 L_SAVE_LDS_DONE + + // first wave do LDS save; + + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() + + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + +var LDS_DMA_ENABLE = 0 +var UNROLL = 0 +if UNROLL==0 && LDS_DMA_ENABLE==1 + s_mov_b32 s3, 256*2 + s_nop 0 + s_nop 0 + s_nop 0 + L_SAVE_LDS_LOOP: + //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? + if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + end + + s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? 
+ +elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss + // store from higest LDS address to lowest + s_mov_b32 s3, 256*2 + s_sub_u32 m0, s_save_alloc_size, s3 + s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 + s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... + s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest + s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction + s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc + s_nop 0 + s_nop 0 + s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes + s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved + s_add_u32 s0, s0,s_save_alloc_size + s_addc_u32 s1, s1, 0 + s_setpc_b64 s[0:1] + + + for var i =0; i< 128; i++ + // be careful to make here a 64Byte aligned address, which could improve performance... + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + + if i!=127 + s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline + s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 + end + end + +else // BUFFER_STORE + v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 + v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid + v_mul_i32_i24 v2, v3, 8 // tid*8 + v_mov_b32 v3, 256*2 + s_mov_b32 m0, 0x10000 + s_mov_b32 s0, s_save_buf_rsrc3 + s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT + +L_SAVE_LDS_LOOP_VECTOR: + ds_read_b64 v[0:1], v2 //x =LDS[a], byte address + s_waitcnt lgkmcnt(0) + buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 +// s_waitcnt vmcnt(0) +// v_add_u32 v2, vcc[0:1], v2, v3 + v_add_u32 v2, v2, v3 + v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size + s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR + + // restore rsrc3 + s_mov_b32 s_save_buf_rsrc3, s0 + +end + +L_SAVE_LDS_DONE: + + + /* save VGPRs - set the Rest VGPRs */ + ////////////////////////////////////////////////////////////////////////////////////// + L_SAVE_VGPR: + // VGPR SR memory offset: 0 + // TODO rearrange the RSRC words to use swizzle for VGPR save... + + s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, 4 // skip first 4 VGPRs + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs + + s_set_gpr_idx_on m0, 0x1 // This will change M0 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 +L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 // v0 = v[0+m0] + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + s_add_u32 m0, m0, 4 + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? + s_set_gpr_idx_off +L_SAVE_VGPR_LOOP_END: + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + // VGPR store using dw burst + s_mov_b32 m0, 0x4 //VGPR initial index value =0 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_END + + + s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later + + L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 //v0 = v[0+m0] + v_mov_b32 v1, v1 //v0 = v[0+m0] + v_mov_b32 v2, v2 //v0 = v[0+m0] + v_mov_b32 v3, v3 //v0 = v[0+m0] + + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + end + + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? + s_set_gpr_idx_off +end + +L_SAVE_VGPR_END: + + + + + + + /* S_PGM_END_SAVED */ //FIXME graphics ONLY + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + s_rfe_b64 s_save_pc_lo //Return to the main shader program + else + end + +// Save Done timestamp +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_d + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? + // Need reset rsrc2?? 
+ s_mov_b32 m0, s_save_mem_offset + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 +end + + + s_branch L_END_PGM + + + +/**************************************************************************/ +/* restore routine */ +/**************************************************************************/ + +L_RESTORE: + /* Setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_restore_tmp, v9, 0 + s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 + s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE + s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL + else + end + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_s + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? + // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... + s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] + s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. +end + + + + s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo + s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) + s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE + + /* global mem offset */ +// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 + + /* the first wave in the threadgroup */ + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK + s_cbranch_scc0 L_RESTORE_VGPR + + /* restore LDS */ + ////////////////////////////// + L_RESTORE_LDS: + + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_RESTORE_VGPR //no lds used? 
jump to L_RESTORE_VGPR + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? + + + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + L_RESTORE_LDS_LOOP: + if (SAVE_LDS) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW + end + s_add_u32 m0, m0, 256*2 // 128 DW + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? + + + /* restore VGPRs */ + ////////////////////////////// + L_RESTORE_VGPR: + // VGPR SR memory offset : 0 + s_mov_b32 s_restore_mem_offset, 0x0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + +if G8SR_VGPR_SR_IN_DWX4 + get_vgpr_size_bytes(s_restore_mem_offset) + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, s_restore_alloc_size + s_set_gpr_idx_on m0, 0x8 // Note.. 
This will change m0 + +L_RESTORE_VGPR_LOOP: + buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + s_waitcnt vmcnt(0) + s_sub_u32 m0, m0, 4 + v_mov_b32 v0, v0 // v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_cmp_eq_u32 m0, 0x8000 + s_cbranch_scc0 L_RESTORE_VGPR_LOOP + s_set_gpr_idx_off + + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes + +else + // VGPR load using dw burst + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_mov_b32 m0, 4 //VGPR initial index value = 1 + s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later + + L_RESTORE_VGPR_LOOP: + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 + end + s_waitcnt vmcnt(0) //ensure data ready + v_mov_b32 v0, v0 //v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
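+ // v0-v3 served as staging registers for the indexed moves above, so indexing
+ // is switched off and their own contents are reloaded last from s_restore_mem_offset_save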
+ s_set_gpr_idx_off + /* VGPR restore on v0 */ + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 + end + +end + + /* restore SGPRs */ + ////////////////////////////// + + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group + // TODO, change RSRC word to rearrange memory layout for SGPRS + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) + end + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 m0, s_restore_alloc_size + + L_RESTORE_SGPR_LOOP: + read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made + s_waitcnt lgkmcnt(0) //ensure data ready + + s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] + s_nop 0 // hazard SALU M0=> S_MOVREL + + s_movreld_b64 s0, s0 //s[0+m0] = s0 + s_movreld_b64 s2, s2 + s_movreld_b64 s4, s4 + s_movreld_b64 s6, s6 + s_movreld_b64 s8, s8 + s_movreld_b64 s10, s10 + s_movreld_b64 s12, s12 + s_movreld_b64 s14, s14 + + s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? + + /* restore HW registers */ + ////////////////////////////// + L_RESTORE_HWREG: + + +if G8SR_DEBUG_TIMESTAMP + s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo + s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi +end + + // HWREG SR memory offset : size(VGPR)+size(SGPR) + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + + + s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 + read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC + read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC + read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS + read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS + read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO + read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI + read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE + + s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + + //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + + s_mov_b32 m0, s_restore_m0 + s_mov_b32 exec_lo, s_restore_exec_lo + s_mov_b32 exec_hi, s_restore_exec_hi + + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 + //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore + s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode + + // Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic + // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 + get_vgpr_size_bytes(s_restore_ttmps_lo) + get_sgpr_size_bytes(s_restore_ttmps_hi) + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 + s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 + s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF + s_load_dwordx2 [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1 + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1 + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1 + s_load_dwordx2 [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1 + s_waitcnt lgkmcnt(0) + + //reuse s_restore_m0 as a temp register + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, 
SQ_WAVE_IB_STS_RCNT_SHIFT + s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT + s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp + + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + set_status_without_spi_prio(s_restore_status, s_restore_tmp) // SCC is included, which is changed by previous salu + + s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_d + s_waitcnt lgkmcnt(0) +end + +// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution + s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc + + +/**************************************************************************/ +/* the END */ +/**************************************************************************/ +L_END_PGM: + s_endpgm + +end + + +/**************************************************************************/ +/* the helper functions */ +/**************************************************************************/ + +//Only for save hwreg to mem +function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) + s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on + s_mov_b32 m0, s_mem_offset + s_buffer_store_dword s, s_rsrc, m0 glc:1 + ack_sqc_store_workaround() + s_add_u32 s_mem_offset, s_mem_offset, 4 + s_mov_b32 m0, exec_lo +end + + +// HWREG are saved before SGPRs, so all HWREG could be use. 
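+// In memory the save image is laid out as VGPRs | SGPRs | HWREGs | LDS;
+// the get_*_size_bytes helpers below give the size of each region.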
+function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) + + s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 + ack_sqc_store_workaround() + s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 + ack_sqc_store_workaround() + s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 + ack_sqc_store_workaround() + s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 + ack_sqc_store_workaround() + s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 + s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc +end + + +function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 +end + +function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 + s_sub_u32 s_mem_offset, s_mem_offset, 4*16 +end + + + +function get_lds_size_bytes(s_lds_size_byte) + // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW + s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size + s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW +end + +function get_vgpr_size_bytes(s_vgpr_size_byte) + s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible +end + +function get_sgpr_size_bytes(s_sgpr_size_byte) + s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 + s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) +end + +function get_hwreg_size_bytes + return 128 //HWREG size 128 bytes +end + +function ack_sqc_store_workaround + if ACK_SQC_STORE + s_waitcnt lgkmcnt(0) + end +end + +function set_status_without_spi_prio(status, tmp) + // Do not restore STATUS.SPI_PRIO since scheduler may have raised it. + s_lshr_b32 tmp, status, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT + s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE), tmp + s_nop 0x2 // avoid S_SETREG => S_SETREG hazard + s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE), status +end diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c new file mode 100644 index 000000000..297b36c26 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -0,0 +1,1743 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/device.h> +#include <linux/export.h> +#include <linux/err.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/compat.h> +#include <uapi/linux/kfd_ioctl.h> +#include <linux/time.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <asm/processor.h> +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" +#include "kfd_dbgmgr.h" + +static long kfd_ioctl(struct file *, unsigned int, unsigned long); +static int kfd_open(struct inode *, struct file *); +static int kfd_mmap(struct file *, struct vm_area_struct *); + +static const char kfd_dev_name[] = "kfd"; + +static const struct file_operations kfd_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = kfd_ioctl, + .compat_ioctl = kfd_ioctl, + .open = kfd_open, + .mmap = kfd_mmap, +}; + +static int kfd_char_dev_major = -1; +static struct class *kfd_class; +struct device *kfd_device; + +int kfd_chardev_init(void) +{ + int err = 0; + + kfd_char_dev_major = register_chrdev(0, kfd_dev_name, &kfd_fops); + err = kfd_char_dev_major; + if (err < 0) + goto err_register_chrdev; + + kfd_class = class_create(THIS_MODULE, kfd_dev_name); + err = PTR_ERR(kfd_class); + if (IS_ERR(kfd_class)) + goto err_class_create; + + kfd_device = device_create(kfd_class, NULL, + MKDEV(kfd_char_dev_major, 0), + NULL, kfd_dev_name); + err = PTR_ERR(kfd_device); + if (IS_ERR(kfd_device)) + goto err_device_create; + + return 0; + +err_device_create: + class_destroy(kfd_class); +err_class_create: + unregister_chrdev(kfd_char_dev_major, kfd_dev_name); +err_register_chrdev: + return err; +} + +void kfd_chardev_exit(void) +{ + device_destroy(kfd_class, MKDEV(kfd_char_dev_major, 0)); + class_destroy(kfd_class); + unregister_chrdev(kfd_char_dev_major, kfd_dev_name); +} + +struct device *kfd_chardev(void) +{ + return kfd_device; +} + + +static int kfd_open(struct inode *inode, struct file *filep) +{ + struct kfd_process *process; + bool is_32bit_user_mode; + + if (iminor(inode) != 0) + return -ENODEV; + + is_32bit_user_mode = in_compat_syscall(); + + if (is_32bit_user_mode) { + dev_warn(kfd_device, + "Process %d (32-bit) failed to open /dev/kfd\n" + "32-bit processes are not supported by amdkfd\n", + current->pid); + return -EPERM; + } + + process = kfd_create_process(filep); + if (IS_ERR(process)) + return PTR_ERR(process); + + if (kfd_is_locked()) + return -EAGAIN; + + dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n", + process->pasid, process->is_32bit_user_mode); + + return 0; +} + +static int kfd_ioctl_get_version(struct file *filep, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_get_version_args *args = data; + + args->major_version = KFD_IOCTL_MAJOR_VERSION; + args->minor_version = KFD_IOCTL_MINOR_VERSION; + + return 0; +} + +static int set_queue_properties_from_user(struct queue_properties *q_properties, + struct kfd_ioctl_create_queue_args *args) +{ + if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) { + pr_err("Queue 
percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); + return -EINVAL; + } + + if (args->queue_priority > KFD_MAX_QUEUE_PRIORITY) { + pr_err("Queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); + return -EINVAL; + } + + if ((args->ring_base_address) && + (!access_ok(VERIFY_WRITE, + (const void __user *) args->ring_base_address, + sizeof(uint64_t)))) { + pr_err("Can't access ring base address\n"); + return -EFAULT; + } + + if (!is_power_of_2(args->ring_size) && (args->ring_size != 0)) { + pr_err("Ring size must be a power of 2 or 0\n"); + return -EINVAL; + } + + if (!access_ok(VERIFY_WRITE, + (const void __user *) args->read_pointer_address, + sizeof(uint32_t))) { + pr_err("Can't access read pointer\n"); + return -EFAULT; + } + + if (!access_ok(VERIFY_WRITE, + (const void __user *) args->write_pointer_address, + sizeof(uint32_t))) { + pr_err("Can't access write pointer\n"); + return -EFAULT; + } + + if (args->eop_buffer_address && + !access_ok(VERIFY_WRITE, + (const void __user *) args->eop_buffer_address, + sizeof(uint32_t))) { + pr_debug("Can't access eop buffer"); + return -EFAULT; + } + + if (args->ctx_save_restore_address && + !access_ok(VERIFY_WRITE, + (const void __user *) args->ctx_save_restore_address, + sizeof(uint32_t))) { + pr_debug("Can't access ctx save restore buffer"); + return -EFAULT; + } + + q_properties->is_interop = false; + q_properties->queue_percent = args->queue_percentage; + q_properties->priority = args->queue_priority; + q_properties->queue_address = args->ring_base_address; + q_properties->queue_size = args->ring_size; + q_properties->read_ptr = (uint32_t *) args->read_pointer_address; + q_properties->write_ptr = (uint32_t *) args->write_pointer_address; + q_properties->eop_ring_buffer_address = args->eop_buffer_address; + q_properties->eop_ring_buffer_size = args->eop_buffer_size; + q_properties->ctx_save_restore_area_address = + args->ctx_save_restore_address; + q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size; + q_properties->ctl_stack_size = args->ctl_stack_size; + if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE || + args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) + q_properties->type = KFD_QUEUE_TYPE_COMPUTE; + else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA) + q_properties->type = KFD_QUEUE_TYPE_SDMA; + else + return -ENOTSUPP; + + if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) + q_properties->format = KFD_QUEUE_FORMAT_AQL; + else + q_properties->format = KFD_QUEUE_FORMAT_PM4; + + pr_debug("Queue Percentage: %d, %d\n", + q_properties->queue_percent, args->queue_percentage); + + pr_debug("Queue Priority: %d, %d\n", + q_properties->priority, args->queue_priority); + + pr_debug("Queue Address: 0x%llX, 0x%llX\n", + q_properties->queue_address, args->ring_base_address); + + pr_debug("Queue Size: 0x%llX, %u\n", + q_properties->queue_size, args->ring_size); + + pr_debug("Queue r/w Pointers: %px, %px\n", + q_properties->read_ptr, + q_properties->write_ptr); + + pr_debug("Queue Format: %d\n", q_properties->format); + + pr_debug("Queue EOP: 0x%llX\n", q_properties->eop_ring_buffer_address); + + pr_debug("Queue CTX save area: 0x%llX\n", + q_properties->ctx_save_restore_area_address); + + return 0; +} + +static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_create_queue_args *args = data; + struct kfd_dev *dev; + int err = 0; + unsigned int queue_id; + struct kfd_process_device *pdd; + struct queue_properties q_properties; + + 
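+ /* q_properties is on the stack; zero it so any field not filled in from
+  * the ioctl args below defaults to 0.
+  */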
memset(&q_properties, 0, sizeof(struct queue_properties)); + + pr_debug("Creating queue ioctl\n"); + + err = set_queue_properties_from_user(&q_properties, args); + if (err) + return err; + + pr_debug("Looking for gpu id 0x%x\n", args->gpu_id); + dev = kfd_device_by_id(args->gpu_id); + if (!dev) { + pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); + return -EINVAL; + } + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = -ESRCH; + goto err_bind_process; + } + + pr_debug("Creating queue for PASID %d on gpu 0x%x\n", + p->pasid, + dev->id); + + err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id); + if (err != 0) + goto err_create_queue; + + args->queue_id = queue_id; + + + /* Return gpu_id as doorbell offset for mmap usage */ + args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL; + args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id); + args->doorbell_offset <<= PAGE_SHIFT; + if (KFD_IS_SOC15(dev->device_info->asic_family)) + /* On SOC15 ASICs, doorbell allocation must be + * per-device, and independent from the per-process + * queue_id. Return the doorbell offset within the + * doorbell aperture to user mode. + */ + args->doorbell_offset |= q_properties.doorbell_off; + + mutex_unlock(&p->mutex); + + pr_debug("Queue id %d was created successfully\n", args->queue_id); + + pr_debug("Ring buffer address == 0x%016llX\n", + args->ring_base_address); + + pr_debug("Read ptr address == 0x%016llX\n", + args->read_pointer_address); + + pr_debug("Write ptr address == 0x%016llX\n", + args->write_pointer_address); + + return 0; + +err_create_queue: +err_bind_process: + mutex_unlock(&p->mutex); + return err; +} + +static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p, + void *data) +{ + int retval; + struct kfd_ioctl_destroy_queue_args *args = data; + + pr_debug("Destroying queue id %d for pasid %d\n", + args->queue_id, + p->pasid); + + mutex_lock(&p->mutex); + + retval = pqm_destroy_queue(&p->pqm, args->queue_id); + + mutex_unlock(&p->mutex); + return retval; +} + +static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, + void *data) +{ + int retval; + struct kfd_ioctl_update_queue_args *args = data; + struct queue_properties properties; + + if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) { + pr_err("Queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); + return -EINVAL; + } + + if (args->queue_priority > KFD_MAX_QUEUE_PRIORITY) { + pr_err("Queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); + return -EINVAL; + } + + if ((args->ring_base_address) && + (!access_ok(VERIFY_WRITE, + (const void __user *) args->ring_base_address, + sizeof(uint64_t)))) { + pr_err("Can't access ring base address\n"); + return -EFAULT; + } + + if (!is_power_of_2(args->ring_size) && (args->ring_size != 0)) { + pr_err("Ring size must be a power of 2 or 0\n"); + return -EINVAL; + } + + properties.queue_address = args->ring_base_address; + properties.queue_size = args->ring_size; + properties.queue_percent = args->queue_percentage; + properties.priority = args->queue_priority; + + pr_debug("Updating queue id %d for pasid %d\n", + args->queue_id, p->pasid); + + mutex_lock(&p->mutex); + + retval = pqm_update_queue(&p->pqm, args->queue_id, &properties); + + mutex_unlock(&p->mutex); + + return retval; +} + +static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p, + void *data) +{ + int retval; + const int max_num_cus = 1024; + struct kfd_ioctl_set_cu_mask_args *args = data; + 
struct queue_properties properties; + uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr; + size_t cu_mask_size = sizeof(uint32_t) * (args->num_cu_mask / 32); + + if ((args->num_cu_mask % 32) != 0) { + pr_debug("num_cu_mask 0x%x must be a multiple of 32", + args->num_cu_mask); + return -EINVAL; + } + + properties.cu_mask_count = args->num_cu_mask; + if (properties.cu_mask_count == 0) { + pr_debug("CU mask cannot be 0"); + return -EINVAL; + } + + /* To prevent an unreasonably large CU mask size, set an arbitrary + * limit of max_num_cus bits. We can then just drop any CU mask bits + * past max_num_cus bits and just use the first max_num_cus bits. + */ + if (properties.cu_mask_count > max_num_cus) { + pr_debug("CU mask cannot be greater than 1024 bits"); + properties.cu_mask_count = max_num_cus; + cu_mask_size = sizeof(uint32_t) * (max_num_cus/32); + } + + properties.cu_mask = kzalloc(cu_mask_size, GFP_KERNEL); + if (!properties.cu_mask) + return -ENOMEM; + + retval = copy_from_user(properties.cu_mask, cu_mask_ptr, cu_mask_size); + if (retval) { + pr_debug("Could not copy CU mask from userspace"); + kfree(properties.cu_mask); + return -EFAULT; + } + + mutex_lock(&p->mutex); + + retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &properties); + + mutex_unlock(&p->mutex); + + if (retval) + kfree(properties.cu_mask); + + return retval; +} + +static int kfd_ioctl_set_memory_policy(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_set_memory_policy_args *args = data; + struct kfd_dev *dev; + int err = 0; + struct kfd_process_device *pdd; + enum cache_policy default_policy, alternate_policy; + + if (args->default_policy != KFD_IOC_CACHE_POLICY_COHERENT + && args->default_policy != KFD_IOC_CACHE_POLICY_NONCOHERENT) { + return -EINVAL; + } + + if (args->alternate_policy != KFD_IOC_CACHE_POLICY_COHERENT + && args->alternate_policy != KFD_IOC_CACHE_POLICY_NONCOHERENT) { + return -EINVAL; + } + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = -ESRCH; + goto out; + } + + default_policy = (args->default_policy == KFD_IOC_CACHE_POLICY_COHERENT) + ? cache_policy_coherent : cache_policy_noncoherent; + + alternate_policy = + (args->alternate_policy == KFD_IOC_CACHE_POLICY_COHERENT) + ? 
cache_policy_coherent : cache_policy_noncoherent; + + if (!dev->dqm->ops.set_cache_memory_policy(dev->dqm, + &pdd->qpd, + default_policy, + alternate_policy, + (void __user *)args->alternate_aperture_base, + args->alternate_aperture_size)) + err = -EINVAL; + +out: + mutex_unlock(&p->mutex); + + return err; +} + +static int kfd_ioctl_set_trap_handler(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_set_trap_handler_args *args = data; + struct kfd_dev *dev; + int err = 0; + struct kfd_process_device *pdd; + + dev = kfd_device_by_id(args->gpu_id); + if (dev == NULL) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = -ESRCH; + goto out; + } + + if (dev->dqm->ops.set_trap_handler(dev->dqm, + &pdd->qpd, + args->tba_addr, + args->tma_addr)) + err = -EINVAL; + +out: + mutex_unlock(&p->mutex); + + return err; +} + +static int kfd_ioctl_dbg_register(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_dbg_register_args *args = data; + struct kfd_dev *dev; + struct kfd_dbgmgr *dbgmgr_ptr; + struct kfd_process_device *pdd; + bool create_ok; + long status = 0; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + if (dev->device_info->asic_family == CHIP_CARRIZO) { + pr_debug("kfd_ioctl_dbg_register not supported on CZ\n"); + return -EINVAL; + } + + mutex_lock(&p->mutex); + mutex_lock(kfd_get_dbgmgr_mutex()); + + /* + * make sure that we have pdd, if this the first queue created for + * this process + */ + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + status = PTR_ERR(pdd); + goto out; + } + + if (!dev->dbgmgr) { + /* In case of a legal call, we have no dbgmgr yet */ + create_ok = kfd_dbgmgr_create(&dbgmgr_ptr, dev); + if (create_ok) { + status = kfd_dbgmgr_register(dbgmgr_ptr, p); + if (status != 0) + kfd_dbgmgr_destroy(dbgmgr_ptr); + else + dev->dbgmgr = dbgmgr_ptr; + } + } else { + pr_debug("debugger already registered\n"); + status = -EINVAL; + } + +out: + mutex_unlock(kfd_get_dbgmgr_mutex()); + mutex_unlock(&p->mutex); + + return status; +} + +static int kfd_ioctl_dbg_unregister(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_dbg_unregister_args *args = data; + struct kfd_dev *dev; + long status; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev || !dev->dbgmgr) + return -EINVAL; + + if (dev->device_info->asic_family == CHIP_CARRIZO) { + pr_debug("kfd_ioctl_dbg_unregister not supported on CZ\n"); + return -EINVAL; + } + + mutex_lock(kfd_get_dbgmgr_mutex()); + + status = kfd_dbgmgr_unregister(dev->dbgmgr, p); + if (!status) { + kfd_dbgmgr_destroy(dev->dbgmgr); + dev->dbgmgr = NULL; + } + + mutex_unlock(kfd_get_dbgmgr_mutex()); + + return status; +} + +/* + * Parse and generate variable size data structure for address watch. + * Total size of the buffer and # watch points is limited in order + * to prevent kernel abuse. (no bearing to the much smaller HW limitation + * which is enforced by dbgdev module) + * please also note that the watch address itself are not "copied from user", + * since it be set into the HW in user mode values. 
+ * + */ +static int kfd_ioctl_dbg_address_watch(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_dbg_address_watch_args *args = data; + struct kfd_dev *dev; + struct dbg_address_watch_info aw_info; + unsigned char *args_buff; + long status; + void __user *cmd_from_user; + uint64_t watch_mask_value = 0; + unsigned int args_idx = 0; + + memset((void *) &aw_info, 0, sizeof(struct dbg_address_watch_info)); + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + if (dev->device_info->asic_family == CHIP_CARRIZO) { + pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); + return -EINVAL; + } + + cmd_from_user = (void __user *) args->content_ptr; + + /* Validate arguments */ + + if ((args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) || + (args->buf_size_in_bytes <= sizeof(*args) + sizeof(int) * 2) || + (cmd_from_user == NULL)) + return -EINVAL; + + /* this is the actual buffer to work with */ + args_buff = memdup_user(cmd_from_user, + args->buf_size_in_bytes - sizeof(*args)); + if (IS_ERR(args_buff)) + return PTR_ERR(args_buff); + + aw_info.process = p; + + aw_info.num_watch_points = *((uint32_t *)(&args_buff[args_idx])); + args_idx += sizeof(aw_info.num_watch_points); + + aw_info.watch_mode = (enum HSA_DBG_WATCH_MODE *) &args_buff[args_idx]; + args_idx += sizeof(enum HSA_DBG_WATCH_MODE) * aw_info.num_watch_points; + + /* + * set watch address base pointer to point on the array base + * within args_buff + */ + aw_info.watch_address = (uint64_t *) &args_buff[args_idx]; + + /* skip over the addresses buffer */ + args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points; + + if (args_idx >= args->buf_size_in_bytes - sizeof(*args)) { + status = -EINVAL; + goto out; + } + + watch_mask_value = (uint64_t) args_buff[args_idx]; + + if (watch_mask_value > 0) { + /* + * There is an array of masks. 
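+ * (one 64-bit mask per watch point, packed contiguously in args_buff)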
+ * set watch mask base pointer to point on the array base + * within args_buff + */ + aw_info.watch_mask = (uint64_t *) &args_buff[args_idx]; + + /* skip over the masks buffer */ + args_idx += sizeof(aw_info.watch_mask) * + aw_info.num_watch_points; + } else { + /* just the NULL mask, set to NULL and skip over it */ + aw_info.watch_mask = NULL; + args_idx += sizeof(aw_info.watch_mask); + } + + if (args_idx >= args->buf_size_in_bytes - sizeof(args)) { + status = -EINVAL; + goto out; + } + + /* Currently HSA Event is not supported for DBG */ + aw_info.watch_event = NULL; + + mutex_lock(kfd_get_dbgmgr_mutex()); + + status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info); + + mutex_unlock(kfd_get_dbgmgr_mutex()); + +out: + kfree(args_buff); + + return status; +} + +/* Parse and generate fixed size data structure for wave control */ +static int kfd_ioctl_dbg_wave_control(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_dbg_wave_control_args *args = data; + struct kfd_dev *dev; + struct dbg_wave_control_info wac_info; + unsigned char *args_buff; + uint32_t computed_buff_size; + long status; + void __user *cmd_from_user; + unsigned int args_idx = 0; + + memset((void *) &wac_info, 0, sizeof(struct dbg_wave_control_info)); + + /* we use compact form, independent of the packing attribute value */ + computed_buff_size = sizeof(*args) + + sizeof(wac_info.mode) + + sizeof(wac_info.operand) + + sizeof(wac_info.dbgWave_msg.DbgWaveMsg) + + sizeof(wac_info.dbgWave_msg.MemoryVA) + + sizeof(wac_info.trapId); + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + if (dev->device_info->asic_family == CHIP_CARRIZO) { + pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); + return -EINVAL; + } + + /* input size must match the computed "compact" size */ + if (args->buf_size_in_bytes != computed_buff_size) { + pr_debug("size mismatch, computed : actual %u : %u\n", + args->buf_size_in_bytes, computed_buff_size); + return -EINVAL; + } + + cmd_from_user = (void __user *) args->content_ptr; + + if (cmd_from_user == NULL) + return -EINVAL; + + /* copy the entire buffer from user */ + + args_buff = memdup_user(cmd_from_user, + args->buf_size_in_bytes - sizeof(*args)); + if (IS_ERR(args_buff)) + return PTR_ERR(args_buff); + + /* move ptr to the start of the "pay-load" area */ + wac_info.process = p; + + wac_info.operand = *((enum HSA_DBG_WAVEOP *)(&args_buff[args_idx])); + args_idx += sizeof(wac_info.operand); + + wac_info.mode = *((enum HSA_DBG_WAVEMODE *)(&args_buff[args_idx])); + args_idx += sizeof(wac_info.mode); + + wac_info.trapId = *((uint32_t *)(&args_buff[args_idx])); + args_idx += sizeof(wac_info.trapId); + + wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = + *((uint32_t *)(&args_buff[args_idx])); + wac_info.dbgWave_msg.MemoryVA = NULL; + + mutex_lock(kfd_get_dbgmgr_mutex()); + + pr_debug("Calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n", + wac_info.process, wac_info.operand, + wac_info.mode, wac_info.trapId, + wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); + + status = kfd_dbgmgr_wave_control(dev->dbgmgr, &wac_info); + + pr_debug("Returned status of dbg manager is %ld\n", status); + + mutex_unlock(kfd_get_dbgmgr_mutex()); + + kfree(args_buff); + + return status; +} + +static int kfd_ioctl_get_clock_counters(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_get_clock_counters_args *args = data; + struct kfd_dev *dev; + + dev = kfd_device_by_id(args->gpu_id); + if (dev) + /* 
Reading GPU clock counter from KGD */ + args->gpu_clock_counter = + dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); + else + /* Node without GPU resource */ + args->gpu_clock_counter = 0; + + /* No access to rdtsc. Using raw monotonic time */ + args->cpu_clock_counter = ktime_get_raw_ns(); + args->system_clock_counter = ktime_get_boot_ns(); + + /* Since the counter is in nano-seconds we use 1GHz frequency */ + args->system_clock_freq = 1000000000; + + return 0; +} + + +static int kfd_ioctl_get_process_apertures(struct file *filp, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_get_process_apertures_args *args = data; + struct kfd_process_device_apertures *pAperture; + struct kfd_process_device *pdd; + + dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); + + args->num_of_nodes = 0; + + mutex_lock(&p->mutex); + + /*if the process-device list isn't empty*/ + if (kfd_has_process_device_data(p)) { + /* Run over all pdd of the process */ + pdd = kfd_get_first_process_device_data(p); + do { + pAperture = + &args->process_apertures[args->num_of_nodes]; + pAperture->gpu_id = pdd->dev->id; + pAperture->lds_base = pdd->lds_base; + pAperture->lds_limit = pdd->lds_limit; + pAperture->gpuvm_base = pdd->gpuvm_base; + pAperture->gpuvm_limit = pdd->gpuvm_limit; + pAperture->scratch_base = pdd->scratch_base; + pAperture->scratch_limit = pdd->scratch_limit; + + dev_dbg(kfd_device, + "node id %u\n", args->num_of_nodes); + dev_dbg(kfd_device, + "gpu id %u\n", pdd->dev->id); + dev_dbg(kfd_device, + "lds_base %llX\n", pdd->lds_base); + dev_dbg(kfd_device, + "lds_limit %llX\n", pdd->lds_limit); + dev_dbg(kfd_device, + "gpuvm_base %llX\n", pdd->gpuvm_base); + dev_dbg(kfd_device, + "gpuvm_limit %llX\n", pdd->gpuvm_limit); + dev_dbg(kfd_device, + "scratch_base %llX\n", pdd->scratch_base); + dev_dbg(kfd_device, + "scratch_limit %llX\n", pdd->scratch_limit); + + args->num_of_nodes++; + + pdd = kfd_get_next_process_device_data(p, pdd); + } while (pdd && (args->num_of_nodes < NUM_OF_SUPPORTED_GPUS)); + } + + mutex_unlock(&p->mutex); + + return 0; +} + +static int kfd_ioctl_get_process_apertures_new(struct file *filp, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_get_process_apertures_new_args *args = data; + struct kfd_process_device_apertures *pa; + struct kfd_process_device *pdd; + uint32_t nodes = 0; + int ret; + + dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); + + if (args->num_of_nodes == 0) { + /* Return number of nodes, so that user space can alloacate + * sufficient memory + */ + mutex_lock(&p->mutex); + + if (!kfd_has_process_device_data(p)) + goto out_unlock; + + /* Run over all pdd of the process */ + pdd = kfd_get_first_process_device_data(p); + do { + args->num_of_nodes++; + pdd = kfd_get_next_process_device_data(p, pdd); + } while (pdd); + + goto out_unlock; + } + + /* Fill in process-aperture information for all available + * nodes, but not more than args->num_of_nodes as that is + * the amount of memory allocated by user + */ + pa = kzalloc((sizeof(struct kfd_process_device_apertures) * + args->num_of_nodes), GFP_KERNEL); + if (!pa) + return -ENOMEM; + + mutex_lock(&p->mutex); + + if (!kfd_has_process_device_data(p)) { + args->num_of_nodes = 0; + kfree(pa); + goto out_unlock; + } + + /* Run over all pdd of the process */ + pdd = kfd_get_first_process_device_data(p); + do { + pa[nodes].gpu_id = pdd->dev->id; + pa[nodes].lds_base = pdd->lds_base; + pa[nodes].lds_limit = pdd->lds_limit; + pa[nodes].gpuvm_base = pdd->gpuvm_base; + pa[nodes].gpuvm_limit = 
pdd->gpuvm_limit; + pa[nodes].scratch_base = pdd->scratch_base; + pa[nodes].scratch_limit = pdd->scratch_limit; + + dev_dbg(kfd_device, + "gpu id %u\n", pdd->dev->id); + dev_dbg(kfd_device, + "lds_base %llX\n", pdd->lds_base); + dev_dbg(kfd_device, + "lds_limit %llX\n", pdd->lds_limit); + dev_dbg(kfd_device, + "gpuvm_base %llX\n", pdd->gpuvm_base); + dev_dbg(kfd_device, + "gpuvm_limit %llX\n", pdd->gpuvm_limit); + dev_dbg(kfd_device, + "scratch_base %llX\n", pdd->scratch_base); + dev_dbg(kfd_device, + "scratch_limit %llX\n", pdd->scratch_limit); + nodes++; + + pdd = kfd_get_next_process_device_data(p, pdd); + } while (pdd && (nodes < args->num_of_nodes)); + mutex_unlock(&p->mutex); + + args->num_of_nodes = nodes; + ret = copy_to_user( + (void __user *)args->kfd_process_device_apertures_ptr, + pa, + (nodes * sizeof(struct kfd_process_device_apertures))); + kfree(pa); + return ret ? -EFAULT : 0; + +out_unlock: + mutex_unlock(&p->mutex); + return 0; +} + +static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_create_event_args *args = data; + int err; + + /* For dGPUs the event page is allocated in user mode. The + * handle is passed to KFD with the first call to this IOCTL + * through the event_page_offset field. + */ + if (args->event_page_offset) { + struct kfd_dev *kfd; + struct kfd_process_device *pdd; + void *mem, *kern_addr; + uint64_t size; + + if (p->signal_page) { + pr_err("Event page is already set\n"); + return -EINVAL; + } + + kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset)); + if (!kfd) { + pr_err("Getting device by id failed in %s\n", __func__); + return -EINVAL; + } + + mutex_lock(&p->mutex); + pdd = kfd_bind_process_to_device(kfd, p); + if (IS_ERR(pdd)) { + err = PTR_ERR(pdd); + goto out_unlock; + } + + mem = kfd_process_device_translate_handle(pdd, + GET_IDR_HANDLE(args->event_page_offset)); + if (!mem) { + pr_err("Can't find BO, offset is 0x%llx\n", + args->event_page_offset); + err = -EINVAL; + goto out_unlock; + } + mutex_unlock(&p->mutex); + + err = kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd, + mem, &kern_addr, &size); + if (err) { + pr_err("Failed to map event page to kernel\n"); + return err; + } + + err = kfd_event_page_set(p, kern_addr, size); + if (err) { + pr_err("Failed to set event page\n"); + return err; + } + } + + err = kfd_event_create(filp, p, args->event_type, + args->auto_reset != 0, args->node_id, + &args->event_id, &args->event_trigger_data, + &args->event_page_offset, + &args->event_slot_index); + + return err; + +out_unlock: + mutex_unlock(&p->mutex); + return err; +} + +static int kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_destroy_event_args *args = data; + + return kfd_event_destroy(p, args->event_id); +} + +static int kfd_ioctl_set_event(struct file *filp, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_set_event_args *args = data; + + return kfd_set_event(p, args->event_id); +} + +static int kfd_ioctl_reset_event(struct file *filp, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_reset_event_args *args = data; + + return kfd_reset_event(p, args->event_id); +} + +static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_wait_events_args *args = data; + int err; + + err = kfd_wait_on_events(p, args->num_events, + (void __user *)args->events_ptr, + (args->wait_for_all != 0), + args->timeout, &args->wait_result); + + return err; +} +static int 
kfd_ioctl_set_scratch_backing_va(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_set_scratch_backing_va_args *args = data; + struct kfd_process_device *pdd; + struct kfd_dev *dev; + long err; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = PTR_ERR(pdd); + goto bind_process_to_device_fail; + } + + pdd->qpd.sh_hidden_private_base = args->va_addr; + + mutex_unlock(&p->mutex); + + if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && + pdd->qpd.vmid != 0) + dev->kfd2kgd->set_scratch_backing_va( + dev->kgd, args->va_addr, pdd->qpd.vmid); + + return 0; + +bind_process_to_device_fail: + mutex_unlock(&p->mutex); + return err; +} + +static int kfd_ioctl_get_tile_config(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_get_tile_config_args *args = data; + struct kfd_dev *dev; + struct tile_config config; + int err = 0; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + dev->kfd2kgd->get_tile_config(dev->kgd, &config); + + args->gb_addr_config = config.gb_addr_config; + args->num_banks = config.num_banks; + args->num_ranks = config.num_ranks; + + if (args->num_tile_configs > config.num_tile_configs) + args->num_tile_configs = config.num_tile_configs; + err = copy_to_user((void __user *)args->tile_config_ptr, + config.tile_config_ptr, + args->num_tile_configs * sizeof(uint32_t)); + if (err) { + args->num_tile_configs = 0; + return -EFAULT; + } + + if (args->num_macro_tile_configs > config.num_macro_tile_configs) + args->num_macro_tile_configs = + config.num_macro_tile_configs; + err = copy_to_user((void __user *)args->macro_tile_config_ptr, + config.macro_tile_config_ptr, + args->num_macro_tile_configs * sizeof(uint32_t)); + if (err) { + args->num_macro_tile_configs = 0; + return -EFAULT; + } + + return 0; +} + +static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_acquire_vm_args *args = data; + struct kfd_process_device *pdd; + struct kfd_dev *dev; + struct file *drm_file; + int ret; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + drm_file = fget(args->drm_fd); + if (!drm_file) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { + ret = -EINVAL; + goto err_unlock; + } + + if (pdd->drm_file) { + ret = pdd->drm_file == drm_file ? 
0 : -EBUSY; + goto err_unlock; + } + + ret = kfd_process_device_init_vm(pdd, drm_file); + if (ret) + goto err_unlock; + /* On success, the PDD keeps the drm_file reference */ + mutex_unlock(&p->mutex); + + return 0; + +err_unlock: + mutex_unlock(&p->mutex); + fput(drm_file); + return ret; +} + +static bool kfd_dev_is_large_bar(struct kfd_dev *dev) +{ + struct kfd_local_mem_info mem_info; + + if (debug_largebar) { + pr_debug("Simulate large-bar allocation on non large-bar machine\n"); + return true; + } + + if (dev->device_info->needs_iommu_device) + return false; + + dev->kfd2kgd->get_local_mem_info(dev->kgd, &mem_info); + if (mem_info.local_mem_size_private == 0 && + mem_info.local_mem_size_public > 0) + return true; + return false; +} + +static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_alloc_memory_of_gpu_args *args = data; + struct kfd_process_device *pdd; + void *mem; + struct kfd_dev *dev; + int idr_handle; + long err; + uint64_t offset = args->mmap_offset; + uint32_t flags = args->flags; + + if (args->size == 0) + return -EINVAL; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + if ((flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) && + (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) && + !kfd_dev_is_large_bar(dev)) { + pr_err("Alloc host visible vram on small bar is not allowed\n"); + return -EINVAL; + } + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = PTR_ERR(pdd); + goto err_unlock; + } + + err = dev->kfd2kgd->alloc_memory_of_gpu( + dev->kgd, args->va_addr, args->size, + pdd->vm, (struct kgd_mem **) &mem, &offset, + flags); + + if (err) + goto err_unlock; + + idr_handle = kfd_process_device_create_obj_handle(pdd, mem); + if (idr_handle < 0) { + err = -EFAULT; + goto err_free; + } + + mutex_unlock(&p->mutex); + + args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); + args->mmap_offset = offset; + + return 0; + +err_free: + dev->kfd2kgd->free_memory_of_gpu(dev->kgd, (struct kgd_mem *)mem); +err_unlock: + mutex_unlock(&p->mutex); + return err; +} + +static int kfd_ioctl_free_memory_of_gpu(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_free_memory_of_gpu_args *args = data; + struct kfd_process_device *pdd; + void *mem; + struct kfd_dev *dev; + int ret; + + dev = kfd_device_by_id(GET_GPU_ID(args->handle)); + if (!dev) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + ret = -EINVAL; + goto err_unlock; + } + + mem = kfd_process_device_translate_handle( + pdd, GET_IDR_HANDLE(args->handle)); + if (!mem) { + ret = -EINVAL; + goto err_unlock; + } + + ret = dev->kfd2kgd->free_memory_of_gpu(dev->kgd, (struct kgd_mem *)mem); + + /* If freeing the buffer failed, leave the handle in place for + * clean-up during process tear-down. 
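+ * (handles left in the per-device IDR are reclaimed when the process is
+ * torn down)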
+ */ + if (!ret) + kfd_process_device_remove_obj_handle( + pdd, GET_IDR_HANDLE(args->handle)); + +err_unlock: + mutex_unlock(&p->mutex); + return ret; +} + +static int kfd_ioctl_map_memory_to_gpu(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_map_memory_to_gpu_args *args = data; + struct kfd_process_device *pdd, *peer_pdd; + void *mem; + struct kfd_dev *dev, *peer; + long err = 0; + int i; + uint32_t *devices_arr = NULL; + + dev = kfd_device_by_id(GET_GPU_ID(args->handle)); + if (!dev) + return -EINVAL; + + if (!args->n_devices) { + pr_debug("Device IDs array empty\n"); + return -EINVAL; + } + if (args->n_success > args->n_devices) { + pr_debug("n_success exceeds n_devices\n"); + return -EINVAL; + } + + devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr), + GFP_KERNEL); + if (!devices_arr) + return -ENOMEM; + + err = copy_from_user(devices_arr, + (void __user *)args->device_ids_array_ptr, + args->n_devices * sizeof(*devices_arr)); + if (err != 0) { + err = -EFAULT; + goto copy_from_user_failed; + } + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = PTR_ERR(pdd); + goto bind_process_to_device_failed; + } + + mem = kfd_process_device_translate_handle(pdd, + GET_IDR_HANDLE(args->handle)); + if (!mem) { + err = -ENOMEM; + goto get_mem_obj_from_handle_failed; + } + + for (i = args->n_success; i < args->n_devices; i++) { + peer = kfd_device_by_id(devices_arr[i]); + if (!peer) { + pr_debug("Getting device by id failed for 0x%x\n", + devices_arr[i]); + err = -EINVAL; + goto get_mem_obj_from_handle_failed; + } + + peer_pdd = kfd_bind_process_to_device(peer, p); + if (IS_ERR(peer_pdd)) { + err = PTR_ERR(peer_pdd); + goto get_mem_obj_from_handle_failed; + } + err = peer->kfd2kgd->map_memory_to_gpu( + peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm); + if (err) { + pr_err("Failed to map to gpu %d/%d\n", + i, args->n_devices); + goto map_memory_to_gpu_failed; + } + args->n_success = i+1; + } + + mutex_unlock(&p->mutex); + + err = dev->kfd2kgd->sync_memory(dev->kgd, (struct kgd_mem *) mem, true); + if (err) { + pr_debug("Sync memory failed, wait interrupted by user signal\n"); + goto sync_memory_failed; + } + + /* Flush TLBs after waiting for the page table updates to complete */ + for (i = 0; i < args->n_devices; i++) { + peer = kfd_device_by_id(devices_arr[i]); + if (WARN_ON_ONCE(!peer)) + continue; + peer_pdd = kfd_get_process_device_data(peer, p); + if (WARN_ON_ONCE(!peer_pdd)) + continue; + kfd_flush_tlb(peer_pdd); + } + + kfree(devices_arr); + + return err; + +bind_process_to_device_failed: +get_mem_obj_from_handle_failed: +map_memory_to_gpu_failed: + mutex_unlock(&p->mutex); +copy_from_user_failed: +sync_memory_failed: + kfree(devices_arr); + + return err; +} + +static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_unmap_memory_from_gpu_args *args = data; + struct kfd_process_device *pdd, *peer_pdd; + void *mem; + struct kfd_dev *dev, *peer; + long err = 0; + uint32_t *devices_arr = NULL, i; + + dev = kfd_device_by_id(GET_GPU_ID(args->handle)); + if (!dev) + return -EINVAL; + + if (!args->n_devices) { + pr_debug("Device IDs array empty\n"); + return -EINVAL; + } + if (args->n_success > args->n_devices) { + pr_debug("n_success exceeds n_devices\n"); + return -EINVAL; + } + + devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr), + GFP_KERNEL); + if (!devices_arr) + return -ENOMEM; + + err = copy_from_user(devices_arr, + 
(void __user *)args->device_ids_array_ptr, + args->n_devices * sizeof(*devices_arr)); + if (err != 0) { + err = -EFAULT; + goto copy_from_user_failed; + } + + mutex_lock(&p->mutex); + + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { + err = -EINVAL; + goto bind_process_to_device_failed; + } + + mem = kfd_process_device_translate_handle(pdd, + GET_IDR_HANDLE(args->handle)); + if (!mem) { + err = -ENOMEM; + goto get_mem_obj_from_handle_failed; + } + + for (i = args->n_success; i < args->n_devices; i++) { + peer = kfd_device_by_id(devices_arr[i]); + if (!peer) { + err = -EINVAL; + goto get_mem_obj_from_handle_failed; + } + + peer_pdd = kfd_get_process_device_data(peer, p); + if (!peer_pdd) { + err = -ENODEV; + goto get_mem_obj_from_handle_failed; + } + err = dev->kfd2kgd->unmap_memory_to_gpu( + peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm); + if (err) { + pr_err("Failed to unmap from gpu %d/%d\n", + i, args->n_devices); + goto unmap_memory_from_gpu_failed; + } + args->n_success = i+1; + } + kfree(devices_arr); + + mutex_unlock(&p->mutex); + + return 0; + +bind_process_to_device_failed: +get_mem_obj_from_handle_failed: +unmap_memory_from_gpu_failed: + mutex_unlock(&p->mutex); +copy_from_user_failed: + kfree(devices_arr); + return err; +} + +#define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ + [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ + .cmd_drv = 0, .name = #ioctl} + +/** Ioctl table */ +static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_VERSION, + kfd_ioctl_get_version, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_CREATE_QUEUE, + kfd_ioctl_create_queue, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DESTROY_QUEUE, + kfd_ioctl_destroy_queue, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_MEMORY_POLICY, + kfd_ioctl_set_memory_policy, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_CLOCK_COUNTERS, + kfd_ioctl_get_clock_counters, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES, + kfd_ioctl_get_process_apertures, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_UPDATE_QUEUE, + kfd_ioctl_update_queue, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_CREATE_EVENT, + kfd_ioctl_create_event, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DESTROY_EVENT, + kfd_ioctl_destroy_event, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_EVENT, + kfd_ioctl_set_event, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_RESET_EVENT, + kfd_ioctl_reset_event, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_WAIT_EVENTS, + kfd_ioctl_wait_events, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_REGISTER, + kfd_ioctl_dbg_register, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_UNREGISTER, + kfd_ioctl_dbg_unregister, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_ADDRESS_WATCH, + kfd_ioctl_dbg_address_watch, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL, + kfd_ioctl_dbg_wave_control, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_SCRATCH_BACKING_VA, + kfd_ioctl_set_scratch_backing_va, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG, + kfd_ioctl_get_tile_config, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER, + kfd_ioctl_set_trap_handler, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW, + kfd_ioctl_get_process_apertures_new, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_ACQUIRE_VM, + kfd_ioctl_acquire_vm, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, + kfd_ioctl_alloc_memory_of_gpu, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_FREE_MEMORY_OF_GPU, + kfd_ioctl_free_memory_of_gpu, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU, + kfd_ioctl_map_memory_to_gpu, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, + 
kfd_ioctl_unmap_memory_from_gpu, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK, + kfd_ioctl_set_cu_mask, 0), + +}; + +#define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) + +static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct kfd_process *process; + amdkfd_ioctl_t *func; + const struct amdkfd_ioctl_desc *ioctl = NULL; + unsigned int nr = _IOC_NR(cmd); + char stack_kdata[128]; + char *kdata = NULL; + unsigned int usize, asize; + int retcode = -EINVAL; + + if (nr >= AMDKFD_CORE_IOCTL_COUNT) + goto err_i1; + + if ((nr >= AMDKFD_COMMAND_START) && (nr < AMDKFD_COMMAND_END)) { + u32 amdkfd_size; + + ioctl = &amdkfd_ioctls[nr]; + + amdkfd_size = _IOC_SIZE(ioctl->cmd); + usize = asize = _IOC_SIZE(cmd); + if (amdkfd_size > asize) + asize = amdkfd_size; + + cmd = ioctl->cmd; + } else + goto err_i1; + + dev_dbg(kfd_device, "ioctl cmd 0x%x (#%d), arg 0x%lx\n", cmd, nr, arg); + + process = kfd_get_process(current); + if (IS_ERR(process)) { + dev_dbg(kfd_device, "no process\n"); + goto err_i1; + } + + /* Do not trust userspace, use our own definition */ + func = ioctl->func; + + if (unlikely(!func)) { + dev_dbg(kfd_device, "no function\n"); + retcode = -EINVAL; + goto err_i1; + } + + if (cmd & (IOC_IN | IOC_OUT)) { + if (asize <= sizeof(stack_kdata)) { + kdata = stack_kdata; + } else { + kdata = kmalloc(asize, GFP_KERNEL); + if (!kdata) { + retcode = -ENOMEM; + goto err_i1; + } + } + if (asize > usize) + memset(kdata + usize, 0, asize - usize); + } + + if (cmd & IOC_IN) { + if (copy_from_user(kdata, (void __user *)arg, usize) != 0) { + retcode = -EFAULT; + goto err_i1; + } + } else if (cmd & IOC_OUT) { + memset(kdata, 0, usize); + } + + retcode = func(filep, process, kdata); + + if (cmd & IOC_OUT) + if (copy_to_user((void __user *)arg, kdata, usize) != 0) + retcode = -EFAULT; + +err_i1: + if (!ioctl) + dev_dbg(kfd_device, "invalid ioctl: pid=%d, cmd=0x%02x, nr=0x%02x\n", + task_pid_nr(current), cmd, nr); + + if (kdata != stack_kdata) + kfree(kdata); + + if (retcode) + dev_dbg(kfd_device, "ret = %d\n", retcode); + + return retcode; +} + +static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct kfd_process *process; + struct kfd_dev *dev = NULL; + unsigned long vm_pgoff; + unsigned int gpu_id; + + process = kfd_get_process(current); + if (IS_ERR(process)) + return PTR_ERR(process); + + vm_pgoff = vma->vm_pgoff; + vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vm_pgoff); + gpu_id = KFD_MMAP_GPU_ID_GET(vm_pgoff); + if (gpu_id) + dev = kfd_device_by_id(gpu_id); + + switch (vm_pgoff & KFD_MMAP_TYPE_MASK) { + case KFD_MMAP_TYPE_DOORBELL: + if (!dev) + return -ENODEV; + return kfd_doorbell_mmap(dev, process, vma); + + case KFD_MMAP_TYPE_EVENTS: + return kfd_event_mmap(process, vma); + + case KFD_MMAP_TYPE_RESERVED_MEM: + if (!dev) + return -ENODEV; + return kfd_reserved_mem_mmap(dev, process, vma); + } + + return -EFAULT; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c new file mode 100644 index 000000000..e2780643f --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -0,0 +1,1308 @@ +/* + * Copyright 2015-2017 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/pci.h> +#include <linux/acpi.h> +#include "kfd_crat.h" +#include "kfd_priv.h" +#include "kfd_topology.h" +#include "kfd_iommu.h" + +/* GPU Processor ID base for dGPUs for which VCRAT needs to be created. + * GPU processor ID are expressed with Bit[31]=1. + * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs + * used in the CRAT. + */ +static uint32_t gpu_processor_id_low = 0x80001000; + +/* Return the next available gpu_processor_id and increment it for next GPU + * @total_cu_count - Total CUs present in the GPU including ones + * masked off + */ +static inline unsigned int get_and_inc_gpu_processor_id( + unsigned int total_cu_count) +{ + int current_id = gpu_processor_id_low; + + gpu_processor_id_low += total_cu_count; + return current_id; +} + +/* Static table to describe GPU Cache information */ +struct kfd_gpu_cache_info { + uint32_t cache_size; + uint32_t cache_level; + uint32_t flags; + /* Indicates how many Compute Units share this cache + * Value = 1 indicates the cache is not shared + */ + uint32_t num_cu_shared; +}; + +static struct kfd_gpu_cache_info kaveri_cache_info[] = { + { + /* TCP L1 Cache per CU */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 1, + + }, + { + /* Scalar L1 Instruction Cache (in SQC module) per bank */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_INST_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + { + /* Scalar L1 Data Cache (in SQC module) per bank */ + .cache_size = 8, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + + /* TODO: Add L2 Cache information */ +}; + + +static struct kfd_gpu_cache_info carrizo_cache_info[] = { + { + /* TCP L1 Cache per CU */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 1, + }, + { + /* Scalar L1 Instruction Cache (in SQC module) per bank */ + .cache_size = 8, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_INST_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 4, + }, + { + /* Scalar L1 Data Cache (in SQC module) per bank. 
*/ + .cache_size = 4, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 4, + }, + + /* TODO: Add L2 Cache information */ +}; + +/* NOTE: In future if more information is added to struct kfd_gpu_cache_info + * the following ASICs may need a separate table. + */ +#define hawaii_cache_info kaveri_cache_info +#define tonga_cache_info carrizo_cache_info +#define fiji_cache_info carrizo_cache_info +#define polaris10_cache_info carrizo_cache_info +#define polaris11_cache_info carrizo_cache_info +/* TODO - check & update Vega10 cache details */ +#define vega10_cache_info carrizo_cache_info +#define raven_cache_info carrizo_cache_info + +static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + dev->node_props.cpu_cores_count = cu->num_cpu_cores; + dev->node_props.cpu_core_id_base = cu->processor_id_low; + if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) + dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + + pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, + cu->processor_id_low); +} + +static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + dev->node_props.simd_id_base = cu->processor_id_low; + dev->node_props.simd_count = cu->num_simd_cores; + dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; + dev->node_props.max_waves_per_simd = cu->max_waves_simd; + dev->node_props.wave_front_size = cu->wave_front_size; + dev->node_props.array_count = cu->array_count; + dev->node_props.cu_per_simd_array = cu->num_cu_per_array; + dev->node_props.simd_per_cu = cu->num_simd_per_cu; + dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; + if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) + dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; + pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low); +} + +/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, + struct list_head *device_list) +{ + struct kfd_topology_device *dev; + + pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", + cu->proximity_domain, cu->hsa_capability); + list_for_each_entry(dev, device_list, list) { + if (cu->proximity_domain == dev->proximity_domain) { + if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) + kfd_populated_cu_info_cpu(dev, cu); + + if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) + kfd_populated_cu_info_gpu(dev, cu); + break; + } + } + + return 0; +} + +static struct kfd_mem_properties * +find_subtype_mem(uint32_t heap_type, uint32_t flags, uint32_t width, + struct kfd_topology_device *dev) +{ + struct kfd_mem_properties *props; + + list_for_each_entry(props, &dev->mem_props, list) { + if (props->heap_type == heap_type + && props->flags == flags + && props->width == width) + return props; + } + + return NULL; +} +/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, + struct list_head *device_list) +{ + struct kfd_mem_properties *props; + struct kfd_topology_device *dev; + uint32_t heap_type; + uint64_t size_in_bytes; + uint32_t flags = 0; + uint32_t width; + + pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n", + mem->proximity_domain); + list_for_each_entry(dev, 
device_list, list) { + if (mem->proximity_domain == dev->proximity_domain) { + /* We're on GPU node */ + if (dev->node_props.cpu_cores_count == 0) { + /* APU */ + if (mem->visibility_type == 0) + heap_type = + HSA_MEM_HEAP_TYPE_FB_PRIVATE; + /* dGPU */ + else + heap_type = mem->visibility_type; + } else + heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; + + if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) + flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; + if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) + flags |= HSA_MEM_FLAGS_NON_VOLATILE; + + size_in_bytes = + ((uint64_t)mem->length_high << 32) + + mem->length_low; + width = mem->width; + + /* Multiple banks of the same type are aggregated into + * one. User mode doesn't care about multiple physical + * memory segments. It's managed as a single virtual + * heap for user mode. + */ + props = find_subtype_mem(heap_type, flags, width, dev); + if (props) { + props->size_in_bytes += size_in_bytes; + break; + } + + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + + props->heap_type = heap_type; + props->flags = flags; + props->size_in_bytes = size_in_bytes; + props->width = width; + + dev->node_props.mem_banks_count++; + list_add_tail(&props->list, &dev->mem_props); + + break; + } + } + + return 0; +} + +/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, + struct list_head *device_list) +{ + struct kfd_cache_properties *props; + struct kfd_topology_device *dev; + uint32_t id; + uint32_t total_num_of_cu; + + id = cache->processor_id_low; + + pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id); + list_for_each_entry(dev, device_list, list) { + total_num_of_cu = (dev->node_props.array_count * + dev->node_props.cu_per_simd_array); + + /* Cache infomration in CRAT doesn't have proximity_domain + * information as it is associated with a CPU core or GPU + * Compute Unit. So map the cache using CPU core Id or SIMD + * (GPU) ID. + * TODO: This works because currently we can safely assume that + * Compute Units are parsed before caches are parsed. 
In + * future, remove this dependency + */ + if ((id >= dev->node_props.cpu_core_id_base && + id <= dev->node_props.cpu_core_id_base + + dev->node_props.cpu_cores_count) || + (id >= dev->node_props.simd_id_base && + id < dev->node_props.simd_id_base + + total_num_of_cu)) { + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + + props->processor_id_low = id; + props->cache_level = cache->cache_level; + props->cache_size = cache->cache_size; + props->cacheline_size = cache->cache_line_size; + props->cachelines_per_tag = cache->lines_per_tag; + props->cache_assoc = cache->associativity; + props->cache_latency = cache->cache_latency; + memcpy(props->sibling_map, cache->sibling_map, + sizeof(props->sibling_map)); + + if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) + props->cache_type |= HSA_CACHE_TYPE_DATA; + if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) + props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; + if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) + props->cache_type |= HSA_CACHE_TYPE_CPU; + if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) + props->cache_type |= HSA_CACHE_TYPE_HSACU; + + dev->cache_count++; + dev->node_props.caches_count++; + list_add_tail(&props->list, &dev->cache_props); + + break; + } + } + + return 0; +} + +/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, + struct list_head *device_list) +{ + struct kfd_iolink_properties *props = NULL, *props2; + struct kfd_topology_device *dev, *cpu_dev; + uint32_t id_from; + uint32_t id_to; + + id_from = iolink->proximity_domain_from; + id_to = iolink->proximity_domain_to; + + pr_debug("Found IO link entry in CRAT table with id_from=%d\n", + id_from); + list_for_each_entry(dev, device_list, list) { + if (id_from == dev->proximity_domain) { + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + + props->node_from = id_from; + props->node_to = id_to; + props->ver_maj = iolink->version_major; + props->ver_min = iolink->version_minor; + props->iolink_type = iolink->io_interface_type; + + if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) + props->weight = 20; + else + props->weight = node_distance(id_from, id_to); + + props->min_latency = iolink->minimum_latency; + props->max_latency = iolink->maximum_latency; + props->min_bandwidth = iolink->minimum_bandwidth_mbs; + props->max_bandwidth = iolink->maximum_bandwidth_mbs; + props->rec_transfer_size = + iolink->recommended_transfer_size; + + dev->io_link_count++; + dev->node_props.io_links_count++; + list_add_tail(&props->list, &dev->io_link_props); + break; + } + } + + /* CPU topology is created before GPUs are detected, so CPU->GPU + * links are not built at that time. If a PCIe type is discovered, it + * means a GPU is detected and we are adding GPU->CPU to the topology. + * At this time, also add the corresponded CPU->GPU link. 
+ */ + if (props && props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) { + cpu_dev = kfd_topology_device_by_proximity_domain(id_to); + if (!cpu_dev) + return -ENODEV; + /* same everything but the other direction */ + props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL); + props2->node_from = id_to; + props2->node_to = id_from; + props2->kobj = NULL; + cpu_dev->io_link_count++; + cpu_dev->node_props.io_links_count++; + list_add_tail(&props2->list, &cpu_dev->io_link_props); + } + + return 0; +} + +/* kfd_parse_subtype - parse subtypes and attach it to correct topology device + * present in the device_list + * @sub_type_hdr - subtype section of crat_image + * @device_list - list of topology devices present in this crat_image + */ +static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, + struct list_head *device_list) +{ + struct crat_subtype_computeunit *cu; + struct crat_subtype_memory *mem; + struct crat_subtype_cache *cache; + struct crat_subtype_iolink *iolink; + int ret = 0; + + switch (sub_type_hdr->type) { + case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: + cu = (struct crat_subtype_computeunit *)sub_type_hdr; + ret = kfd_parse_subtype_cu(cu, device_list); + break; + case CRAT_SUBTYPE_MEMORY_AFFINITY: + mem = (struct crat_subtype_memory *)sub_type_hdr; + ret = kfd_parse_subtype_mem(mem, device_list); + break; + case CRAT_SUBTYPE_CACHE_AFFINITY: + cache = (struct crat_subtype_cache *)sub_type_hdr; + ret = kfd_parse_subtype_cache(cache, device_list); + break; + case CRAT_SUBTYPE_TLB_AFFINITY: + /* + * For now, nothing to do here + */ + pr_debug("Found TLB entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: + /* + * For now, nothing to do here + */ + pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_IOLINK_AFFINITY: + iolink = (struct crat_subtype_iolink *)sub_type_hdr; + ret = kfd_parse_subtype_iolink(iolink, device_list); + break; + default: + pr_warn("Unknown subtype %d in CRAT\n", + sub_type_hdr->type); + } + + return ret; +} + +/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT + * create a kfd_topology_device and add in to device_list. 
Also parse + * CRAT subtypes and attach it to appropriate kfd_topology_device + * @crat_image - input image containing CRAT + * @device_list - [OUT] list of kfd_topology_device generated after + * parsing crat_image + * @proximity_domain - Proximity domain of the first device in the table + * + * Return - 0 if successful else -ve value + */ +int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, + uint32_t proximity_domain) +{ + struct kfd_topology_device *top_dev = NULL; + struct crat_subtype_generic *sub_type_hdr; + uint16_t node_id; + int ret = 0; + struct crat_header *crat_table = (struct crat_header *)crat_image; + uint16_t num_nodes; + uint32_t image_len; + + if (!crat_image) + return -EINVAL; + + if (!list_empty(device_list)) { + pr_warn("Error device list should be empty\n"); + return -EINVAL; + } + + num_nodes = crat_table->num_domains; + image_len = crat_table->length; + + pr_info("Parsing CRAT table with %d nodes\n", num_nodes); + + for (node_id = 0; node_id < num_nodes; node_id++) { + top_dev = kfd_create_topology_device(device_list); + if (!top_dev) + break; + top_dev->proximity_domain = proximity_domain++; + } + + if (!top_dev) { + ret = -ENOMEM; + goto err; + } + + memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH); + memcpy(top_dev->oem_table_id, crat_table->oem_table_id, + CRAT_OEMTABLEID_LENGTH); + top_dev->oem_revision = crat_table->oem_revision; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); + while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < + ((char *)crat_image) + image_len) { + if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { + ret = kfd_parse_subtype(sub_type_hdr, device_list); + if (ret) + break; + } + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + } + +err: + if (ret) + kfd_release_topology_device_list(device_list); + + return ret; +} + +/* Helper function. See kfd_fill_gpu_cache_info for parameter description */ +static int fill_in_pcache(struct crat_subtype_cache *pcache, + struct kfd_gpu_cache_info *pcache_info, + struct kfd_cu_info *cu_info, + int mem_available, + int cu_bitmask, + int cache_type, unsigned int cu_processor_id, + int cu_block) +{ + unsigned int cu_sibling_map_mask; + int first_active_cu; + + /* First check if enough memory is available */ + if (sizeof(struct crat_subtype_cache) > mem_available) + return -ENOMEM; + + cu_sibling_map_mask = cu_bitmask; + cu_sibling_map_mask >>= cu_block; + cu_sibling_map_mask &= + ((1 << pcache_info[cache_type].num_cu_shared) - 1); + first_active_cu = ffs(cu_sibling_map_mask); + + /* CU could be inactive. In case of shared cache find the first active + * CU. and incase of non-shared cache check if the CU is inactive. 
If + * inactive active skip it + */ + if (first_active_cu) { + memset(pcache, 0, sizeof(struct crat_subtype_cache)); + pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY; + pcache->length = sizeof(struct crat_subtype_cache); + pcache->flags = pcache_info[cache_type].flags; + pcache->processor_id_low = cu_processor_id + + (first_active_cu - 1); + pcache->cache_level = pcache_info[cache_type].cache_level; + pcache->cache_size = pcache_info[cache_type].cache_size; + + /* Sibling map is w.r.t processor_id_low, so shift out + * inactive CU + */ + cu_sibling_map_mask = + cu_sibling_map_mask >> (first_active_cu - 1); + + pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF); + pcache->sibling_map[1] = + (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF); + pcache->sibling_map[2] = + (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF); + pcache->sibling_map[3] = + (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF); + return 0; + } + return 1; +} + +/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info + * tables + * + * @kdev - [IN] GPU device + * @gpu_processor_id - [IN] GPU processor ID to which these caches + * associate + * @available_size - [IN] Amount of memory available in pcache + * @cu_info - [IN] Compute Unit info obtained from KGD + * @pcache - [OUT] memory into which cache data is to be filled in. + * @size_filled - [OUT] amount of data used up in pcache. + * @num_of_entries - [OUT] number of caches added + */ +static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, + int gpu_processor_id, + int available_size, + struct kfd_cu_info *cu_info, + struct crat_subtype_cache *pcache, + int *size_filled, + int *num_of_entries) +{ + struct kfd_gpu_cache_info *pcache_info; + int num_of_cache_types = 0; + int i, j, k; + int ct = 0; + int mem_available = available_size; + unsigned int cu_processor_id; + int ret; + + switch (kdev->device_info->asic_family) { + case CHIP_KAVERI: + pcache_info = kaveri_cache_info; + num_of_cache_types = ARRAY_SIZE(kaveri_cache_info); + break; + case CHIP_HAWAII: + pcache_info = hawaii_cache_info; + num_of_cache_types = ARRAY_SIZE(hawaii_cache_info); + break; + case CHIP_CARRIZO: + pcache_info = carrizo_cache_info; + num_of_cache_types = ARRAY_SIZE(carrizo_cache_info); + break; + case CHIP_TONGA: + pcache_info = tonga_cache_info; + num_of_cache_types = ARRAY_SIZE(tonga_cache_info); + break; + case CHIP_FIJI: + pcache_info = fiji_cache_info; + num_of_cache_types = ARRAY_SIZE(fiji_cache_info); + break; + case CHIP_POLARIS10: + pcache_info = polaris10_cache_info; + num_of_cache_types = ARRAY_SIZE(polaris10_cache_info); + break; + case CHIP_POLARIS11: + pcache_info = polaris11_cache_info; + num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); + break; + case CHIP_VEGA10: + pcache_info = vega10_cache_info; + num_of_cache_types = ARRAY_SIZE(vega10_cache_info); + break; + case CHIP_RAVEN: + pcache_info = raven_cache_info; + num_of_cache_types = ARRAY_SIZE(raven_cache_info); + break; + default: + return -EINVAL; + } + + *size_filled = 0; + *num_of_entries = 0; + + /* For each type of cache listed in the kfd_gpu_cache_info table, + * go through all available Compute Units. 
+ * The [i,j,k] loop will + * if kfd_gpu_cache_info.num_cu_shared = 1 + * will parse through all available CU + * If (kfd_gpu_cache_info.num_cu_shared != 1) + * then it will consider only one CU from + * the shared unit + */ + + for (ct = 0; ct < num_of_cache_types; ct++) { + cu_processor_id = gpu_processor_id; + for (i = 0; i < cu_info->num_shader_engines; i++) { + for (j = 0; j < cu_info->num_shader_arrays_per_engine; + j++) { + for (k = 0; k < cu_info->num_cu_per_sh; + k += pcache_info[ct].num_cu_shared) { + + ret = fill_in_pcache(pcache, + pcache_info, + cu_info, + mem_available, + cu_info->cu_bitmap[i][j], + ct, + cu_processor_id, + k); + + if (ret < 0) + break; + + if (!ret) { + pcache++; + (*num_of_entries)++; + mem_available -= + sizeof(*pcache); + (*size_filled) += + sizeof(*pcache); + } + + /* Move to next CU block */ + cu_processor_id += + pcache_info[ct].num_cu_shared; + } + } + } + } + + pr_debug("Added [%d] GPU cache entries\n", *num_of_entries); + + return 0; +} + +/* + * kfd_create_crat_image_acpi - Allocates memory for CRAT image and + * copies CRAT from ACPI (if available). + * NOTE: Call kfd_destroy_crat_image to free CRAT image memory + * + * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then + * crat_image will be NULL + * @size: [OUT] size of crat_image + * + * Return 0 if successful else return error code + */ +int kfd_create_crat_image_acpi(void **crat_image, size_t *size) +{ + struct acpi_table_header *crat_table; + acpi_status status; + void *pcrat_image; + + if (!crat_image) + return -EINVAL; + + *crat_image = NULL; + + /* Fetch the CRAT table from ACPI */ + status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); + if (status == AE_NOT_FOUND) { + pr_info("CRAT table not found\n"); + return -ENODATA; + } else if (ACPI_FAILURE(status)) { + const char *err = acpi_format_exception(status); + + pr_err("CRAT table error: %s\n", err); + return -EINVAL; + } + + if (ignore_crat) { + pr_info("CRAT table disabled by module option\n"); + return -ENODATA; + } + + pcrat_image = kmalloc(crat_table->length, GFP_KERNEL); + if (!pcrat_image) + return -ENOMEM; + + memcpy(pcrat_image, crat_table, crat_table->length); + + *crat_image = pcrat_image; + *size = crat_table->length; + + return 0; +} + +/* Memory required to create Virtual CRAT. + * Since there is no easy way to predict the amount of memory required, the + * following amount are allocated for CPU and GPU Virtual CRAT. This is + * expected to cover all known conditions. But to be safe additional check + * is put in the code to ensure we don't overwrite. 
+ */ +#define VCRAT_SIZE_FOR_CPU (2 * PAGE_SIZE) +#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE) + +/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node + * + * @numa_node_id: CPU NUMA node id + * @avail_size: Available size in the memory + * @sub_type_hdr: Memory into which compute info will be filled in + * + * Return 0 if successful else return -ve value + */ +static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size, + int proximity_domain, + struct crat_subtype_computeunit *sub_type_hdr) +{ + const struct cpumask *cpumask; + + *avail_size -= sizeof(struct crat_subtype_computeunit); + if (*avail_size < 0) + return -ENOMEM; + + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + cpumask = cpumask_of_node(numa_node_id); + + /* Fill in CU data */ + sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT; + sub_type_hdr->proximity_domain = proximity_domain; + sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id); + if (sub_type_hdr->processor_id_low == -1) + return -EINVAL; + + sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask); + + return 0; +} + +/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node + * + * @numa_node_id: CPU NUMA node id + * @avail_size: Available size in the memory + * @sub_type_hdr: Memory into which compute info will be filled in + * + * Return 0 if successful else return -ve value + */ +static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, + int proximity_domain, + struct crat_subtype_memory *sub_type_hdr) +{ + uint64_t mem_in_bytes = 0; + pg_data_t *pgdat; + int zone_type; + + *avail_size -= sizeof(struct crat_subtype_memory); + if (*avail_size < 0) + return -ENOMEM; + + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_memory); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill in Memory Subunit data */ + + /* Unlike si_meminfo, si_meminfo_node is not exported. 
So + * the following lines are duplicated from si_meminfo_node + * function + */ + pgdat = NODE_DATA(numa_node_id); + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; + mem_in_bytes <<= PAGE_SHIFT; + + sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); + sub_type_hdr->length_high = upper_32_bits(mem_in_bytes); + sub_type_hdr->proximity_domain = proximity_domain; + + return 0; +} + +static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size, + uint32_t *num_entries, + struct crat_subtype_iolink *sub_type_hdr) +{ + int nid; + struct cpuinfo_x86 *c = &cpu_data(0); + uint8_t link_type; + + if (c->x86_vendor == X86_VENDOR_AMD) + link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT; + else + link_type = CRAT_IOLINK_TYPE_QPI_1_1; + + *num_entries = 0; + + /* Create IO links from this node to other CPU nodes */ + for_each_online_node(nid) { + if (nid == numa_node_id) /* node itself */ + continue; + + *avail_size -= sizeof(struct crat_subtype_iolink); + if (*avail_size < 0) + return -ENOMEM; + + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_iolink); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill in IO link data */ + sub_type_hdr->proximity_domain_from = numa_node_id; + sub_type_hdr->proximity_domain_to = nid; + sub_type_hdr->io_interface_type = link_type; + + (*num_entries)++; + sub_type_hdr++; + } + + return 0; +} + +/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU + * + * @pcrat_image: Fill in VCRAT for CPU + * @size: [IN] allocated size of crat_image. + * [OUT] actual size of data filled in crat_image + */ +static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) +{ + struct crat_header *crat_table = (struct crat_header *)pcrat_image; + struct acpi_table_header *acpi_table; + acpi_status status; + struct crat_subtype_generic *sub_type_hdr; + int avail_size = *size; + int numa_node_id; + uint32_t entries = 0; + int ret = 0; + + if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_CPU) + return -EINVAL; + + /* Fill in CRAT Header. + * Modify length and total_entries as subunits are added. 
+ */ + avail_size -= sizeof(struct crat_header); + if (avail_size < 0) + return -ENOMEM; + + memset(crat_table, 0, sizeof(struct crat_header)); + memcpy(&crat_table->signature, CRAT_SIGNATURE, + sizeof(crat_table->signature)); + crat_table->length = sizeof(struct crat_header); + + status = acpi_get_table("DSDT", 0, &acpi_table); + if (status != AE_OK) + pr_warn("DSDT table not found for OEM information\n"); + else { + crat_table->oem_revision = acpi_table->revision; + memcpy(crat_table->oem_id, acpi_table->oem_id, + CRAT_OEMID_LENGTH); + memcpy(crat_table->oem_table_id, acpi_table->oem_table_id, + CRAT_OEMTABLEID_LENGTH); + } + crat_table->total_entries = 0; + crat_table->num_domains = 0; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); + + for_each_online_node(numa_node_id) { + if (kfd_numa_node_to_apic_id(numa_node_id) == -1) + continue; + + /* Fill in Subtype: Compute Unit */ + ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size, + crat_table->num_domains, + (struct crat_subtype_computeunit *)sub_type_hdr); + if (ret < 0) + return ret; + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + + /* Fill in Subtype: Memory */ + ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size, + crat_table->num_domains, + (struct crat_subtype_memory *)sub_type_hdr); + if (ret < 0) + return ret; + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + + /* Fill in Subtype: IO Link */ + ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size, + &entries, + (struct crat_subtype_iolink *)sub_type_hdr); + if (ret < 0) + return ret; + crat_table->length += (sub_type_hdr->length * entries); + crat_table->total_entries += entries; + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length * entries); + + crat_table->num_domains++; + } + + /* TODO: Add cache Subtype for CPU. + * Currently, CPU cache information is available in function + * detect_cache_attributes(cpu) defined in the file + * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not + * exported and to get the same information the code needs to be + * duplicated. 
+ */ + + *size = crat_table->length; + pr_info("Virtual CRAT table created for CPU\n"); + + return 0; +} + +static int kfd_fill_gpu_memory_affinity(int *avail_size, + struct kfd_dev *kdev, uint8_t type, uint64_t size, + struct crat_subtype_memory *sub_type_hdr, + uint32_t proximity_domain, + const struct kfd_local_mem_info *local_mem_info) +{ + *avail_size -= sizeof(struct crat_subtype_memory); + if (*avail_size < 0) + return -ENOMEM; + + memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); + sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_memory); + sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; + + sub_type_hdr->proximity_domain = proximity_domain; + + pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n", + type, size); + + sub_type_hdr->length_low = lower_32_bits(size); + sub_type_hdr->length_high = upper_32_bits(size); + + sub_type_hdr->width = local_mem_info->vram_width; + sub_type_hdr->visibility_type = type; + + return 0; +} + +/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU + * to its NUMA node + * @avail_size: Available size in the memory + * @kdev - [IN] GPU device + * @sub_type_hdr: Memory into which io link info will be filled in + * @proximity_domain - proximity domain of the GPU node + * + * Return 0 if successful else return -ve value + */ +static int kfd_fill_gpu_direct_io_link(int *avail_size, + struct kfd_dev *kdev, + struct crat_subtype_iolink *sub_type_hdr, + uint32_t proximity_domain) +{ + *avail_size -= sizeof(struct crat_subtype_iolink); + if (*avail_size < 0) + return -ENOMEM; + + memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_iolink); + sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill in IOLINK subtype. + * TODO: Fill-in other fields of iolink subtype + */ + sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS; + sub_type_hdr->proximity_domain_from = proximity_domain; +#ifdef CONFIG_NUMA + if (kdev->pdev->dev.numa_node == NUMA_NO_NODE) + sub_type_hdr->proximity_domain_to = 0; + else + sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node; +#else + sub_type_hdr->proximity_domain_to = 0; +#endif + return 0; +} + +/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU + * + * @pcrat_image: Fill in VCRAT for GPU + * @size: [IN] allocated size of crat_image. + * [OUT] actual size of data filled in crat_image + */ +static int kfd_create_vcrat_image_gpu(void *pcrat_image, + size_t *size, struct kfd_dev *kdev, + uint32_t proximity_domain) +{ + struct crat_header *crat_table = (struct crat_header *)pcrat_image; + struct crat_subtype_generic *sub_type_hdr; + struct crat_subtype_computeunit *cu; + struct kfd_cu_info cu_info; + int avail_size = *size; + uint32_t total_num_of_cu; + int num_of_cache_entries = 0; + int cache_mem_filled = 0; + int ret = 0; + struct kfd_local_mem_info local_mem_info; + + if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU) + return -EINVAL; + + /* Fill the CRAT Header. + * Modify length and total_entries as subunits are added. 
+ */ + avail_size -= sizeof(struct crat_header); + if (avail_size < 0) + return -ENOMEM; + + memset(crat_table, 0, sizeof(struct crat_header)); + + memcpy(&crat_table->signature, CRAT_SIGNATURE, + sizeof(crat_table->signature)); + /* Change length as we add more subtypes*/ + crat_table->length = sizeof(struct crat_header); + crat_table->num_domains = 1; + crat_table->total_entries = 0; + + /* Fill in Subtype: Compute Unit + * First fill in the sub type header and then sub type data + */ + avail_size -= sizeof(struct crat_subtype_computeunit); + if (avail_size < 0) + return -ENOMEM; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); + + sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill CU subtype data */ + cu = (struct crat_subtype_computeunit *)sub_type_hdr; + cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT; + cu->proximity_domain = proximity_domain; + + kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info); + cu->num_simd_per_cu = cu_info.simd_per_cu; + cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; + cu->max_waves_simd = cu_info.max_waves_per_simd; + + cu->wave_front_size = cu_info.wave_front_size; + cu->array_count = cu_info.num_shader_arrays_per_engine * + cu_info.num_shader_engines; + total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh); + cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu); + cu->num_cu_per_array = cu_info.num_cu_per_sh; + cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu; + cu->num_banks = cu_info.num_shader_engines; + cu->lds_size_in_kb = cu_info.lds_size; + + cu->hsa_capability = 0; + + /* Check if this node supports IOMMU. During parsing this flag will + * translate to HSA_CAP_ATS_PRESENT + */ + if (!kfd_iommu_check_device(kdev)) + cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; + + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + /* Fill in Subtype: Memory. Only on systems with large BAR (no + * private FB), report memory as public. On other systems + * report the total FB size (public+private) as a single + * private heap. + */ + kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info); + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + + if (debug_largebar) + local_mem_info.local_mem_size_private = 0; + + if (local_mem_info.local_mem_size_private == 0) + ret = kfd_fill_gpu_memory_affinity(&avail_size, + kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC, + local_mem_info.local_mem_size_public, + (struct crat_subtype_memory *)sub_type_hdr, + proximity_domain, + &local_mem_info); + else + ret = kfd_fill_gpu_memory_affinity(&avail_size, + kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE, + local_mem_info.local_mem_size_public + + local_mem_info.local_mem_size_private, + (struct crat_subtype_memory *)sub_type_hdr, + proximity_domain, + &local_mem_info); + if (ret < 0) + return ret; + + crat_table->length += sizeof(struct crat_subtype_memory); + crat_table->total_entries++; + + /* TODO: Fill in cache information. 
This information is NOT readily + * available in KGD + */ + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low, + avail_size, + &cu_info, + (struct crat_subtype_cache *)sub_type_hdr, + &cache_mem_filled, + &num_of_cache_entries); + + if (ret < 0) + return ret; + + crat_table->length += cache_mem_filled; + crat_table->total_entries += num_of_cache_entries; + avail_size -= cache_mem_filled; + + /* Fill in Subtype: IO_LINKS + * Only direct links are added here which is Link from GPU to + * to its NUMA node. Indirect links are added by userspace. + */ + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + cache_mem_filled); + ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev, + (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain); + + if (ret < 0) + return ret; + + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + *size = crat_table->length; + pr_info("Virtual CRAT table created for GPU\n"); + + return ret; +} + +/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and + * creates a Virtual CRAT (VCRAT) image + * + * NOTE: Call kfd_destroy_crat_image to free CRAT image memory + * + * @crat_image: VCRAT image created because ACPI does not have a + * CRAT for this device + * @size: [OUT] size of virtual crat_image + * @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device + * COMPUTE_UNIT_GPU - Create VCRAT for GPU + * (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU + * -- this option is not currently implemented. + * The assumption is that all AMD APUs will have CRAT + * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU + * + * Return 0 if successful else return -ve value + */ +int kfd_create_crat_image_virtual(void **crat_image, size_t *size, + int flags, struct kfd_dev *kdev, + uint32_t proximity_domain) +{ + void *pcrat_image = NULL; + int ret = 0; + + if (!crat_image) + return -EINVAL; + + *crat_image = NULL; + + /* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and + * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover + * all the current conditions. A check is put not to overwrite beyond + * allocated size + */ + switch (flags) { + case COMPUTE_UNIT_CPU: + pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL); + if (!pcrat_image) + return -ENOMEM; + *size = VCRAT_SIZE_FOR_CPU; + ret = kfd_create_vcrat_image_cpu(pcrat_image, size); + break; + case COMPUTE_UNIT_GPU: + if (!kdev) + return -EINVAL; + pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL); + if (!pcrat_image) + return -ENOMEM; + *size = VCRAT_SIZE_FOR_GPU; + ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev, + proximity_domain); + break; + case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU): + /* TODO: */ + ret = -EINVAL; + pr_err("VCRAT not implemented for APU\n"); + break; + default: + ret = -EINVAL; + } + + if (!ret) + *crat_image = pcrat_image; + else + kfree(pcrat_image); + + return ret; +} + + +/* kfd_destroy_crat_image + * + * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..) + * + */ +void kfd_destroy_crat_image(void *crat_image) +{ + kfree(crat_image); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h new file mode 100644 index 000000000..b5cd182b9 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h @@ -0,0 +1,320 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef KFD_CRAT_H_INCLUDED +#define KFD_CRAT_H_INCLUDED + +#include <linux/types.h> + +#pragma pack(1) + +/* + * 4CC signature values for the CRAT and CDIT ACPI tables + */ + +#define CRAT_SIGNATURE "CRAT" +#define CDIT_SIGNATURE "CDIT" + +/* + * Component Resource Association Table (CRAT) + */ + +#define CRAT_OEMID_LENGTH 6 +#define CRAT_OEMTABLEID_LENGTH 8 +#define CRAT_RESERVED_LENGTH 6 + +#define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) + +/* Compute Unit flags */ +#define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */ +#define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */ + +struct crat_header { + uint32_t signature; + uint32_t length; + uint8_t revision; + uint8_t checksum; + uint8_t oem_id[CRAT_OEMID_LENGTH]; + uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; + uint32_t oem_revision; + uint32_t creator_id; + uint32_t creator_revision; + uint32_t total_entries; + uint16_t num_domains; + uint8_t reserved[CRAT_RESERVED_LENGTH]; +}; + +/* + * The header structure is immediately followed by total_entries of the + * data definitions + */ + +/* + * The currently defined subtype entries in the CRAT + */ +#define CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY 0 +#define CRAT_SUBTYPE_MEMORY_AFFINITY 1 +#define CRAT_SUBTYPE_CACHE_AFFINITY 2 +#define CRAT_SUBTYPE_TLB_AFFINITY 3 +#define CRAT_SUBTYPE_CCOMPUTE_AFFINITY 4 +#define CRAT_SUBTYPE_IOLINK_AFFINITY 5 +#define CRAT_SUBTYPE_MAX 6 + +#define CRAT_SIBLINGMAP_SIZE 32 + +/* + * ComputeUnit Affinity structure and definitions + */ +#define CRAT_CU_FLAGS_ENABLED 0x00000001 +#define CRAT_CU_FLAGS_HOT_PLUGGABLE 0x00000002 +#define CRAT_CU_FLAGS_CPU_PRESENT 0x00000004 +#define CRAT_CU_FLAGS_GPU_PRESENT 0x00000008 +#define CRAT_CU_FLAGS_IOMMU_PRESENT 0x00000010 +#define CRAT_CU_FLAGS_RESERVED 0xffffffe0 + +#define CRAT_COMPUTEUNIT_RESERVED_LENGTH 4 + +struct crat_subtype_computeunit { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t proximity_domain; + uint32_t processor_id_low; + uint16_t num_cpu_cores; + uint16_t num_simd_cores; + uint16_t max_waves_simd; + uint16_t io_count; + uint16_t hsa_capability; + uint16_t lds_size_in_kb; + uint8_t wave_front_size; + uint8_t num_banks; + uint16_t micro_engine_id; + uint8_t array_count; + uint8_t num_cu_per_array; + uint8_t num_simd_per_cu; + uint8_t max_slots_scatch_cu; + uint8_t reserved2[CRAT_COMPUTEUNIT_RESERVED_LENGTH]; +}; + +/* + * HSA Memory Affinity 
structure and definitions + */ +#define CRAT_MEM_FLAGS_ENABLED 0x00000001 +#define CRAT_MEM_FLAGS_HOT_PLUGGABLE 0x00000002 +#define CRAT_MEM_FLAGS_NON_VOLATILE 0x00000004 +#define CRAT_MEM_FLAGS_RESERVED 0xfffffff8 + +#define CRAT_MEMORY_RESERVED_LENGTH 8 + +struct crat_subtype_memory { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t proximity_domain; + uint32_t base_addr_low; + uint32_t base_addr_high; + uint32_t length_low; + uint32_t length_high; + uint32_t width; + uint8_t visibility_type; /* for virtual (dGPU) CRAT */ + uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1]; +}; + +/* + * HSA Cache Affinity structure and definitions + */ +#define CRAT_CACHE_FLAGS_ENABLED 0x00000001 +#define CRAT_CACHE_FLAGS_DATA_CACHE 0x00000002 +#define CRAT_CACHE_FLAGS_INST_CACHE 0x00000004 +#define CRAT_CACHE_FLAGS_CPU_CACHE 0x00000008 +#define CRAT_CACHE_FLAGS_SIMD_CACHE 0x00000010 +#define CRAT_CACHE_FLAGS_RESERVED 0xffffffe0 + +#define CRAT_CACHE_RESERVED_LENGTH 8 + +struct crat_subtype_cache { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t processor_id_low; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + uint32_t cache_size; + uint8_t cache_level; + uint8_t lines_per_tag; + uint16_t cache_line_size; + uint8_t associativity; + uint8_t cache_properties; + uint16_t cache_latency; + uint8_t reserved2[CRAT_CACHE_RESERVED_LENGTH]; +}; + +/* + * HSA TLB Affinity structure and definitions + */ +#define CRAT_TLB_FLAGS_ENABLED 0x00000001 +#define CRAT_TLB_FLAGS_DATA_TLB 0x00000002 +#define CRAT_TLB_FLAGS_INST_TLB 0x00000004 +#define CRAT_TLB_FLAGS_CPU_TLB 0x00000008 +#define CRAT_TLB_FLAGS_SIMD_TLB 0x00000010 +#define CRAT_TLB_FLAGS_RESERVED 0xffffffe0 + +#define CRAT_TLB_RESERVED_LENGTH 4 + +struct crat_subtype_tlb { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t processor_id_low; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + uint32_t tlb_level; + uint8_t data_tlb_associativity_2mb; + uint8_t data_tlb_size_2mb; + uint8_t instruction_tlb_associativity_2mb; + uint8_t instruction_tlb_size_2mb; + uint8_t data_tlb_associativity_4k; + uint8_t data_tlb_size_4k; + uint8_t instruction_tlb_associativity_4k; + uint8_t instruction_tlb_size_4k; + uint8_t data_tlb_associativity_1gb; + uint8_t data_tlb_size_1gb; + uint8_t instruction_tlb_associativity_1gb; + uint8_t instruction_tlb_size_1gb; + uint8_t reserved2[CRAT_TLB_RESERVED_LENGTH]; +}; + +/* + * HSA CCompute/APU Affinity structure and definitions + */ +#define CRAT_CCOMPUTE_FLAGS_ENABLED 0x00000001 +#define CRAT_CCOMPUTE_FLAGS_RESERVED 0xfffffffe + +#define CRAT_CCOMPUTE_RESERVED_LENGTH 16 + +struct crat_subtype_ccompute { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t processor_id_low; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + uint32_t apu_size; + uint8_t reserved2[CRAT_CCOMPUTE_RESERVED_LENGTH]; +}; + +/* + * HSA IO Link Affinity structure and definitions + */ +#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) +#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) +#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) +#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) +#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) +#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0 + +/* + * IO interface types + */ +#define CRAT_IOLINK_TYPE_UNDEFINED 0 +#define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1 +#define CRAT_IOLINK_TYPE_PCIEXPRESS 2 +#define CRAT_IOLINK_TYPE_AMBA 3 +#define CRAT_IOLINK_TYPE_MIPI 4 +#define CRAT_IOLINK_TYPE_QPI_1_1 5 
+#define CRAT_IOLINK_TYPE_RESERVED1 6 +#define CRAT_IOLINK_TYPE_RESERVED2 7 +#define CRAT_IOLINK_TYPE_RAPID_IO 8 +#define CRAT_IOLINK_TYPE_INFINIBAND 9 +#define CRAT_IOLINK_TYPE_RESERVED3 10 +#define CRAT_IOLINK_TYPE_OTHER 11 +#define CRAT_IOLINK_TYPE_MAX 255 + +#define CRAT_IOLINK_RESERVED_LENGTH 24 + +struct crat_subtype_iolink { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t proximity_domain_from; + uint32_t proximity_domain_to; + uint8_t io_interface_type; + uint8_t version_major; + uint16_t version_minor; + uint32_t minimum_latency; + uint32_t maximum_latency; + uint32_t minimum_bandwidth_mbs; + uint32_t maximum_bandwidth_mbs; + uint32_t recommended_transfer_size; + uint8_t reserved2[CRAT_IOLINK_RESERVED_LENGTH]; +}; + +/* + * HSA generic sub-type header + */ + +#define CRAT_SUBTYPE_FLAGS_ENABLED 0x00000001 + +struct crat_subtype_generic { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; +}; + +/* + * Component Locality Distance Information Table (CDIT) + */ +#define CDIT_OEMID_LENGTH 6 +#define CDIT_OEMTABLEID_LENGTH 8 + +struct cdit_header { + uint32_t signature; + uint32_t length; + uint8_t revision; + uint8_t checksum; + uint8_t oem_id[CDIT_OEMID_LENGTH]; + uint8_t oem_table_id[CDIT_OEMTABLEID_LENGTH]; + uint32_t oem_revision; + uint32_t creator_id; + uint32_t creator_revision; + uint32_t total_entries; + uint16_t num_domains; + uint8_t entry[1]; +}; + +#pragma pack() + +struct kfd_dev; + +int kfd_create_crat_image_acpi(void **crat_image, size_t *size); +void kfd_destroy_crat_image(void *crat_image); +int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, + uint32_t proximity_domain); +int kfd_create_crat_image_virtual(void **crat_image, size_t *size, + int flags, struct kfd_dev *kdev, + uint32_t proximity_domain); + +#endif /* KFD_CRAT_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c new file mode 100644 index 000000000..a3441b0e3 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c @@ -0,0 +1,845 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/log2.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/device.h> + +#include "kfd_pm4_headers.h" +#include "kfd_pm4_headers_diq.h" +#include "kfd_kernel_queue.h" +#include "kfd_priv.h" +#include "kfd_pm4_opcodes.h" +#include "cik_regs.h" +#include "kfd_dbgmgr.h" +#include "kfd_dbgdev.h" +#include "kfd_device_queue_manager.h" + +static void dbgdev_address_watch_disable_nodiq(struct kfd_dev *dev) +{ + dev->kfd2kgd->address_watch_disable(dev->kgd); +} + +static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + unsigned int pasid, uint64_t vmid0_address, + uint32_t *packet_buff, size_t size_in_bytes) +{ + struct pm4__release_mem *rm_packet; + struct pm4__indirect_buffer_pasid *ib_packet; + struct kfd_mem_obj *mem_obj; + size_t pq_packets_size_in_bytes; + union ULARGE_INTEGER *largep; + union ULARGE_INTEGER addr; + struct kernel_queue *kq; + uint64_t *rm_state; + unsigned int *ib_packet_buff; + int status; + + if (WARN_ON(!size_in_bytes)) + return -EINVAL; + + kq = dbgdev->kq; + + pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + + sizeof(struct pm4__indirect_buffer_pasid); + + /* + * We acquire a buffer from DIQ + * The receive packet buff will be sitting on the Indirect Buffer + * and in the PQ we put the IB packet + sync packet(s). + */ + status = kq->ops.acquire_packet_buffer(kq, + pq_packets_size_in_bytes / sizeof(uint32_t), + &ib_packet_buff); + if (status) { + pr_err("acquire_packet_buffer failed\n"); + return status; + } + + memset(ib_packet_buff, 0, pq_packets_size_in_bytes); + + ib_packet = (struct pm4__indirect_buffer_pasid *) (ib_packet_buff); + + ib_packet->header.count = 3; + ib_packet->header.opcode = IT_INDIRECT_BUFFER_PASID; + ib_packet->header.type = PM4_TYPE_3; + + largep = (union ULARGE_INTEGER *) &vmid0_address; + + ib_packet->bitfields2.ib_base_lo = largep->u.low_part >> 2; + ib_packet->bitfields3.ib_base_hi = largep->u.high_part; + + ib_packet->control = (1 << 23) | (1 << 31) | + ((size_in_bytes / 4) & 0xfffff); + + ib_packet->bitfields5.pasid = pasid; + + /* + * for now we use release mem for GPU-CPU synchronization + * Consider WaitRegMem + WriteData as a better alternative + * we get a GART allocations ( gpu/cpu mapping), + * for the sync variable, and wait until: + * (a) Sync with HW + * (b) Sync var is written by CP to mem. 
+ */ + rm_packet = (struct pm4__release_mem *) (ib_packet_buff + + (sizeof(struct pm4__indirect_buffer_pasid) / + sizeof(unsigned int))); + + status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), + &mem_obj); + + if (status) { + pr_err("Failed to allocate GART memory\n"); + kq->ops.rollback_packet(kq); + return status; + } + + rm_state = (uint64_t *) mem_obj->cpu_ptr; + + *rm_state = QUEUESTATE__ACTIVE_COMPLETION_PENDING; + + rm_packet->header.opcode = IT_RELEASE_MEM; + rm_packet->header.type = PM4_TYPE_3; + rm_packet->header.count = sizeof(struct pm4__release_mem) / 4 - 2; + + rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; + rm_packet->bitfields2.event_index = + event_index___release_mem__end_of_pipe; + + rm_packet->bitfields2.cache_policy = cache_policy___release_mem__lru; + rm_packet->bitfields2.atc = 0; + rm_packet->bitfields2.tc_wb_action_ena = 1; + + addr.quad_part = mem_obj->gpu_addr; + + rm_packet->bitfields4.address_lo_32b = addr.u.low_part >> 2; + rm_packet->address_hi = addr.u.high_part; + + rm_packet->bitfields3.data_sel = + data_sel___release_mem__send_64_bit_data; + + rm_packet->bitfields3.int_sel = + int_sel___release_mem__send_data_after_write_confirm; + + rm_packet->bitfields3.dst_sel = + dst_sel___release_mem__memory_controller; + + rm_packet->data_lo = QUEUESTATE__ACTIVE; + + kq->ops.submit_packet(kq); + + /* Wait till CP writes sync code: */ + status = amdkfd_fence_wait_timeout( + (unsigned int *) rm_state, + QUEUESTATE__ACTIVE, 1500); + + kfd_gtt_sa_free(dbgdev->dev, mem_obj); + + return status; +} + +static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev) +{ + /* + * no action is needed in this case, + * just make sure diq will not be used + */ + + dbgdev->kq = NULL; + + return 0; +} + +static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) +{ + struct queue_properties properties; + unsigned int qid; + struct kernel_queue *kq = NULL; + int status; + + properties.type = KFD_QUEUE_TYPE_DIQ; + + status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, + &properties, &qid); + + if (status) { + pr_err("Failed to create DIQ\n"); + return status; + } + + pr_debug("DIQ Created with queue id: %d\n", qid); + + kq = pqm_get_kernel_queue(dbgdev->pqm, qid); + + if (!kq) { + pr_err("Error getting DIQ\n"); + pqm_destroy_queue(dbgdev->pqm, qid); + return -EFAULT; + } + + dbgdev->kq = kq; + + return status; +} + +static int dbgdev_unregister_nodiq(struct kfd_dbgdev *dbgdev) +{ + /* disable watch address */ + dbgdev_address_watch_disable_nodiq(dbgdev->dev); + return 0; +} + +static int dbgdev_unregister_diq(struct kfd_dbgdev *dbgdev) +{ + /* todo - disable address watch */ + int status; + + status = pqm_destroy_queue(dbgdev->pqm, + dbgdev->kq->queue->properties.queue_id); + dbgdev->kq = NULL; + + return status; +} + +static void dbgdev_address_watch_set_registers( + const struct dbg_address_watch_info *adw_info, + union TCP_WATCH_ADDR_H_BITS *addrHi, + union TCP_WATCH_ADDR_L_BITS *addrLo, + union TCP_WATCH_CNTL_BITS *cntl, + unsigned int index, unsigned int vmid) +{ + union ULARGE_INTEGER addr; + + addr.quad_part = 0; + addrHi->u32All = 0; + addrLo->u32All = 0; + cntl->u32All = 0; + + if (adw_info->watch_mask) + cntl->bitfields.mask = + (uint32_t) (adw_info->watch_mask[index] & + ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK); + else + cntl->bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; + + addr.quad_part = (unsigned long long) adw_info->watch_address[index]; + + addrHi->bitfields.addr = addr.u.high_part & + ADDRESS_WATCH_REG_ADDHIGH_MASK; + 
addrLo->bitfields.addr =
+ (addr.u.low_part >> ADDRESS_WATCH_REG_ADDLOW_SHIFT);
+
+ cntl->bitfields.mode = adw_info->watch_mode[index];
+ cntl->bitfields.vmid = (uint32_t) vmid;
+ /* for now assume it is an ATC address */
+ cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT;
+
+ pr_debug("\t\t%20s %08x\n", "set reg mask :", cntl->bitfields.mask);
+ pr_debug("\t\t%20s %08x\n", "set reg add high :",
+ addrHi->bitfields.addr);
+ pr_debug("\t\t%20s %08x\n", "set reg add low :",
+ addrLo->bitfields.addr);
+}
+
+static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev,
+ struct dbg_address_watch_info *adw_info)
+{
+ union TCP_WATCH_ADDR_H_BITS addrHi;
+ union TCP_WATCH_ADDR_L_BITS addrLo;
+ union TCP_WATCH_CNTL_BITS cntl;
+ struct kfd_process_device *pdd;
+ unsigned int i;
+
+ /* taking the vmid for that process in a safe way using pdd */
+ pdd = kfd_get_process_device_data(dbgdev->dev,
+ adw_info->process);
+ if (!pdd) {
+ pr_err("Failed to get pdd for wave control no DIQ\n");
+ return -EFAULT;
+ }
+
+ addrHi.u32All = 0;
+ addrLo.u32All = 0;
+ cntl.u32All = 0;
+
+ if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) ||
+ (adw_info->num_watch_points == 0)) {
+ pr_err("num_watch_points is invalid\n");
+ return -EINVAL;
+ }
+
+ if (!adw_info->watch_mode || !adw_info->watch_address) {
+ pr_err("adw_info fields are not valid\n");
+ return -EINVAL;
+ }
+
+ for (i = 0; i < adw_info->num_watch_points; i++) {
+ dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo,
+ &cntl, i, pdd->qpd.vmid);
+
+ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");
+ pr_debug("\t\t%20s %08x\n", "register index :", i);
+ pr_debug("\t\t%20s %08x\n", "vmid is :", pdd->qpd.vmid);
+ pr_debug("\t\t%20s %08x\n", "Address Low is :",
+ addrLo.bitfields.addr);
+ pr_debug("\t\t%20s %08x\n", "Address high is :",
+ addrHi.bitfields.addr);
+ pr_debug("\t\t%20s %08x\n", "Control Mask is :",
+ cntl.bitfields.mask);
+ pr_debug("\t\t%20s %08x\n", "Control Mode is :",
+ cntl.bitfields.mode);
+ pr_debug("\t\t%20s %08x\n", "Control Vmid is :",
+ cntl.bitfields.vmid);
+ pr_debug("\t\t%20s %08x\n", "Control atc is :",
+ cntl.bitfields.atc);
+ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");
+
+ pdd->dev->kfd2kgd->address_watch_execute(
+ dbgdev->dev->kgd,
+ i,
+ cntl.u32All,
+ addrHi.u32All,
+ addrLo.u32All);
+ }
+
+ return 0;
+}
+
+static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev,
+ struct dbg_address_watch_info *adw_info)
+{
+ struct pm4__set_config_reg *packets_vec;
+ union TCP_WATCH_ADDR_H_BITS addrHi;
+ union TCP_WATCH_ADDR_L_BITS addrLo;
+ union TCP_WATCH_CNTL_BITS cntl;
+ struct kfd_mem_obj *mem_obj;
+ unsigned int aw_reg_add_dword;
+ uint32_t *packet_buff_uint;
+ unsigned int i;
+ int status;
+ size_t ib_size = sizeof(struct pm4__set_config_reg) * 4;
+ /* we do not control the vmid in DIQ mode, just a placeholder */
+ unsigned int vmid = 0;
+
+ addrHi.u32All = 0;
+ addrLo.u32All = 0;
+ cntl.u32All = 0;
+
+ if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) ||
+ (adw_info->num_watch_points == 0)) {
+ pr_err("num_watch_points is invalid\n");
+ return -EINVAL;
+ }
+
+ if (!adw_info->watch_mode || !adw_info->watch_address) {
+ pr_err("adw_info fields are not valid\n");
+ return -EINVAL;
+ }
+
+ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj);
+
+ if (status) {
+ pr_err("Failed to allocate GART memory\n");
+ return status;
+ }
+
+ packet_buff_uint = mem_obj->cpu_ptr;
+
+ memset(packet_buff_uint,
0, ib_size); + + packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); + + packets_vec[0].header.count = 1; + packets_vec[0].header.opcode = IT_SET_CONFIG_REG; + packets_vec[0].header.type = PM4_TYPE_3; + packets_vec[0].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; + packets_vec[0].bitfields2.insert_vmid = 1; + packets_vec[1].ordinal1 = packets_vec[0].ordinal1; + packets_vec[1].bitfields2.insert_vmid = 0; + packets_vec[2].ordinal1 = packets_vec[0].ordinal1; + packets_vec[2].bitfields2.insert_vmid = 0; + packets_vec[3].ordinal1 = packets_vec[0].ordinal1; + packets_vec[3].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; + packets_vec[3].bitfields2.insert_vmid = 1; + + for (i = 0; i < adw_info->num_watch_points; i++) { + dbgdev_address_watch_set_registers(adw_info, + &addrHi, + &addrLo, + &cntl, + i, + vmid); + + pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); + pr_debug("\t\t%20s %08x\n", "register index :", i); + pr_debug("\t\t%20s %08x\n", "vmid is :", vmid); + pr_debug("\t\t%20s %p\n", "Add ptr is :", + adw_info->watch_address); + pr_debug("\t\t%20s %08llx\n", "Add is :", + adw_info->watch_address[i]); + pr_debug("\t\t%20s %08x\n", "Address Low is :", + addrLo.bitfields.addr); + pr_debug("\t\t%20s %08x\n", "Address high is :", + addrHi.bitfields.addr); + pr_debug("\t\t%20s %08x\n", "Control Mask is :", + cntl.bitfields.mask); + pr_debug("\t\t%20s %08x\n", "Control Mode is :", + cntl.bitfields.mode); + pr_debug("\t\t%20s %08x\n", "Control Vmid is :", + cntl.bitfields.vmid); + pr_debug("\t\t%20s %08x\n", "Control atc is :", + cntl.bitfields.atc); + pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); + + aw_reg_add_dword = + dbgdev->dev->kfd2kgd->address_watch_get_offset( + dbgdev->dev->kgd, + i, + ADDRESS_WATCH_REG_CNTL); + + packets_vec[0].bitfields2.reg_offset = + aw_reg_add_dword - AMD_CONFIG_REG_BASE; + + packets_vec[0].reg_data[0] = cntl.u32All; + + aw_reg_add_dword = + dbgdev->dev->kfd2kgd->address_watch_get_offset( + dbgdev->dev->kgd, + i, + ADDRESS_WATCH_REG_ADDR_HI); + + packets_vec[1].bitfields2.reg_offset = + aw_reg_add_dword - AMD_CONFIG_REG_BASE; + packets_vec[1].reg_data[0] = addrHi.u32All; + + aw_reg_add_dword = + dbgdev->dev->kfd2kgd->address_watch_get_offset( + dbgdev->dev->kgd, + i, + ADDRESS_WATCH_REG_ADDR_LO); + + packets_vec[2].bitfields2.reg_offset = + aw_reg_add_dword - AMD_CONFIG_REG_BASE; + packets_vec[2].reg_data[0] = addrLo.u32All; + + /* enable watch flag if address is not zero*/ + if (adw_info->watch_address[i] > 0) + cntl.bitfields.valid = 1; + else + cntl.bitfields.valid = 0; + + aw_reg_add_dword = + dbgdev->dev->kfd2kgd->address_watch_get_offset( + dbgdev->dev->kgd, + i, + ADDRESS_WATCH_REG_CNTL); + + packets_vec[3].bitfields2.reg_offset = + aw_reg_add_dword - AMD_CONFIG_REG_BASE; + packets_vec[3].reg_data[0] = cntl.u32All; + + status = dbgdev_diq_submit_ib( + dbgdev, + adw_info->process->pasid, + mem_obj->gpu_addr, + packet_buff_uint, + ib_size); + + if (status) { + pr_err("Failed to submit IB to DIQ\n"); + break; + } + } + + kfd_gtt_sa_free(dbgdev->dev, mem_obj); + return status; +} + +static int dbgdev_wave_control_set_registers( + struct dbg_wave_control_info *wac_info, + union SQ_CMD_BITS *in_reg_sq_cmd, + union GRBM_GFX_INDEX_BITS *in_reg_gfx_index) +{ + int status = 0; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct HsaDbgWaveMsgAMDGen2 *pMsg; + + reg_sq_cmd.u32All = 0; + reg_gfx_index.u32All = 0; + pMsg = &wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2; + + switch 
(wac_info->mode) { + /* Send command to single wave */ + case HSA_DBG_WAVEMODE_SINGLE: + /* + * Limit access to the process waves only, + * by setting vmid check + */ + reg_sq_cmd.bits.check_vmid = 1; + reg_sq_cmd.bits.simd_id = pMsg->ui32.SIMD; + reg_sq_cmd.bits.wave_id = pMsg->ui32.WaveId; + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_SINGLE; + + reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; + reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; + reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; + + break; + + /* Send command to all waves with matching VMID */ + case HSA_DBG_WAVEMODE_BROADCAST_PROCESS: + + reg_gfx_index.bits.sh_broadcast_writes = 1; + reg_gfx_index.bits.se_broadcast_writes = 1; + reg_gfx_index.bits.instance_broadcast_writes = 1; + + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; + + break; + + /* Send command to all CU waves with matching VMID */ + case HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU: + + reg_sq_cmd.bits.check_vmid = 1; + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; + + reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; + reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; + reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; + + break; + + default: + return -EINVAL; + } + + switch (wac_info->operand) { + case HSA_DBG_WAVEOP_HALT: + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; + break; + + case HSA_DBG_WAVEOP_RESUME: + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; + break; + + case HSA_DBG_WAVEOP_KILL: + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_KILL; + break; + + case HSA_DBG_WAVEOP_DEBUG: + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_DEBUG; + break; + + case HSA_DBG_WAVEOP_TRAP: + if (wac_info->trapId < MAX_TRAPID) { + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_TRAP; + reg_sq_cmd.bits.trap_id = wac_info->trapId; + } else { + status = -EINVAL; + } + break; + + default: + status = -EINVAL; + break; + } + + if (status == 0) { + *in_reg_sq_cmd = reg_sq_cmd; + *in_reg_gfx_index = reg_gfx_index; + } + + return status; +} + +static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, + struct dbg_wave_control_info *wac_info) +{ + + int status; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct kfd_mem_obj *mem_obj; + uint32_t *packet_buff_uint; + struct pm4__set_config_reg *packets_vec; + size_t ib_size = sizeof(struct pm4__set_config_reg) * 3; + + reg_sq_cmd.u32All = 0; + + status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, + ®_gfx_index); + if (status) { + pr_err("Failed to set wave control registers\n"); + return status; + } + + /* we do not control the VMID in DIQ, so reset it to a known value */ + reg_sq_cmd.bits.vm_id = 0; + + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); + + pr_debug("\t\t mode is: %u\n", wac_info->mode); + pr_debug("\t\t operand is: %u\n", wac_info->operand); + pr_debug("\t\t trap id is: %u\n", wac_info->trapId); + pr_debug("\t\t msg value is: %u\n", + wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); + pr_debug("\t\t vmid is: N/A\n"); + + pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); + pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); + pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); + pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); + pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); + pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); + pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); + + pr_debug("\t\t ibw is : %u\n", + 
reg_gfx_index.bitfields.instance_broadcast_writes); + pr_debug("\t\t ii is : %u\n", + reg_gfx_index.bitfields.instance_index); + pr_debug("\t\t sebw is : %u\n", + reg_gfx_index.bitfields.se_broadcast_writes); + pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); + pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); + pr_debug("\t\t sbw is : %u\n", + reg_gfx_index.bitfields.sh_broadcast_writes); + + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); + + status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); + + if (status != 0) { + pr_err("Failed to allocate GART memory\n"); + return status; + } + + packet_buff_uint = mem_obj->cpu_ptr; + + memset(packet_buff_uint, 0, ib_size); + + packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; + packets_vec[0].header.count = 1; + packets_vec[0].header.opcode = IT_SET_UCONFIG_REG; + packets_vec[0].header.type = PM4_TYPE_3; + packets_vec[0].bitfields2.reg_offset = + GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE; + + packets_vec[0].bitfields2.insert_vmid = 0; + packets_vec[0].reg_data[0] = reg_gfx_index.u32All; + + packets_vec[1].header.count = 1; + packets_vec[1].header.opcode = IT_SET_CONFIG_REG; + packets_vec[1].header.type = PM4_TYPE_3; + packets_vec[1].bitfields2.reg_offset = SQ_CMD / 4 - AMD_CONFIG_REG_BASE; + + packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET; + packets_vec[1].bitfields2.insert_vmid = 1; + packets_vec[1].reg_data[0] = reg_sq_cmd.u32All; + + /* Restore the GRBM_GFX_INDEX register */ + + reg_gfx_index.u32All = 0; + reg_gfx_index.bits.sh_broadcast_writes = 1; + reg_gfx_index.bits.instance_broadcast_writes = 1; + reg_gfx_index.bits.se_broadcast_writes = 1; + + + packets_vec[2].ordinal1 = packets_vec[0].ordinal1; + packets_vec[2].bitfields2.reg_offset = + GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE; + + packets_vec[2].bitfields2.insert_vmid = 0; + packets_vec[2].reg_data[0] = reg_gfx_index.u32All; + + status = dbgdev_diq_submit_ib( + dbgdev, + wac_info->process->pasid, + mem_obj->gpu_addr, + packet_buff_uint, + ib_size); + + if (status) + pr_err("Failed to submit IB to DIQ\n"); + + kfd_gtt_sa_free(dbgdev->dev, mem_obj); + + return status; +} + +static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev, + struct dbg_wave_control_info *wac_info) +{ + int status; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct kfd_process_device *pdd; + + reg_sq_cmd.u32All = 0; + + /* taking the VMID for that process on the safe way using PDD */ + pdd = kfd_get_process_device_data(dbgdev->dev, wac_info->process); + + if (!pdd) { + pr_err("Failed to get pdd for wave control no DIQ\n"); + return -EFAULT; + } + status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, + ®_gfx_index); + if (status) { + pr_err("Failed to set wave control registers\n"); + return status; + } + + /* for non DIQ we need to patch the VMID: */ + + reg_sq_cmd.bits.vm_id = pdd->qpd.vmid; + + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); + + pr_debug("\t\t mode is: %u\n", wac_info->mode); + pr_debug("\t\t operand is: %u\n", wac_info->operand); + pr_debug("\t\t trap id is: %u\n", wac_info->trapId); + pr_debug("\t\t msg value is: %u\n", + wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); + pr_debug("\t\t vmid is: %u\n", pdd->qpd.vmid); + + pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); + pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); + pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); + 
pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); + pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); + pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); + pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); + + pr_debug("\t\t ibw is : %u\n", + reg_gfx_index.bitfields.instance_broadcast_writes); + pr_debug("\t\t ii is : %u\n", + reg_gfx_index.bitfields.instance_index); + pr_debug("\t\t sebw is : %u\n", + reg_gfx_index.bitfields.se_broadcast_writes); + pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); + pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); + pr_debug("\t\t sbw is : %u\n", + reg_gfx_index.bitfields.sh_broadcast_writes); + + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); + + return dbgdev->dev->kfd2kgd->wave_control_execute(dbgdev->dev->kgd, + reg_gfx_index.u32All, + reg_sq_cmd.u32All); +} + +int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) +{ + int status = 0; + unsigned int vmid; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct kfd_process_device *pdd; + struct dbg_wave_control_info wac_info; + int first_vmid_to_scan = dev->vm_info.first_vmid_kfd; + int last_vmid_to_scan = dev->vm_info.last_vmid_kfd; + + reg_sq_cmd.u32All = 0; + status = 0; + + wac_info.mode = HSA_DBG_WAVEMODE_BROADCAST_PROCESS; + wac_info.operand = HSA_DBG_WAVEOP_KILL; + + pr_debug("Killing all process wavefronts\n"); + + /* Scan all registers in the range ATC_VMID8_PASID_MAPPING .. + * ATC_VMID15_PASID_MAPPING + * to check which VMID the current process is mapped to. + */ + + for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) { + if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid + (dev->kgd, vmid)) { + if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid + (dev->kgd, vmid) == p->pasid) { + pr_debug("Killing wave fronts of vmid %d and pasid %d\n", + vmid, p->pasid); + break; + } + } + } + + if (vmid > last_vmid_to_scan) { + pr_err("Didn't find vmid for pasid %d\n", p->pasid); + return -EFAULT; + } + + /* taking the VMID for that process on the safe way using PDD */ + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) + return -EFAULT; + + status = dbgdev_wave_control_set_registers(&wac_info, ®_sq_cmd, + ®_gfx_index); + if (status != 0) + return -EINVAL; + + /* for non DIQ we need to patch the VMID: */ + reg_sq_cmd.bits.vm_id = vmid; + + dev->kfd2kgd->wave_control_execute(dev->kgd, + reg_gfx_index.u32All, + reg_sq_cmd.u32All); + + return 0; +} + +void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, + enum DBGDEV_TYPE type) +{ + pdbgdev->dev = pdev; + pdbgdev->kq = NULL; + pdbgdev->type = type; + pdbgdev->pqm = NULL; + + switch (type) { + case DBGDEV_TYPE_NODIQ: + pdbgdev->dbgdev_register = dbgdev_register_nodiq; + pdbgdev->dbgdev_unregister = dbgdev_unregister_nodiq; + pdbgdev->dbgdev_wave_control = dbgdev_wave_control_nodiq; + pdbgdev->dbgdev_address_watch = dbgdev_address_watch_nodiq; + break; + case DBGDEV_TYPE_DIQ: + default: + pdbgdev->dbgdev_register = dbgdev_register_diq; + pdbgdev->dbgdev_unregister = dbgdev_unregister_diq; + pdbgdev->dbgdev_wave_control = dbgdev_wave_control_diq; + pdbgdev->dbgdev_address_watch = dbgdev_address_watch_diq; + break; + } + +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h new file mode 100644 index 000000000..0619c777b --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h @@ -0,0 +1,230 @@ +/* + * Copyright 2014 
Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef KFD_DBGDEV_H_ +#define KFD_DBGDEV_H_ + +enum { + SQ_CMD_VMID_OFFSET = 28, + ADDRESS_WATCH_CNTL_OFFSET = 24 +}; + +enum { + PRIV_QUEUE_SYNC_TIME_MS = 200 +}; + +/* CONTEXT reg space definition */ +enum { + CONTEXT_REG_BASE = 0xA000, + CONTEXT_REG_END = 0xA400, + CONTEXT_REG_SIZE = CONTEXT_REG_END - CONTEXT_REG_BASE +}; + +/* USER CONFIG reg space definition */ +enum { + USERCONFIG_REG_BASE = 0xC000, + USERCONFIG_REG_END = 0x10000, + USERCONFIG_REG_SIZE = USERCONFIG_REG_END - USERCONFIG_REG_BASE +}; + +/* CONFIG reg space definition */ +enum { + AMD_CONFIG_REG_BASE = 0x2000, /* in dwords */ + AMD_CONFIG_REG_END = 0x2B00, + AMD_CONFIG_REG_SIZE = AMD_CONFIG_REG_END - AMD_CONFIG_REG_BASE +}; + +/* SH reg space definition */ +enum { + SH_REG_BASE = 0x2C00, + SH_REG_END = 0x3000, + SH_REG_SIZE = SH_REG_END - SH_REG_BASE +}; + +/* SQ_CMD definitions */ +#define SQ_CMD 0x8DEC + +enum SQ_IND_CMD_CMD { + SQ_IND_CMD_CMD_NULL = 0x00000000, + SQ_IND_CMD_CMD_HALT = 0x00000001, + SQ_IND_CMD_CMD_RESUME = 0x00000002, + SQ_IND_CMD_CMD_KILL = 0x00000003, + SQ_IND_CMD_CMD_DEBUG = 0x00000004, + SQ_IND_CMD_CMD_TRAP = 0x00000005, +}; + +enum SQ_IND_CMD_MODE { + SQ_IND_CMD_MODE_SINGLE = 0x00000000, + SQ_IND_CMD_MODE_BROADCAST = 0x00000001, + SQ_IND_CMD_MODE_BROADCAST_QUEUE = 0x00000002, + SQ_IND_CMD_MODE_BROADCAST_PIPE = 0x00000003, + SQ_IND_CMD_MODE_BROADCAST_ME = 0x00000004, +}; + +union SQ_IND_INDEX_BITS { + struct { + uint32_t wave_id:4; + uint32_t simd_id:2; + uint32_t thread_id:6; + uint32_t:1; + uint32_t force_read:1; + uint32_t read_timeout:1; + uint32_t unindexed:1; + uint32_t index:16; + + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +union SQ_IND_CMD_BITS { + struct { + uint32_t data:32; + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +union SQ_CMD_BITS { + struct { + uint32_t cmd:3; + uint32_t:1; + uint32_t mode:3; + uint32_t check_vmid:1; + uint32_t trap_id:3; + uint32_t:5; + uint32_t wave_id:4; + uint32_t simd_id:2; + uint32_t:2; + uint32_t queue_id:3; + uint32_t:1; + uint32_t vm_id:4; + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +union SQ_IND_DATA_BITS { + struct { + uint32_t data:32; + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +union GRBM_GFX_INDEX_BITS { + struct { + uint32_t instance_index:8; + uint32_t sh_index:8; + uint32_t 
se_index:8; + uint32_t:5; + uint32_t sh_broadcast_writes:1; + uint32_t instance_broadcast_writes:1; + uint32_t se_broadcast_writes:1; + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +union TCP_WATCH_ADDR_H_BITS { + struct { + uint32_t addr:16; + uint32_t:16; + + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +union TCP_WATCH_ADDR_L_BITS { + struct { + uint32_t:6; + uint32_t addr:26; + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +enum { + QUEUESTATE__INVALID = 0, /* so by default we'll get invalid state */ + QUEUESTATE__ACTIVE_COMPLETION_PENDING, + QUEUESTATE__ACTIVE +}; + +union ULARGE_INTEGER { + struct { + uint32_t low_part; + uint32_t high_part; + } u; + unsigned long long quad_part; +}; + + +#define KFD_CIK_VMID_START_OFFSET (8) +#define KFD_CIK_VMID_END_OFFSET (KFD_CIK_VMID_START_OFFSET + (8)) + + +void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, + enum DBGDEV_TYPE type); + +union TCP_WATCH_CNTL_BITS { + struct { + uint32_t mask:24; + uint32_t vmid:4; + uint32_t atc:1; + uint32_t mode:2; + uint32_t valid:1; + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +enum { + ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL, + ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF, + ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000, + /* extend the mask to 26 bits in order to match the low address field */ + ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6, + ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF +}; + +enum { + MAX_TRAPID = 8, /* 3 bits in the bitfield. */ + MAX_WATCH_ADDRESSES = 4 +}; + +enum { + ADDRESS_WATCH_REG_ADDR_HI = 0, + ADDRESS_WATCH_REG_ADDR_LO, + ADDRESS_WATCH_REG_CNTL, + ADDRESS_WATCH_REG_MAX +}; + +#endif /* KFD_DBGDEV_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c new file mode 100644 index 000000000..9d4af961c --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c @@ -0,0 +1,158 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+
+#include "kfd_priv.h"
+#include "cik_regs.h"
+#include "kfd_pm4_headers.h"
+#include "kfd_pm4_headers_diq.h"
+#include "kfd_dbgmgr.h"
+#include "kfd_dbgdev.h"
+#include "kfd_device_queue_manager.h"
+
+static DEFINE_MUTEX(kfd_dbgmgr_mutex);
+
+struct mutex *kfd_get_dbgmgr_mutex(void)
+{
+ return &kfd_dbgmgr_mutex;
+}
+
+
+static void kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr)
+{
+ kfree(pmgr->dbgdev);
+
+ pmgr->dbgdev = NULL;
+ pmgr->pasid = 0;
+ pmgr->dev = NULL;
+}
+
+void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr)
+{
+ if (pmgr) {
+ kfd_dbgmgr_uninitialize(pmgr);
+ kfree(pmgr);
+ }
+}
+
+bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev)
+{
+ enum DBGDEV_TYPE type = DBGDEV_TYPE_DIQ;
+ struct kfd_dbgmgr *new_buff;
+
+ if (WARN_ON(!pdev->init_complete))
+ return false;
+
+ new_buff = kfd_alloc_struct(new_buff);
+ if (!new_buff) {
+ pr_err("Failed to allocate dbgmgr instance\n");
+ return false;
+ }
+
+ new_buff->pasid = 0;
+ new_buff->dev = pdev;
+ new_buff->dbgdev = kfd_alloc_struct(new_buff->dbgdev);
+ if (!new_buff->dbgdev) {
+ pr_err("Failed to allocate dbgdev instance\n");
+ kfree(new_buff);
+ return false;
+ }
+
+ /* get actual type of DBGDevice cpsch or not */
+ if (pdev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
+ type = DBGDEV_TYPE_NODIQ;
+
+ kfd_dbgdev_init(new_buff->dbgdev, pdev, type);
+ *ppmgr = new_buff;
+
+ return true;
+}
+
+long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p)
+{
+ if (pmgr->pasid != 0) {
+ pr_debug("H/W debugger is already active using pasid %d\n",
+ pmgr->pasid);
+ return -EBUSY;
+ }
+
+ /* remember pasid */
+ pmgr->pasid = p->pasid;
+
+ /* provide the pqm for diq generation */
+ pmgr->dbgdev->pqm = &p->pqm;
+
+ /* activate the actual registering */
+ pmgr->dbgdev->dbgdev_register(pmgr->dbgdev);
+
+ return 0;
+}
+
+long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p)
+{
+ /* Is the request coming from the already registered process? */
+ if (pmgr->pasid != p->pasid) {
+ pr_debug("H/W debugger is not registered by calling pasid %d\n",
+ p->pasid);
+ return -EINVAL;
+ }
+
+ pmgr->dbgdev->dbgdev_unregister(pmgr->dbgdev);
+
+ pmgr->pasid = 0;
+
+ return 0;
+}
+
+long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr,
+ struct dbg_wave_control_info *wac_info)
+{
+ /* Is the request coming from the already registered process? */
+ if (pmgr->pasid != wac_info->process->pasid) {
+ pr_debug("H/W debugger support was not registered for requester pasid %d\n",
+ wac_info->process->pasid);
+ return -EINVAL;
+ }
+
+ return (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, wac_info);
+}
+
+long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr,
+ struct dbg_address_watch_info *adw_info)
+{
+ /* Is the request coming from the already registered process? */
+ if (pmgr->pasid != adw_info->process->pasid) {
+ pr_debug("H/W debugger support was not registered for requester pasid %d\n",
+ adw_info->process->pasid);
+ return -EINVAL;
+ }
+
+ return (long) pmgr->dbgdev->dbgdev_address_watch(pmgr->dbgdev,
+ adw_info);
+}
+
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h
new file mode 100644
index 000000000..a04a1fe1d
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h
@@ -0,0 +1,293 @@
+/*
+ * Copyright 2014 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_DBGMGR_H_ +#define KFD_DBGMGR_H_ + +#include "kfd_priv.h" + +/* must align with hsakmttypes definition */ +#pragma pack(push, 4) + +enum HSA_DBG_WAVEOP { + HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ + HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ + HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ + HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter dbg mode */ + HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take a trap */ + HSA_DBG_NUM_WAVEOP = 5, + HSA_DBG_MAX_WAVEOP = 0xFFFFFFFF +}; + +enum HSA_DBG_WAVEMODE { + /* send command to a single wave */ + HSA_DBG_WAVEMODE_SINGLE = 0, + /* + * Broadcast to all wavefronts of all processes is not + * supported for HSA user mode + */ + + /* send to waves within current process */ + HSA_DBG_WAVEMODE_BROADCAST_PROCESS = 2, + /* send to waves within current process on CU */ + HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU = 3, + HSA_DBG_NUM_WAVEMODE = 3, + HSA_DBG_MAX_WAVEMODE = 0xFFFFFFFF +}; + +enum HSA_DBG_WAVEMSG_TYPE { + HSA_DBG_WAVEMSG_AUTO = 0, + HSA_DBG_WAVEMSG_USER = 1, + HSA_DBG_WAVEMSG_ERROR = 2, + HSA_DBG_NUM_WAVEMSG, + HSA_DBG_MAX_WAVEMSG = 0xFFFFFFFF +}; + +enum HSA_DBG_WATCH_MODE { + HSA_DBG_WATCH_READ = 0, /* Read operations only */ + HSA_DBG_WATCH_NONREAD = 1, /* Write or Atomic operations only */ + HSA_DBG_WATCH_ATOMIC = 2, /* Atomic Operations only */ + HSA_DBG_WATCH_ALL = 3, /* Read, Write or Atomic operations */ + HSA_DBG_WATCH_NUM, + HSA_DBG_WATCH_SIZE = 0xFFFFFFFF +}; + +/* This structure is hardware specific and may change in the future */ +struct HsaDbgWaveMsgAMDGen2 { + union { + struct ui32 { + uint32_t UserData:8; /* user data */ + uint32_t ShaderArray:1; /* Shader array */ + uint32_t Priv:1; /* Privileged */ + uint32_t Reserved0:4; /* Reserved, should be 0 */ + uint32_t WaveId:4; /* wave id */ + uint32_t SIMD:2; /* SIMD id */ + uint32_t HSACU:4; /* Compute unit */ + uint32_t ShaderEngine:2;/* Shader engine */ + uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */ + uint32_t Reserved1:4; /* Reserved, should be 0 */ + } ui32; + uint32_t Value; + }; + uint32_t Reserved2; +}; + +union HsaDbgWaveMessageAMD { + struct HsaDbgWaveMsgAMDGen2 WaveMsgInfoGen2; + /* for future HsaDbgWaveMsgAMDGen3; */ +}; + +struct HsaDbgWaveMessage { + void *MemoryVA; /* ptr to associated host-accessible data */ + union HsaDbgWaveMessageAMD DbgWaveMsg; +}; + +/* + * TODO: This definitions to be MOVED to kfd_event, once it is implemented. 
+ * + * HSA sync primitive, Event and HW Exception notification API definitions. + * The API functions allow the runtime to define a so-called sync-primitive, + * a SW object combining a user-mode provided "syncvar" and a scheduler event + * that can be signaled through a defined GPU interrupt. A syncvar is + * a process virtual memory location of a certain size that can be accessed + * by CPU and GPU shader code within the process to set and query the content + * within that memory. The definition of the content is determined by the HSA + * runtime and potentially GPU shader code interfacing with the HSA runtime. + * The syncvar values may be commonly written through an PM4 WRITE_DATA packet + * in the user mode instruction stream. The OS scheduler event is typically + * associated and signaled by an interrupt issued by the GPU, but other HSA + * system interrupt conditions from other HW (e.g. IOMMUv2) may be surfaced + * by the KFD by this mechanism, too. + */ + +/* these are the new definitions for events */ +enum HSA_EVENTTYPE { + HSA_EVENTTYPE_SIGNAL = 0, /* user-mode generated GPU signal */ + HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */ + HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change + * (start/stop) + */ + HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */ + HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */ + HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */ + HSA_EVENTTYPE_PROFILE_EVENT = 6,/* GPU signal for profiling */ + HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state + * (EOP pm4) + */ + /* ... */ + HSA_EVENTTYPE_MAXID, + HSA_EVENTTYPE_TYPE_SIZE = 0xFFFFFFFF +}; + +/* Sub-definitions for various event types: Syncvar */ +struct HsaSyncVar { + union SyncVar { + void *UserData; /* pointer to user mode data */ + uint64_t UserDataPtrValue; /* 64bit compatibility of value */ + } SyncVar; + uint64_t SyncVarSize; +}; + +/* Sub-definitions for various event types: NodeChange */ + +enum HSA_EVENTTYPE_NODECHANGE_FLAGS { + HSA_EVENTTYPE_NODECHANGE_ADD = 0, + HSA_EVENTTYPE_NODECHANGE_REMOVE = 1, + HSA_EVENTTYPE_NODECHANGE_SIZE = 0xFFFFFFFF +}; + +struct HsaNodeChange { + /* HSA node added/removed on the platform */ + enum HSA_EVENTTYPE_NODECHANGE_FLAGS Flags; +}; + +/* Sub-definitions for various event types: DeviceStateChange */ +enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS { + /* device started (and available) */ + HSA_EVENTTYPE_DEVICESTATUSCHANGE_START = 0, + /* device stopped (i.e. 
unavailable) */ + HSA_EVENTTYPE_DEVICESTATUSCHANGE_STOP = 1, + HSA_EVENTTYPE_DEVICESTATUSCHANGE_SIZE = 0xFFFFFFFF +}; + +enum HSA_DEVICE { + HSA_DEVICE_CPU = 0, + HSA_DEVICE_GPU = 1, + MAX_HSA_DEVICE = 2 +}; + +struct HsaDeviceStateChange { + uint32_t NodeId; /* F-NUMA node that contains the device */ + enum HSA_DEVICE Device; /* device type: GPU or CPU */ + enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS Flags; /* event flags */ +}; + +struct HsaEventData { + enum HSA_EVENTTYPE EventType; /* event type */ + union EventData { + /* + * return data associated with HSA_EVENTTYPE_SIGNAL + * and other events + */ + struct HsaSyncVar SyncVar; + + /* data associated with HSA_EVENTTYPE_NODE_CHANGE */ + struct HsaNodeChange NodeChangeState; + + /* data associated with HSA_EVENTTYPE_DEVICE_STATE_CHANGE */ + struct HsaDeviceStateChange DeviceState; + } EventData; + + /* the following data entries are internal to the KFD & thunk itself */ + + /* internal thunk store for Event data (OsEventHandle) */ + uint64_t HWData1; + /* internal thunk store for Event data (HWAddress) */ + uint64_t HWData2; + /* internal thunk store for Event data (HWData) */ + uint32_t HWData3; +}; + +struct HsaEventDescriptor { + /* event type to allocate */ + enum HSA_EVENTTYPE EventType; + /* H-NUMA node containing GPU device that is event source */ + uint32_t NodeId; + /* pointer to user mode syncvar data, syncvar->UserDataPtrValue + * may be NULL + */ + struct HsaSyncVar SyncVar; +}; + +struct HsaEvent { + uint32_t EventId; + struct HsaEventData EventData; +}; + +#pragma pack(pop) + +enum DBGDEV_TYPE { + DBGDEV_TYPE_ILLEGAL = 0, + DBGDEV_TYPE_NODIQ = 1, + DBGDEV_TYPE_DIQ = 2, + DBGDEV_TYPE_TEST = 3 +}; + +struct dbg_address_watch_info { + struct kfd_process *process; + enum HSA_DBG_WATCH_MODE *watch_mode; + uint64_t *watch_address; + uint64_t *watch_mask; + struct HsaEvent *watch_event; + uint32_t num_watch_points; +}; + +struct dbg_wave_control_info { + struct kfd_process *process; + uint32_t trapId; + enum HSA_DBG_WAVEOP operand; + enum HSA_DBG_WAVEMODE mode; + struct HsaDbgWaveMessage dbgWave_msg; +}; + +struct kfd_dbgdev { + + /* The device that owns this data. */ + struct kfd_dev *dev; + + /* kernel queue for DIQ */ + struct kernel_queue *kq; + + /* a pointer to the pqm of the calling process */ + struct process_queue_manager *pqm; + + /* type of debug device ( DIQ, non DIQ, etc. 
) */ + enum DBGDEV_TYPE type; + + /* virtualized function pointers to device dbg */ + int (*dbgdev_register)(struct kfd_dbgdev *dbgdev); + int (*dbgdev_unregister)(struct kfd_dbgdev *dbgdev); + int (*dbgdev_address_watch)(struct kfd_dbgdev *dbgdev, + struct dbg_address_watch_info *adw_info); + int (*dbgdev_wave_control)(struct kfd_dbgdev *dbgdev, + struct dbg_wave_control_info *wac_info); + +}; + +struct kfd_dbgmgr { + unsigned int pasid; + struct kfd_dev *dev; + struct kfd_dbgdev *dbgdev; +}; + +/* prototypes for debug manager functions */ +struct mutex *kfd_get_dbgmgr_mutex(void); +void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr); +bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev); +long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p); +long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p); +long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, + struct dbg_wave_control_info *wac_info); +long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, + struct dbg_address_watch_info *adw_info); +#endif /* KFD_DBGMGR_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c new file mode 100644 index 000000000..ab37d36d9 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c @@ -0,0 +1,123 @@ +/* + * Copyright 2016-2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <linux/debugfs.h> +#include <linux/uaccess.h> + +#include "kfd_priv.h" + +static struct dentry *debugfs_root; + +static int kfd_debugfs_open(struct inode *inode, struct file *file) +{ + int (*show)(struct seq_file *, void *) = inode->i_private; + + return single_open(file, show, NULL); +} + +static ssize_t kfd_debugfs_hang_hws_write(struct file *file, + const char __user *user_buf, size_t size, loff_t *ppos) +{ + struct kfd_dev *dev; + char tmp[16]; + uint32_t gpu_id; + int ret = -EINVAL; + + memset(tmp, 0, 16); + if (size >= 16) { + pr_err("Invalid input for gpu id.\n"); + goto out; + } + if (copy_from_user(tmp, user_buf, size)) { + ret = -EFAULT; + goto out; + } + if (kstrtoint(tmp, 10, &gpu_id)) { + pr_err("Invalid input for gpu id.\n"); + goto out; + } + dev = kfd_device_by_id(gpu_id); + if (dev) { + kfd_debugfs_hang_hws(dev); + ret = size; + } else + pr_err("Cannot find device %d.\n", gpu_id); + +out: + return ret; +} + +static const struct file_operations kfd_debugfs_fops = { + .owner = THIS_MODULE, + .open = kfd_debugfs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations kfd_debugfs_hang_hws_fops = { + .owner = THIS_MODULE, + .open = kfd_debugfs_open, + .read = seq_read, + .write = kfd_debugfs_hang_hws_write, + .llseek = seq_lseek, + .release = single_release, +}; + +void kfd_debugfs_init(void) +{ + struct dentry *ent; + + debugfs_root = debugfs_create_dir("kfd", NULL); + if (!debugfs_root || debugfs_root == ERR_PTR(-ENODEV)) { + pr_warn("Failed to create kfd debugfs dir\n"); + return; + } + + ent = debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root, + kfd_debugfs_mqds_by_process, + &kfd_debugfs_fops); + if (!ent) + pr_warn("Failed to create mqds in kfd debugfs\n"); + + ent = debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root, + kfd_debugfs_hqds_by_device, + &kfd_debugfs_fops); + if (!ent) + pr_warn("Failed to create hqds in kfd debugfs\n"); + + ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root, + kfd_debugfs_rls_by_device, + &kfd_debugfs_fops); + + ent = debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root, + NULL, + &kfd_debugfs_hang_hws_fops); + + if (!ent) + pr_warn("Failed to create rls in kfd debugfs\n"); +} + +void kfd_debugfs_fini(void) +{ + debugfs_remove_recursive(debugfs_root); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c new file mode 100644 index 000000000..28022d1cb --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -0,0 +1,976 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/bsearch.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" +#include "kfd_pm4_headers_vi.h" +#include "cwsr_trap_handler.h" +#include "kfd_iommu.h" + +#define MQD_SIZE_ALIGNED 768 + +/* + * kfd_locked is used to lock the kfd driver during suspend or reset + * once locked, kfd driver will stop any further GPU execution. + * create process (open) will return -EAGAIN. + */ +static atomic_t kfd_locked = ATOMIC_INIT(0); + +#ifdef KFD_SUPPORT_IOMMU_V2 +static const struct kfd_device_info kaveri_device_info = { + .asic_family = CHIP_KAVERI, + .max_pasid_bits = 16, + /* max num of queues for KV.TODO should be a dynamic value */ + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = false, + .needs_iommu_device = true, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info carrizo_device_info = { + .asic_family = CHIP_CARRIZO, + .max_pasid_bits = 16, + /* max num of queues for CZ.TODO should be a dynamic value */ + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = true, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info raven_device_info = { + .asic_family = CHIP_RAVEN, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = true, + .needs_pci_atomics = true, + .num_sdma_engines = 1, +}; +#endif + +static const struct kfd_device_info hawaii_device_info = { + .asic_family = CHIP_HAWAII, + .max_pasid_bits = 16, + /* max num of queues for KV.TODO should be a dynamic value */ + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = false, + .needs_iommu_device = false, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info tonga_device_info = { + .asic_family = CHIP_TONGA, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = false, + .needs_iommu_device = false, + .needs_pci_atomics = true, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info tonga_vf_device_info = { + .asic_family = CHIP_TONGA, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = 
MQD_SIZE_ALIGNED, + .supports_cwsr = false, + .needs_iommu_device = false, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info fiji_device_info = { + .asic_family = CHIP_FIJI, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = true, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info fiji_vf_device_info = { + .asic_family = CHIP_FIJI, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + + +static const struct kfd_device_info polaris10_device_info = { + .asic_family = CHIP_POLARIS10, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = true, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info polaris10_vf_device_info = { + .asic_family = CHIP_POLARIS10, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info polaris11_device_info = { + .asic_family = CHIP_POLARIS11, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = true, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info vega10_device_info = { + .asic_family = CHIP_VEGA10, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info vega10_vf_device_info = { + .asic_family = CHIP_VEGA10, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + + +struct kfd_deviceid { + unsigned short did; + const struct kfd_device_info *device_info; +}; + +static const struct kfd_deviceid supported_devices[] = { +#ifdef KFD_SUPPORT_IOMMU_V2 + { 0x1304, &kaveri_device_info }, /* Kaveri */ + { 0x1305, &kaveri_device_info }, /* Kaveri */ + { 
0x1306, &kaveri_device_info }, /* Kaveri */ + { 0x1307, &kaveri_device_info }, /* Kaveri */ + { 0x1309, &kaveri_device_info }, /* Kaveri */ + { 0x130A, &kaveri_device_info }, /* Kaveri */ + { 0x130B, &kaveri_device_info }, /* Kaveri */ + { 0x130C, &kaveri_device_info }, /* Kaveri */ + { 0x130D, &kaveri_device_info }, /* Kaveri */ + { 0x130E, &kaveri_device_info }, /* Kaveri */ + { 0x130F, &kaveri_device_info }, /* Kaveri */ + { 0x1310, &kaveri_device_info }, /* Kaveri */ + { 0x1311, &kaveri_device_info }, /* Kaveri */ + { 0x1312, &kaveri_device_info }, /* Kaveri */ + { 0x1313, &kaveri_device_info }, /* Kaveri */ + { 0x1315, &kaveri_device_info }, /* Kaveri */ + { 0x1316, &kaveri_device_info }, /* Kaveri */ + { 0x1317, &kaveri_device_info }, /* Kaveri */ + { 0x1318, &kaveri_device_info }, /* Kaveri */ + { 0x131B, &kaveri_device_info }, /* Kaveri */ + { 0x131C, &kaveri_device_info }, /* Kaveri */ + { 0x131D, &kaveri_device_info }, /* Kaveri */ + { 0x9870, &carrizo_device_info }, /* Carrizo */ + { 0x9874, &carrizo_device_info }, /* Carrizo */ + { 0x9875, &carrizo_device_info }, /* Carrizo */ + { 0x9876, &carrizo_device_info }, /* Carrizo */ + { 0x9877, &carrizo_device_info }, /* Carrizo */ + { 0x15DD, &raven_device_info }, /* Raven */ +#endif + { 0x67A0, &hawaii_device_info }, /* Hawaii */ + { 0x67A1, &hawaii_device_info }, /* Hawaii */ + { 0x67A2, &hawaii_device_info }, /* Hawaii */ + { 0x67A8, &hawaii_device_info }, /* Hawaii */ + { 0x67A9, &hawaii_device_info }, /* Hawaii */ + { 0x67AA, &hawaii_device_info }, /* Hawaii */ + { 0x67B0, &hawaii_device_info }, /* Hawaii */ + { 0x67B1, &hawaii_device_info }, /* Hawaii */ + { 0x67B8, &hawaii_device_info }, /* Hawaii */ + { 0x67B9, &hawaii_device_info }, /* Hawaii */ + { 0x67BA, &hawaii_device_info }, /* Hawaii */ + { 0x67BE, &hawaii_device_info }, /* Hawaii */ + { 0x6920, &tonga_device_info }, /* Tonga */ + { 0x6921, &tonga_device_info }, /* Tonga */ + { 0x6928, &tonga_device_info }, /* Tonga */ + { 0x6929, &tonga_device_info }, /* Tonga */ + { 0x692B, &tonga_device_info }, /* Tonga */ + { 0x692F, &tonga_vf_device_info }, /* Tonga vf */ + { 0x6938, &tonga_device_info }, /* Tonga */ + { 0x6939, &tonga_device_info }, /* Tonga */ + { 0x7300, &fiji_device_info }, /* Fiji */ + { 0x730F, &fiji_vf_device_info }, /* Fiji vf*/ + { 0x67C0, &polaris10_device_info }, /* Polaris10 */ + { 0x67C1, &polaris10_device_info }, /* Polaris10 */ + { 0x67C2, &polaris10_device_info }, /* Polaris10 */ + { 0x67C4, &polaris10_device_info }, /* Polaris10 */ + { 0x67C7, &polaris10_device_info }, /* Polaris10 */ + { 0x67C8, &polaris10_device_info }, /* Polaris10 */ + { 0x67C9, &polaris10_device_info }, /* Polaris10 */ + { 0x67CA, &polaris10_device_info }, /* Polaris10 */ + { 0x67CC, &polaris10_device_info }, /* Polaris10 */ + { 0x67CF, &polaris10_device_info }, /* Polaris10 */ + { 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/ + { 0x67DF, &polaris10_device_info }, /* Polaris10 */ + { 0x6FDF, &polaris10_device_info }, /* Polaris10 */ + { 0x67E0, &polaris11_device_info }, /* Polaris11 */ + { 0x67E1, &polaris11_device_info }, /* Polaris11 */ + { 0x67E3, &polaris11_device_info }, /* Polaris11 */ + { 0x67E7, &polaris11_device_info }, /* Polaris11 */ + { 0x67E8, &polaris11_device_info }, /* Polaris11 */ + { 0x67E9, &polaris11_device_info }, /* Polaris11 */ + { 0x67EB, &polaris11_device_info }, /* Polaris11 */ + { 0x67EF, &polaris11_device_info }, /* Polaris11 */ + { 0x67FF, &polaris11_device_info }, /* Polaris11 */ + { 0x6860, &vega10_device_info }, /* Vega10 */ + { 
0x6861, &vega10_device_info }, /* Vega10 */ + { 0x6862, &vega10_device_info }, /* Vega10 */ + { 0x6863, &vega10_device_info }, /* Vega10 */ + { 0x6864, &vega10_device_info }, /* Vega10 */ + { 0x6867, &vega10_device_info }, /* Vega10 */ + { 0x6868, &vega10_device_info }, /* Vega10 */ + { 0x6869, &vega10_device_info }, /* Vega10 */ + { 0x686A, &vega10_device_info }, /* Vega10 */ + { 0x686B, &vega10_device_info }, /* Vega10 */ + { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/ + { 0x686D, &vega10_device_info }, /* Vega10 */ + { 0x686E, &vega10_device_info }, /* Vega10 */ + { 0x686F, &vega10_device_info }, /* Vega10 */ + { 0x687F, &vega10_device_info }, /* Vega10 */ +}; + +static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, + unsigned int chunk_size); +static void kfd_gtt_sa_fini(struct kfd_dev *kfd); + +static int kfd_resume(struct kfd_dev *kfd); + +static const struct kfd_device_info *lookup_device_info(unsigned short did) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(supported_devices); i++) { + if (supported_devices[i].did == did) { + WARN_ON(!supported_devices[i].device_info); + return supported_devices[i].device_info; + } + } + + dev_warn(kfd_device, "DID %04x is missing in supported_devices\n", + did); + + return NULL; +} + +struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, + struct pci_dev *pdev, const struct kfd2kgd_calls *f2g) +{ + struct kfd_dev *kfd; + int ret; + const struct kfd_device_info *device_info = + lookup_device_info(pdev->device); + + if (!device_info) { + dev_err(kfd_device, "kgd2kfd_probe failed\n"); + return NULL; + } + + /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps. + * 32 and 64-bit requests are possible and must be + * supported. + */ + ret = pci_enable_atomic_ops_to_root(pdev, + PCI_EXP_DEVCAP2_ATOMIC_COMP32 | + PCI_EXP_DEVCAP2_ATOMIC_COMP64); + if (device_info->needs_pci_atomics && ret < 0) { + dev_info(kfd_device, + "skipped device %x:%x, PCI rejects atomics\n", + pdev->vendor, pdev->device); + return NULL; + } + + kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); + if (!kfd) + return NULL; + + kfd->kgd = kgd; + kfd->device_info = device_info; + kfd->pdev = pdev; + kfd->init_complete = false; + kfd->kfd2kgd = f2g; + + mutex_init(&kfd->doorbell_mutex); + memset(&kfd->doorbell_available_index, 0, + sizeof(kfd->doorbell_available_index)); + + return kfd; +} + +static void kfd_cwsr_init(struct kfd_dev *kfd) +{ + if (cwsr_enable && kfd->device_info->supports_cwsr) { + if (kfd->device_info->asic_family < CHIP_VEGA10) { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); + kfd->cwsr_isa = cwsr_trap_gfx8_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); + } else { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE); + kfd->cwsr_isa = cwsr_trap_gfx9_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex); + } + + kfd->cwsr_enabled = true; + } +} + +bool kgd2kfd_device_init(struct kfd_dev *kfd, + const struct kgd2kfd_shared_resources *gpu_resources) +{ + unsigned int size; + + kfd->shared_resources = *gpu_resources; + + kfd->vm_info.first_vmid_kfd = ffs(gpu_resources->compute_vmid_bitmap)-1; + kfd->vm_info.last_vmid_kfd = fls(gpu_resources->compute_vmid_bitmap)-1; + kfd->vm_info.vmid_num_kfd = kfd->vm_info.last_vmid_kfd + - kfd->vm_info.first_vmid_kfd + 1; + + /* Verify module parameters regarding mapped process number*/ + if ((hws_max_conc_proc < 0) + || (hws_max_conc_proc > kfd->vm_info.vmid_num_kfd)) { + dev_err(kfd_device, + "hws_max_conc_proc %d must be between 0 and %d, use %d instead\n", + hws_max_conc_proc, 
kfd->vm_info.vmid_num_kfd, + kfd->vm_info.vmid_num_kfd); + kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd; + } else + kfd->max_proc_per_quantum = hws_max_conc_proc; + + /* calculate max size of mqds needed for queues */ + size = max_num_of_queues_per_device * + kfd->device_info->mqd_size_aligned; + + /* + * calculate max size of runlist packet. + * There can be only 2 packets at once + */ + size += (KFD_MAX_NUM_OF_PROCESSES * sizeof(struct pm4_mes_map_process) + + max_num_of_queues_per_device * sizeof(struct pm4_mes_map_queues) + + sizeof(struct pm4_mes_runlist)) * 2; + + /* Add size of HIQ & DIQ */ + size += KFD_KERNEL_QUEUE_SIZE * 2; + + /* add another 512KB for all other allocations on gart (HPD, fences) */ + size += 512 * 1024; + + if (kfd->kfd2kgd->init_gtt_mem_allocation( + kfd->kgd, size, &kfd->gtt_mem, + &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr, + false)) { + dev_err(kfd_device, "Could not allocate %d bytes\n", size); + goto out; + } + + dev_info(kfd_device, "Allocated %d bytes on gart\n", size); + + /* Initialize GTT sa with 512 byte chunk size */ + if (kfd_gtt_sa_init(kfd, size, 512) != 0) { + dev_err(kfd_device, "Error initializing gtt sub-allocator\n"); + goto kfd_gtt_sa_init_error; + } + + if (kfd_doorbell_init(kfd)) { + dev_err(kfd_device, + "Error initializing doorbell aperture\n"); + goto kfd_doorbell_error; + } + + if (kfd_topology_add_device(kfd)) { + dev_err(kfd_device, "Error adding device to topology\n"); + goto kfd_topology_add_device_error; + } + + if (kfd_interrupt_init(kfd)) { + dev_err(kfd_device, "Error initializing interrupts\n"); + goto kfd_interrupt_error; + } + + kfd->dqm = device_queue_manager_init(kfd); + if (!kfd->dqm) { + dev_err(kfd_device, "Error initializing queue manager\n"); + goto device_queue_manager_error; + } + + if (kfd_iommu_device_init(kfd)) { + dev_err(kfd_device, "Error initializing iommuv2\n"); + goto device_iommu_error; + } + + kfd_cwsr_init(kfd); + + if (kfd_resume(kfd)) + goto kfd_resume_error; + + kfd->dbgmgr = NULL; + + kfd->init_complete = true; + dev_info(kfd_device, "added device %x:%x\n", kfd->pdev->vendor, + kfd->pdev->device); + + pr_debug("Starting kfd with the following scheduling policy %d\n", + kfd->dqm->sched_policy); + + goto out; + +kfd_resume_error: +device_iommu_error: + device_queue_manager_uninit(kfd->dqm); +device_queue_manager_error: + kfd_interrupt_exit(kfd); +kfd_interrupt_error: + kfd_topology_remove_device(kfd); +kfd_topology_add_device_error: + kfd_doorbell_fini(kfd); +kfd_doorbell_error: + kfd_gtt_sa_fini(kfd); +kfd_gtt_sa_init_error: + kfd->kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem); + dev_err(kfd_device, + "device %x:%x NOT added due to errors\n", + kfd->pdev->vendor, kfd->pdev->device); +out: + return kfd->init_complete; +} + +void kgd2kfd_device_exit(struct kfd_dev *kfd) +{ + if (kfd->init_complete) { + kgd2kfd_suspend(kfd); + device_queue_manager_uninit(kfd->dqm); + kfd_interrupt_exit(kfd); + kfd_topology_remove_device(kfd); + kfd_doorbell_fini(kfd); + kfd_gtt_sa_fini(kfd); + kfd->kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem); + } + + kfree(kfd); +} + +int kgd2kfd_pre_reset(struct kfd_dev *kfd) +{ + if (!kfd->init_complete) + return 0; + kgd2kfd_suspend(kfd); + + /* hold dqm->lock to prevent further execution*/ + dqm_lock(kfd->dqm); + + kfd_signal_reset_event(kfd); + return 0; +} + +/* + * Fix me. KFD won't be able to resume existing process for now. + * We will keep all existing process in a evicted state and + * wait the process to be terminated. 
+ */ + +int kgd2kfd_post_reset(struct kfd_dev *kfd) +{ + int ret, count; + + if (!kfd->init_complete) + return 0; + + dqm_unlock(kfd->dqm); + + ret = kfd_resume(kfd); + if (ret) + return ret; + count = atomic_dec_return(&kfd_locked); + WARN_ONCE(count != 0, "KFD reset ref. error"); + return 0; +} + +bool kfd_is_locked(void) +{ + return (atomic_read(&kfd_locked) > 0); +} + +void kgd2kfd_suspend(struct kfd_dev *kfd) +{ + if (!kfd->init_complete) + return; + + /* For first KFD device suspend all the KFD processes */ + if (atomic_inc_return(&kfd_locked) == 1) + kfd_suspend_all_processes(); + + kfd->dqm->ops.stop(kfd->dqm); + + kfd_iommu_suspend(kfd); +} + +int kgd2kfd_resume(struct kfd_dev *kfd) +{ + int ret, count; + + if (!kfd->init_complete) + return 0; + + ret = kfd_resume(kfd); + if (ret) + return ret; + + count = atomic_dec_return(&kfd_locked); + WARN_ONCE(count < 0, "KFD suspend / resume ref. error"); + if (count == 0) + ret = kfd_resume_all_processes(); + + return ret; +} + +static int kfd_resume(struct kfd_dev *kfd) +{ + int err = 0; + + err = kfd_iommu_resume(kfd); + if (err) { + dev_err(kfd_device, + "Failed to resume IOMMU for device %x:%x\n", + kfd->pdev->vendor, kfd->pdev->device); + return err; + } + + err = kfd->dqm->ops.start(kfd->dqm); + if (err) { + dev_err(kfd_device, + "Error starting queue manager for device %x:%x\n", + kfd->pdev->vendor, kfd->pdev->device); + goto dqm_start_error; + } + + return err; + +dqm_start_error: + kfd_iommu_suspend(kfd); + return err; +} + +/* This is called directly from KGD at ISR. */ +void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) +{ + uint32_t patched_ihre[KFD_MAX_RING_ENTRY_SIZE]; + bool is_patched = false; + unsigned long flags; + + if (!kfd->init_complete) + return; + + if (kfd->device_info->ih_ring_entry_size > sizeof(patched_ihre)) { + dev_err_once(kfd_device, "Ring entry too small\n"); + return; + } + + spin_lock_irqsave(&kfd->interrupt_lock, flags); + + if (kfd->interrupts_active + && interrupt_is_wanted(kfd, ih_ring_entry, + patched_ihre, &is_patched) + && enqueue_ih_ring_entry(kfd, + is_patched ? patched_ihre : ih_ring_entry)) + queue_work(kfd->ih_wq, &kfd->interrupt_work); + + spin_unlock_irqrestore(&kfd->interrupt_lock, flags); +} + +int kgd2kfd_quiesce_mm(struct mm_struct *mm) +{ + struct kfd_process *p; + int r; + + /* Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are + * running so the lookup function increments the process ref count. + */ + p = kfd_lookup_process_by_mm(mm); + if (!p) + return -ESRCH; + + r = kfd_process_evict_queues(p); + + kfd_unref_process(p); + return r; +} + +int kgd2kfd_resume_mm(struct mm_struct *mm) +{ + struct kfd_process *p; + int r; + + /* Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are + * running so the lookup function increments the process ref count. + */ + p = kfd_lookup_process_by_mm(mm); + if (!p) + return -ESRCH; + + r = kfd_process_restore_queues(p); + + kfd_unref_process(p); + return r; +} + +/** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will + * prepare for safe eviction of KFD BOs that belong to the specified + * process. 
+ * + * @mm: mm_struct that identifies the specified KFD process + * @fence: eviction fence attached to KFD process BOs + * + */ +int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, + struct dma_fence *fence) +{ + struct kfd_process *p; + unsigned long active_time; + unsigned long delay_jiffies = msecs_to_jiffies(PROCESS_ACTIVE_TIME_MS); + + if (!fence) + return -EINVAL; + + if (dma_fence_is_signaled(fence)) + return 0; + + p = kfd_lookup_process_by_mm(mm); + if (!p) + return -ENODEV; + + if (fence->seqno == p->last_eviction_seqno) + goto out; + + p->last_eviction_seqno = fence->seqno; + + /* Avoid KFD process starvation. Wait for at least + * PROCESS_ACTIVE_TIME_MS before evicting the process again + */ + active_time = get_jiffies_64() - p->last_restore_timestamp; + if (delay_jiffies > active_time) + delay_jiffies -= active_time; + else + delay_jiffies = 0; + + /* During process initialization eviction_work.dwork is initialized + * to kfd_evict_bo_worker + */ + schedule_delayed_work(&p->eviction_work, delay_jiffies); +out: + kfd_unref_process(p); + return 0; +} + +static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, + unsigned int chunk_size) +{ + unsigned int num_of_longs; + + if (WARN_ON(buf_size < chunk_size)) + return -EINVAL; + if (WARN_ON(buf_size == 0)) + return -EINVAL; + if (WARN_ON(chunk_size == 0)) + return -EINVAL; + + kfd->gtt_sa_chunk_size = chunk_size; + kfd->gtt_sa_num_of_chunks = buf_size / chunk_size; + + num_of_longs = (kfd->gtt_sa_num_of_chunks + BITS_PER_LONG - 1) / + BITS_PER_LONG; + + kfd->gtt_sa_bitmap = kcalloc(num_of_longs, sizeof(long), GFP_KERNEL); + + if (!kfd->gtt_sa_bitmap) + return -ENOMEM; + + pr_debug("gtt_sa_num_of_chunks = %d, gtt_sa_bitmap = %p\n", + kfd->gtt_sa_num_of_chunks, kfd->gtt_sa_bitmap); + + mutex_init(&kfd->gtt_sa_lock); + + return 0; + +} + +static void kfd_gtt_sa_fini(struct kfd_dev *kfd) +{ + mutex_destroy(&kfd->gtt_sa_lock); + kfree(kfd->gtt_sa_bitmap); +} + +static inline uint64_t kfd_gtt_sa_calc_gpu_addr(uint64_t start_addr, + unsigned int bit_num, + unsigned int chunk_size) +{ + return start_addr + bit_num * chunk_size; +} + +static inline uint32_t *kfd_gtt_sa_calc_cpu_addr(void *start_addr, + unsigned int bit_num, + unsigned int chunk_size) +{ + return (uint32_t *) ((uint64_t) start_addr + bit_num * chunk_size); +} + +int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, + struct kfd_mem_obj **mem_obj) +{ + unsigned int found, start_search, cur_size; + + if (size == 0) + return -EINVAL; + + if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) + return -ENOMEM; + + *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + if (!(*mem_obj)) + return -ENOMEM; + + pr_debug("Allocated mem_obj = %p for size = %d\n", *mem_obj, size); + + start_search = 0; + + mutex_lock(&kfd->gtt_sa_lock); + +kfd_gtt_restart_search: + /* Find the first chunk that is free */ + found = find_next_zero_bit(kfd->gtt_sa_bitmap, + kfd->gtt_sa_num_of_chunks, + start_search); + + pr_debug("Found = %d\n", found); + + /* If there wasn't any free chunk, bail out */ + if (found == kfd->gtt_sa_num_of_chunks) + goto kfd_gtt_no_free_chunk; + + /* Update fields of mem_obj */ + (*mem_obj)->range_start = found; + (*mem_obj)->range_end = found; + (*mem_obj)->gpu_addr = kfd_gtt_sa_calc_gpu_addr( + kfd->gtt_start_gpu_addr, + found, + kfd->gtt_sa_chunk_size); + (*mem_obj)->cpu_ptr = kfd_gtt_sa_calc_cpu_addr( + kfd->gtt_start_cpu_ptr, + found, + kfd->gtt_sa_chunk_size); + + pr_debug("gpu_addr = %p, cpu_addr = %p\n", + 
(uint64_t *) (*mem_obj)->gpu_addr, (*mem_obj)->cpu_ptr); + + /* If we need only one chunk, mark it as allocated and get out */ + if (size <= kfd->gtt_sa_chunk_size) { + pr_debug("Single bit\n"); + set_bit(found, kfd->gtt_sa_bitmap); + goto kfd_gtt_out; + } + + /* Otherwise, try to see if we have enough contiguous chunks */ + cur_size = size - kfd->gtt_sa_chunk_size; + do { + (*mem_obj)->range_end = + find_next_zero_bit(kfd->gtt_sa_bitmap, + kfd->gtt_sa_num_of_chunks, ++found); + /* + * If next free chunk is not contiguous than we need to + * restart our search from the last free chunk we found (which + * wasn't contiguous to the previous ones + */ + if ((*mem_obj)->range_end != found) { + start_search = found; + goto kfd_gtt_restart_search; + } + + /* + * If we reached end of buffer, bail out with error + */ + if (found == kfd->gtt_sa_num_of_chunks) + goto kfd_gtt_no_free_chunk; + + /* Check if we don't need another chunk */ + if (cur_size <= kfd->gtt_sa_chunk_size) + cur_size = 0; + else + cur_size -= kfd->gtt_sa_chunk_size; + + } while (cur_size > 0); + + pr_debug("range_start = %d, range_end = %d\n", + (*mem_obj)->range_start, (*mem_obj)->range_end); + + /* Mark the chunks as allocated */ + for (found = (*mem_obj)->range_start; + found <= (*mem_obj)->range_end; + found++) + set_bit(found, kfd->gtt_sa_bitmap); + +kfd_gtt_out: + mutex_unlock(&kfd->gtt_sa_lock); + return 0; + +kfd_gtt_no_free_chunk: + pr_debug("Allocation failed with mem_obj = %p\n", *mem_obj); + mutex_unlock(&kfd->gtt_sa_lock); + kfree(*mem_obj); + return -ENOMEM; +} + +int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj) +{ + unsigned int bit; + + /* Act like kfree when trying to free a NULL object */ + if (!mem_obj) + return 0; + + pr_debug("Free mem_obj = %p, range_start = %d, range_end = %d\n", + mem_obj, mem_obj->range_start, mem_obj->range_end); + + mutex_lock(&kfd->gtt_sa_lock); + + /* Mark the chunks as free */ + for (bit = mem_obj->range_start; + bit <= mem_obj->range_end; + bit++) + clear_bit(bit, kfd->gtt_sa_bitmap); + + mutex_unlock(&kfd->gtt_sa_lock); + + kfree(mem_obj); + return 0; +} + +#if defined(CONFIG_DEBUG_FS) + +/* This function will send a package to HIQ to hang the HWS + * which will trigger a GPU reset and bring the HWS back to normal state + */ +int kfd_debugfs_hang_hws(struct kfd_dev *dev) +{ + int r = 0; + + if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) { + pr_err("HWS is not enabled"); + return -EINVAL; + } + + r = pm_debugfs_hang_hws(&dev->dqm->packets); + if (!r) + r = dqm_debugfs_execute_queues(dev->dqm); + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c new file mode 100644 index 000000000..bff39f561 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -0,0 +1,1864 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/ratelimit.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/types.h> +#include <linux/bitops.h> +#include <linux/sched.h> +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" +#include "kfd_mqd_manager.h" +#include "cik_regs.h" +#include "kfd_kernel_queue.h" + +/* Size of the per-pipe EOP queue */ +#define CIK_HPD_EOP_BYTES_LOG2 11 +#define CIK_HPD_EOP_BYTES (1U << CIK_HPD_EOP_BYTES_LOG2) + +static int set_pasid_vmid_mapping(struct device_queue_manager *dqm, + unsigned int pasid, unsigned int vmid); + +static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); + +static int execute_queues_cpsch(struct device_queue_manager *dqm, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param); +static int unmap_queues_cpsch(struct device_queue_manager *dqm, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param); + +static int map_queues_cpsch(struct device_queue_manager *dqm); + +static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); + +static void deallocate_sdma_queue(struct device_queue_manager *dqm, + unsigned int sdma_queue_id); + +static void kfd_process_hw_exception(struct work_struct *work); + +static inline +enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) +{ + if (type == KFD_QUEUE_TYPE_SDMA) + return KFD_MQD_TYPE_SDMA; + return KFD_MQD_TYPE_CP; +} + +static bool is_pipe_enabled(struct device_queue_manager *dqm, int mec, int pipe) +{ + int i; + int pipe_offset = mec * dqm->dev->shared_resources.num_pipe_per_mec + + pipe * dqm->dev->shared_resources.num_queue_per_pipe; + + /* queue is available for KFD usage if bit is 1 */ + for (i = 0; i < dqm->dev->shared_resources.num_queue_per_pipe; ++i) + if (test_bit(pipe_offset + i, + dqm->dev->shared_resources.queue_bitmap)) + return true; + return false; +} + +unsigned int get_queues_num(struct device_queue_manager *dqm) +{ + return bitmap_weight(dqm->dev->shared_resources.queue_bitmap, + KGD_MAX_QUEUES); +} + +unsigned int get_queues_per_pipe(struct device_queue_manager *dqm) +{ + return dqm->dev->shared_resources.num_queue_per_pipe; +} + +unsigned int get_pipes_per_mec(struct device_queue_manager *dqm) +{ + return dqm->dev->shared_resources.num_pipe_per_mec; +} + +static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm) +{ + return dqm->dev->device_info->num_sdma_engines; +} + +unsigned int get_num_sdma_queues(struct device_queue_manager *dqm) +{ + return dqm->dev->device_info->num_sdma_engines + * KFD_SDMA_QUEUES_PER_ENGINE; +} + +void program_sh_mem_settings(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + return dqm->dev->kfd2kgd->program_sh_mem_settings( + dqm->dev->kgd, qpd->vmid, + qpd->sh_mem_config, + qpd->sh_mem_ape1_base, + qpd->sh_mem_ape1_limit, + qpd->sh_mem_bases); +} + +static int allocate_doorbell(struct qcm_process_device *qpd, struct queue 
*q) +{ + struct kfd_dev *dev = qpd->dqm->dev; + + if (!KFD_IS_SOC15(dev->device_info->asic_family)) { + /* On pre-SOC15 chips we need to use the queue ID to + * preserve the user mode ABI. + */ + q->doorbell_id = q->properties.queue_id; + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + /* For SDMA queues on SOC15, use static doorbell + * assignments based on the engine and queue. + */ + q->doorbell_id = dev->shared_resources.sdma_doorbell + [q->properties.sdma_engine_id] + [q->properties.sdma_queue_id]; + } else { + /* For CP queues on SOC15 reserve a free doorbell ID */ + unsigned int found; + + found = find_first_zero_bit(qpd->doorbell_bitmap, + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); + if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { + pr_debug("No doorbells available"); + return -EBUSY; + } + set_bit(found, qpd->doorbell_bitmap); + q->doorbell_id = found; + } + + q->properties.doorbell_off = + kfd_doorbell_id_to_offset(dev, q->process, + q->doorbell_id); + + return 0; +} + +static void deallocate_doorbell(struct qcm_process_device *qpd, + struct queue *q) +{ + unsigned int old; + struct kfd_dev *dev = qpd->dqm->dev; + + if (!KFD_IS_SOC15(dev->device_info->asic_family) || + q->properties.type == KFD_QUEUE_TYPE_SDMA) + return; + + old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap); + WARN_ON(!old); +} + +static int allocate_vmid(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int bit, allocated_vmid; + + if (dqm->vmid_bitmap == 0) + return -ENOMEM; + + bit = ffs(dqm->vmid_bitmap) - 1; + dqm->vmid_bitmap &= ~(1 << bit); + + allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd; + pr_debug("vmid allocation %d\n", allocated_vmid); + qpd->vmid = allocated_vmid; + q->properties.vmid = allocated_vmid; + + set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid); + program_sh_mem_settings(dqm, qpd); + + /* qpd->page_table_base is set earlier when register_process() + * is called, i.e. when the first queue is created. 
+ */ + dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd, + qpd->vmid, + qpd->page_table_base); + /* invalidate the VM context after pasid and vmid mapping is set up */ + kfd_flush_tlb(qpd_to_pdd(qpd)); + + return 0; +} + +static int flush_texture_cache_nocpsch(struct kfd_dev *kdev, + struct qcm_process_device *qpd) +{ + const struct packet_manager_funcs *pmf = qpd->dqm->packets.pmf; + int ret; + + if (!qpd->ib_kaddr) + return -ENOMEM; + + ret = pmf->release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr); + if (ret) + return ret; + + return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid, + qpd->ib_base, (uint32_t *)qpd->ib_kaddr, + pmf->release_mem_size / sizeof(uint32_t)); +} + +static void deallocate_vmid(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd; + + /* On GFX v7, CP doesn't flush TC at dequeue */ + if (q->device->device_info->asic_family == CHIP_HAWAII) + if (flush_texture_cache_nocpsch(q->device, qpd)) + pr_err("Failed to flush TC\n"); + + kfd_flush_tlb(qpd_to_pdd(qpd)); + + /* Release the vmid mapping */ + set_pasid_vmid_mapping(dqm, 0, qpd->vmid); + + dqm->vmid_bitmap |= (1 << bit); + qpd->vmid = 0; + q->properties.vmid = 0; +} + +static int create_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd) +{ + int retval; + + print_queue(q); + + dqm_lock(dqm); + + if (dqm->total_queue_count >= max_num_of_queues_per_device) { + pr_warn("Can't create new usermode queue because %d queues were already created\n", + dqm->total_queue_count); + retval = -EPERM; + goto out_unlock; + } + + if (list_empty(&qpd->queues_list)) { + retval = allocate_vmid(dqm, qpd, q); + if (retval) + goto out_unlock; + } + q->properties.vmid = qpd->vmid; + /* + * Eviction state logic: we only mark active queues as evicted + * to avoid the overhead of restoring inactive queues later + */ + if (qpd->evicted) + q->properties.is_evicted = (q->properties.queue_size > 0 && + q->properties.queue_percent > 0 && + q->properties.queue_address != 0); + + q->properties.tba_addr = qpd->tba_addr; + q->properties.tma_addr = qpd->tma_addr; + + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) + retval = create_compute_queue_nocpsch(dqm, q, qpd); + else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + retval = create_sdma_queue_nocpsch(dqm, q, qpd); + else + retval = -EINVAL; + + if (retval) { + if (list_empty(&qpd->queues_list)) + deallocate_vmid(dqm, qpd, q); + goto out_unlock; + } + + list_add(&q->list, &qpd->queues_list); + qpd->queue_count++; + if (q->properties.is_active) + dqm->queue_count++; + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + dqm->sdma_queue_count++; + + /* + * Unconditionally increment this counter, regardless of the queue's + * type or whether the queue is active. 
+ */ + dqm->total_queue_count++; + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + +out_unlock: + dqm_unlock(dqm); + return retval; +} + +static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) +{ + bool set; + int pipe, bit, i; + + set = false; + + for (pipe = dqm->next_pipe_to_allocate, i = 0; + i < get_pipes_per_mec(dqm); + pipe = ((pipe + 1) % get_pipes_per_mec(dqm)), ++i) { + + if (!is_pipe_enabled(dqm, 0, pipe)) + continue; + + if (dqm->allocated_queues[pipe] != 0) { + bit = ffs(dqm->allocated_queues[pipe]) - 1; + dqm->allocated_queues[pipe] &= ~(1 << bit); + q->pipe = pipe; + q->queue = bit; + set = true; + break; + } + } + + if (!set) + return -EBUSY; + + pr_debug("hqd slot - pipe %d, queue %d\n", q->pipe, q->queue); + /* horizontal hqd allocation */ + dqm->next_pipe_to_allocate = (pipe + 1) % get_pipes_per_mec(dqm); + + return 0; +} + +static inline void deallocate_hqd(struct device_queue_manager *dqm, + struct queue *q) +{ + dqm->allocated_queues[q->pipe] |= (1 << q->queue); +} + +static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd) +{ + struct mqd_manager *mqd_mgr; + int retval; + + mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); + if (!mqd_mgr) + return -ENOMEM; + + retval = allocate_hqd(dqm, q); + if (retval) + return retval; + + retval = allocate_doorbell(qpd, q); + if (retval) + goto out_deallocate_hqd; + + retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval) + goto out_deallocate_doorbell; + + pr_debug("Loading mqd to hqd on pipe %d, queue %d\n", + q->pipe, q->queue); + + dqm->dev->kfd2kgd->set_scratch_backing_va( + dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid); + + if (!q->properties.is_active) + return 0; + + if (WARN(q->process->mm != current->mm, + "should only run in user thread")) + retval = -EFAULT; + else + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue, + &q->properties, current->mm); + if (retval) + goto out_uninit_mqd; + + return 0; + +out_uninit_mqd: + mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); +out_deallocate_doorbell: + deallocate_doorbell(qpd, q); +out_deallocate_hqd: + deallocate_hqd(dqm, q); + + return retval; +} + +/* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked + * to avoid asynchronized access + */ +static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int retval; + struct mqd_manager *mqd_mgr; + + mqd_mgr = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd_mgr) + return -ENOMEM; + + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) { + deallocate_hqd(dqm, q); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + dqm->sdma_queue_count--; + deallocate_sdma_queue(dqm, q->sdma_id); + } else { + pr_debug("q->properties.type %d is invalid\n", + q->properties.type); + return -EINVAL; + } + dqm->total_queue_count--; + + deallocate_doorbell(qpd, q); + + retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, + KFD_PREEMPT_TYPE_WAVEFRONT_RESET, + KFD_UNMAP_LATENCY_MS, + q->pipe, q->queue); + if (retval == -ETIME) + qpd->reset_wavefronts = true; + + mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + + list_del(&q->list); + if (list_empty(&qpd->queues_list)) { + if (qpd->reset_wavefronts) { + pr_warn("Resetting wave fronts (nocpsch) on dev %p\n", + dqm->dev); + /* 
dbgdev_wave_reset_wavefronts has to be called before + * deallocate_vmid(), i.e. when vmid is still in use. + */ + dbgdev_wave_reset_wavefronts(dqm->dev, + qpd->pqm->process); + qpd->reset_wavefronts = false; + } + + deallocate_vmid(dqm, qpd, q); + } + qpd->queue_count--; + if (q->properties.is_active) + dqm->queue_count--; + + return retval; +} + +static int destroy_queue_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int retval; + + dqm_lock(dqm); + retval = destroy_queue_nocpsch_locked(dqm, qpd, q); + dqm_unlock(dqm); + + return retval; +} + +static int update_queue(struct device_queue_manager *dqm, struct queue *q) +{ + int retval; + struct mqd_manager *mqd_mgr; + struct kfd_process_device *pdd; + bool prev_active = false; + + dqm_lock(dqm); + pdd = kfd_get_process_device_data(q->device, q->process); + if (!pdd) { + retval = -ENODEV; + goto out_unlock; + } + mqd_mgr = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd_mgr) { + retval = -ENOMEM; + goto out_unlock; + } + /* + * Eviction state logic: we only mark active queues as evicted + * to avoid the overhead of restoring inactive queues later + */ + if (pdd->qpd.evicted) + q->properties.is_evicted = (q->properties.queue_size > 0 && + q->properties.queue_percent > 0 && + q->properties.queue_address != 0); + + /* Save previous activity state for counters */ + prev_active = q->properties.is_active; + + /* Make sure the queue is unmapped before updating the MQD */ + if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) { + retval = unmap_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + if (retval) { + pr_err("unmap queue failed\n"); + goto out_unlock; + } + } else if (prev_active && + (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || + q->properties.type == KFD_QUEUE_TYPE_SDMA)) { + retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, + KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, + KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); + if (retval) { + pr_err("destroy mqd failed\n"); + goto out_unlock; + } + } + + retval = mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties); + + /* + * check active state vs. the previous state and modify + * counter accordingly. map_queues_cpsch uses the + * dqm->queue_count to determine whether a new runlist must be + * uploaded. 
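+	 * (As the code below shows, map_queues_cpsch() is a no-op when
+	 * there are no active queues or processes, or when a runlist is
+	 * already mapped.)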
+ */ + if (q->properties.is_active && !prev_active) + dqm->queue_count++; + else if (!q->properties.is_active && prev_active) + dqm->queue_count--; + + if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) + retval = map_queues_cpsch(dqm); + else if (q->properties.is_active && + (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || + q->properties.type == KFD_QUEUE_TYPE_SDMA)) { + if (WARN(q->process->mm != current->mm, + "should only run in user thread")) + retval = -EFAULT; + else + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, + q->pipe, q->queue, + &q->properties, current->mm); + } + +out_unlock: + dqm_unlock(dqm); + return retval; +} + +static struct mqd_manager *get_mqd_manager( + struct device_queue_manager *dqm, enum KFD_MQD_TYPE type) +{ + struct mqd_manager *mqd_mgr; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + pr_debug("mqd type %d\n", type); + + mqd_mgr = dqm->mqd_mgrs[type]; + if (!mqd_mgr) { + mqd_mgr = mqd_manager_init(type, dqm->dev); + if (!mqd_mgr) + pr_err("mqd manager is NULL"); + dqm->mqd_mgrs[type] = mqd_mgr; + } + + return mqd_mgr; +} + +static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct queue *q; + struct mqd_manager *mqd_mgr; + struct kfd_process_device *pdd; + int retval = 0; + + dqm_lock(dqm); + if (qpd->evicted++ > 0) /* already evicted, do nothing */ + goto out; + + pdd = qpd_to_pdd(qpd); + pr_info_ratelimited("Evicting PASID %u queues\n", + pdd->process->pasid); + + /* unactivate all active queues on the qpd */ + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_active) + continue; + mqd_mgr = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd_mgr) { /* should not be here */ + pr_err("Cannot evict queue, mqd mgr is NULL\n"); + retval = -ENOMEM; + goto out; + } + q->properties.is_evicted = true; + q->properties.is_active = false; + retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, + KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, + KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); + if (retval) + goto out; + dqm->queue_count--; + } + +out: + dqm_unlock(dqm); + return retval; +} + +static int evict_process_queues_cpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct queue *q; + struct kfd_process_device *pdd; + int retval = 0; + + dqm_lock(dqm); + if (qpd->evicted++ > 0) /* already evicted, do nothing */ + goto out; + + pdd = qpd_to_pdd(qpd); + pr_info_ratelimited("Evicting PASID %u queues\n", + pdd->process->pasid); + + /* unactivate all active queues on the qpd */ + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_active) + continue; + q->properties.is_evicted = true; + q->properties.is_active = false; + dqm->queue_count--; + } + retval = execute_queues_cpsch(dqm, + qpd->is_debug ? 
+ KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES : + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + +out: + dqm_unlock(dqm); + return retval; +} + +static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct mm_struct *mm = NULL; + struct queue *q; + struct mqd_manager *mqd_mgr; + struct kfd_process_device *pdd; + uint32_t pd_base; + int retval = 0; + + pdd = qpd_to_pdd(qpd); + /* Retrieve PD base */ + pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); + + dqm_lock(dqm); + if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */ + goto out; + if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */ + qpd->evicted--; + goto out; + } + + pr_info_ratelimited("Restoring PASID %u queues\n", + pdd->process->pasid); + + /* Update PD Base in QPD */ + qpd->page_table_base = pd_base; + pr_debug("Updated PD address to 0x%08x\n", pd_base); + + if (!list_empty(&qpd->queues_list)) { + dqm->dev->kfd2kgd->set_vm_context_page_table_base( + dqm->dev->kgd, + qpd->vmid, + qpd->page_table_base); + kfd_flush_tlb(pdd); + } + + /* Take a safe reference to the mm_struct, which may otherwise + * disappear even while the kfd_process is still referenced. + */ + mm = get_task_mm(pdd->process->lead_thread); + if (!mm) { + retval = -EFAULT; + goto out; + } + + /* activate all active queues on the qpd */ + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_evicted) + continue; + mqd_mgr = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd_mgr) { /* should not be here */ + pr_err("Cannot restore queue, mqd mgr is NULL\n"); + retval = -ENOMEM; + goto out; + } + q->properties.is_evicted = false; + q->properties.is_active = true; + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, + q->queue, &q->properties, mm); + if (retval) + goto out; + dqm->queue_count++; + } + qpd->evicted = 0; +out: + if (mm) + mmput(mm); + dqm_unlock(dqm); + return retval; +} + +static int restore_process_queues_cpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct queue *q; + struct kfd_process_device *pdd; + uint32_t pd_base; + int retval = 0; + + pdd = qpd_to_pdd(qpd); + /* Retrieve PD base */ + pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); + + dqm_lock(dqm); + if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */ + goto out; + if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */ + qpd->evicted--; + goto out; + } + + pr_info_ratelimited("Restoring PASID %u queues\n", + pdd->process->pasid); + + /* Update PD Base in QPD */ + qpd->page_table_base = pd_base; + pr_debug("Updated PD address to 0x%08x\n", pd_base); + + /* activate all active queues on the qpd */ + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_evicted) + continue; + q->properties.is_evicted = false; + q->properties.is_active = true; + dqm->queue_count++; + } + retval = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + if (!retval) + qpd->evicted = 0; +out: + dqm_unlock(dqm); + return retval; +} + +static int register_process(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct device_process_node *n; + struct kfd_process_device *pdd; + uint32_t pd_base; + int retval; + + n = kzalloc(sizeof(*n), GFP_KERNEL); + if (!n) + return -ENOMEM; + + n->qpd = qpd; + + pdd = qpd_to_pdd(qpd); + /* Retrieve PD base */ + pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); + + dqm_lock(dqm); + 
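+	/*
+	 * With the DQM lock held: track the new process on dqm->queues,
+	 * cache its page directory base in the QPD, and let the ASIC-
+	 * specific update_qpd() callback refresh the per-process settings
+	 * (e.g. the SH_MEM configuration).  The first registered process
+	 * also takes the GPU out of compute-idle via set_compute_idle().
+	 */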
list_add(&n->list, &dqm->queues); + + /* Update PD Base in QPD */ + qpd->page_table_base = pd_base; + + retval = dqm->asic_ops.update_qpd(dqm, qpd); + + if (dqm->processes_count++ == 0) + dqm->dev->kfd2kgd->set_compute_idle(dqm->dev->kgd, false); + + dqm_unlock(dqm); + + return retval; +} + +static int unregister_process(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + int retval; + struct device_process_node *cur, *next; + + pr_debug("qpd->queues_list is %s\n", + list_empty(&qpd->queues_list) ? "empty" : "not empty"); + + retval = 0; + dqm_lock(dqm); + + list_for_each_entry_safe(cur, next, &dqm->queues, list) { + if (qpd == cur->qpd) { + list_del(&cur->list); + kfree(cur); + if (--dqm->processes_count == 0) + dqm->dev->kfd2kgd->set_compute_idle( + dqm->dev->kgd, true); + goto out; + } + } + /* qpd not found in dqm list */ + retval = 1; +out: + dqm_unlock(dqm); + return retval; +} + +static int +set_pasid_vmid_mapping(struct device_queue_manager *dqm, unsigned int pasid, + unsigned int vmid) +{ + uint32_t pasid_mapping; + + pasid_mapping = (pasid == 0) ? 0 : + (uint32_t)pasid | + ATC_VMID_PASID_MAPPING_VALID; + + return dqm->dev->kfd2kgd->set_pasid_vmid_mapping( + dqm->dev->kgd, pasid_mapping, + vmid); +} + +static void init_interrupts(struct device_queue_manager *dqm) +{ + unsigned int i; + + for (i = 0 ; i < get_pipes_per_mec(dqm) ; i++) + if (is_pipe_enabled(dqm, 0, i)) + dqm->dev->kfd2kgd->init_interrupts(dqm->dev->kgd, i); +} + +static int initialize_nocpsch(struct device_queue_manager *dqm) +{ + int pipe, queue; + + pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm)); + + dqm->allocated_queues = kcalloc(get_pipes_per_mec(dqm), + sizeof(unsigned int), GFP_KERNEL); + if (!dqm->allocated_queues) + return -ENOMEM; + + mutex_init(&dqm->lock_hidden); + INIT_LIST_HEAD(&dqm->queues); + dqm->queue_count = dqm->next_pipe_to_allocate = 0; + dqm->sdma_queue_count = 0; + + for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { + int pipe_offset = pipe * get_queues_per_pipe(dqm); + + for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) + if (test_bit(pipe_offset + queue, + dqm->dev->shared_resources.queue_bitmap)) + dqm->allocated_queues[pipe] |= 1 << queue; + } + + dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1; + dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1; + + return 0; +} + +static void uninitialize(struct device_queue_manager *dqm) +{ + int i; + + WARN_ON(dqm->queue_count > 0 || dqm->processes_count > 0); + + kfree(dqm->allocated_queues); + for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++) + kfree(dqm->mqd_mgrs[i]); + mutex_destroy(&dqm->lock_hidden); + kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem); +} + +static int start_nocpsch(struct device_queue_manager *dqm) +{ + init_interrupts(dqm); + return pm_init(&dqm->packets, dqm); +} + +static int stop_nocpsch(struct device_queue_manager *dqm) +{ + pm_uninit(&dqm->packets); + return 0; +} + +static int allocate_sdma_queue(struct device_queue_manager *dqm, + unsigned int *sdma_queue_id) +{ + int bit; + + if (dqm->sdma_bitmap == 0) + return -ENOMEM; + + bit = ffs(dqm->sdma_bitmap) - 1; + dqm->sdma_bitmap &= ~(1 << bit); + *sdma_queue_id = bit; + + return 0; +} + +static void deallocate_sdma_queue(struct device_queue_manager *dqm, + unsigned int sdma_queue_id) +{ + if (sdma_queue_id >= get_num_sdma_queues(dqm)) + return; + dqm->sdma_bitmap |= (1 << sdma_queue_id); +} + +static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd) +{ + 
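+	/*
+	 * The sdma_id allocated below is split across the SDMA engines:
+	 * sdma_queue_id = sdma_id / num_engines and
+	 * sdma_engine_id = sdma_id % num_engines.  As an illustration,
+	 * with two engines and two queues per engine, sdma_id 0..3 maps
+	 * to (engine, queue) = (0, 0), (1, 0), (0, 1), (1, 1).
+	 */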
struct mqd_manager *mqd_mgr; + int retval; + + mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA); + if (!mqd_mgr) + return -ENOMEM; + + retval = allocate_sdma_queue(dqm, &q->sdma_id); + if (retval) + return retval; + + q->properties.sdma_queue_id = q->sdma_id / get_num_sdma_engines(dqm); + q->properties.sdma_engine_id = q->sdma_id % get_num_sdma_engines(dqm); + + retval = allocate_doorbell(qpd, q); + if (retval) + goto out_deallocate_sdma_queue; + + pr_debug("SDMA id is: %d\n", q->sdma_id); + pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); + pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); + + dqm->asic_ops.init_sdma_vm(dqm, q, qpd); + retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval) + goto out_deallocate_doorbell; + + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties, + NULL); + if (retval) + goto out_uninit_mqd; + + return 0; + +out_uninit_mqd: + mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); +out_deallocate_doorbell: + deallocate_doorbell(qpd, q); +out_deallocate_sdma_queue: + deallocate_sdma_queue(dqm, q->sdma_id); + + return retval; +} + +/* + * Device Queue Manager implementation for cp scheduler + */ + +static int set_sched_resources(struct device_queue_manager *dqm) +{ + int i, mec; + struct scheduling_resources res; + + res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap; + + res.queue_mask = 0; + for (i = 0; i < KGD_MAX_QUEUES; ++i) { + mec = (i / dqm->dev->shared_resources.num_queue_per_pipe) + / dqm->dev->shared_resources.num_pipe_per_mec; + + if (!test_bit(i, dqm->dev->shared_resources.queue_bitmap)) + continue; + + /* only acquire queues from the first MEC */ + if (mec > 0) + continue; + + /* This situation may be hit in the future if a new HW + * generation exposes more than 64 queues. 
If so, the + * definition of res.queue_mask needs updating + */ + if (WARN_ON(i >= (sizeof(res.queue_mask)*8))) { + pr_err("Invalid queue enabled by amdgpu: %d\n", i); + break; + } + + res.queue_mask |= (1ull << i); + } + res.gws_mask = res.oac_mask = res.gds_heap_base = + res.gds_heap_size = 0; + + pr_debug("Scheduling resources:\n" + "vmid mask: 0x%8X\n" + "queue mask: 0x%8llX\n", + res.vmid_mask, res.queue_mask); + + return pm_send_set_resources(&dqm->packets, &res); +} + +static int initialize_cpsch(struct device_queue_manager *dqm) +{ + pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm)); + + mutex_init(&dqm->lock_hidden); + INIT_LIST_HEAD(&dqm->queues); + dqm->queue_count = dqm->processes_count = 0; + dqm->sdma_queue_count = 0; + dqm->active_runlist = false; + dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1; + + INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception); + + return 0; +} + +static int start_cpsch(struct device_queue_manager *dqm) +{ + int retval; + + retval = 0; + + retval = pm_init(&dqm->packets, dqm); + if (retval) + goto fail_packet_manager_init; + + retval = set_sched_resources(dqm); + if (retval) + goto fail_set_sched_resources; + + pr_debug("Allocating fence memory\n"); + + /* allocate fence memory on the gart */ + retval = kfd_gtt_sa_allocate(dqm->dev, sizeof(*dqm->fence_addr), + &dqm->fence_mem); + + if (retval) + goto fail_allocate_vidmem; + + dqm->fence_addr = dqm->fence_mem->cpu_ptr; + dqm->fence_gpu_addr = dqm->fence_mem->gpu_addr; + + init_interrupts(dqm); + + dqm_lock(dqm); + /* clear hang status when driver try to start the hw scheduler */ + dqm->is_hws_hang = false; + execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + dqm_unlock(dqm); + + return 0; +fail_allocate_vidmem: +fail_set_sched_resources: + pm_uninit(&dqm->packets); +fail_packet_manager_init: + return retval; +} + +static int stop_cpsch(struct device_queue_manager *dqm) +{ + dqm_lock(dqm); + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); + dqm_unlock(dqm); + + pm_release_ib(&dqm->packets); + + kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); + pm_uninit(&dqm->packets); + + return 0; +} + +static int create_kernel_queue_cpsch(struct device_queue_manager *dqm, + struct kernel_queue *kq, + struct qcm_process_device *qpd) +{ + dqm_lock(dqm); + if (dqm->total_queue_count >= max_num_of_queues_per_device) { + pr_warn("Can't create new kernel queue because %d queues were already created\n", + dqm->total_queue_count); + dqm_unlock(dqm); + return -EPERM; + } + + /* + * Unconditionally increment this counter, regardless of the queue's + * type or whether the queue is active. + */ + dqm->total_queue_count++; + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + + list_add(&kq->list, &qpd->priv_queue_list); + dqm->queue_count++; + qpd->is_debug = true; + execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + dqm_unlock(dqm); + + return 0; +} + +static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, + struct kernel_queue *kq, + struct qcm_process_device *qpd) +{ + dqm_lock(dqm); + list_del(&kq->list); + dqm->queue_count--; + qpd->is_debug = false; + execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); + /* + * Unconditionally decrement this counter, regardless of the queue's + * type. 
+ */ + dqm->total_queue_count--; + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + dqm_unlock(dqm); +} + +static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + int retval; + struct mqd_manager *mqd_mgr; + + retval = 0; + + dqm_lock(dqm); + + if (dqm->total_queue_count >= max_num_of_queues_per_device) { + pr_warn("Can't create new usermode queue because %d queues were already created\n", + dqm->total_queue_count); + retval = -EPERM; + goto out_unlock; + } + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + retval = allocate_sdma_queue(dqm, &q->sdma_id); + if (retval) + goto out_unlock; + q->properties.sdma_queue_id = + q->sdma_id / get_num_sdma_engines(dqm); + q->properties.sdma_engine_id = + q->sdma_id % get_num_sdma_engines(dqm); + } + + retval = allocate_doorbell(qpd, q); + if (retval) + goto out_deallocate_sdma_queue; + + mqd_mgr = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + + if (!mqd_mgr) { + retval = -ENOMEM; + goto out_deallocate_doorbell; + } + /* + * Eviction state logic: we only mark active queues as evicted + * to avoid the overhead of restoring inactive queues later + */ + if (qpd->evicted) + q->properties.is_evicted = (q->properties.queue_size > 0 && + q->properties.queue_percent > 0 && + q->properties.queue_address != 0); + + dqm->asic_ops.init_sdma_vm(dqm, q, qpd); + + q->properties.tba_addr = qpd->tba_addr; + q->properties.tma_addr = qpd->tma_addr; + retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval) + goto out_deallocate_doorbell; + + list_add(&q->list, &qpd->queues_list); + qpd->queue_count++; + if (q->properties.is_active) { + dqm->queue_count++; + retval = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + } + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + dqm->sdma_queue_count++; + /* + * Unconditionally increment this counter, regardless of the queue's + * type or whether the queue is active. + */ + dqm->total_queue_count++; + + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + + dqm_unlock(dqm); + return retval; + +out_deallocate_doorbell: + deallocate_doorbell(qpd, q); +out_deallocate_sdma_queue: + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + deallocate_sdma_queue(dqm, q->sdma_id); +out_unlock: + dqm_unlock(dqm); + + return retval; +} + +int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + unsigned int fence_value, + unsigned int timeout_ms) +{ + unsigned long end_jiffies = msecs_to_jiffies(timeout_ms) + jiffies; + + while (*fence_addr != fence_value) { + if (time_after(jiffies, end_jiffies)) { + pr_err("qcm fence wait loop timeout expired\n"); + /* In HWS case, this is used to halt the driver thread + * in order not to mess up CP states before doing + * scandumps for FW debugging. 
+ */ + while (halt_if_hws_hang) + schedule(); + + return -ETIME; + } + schedule(); + } + + return 0; +} + +static int unmap_sdma_queues(struct device_queue_manager *dqm) +{ + int i, retval = 0; + + for (i = 0; i < dqm->dev->device_info->num_sdma_engines; i++) { + retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, i); + if (retval) + return retval; + } + return retval; +} + +/* dqm->lock mutex has to be locked before calling this function */ +static int map_queues_cpsch(struct device_queue_manager *dqm) +{ + int retval; + + if (dqm->queue_count <= 0 || dqm->processes_count <= 0) + return 0; + + if (dqm->active_runlist) + return 0; + + retval = pm_send_runlist(&dqm->packets, &dqm->queues); + if (retval) { + pr_err("failed to execute runlist\n"); + return retval; + } + dqm->active_runlist = true; + + return retval; +} + +/* dqm->lock mutex has to be locked before calling this function */ +static int unmap_queues_cpsch(struct device_queue_manager *dqm, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param) +{ + int retval = 0; + + if (dqm->is_hws_hang) + return -EIO; + if (!dqm->active_runlist) + return retval; + + pr_debug("Before destroying queues, sdma queue count is : %u\n", + dqm->sdma_queue_count); + + if (dqm->sdma_queue_count > 0) + unmap_sdma_queues(dqm); + + retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE, + filter, filter_param, false, 0); + if (retval) + return retval; + + *dqm->fence_addr = KFD_FENCE_INIT; + pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr, + KFD_FENCE_COMPLETED); + /* should be timed out */ + retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, + QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS); + if (retval) + return retval; + + pm_release_ib(&dqm->packets); + dqm->active_runlist = false; + + return retval; +} + +/* dqm->lock mutex has to be locked before calling this function */ +static int execute_queues_cpsch(struct device_queue_manager *dqm, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param) +{ + int retval; + + if (dqm->is_hws_hang) + return -EIO; + retval = unmap_queues_cpsch(dqm, filter, filter_param); + if (retval) { + pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n"); + dqm->is_hws_hang = true; + schedule_work(&dqm->hw_exception_work); + return retval; + } + + return map_queues_cpsch(dqm); +} + +static int destroy_queue_cpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int retval; + struct mqd_manager *mqd_mgr; + bool preempt_all_queues; + + preempt_all_queues = false; + + retval = 0; + + /* remove queue from list to prevent rescheduling after preemption */ + dqm_lock(dqm); + + if (qpd->is_debug) { + /* + * error, currently we do not allow to destroy a queue + * of a currently debugged process + */ + retval = -EBUSY; + goto failed_try_destroy_debugged_queue; + + } + + mqd_mgr = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd_mgr) { + retval = -ENOMEM; + goto failed; + } + + deallocate_doorbell(qpd, q); + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + dqm->sdma_queue_count--; + deallocate_sdma_queue(dqm, q->sdma_id); + } + + list_del(&q->list); + qpd->queue_count--; + if (q->properties.is_active) { + dqm->queue_count--; + retval = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + if (retval == -ETIME) + qpd->reset_wavefronts = true; + } + + mqd_mgr->uninit_mqd(mqd_mgr, 
q->mqd, q->mqd_mem_obj); + + /* + * Unconditionally decrement this counter, regardless of the queue's + * type + */ + dqm->total_queue_count--; + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + + dqm_unlock(dqm); + + return retval; + +failed: +failed_try_destroy_debugged_queue: + + dqm_unlock(dqm); + return retval; +} + +/* + * Low bits must be 0000/FFFF as required by HW, high bits must be 0 to + * stay in user mode. + */ +#define APE1_FIXED_BITS_MASK 0xFFFF80000000FFFFULL +/* APE1 limit is inclusive and 64K aligned. */ +#define APE1_LIMIT_ALIGNMENT 0xFFFF + +static bool set_cache_memory_policy(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) +{ + bool retval = true; + + if (!dqm->asic_ops.set_cache_memory_policy) + return retval; + + dqm_lock(dqm); + + if (alternate_aperture_size == 0) { + /* base > limit disables APE1 */ + qpd->sh_mem_ape1_base = 1; + qpd->sh_mem_ape1_limit = 0; + } else { + /* + * In FSA64, APE1_Base[63:0] = { 16{SH_MEM_APE1_BASE[31]}, + * SH_MEM_APE1_BASE[31:0], 0x0000 } + * APE1_Limit[63:0] = { 16{SH_MEM_APE1_LIMIT[31]}, + * SH_MEM_APE1_LIMIT[31:0], 0xFFFF } + * Verify that the base and size parameters can be + * represented in this format and convert them. + * Additionally restrict APE1 to user-mode addresses. + */ + + uint64_t base = (uintptr_t)alternate_aperture_base; + uint64_t limit = base + alternate_aperture_size - 1; + + if (limit <= base || (base & APE1_FIXED_BITS_MASK) != 0 || + (limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) { + retval = false; + goto out; + } + + qpd->sh_mem_ape1_base = base >> 16; + qpd->sh_mem_ape1_limit = limit >> 16; + } + + retval = dqm->asic_ops.set_cache_memory_policy( + dqm, + qpd, + default_policy, + alternate_policy, + alternate_aperture_base, + alternate_aperture_size); + + if ((dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0)) + program_sh_mem_settings(dqm, qpd); + + pr_debug("sh_mem_config: 0x%x, ape1_base: 0x%x, ape1_limit: 0x%x\n", + qpd->sh_mem_config, qpd->sh_mem_ape1_base, + qpd->sh_mem_ape1_limit); + +out: + dqm_unlock(dqm); + return retval; +} + +static int set_trap_handler(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + uint64_t tba_addr, + uint64_t tma_addr) +{ + uint64_t *tma; + + if (dqm->dev->cwsr_enabled) { + /* Jump from CWSR trap handler to user trap */ + tma = (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET); + tma[0] = tba_addr; + tma[1] = tma_addr; + } else { + qpd->tba_addr = tba_addr; + qpd->tma_addr = tma_addr; + } + + return 0; +} + +static int process_termination_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct queue *q, *next; + struct device_process_node *cur, *next_dpn; + int retval = 0; + + dqm_lock(dqm); + + /* Clear all user mode queues */ + list_for_each_entry_safe(q, next, &qpd->queues_list, list) { + int ret; + + ret = destroy_queue_nocpsch_locked(dqm, qpd, q); + if (ret) + retval = ret; + } + + /* Unregister process */ + list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { + if (qpd == cur->qpd) { + list_del(&cur->list); + kfree(cur); + dqm->processes_count--; + break; + } + } + + dqm_unlock(dqm); + return retval; +} + + +static int process_termination_cpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + int retval; + struct queue *q, *next; + struct kernel_queue *kq, 
*kq_next; + struct mqd_manager *mqd_mgr; + struct device_process_node *cur, *next_dpn; + enum kfd_unmap_queues_filter filter = + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES; + + retval = 0; + + dqm_lock(dqm); + + /* Clean all kernel queues */ + list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) { + list_del(&kq->list); + dqm->queue_count--; + qpd->is_debug = false; + dqm->total_queue_count--; + filter = KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES; + } + + /* Clear all user mode queues */ + list_for_each_entry(q, &qpd->queues_list, list) { + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + dqm->sdma_queue_count--; + deallocate_sdma_queue(dqm, q->sdma_id); + } + + if (q->properties.is_active) + dqm->queue_count--; + + dqm->total_queue_count--; + } + + /* Unregister process */ + list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { + if (qpd == cur->qpd) { + list_del(&cur->list); + kfree(cur); + dqm->processes_count--; + break; + } + } + + retval = execute_queues_cpsch(dqm, filter, 0); + if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) { + pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev); + dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process); + qpd->reset_wavefronts = false; + } + + /* lastly, free mqd resources */ + list_for_each_entry_safe(q, next, &qpd->queues_list, list) { + mqd_mgr = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd_mgr) { + retval = -ENOMEM; + goto out; + } + list_del(&q->list); + qpd->queue_count--; + mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + } + +out: + dqm_unlock(dqm); + return retval; +} + +struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) +{ + struct device_queue_manager *dqm; + + pr_debug("Loading device queue manager\n"); + + dqm = kzalloc(sizeof(*dqm), GFP_KERNEL); + if (!dqm) + return NULL; + + switch (dev->device_info->asic_family) { + /* HWS is not available on Hawaii. */ + case CHIP_HAWAII: + /* HWS depends on CWSR for timely dequeue. CWSR is not + * available on Tonga. + * + * FIXME: This argument also applies to Kaveri. 
+ */ + case CHIP_TONGA: + dqm->sched_policy = KFD_SCHED_POLICY_NO_HWS; + break; + default: + dqm->sched_policy = sched_policy; + break; + } + + dqm->dev = dev; + switch (dqm->sched_policy) { + case KFD_SCHED_POLICY_HWS: + case KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION: + /* initialize dqm for cp scheduling */ + dqm->ops.create_queue = create_queue_cpsch; + dqm->ops.initialize = initialize_cpsch; + dqm->ops.start = start_cpsch; + dqm->ops.stop = stop_cpsch; + dqm->ops.destroy_queue = destroy_queue_cpsch; + dqm->ops.update_queue = update_queue; + dqm->ops.get_mqd_manager = get_mqd_manager; + dqm->ops.register_process = register_process; + dqm->ops.unregister_process = unregister_process; + dqm->ops.uninitialize = uninitialize; + dqm->ops.create_kernel_queue = create_kernel_queue_cpsch; + dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch; + dqm->ops.set_cache_memory_policy = set_cache_memory_policy; + dqm->ops.set_trap_handler = set_trap_handler; + dqm->ops.process_termination = process_termination_cpsch; + dqm->ops.evict_process_queues = evict_process_queues_cpsch; + dqm->ops.restore_process_queues = restore_process_queues_cpsch; + break; + case KFD_SCHED_POLICY_NO_HWS: + /* initialize dqm for no cp scheduling */ + dqm->ops.start = start_nocpsch; + dqm->ops.stop = stop_nocpsch; + dqm->ops.create_queue = create_queue_nocpsch; + dqm->ops.destroy_queue = destroy_queue_nocpsch; + dqm->ops.update_queue = update_queue; + dqm->ops.get_mqd_manager = get_mqd_manager; + dqm->ops.register_process = register_process; + dqm->ops.unregister_process = unregister_process; + dqm->ops.initialize = initialize_nocpsch; + dqm->ops.uninitialize = uninitialize; + dqm->ops.set_cache_memory_policy = set_cache_memory_policy; + dqm->ops.set_trap_handler = set_trap_handler; + dqm->ops.process_termination = process_termination_nocpsch; + dqm->ops.evict_process_queues = evict_process_queues_nocpsch; + dqm->ops.restore_process_queues = + restore_process_queues_nocpsch; + break; + default: + pr_err("Invalid scheduling policy %d\n", dqm->sched_policy); + goto out_free; + } + + switch (dev->device_info->asic_family) { + case CHIP_CARRIZO: + device_queue_manager_init_vi(&dqm->asic_ops); + break; + + case CHIP_KAVERI: + device_queue_manager_init_cik(&dqm->asic_ops); + break; + + case CHIP_HAWAII: + device_queue_manager_init_cik_hawaii(&dqm->asic_ops); + break; + + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + device_queue_manager_init_vi_tonga(&dqm->asic_ops); + break; + + case CHIP_VEGA10: + case CHIP_RAVEN: + device_queue_manager_init_v9(&dqm->asic_ops); + break; + default: + WARN(1, "Unexpected ASIC family %u", + dev->device_info->asic_family); + goto out_free; + } + + if (!dqm->ops.initialize(dqm)) + return dqm; + +out_free: + kfree(dqm); + return NULL; +} + +void device_queue_manager_uninit(struct device_queue_manager *dqm) +{ + dqm->ops.uninitialize(dqm); + kfree(dqm); +} + +int kfd_process_vm_fault(struct device_queue_manager *dqm, + unsigned int pasid) +{ + struct kfd_process_device *pdd; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + int ret = 0; + + if (!p) + return -EINVAL; + pdd = kfd_get_process_device_data(dqm->dev, p); + if (pdd) + ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); + kfd_unref_process(p); + + return ret; +} + +static void kfd_process_hw_exception(struct work_struct *work) +{ + struct device_queue_manager *dqm = container_of(work, + struct device_queue_manager, hw_exception_work); + dqm->dev->kfd2kgd->gpu_recover(dqm->dev->kgd); +} + 
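+/*
+ * The debugfs helpers below dump HQD and SDMA RLC registers obtained from the
+ * kfd2kgd hqd_dump()/hqd_sdma_dump() callbacks.  seq_reg_dump() coalesces
+ * registers at consecutive addresses onto one line (up to eight values per
+ * line), printing the address only at the start of each run, e.g.
+ * "00001234: 00000001 00000002 ...".
+ */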
+#if defined(CONFIG_DEBUG_FS) + +static void seq_reg_dump(struct seq_file *m, + uint32_t (*dump)[2], uint32_t n_regs) +{ + uint32_t i, count; + + for (i = 0, count = 0; i < n_regs; i++) { + if (count == 0 || + dump[i-1][0] + sizeof(uint32_t) != dump[i][0]) { + seq_printf(m, "%s %08x: %08x", + i ? "\n" : "", + dump[i][0], dump[i][1]); + count = 7; + } else { + seq_printf(m, " %08x", dump[i][1]); + count--; + } + } + + seq_puts(m, "\n"); +} + +int dqm_debugfs_hqds(struct seq_file *m, void *data) +{ + struct device_queue_manager *dqm = data; + uint32_t (*dump)[2], n_regs; + int pipe, queue; + int r = 0; + + r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd, + KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, &dump, &n_regs); + if (!r) { + seq_printf(m, " HIQ on MEC %d Pipe %d Queue %d\n", + KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1, + KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm), + KFD_CIK_HIQ_QUEUE); + seq_reg_dump(m, dump, n_regs); + + kfree(dump); + } + + for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { + int pipe_offset = pipe * get_queues_per_pipe(dqm); + + for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) { + if (!test_bit(pipe_offset + queue, + dqm->dev->shared_resources.queue_bitmap)) + continue; + + r = dqm->dev->kfd2kgd->hqd_dump( + dqm->dev->kgd, pipe, queue, &dump, &n_regs); + if (r) + break; + + seq_printf(m, " CP Pipe %d, Queue %d\n", + pipe, queue); + seq_reg_dump(m, dump, n_regs); + + kfree(dump); + } + } + + for (pipe = 0; pipe < get_num_sdma_engines(dqm); pipe++) { + for (queue = 0; queue < KFD_SDMA_QUEUES_PER_ENGINE; queue++) { + r = dqm->dev->kfd2kgd->hqd_sdma_dump( + dqm->dev->kgd, pipe, queue, &dump, &n_regs); + if (r) + break; + + seq_printf(m, " SDMA Engine %d, RLC %d\n", + pipe, queue); + seq_reg_dump(m, dump, n_regs); + + kfree(dump); + } + } + + return r; +} + +int dqm_debugfs_execute_queues(struct device_queue_manager *dqm) +{ + int r = 0; + + dqm_lock(dqm); + dqm->active_runlist = true; + r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); + dqm_unlock(dqm); + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h new file mode 100644 index 000000000..00da3169a --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -0,0 +1,242 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ *
+ */
+
+#ifndef KFD_DEVICE_QUEUE_MANAGER_H_
+#define KFD_DEVICE_QUEUE_MANAGER_H_
+
+#include <linux/rwsem.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/sched/mm.h>
+#include "kfd_priv.h"
+#include "kfd_mqd_manager.h"
+
+#define KFD_UNMAP_LATENCY_MS (4000)
+#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
+#define KFD_SDMA_QUEUES_PER_ENGINE (2)
+
+struct device_process_node {
+ struct qcm_process_device *qpd;
+ struct list_head list;
+};
+
+/**
+ * struct device_queue_manager_ops
+ *
+ * @create_queue: Queue creation routine.
+ *
+ * @destroy_queue: Queue destruction routine.
+ *
+ * @update_queue: Queue update routine.
+ *
+ * @get_mqd_manager: Returns the mqd manager according to the mqd type.
+ *
+ * @execute_queues: Dispatches the queues list to the H/W.
+ *
+ * @register_process: This routine associates a specific process with the
+ * device.
+ *
+ * @unregister_process: Destroys the association between a process and the
+ * device.
+ *
+ * @initialize: Initializes the pipelines and memory module for that device.
+ *
+ * @start: Initializes the resources/modules the device needs for queue
+ * execution. This function is called on device initialization and after the
+ * system wakes up from suspension.
+ *
+ * @stop: This routine stops execution of all the active queues running on the
+ * H/W; it is called on system suspend.
+ *
+ * @uninitialize: Destroys all the device queue manager resources allocated in
+ * initialize routine.
+ *
+ * @create_kernel_queue: Creates kernel queue. Used for debug queue.
+ *
+ * @destroy_kernel_queue: Destroys kernel queue. Used for debug queue.
+ *
+ * @set_cache_memory_policy: Sets memory policy (cached/non-cached) for the
+ * memory apertures.
+ *
+ * @process_termination: Clears all process queues belonging to that device.
+ *
+ * @evict_process_queues: Evict all active queues of a process
+ *
+ * @restore_process_queues: Restore all evicted queues of a process
+ *
+ */
+
+struct device_queue_manager_ops {
+ int (*create_queue)(struct device_queue_manager *dqm,
+    struct queue *q,
+    struct qcm_process_device *qpd);
+
+ int (*destroy_queue)(struct device_queue_manager *dqm,
+    struct qcm_process_device *qpd,
+    struct queue *q);
+
+ int (*update_queue)(struct device_queue_manager *dqm,
+    struct queue *q);
+
+ struct mqd_manager * (*get_mqd_manager)
+     (struct device_queue_manager *dqm,
+      enum KFD_MQD_TYPE type);
+
+ int (*register_process)(struct device_queue_manager *dqm,
+     struct qcm_process_device *qpd);
+
+ int (*unregister_process)(struct device_queue_manager *dqm,
+     struct qcm_process_device *qpd);
+
+ int (*initialize)(struct device_queue_manager *dqm);
+ int (*start)(struct device_queue_manager *dqm);
+ int (*stop)(struct device_queue_manager *dqm);
+ void (*uninitialize)(struct device_queue_manager *dqm);
+ int (*create_kernel_queue)(struct device_queue_manager *dqm,
+     struct kernel_queue *kq,
+     struct qcm_process_device *qpd);
+
+ void (*destroy_kernel_queue)(struct device_queue_manager *dqm,
+     struct kernel_queue *kq,
+     struct qcm_process_device *qpd);
+
+ bool (*set_cache_memory_policy)(struct device_queue_manager *dqm,
+        struct qcm_process_device *qpd,
+        enum cache_policy default_policy,
+        enum cache_policy alternate_policy,
+        void __user *alternate_aperture_base,
+        uint64_t alternate_aperture_size);
+
+ int (*set_trap_handler)(struct device_queue_manager *dqm,
+    struct qcm_process_device *qpd,
+    uint64_t tba_addr,
+    uint64_t tma_addr);
+
+ int (*process_termination)(struct device_queue_manager *dqm,
+   struct qcm_process_device *qpd);
+
+ int (*evict_process_queues)(struct device_queue_manager *dqm,
+        struct qcm_process_device *qpd);
+ int (*restore_process_queues)(struct device_queue_manager *dqm,
+          struct qcm_process_device *qpd);
+};
+
+struct device_queue_manager_asic_ops {
+ int (*update_qpd)(struct device_queue_manager *dqm,
+   struct qcm_process_device *qpd);
+ bool (*set_cache_memory_policy)(struct device_queue_manager *dqm,
+        struct qcm_process_device *qpd,
+        enum cache_policy default_policy,
+        enum cache_policy alternate_policy,
+        void __user *alternate_aperture_base,
+        uint64_t alternate_aperture_size);
+ void (*init_sdma_vm)(struct device_queue_manager *dqm,
+   struct queue *q,
+   struct qcm_process_device *qpd);
+};
+
+/**
+ * struct device_queue_manager
+ *
+ * This struct is a base class for the kfd queues scheduler at the
+ * device level. The device base class should expose the basic operations
+ * for queue creation and queue destruction. This base class hides the
+ * scheduling mode of the driver and the specific implementation of the
+ * concrete device. This class is the only class in the queues scheduler
+ * that configures the H/W.
+ * + */ + +struct device_queue_manager { + struct device_queue_manager_ops ops; + struct device_queue_manager_asic_ops asic_ops; + + struct mqd_manager *mqd_mgrs[KFD_MQD_TYPE_MAX]; + struct packet_manager packets; + struct kfd_dev *dev; + struct mutex lock_hidden; /* use dqm_lock/unlock(dqm) */ + struct list_head queues; + unsigned int saved_flags; + unsigned int processes_count; + unsigned int queue_count; + unsigned int sdma_queue_count; + unsigned int total_queue_count; + unsigned int next_pipe_to_allocate; + unsigned int *allocated_queues; + unsigned int sdma_bitmap; + unsigned int vmid_bitmap; + uint64_t pipelines_addr; + struct kfd_mem_obj *pipeline_mem; + uint64_t fence_gpu_addr; + unsigned int *fence_addr; + struct kfd_mem_obj *fence_mem; + bool active_runlist; + int sched_policy; + + /* hw exception */ + bool is_hws_hang; + struct work_struct hw_exception_work; +}; + +void device_queue_manager_init_cik( + struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_cik_hawaii( + struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_vi( + struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_vi_tonga( + struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_v9( + struct device_queue_manager_asic_ops *asic_ops); +void program_sh_mem_settings(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +unsigned int get_queues_num(struct device_queue_manager *dqm); +unsigned int get_queues_per_pipe(struct device_queue_manager *dqm); +unsigned int get_pipes_per_mec(struct device_queue_manager *dqm); +unsigned int get_num_sdma_queues(struct device_queue_manager *dqm); + +static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) +{ + return (pdd->lds_base >> 16) & 0xFF; +} + +static inline unsigned int +get_sh_mem_bases_nybble_64(struct kfd_process_device *pdd) +{ + return (pdd->lds_base >> 60) & 0x0E; +} + +/* The DQM lock can be taken in MMU notifiers. Make sure no reclaim-FS + * happens while holding this lock anywhere to prevent deadlocks when + * an MMU notifier runs in reclaim-FS context. + */ +static inline void dqm_lock(struct device_queue_manager *dqm) +{ + mutex_lock(&dqm->lock_hidden); + dqm->saved_flags = memalloc_nofs_save(); +} +static inline void dqm_unlock(struct device_queue_manager *dqm) +{ + memalloc_nofs_restore(dqm->saved_flags); + mutex_unlock(&dqm->lock_hidden); +} + +#endif /* KFD_DEVICE_QUEUE_MANAGER_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c new file mode 100644 index 000000000..aed4c2141 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c @@ -0,0 +1,205 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_device_queue_manager.h" +#include "cik_regs.h" +#include "oss/oss_2_4_sh_mask.h" +#include "gca/gfx_7_2_sh_mask.h" + +static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size); +static int update_qpd_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static int update_qpd_cik_hawaii(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); +static void init_sdma_vm_hawaii(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); + +void device_queue_manager_init_cik( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; + asic_ops->update_qpd = update_qpd_cik; + asic_ops->init_sdma_vm = init_sdma_vm; +} + +void device_queue_manager_init_cik_hawaii( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; + asic_ops->update_qpd = update_qpd_cik_hawaii; + asic_ops->init_sdma_vm = init_sdma_vm_hawaii; +} + +static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) +{ + /* In 64-bit mode, we can only control the top 3 bits of the LDS, + * scratch and GPUVM apertures. + * The hardware fills in the remaining 59 bits according to the + * following pattern: + * LDS: X0000000'00000000 - X0000001'00000000 (4GB) + * Scratch: X0000001'00000000 - X0000002'00000000 (4GB) + * GPUVM: Y0010000'00000000 - Y0020000'00000000 (1TB) + * + * (where X/Y is the configurable nybble with the low-bit 0) + * + * LDS and scratch will have the same top nybble programmed in the + * top 3 bits of SH_MEM_BASES.PRIVATE_BASE. + * GPUVM can have a different top nybble programmed in the + * top 3 bits of SH_MEM_BASES.SHARED_BASE. + * We don't bother to support different top nybbles + * for LDS/Scratch and GPUVM. + */ + + WARN_ON((top_address_nybble & 1) || top_address_nybble > 0xE || + top_address_nybble == 0); + + return PRIVATE_BASE(top_address_nybble << 12) | + SHARED_BASE(top_address_nybble << 12); +} + +static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) +{ + uint32_t default_mtype; + uint32_t ape1_mtype; + + default_mtype = (default_policy == cache_policy_coherent) ? + MTYPE_NONCACHED : + MTYPE_CACHED; + + ape1_mtype = (alternate_policy == cache_policy_coherent) ? 
+ MTYPE_NONCACHED : + MTYPE_CACHED; + + qpd->sh_mem_config = (qpd->sh_mem_config & PTR32) + | ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) + | DEFAULT_MTYPE(default_mtype) + | APE1_MTYPE(ape1_mtype); + + return true; +} + +static int update_qpd_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + unsigned int temp; + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) | + DEFAULT_MTYPE(MTYPE_NONCACHED) | + APE1_MTYPE(MTYPE_NONCACHED); + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + if (qpd->pqm->process->is_32bit_user_mode) { + temp = get_sh_mem_bases_32(pdd); + qpd->sh_mem_bases = SHARED_BASE(temp); + qpd->sh_mem_config |= PTR32; + } else { + temp = get_sh_mem_bases_nybble_64(pdd); + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); + qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; + } + + pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", + qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); + + return 0; +} + +static int update_qpd_cik_hawaii(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + unsigned int temp; + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) | + DEFAULT_MTYPE(MTYPE_NONCACHED) | + APE1_MTYPE(MTYPE_NONCACHED); + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + /* On dGPU we're always in GPUVM64 addressing mode with 64-bit + * aperture addresses. + */ + temp = get_sh_mem_bases_nybble_64(pdd); + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); + + pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", + qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); + + return 0; +} + +static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + uint32_t value = (1 << SDMA0_RLC0_VIRTUAL_ADDR__ATC__SHIFT); + + if (q->process->is_32bit_user_mode) + value |= (1 << SDMA0_RLC0_VIRTUAL_ADDR__PTR32__SHIFT) | + get_sh_mem_bases_32(qpd_to_pdd(qpd)); + else + value |= ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; + + q->properties.sdma_vm_addr = value; +} + +static void init_sdma_vm_hawaii(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd) +{ + /* On dGPU we're always in GPUVM64 addressing mode with 64-bit + * aperture addresses. + */ + q->properties.sdma_vm_addr = + ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c new file mode 100644 index 000000000..417515332 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c @@ -0,0 +1,84 @@ +/* + * Copyright 2016-2018 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_device_queue_manager.h" +#include "vega10_enum.h" +#include "gc/gc_9_0_offset.h" +#include "gc/gc_9_0_sh_mask.h" +#include "sdma0/sdma0_4_0_sh_mask.h" + +static int update_qpd_v9(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); + +void device_queue_manager_init_v9( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->update_qpd = update_qpd_v9; + asic_ops->init_sdma_vm = init_sdma_vm_v9; +} + +static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) +{ + uint32_t shared_base = pdd->lds_base >> 48; + uint32_t private_base = pdd->scratch_base >> 48; + + return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | + private_base; +} + +static int update_qpd_v9(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; + if (noretry && + !dqm->dev->device_info->needs_iommu_device) + qpd->sh_mem_config |= + 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; + + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); + + pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); + + return 0; +} + +static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + /* Not needed on SDMAv4 any more */ + q->properties.sdma_vm_addr = 0; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c new file mode 100644 index 000000000..fd60a116b --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c @@ -0,0 +1,254 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_device_queue_manager.h" +#include "gca/gfx_8_0_enum.h" +#include "gca/gfx_8_0_sh_mask.h" +#include "gca/gfx_8_0_enum.h" +#include "oss/oss_3_0_sh_mask.h" + +static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size); +static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size); +static int update_qpd_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static int update_qpd_vi_tonga(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); +static void init_sdma_vm_tonga(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); + +void device_queue_manager_init_vi( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi; + asic_ops->update_qpd = update_qpd_vi; + asic_ops->init_sdma_vm = init_sdma_vm; +} + +void device_queue_manager_init_vi_tonga( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga; + asic_ops->update_qpd = update_qpd_vi_tonga; + asic_ops->init_sdma_vm = init_sdma_vm_tonga; +} + +static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) +{ + /* In 64-bit mode, we can only control the top 3 bits of the LDS, + * scratch and GPUVM apertures. + * The hardware fills in the remaining 59 bits according to the + * following pattern: + * LDS: X0000000'00000000 - X0000001'00000000 (4GB) + * Scratch: X0000001'00000000 - X0000002'00000000 (4GB) + * GPUVM: Y0010000'00000000 - Y0020000'00000000 (1TB) + * + * (where X/Y is the configurable nybble with the low-bit 0) + * + * LDS and scratch will have the same top nybble programmed in the + * top 3 bits of SH_MEM_BASES.PRIVATE_BASE. + * GPUVM can have a different top nybble programmed in the + * top 3 bits of SH_MEM_BASES.SHARED_BASE. + * We don't bother to support different top nybbles + * for LDS/Scratch and GPUVM. 
+ */ + + WARN_ON((top_address_nybble & 1) || top_address_nybble > 0xE || + top_address_nybble == 0); + + return top_address_nybble << 12 | + (top_address_nybble << 12) << + SH_MEM_BASES__SHARED_BASE__SHIFT; +} + +static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) +{ + uint32_t default_mtype; + uint32_t ape1_mtype; + + default_mtype = (default_policy == cache_policy_coherent) ? + MTYPE_CC : + MTYPE_NC; + + ape1_mtype = (alternate_policy == cache_policy_coherent) ? + MTYPE_CC : + MTYPE_NC; + + qpd->sh_mem_config = (qpd->sh_mem_config & + SH_MEM_CONFIG__ADDRESS_MODE_MASK) | + SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | + default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | + ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT | + SH_MEM_CONFIG__PRIVATE_ATC_MASK; + + return true; +} + +static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) +{ + uint32_t default_mtype; + uint32_t ape1_mtype; + + default_mtype = (default_policy == cache_policy_coherent) ? + MTYPE_UC : + MTYPE_NC; + + ape1_mtype = (alternate_policy == cache_policy_coherent) ? + MTYPE_UC : + MTYPE_NC; + + qpd->sh_mem_config = + SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | + default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | + ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT; + + return true; +} + +static int update_qpd_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + unsigned int temp; + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | + MTYPE_CC << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | + MTYPE_CC << SH_MEM_CONFIG__APE1_MTYPE__SHIFT | + SH_MEM_CONFIG__PRIVATE_ATC_MASK; + + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + if (qpd->pqm->process->is_32bit_user_mode) { + temp = get_sh_mem_bases_32(pdd); + qpd->sh_mem_bases = temp << SH_MEM_BASES__SHARED_BASE__SHIFT; + qpd->sh_mem_config |= SH_MEM_ADDRESS_MODE_HSA32 << + SH_MEM_CONFIG__ADDRESS_MODE__SHIFT; + } else { + temp = get_sh_mem_bases_nybble_64(pdd); + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); + qpd->sh_mem_config |= SH_MEM_ADDRESS_MODE_HSA64 << + SH_MEM_CONFIG__ADDRESS_MODE__SHIFT; + qpd->sh_mem_config |= 1 << + SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; + } + + pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", + qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); + + return 0; +} + +static int update_qpd_vi_tonga(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + unsigned int temp; + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | + MTYPE_UC << + SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | + MTYPE_UC << + SH_MEM_CONFIG__APE1_MTYPE__SHIFT; + + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + 
+ /* On dGPU we're always in GPUVM64 addressing mode with 64-bit + * aperture addresses. + */ + temp = get_sh_mem_bases_nybble_64(pdd); + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); + + pr_debug("sh_mem_bases nybble: 0x%X and register 0x%X\n", + temp, qpd->sh_mem_bases); + + return 0; +} + +static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + uint32_t value = (1 << SDMA0_RLC0_VIRTUAL_ADDR__ATC__SHIFT); + + if (q->process->is_32bit_user_mode) + value |= (1 << SDMA0_RLC0_VIRTUAL_ADDR__PTR32__SHIFT) | + get_sh_mem_bases_32(qpd_to_pdd(qpd)); + else + value |= ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; + + q->properties.sdma_vm_addr = value; +} + +static void init_sdma_vm_tonga(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd) +{ + /* On dGPU we're always in GPUVM64 addressing mode with 64-bit + * aperture addresses. + */ + q->properties.sdma_vm_addr = + ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c new file mode 100644 index 000000000..ebe79bf00 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -0,0 +1,275 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "kfd_priv.h" +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/idr.h> + +/* + * This extension supports a kernel level doorbells management for the + * kernel queues using the first doorbell page reserved for the kernel. + */ + +static DEFINE_IDA(doorbell_ida); +static unsigned int max_doorbell_slices; + +/* + * Each device exposes a doorbell aperture, a PCI MMIO aperture that + * receives 32-bit writes that are passed to queues as wptr values. + * The doorbells are intended to be written by applications as part + * of queueing work on user-mode queues. + * We assign doorbells to applications in PAGE_SIZE-sized and aligned chunks. + * We map the doorbell address space into user-mode when a process creates + * its first queue on each device. 
+ * Although the mapping is done by KFD, it is equivalent to an mmap of + * the /dev/kfd with the particular device encoded in the mmap offset. + * There will be other uses for mmap of /dev/kfd, so only a range of + * offsets (KFD_MMAP_DOORBELL_START-END) is used for doorbells. + */ + +/* # of doorbell bytes allocated for each process. */ +size_t kfd_doorbell_process_slice(struct kfd_dev *kfd) +{ + return roundup(kfd->device_info->doorbell_size * + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, + PAGE_SIZE); +} + +/* Doorbell calculations for device init. */ +int kfd_doorbell_init(struct kfd_dev *kfd) +{ + size_t doorbell_start_offset; + size_t doorbell_aperture_size; + size_t doorbell_process_limit; + + /* + * We start with calculations in bytes because the input data might + * only be byte-aligned. + * Only after we have done the rounding can we assume any alignment. + */ + + doorbell_start_offset = + roundup(kfd->shared_resources.doorbell_start_offset, + kfd_doorbell_process_slice(kfd)); + + doorbell_aperture_size = + rounddown(kfd->shared_resources.doorbell_aperture_size, + kfd_doorbell_process_slice(kfd)); + + if (doorbell_aperture_size > doorbell_start_offset) + doorbell_process_limit = + (doorbell_aperture_size - doorbell_start_offset) / + kfd_doorbell_process_slice(kfd); + else + return -ENOSPC; + + if (!max_doorbell_slices || + doorbell_process_limit < max_doorbell_slices) + max_doorbell_slices = doorbell_process_limit; + + kfd->doorbell_base = kfd->shared_resources.doorbell_physical_address + + doorbell_start_offset; + + kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32); + + kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, + kfd_doorbell_process_slice(kfd)); + + if (!kfd->doorbell_kernel_ptr) + return -ENOMEM; + + pr_debug("Doorbell initialization:\n"); + pr_debug("doorbell base == 0x%08lX\n", + (uintptr_t)kfd->doorbell_base); + + pr_debug("doorbell_id_offset == 0x%08lX\n", + kfd->doorbell_id_offset); + + pr_debug("doorbell_process_limit == 0x%08lX\n", + doorbell_process_limit); + + pr_debug("doorbell_kernel_offset == 0x%08lX\n", + (uintptr_t)kfd->doorbell_base); + + pr_debug("doorbell aperture size == 0x%08lX\n", + kfd->shared_resources.doorbell_aperture_size); + + pr_debug("doorbell kernel address == %p\n", kfd->doorbell_kernel_ptr); + + return 0; +} + +void kfd_doorbell_fini(struct kfd_dev *kfd) +{ + if (kfd->doorbell_kernel_ptr) + iounmap(kfd->doorbell_kernel_ptr); +} + +int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, + struct vm_area_struct *vma) +{ + phys_addr_t address; + + /* + * For simplicitly we only allow mapping of the entire doorbell + * allocation of a single device & process. 
+ */ + if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev)) + return -EINVAL; + + /* Calculate physical address of doorbell */ + address = kfd_get_process_doorbells(dev, process); + + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | + VM_DONTDUMP | VM_PFNMAP; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + pr_debug("Mapping doorbell page\n" + " target user address == 0x%08llX\n" + " physical address == 0x%08llX\n" + " vm_flags == 0x%04lX\n" + " size == 0x%04lX\n", + (unsigned long long) vma->vm_start, address, vma->vm_flags, + kfd_doorbell_process_slice(dev)); + + + return io_remap_pfn_range(vma, + vma->vm_start, + address >> PAGE_SHIFT, + kfd_doorbell_process_slice(dev), + vma->vm_page_prot); +} + + +/* get kernel iomem pointer for a doorbell */ +void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + unsigned int *doorbell_off) +{ + u32 inx; + + mutex_lock(&kfd->doorbell_mutex); + inx = find_first_zero_bit(kfd->doorbell_available_index, + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); + + __set_bit(inx, kfd->doorbell_available_index); + mutex_unlock(&kfd->doorbell_mutex); + + if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) + return NULL; + + inx *= kfd->device_info->doorbell_size / sizeof(u32); + + /* + * Calculating the kernel doorbell offset using the first + * doorbell page. + */ + *doorbell_off = kfd->doorbell_id_offset + inx; + + pr_debug("Get kernel queue doorbell\n" + " doorbell offset == 0x%08X\n" + " doorbell index == 0x%x\n", + *doorbell_off, inx); + + return kfd->doorbell_kernel_ptr + inx; +} + +void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) +{ + unsigned int inx; + + inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr) + * sizeof(u32) / kfd->device_info->doorbell_size; + + mutex_lock(&kfd->doorbell_mutex); + __clear_bit(inx, kfd->doorbell_available_index); + mutex_unlock(&kfd->doorbell_mutex); +} + +void write_kernel_doorbell(void __iomem *db, u32 value) +{ + if (db) { + writel(value, db); + pr_debug("Writing %d to doorbell address %p\n", value, db); + } +} + +void write_kernel_doorbell64(void __iomem *db, u64 value) +{ + if (db) { + WARN(((unsigned long)db & 7) != 0, + "Unaligned 64-bit doorbell"); + writeq(value, (u64 __iomem *)db); + pr_debug("writing %llu to doorbell address %p\n", value, db); + } +} + +unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, + struct kfd_process *process, + unsigned int doorbell_id) +{ + /* + * doorbell_id_offset accounts for doorbells taken by KGD. + * index * kfd_doorbell_process_slice/sizeof(u32) adjusts to + * the process's doorbells. The offset returned is in dword + * units regardless of the ASIC-dependent doorbell size. 
+ */ + return kfd->doorbell_id_offset + + process->doorbell_index + * kfd_doorbell_process_slice(kfd) / sizeof(u32) + + doorbell_id * kfd->device_info->doorbell_size / sizeof(u32); +} + +uint64_t kfd_get_number_elems(struct kfd_dev *kfd) +{ + uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - + kfd->shared_resources.doorbell_start_offset) / + kfd_doorbell_process_slice(kfd) + 1; + + return num_of_elems; + +} + +phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process) +{ + return dev->doorbell_base + + process->doorbell_index * kfd_doorbell_process_slice(dev); +} + +int kfd_alloc_process_doorbells(struct kfd_process *process) +{ + int r = ida_simple_get(&doorbell_ida, 1, max_doorbell_slices, + GFP_KERNEL); + if (r > 0) + process->doorbell_index = r; + + return r; +} + +void kfd_free_process_doorbells(struct kfd_process *process) +{ + if (process->doorbell_index) + ida_simple_remove(&doorbell_ida, process->doorbell_index); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c new file mode 100644 index 000000000..892077377 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -0,0 +1,1038 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/mm_types.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/sched/signal.h> +#include <linux/sched/mm.h> +#include <linux/uaccess.h> +#include <linux/mman.h> +#include <linux/memory.h> +#include "kfd_priv.h" +#include "kfd_events.h" +#include "kfd_iommu.h" +#include <linux/device.h> + +/* + * Wrapper around wait_queue_entry_t + */ +struct kfd_event_waiter { + wait_queue_entry_t wait; + struct kfd_event *event; /* Event to wait for */ + bool activated; /* Becomes true when event is signaled */ +}; + +/* + * Each signal event needs a 64-bit signal slot where the signaler will write + * a 1 before sending an interrupt. (This is needed because some interrupts + * do not contain enough spare data bits to identify an event.) + * We get whole pages and map them to the process VA. + * Individual signal events use their event_id as slot index. 
+ */ +struct kfd_signal_page { + uint64_t *kernel_address; + uint64_t __user *user_address; + bool need_to_free_pages; +}; + + +static uint64_t *page_slots(struct kfd_signal_page *page) +{ + return page->kernel_address; +} + +static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p) +{ + void *backing_store; + struct kfd_signal_page *page; + + page = kzalloc(sizeof(*page), GFP_KERNEL); + if (!page) + return NULL; + + backing_store = (void *) __get_free_pages(GFP_KERNEL, + get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); + if (!backing_store) + goto fail_alloc_signal_store; + + /* Initialize all events to unsignaled */ + memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT, + KFD_SIGNAL_EVENT_LIMIT * 8); + + page->kernel_address = backing_store; + page->need_to_free_pages = true; + pr_debug("Allocated new event signal page at %p, for process %p\n", + page, p); + + return page; + +fail_alloc_signal_store: + kfree(page); + return NULL; +} + +static int allocate_event_notification_slot(struct kfd_process *p, + struct kfd_event *ev) +{ + int id; + + if (!p->signal_page) { + p->signal_page = allocate_signal_page(p); + if (!p->signal_page) + return -ENOMEM; + /* Oldest user mode expects 256 event slots */ + p->signal_mapped_size = 256*8; + } + + /* + * Compatibility with old user mode: Only use signal slots + * user mode has mapped, may be less than + * KFD_SIGNAL_EVENT_LIMIT. This also allows future increase + * of the event limit without breaking user mode. + */ + id = idr_alloc(&p->event_idr, ev, 0, p->signal_mapped_size / 8, + GFP_KERNEL); + if (id < 0) + return id; + + ev->event_id = id; + page_slots(p->signal_page)[id] = UNSIGNALED_EVENT_SLOT; + + return 0; +} + +/* + * Assumes that p->event_mutex is held and of course that p is not going + * away (current or locked). + */ +static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) +{ + return idr_find(&p->event_idr, id); +} + +/** + * lookup_signaled_event_by_partial_id - Lookup signaled event from partial ID + * @p: Pointer to struct kfd_process + * @id: ID to look up + * @bits: Number of valid bits in @id + * + * Finds the first signaled event with a matching partial ID. If no + * matching signaled event is found, returns NULL. In that case the + * caller should assume that the partial ID is invalid and do an + * exhaustive search of all siglaned events. + * + * If multiple events with the same partial ID signal at the same + * time, they will be found one interrupt at a time, not necessarily + * in the same order the interrupts occurred. As long as the number of + * interrupts is correct, all signaled events will be seen by the + * driver. + */ +static struct kfd_event *lookup_signaled_event_by_partial_id( + struct kfd_process *p, uint32_t id, uint32_t bits) +{ + struct kfd_event *ev; + + if (!p->signal_page || id >= KFD_SIGNAL_EVENT_LIMIT) + return NULL; + + /* Fast path for the common case that @id is not a partial ID + * and we only need a single lookup. + */ + if (bits > 31 || (1U << bits) >= KFD_SIGNAL_EVENT_LIMIT) { + if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT) + return NULL; + + return idr_find(&p->event_idr, id); + } + + /* General case for partial IDs: Iterate over all matching IDs + * and find the first one that has signaled. 
+ */ + for (ev = NULL; id < KFD_SIGNAL_EVENT_LIMIT && !ev; id += 1U << bits) { + if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT) + continue; + + ev = idr_find(&p->event_idr, id); + } + + return ev; +} + +static int create_signal_event(struct file *devkfd, + struct kfd_process *p, + struct kfd_event *ev) +{ + int ret; + + if (p->signal_mapped_size && + p->signal_event_count == p->signal_mapped_size / 8) { + if (!p->signal_event_limit_reached) { + pr_warn("Signal event wasn't created because limit was reached\n"); + p->signal_event_limit_reached = true; + } + return -ENOSPC; + } + + ret = allocate_event_notification_slot(p, ev); + if (ret) { + pr_warn("Signal event wasn't created because out of kernel memory\n"); + return ret; + } + + p->signal_event_count++; + + ev->user_signal_address = &p->signal_page->user_address[ev->event_id]; + pr_debug("Signal event number %zu created with id %d, address %p\n", + p->signal_event_count, ev->event_id, + ev->user_signal_address); + + return 0; +} + +static int create_other_event(struct kfd_process *p, struct kfd_event *ev) +{ + /* Cast KFD_LAST_NONSIGNAL_EVENT to uint32_t. This allows an + * intentional integer overflow to -1 without a compiler + * warning. idr_alloc treats a negative value as "maximum + * signed integer". + */ + int id = idr_alloc(&p->event_idr, ev, KFD_FIRST_NONSIGNAL_EVENT_ID, + (uint32_t)KFD_LAST_NONSIGNAL_EVENT_ID + 1, + GFP_KERNEL); + + if (id < 0) + return id; + ev->event_id = id; + + return 0; +} + +void kfd_event_init_process(struct kfd_process *p) +{ + mutex_init(&p->event_mutex); + idr_init(&p->event_idr); + p->signal_page = NULL; + p->signal_event_count = 0; +} + +static void destroy_event(struct kfd_process *p, struct kfd_event *ev) +{ + struct kfd_event_waiter *waiter; + + /* Wake up pending waiters. They will return failure */ + list_for_each_entry(waiter, &ev->wq.head, wait.entry) + waiter->event = NULL; + wake_up_all(&ev->wq); + + if (ev->type == KFD_EVENT_TYPE_SIGNAL || + ev->type == KFD_EVENT_TYPE_DEBUG) + p->signal_event_count--; + + idr_remove(&p->event_idr, ev->event_id); + kfree(ev); +} + +static void destroy_events(struct kfd_process *p) +{ + struct kfd_event *ev; + uint32_t id; + + idr_for_each_entry(&p->event_idr, ev, id) + destroy_event(p, ev); + idr_destroy(&p->event_idr); +} + +/* + * We assume that the process is being destroyed and there is no need to + * unmap the pages or keep bookkeeping data in order. 
+ */ +static void shutdown_signal_page(struct kfd_process *p) +{ + struct kfd_signal_page *page = p->signal_page; + + if (page) { + if (page->need_to_free_pages) + free_pages((unsigned long)page->kernel_address, + get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); + kfree(page); + } +} + +void kfd_event_free_process(struct kfd_process *p) +{ + destroy_events(p); + shutdown_signal_page(p); +} + +static bool event_can_be_gpu_signaled(const struct kfd_event *ev) +{ + return ev->type == KFD_EVENT_TYPE_SIGNAL || + ev->type == KFD_EVENT_TYPE_DEBUG; +} + +static bool event_can_be_cpu_signaled(const struct kfd_event *ev) +{ + return ev->type == KFD_EVENT_TYPE_SIGNAL; +} + +int kfd_event_page_set(struct kfd_process *p, void *kernel_address, + uint64_t size) +{ + struct kfd_signal_page *page; + + if (p->signal_page) + return -EBUSY; + + page = kzalloc(sizeof(*page), GFP_KERNEL); + if (!page) + return -ENOMEM; + + /* Initialize all events to unsignaled */ + memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT, + KFD_SIGNAL_EVENT_LIMIT * 8); + + page->kernel_address = kernel_address; + + p->signal_page = page; + p->signal_mapped_size = size; + + return 0; +} + +int kfd_event_create(struct file *devkfd, struct kfd_process *p, + uint32_t event_type, bool auto_reset, uint32_t node_id, + uint32_t *event_id, uint32_t *event_trigger_data, + uint64_t *event_page_offset, uint32_t *event_slot_index) +{ + int ret = 0; + struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); + + if (!ev) + return -ENOMEM; + + ev->type = event_type; + ev->auto_reset = auto_reset; + ev->signaled = false; + + init_waitqueue_head(&ev->wq); + + *event_page_offset = 0; + + mutex_lock(&p->event_mutex); + + switch (event_type) { + case KFD_EVENT_TYPE_SIGNAL: + case KFD_EVENT_TYPE_DEBUG: + ret = create_signal_event(devkfd, p, ev); + if (!ret) { + *event_page_offset = KFD_MMAP_TYPE_EVENTS; + *event_page_offset <<= PAGE_SHIFT; + *event_slot_index = ev->event_id; + } + break; + default: + ret = create_other_event(p, ev); + break; + } + + if (!ret) { + *event_id = ev->event_id; + *event_trigger_data = ev->event_id; + } else { + kfree(ev); + } + + mutex_unlock(&p->event_mutex); + + return ret; +} + +/* Assumes that p is current. */ +int kfd_event_destroy(struct kfd_process *p, uint32_t event_id) +{ + struct kfd_event *ev; + int ret = 0; + + mutex_lock(&p->event_mutex); + + ev = lookup_event_by_id(p, event_id); + + if (ev) + destroy_event(p, ev); + else + ret = -EINVAL; + + mutex_unlock(&p->event_mutex); + return ret; +} + +static void set_event(struct kfd_event *ev) +{ + struct kfd_event_waiter *waiter; + + /* Auto reset if the list is non-empty and we're waking + * someone. waitqueue_active is safe here because we're + * protected by the p->event_mutex, which is also held when + * updating the wait queues in kfd_wait_on_events. + */ + ev->signaled = !ev->auto_reset || !waitqueue_active(&ev->wq); + + list_for_each_entry(waiter, &ev->wq.head, wait.entry) + waiter->activated = true; + + wake_up_all(&ev->wq); +} + +/* Assumes that p is current. */ +int kfd_set_event(struct kfd_process *p, uint32_t event_id) +{ + int ret = 0; + struct kfd_event *ev; + + mutex_lock(&p->event_mutex); + + ev = lookup_event_by_id(p, event_id); + + if (ev && event_can_be_cpu_signaled(ev)) + set_event(ev); + else + ret = -EINVAL; + + mutex_unlock(&p->event_mutex); + return ret; +} + +static void reset_event(struct kfd_event *ev) +{ + ev->signaled = false; +} + +/* Assumes that p is current. 
*/ +int kfd_reset_event(struct kfd_process *p, uint32_t event_id) +{ + int ret = 0; + struct kfd_event *ev; + + mutex_lock(&p->event_mutex); + + ev = lookup_event_by_id(p, event_id); + + if (ev && event_can_be_cpu_signaled(ev)) + reset_event(ev); + else + ret = -EINVAL; + + mutex_unlock(&p->event_mutex); + return ret; + +} + +static void acknowledge_signal(struct kfd_process *p, struct kfd_event *ev) +{ + page_slots(p->signal_page)[ev->event_id] = UNSIGNALED_EVENT_SLOT; +} + +static void set_event_from_interrupt(struct kfd_process *p, + struct kfd_event *ev) +{ + if (ev && event_can_be_gpu_signaled(ev)) { + acknowledge_signal(p, ev); + set_event(ev); + } +} + +void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, + uint32_t valid_id_bits) +{ + struct kfd_event *ev = NULL; + + /* + * Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are + * running so the lookup function increments the process ref count. + */ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + + if (!p) + return; /* Presumably process exited. */ + + mutex_lock(&p->event_mutex); + + if (valid_id_bits) + ev = lookup_signaled_event_by_partial_id(p, partial_id, + valid_id_bits); + if (ev) { + set_event_from_interrupt(p, ev); + } else if (p->signal_page) { + /* + * Partial ID lookup failed. Assume that the event ID + * in the interrupt payload was invalid and do an + * exhaustive search of signaled events. + */ + uint64_t *slots = page_slots(p->signal_page); + uint32_t id; + + if (valid_id_bits) + pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n", + partial_id, valid_id_bits); + + if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT / 64) { + /* With relatively few events, it's faster to + * iterate over the event IDR + */ + idr_for_each_entry(&p->event_idr, ev, id) { + if (id >= KFD_SIGNAL_EVENT_LIMIT) + break; + + if (slots[id] != UNSIGNALED_EVENT_SLOT) + set_event_from_interrupt(p, ev); + } + } else { + /* With relatively many events, it's faster to + * iterate over the signal slots and lookup + * only signaled events from the IDR. + */ + for (id = 0; id < KFD_SIGNAL_EVENT_LIMIT; id++) + if (slots[id] != UNSIGNALED_EVENT_SLOT) { + ev = lookup_event_by_id(p, id); + set_event_from_interrupt(p, ev); + } + } + } + + mutex_unlock(&p->event_mutex); + kfd_unref_process(p); +} + +static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) +{ + struct kfd_event_waiter *event_waiters; + uint32_t i; + + event_waiters = kmalloc_array(num_events, + sizeof(struct kfd_event_waiter), + GFP_KERNEL); + if (!event_waiters) + return NULL; + + for (i = 0; (event_waiters) && (i < num_events) ; i++) { + init_wait(&event_waiters[i].wait); + event_waiters[i].activated = false; + } + + return event_waiters; +} + +static int init_event_waiter_get_status(struct kfd_process *p, + struct kfd_event_waiter *waiter, + uint32_t event_id) +{ + struct kfd_event *ev = lookup_event_by_id(p, event_id); + + if (!ev) + return -EINVAL; + + waiter->event = ev; + waiter->activated = ev->signaled; + ev->signaled = ev->signaled && !ev->auto_reset; + + return 0; +} + +static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter) +{ + struct kfd_event *ev = waiter->event; + + /* Only add to the wait list if we actually need to + * wait on this event. 
+ */ + if (!waiter->activated) + add_wait_queue(&ev->wq, &waiter->wait); +} + +/* test_event_condition - Test condition of events being waited for + * @all: Return completion only if all events have signaled + * @num_events: Number of events to wait for + * @event_waiters: Array of event waiters, one per event + * + * Returns KFD_IOC_WAIT_RESULT_COMPLETE if all (or one) event(s) have + * signaled. Returns KFD_IOC_WAIT_RESULT_TIMEOUT if no (or not all) + * events have signaled. Returns KFD_IOC_WAIT_RESULT_FAIL if any of + * the events have been destroyed. + */ +static uint32_t test_event_condition(bool all, uint32_t num_events, + struct kfd_event_waiter *event_waiters) +{ + uint32_t i; + uint32_t activated_count = 0; + + for (i = 0; i < num_events; i++) { + if (!event_waiters[i].event) + return KFD_IOC_WAIT_RESULT_FAIL; + + if (event_waiters[i].activated) { + if (!all) + return KFD_IOC_WAIT_RESULT_COMPLETE; + + activated_count++; + } + } + + return activated_count == num_events ? + KFD_IOC_WAIT_RESULT_COMPLETE : KFD_IOC_WAIT_RESULT_TIMEOUT; +} + +/* + * Copy event specific data, if defined. + * Currently only memory exception events have additional data to copy to user + */ +static int copy_signaled_event_data(uint32_t num_events, + struct kfd_event_waiter *event_waiters, + struct kfd_event_data __user *data) +{ + struct kfd_hsa_memory_exception_data *src; + struct kfd_hsa_memory_exception_data __user *dst; + struct kfd_event_waiter *waiter; + struct kfd_event *event; + uint32_t i; + + for (i = 0; i < num_events; i++) { + waiter = &event_waiters[i]; + event = waiter->event; + if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) { + dst = &data[i].memory_exception_data; + src = &event->memory_exception_data; + if (copy_to_user(dst, src, + sizeof(struct kfd_hsa_memory_exception_data))) + return -EFAULT; + } + } + + return 0; + +} + + + +static long user_timeout_to_jiffies(uint32_t user_timeout_ms) +{ + if (user_timeout_ms == KFD_EVENT_TIMEOUT_IMMEDIATE) + return 0; + + if (user_timeout_ms == KFD_EVENT_TIMEOUT_INFINITE) + return MAX_SCHEDULE_TIMEOUT; + + /* + * msecs_to_jiffies interprets all values above 2^31-1 as infinite, + * but we consider them finite. + * This hack is wrong, but nobody is likely to notice. + */ + user_timeout_ms = min_t(uint32_t, user_timeout_ms, 0x7FFFFFFF); + + return msecs_to_jiffies(user_timeout_ms) + 1; +} + +static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters) +{ + uint32_t i; + + for (i = 0; i < num_events; i++) + if (waiters[i].event) + remove_wait_queue(&waiters[i].event->wq, + &waiters[i].wait); + + kfree(waiters); +} + +int kfd_wait_on_events(struct kfd_process *p, + uint32_t num_events, void __user *data, + bool all, uint32_t user_timeout_ms, + uint32_t *wait_result) +{ + struct kfd_event_data __user *events = + (struct kfd_event_data __user *) data; + uint32_t i; + int ret = 0; + + struct kfd_event_waiter *event_waiters = NULL; + long timeout = user_timeout_to_jiffies(user_timeout_ms); + + event_waiters = alloc_event_waiters(num_events); + if (!event_waiters) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&p->event_mutex); + + for (i = 0; i < num_events; i++) { + struct kfd_event_data event_data; + + if (copy_from_user(&event_data, &events[i], + sizeof(struct kfd_event_data))) { + ret = -EFAULT; + goto out_unlock; + } + + ret = init_event_waiter_get_status(p, &event_waiters[i], + event_data.event_id); + if (ret) + goto out_unlock; + } + + /* Check condition once. 
*/ + *wait_result = test_event_condition(all, num_events, event_waiters); + if (*wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) { + ret = copy_signaled_event_data(num_events, + event_waiters, events); + goto out_unlock; + } else if (WARN_ON(*wait_result == KFD_IOC_WAIT_RESULT_FAIL)) { + /* This should not happen. Events shouldn't be + * destroyed while we're holding the event_mutex + */ + goto out_unlock; + } + + /* Add to wait lists if we need to wait. */ + for (i = 0; i < num_events; i++) + init_event_waiter_add_to_waitlist(&event_waiters[i]); + + mutex_unlock(&p->event_mutex); + + while (true) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + if (signal_pending(current)) { + /* + * This is wrong when a nonzero, non-infinite timeout + * is specified. We need to use + * ERESTARTSYS_RESTARTBLOCK, but struct restart_block + * contains a union with data for each user and it's + * in generic kernel code that I don't want to + * touch yet. + */ + ret = -ERESTARTSYS; + break; + } + + /* Set task state to interruptible sleep before + * checking wake-up conditions. A concurrent wake-up + * will put the task back into runnable state. In that + * case schedule_timeout will not put the task to + * sleep and we'll get a chance to re-check the + * updated conditions almost immediately. Otherwise, + * this race condition would lead to a soft hang or a + * very long sleep. + */ + set_current_state(TASK_INTERRUPTIBLE); + + *wait_result = test_event_condition(all, num_events, + event_waiters); + if (*wait_result != KFD_IOC_WAIT_RESULT_TIMEOUT) + break; + + if (timeout <= 0) + break; + + timeout = schedule_timeout(timeout); + } + __set_current_state(TASK_RUNNING); + + /* copy_signaled_event_data may sleep. So this has to happen + * after the task state is set back to RUNNING. + */ + if (!ret && *wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) + ret = copy_signaled_event_data(num_events, + event_waiters, events); + + mutex_lock(&p->event_mutex); +out_unlock: + free_waiters(num_events, event_waiters); + mutex_unlock(&p->event_mutex); +out: + if (ret) + *wait_result = KFD_IOC_WAIT_RESULT_FAIL; + else if (*wait_result == KFD_IOC_WAIT_RESULT_FAIL) + ret = -EIO; + + return ret; +} + +int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) +{ + unsigned long pfn; + struct kfd_signal_page *page; + int ret; + + /* check required size doesn't exceed the allocated size */ + if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) < + get_order(vma->vm_end - vma->vm_start)) { + pr_err("Event page mmap requested illegal size\n"); + return -EINVAL; + } + + page = p->signal_page; + if (!page) { + /* Probably KFD bug, but mmap is user-accessible. 
*/ + pr_debug("Signal page could not be found\n"); + return -EINVAL; + } + + pfn = __pa(page->kernel_address); + pfn >>= PAGE_SHIFT; + + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE + | VM_DONTDUMP | VM_PFNMAP; + + pr_debug("Mapping signal page\n"); + pr_debug(" start user address == 0x%08lx\n", vma->vm_start); + pr_debug(" end user address == 0x%08lx\n", vma->vm_end); + pr_debug(" pfn == 0x%016lX\n", pfn); + pr_debug(" vm_flags == 0x%08lX\n", vma->vm_flags); + pr_debug(" size == 0x%08lX\n", + vma->vm_end - vma->vm_start); + + page->user_address = (uint64_t __user *)vma->vm_start; + + /* mapping the page to user process */ + ret = remap_pfn_range(vma, vma->vm_start, pfn, + vma->vm_end - vma->vm_start, vma->vm_page_prot); + if (!ret) + p->signal_mapped_size = vma->vm_end - vma->vm_start; + + return ret; +} + +/* + * Assumes that p->event_mutex is held and of course + * that p is not going away (current or locked). + */ +static void lookup_events_by_type_and_signal(struct kfd_process *p, + int type, void *event_data) +{ + struct kfd_hsa_memory_exception_data *ev_data; + struct kfd_event *ev; + uint32_t id; + bool send_signal = true; + + ev_data = (struct kfd_hsa_memory_exception_data *) event_data; + + id = KFD_FIRST_NONSIGNAL_EVENT_ID; + idr_for_each_entry_continue(&p->event_idr, ev, id) + if (ev->type == type) { + send_signal = false; + dev_dbg(kfd_device, + "Event found: id %X type %d", + ev->event_id, ev->type); + set_event(ev); + if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data) + ev->memory_exception_data = *ev_data; + } + + if (type == KFD_EVENT_TYPE_MEMORY) { + dev_warn(kfd_device, + "Sending SIGSEGV to HSA Process with PID %d ", + p->lead_thread->pid); + send_sig(SIGSEGV, p->lead_thread, 0); + } + + /* Send SIGTERM no event of type "type" has been found*/ + if (send_signal) { + if (send_sigterm) { + dev_warn(kfd_device, + "Sending SIGTERM to HSA Process with PID %d ", + p->lead_thread->pid); + send_sig(SIGTERM, p->lead_thread, 0); + } else { + dev_err(kfd_device, + "HSA Process (PID %d) got unhandled exception", + p->lead_thread->pid); + } + } +} + +#ifdef KFD_SUPPORT_IOMMU_V2 +void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, + unsigned long address, bool is_write_requested, + bool is_execute_requested) +{ + struct kfd_hsa_memory_exception_data memory_exception_data; + struct vm_area_struct *vma; + + /* + * Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are + * running so the lookup function increments the process ref count. + */ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct mm_struct *mm; + + if (!p) + return; /* Presumably process exited. */ + + /* Take a safe reference to the mm_struct, which may otherwise + * disappear even while the kfd_process is still referenced. 
+ */ + mm = get_task_mm(p->lead_thread); + if (!mm) { + kfd_unref_process(p); + return; /* Process is exiting */ + } + + memset(&memory_exception_data, 0, sizeof(memory_exception_data)); + + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); + + memory_exception_data.gpu_id = dev->id; + memory_exception_data.va = address; + /* Set failure reason */ + memory_exception_data.failure.NotPresent = 1; + memory_exception_data.failure.NoExecute = 0; + memory_exception_data.failure.ReadOnly = 0; + if (vma && address >= vma->vm_start) { + memory_exception_data.failure.NotPresent = 0; + + if (is_write_requested && !(vma->vm_flags & VM_WRITE)) + memory_exception_data.failure.ReadOnly = 1; + else + memory_exception_data.failure.ReadOnly = 0; + + if (is_execute_requested && !(vma->vm_flags & VM_EXEC)) + memory_exception_data.failure.NoExecute = 1; + else + memory_exception_data.failure.NoExecute = 0; + } + + up_read(&mm->mmap_sem); + mmput(mm); + + pr_debug("notpresent %d, noexecute %d, readonly %d\n", + memory_exception_data.failure.NotPresent, + memory_exception_data.failure.NoExecute, + memory_exception_data.failure.ReadOnly); + + /* Workaround on Raven to not kill the process when memory is freed + * before IOMMU is able to finish processing all the excessive PPRs + */ + if (dev->device_info->asic_family != CHIP_RAVEN) { + mutex_lock(&p->event_mutex); + + /* Lookup events by type and signal them */ + lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_MEMORY, + &memory_exception_data); + + mutex_unlock(&p->event_mutex); + } + + kfd_unref_process(p); +} +#endif /* KFD_SUPPORT_IOMMU_V2 */ + +void kfd_signal_hw_exception_event(unsigned int pasid) +{ + /* + * Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are + * running so the lookup function increments the process ref count. + */ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + + if (!p) + return; /* Presumably process exited. */ + + mutex_lock(&p->event_mutex); + + /* Lookup events by type and signal them */ + lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL); + + mutex_unlock(&p->event_mutex); + kfd_unref_process(p); +} + +void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, + struct kfd_vm_fault_info *info) +{ + struct kfd_event *ev; + uint32_t id; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_hsa_memory_exception_data memory_exception_data; + + if (!p) + return; /* Presumably process exited. */ + memset(&memory_exception_data, 0, sizeof(memory_exception_data)); + memory_exception_data.gpu_id = dev->id; + memory_exception_data.failure.imprecise = 1; + /* Set failure reason */ + if (info) { + memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; + memory_exception_data.failure.NotPresent = + info->prot_valid ? 1 : 0; + memory_exception_data.failure.NoExecute = + info->prot_exec ? 1 : 0; + memory_exception_data.failure.ReadOnly = + info->prot_write ? 
1 : 0; + memory_exception_data.failure.imprecise = 0; + } + mutex_lock(&p->event_mutex); + + id = KFD_FIRST_NONSIGNAL_EVENT_ID; + idr_for_each_entry_continue(&p->event_idr, ev, id) + if (ev->type == KFD_EVENT_TYPE_MEMORY) { + ev->memory_exception_data = memory_exception_data; + set_event(ev); + } + + mutex_unlock(&p->event_mutex); + kfd_unref_process(p); +} + +void kfd_signal_reset_event(struct kfd_dev *dev) +{ + struct kfd_hsa_hw_exception_data hw_exception_data; + struct kfd_process *p; + struct kfd_event *ev; + unsigned int temp; + uint32_t id, idx; + + /* Whole gpu reset caused by GPU hang and memory is lost */ + memset(&hw_exception_data, 0, sizeof(hw_exception_data)); + hw_exception_data.gpu_id = dev->id; + hw_exception_data.memory_lost = 1; + + idx = srcu_read_lock(&kfd_processes_srcu); + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + mutex_lock(&p->event_mutex); + id = KFD_FIRST_NONSIGNAL_EVENT_ID; + idr_for_each_entry_continue(&p->event_idr, ev, id) + if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) { + ev->hw_exception_data = hw_exception_data; + set_event(ev); + } + mutex_unlock(&p->event_mutex); + } + srcu_read_unlock(&kfd_processes_srcu, idx); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h new file mode 100644 index 000000000..c7ac6c73a --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h @@ -0,0 +1,85 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef KFD_EVENTS_H_INCLUDED +#define KFD_EVENTS_H_INCLUDED + +#include <linux/kernel.h> +#include <linux/hashtable.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/wait.h> +#include "kfd_priv.h" +#include <uapi/linux/kfd_ioctl.h> + +/* + * IDR supports non-negative integer IDs. Small IDs are used for + * signal events to match their signal slot. Use the upper half of the + * ID space for non-signal events. + */ +#define KFD_FIRST_NONSIGNAL_EVENT_ID ((INT_MAX >> 1) + 1) +#define KFD_LAST_NONSIGNAL_EVENT_ID INT_MAX + +/* + * Written into kfd_signal_slot_t to indicate that the event is not signaled. + * Since the event protocol may need to write the event ID into memory, this + * must not be a valid event ID. + * For the sake of easy memset-ing, this must be a byte pattern. 
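+ * ((uint64_t)-1 is 0xFFFFFFFFFFFFFFFF, i.e. every byte is 0xFF, so an
+ * entire page of signal slots can be marked unsignaled with a single
+ * memset of 0xFF.)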
+ */ +#define UNSIGNALED_EVENT_SLOT ((uint64_t)-1) + +struct kfd_event_waiter; +struct signal_page; + +struct kfd_event { + u32 event_id; + + bool signaled; + bool auto_reset; + + int type; + + wait_queue_head_t wq; /* List of event waiters. */ + + /* Only for signal events. */ + uint64_t __user *user_signal_address; + + /* type specific data */ + union { + struct kfd_hsa_memory_exception_data memory_exception_data; + struct kfd_hsa_hw_exception_data hw_exception_data; + }; +}; + +#define KFD_EVENT_TIMEOUT_IMMEDIATE 0 +#define KFD_EVENT_TIMEOUT_INFINITE 0xFFFFFFFFu + +/* Matching HSA_EVENTTYPE */ +#define KFD_EVENT_TYPE_SIGNAL 0 +#define KFD_EVENT_TYPE_HW_EXCEPTION 3 +#define KFD_EVENT_TYPE_DEBUG 5 +#define KFD_EVENT_TYPE_MEMORY 8 + +extern void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, + uint32_t valid_id_bits); + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c new file mode 100644 index 000000000..97d5423c5 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -0,0 +1,436 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/device.h> +#include <linux/export.h> +#include <linux/err.h> +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/compat.h> +#include <uapi/linux/kfd_ioctl.h> +#include <linux/time.h> +#include "kfd_priv.h" +#include <linux/mm.h> +#include <linux/mman.h> +#include <asm/processor.h> + +/* + * The primary memory I/O features being added for revisions of gfxip + * beyond 7.0 (Kaveri) are: + * + * Access to ATC/IOMMU mapped memory w/ associated extension of VA to 48b + * + * “Flat” shader memory access – These are new shader vector memory + * operations that do not reference a T#/V# so a “pointer” is what is + * sourced from the vector gprs for direct access to memory. + * This pointer space has the Shared(LDS) and Private(Scratch) memory + * mapped into this pointer space as apertures. + * The hardware then determines how to direct the memory request + * based on what apertures the request falls in. + * + * Unaligned support and alignment check + * + * + * System Unified Address - SUA + * + * The standard usage for GPU virtual addresses are that they are mapped by + * a set of page tables we call GPUVM and these page tables are managed by + * a combination of vidMM/driver software components. 
The current virtual + * address (VA) range for GPUVM is 40b. + * + * As of gfxip7.1 and beyond we’re adding the ability for compute memory + * clients (CP/RLC, DMA, SHADER(ifetch, scalar, and vector ops)) to access + * the same page tables used by host x86 processors and that are managed by + * the operating system. This is via a technique and hardware called ATC/IOMMU. + * The GPU has the capability of accessing both the GPUVM and ATC address + * spaces for a given VMID (process) simultaneously and we call this feature + * system unified address (SUA). + * + * There are three fundamental address modes of operation for a given VMID + * (process) on the GPU: + * + * HSA64 – 64b pointers and the default address space is ATC + * HSA32 – 32b pointers and the default address space is ATC + * GPUVM – 64b pointers and the default address space is GPUVM (driver + * model mode) + * + * + * HSA64 - ATC/IOMMU 64b + * + * A 64b pointer in the AMD64/IA64 CPU architecture is not fully utilized + * by the CPU so an AMD CPU can only access the high area + * (VA[63:47] == 0x1FFFF) and low area (VA[63:47 == 0) of the address space + * so the actual VA carried to translation is 48b. There is a “hole” in + * the middle of the 64b VA space. + * + * The GPU not only has access to all of the CPU accessible address space via + * ATC/IOMMU, but it also has access to the GPUVM address space. The “system + * unified address” feature (SUA) is the mapping of GPUVM and ATC address + * spaces into a unified pointer space. The method we take for 64b mode is + * to map the full 40b GPUVM address space into the hole of the 64b address + * space. + + * The GPUVM_Base/GPUVM_Limit defines the aperture in the 64b space where we + * direct requests to be translated via GPUVM page tables instead of the + * IOMMU path. + * + * + * 64b to 49b Address conversion + * + * Note that there are still significant portions of unused regions (holes) + * in the 64b address space even for the GPU. There are several places in + * the pipeline (sw and hw), we wish to compress the 64b virtual address + * to a 49b address. This 49b address is constituted of an “ATC” bit + * plus a 48b virtual address. This 49b address is what is passed to the + * translation hardware. ATC==0 means the 48b address is a GPUVM address + * (max of 2^40 – 1) intended to be translated via GPUVM page tables. + * ATC==1 means the 48b address is intended to be translated via IOMMU + * page tables. + * + * A 64b pointer is compared to the apertures that are defined (Base/Limit), in + * this case the GPUVM aperture (red) is defined and if a pointer falls in this + * aperture, we subtract the GPUVM_Base address and set the ATC bit to zero + * as part of the 64b to 49b conversion. + * + * Where this 64b to 49b conversion is done is a function of the usage. + * Most GPU memory access is via memory objects where the driver builds + * a descriptor which consists of a base address and a memory access by + * the GPU usually consists of some kind of an offset or Cartesian coordinate + * that references this memory descriptor. This is the case for shader + * instructions that reference the T# or V# constants, or for specified + * locations of assets (ex. the shader program location). In these cases + * the driver is what handles the 64b to 49b conversion and the base + * address in the descriptor (ex. V# or T# or shader program location) + * is defined as a 48b address w/ an ATC bit. 
For this usage a given + * memory object cannot straddle multiple apertures in the 64b address + * space. For example a shader program cannot jump in/out between ATC + * and GPUVM space. + * + * In some cases we wish to pass a 64b pointer to the GPU hardware and + * the GPU hw does the 64b to 49b conversion before passing memory + * requests to the cache/memory system. This is the case for the + * S_LOAD and FLAT_* shader memory instructions where we have 64b pointers + * in scalar and vector GPRs respectively. + * + * In all cases (no matter where the 64b -> 49b conversion is done), the gfxip + * hardware sends a 48b address along w/ an ATC bit, to the memory controller + * on the memory request interfaces. + * + * <client>_MC_rdreq_atc // read request ATC bit + * + * 0 : <client>_MC_rdreq_addr is a GPUVM VA + * + * 1 : <client>_MC_rdreq_addr is a ATC VA + * + * + * “Spare” aperture (APE1) + * + * We use the GPUVM aperture to differentiate ATC vs. GPUVM, but we also use + * apertures to set the Mtype field for S_LOAD/FLAT_* ops which is input to the + * config tables for setting cache policies. The “spare” (APE1) aperture is + * motivated by getting a different Mtype from the default. + * The default aperture isn’t an actual base/limit aperture; it is just the + * address space that doesn’t hit any defined base/limit apertures. + * The following diagram is a complete picture of the gfxip7.x SUA apertures. + * The APE1 can be placed either below or above + * the hole (cannot be in the hole). + * + * + * General Aperture definitions and rules + * + * An aperture register definition consists of a Base, Limit, Mtype, and + * usually an ATC bit indicating which translation tables that aperture uses. + * In all cases (for SUA and DUA apertures discussed later), aperture base + * and limit definitions are 64KB aligned. + * + * <ape>_Base[63:0] = { <ape>_Base_register[63:16], 0x0000 } + * + * <ape>_Limit[63:0] = { <ape>_Limit_register[63:16], 0xFFFF } + * + * The base and limit are considered inclusive to an aperture so being + * inside an aperture means (address >= Base) AND (address <= Limit). + * + * In no case is a payload that straddles multiple apertures expected to work. + * For example a load_dword_x4 that starts in one aperture and ends in another, + * does not work. For the vector FLAT_* ops we have detection capability in + * the shader for reporting a “memory violation” back to the + * SQ block for use in traps. + * A memory violation results when an op falls into the hole, + * or a payload straddles multiple apertures. The S_LOAD instruction + * does not have this detection. + * + * Apertures cannot overlap. + * + * + * + * HSA32 - ATC/IOMMU 32b + * + * For HSA32 mode, the pointers are interpreted as 32 bits and use a single GPR + * instead of two for the S_LOAD and FLAT_* ops. The entire GPUVM space of 40b + * will not fit so there is only partial visibility to the GPUVM + * space (defined by the aperture) for S_LOAD and FLAT_* ops. + * There is no spare (APE1) aperture for HSA32 mode. + * + * + * GPUVM 64b mode (driver model) + * + * This mode is related to HSA64 in that the difference really is that + * the default aperture is GPUVM (ATC==0) and not ATC space. + * We have gfxip7.x hardware that has FLAT_* and S_LOAD support for + * SUA GPUVM mode, but does not support HSA32/HSA64. 
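+ *
+ * As a rough worked example of the 64b to 49b conversion described above
+ * (HSA64 mode; addresses chosen purely for illustration, using the value
+ * MAKE_GPUVM_APP_BASE_VI() below produces for gpu_num 1): with
+ * GPUVM_Base = 0x2001000000000000, the 64b pointer 0x2001000012345678
+ * falls inside the GPUVM aperture and is sent to translation as
+ * {ATC=0, VA=0x12345678} after the base is subtracted. A pointer that
+ * hits no defined aperture keeps its low 48 bits and is sent with ATC=1,
+ * since the HSA64 default aperture is ATC.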
+ * + * + * Device Unified Address - DUA + * + * Device unified address (DUA) is the name of the feature that maps the + * Shared(LDS) memory and Private(Scratch) memory into the overall address + * space for use by the new FLAT_* vector memory ops. The Shared and + * Private memories are mapped as apertures into the address space, + * and the hardware detects when a FLAT_* memory request is to be redirected + * to the LDS or Scratch memory when it falls into one of these apertures. + * Like the SUA apertures, the Shared/Private apertures are 64KB aligned and + * the base/limit is “in” the aperture. For both HSA64 and GPUVM SUA modes, + * the Shared/Private apertures are always placed in a limited selection of + * options in the hole of the 64b address space. For HSA32 mode, the + * Shared/Private apertures can be placed anywhere in the 32b space + * except at 0. + * + * + * HSA64 Apertures for FLAT_* vector ops + * + * For HSA64 SUA mode, the Shared and Private apertures are always placed + * in the hole w/ a limited selection of possible locations. The requests + * that fall in the private aperture are expanded as a function of the + * work-item id (tid) and redirected to the location of the + * “hidden private memory”. The hidden private can be placed in either GPUVM + * or ATC space. The addresses that fall in the shared aperture are + * re-directed to the on-chip LDS memory hardware. + * + * + * HSA32 Apertures for FLAT_* vector ops + * + * In HSA32 mode, the Private and Shared apertures can be placed anywhere + * in the 32b space except at 0 (Private or Shared Base at zero disables + * the apertures). If the base address of the apertures are non-zero + * (ie apertures exists), the size is always 64KB. + * + * + * GPUVM Apertures for FLAT_* vector ops + * + * In GPUVM mode, the Shared/Private apertures are specified identically + * to HSA64 mode where they are always in the hole at a limited selection + * of locations. + * + * + * Aperture Definitions for SUA and DUA + * + * The interpretation of the aperture register definitions for a given + * VMID is a function of the “SUA Mode” which is one of HSA64, HSA32, or + * GPUVM64 discussed in previous sections. The mode is first decoded, and + * then the remaining register decode is a function of the mode. + * + * + * SUA Mode Decode + * + * For the S_LOAD and FLAT_* shader operations, the SUA mode is decoded from + * the COMPUTE_DISPATCH_INITIATOR:DATA_ATC bit and + * the SH_MEM_CONFIG:PTR32 bits. + * + * COMPUTE_DISPATCH_INITIATOR:DATA_ATC SH_MEM_CONFIG:PTR32 Mode + * + * 1 0 HSA64 + * + * 1 1 HSA32 + * + * 0 X GPUVM64 + * + * In general the hardware will ignore the PTR32 bit and treat + * as “0” whenever DATA_ATC = “0”, but sw should set PTR32=0 + * when DATA_ATC=0. + * + * The DATA_ATC bit is only set for compute dispatches. + * All “Draw” dispatches are hardcoded to GPUVM64 mode + * for FLAT_* / S_LOAD operations. 
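+ *
+ * Putting the aperture encoding rules above into concrete numbers
+ * (register values invented purely for illustration): a Base register of
+ * 0x2001000000000000 and a Limit register of 0x200100FFFFFF0000 decode to
+ * Base = 0x2001000000000000 and Limit = 0x200100FFFFFFFFFF, i.e. an
+ * inclusive, 64KB-aligned aperture of 2^40 bytes.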
+ */ + +#define MAKE_GPUVM_APP_BASE_VI(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) + +#define MAKE_GPUVM_APP_LIMIT(base, size) \ + (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) + +#define MAKE_SCRATCH_APP_BASE_VI() \ + (((uint64_t)(0x1UL) << 61) + 0x100000000L) + +#define MAKE_SCRATCH_APP_LIMIT(base) \ + (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + +#define MAKE_LDS_APP_BASE_VI() \ + (((uint64_t)(0x1UL) << 61) + 0x0) +#define MAKE_LDS_APP_LIMIT(base) \ + (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + +/* On GFXv9 the LDS and scratch apertures are programmed independently + * using the high 16 bits of the 64-bit virtual address. They must be + * in the hole, which will be the case as long as the high 16 bits are + * not 0. + * + * The aperture sizes are still 4GB implicitly. + * + * A GPUVM aperture is not applicable on GFXv9. + */ +#define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48) +#define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48) + +/* User mode manages most of the SVM aperture address space. The low + * 16MB are reserved for kernel use (CWSR trap handler and kernel IB + * for now). + */ +#define SVM_USER_BASE 0x1000000ull +#define SVM_CWSR_BASE (SVM_USER_BASE - KFD_CWSR_TBA_TMA_SIZE) +#define SVM_IB_BASE (SVM_CWSR_BASE - PAGE_SIZE) + +static void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id) +{ + /* + * node id couldn't be 0 - the three MSB bits of + * aperture shoudn't be 0 + */ + pdd->lds_base = MAKE_LDS_APP_BASE_VI(); + pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); + + if (!pdd->dev->device_info->needs_iommu_device) { + /* dGPUs: SVM aperture starting at 0 + * with small reserved space for kernel. + * Set them to CANONICAL addresses. + */ + pdd->gpuvm_base = SVM_USER_BASE; + pdd->gpuvm_limit = + pdd->dev->shared_resources.gpuvm_size - 1; + } else { + /* set them to non CANONICAL addresses, and no SVM is + * allocated. + */ + pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1); + pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base, + pdd->dev->shared_resources.gpuvm_size); + } + + pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI(); + pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); +} + +static void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id) +{ + pdd->lds_base = MAKE_LDS_APP_BASE_V9(); + pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); + + /* Raven needs SVM to support graphic handle, etc. Leave the small + * reserved space before SVM on Raven as well, even though we don't + * have to. + * Set gpuvm_base and gpuvm_limit to CANONICAL addresses so that they + * are used in Thunk to reserve SVM. 
+ */ + pdd->gpuvm_base = SVM_USER_BASE; + pdd->gpuvm_limit = + pdd->dev->shared_resources.gpuvm_size - 1; + + pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9(); + pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); +} + +int kfd_init_apertures(struct kfd_process *process) +{ + uint8_t id = 0; + struct kfd_dev *dev; + struct kfd_process_device *pdd; + + /*Iterating over all devices*/ + while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { + if (!dev) { + id++; /* Skip non GPU devices */ + continue; + } + + pdd = kfd_create_process_device_data(dev, process); + if (!pdd) { + pr_err("Failed to create process device data\n"); + return -ENOMEM; + } + /* + * For 64 bit process apertures will be statically reserved in + * the x86_64 non canonical process address space + * amdkfd doesn't currently support apertures for 32 bit process + */ + if (process->is_32bit_user_mode) { + pdd->lds_base = pdd->lds_limit = 0; + pdd->gpuvm_base = pdd->gpuvm_limit = 0; + pdd->scratch_base = pdd->scratch_limit = 0; + } else { + switch (dev->device_info->asic_family) { + case CHIP_KAVERI: + case CHIP_HAWAII: + case CHIP_CARRIZO: + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + kfd_init_apertures_vi(pdd, id); + break; + case CHIP_VEGA10: + case CHIP_RAVEN: + kfd_init_apertures_v9(pdd, id); + break; + default: + WARN(1, "Unexpected ASIC family %u", + dev->device_info->asic_family); + return -EINVAL; + } + + if (!dev->device_info->needs_iommu_device) { + /* dGPUs: the reserved space for kernel + * before SVM + */ + pdd->qpd.cwsr_base = SVM_CWSR_BASE; + pdd->qpd.ib_base = SVM_IB_BASE; + } + } + + dev_dbg(kfd_device, "node id %u\n", id); + dev_dbg(kfd_device, "gpu id %u\n", pdd->dev->id); + dev_dbg(kfd_device, "lds_base %llX\n", pdd->lds_base); + dev_dbg(kfd_device, "lds_limit %llX\n", pdd->lds_limit); + dev_dbg(kfd_device, "gpuvm_base %llX\n", pdd->gpuvm_base); + dev_dbg(kfd_device, "gpuvm_limit %llX\n", pdd->gpuvm_limit); + dev_dbg(kfd_device, "scratch_base %llX\n", pdd->scratch_base); + dev_dbg(kfd_device, "scratch_limit %llX\n", pdd->scratch_limit); + + id++; + } + + return 0; +} + + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c new file mode 100644 index 000000000..f836897bb --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -0,0 +1,108 @@ +/* + * Copyright 2016-2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "kfd_priv.h" +#include "kfd_events.h" +#include "soc15_int.h" + + +static bool event_interrupt_isr_v9(struct kfd_dev *dev, + const uint32_t *ih_ring_entry, + uint32_t *patched_ihre, + bool *patched_flag) +{ + uint16_t source_id, client_id, pasid, vmid; + const uint32_t *data = ih_ring_entry; + + /* Only handle interrupts from KFD VMIDs */ + vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); + if (vmid < dev->vm_info.first_vmid_kfd || + vmid > dev->vm_info.last_vmid_kfd) + return 0; + + /* If there is no valid PASID, it's likely a firmware bug */ + pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); + if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt")) + return 0; + + source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); + client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); + + pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n", + client_id, source_id, pasid); + pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", + data[0], data[1], data[2], data[3], + data[4], data[5], data[6], data[7]); + + /* Interrupt types we care about: various signals and faults. + * They will be forwarded to a work queue (see below). + */ + return source_id == SOC15_INTSRC_CP_END_OF_PIPE || + source_id == SOC15_INTSRC_SDMA_TRAP || + source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || + source_id == SOC15_INTSRC_CP_BAD_OPCODE || + client_id == SOC15_IH_CLIENTID_VMC || + client_id == SOC15_IH_CLIENTID_UTCL2; +} + +static void event_interrupt_wq_v9(struct kfd_dev *dev, + const uint32_t *ih_ring_entry) +{ + uint16_t source_id, client_id, pasid, vmid; + uint32_t context_id; + + source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); + client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); + pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); + vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); + context_id = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry); + + if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) + kfd_signal_event_interrupt(pasid, context_id, 32); + else if (source_id == SOC15_INTSRC_SDMA_TRAP) + kfd_signal_event_interrupt(pasid, context_id & 0xfffffff, 28); + else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) + kfd_signal_event_interrupt(pasid, context_id & 0xffffff, 24); + else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) + kfd_signal_hw_exception_event(pasid); + else if (client_id == SOC15_IH_CLIENTID_VMC || + client_id == SOC15_IH_CLIENTID_UTCL2) { + struct kfd_vm_fault_info info = {0}; + uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); + + info.vmid = vmid; + info.mc_id = client_id; + info.page_addr = ih_ring_entry[4] | + (uint64_t)(ih_ring_entry[5] & 0xf) << 32; + info.prot_valid = ring_id & 0x08; + info.prot_read = ring_id & 0x10; + info.prot_write = ring_id & 0x20; + + kfd_process_vm_fault(dev->dqm, pasid); + kfd_signal_vm_fault_event(dev, pasid, &info); + } +} + +const struct kfd_event_interrupt_class event_interrupt_class_v9 = { + .interrupt_isr = event_interrupt_isr_v9, + .interrupt_wq = event_interrupt_wq_v9, +}; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c new file mode 100644 index 000000000..bc47f6a44 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -0,0 +1,170 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * KFD Interrupts. + * + * AMD GPUs deliver interrupts by pushing an interrupt description onto the + * interrupt ring and then sending an interrupt. KGD receives the interrupt + * in ISR and sends us a pointer to each new entry on the interrupt ring. + * + * We generally can't process interrupt-signaled events from ISR, so we call + * out to each interrupt client module (currently only the scheduler) to ask if + * each interrupt is interesting. If they return true, then it requires further + * processing so we copy it to an internal interrupt ring and call each + * interrupt client again from a work-queue. + * + * There's no acknowledgment for the interrupts we use. The hardware simply + * queues a new interrupt each time without waiting. + * + * The fixed-size internal queue means that it's possible for us to lose + * interrupts because we have no back-pressure to the hardware. + */ + +#include <linux/slab.h> +#include <linux/device.h> +#include <linux/kfifo.h> +#include "kfd_priv.h" + +#define KFD_IH_NUM_ENTRIES 8192 + +static void interrupt_wq(struct work_struct *); + +int kfd_interrupt_init(struct kfd_dev *kfd) +{ + int r; + + r = kfifo_alloc(&kfd->ih_fifo, + KFD_IH_NUM_ENTRIES * kfd->device_info->ih_ring_entry_size, + GFP_KERNEL); + if (r) { + dev_err(kfd_chardev(), "Failed to allocate IH fifo\n"); + return r; + } + + kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); + if (unlikely(!kfd->ih_wq)) { + kfifo_free(&kfd->ih_fifo); + dev_err(kfd_chardev(), "Failed to allocate KFD IH workqueue\n"); + return -ENOMEM; + } + spin_lock_init(&kfd->interrupt_lock); + + INIT_WORK(&kfd->interrupt_work, interrupt_wq); + + kfd->interrupts_active = true; + + /* + * After this function returns, the interrupt will be enabled. This + * barrier ensures that the interrupt running on a different processor + * sees all the above writes. + */ + smp_wmb(); + + return 0; +} + +void kfd_interrupt_exit(struct kfd_dev *kfd) +{ + /* + * Stop the interrupt handler from writing to the ring and scheduling + * workqueue items. The spinlock ensures that any interrupt running + * after we have unlocked sees interrupts_active = false. + */ + unsigned long flags; + + spin_lock_irqsave(&kfd->interrupt_lock, flags); + kfd->interrupts_active = false; + spin_unlock_irqrestore(&kfd->interrupt_lock, flags); + + /* + * flush_work ensures that there are no outstanding + * work-queue items that will access interrupt_ring. 
New work items + * can't be created because we stopped interrupt handling above. + */ + flush_workqueue(kfd->ih_wq); + + kfifo_free(&kfd->ih_fifo); +} + +/* + * Assumption: single reader/writer. This function is not re-entrant + */ +bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry) +{ + int count; + + count = kfifo_in(&kfd->ih_fifo, ih_ring_entry, + kfd->device_info->ih_ring_entry_size); + if (count != kfd->device_info->ih_ring_entry_size) { + dev_err_ratelimited(kfd_chardev(), + "Interrupt ring overflow, dropping interrupt %d\n", + count); + return false; + } + + return true; +} + +/* + * Assumption: single reader/writer. This function is not re-entrant + */ +static bool dequeue_ih_ring_entry(struct kfd_dev *kfd, void *ih_ring_entry) +{ + int count; + + count = kfifo_out(&kfd->ih_fifo, ih_ring_entry, + kfd->device_info->ih_ring_entry_size); + + WARN_ON(count && count != kfd->device_info->ih_ring_entry_size); + + return count == kfd->device_info->ih_ring_entry_size; +} + +static void interrupt_wq(struct work_struct *work) +{ + struct kfd_dev *dev = container_of(work, struct kfd_dev, + interrupt_work); + uint32_t ih_ring_entry[KFD_MAX_RING_ENTRY_SIZE]; + + if (dev->device_info->ih_ring_entry_size > sizeof(ih_ring_entry)) { + dev_err_once(kfd_chardev(), "Ring entry too small\n"); + return; + } + + while (dequeue_ih_ring_entry(dev, ih_ring_entry)) + dev->device_info->event_interrupt_class->interrupt_wq(dev, + ih_ring_entry); +} + +bool interrupt_is_wanted(struct kfd_dev *dev, + const uint32_t *ih_ring_entry, + uint32_t *patched_ihre, bool *flag) +{ + /* integer and bitwise OR so there is no boolean short-circuiting */ + unsigned int wanted = 0; + + wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, + ih_ring_entry, patched_ihre, flag); + + return wanted != 0; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c new file mode 100644 index 000000000..f3a526ed8 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c @@ -0,0 +1,374 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <linux/kconfig.h> + +#if IS_REACHABLE(CONFIG_AMD_IOMMU_V2) + +#include <linux/printk.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/amd-iommu.h> +#include "kfd_priv.h" +#include "kfd_dbgmgr.h" +#include "kfd_topology.h" +#include "kfd_iommu.h" + +static const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | + AMD_IOMMU_DEVICE_FLAG_PRI_SUP | + AMD_IOMMU_DEVICE_FLAG_PASID_SUP; + +/** kfd_iommu_check_device - Check whether IOMMU is available for device + */ +int kfd_iommu_check_device(struct kfd_dev *kfd) +{ + struct amd_iommu_device_info iommu_info; + int err; + + if (!kfd->device_info->needs_iommu_device) + return -ENODEV; + + iommu_info.flags = 0; + err = amd_iommu_device_info(kfd->pdev, &iommu_info); + if (err) + return err; + + if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) + return -ENODEV; + + return 0; +} + +/** kfd_iommu_device_init - Initialize IOMMU for device + */ +int kfd_iommu_device_init(struct kfd_dev *kfd) +{ + struct amd_iommu_device_info iommu_info; + unsigned int pasid_limit; + int err; + struct kfd_topology_device *top_dev; + + top_dev = kfd_topology_device_by_id(kfd->id); + + /* + * Overwrite ATS capability according to needs_iommu_device to fix + * potential missing corresponding bit in CRAT of BIOS. + */ + if (!kfd->device_info->needs_iommu_device) { + top_dev->node_props.capability &= ~HSA_CAP_ATS_PRESENT; + return 0; + } + + top_dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + + iommu_info.flags = 0; + err = amd_iommu_device_info(kfd->pdev, &iommu_info); + if (err < 0) { + dev_err(kfd_device, + "error getting iommu info. is the iommu enabled?\n"); + return -ENODEV; + } + + if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) { + dev_err(kfd_device, + "error required iommu flags ats %i, pri %i, pasid %i\n", + (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP) != 0, + (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) != 0, + (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) + != 0); + return -ENODEV; + } + + pasid_limit = min_t(unsigned int, + (unsigned int)(1 << kfd->device_info->max_pasid_bits), + iommu_info.max_pasids); + + if (!kfd_set_pasid_limit(pasid_limit)) { + dev_err(kfd_device, "error setting pasid limit\n"); + return -EBUSY; + } + + return 0; +} + +/** kfd_iommu_bind_process_to_device - Have the IOMMU bind a process + * + * Binds the given process to the given device using its PASID. This + * enables IOMMUv2 address translation for the process on the device. + * + * This function assumes that the process mutex is held. + */ +int kfd_iommu_bind_process_to_device(struct kfd_process_device *pdd) +{ + struct kfd_dev *dev = pdd->dev; + struct kfd_process *p = pdd->process; + int err; + + if (!dev->device_info->needs_iommu_device || pdd->bound == PDD_BOUND) + return 0; + + if (unlikely(pdd->bound == PDD_BOUND_SUSPENDED)) { + pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n"); + return -EINVAL; + } + + err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); + if (!err) + pdd->bound = PDD_BOUND; + + return err; +} + +/** kfd_iommu_unbind_process - Unbind process from all devices + * + * This removes all IOMMU device bindings of the process. To be used + * before process termination. 
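+ * A process may be bound to more than one IOMMUv2-capable device; each
+ * per-device binding is released in turn.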
+ */ +void kfd_iommu_unbind_process(struct kfd_process *p) +{ + struct kfd_process_device *pdd; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) + if (pdd->bound == PDD_BOUND) + amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); +} + +/* Callback for process shutdown invoked by the IOMMU driver */ +static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) +{ + struct kfd_dev *dev = kfd_device_by_pci_dev(pdev); + struct kfd_process *p; + struct kfd_process_device *pdd; + + if (!dev) + return; + + /* + * Look for the process that matches the pasid. If there is no such + * process, we either released it in amdkfd's own notifier, or there + * is a bug. Unfortunately, there is no way to tell... + */ + p = kfd_lookup_process_by_pasid(pasid); + if (!p) + return; + + pr_debug("Unbinding process %d from IOMMU\n", pasid); + + mutex_lock(kfd_get_dbgmgr_mutex()); + + if (dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) { + if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) { + kfd_dbgmgr_destroy(dev->dbgmgr); + dev->dbgmgr = NULL; + } + } + + mutex_unlock(kfd_get_dbgmgr_mutex()); + + mutex_lock(&p->mutex); + + pdd = kfd_get_process_device_data(dev, p); + if (pdd) + /* For GPU relying on IOMMU, we need to dequeue here + * when PASID is still bound. + */ + kfd_process_dequeue_from_device(pdd); + + mutex_unlock(&p->mutex); + + kfd_unref_process(p); +} + +/* This function called by IOMMU driver on PPR failure */ +static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, + unsigned long address, u16 flags) +{ + struct kfd_dev *dev; + + dev_warn_ratelimited(kfd_device, + "Invalid PPR device %x:%x.%x pasid %d address 0x%lX flags 0x%X", + PCI_BUS_NUM(pdev->devfn), + PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn), + pasid, + address, + flags); + + dev = kfd_device_by_pci_dev(pdev); + if (!WARN_ON(!dev)) + kfd_signal_iommu_event(dev, pasid, address, + flags & PPR_FAULT_WRITE, flags & PPR_FAULT_EXEC); + + return AMD_IOMMU_INV_PRI_RSP_INVALID; +} + +/* + * Bind processes do the device that have been temporarily unbound + * (PDD_BOUND_SUSPENDED) in kfd_unbind_processes_from_device. + */ +static int kfd_bind_processes_to_device(struct kfd_dev *kfd) +{ + struct kfd_process_device *pdd; + struct kfd_process *p; + unsigned int temp; + int err = 0; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + mutex_lock(&p->mutex); + pdd = kfd_get_process_device_data(kfd, p); + + if (WARN_ON(!pdd) || pdd->bound != PDD_BOUND_SUSPENDED) { + mutex_unlock(&p->mutex); + continue; + } + + err = amd_iommu_bind_pasid(kfd->pdev, p->pasid, + p->lead_thread); + if (err < 0) { + pr_err("Unexpected pasid %d binding failure\n", + p->pasid); + mutex_unlock(&p->mutex); + break; + } + + pdd->bound = PDD_BOUND; + mutex_unlock(&p->mutex); + } + + srcu_read_unlock(&kfd_processes_srcu, idx); + + return err; +} + +/* + * Mark currently bound processes as PDD_BOUND_SUSPENDED. These + * processes will be restored to PDD_BOUND state in + * kfd_bind_processes_to_device. 
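+ * Together with kfd_iommu_suspend()/kfd_iommu_resume() below this gives
+ * the sequence: suspend marks bound processes PDD_BOUND_SUSPENDED, and
+ * resume re-binds their PASIDs and returns them to PDD_BOUND.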
+ */ +static void kfd_unbind_processes_from_device(struct kfd_dev *kfd) +{ + struct kfd_process_device *pdd; + struct kfd_process *p; + unsigned int temp; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + mutex_lock(&p->mutex); + pdd = kfd_get_process_device_data(kfd, p); + + if (WARN_ON(!pdd)) { + mutex_unlock(&p->mutex); + continue; + } + + if (pdd->bound == PDD_BOUND) + pdd->bound = PDD_BOUND_SUSPENDED; + mutex_unlock(&p->mutex); + } + + srcu_read_unlock(&kfd_processes_srcu, idx); +} + +/** kfd_iommu_suspend - Prepare IOMMU for suspend + * + * This unbinds processes from the device and disables the IOMMU for + * the device. + */ +void kfd_iommu_suspend(struct kfd_dev *kfd) +{ + if (!kfd->device_info->needs_iommu_device) + return; + + kfd_unbind_processes_from_device(kfd); + + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); + amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); + amd_iommu_free_device(kfd->pdev); +} + +/** kfd_iommu_resume - Restore IOMMU after resume + * + * This reinitializes the IOMMU for the device and re-binds previously + * suspended processes to the device. + */ +int kfd_iommu_resume(struct kfd_dev *kfd) +{ + unsigned int pasid_limit; + int err; + + if (!kfd->device_info->needs_iommu_device) + return 0; + + pasid_limit = kfd_get_pasid_limit(); + + err = amd_iommu_init_device(kfd->pdev, pasid_limit); + if (err) + return -ENXIO; + + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, + iommu_pasid_shutdown_callback); + amd_iommu_set_invalid_ppr_cb(kfd->pdev, + iommu_invalid_ppr_cb); + + err = kfd_bind_processes_to_device(kfd); + if (err) { + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); + amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); + amd_iommu_free_device(kfd->pdev); + return err; + } + + return 0; +} + +extern bool amd_iommu_pc_supported(void); +extern u8 amd_iommu_pc_get_max_banks(u16 devid); +extern u8 amd_iommu_pc_get_max_counters(u16 devid); + +/** kfd_iommu_add_perf_counters - Add IOMMU performance counters to topology + */ +int kfd_iommu_add_perf_counters(struct kfd_topology_device *kdev) +{ + struct kfd_perf_properties *props; + + if (!(kdev->node_props.capability & HSA_CAP_ATS_PRESENT)) + return 0; + + if (!amd_iommu_pc_supported()) + return 0; + + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + strcpy(props->block_name, "iommu"); + props->max_concurrent = amd_iommu_pc_get_max_banks(0) * + amd_iommu_pc_get_max_counters(0); /* assume one iommu */ + list_add_tail(&props->list, &kdev->perf_props); + + return 0; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.h b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.h new file mode 100644 index 000000000..afd420b01 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.h @@ -0,0 +1,83 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef __KFD_IOMMU_H__ +#define __KFD_IOMMU_H__ + +#include <linux/kconfig.h> + +#if IS_REACHABLE(CONFIG_AMD_IOMMU_V2) + +#define KFD_SUPPORT_IOMMU_V2 + +int kfd_iommu_check_device(struct kfd_dev *kfd); +int kfd_iommu_device_init(struct kfd_dev *kfd); + +int kfd_iommu_bind_process_to_device(struct kfd_process_device *pdd); +void kfd_iommu_unbind_process(struct kfd_process *p); + +void kfd_iommu_suspend(struct kfd_dev *kfd); +int kfd_iommu_resume(struct kfd_dev *kfd); + +int kfd_iommu_add_perf_counters(struct kfd_topology_device *kdev); + +#else + +static inline int kfd_iommu_check_device(struct kfd_dev *kfd) +{ + return -ENODEV; +} +static inline int kfd_iommu_device_init(struct kfd_dev *kfd) +{ +#if IS_MODULE(CONFIG_AMD_IOMMU_V2) + WARN_ONCE(1, "iommu_v2 module is not usable by built-in KFD"); +#endif + return 0; +} + +static inline int kfd_iommu_bind_process_to_device( + struct kfd_process_device *pdd) +{ + return 0; +} +static inline void kfd_iommu_unbind_process(struct kfd_process *p) +{ + /* empty */ +} + +static inline void kfd_iommu_suspend(struct kfd_dev *kfd) +{ + /* empty */ +} +static inline int kfd_iommu_resume(struct kfd_dev *kfd) +{ + return 0; +} + +static inline int kfd_iommu_add_perf_counters(struct kfd_topology_device *kdev) +{ + return 0; +} + +#endif /* IS_REACHABLE(CONFIG_AMD_IOMMU_V2) */ + +#endif /* __KFD_IOMMU_H__ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c new file mode 100644 index 000000000..9f84b4d9f --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -0,0 +1,379 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include "kfd_kernel_queue.h" +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" +#include "kfd_pm4_headers.h" +#include "kfd_pm4_opcodes.h" + +#define PM4_COUNT_ZERO (((1 << 15) - 1) << 16) + +static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size) +{ + struct queue_properties prop; + int retval; + union PM4_MES_TYPE_3_HEADER nop; + + if (WARN_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ)) + return false; + + pr_debug("Initializing queue type %d size %d\n", KFD_QUEUE_TYPE_HIQ, + queue_size); + + memset(&prop, 0, sizeof(prop)); + memset(&nop, 0, sizeof(nop)); + + nop.opcode = IT_NOP; + nop.type = PM4_TYPE_3; + nop.u32all |= PM4_COUNT_ZERO; + + kq->dev = dev; + kq->nop_packet = nop.u32all; + switch (type) { + case KFD_QUEUE_TYPE_DIQ: + case KFD_QUEUE_TYPE_HIQ: + kq->mqd_mgr = dev->dqm->ops.get_mqd_manager(dev->dqm, + KFD_MQD_TYPE_HIQ); + break; + default: + pr_err("Invalid queue type %d\n", type); + return false; + } + + if (!kq->mqd_mgr) + return false; + + prop.doorbell_ptr = kfd_get_kernel_doorbell(dev, &prop.doorbell_off); + + if (!prop.doorbell_ptr) { + pr_err("Failed to initialize doorbell"); + goto err_get_kernel_doorbell; + } + + retval = kfd_gtt_sa_allocate(dev, queue_size, &kq->pq); + if (retval != 0) { + pr_err("Failed to init pq queues size %d\n", queue_size); + goto err_pq_allocate_vidmem; + } + + kq->pq_kernel_addr = kq->pq->cpu_ptr; + kq->pq_gpu_addr = kq->pq->gpu_addr; + + retval = kq->ops_asic_specific.initialize(kq, dev, type, queue_size); + if (!retval) + goto err_eop_allocate_vidmem; + + retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->rptr_kernel), + &kq->rptr_mem); + + if (retval != 0) + goto err_rptr_allocate_vidmem; + + kq->rptr_kernel = kq->rptr_mem->cpu_ptr; + kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; + + retval = kfd_gtt_sa_allocate(dev, dev->device_info->doorbell_size, + &kq->wptr_mem); + + if (retval != 0) + goto err_wptr_allocate_vidmem; + + kq->wptr_kernel = kq->wptr_mem->cpu_ptr; + kq->wptr_gpu_addr = kq->wptr_mem->gpu_addr; + + memset(kq->pq_kernel_addr, 0, queue_size); + memset(kq->rptr_kernel, 0, sizeof(*kq->rptr_kernel)); + memset(kq->wptr_kernel, 0, sizeof(*kq->wptr_kernel)); + + prop.queue_size = queue_size; + prop.is_interop = false; + prop.priority = 1; + prop.queue_percent = 100; + prop.type = type; + prop.vmid = 0; + prop.queue_address = kq->pq_gpu_addr; + prop.read_ptr = (uint32_t *) kq->rptr_gpu_addr; + prop.write_ptr = (uint32_t *) kq->wptr_gpu_addr; + prop.eop_ring_buffer_address = kq->eop_gpu_addr; + prop.eop_ring_buffer_size = PAGE_SIZE; + prop.cu_mask = NULL; + + if (init_queue(&kq->queue, &prop) != 0) + goto err_init_queue; + + kq->queue->device = dev; + kq->queue->process = kfd_get_process(current); + + retval = kq->mqd_mgr->init_mqd(kq->mqd_mgr, &kq->queue->mqd, + &kq->queue->mqd_mem_obj, + &kq->queue->gart_mqd_addr, + &kq->queue->properties); + if (retval != 0) + goto err_init_mqd; + + /* assign HIQ to HQD */ + if (type == KFD_QUEUE_TYPE_HIQ) { + pr_debug("Assigning hiq to hqd\n"); + kq->queue->pipe = KFD_CIK_HIQ_PIPE; + kq->queue->queue = KFD_CIK_HIQ_QUEUE; + kq->mqd_mgr->load_mqd(kq->mqd_mgr, kq->queue->mqd, + kq->queue->pipe, kq->queue->queue, + &kq->queue->properties, NULL); + } else { + /* allocate fence for DIQ */ + + retval = kfd_gtt_sa_allocate(dev, sizeof(uint32_t), + &kq->fence_mem_obj); + + if 
(retval != 0) + goto err_alloc_fence; + + kq->fence_kernel_address = kq->fence_mem_obj->cpu_ptr; + kq->fence_gpu_addr = kq->fence_mem_obj->gpu_addr; + } + + print_queue(kq->queue); + + return true; +err_alloc_fence: +err_init_mqd: + uninit_queue(kq->queue); +err_init_queue: + kfd_gtt_sa_free(dev, kq->wptr_mem); +err_wptr_allocate_vidmem: + kfd_gtt_sa_free(dev, kq->rptr_mem); +err_rptr_allocate_vidmem: + kfd_gtt_sa_free(dev, kq->eop_mem); +err_eop_allocate_vidmem: + kfd_gtt_sa_free(dev, kq->pq); +err_pq_allocate_vidmem: + kfd_release_kernel_doorbell(dev, prop.doorbell_ptr); +err_get_kernel_doorbell: + return false; + +} + +static void uninitialize(struct kernel_queue *kq) +{ + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) + kq->mqd_mgr->destroy_mqd(kq->mqd_mgr, + kq->queue->mqd, + KFD_PREEMPT_TYPE_WAVEFRONT_RESET, + KFD_UNMAP_LATENCY_MS, + kq->queue->pipe, + kq->queue->queue); + else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ) + kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj); + + kq->mqd_mgr->uninit_mqd(kq->mqd_mgr, kq->queue->mqd, + kq->queue->mqd_mem_obj); + + kfd_gtt_sa_free(kq->dev, kq->rptr_mem); + kfd_gtt_sa_free(kq->dev, kq->wptr_mem); + kq->ops_asic_specific.uninitialize(kq); + kfd_gtt_sa_free(kq->dev, kq->pq); + kfd_release_kernel_doorbell(kq->dev, + kq->queue->properties.doorbell_ptr); + uninit_queue(kq->queue); +} + +static int acquire_packet_buffer(struct kernel_queue *kq, + size_t packet_size_in_dwords, unsigned int **buffer_ptr) +{ + size_t available_size; + size_t queue_size_dwords; + uint32_t wptr, rptr; + uint64_t wptr64; + unsigned int *queue_address; + + /* When rptr == wptr, the buffer is empty. + * When rptr == wptr + 1, the buffer is full. + * It is always rptr that advances to the position of wptr, rather than + * the opposite. So we can only use up to queue_size_dwords - 1 dwords. + */ + rptr = *kq->rptr_kernel; + wptr = kq->pending_wptr; + wptr64 = kq->pending_wptr64; + queue_address = (unsigned int *)kq->pq_kernel_addr; + queue_size_dwords = kq->queue->properties.queue_size / 4; + + pr_debug("rptr: %d\n", rptr); + pr_debug("wptr: %d\n", wptr); + pr_debug("queue_address 0x%p\n", queue_address); + + available_size = (rptr + queue_size_dwords - 1 - wptr) % + queue_size_dwords; + + if (packet_size_in_dwords > available_size) { + /* + * make sure calling functions know + * acquire_packet_buffer() failed + */ + goto err_no_space; + } + + if (wptr + packet_size_in_dwords >= queue_size_dwords) { + /* make sure after rolling back to position 0, there is + * still enough space. 
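+		 * For example, with rptr == 8 a wrapped packet may occupy at
+		 * most dwords 0..6 (7 dwords); a packet of 8 or more dwords
+		 * would catch up with the reader, so we bail out instead.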
+ */ + if (packet_size_in_dwords >= rptr) + goto err_no_space; + + /* fill nops, roll back and start at position 0 */ + while (wptr > 0) { + queue_address[wptr] = kq->nop_packet; + wptr = (wptr + 1) % queue_size_dwords; + wptr64++; + } + } + + *buffer_ptr = &queue_address[wptr]; + kq->pending_wptr = wptr + packet_size_in_dwords; + kq->pending_wptr64 = wptr64 + packet_size_in_dwords; + + return 0; + +err_no_space: + *buffer_ptr = NULL; + return -ENOMEM; +} + +static void submit_packet(struct kernel_queue *kq) +{ +#ifdef DEBUG + int i; + + for (i = *kq->wptr_kernel; i < kq->pending_wptr; i++) { + pr_debug("0x%2X ", kq->pq_kernel_addr[i]); + if (i % 15 == 0) + pr_debug("\n"); + } + pr_debug("\n"); +#endif + + kq->ops_asic_specific.submit_packet(kq); +} + +static void rollback_packet(struct kernel_queue *kq) +{ + if (kq->dev->device_info->doorbell_size == 8) { + kq->pending_wptr64 = *kq->wptr64_kernel; + kq->pending_wptr = *kq->wptr_kernel % + (kq->queue->properties.queue_size / 4); + } else { + kq->pending_wptr = *kq->wptr_kernel; + } +} + +struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + enum kfd_queue_type type) +{ + struct kernel_queue *kq; + + kq = kzalloc(sizeof(*kq), GFP_KERNEL); + if (!kq) + return NULL; + + kq->ops.initialize = initialize; + kq->ops.uninitialize = uninitialize; + kq->ops.acquire_packet_buffer = acquire_packet_buffer; + kq->ops.submit_packet = submit_packet; + kq->ops.rollback_packet = rollback_packet; + + switch (dev->device_info->asic_family) { + case CHIP_CARRIZO: + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + kernel_queue_init_vi(&kq->ops_asic_specific); + break; + + case CHIP_KAVERI: + case CHIP_HAWAII: + kernel_queue_init_cik(&kq->ops_asic_specific); + break; + + case CHIP_VEGA10: + case CHIP_RAVEN: + kernel_queue_init_v9(&kq->ops_asic_specific); + break; + default: + WARN(1, "Unexpected ASIC family %u", + dev->device_info->asic_family); + goto out_free; + } + + if (kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) + return kq; + + pr_err("Failed to init kernel queue\n"); + +out_free: + kfree(kq); + return NULL; +} + +void kernel_queue_uninit(struct kernel_queue *kq) +{ + kq->ops.uninitialize(kq); + kfree(kq); +} + +/* FIXME: Can this test be removed? */ +static __attribute__((unused)) void test_kq(struct kfd_dev *dev) +{ + struct kernel_queue *kq; + uint32_t *buffer, i; + int retval; + + pr_err("Starting kernel queue test\n"); + + kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ); + if (unlikely(!kq)) { + pr_err(" Failed to initialize HIQ\n"); + pr_err("Kernel queue test failed\n"); + return; + } + + retval = kq->ops.acquire_packet_buffer(kq, 5, &buffer); + if (unlikely(retval != 0)) { + pr_err(" Failed to acquire packet buffer\n"); + pr_err("Kernel queue test failed\n"); + return; + } + for (i = 0; i < 5; i++) + buffer[i] = kq->nop_packet; + kq->ops.submit_packet(kq); + + pr_err("Ending kernel queue test\n"); +} + + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h new file mode 100644 index 000000000..a7116a939 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h @@ -0,0 +1,106 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_KERNEL_QUEUE_H_ +#define KFD_KERNEL_QUEUE_H_ + +#include <linux/list.h> +#include <linux/types.h> +#include "kfd_priv.h" + +/** + * struct kernel_queue_ops + * + * @initialize: Initialize a kernel queue, including allocations of GART memory + * needed for the queue. + * + * @uninitialize: Uninitialize a kernel queue and free all its memory usages. + * + * @acquire_packet_buffer: Returns a pointer to the location in the kernel + * queue ring buffer where the calling function can write its packet. It is + * guaranteed that there is enough space for that packet. It also updates the + * pending write pointer to that location so subsequent calls to + * acquire_packet_buffer will get a correct write pointer. + * + * @submit_packet: Update the write pointer and doorbell of a kernel queue. + * + * @sync_with_hw: Wait until the write pointer and the read pointer of a kernel + * queue are equal, which means the CP has read all the submitted packets. + * + * @rollback_packet: This routine is called if we failed to build an acquired + * packet for some reason.
It just overwrites the pending wptr with the current + * one + * + */ +struct kernel_queue_ops { + bool (*initialize)(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); + void (*uninitialize)(struct kernel_queue *kq); + int (*acquire_packet_buffer)(struct kernel_queue *kq, + size_t packet_size_in_dwords, + unsigned int **buffer_ptr); + + void (*submit_packet)(struct kernel_queue *kq); + void (*rollback_packet)(struct kernel_queue *kq); +}; + +struct kernel_queue { + struct kernel_queue_ops ops; + struct kernel_queue_ops ops_asic_specific; + + /* data */ + struct kfd_dev *dev; + struct mqd_manager *mqd_mgr; + struct queue *queue; + uint64_t pending_wptr64; + uint32_t pending_wptr; + unsigned int nop_packet; + + struct kfd_mem_obj *rptr_mem; + uint32_t *rptr_kernel; + uint64_t rptr_gpu_addr; + struct kfd_mem_obj *wptr_mem; + union { + uint64_t *wptr64_kernel; + uint32_t *wptr_kernel; + }; + uint64_t wptr_gpu_addr; + struct kfd_mem_obj *pq; + uint64_t pq_gpu_addr; + uint32_t *pq_kernel_addr; + struct kfd_mem_obj *eop_mem; + uint64_t eop_gpu_addr; + uint32_t *eop_kernel_addr; + + struct kfd_mem_obj *fence_mem_obj; + uint64_t fence_gpu_addr; + void *fence_kernel_address; + + struct list_head list; +}; + +void kernel_queue_init_cik(struct kernel_queue_ops *ops); +void kernel_queue_init_vi(struct kernel_queue_ops *ops); +void kernel_queue_init_v9(struct kernel_queue_ops *ops); + +#endif /* KFD_KERNEL_QUEUE_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c new file mode 100644 index 000000000..19e54acb4 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c @@ -0,0 +1,53 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include "kfd_kernel_queue.h" + +static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); +static void uninitialize_cik(struct kernel_queue *kq); +static void submit_packet_cik(struct kernel_queue *kq); + +void kernel_queue_init_cik(struct kernel_queue_ops *ops) +{ + ops->initialize = initialize_cik; + ops->uninitialize = uninitialize_cik; + ops->submit_packet = submit_packet_cik; +} + +static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size) +{ + return true; +} + +static void uninitialize_cik(struct kernel_queue *kq) +{ +} + +static void submit_packet_cik(struct kernel_queue *kq) +{ + *kq->wptr_kernel = kq->pending_wptr; + write_kernel_doorbell(kq->queue->properties.doorbell_ptr, + kq->pending_wptr); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c new file mode 100644 index 000000000..684a3bf07 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c @@ -0,0 +1,340 @@ +/* + * Copyright 2016-2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include "kfd_kernel_queue.h" +#include "kfd_device_queue_manager.h" +#include "kfd_pm4_headers_ai.h" +#include "kfd_pm4_opcodes.h" + +static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); +static void uninitialize_v9(struct kernel_queue *kq); +static void submit_packet_v9(struct kernel_queue *kq); + +void kernel_queue_init_v9(struct kernel_queue_ops *ops) +{ + ops->initialize = initialize_v9; + ops->uninitialize = uninitialize_v9; + ops->submit_packet = submit_packet_v9; +} + +static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size) +{ + int retval; + + retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); + if (retval) + return false; + + kq->eop_gpu_addr = kq->eop_mem->gpu_addr; + kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; + + memset(kq->eop_kernel_addr, 0, PAGE_SIZE); + + return true; +} + +static void uninitialize_v9(struct kernel_queue *kq) +{ + kfd_gtt_sa_free(kq->dev, kq->eop_mem); +} + +static void submit_packet_v9(struct kernel_queue *kq) +{ + *kq->wptr64_kernel = kq->pending_wptr64; + write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, + kq->pending_wptr64); +} + +static int pm_map_process_v9(struct packet_manager *pm, + uint32_t *buffer, struct qcm_process_device *qpd) +{ + struct pm4_mes_map_process *packet; + uint64_t vm_page_table_base_addr = + (uint64_t)(qpd->page_table_base) << 12; + + packet = (struct pm4_mes_map_process *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_map_process)); + + packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, + sizeof(struct pm4_mes_map_process)); + packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; + packet->bitfields2.process_quantum = 1; + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields14.gds_size = qpd->gds_size; + packet->bitfields14.num_gws = qpd->num_gws; + packet->bitfields14.num_oac = qpd->num_oac; + packet->bitfields14.sdma_enable = 1; + packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; + + packet->sh_mem_config = qpd->sh_mem_config; + packet->sh_mem_bases = qpd->sh_mem_bases; + packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8); + packet->sq_shader_tba_hi = upper_32_bits(qpd->tba_addr >> 8); + packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8); + packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8); + + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); + + packet->vm_context_page_table_base_addr_lo32 = + lower_32_bits(vm_page_table_base_addr); + packet->vm_context_page_table_base_addr_hi32 = + upper_32_bits(vm_page_table_base_addr); + + return 0; +} + +static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain) +{ + struct pm4_mes_runlist *packet; + + int concurrent_proc_cnt = 0; + struct kfd_dev *kfd = pm->dqm->dev; + + /* Determine the number of processes to map together to HW: + * it can not exceed the number of VMIDs available to the + * scheduler, and it is determined by the smaller of the number + * of processes in the runlist and kfd module parameter + * hws_max_conc_proc. + * Note: the arbitration between the number of VMIDs and + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). 
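+ * For example, with 16 processes in the runlist and max_proc_per_quantum + * of 8, the RUN_LIST packet built below reports a process_cnt of 8.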
+ */ + concurrent_proc_cnt = min(pm->dqm->processes_count, + kfd->max_proc_per_quantum); + + packet = (struct pm4_mes_runlist *)buffer; + + memset(buffer, 0, sizeof(struct pm4_mes_runlist)); + packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, + sizeof(struct pm4_mes_runlist)); + + packet->bitfields4.ib_size = ib_size_in_dwords; + packet->bitfields4.chain = chain ? 1 : 0; + packet->bitfields4.offload_polling = 0; + packet->bitfields4.valid = 1; + packet->bitfields4.process_cnt = concurrent_proc_cnt; + packet->ordinal2 = lower_32_bits(ib); + packet->ib_base_hi = upper_32_bits(ib); + + return 0; +} + +static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, + struct queue *q, bool is_static) +{ + struct pm4_mes_map_queues *packet; + bool use_static = is_static; + + packet = (struct pm4_mes_map_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); + + packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, + sizeof(struct pm4_mes_map_queues)); + packet->bitfields2.alloc_format = + alloc_format__mes_map_queues__one_per_pipe_vi; + packet->bitfields2.num_queues = 1; + packet->bitfields2.queue_sel = + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; + + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__compute_vi; + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_compute_vi; + + switch (q->properties.type) { + case KFD_QUEUE_TYPE_COMPUTE: + if (use_static) + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_latency_static_queue_vi; + break; + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.queue_type = + queue_type__mes_map_queues__debug_interface_queue_vi; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = q->properties.sdma_engine_id + + engine_sel__mes_map_queues__sdma0_vi; + use_static = false; /* no static queues under SDMA */ + break; + default: + WARN(1, "queue type %d", q->properties.type); + return -EINVAL; + } + packet->bitfields3.doorbell_offset = + q->properties.doorbell_off; + + packet->mqd_addr_lo = + lower_32_bits(q->gart_mqd_addr); + + packet->mqd_addr_hi = + upper_32_bits(q->gart_mqd_addr); + + packet->wptr_addr_lo = + lower_32_bits((uint64_t)q->properties.write_ptr); + + packet->wptr_addr_hi = + upper_32_bits((uint64_t)q->properties.write_ptr); + + return 0; +} + +static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer, + enum kfd_queue_type type, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) +{ + struct pm4_mes_unmap_queues *packet; + + packet = (struct pm4_mes_unmap_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); + + packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, + sizeof(struct pm4_mes_unmap_queues)); + switch (type) { + case KFD_QUEUE_TYPE_COMPUTE: + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__compute; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__sdma0 + sdma_engine; + break; + default: + WARN(1, "queue type %d", type); + return -EINVAL; + } + + if (reset) + packet->bitfields2.action = + action__mes_unmap_queues__reset_queues; + else + packet->bitfields2.action = + action__mes_unmap_queues__preempt_queues; + + switch (filter) { + case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_specified_queues; + packet->bitfields2.num_queues = 1; + 
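/* for a single-queue request, filter_param carries the doorbell offset + * of the queue to unmap + */ +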
packet->bitfields3b.doorbell_offset0 = filter_param; + break; + case KFD_UNMAP_QUEUES_FILTER_BY_PASID: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; + packet->bitfields3a.pasid = filter_param; + break; + case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__unmap_all_queues; + break; + case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: + /* in this case, we do not preempt static queues */ + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__unmap_all_non_static_queues; + break; + default: + WARN(1, "filter %d", filter); + return -EINVAL; + } + + return 0; + +} + +static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer, + uint64_t fence_address, uint32_t fence_value) +{ + struct pm4_mes_query_status *packet; + + packet = (struct pm4_mes_query_status *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_query_status)); + + + packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, + sizeof(struct pm4_mes_query_status)); + + packet->bitfields2.context_id = 0; + packet->bitfields2.interrupt_sel = + interrupt_sel__mes_query_status__completion_status; + packet->bitfields2.command = + command__mes_query_status__fence_only_after_write_ack; + + packet->addr_hi = upper_32_bits((uint64_t)fence_address); + packet->addr_lo = lower_32_bits((uint64_t)fence_address); + packet->data_hi = upper_32_bits((uint64_t)fence_value); + packet->data_lo = lower_32_bits((uint64_t)fence_value); + + return 0; +} + + +static int pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer) +{ + struct pm4_mec_release_mem *packet; + + packet = (struct pm4_mec_release_mem *)buffer; + memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); + + packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, + sizeof(struct pm4_mec_release_mem)); + + packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; + packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe; + packet->bitfields2.tcl1_action_ena = 1; + packet->bitfields2.tc_action_ena = 1; + packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru; + + packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low; + packet->bitfields3.int_sel = + int_sel__mec_release_mem__send_interrupt_after_write_confirm; + + packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; + packet->address_hi = upper_32_bits(gpu_addr); + + packet->data_lo = 0; + + return 0; +} + +const struct packet_manager_funcs kfd_v9_pm_funcs = { + .map_process = pm_map_process_v9, + .runlist = pm_runlist_v9, + .set_resources = pm_set_resources_vi, + .map_queues = pm_map_queues_v9, + .unmap_queues = pm_unmap_queues_v9, + .query_status = pm_query_status_v9, + .release_mem = pm_release_mem_v9, + .map_process_size = sizeof(struct pm4_mes_map_process), + .runlist_size = sizeof(struct pm4_mes_runlist), + .set_resources_size = sizeof(struct pm4_mes_set_resources), + .map_queues_size = sizeof(struct pm4_mes_map_queues), + .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), + .query_status_size = sizeof(struct pm4_mes_query_status), + .release_mem_size = sizeof(struct pm4_mec_release_mem) +}; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c new file mode 100644 index 000000000..bf20c6d32 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c @@ -0,0 +1,375 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_kernel_queue.h" +#include "kfd_device_queue_manager.h" +#include "kfd_pm4_headers_vi.h" +#include "kfd_pm4_opcodes.h" + +static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); +static void uninitialize_vi(struct kernel_queue *kq); +static void submit_packet_vi(struct kernel_queue *kq); + +void kernel_queue_init_vi(struct kernel_queue_ops *ops) +{ + ops->initialize = initialize_vi; + ops->uninitialize = uninitialize_vi; + ops->submit_packet = submit_packet_vi; +} + +static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size) +{ + int retval; + + retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); + if (retval != 0) + return false; + + kq->eop_gpu_addr = kq->eop_mem->gpu_addr; + kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; + + memset(kq->eop_kernel_addr, 0, PAGE_SIZE); + + return true; +} + +static void uninitialize_vi(struct kernel_queue *kq) +{ + kfd_gtt_sa_free(kq->dev, kq->eop_mem); +} + +static void submit_packet_vi(struct kernel_queue *kq) +{ + *kq->wptr_kernel = kq->pending_wptr; + write_kernel_doorbell(kq->queue->properties.doorbell_ptr, + kq->pending_wptr); +} + +unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size) +{ + union PM4_MES_TYPE_3_HEADER header; + + header.u32All = 0; + header.opcode = opcode; + header.count = packet_size / 4 - 2; + header.type = PM4_TYPE_3; + + return header.u32All; +} + +static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer, + struct qcm_process_device *qpd) +{ + struct pm4_mes_map_process *packet; + + packet = (struct pm4_mes_map_process *)buffer; + + memset(buffer, 0, sizeof(struct pm4_mes_map_process)); + + packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, + sizeof(struct pm4_mes_map_process)); + packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; + packet->bitfields2.process_quantum = 1; + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields3.page_table_base = qpd->page_table_base; + packet->bitfields10.gds_size = qpd->gds_size; + packet->bitfields10.num_gws = qpd->num_gws; + packet->bitfields10.num_oac = qpd->num_oac; + packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : qpd->queue_count; + + packet->sh_mem_config = qpd->sh_mem_config; + packet->sh_mem_bases = qpd->sh_mem_bases; + packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; + packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; + + packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; + + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); + + return 0; +} + +static int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain) +{ + struct pm4_mes_runlist *packet; + int concurrent_proc_cnt = 0; + struct kfd_dev *kfd = pm->dqm->dev; + + if (WARN_ON(!ib)) + return -EFAULT; + + /* Determine the number of processes to map together to HW: + * it can not exceed the number of VMIDs available to the + * scheduler, and it is determined by the smaller of the number + * of processes in the runlist and kfd module parameter + * hws_max_conc_proc. + * Note: the arbitration between the number of VMIDs and + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). + */ + concurrent_proc_cnt = min(pm->dqm->processes_count, + kfd->max_proc_per_quantum); + + packet = (struct pm4_mes_runlist *)buffer; + + memset(buffer, 0, sizeof(struct pm4_mes_runlist)); + packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, + sizeof(struct pm4_mes_runlist)); + + packet->bitfields4.ib_size = ib_size_in_dwords; + packet->bitfields4.chain = chain ? 1 : 0; + packet->bitfields4.offload_polling = 0; + packet->bitfields4.valid = 1; + packet->bitfields4.process_cnt = concurrent_proc_cnt; + packet->ordinal2 = lower_32_bits(ib); + packet->bitfields3.ib_base_hi = upper_32_bits(ib); + + return 0; +} + +int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, + struct scheduling_resources *res) +{ + struct pm4_mes_set_resources *packet; + + packet = (struct pm4_mes_set_resources *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_set_resources)); + + packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES, + sizeof(struct pm4_mes_set_resources)); + + packet->bitfields2.queue_type = + queue_type__mes_set_resources__hsa_interface_queue_hiq; + packet->bitfields2.vmid_mask = res->vmid_mask; + packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; + packet->bitfields7.oac_mask = res->oac_mask; + packet->bitfields8.gds_heap_base = res->gds_heap_base; + packet->bitfields8.gds_heap_size = res->gds_heap_size; + + packet->gws_mask_lo = lower_32_bits(res->gws_mask); + packet->gws_mask_hi = upper_32_bits(res->gws_mask); + + packet->queue_mask_lo = lower_32_bits(res->queue_mask); + packet->queue_mask_hi = upper_32_bits(res->queue_mask); + + return 0; +} + +static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, + struct queue *q, bool is_static) +{ + struct pm4_mes_map_queues *packet; + bool use_static = is_static; + + packet = (struct pm4_mes_map_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); + + packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, + sizeof(struct pm4_mes_map_queues)); + packet->bitfields2.alloc_format = + alloc_format__mes_map_queues__one_per_pipe_vi; + packet->bitfields2.num_queues = 1; + packet->bitfields2.queue_sel = + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; + + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__compute_vi; + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_compute_vi; + + switch (q->properties.type) { + case KFD_QUEUE_TYPE_COMPUTE: + if 
(use_static) + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_latency_static_queue_vi; + break; + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.queue_type = + queue_type__mes_map_queues__debug_interface_queue_vi; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = q->properties.sdma_engine_id + + engine_sel__mes_map_queues__sdma0_vi; + use_static = false; /* no static queues under SDMA */ + break; + default: + WARN(1, "queue type %d", q->properties.type); + return -EINVAL; + } + packet->bitfields3.doorbell_offset = + q->properties.doorbell_off; + + packet->mqd_addr_lo = + lower_32_bits(q->gart_mqd_addr); + + packet->mqd_addr_hi = + upper_32_bits(q->gart_mqd_addr); + + packet->wptr_addr_lo = + lower_32_bits((uint64_t)q->properties.write_ptr); + + packet->wptr_addr_hi = + upper_32_bits((uint64_t)q->properties.write_ptr); + + return 0; +} + +static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, + enum kfd_queue_type type, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) +{ + struct pm4_mes_unmap_queues *packet; + + packet = (struct pm4_mes_unmap_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); + + packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, + sizeof(struct pm4_mes_unmap_queues)); + switch (type) { + case KFD_QUEUE_TYPE_COMPUTE: + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__compute; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__sdma0 + sdma_engine; + break; + default: + WARN(1, "queue type %d", type); + return -EINVAL; + } + + if (reset) + packet->bitfields2.action = + action__mes_unmap_queues__reset_queues; + else + packet->bitfields2.action = + action__mes_unmap_queues__preempt_queues; + + switch (filter) { + case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_specified_queues; + packet->bitfields2.num_queues = 1; + packet->bitfields3b.doorbell_offset0 = filter_param; + break; + case KFD_UNMAP_QUEUES_FILTER_BY_PASID: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; + packet->bitfields3a.pasid = filter_param; + break; + case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__unmap_all_queues; + break; + case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: + /* in this case, we do not preempt static queues */ + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__unmap_all_non_static_queues; + break; + default: + WARN(1, "filter %d", filter); + return -EINVAL; + } + + return 0; + +} + +static int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, + uint64_t fence_address, uint32_t fence_value) +{ + struct pm4_mes_query_status *packet; + + packet = (struct pm4_mes_query_status *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_query_status)); + + packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, + sizeof(struct pm4_mes_query_status)); + + packet->bitfields2.context_id = 0; + packet->bitfields2.interrupt_sel = + interrupt_sel__mes_query_status__completion_status; + packet->bitfields2.command = + command__mes_query_status__fence_only_after_write_ack; + + packet->addr_hi = upper_32_bits((uint64_t)fence_address); + packet->addr_lo = lower_32_bits((uint64_t)fence_address); + packet->data_hi = upper_32_bits((uint64_t)fence_value); + packet->data_lo 
= lower_32_bits((uint64_t)fence_value); + + return 0; +} + +static int pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer) +{ + struct pm4_mec_release_mem *packet; + + packet = (struct pm4_mec_release_mem *)buffer; + memset(buffer, 0, sizeof(*packet)); + + packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, + sizeof(*packet)); + + packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; + packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; + packet->bitfields2.tcl1_action_ena = 1; + packet->bitfields2.tc_action_ena = 1; + packet->bitfields2.cache_policy = cache_policy___release_mem__lru; + packet->bitfields2.atc = 0; + + packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low; + packet->bitfields3.int_sel = + int_sel___release_mem__send_interrupt_after_write_confirm; + + packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; + packet->address_hi = upper_32_bits(gpu_addr); + + packet->data_lo = 0; + + return 0; +} + +const struct packet_manager_funcs kfd_vi_pm_funcs = { + .map_process = pm_map_process_vi, + .runlist = pm_runlist_vi, + .set_resources = pm_set_resources_vi, + .map_queues = pm_map_queues_vi, + .unmap_queues = pm_unmap_queues_vi, + .query_status = pm_query_status_vi, + .release_mem = pm_release_mem_vi, + .map_process_size = sizeof(struct pm4_mes_map_process), + .runlist_size = sizeof(struct pm4_mes_runlist), + .set_resources_size = sizeof(struct pm4_mes_set_resources), + .map_queues_size = sizeof(struct pm4_mes_map_queues), + .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), + .query_status_size = sizeof(struct pm4_mes_query_status), + .release_mem_size = sizeof(struct pm4_mec_release_mem) +}; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c new file mode 100644 index 000000000..6e1f5c7c2 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -0,0 +1,190 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/moduleparam.h> +#include <linux/device.h> +#include <linux/printk.h> +#include "kfd_priv.h" + +#define KFD_DRIVER_AUTHOR "AMD Inc. 
and others" + +#define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs" +#define KFD_DRIVER_DATE "20150421" +#define KFD_DRIVER_MAJOR 0 +#define KFD_DRIVER_MINOR 7 +#define KFD_DRIVER_PATCHLEVEL 2 + +static const struct kgd2kfd_calls kgd2kfd = { + .exit = kgd2kfd_exit, + .probe = kgd2kfd_probe, + .device_init = kgd2kfd_device_init, + .device_exit = kgd2kfd_device_exit, + .interrupt = kgd2kfd_interrupt, + .suspend = kgd2kfd_suspend, + .resume = kgd2kfd_resume, + .quiesce_mm = kgd2kfd_quiesce_mm, + .resume_mm = kgd2kfd_resume_mm, + .schedule_evict_and_restore_process = + kgd2kfd_schedule_evict_and_restore_process, + .pre_reset = kgd2kfd_pre_reset, + .post_reset = kgd2kfd_post_reset, +}; + +int sched_policy = KFD_SCHED_POLICY_HWS; +module_param(sched_policy, int, 0444); +MODULE_PARM_DESC(sched_policy, + "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); + +int hws_max_conc_proc = 8; +module_param(hws_max_conc_proc, int, 0444); +MODULE_PARM_DESC(hws_max_conc_proc, + "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))"); + +int cwsr_enable = 1; +module_param(cwsr_enable, int, 0444); +MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = off, 1 = on (default))"); + +int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; +module_param(max_num_of_queues_per_device, int, 0444); +MODULE_PARM_DESC(max_num_of_queues_per_device, + "Maximum number of supported queues per device (1 = Minimum, 4096 = default)"); + +int send_sigterm; +module_param(send_sigterm, int, 0444); +MODULE_PARM_DESC(send_sigterm, + "Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)"); + +int debug_largebar; +module_param(debug_largebar, int, 0444); +MODULE_PARM_DESC(debug_largebar, + "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); + +int ignore_crat; +module_param(ignore_crat, int, 0444); +MODULE_PARM_DESC(ignore_crat, + "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); + +int noretry; +module_param(noretry, int, 0644); +MODULE_PARM_DESC(noretry, + "Set sh_mem_config.retry_disable on GFXv9+ dGPUs (0 = retry enabled (default), 1 = retry disabled)"); + +int halt_if_hws_hang; +module_param(halt_if_hws_hang, int, 0644); +MODULE_PARM_DESC(halt_if_hws_hang, "Halt if HWS hang is detected (0 = off (default), 1 = on)"); + + +static int amdkfd_init_completed; + + +int kgd2kfd_init(unsigned int interface_version, + const struct kgd2kfd_calls **g2f) +{ + if (!amdkfd_init_completed) + return -EPROBE_DEFER; + + /* + * Only one interface version is supported, + * no kfd/kgd version skew allowed. 
+ */ + if (interface_version != KFD_INTERFACE_VERSION) + return -EINVAL; + + *g2f = &kgd2kfd; + + return 0; +} +EXPORT_SYMBOL(kgd2kfd_init); + +void kgd2kfd_exit(void) +{ +} + +static int __init kfd_module_init(void) +{ + int err; + + /* Verify module parameters */ + if ((sched_policy < KFD_SCHED_POLICY_HWS) || + (sched_policy > KFD_SCHED_POLICY_NO_HWS)) { + pr_err("sched_policy has invalid value\n"); + return -1; + } + + /* Verify module parameters */ + if ((max_num_of_queues_per_device < 1) || + (max_num_of_queues_per_device > + KFD_MAX_NUM_OF_QUEUES_PER_DEVICE)) { + pr_err("max_num_of_queues_per_device must be between 1 to KFD_MAX_NUM_OF_QUEUES_PER_DEVICE\n"); + return -1; + } + + err = kfd_chardev_init(); + if (err < 0) + goto err_ioctl; + + err = kfd_topology_init(); + if (err < 0) + goto err_topology; + + err = kfd_process_create_wq(); + if (err < 0) + goto err_create_wq; + + kfd_debugfs_init(); + + amdkfd_init_completed = 1; + + dev_info(kfd_device, "Initialized module\n"); + + return 0; + +err_create_wq: + kfd_topology_shutdown(); +err_topology: + kfd_chardev_exit(); +err_ioctl: + return err; +} + +static void __exit kfd_module_exit(void) +{ + amdkfd_init_completed = 0; + + kfd_debugfs_fini(); + kfd_process_destroy_wq(); + kfd_topology_shutdown(); + kfd_chardev_exit(); + pr_info("amdkfd: Removed module\n"); +} + +module_init(kfd_module_init); +module_exit(kfd_module_exit); + +MODULE_AUTHOR(KFD_DRIVER_AUTHOR); +MODULE_DESCRIPTION(KFD_DRIVER_DESC); +MODULE_LICENSE("GPL and additional rights"); +MODULE_VERSION(__stringify(KFD_DRIVER_MAJOR) "." + __stringify(KFD_DRIVER_MINOR) "." + __stringify(KFD_DRIVER_PATCHLEVEL)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c new file mode 100644 index 000000000..3bc25ab84 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -0,0 +1,89 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include "kfd_mqd_manager.h" + +struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + switch (dev->device_info->asic_family) { + case CHIP_KAVERI: + return mqd_manager_init_cik(type, dev); + case CHIP_HAWAII: + return mqd_manager_init_cik_hawaii(type, dev); + case CHIP_CARRIZO: + return mqd_manager_init_vi(type, dev); + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + return mqd_manager_init_vi_tonga(type, dev); + case CHIP_VEGA10: + case CHIP_RAVEN: + return mqd_manager_init_v9(type, dev); + default: + WARN(1, "Unexpected ASIC family %u", + dev->device_info->asic_family); + } + + return NULL; +} + +void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, + const uint32_t *cu_mask, uint32_t cu_mask_count, + uint32_t *se_mask) +{ + struct kfd_cu_info cu_info; + uint32_t cu_per_sh[4] = {0}; + int i, se, cu = 0; + + mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); + + if (cu_mask_count > cu_info.cu_active_number) + cu_mask_count = cu_info.cu_active_number; + + for (se = 0; se < cu_info.num_shader_engines; se++) + for (i = 0; i < 4; i++) + cu_per_sh[se] += hweight32(cu_info.cu_bitmap[se][i]); + + /* Symmetrically map cu_mask to all SEs: + * cu_mask[0] bit0 -> se_mask[0] bit0; + * cu_mask[0] bit1 -> se_mask[1] bit0; + * ... (if # SE is 4) + * cu_mask[0] bit4 -> se_mask[0] bit1; + * ... + */ + se = 0; + for (i = 0; i < cu_mask_count; i++) { + if (cu_mask[i / 32] & (1 << (i % 32))) + se_mask[se] |= 1 << cu; + + do { + se++; + if (se == cu_info.num_shader_engines) { + se = 0; + cu++; + } + } while (cu >= cu_per_sh[se] && cu < 32); + } +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h new file mode 100644 index 000000000..4e84052d4 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -0,0 +1,100 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_MQD_MANAGER_H_ +#define KFD_MQD_MANAGER_H_ + +#include "kfd_priv.h" + +/** + * struct mqd_manager + * + * @init_mqd: Allocates the mqd buffer on local gpu memory and initialize it. + * + * @load_mqd: Loads the mqd to a concrete hqd slot. Used only for no cp + * scheduling mode. + * + * @update_mqd: Handles a update call for the MQD + * + * @destroy_mqd: Destroys the HQD slot and by that preempt the relevant queue. + * Used only for no cp scheduling. 
+ * + * @uninit_mqd: Releases the mqd buffer from local gpu memory. + * + * @is_occupied: Checks if the relevant HQD slot is occupied. + * + * @mqd_mutex: Mqd manager mutex. + * + * @dev: The kfd device structure coupled with this module. + * + * MQD stands for Memory Queue Descriptor which represents the current queue + * state in the memory and initiate the HQD (Hardware Queue Descriptor) state. + * This structure is actually a base class for the different types of MQDs + * structures for the variant ASICs that should be supported in the future. + * This base class is also contains all the MQD specific operations. + * Another important thing to mention is that each queue has a MQD that keeps + * his state (or context) after each preemption or reassignment. + * Basically there are a instances of the mqd manager class per MQD type per + * ASIC. Currently the kfd driver supports only Kaveri so there are instances + * per KFD_MQD_TYPE for each device. + * + */ + +struct mqd_manager { + int (*init_mqd)(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q); + + int (*load_mqd)(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, + struct mm_struct *mms); + + int (*update_mqd)(struct mqd_manager *mm, void *mqd, + struct queue_properties *q); + + int (*destroy_mqd)(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id); + + void (*uninit_mqd)(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj); + + bool (*is_occupied)(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id); + +#if defined(CONFIG_DEBUG_FS) + int (*debugfs_show_mqd)(struct seq_file *m, void *data); +#endif + + struct mutex mqd_mutex; + struct kfd_dev *dev; +}; + +void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, + const uint32_t *cu_mask, uint32_t cu_mask_count, + uint32_t *se_mask); + +#endif /* KFD_MQD_MANAGER_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c new file mode 100644 index 000000000..ae90a9990 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -0,0 +1,448 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/mm_types.h> + +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "cik_regs.h" +#include "cik_structs.h" +#include "oss/oss_2_4_sh_mask.h" + +static inline struct cik_mqd *get_mqd(void *mqd) +{ + return (struct cik_mqd *)mqd; +} + +static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) +{ + return (struct cik_sdma_rlc_registers *)mqd; +} + +static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct cik_mqd *m; + uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ + + if (q->cu_mask_count == 0) + return; + + mqd_symmetrically_map_cu_mask(mm, + q->cu_mask, q->cu_mask_count, se_mask); + + m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = se_mask[0]; + m->compute_static_thread_mgmt_se1 = se_mask[1]; + m->compute_static_thread_mgmt_se2 = se_mask[2]; + m->compute_static_thread_mgmt_se3 = se_mask[3]; + + pr_debug("Update cu mask to %#x %#x %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3); +} + +static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + uint64_t addr; + struct cik_mqd *m; + int retval; + + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr; + addr = (*mqd_mem_obj)->gpu_addr; + + memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256)); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + /* + * Make sure to use the last queue state saved on mqd when the cp + * reassigns the queue, so when queue is switched on/off (e.g over + * subscription or quantum timeout) the context will be consistent + */ + m->cp_hqd_persistent_state = + DEFAULT_CP_HQD_PERSISTENT_STATE | PRELOAD_REQ; + + m->cp_mqd_control = MQD_CONTROL_PRIV_STATE_EN; + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | + QUANTUM_DURATION(10); + + /* + * Pipe Priority + * Identifies the pipe relative priority when this queue is connected + * to the pipeline. The pipe priority is against the GFX pipe and HP3D. + * In KFD we are using a fixed pipe priority set to CS_MEDIUM. 
+ * 0 = CS_LOW (typically below GFX) + * 1 = CS_MEDIUM (typically between HP3D and GFX + * 2 = CS_HIGH (typically above HP3D) + */ + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + + if (q->format == KFD_QUEUE_FORMAT_AQL) + m->cp_hqd_iq_rptr = AQL_ENABLE; + + *mqd = m; + if (gart_addr) + *gart_addr = addr; + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + struct cik_sdma_rlc_registers *m; + + retval = kfd_gtt_sa_allocate(mm->dev, + sizeof(struct cik_sdma_rlc_registers), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct cik_sdma_rlc_registers *) (*mqd_mem_obj)->cpu_ptr; + + memset(m, 0, sizeof(struct cik_sdma_rlc_registers)); + + *mqd = m; + if (gart_addr) + *gart_addr = (*mqd_mem_obj)->gpu_addr; + + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static void uninit_mqd(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, + uint32_t queue_id, struct queue_properties *p, + struct mm_struct *mms) +{ + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); + uint32_t wptr_mask = (uint32_t)((p->queue_size / 4) - 1); + + return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, + (uint32_t __user *)p->write_ptr, + wptr_shift, wptr_mask, mms); +} + +static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, + (uint32_t __user *)p->write_ptr, + mms); +} + +static int __update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, unsigned int atc_bit) +{ + struct cik_mqd *m; + + m = get_mqd(mqd); + m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | + DEFAULT_MIN_AVAIL_SIZE; + m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE; + if (atc_bit) { + m->cp_hqd_pq_control |= PQ_ATC_EN; + m->cp_hqd_ib_control |= IB_ATC_EN; + } + + /* + * Calculating queue size which is log base 2 of actual queue size -1 + * dwords and another -1 for ffs + */ + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_doorbell_control = DOORBELL_OFFSET(q->doorbell_off); + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) + m->cp_hqd_pq_control |= NO_UPDATE_RPTR; + + update_cu_mask(mm, mqd, q); + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + +static int update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + return __update_mqd(mm, mqd, q, 1); +} + +static int update_mqd_hawaii(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + return __update_mqd(mm, mqd, q, 0); +} + +static int update_mqd_sdma(struct 
mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct cik_sdma_rlc_registers *m; + + m = get_sdma_mqd(mqd); + m->sdma_rlc_rb_cntl = order_base_2(q->queue_size / 4) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + + m->sdma_rlc_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdma_rlc_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdma_rlc_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdma_rlc_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdma_rlc_doorbell = + q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; + + m->sdma_rlc_virtual_addr = q->sdma_vm_addr; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + +static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_destroy(mm->dev->kgd, mqd, type, timeout, + pipe_id, queue_id); +} + +/* + * preempt type here is ignored because there is only one way + * to preempt sdma queue + */ +static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +} + +static bool is_occupied(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + + return mm->dev->kfd2kgd->hqd_is_occupied(mm->dev->kgd, queue_address, + pipe_id, queue_id); + +} + +static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +} + +/* + * HIQ MQD Implementation, concrete implementation for HIQ MQD implementation. + * The HIQ queue in Kaveri is using the same MQD structure as all the user mode + * queues but with different initial values. 
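+ * Hence init_mqd_hiq() below simply reuses init_mqd(), and only + * update_mqd_hiq() differs, setting PRIV_STATE and KMD_QUEUE in + * cp_hqd_pq_control.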
+ */ + +static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + return init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); +} + +static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct cik_mqd *m; + + m = get_mqd(mqd); + m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | + DEFAULT_MIN_AVAIL_SIZE | + PRIV_STATE | + KMD_QUEUE; + + /* + * Calculating queue size which is log base 2 of actual queue + * size -1 dwords + */ + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_doorbell_control = DOORBELL_OFFSET(q->doorbell_off); + + m->cp_hqd_vmid = q->vmid; + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct cik_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct cik_sdma_rlc_registers), false); + return 0; +} + +#endif + + +struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case KFD_MQD_TYPE_CP: + case KFD_MQD_TYPE_COMPUTE: + mqd->init_mqd = init_mqd; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_HIQ: + mqd->init_mqd = init_mqd_hiq; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_SDMA: + mqd->init_mqd = init_mqd_sdma; + mqd->uninit_mqd = uninit_mqd_sdma; + mqd->load_mqd = load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} + +struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + mqd = mqd_manager_init_cik(type, dev); + if (!mqd) + return NULL; + if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE)) + mqd->update_mqd = update_mqd_hawaii; + return mqd; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c new file mode 100644 index 000000000..985bebde5 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -0,0 +1,473 @@ +/* + * Copyright 2016-2018 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "v9_structs.h" +#include "gc/gc_9_0_offset.h" +#include "gc/gc_9_0_sh_mask.h" +#include "sdma0/sdma0_4_0_sh_mask.h" + +static inline struct v9_mqd *get_mqd(void *mqd) +{ + return (struct v9_mqd *)mqd; +} + +static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct v9_sdma_mqd *)mqd; +} + +static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v9_mqd *m; + uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ + + if (q->cu_mask_count == 0) + return; + + mqd_symmetrically_map_cu_mask(mm, + q->cu_mask, q->cu_mask_count, se_mask); + + m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = se_mask[0]; + m->compute_static_thread_mgmt_se1 = se_mask[1]; + m->compute_static_thread_mgmt_se2 = se_mask[2]; + m->compute_static_thread_mgmt_se3 = se_mask[3]; + + pr_debug("update cu mask to %#x %#x %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3); +} + +static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + uint64_t addr; + struct v9_mqd *m; + struct kfd_dev *kfd = mm->dev; + + *mqd_mem_obj = NULL; + /* From V9, for CWSR, the control stack is located on the next page + * boundary after the mqd, we will use the gtt allocation function + * instead of sub-allocation function. 
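+ * The single GTT buffer allocated below therefore covers the control stack + * and the MQD, each rounded up to PAGE_SIZE.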
+ */ + if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { + *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + if (!*mqd_mem_obj) + return -ENOMEM; + retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd, + ALIGN(q->ctl_stack_size, PAGE_SIZE) + + ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), + &((*mqd_mem_obj)->gtt_mem), + &((*mqd_mem_obj)->gpu_addr), + (void *)&((*mqd_mem_obj)->cpu_ptr), true); + } else + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), + mqd_mem_obj); + if (retval) { + kfree(*mqd_mem_obj); + return -ENOMEM; + } + + m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr; + addr = (*mqd_mem_obj)->gpu_addr; + + memset(m, 0, sizeof(struct v9_mqd)); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | + 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; + + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | + 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_aql_control = + 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; + } + + if (q->tba_addr) { + m->compute_pgm_rsrc2 |= + (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); + } + + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; + m->cp_hqd_wg_state_offset = q->ctl_stack_size; + } + + *mqd = m; + if (gart_addr) + *gart_addr = addr; + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); + + return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, + (uint32_t __user *)p->write_ptr, + wptr_shift, 0, mms); +} + +static int update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v9_mqd *m; + + m = get_mqd(mqd); + + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); + + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + + m->cp_hqd_pq_doorbell_control = + q->doorbell_off << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; + pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", + m->cp_hqd_pq_doorbell_control); + + m->cp_hqd_ib_control = + 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT | + 1 << CP_HQD_IB_CONTROL__IB_EXE_DISABLE__SHIFT; + + /* + * HW does not clamp this field correctly. Maximum EOP queue size + * is constrained by per-SE EOP done signal count, which is 8-bit. + * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit + * more than (EOP entry count - 1) so a queue size of 0x800 dwords + * is safe, giving a maximum field value of 0xA. + */ + m->cp_hqd_eop_control = min(0xA, + order_base_2(q->eop_ring_buffer_size / 4) - 1); + m->cp_hqd_eop_base_addr_lo = + lower_32_bits(q->eop_ring_buffer_address >> 8); + m->cp_hqd_eop_base_addr_hi = + upper_32_bits(q->eop_ring_buffer_address >> 8); + + m->cp_hqd_iq_timer = 0; + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | + 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT | + 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT; + m->cp_hqd_pq_doorbell_control |= 1 << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; + } + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) + m->cp_hqd_ctx_save_control = 0; + + update_cu_mask(mm, mqd, q); + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + + +static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_destroy + (mm->dev->kgd, mqd, type, timeout, + pipe_id, queue_id); +} + +static void uninit_mqd(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + struct kfd_dev *kfd = mm->dev; + + if (mqd_mem_obj->gtt_mem) { + kfd->kfd2kgd->free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem); + kfree(mqd_mem_obj); + } else { + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); + } +} + +static bool is_occupied(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_is_occupied( + mm->dev->kgd, queue_address, + pipe_id, queue_id); +} + +static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v9_mqd *m; + int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); + + if (retval 
!= 0) + return retval; + + m = get_mqd(*mqd); + + m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | + 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; + + return retval; +} + +static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v9_mqd *m; + int retval = update_mqd(mm, mqd, q); + + if (retval != 0) + return retval; + + /* TODO: what's the point? update_mqd already does this. */ + m = get_mqd(mqd); + m->cp_hqd_vmid = q->vmid; + return retval; +} + +static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + struct v9_sdma_mqd *m; + + + retval = kfd_gtt_sa_allocate(mm->dev, + sizeof(struct v9_sdma_mqd), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; + + memset(m, 0, sizeof(struct v9_sdma_mqd)); + + *mqd = m; + if (gart_addr) + *gart_addr = (*mqd_mem_obj)->gpu_addr; + + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, + (uint32_t __user *)p->write_ptr, + mms); +} + +#define SDMA_RLC_DUMMY_DEFAULT 0xf + +static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v9_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + + m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_doorbell_offset = + q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + +/* + * * preempt type here is ignored because there is only one way + * * to preempt sdma queue + */ +static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +} + +static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v9_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v9_sdma_mqd), false); + return 0; +} + +#endif + +struct mqd_manager 
*mqd_manager_init_v9(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case KFD_MQD_TYPE_CP: + case KFD_MQD_TYPE_COMPUTE: + mqd->init_mqd = init_mqd; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_HIQ: + mqd->init_mqd = init_mqd_hiq; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_SDMA: + mqd->init_mqd = init_mqd_sdma; + mqd->uninit_mqd = uninit_mqd_sdma; + mqd->load_mqd = load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c new file mode 100644 index 000000000..b81fda375 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -0,0 +1,484 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/mm_types.h> + +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "vi_structs.h" +#include "gca/gfx_8_0_sh_mask.h" +#include "gca/gfx_8_0_enum.h" +#include "oss/oss_3_0_sh_mask.h" +#define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8 + +static inline struct vi_mqd *get_mqd(void *mqd) +{ + return (struct vi_mqd *)mqd; +} + +static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct vi_sdma_mqd *)mqd; +} + +static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct vi_mqd *m; + uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ + + if (q->cu_mask_count == 0) + return; + + mqd_symmetrically_map_cu_mask(mm, + q->cu_mask, q->cu_mask_count, se_mask); + + m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = se_mask[0]; + m->compute_static_thread_mgmt_se1 = se_mask[1]; + m->compute_static_thread_mgmt_se2 = se_mask[2]; + m->compute_static_thread_mgmt_se3 = se_mask[3]; + + pr_debug("Update cu mask to %#x %#x %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3); +} + +static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + uint64_t addr; + struct vi_mqd *m; + + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct vi_mqd), + mqd_mem_obj); + if (retval != 0) + return -ENOMEM; + + m = (struct vi_mqd *) (*mqd_mem_obj)->cpu_ptr; + addr = (*mqd_mem_obj)->gpu_addr; + + memset(m, 0, sizeof(struct vi_mqd)); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | + 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT | + MTYPE_UC << CP_MQD_CONTROL__MTYPE__SHIFT; + + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | + 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + + m->cp_hqd_eop_rptr = 1 << CP_HQD_EOP_RPTR__INIT_FETCHER__SHIFT; + + if (q->format == KFD_QUEUE_FORMAT_AQL) + m->cp_hqd_iq_rptr = 1; + + if (q->tba_addr) { + m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8); + m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8); + m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8); + m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8); + m->compute_pgm_rsrc2 |= + (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); + } + + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; + m->cp_hqd_wg_state_offset = q->ctl_stack_size; + } + + *mqd = m; + if (gart_addr) + *gart_addr = 
addr; + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); + uint32_t wptr_mask = (uint32_t)((p->queue_size / 4) - 1); + + return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, + (uint32_t __user *)p->write_ptr, + wptr_shift, wptr_mask, mms); +} + +static int __update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, unsigned int mtype, + unsigned int atc_bit) +{ + struct vi_mqd *m; + + m = get_mqd(mqd); + + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT | + atc_bit << CP_HQD_PQ_CONTROL__PQ_ATC__SHIFT | + mtype << CP_HQD_PQ_CONTROL__MTYPE__SHIFT; + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); + + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + + m->cp_hqd_pq_doorbell_control = + q->doorbell_off << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; + pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", + m->cp_hqd_pq_doorbell_control); + + m->cp_hqd_eop_control = atc_bit << CP_HQD_EOP_CONTROL__EOP_ATC__SHIFT | + mtype << CP_HQD_EOP_CONTROL__MTYPE__SHIFT; + + m->cp_hqd_ib_control = atc_bit << CP_HQD_IB_CONTROL__IB_ATC__SHIFT | + 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT | + mtype << CP_HQD_IB_CONTROL__MTYPE__SHIFT; + + /* + * HW does not clamp this field correctly. Maximum EOP queue size + * is constrained by per-SE EOP done signal count, which is 8-bit. + * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit + * more than (EOP entry count - 1) so a queue size of 0x800 dwords + * is safe, giving a maximum field value of 0xA. 
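+ * Worked example: an 8 KiB EOP buffer is 0x800 dwords and
+ * order_base_2(0x800) - 1 = 10 = 0xA, the largest safe encoding; any
+ * larger buffer would produce a bigger value and is clamped by the
+ * min() below.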
+ */ + m->cp_hqd_eop_control |= min(0xA, + order_base_2(q->eop_ring_buffer_size / 4) - 1); + m->cp_hqd_eop_base_addr_lo = + lower_32_bits(q->eop_ring_buffer_address >> 8); + m->cp_hqd_eop_base_addr_hi = + upper_32_bits(q->eop_ring_buffer_address >> 8); + + m->cp_hqd_iq_timer = atc_bit << CP_HQD_IQ_TIMER__IQ_ATC__SHIFT | + mtype << CP_HQD_IQ_TIMER__MTYPE__SHIFT; + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; + } + + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) + m->cp_hqd_ctx_save_control = + atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT | + mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; + + update_cu_mask(mm, mqd, q); + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + + +static int update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + return __update_mqd(mm, mqd, q, MTYPE_CC, 1); +} + +static int update_mqd_tonga(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + return __update_mqd(mm, mqd, q, MTYPE_UC, 0); +} + +static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_destroy + (mm->dev->kgd, mqd, type, timeout, + pipe_id, queue_id); +} + +static void uninit_mqd(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static bool is_occupied(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_is_occupied( + mm->dev->kgd, queue_address, + pipe_id, queue_id); +} + +static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct vi_mqd *m; + int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); + + if (retval != 0) + return retval; + + m = get_mqd(*mqd); + + m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | + 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; + + return retval; +} + +static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct vi_mqd *m; + int retval = __update_mqd(mm, mqd, q, MTYPE_UC, 0); + + if (retval != 0) + return retval; + + m = get_mqd(mqd); + m->cp_hqd_vmid = q->vmid; + return retval; +} + +static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + struct vi_sdma_mqd *m; + + + retval = kfd_gtt_sa_allocate(mm->dev, + sizeof(struct vi_sdma_mqd), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; + + memset(m, 0, sizeof(struct vi_sdma_mqd)); + + *mqd = m; + if (gart_addr != NULL) + *gart_addr = (*mqd_mem_obj)->gpu_addr; + + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, + (uint32_t __user *)p->write_ptr, + mms); +} + +static int 
update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct vi_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + + m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_doorbell = + q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; + + m->sdmax_rlcx_virtual_addr = q->sdma_vm_addr; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + +/* + * * preempt type here is ignored because there is only one way + * * to preempt sdma queue + */ +static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +} + +static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct vi_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct vi_sdma_mqd), false); + return 0; +} + +#endif + +struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case KFD_MQD_TYPE_CP: + case KFD_MQD_TYPE_COMPUTE: + mqd->init_mqd = init_mqd; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_HIQ: + mqd->init_mqd = init_mqd_hiq; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_SDMA: + mqd->init_mqd = init_mqd_sdma; + mqd->uninit_mqd = uninit_mqd_sdma; + mqd->load_mqd = load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} + +struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + mqd = mqd_manager_init_vi(type, dev); + if (!mqd) + return NULL; + if ((type == KFD_MQD_TYPE_CP) || (type == 
KFD_MQD_TYPE_COMPUTE)) + mqd->update_mqd = update_mqd_tonga; + return mqd; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c new file mode 100644 index 000000000..109263176 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -0,0 +1,447 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/slab.h> +#include <linux/mutex.h> +#include "kfd_device_queue_manager.h" +#include "kfd_kernel_queue.h" +#include "kfd_priv.h" + +static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, + unsigned int buffer_size_bytes) +{ + unsigned int temp = *wptr + increment_bytes / sizeof(uint32_t); + + WARN((temp * sizeof(uint32_t)) > buffer_size_bytes, + "Runlist IB overflow"); + *wptr = temp; +} + +static void pm_calc_rlib_size(struct packet_manager *pm, + unsigned int *rlib_size, + bool *over_subscription) +{ + unsigned int process_count, queue_count, compute_queue_count; + unsigned int map_queue_size; + unsigned int max_proc_per_quantum = 1; + struct kfd_dev *dev = pm->dqm->dev; + + process_count = pm->dqm->processes_count; + queue_count = pm->dqm->queue_count; + compute_queue_count = queue_count - pm->dqm->sdma_queue_count; + + /* check if there is over subscription + * Note: the arbitration between the number of VMIDs and + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). 
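+ * Illustration with hypothetical numbers: if max_proc_per_quantum is 1
+ * and get_queues_num() reports 24 HQD slots, then either a second
+ * process or a 25th compute queue makes the runlist over subscribed,
+ * and the IB is sized for one extra chained runlist packet.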
+ */ + *over_subscription = false; + + if (dev->max_proc_per_quantum > 1) + max_proc_per_quantum = dev->max_proc_per_quantum; + + if ((process_count > max_proc_per_quantum) || + compute_queue_count > get_queues_num(pm->dqm)) { + *over_subscription = true; + pr_debug("Over subscribed runlist\n"); + } + + map_queue_size = pm->pmf->map_queues_size; + /* calculate run list ib allocation size */ + *rlib_size = process_count * pm->pmf->map_process_size + + queue_count * map_queue_size; + + /* + * Increase the allocation size in case we need a chained run list + * when over subscription + */ + if (*over_subscription) + *rlib_size += pm->pmf->runlist_size; + + pr_debug("runlist ib size %d\n", *rlib_size); +} + +static int pm_allocate_runlist_ib(struct packet_manager *pm, + unsigned int **rl_buffer, + uint64_t *rl_gpu_buffer, + unsigned int *rl_buffer_size, + bool *is_over_subscription) +{ + int retval; + + if (WARN_ON(pm->allocated)) + return -EINVAL; + + pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); + + mutex_lock(&pm->lock); + + retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, + &pm->ib_buffer_obj); + + if (retval) { + pr_err("Failed to allocate runlist IB\n"); + goto out; + } + + *(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr; + *rl_gpu_buffer = pm->ib_buffer_obj->gpu_addr; + + memset(*rl_buffer, 0, *rl_buffer_size); + pm->allocated = true; + +out: + mutex_unlock(&pm->lock); + return retval; +} + +static int pm_create_runlist_ib(struct packet_manager *pm, + struct list_head *queues, + uint64_t *rl_gpu_addr, + size_t *rl_size_bytes) +{ + unsigned int alloc_size_bytes; + unsigned int *rl_buffer, rl_wptr, i; + int retval, proccesses_mapped; + struct device_process_node *cur; + struct qcm_process_device *qpd; + struct queue *q; + struct kernel_queue *kq; + bool is_over_subscription; + + rl_wptr = retval = proccesses_mapped = 0; + + retval = pm_allocate_runlist_ib(pm, &rl_buffer, rl_gpu_addr, + &alloc_size_bytes, &is_over_subscription); + if (retval) + return retval; + + *rl_size_bytes = alloc_size_bytes; + pm->ib_size_bytes = alloc_size_bytes; + + pr_debug("Building runlist ib process count: %d queues count %d\n", + pm->dqm->processes_count, pm->dqm->queue_count); + + /* build the run list ib packet */ + list_for_each_entry(cur, queues, list) { + qpd = cur->qpd; + /* build map process packet */ + if (proccesses_mapped >= pm->dqm->processes_count) { + pr_debug("Not enough space left in runlist IB\n"); + pm_release_ib(pm); + return -ENOMEM; + } + + retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd); + if (retval) + return retval; + + proccesses_mapped++; + inc_wptr(&rl_wptr, pm->pmf->map_process_size, + alloc_size_bytes); + + list_for_each_entry(kq, &qpd->priv_queue_list, list) { + if (!kq->queue->properties.is_active) + continue; + + pr_debug("static_queue, mapping kernel q %d, is debug status %d\n", + kq->queue->queue, qpd->is_debug); + + retval = pm->pmf->map_queues(pm, + &rl_buffer[rl_wptr], + kq->queue, + qpd->is_debug); + if (retval) + return retval; + + inc_wptr(&rl_wptr, + pm->pmf->map_queues_size, + alloc_size_bytes); + } + + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_active) + continue; + + pr_debug("static_queue, mapping user queue %d, is debug status %d\n", + q->queue, qpd->is_debug); + + retval = pm->pmf->map_queues(pm, + &rl_buffer[rl_wptr], + q, + qpd->is_debug); + + if (retval) + return retval; + + inc_wptr(&rl_wptr, + pm->pmf->map_queues_size, + alloc_size_bytes); + } + } + + pr_debug("Finished map process and queues 
to runlist\n"); + + if (is_over_subscription) + retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr], + *rl_gpu_addr, + alloc_size_bytes / sizeof(uint32_t), + true); + + for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++) + pr_debug("0x%2X ", rl_buffer[i]); + pr_debug("\n"); + + return retval; +} + +int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) +{ + switch (dqm->dev->device_info->asic_family) { + case CHIP_KAVERI: + case CHIP_HAWAII: + /* PM4 packet structures on CIK are the same as on VI */ + case CHIP_CARRIZO: + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + pm->pmf = &kfd_vi_pm_funcs; + break; + case CHIP_VEGA10: + case CHIP_RAVEN: + pm->pmf = &kfd_v9_pm_funcs; + break; + default: + WARN(1, "Unexpected ASIC family %u", + dqm->dev->device_info->asic_family); + return -EINVAL; + } + + pm->dqm = dqm; + mutex_init(&pm->lock); + pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ); + if (!pm->priv_queue) { + mutex_destroy(&pm->lock); + return -ENOMEM; + } + pm->allocated = false; + + return 0; +} + +void pm_uninit(struct packet_manager *pm) +{ + mutex_destroy(&pm->lock); + kernel_queue_uninit(pm->priv_queue); +} + +int pm_send_set_resources(struct packet_manager *pm, + struct scheduling_resources *res) +{ + uint32_t *buffer, size; + int retval = 0; + + size = pm->pmf->set_resources_size; + mutex_lock(&pm->lock); + pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, + size / sizeof(uint32_t), + (unsigned int **)&buffer); + if (!buffer) { + pr_err("Failed to allocate buffer on kernel queue\n"); + retval = -ENOMEM; + goto out; + } + + retval = pm->pmf->set_resources(pm, buffer, res); + if (!retval) + pm->priv_queue->ops.submit_packet(pm->priv_queue); + else + pm->priv_queue->ops.rollback_packet(pm->priv_queue); + +out: + mutex_unlock(&pm->lock); + + return retval; +} + +int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) +{ + uint64_t rl_gpu_ib_addr; + uint32_t *rl_buffer; + size_t rl_ib_size, packet_size_dwords; + int retval; + + retval = pm_create_runlist_ib(pm, dqm_queues, &rl_gpu_ib_addr, + &rl_ib_size); + if (retval) + goto fail_create_runlist_ib; + + pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr); + + packet_size_dwords = pm->pmf->runlist_size / sizeof(uint32_t); + mutex_lock(&pm->lock); + + retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, + packet_size_dwords, &rl_buffer); + if (retval) + goto fail_acquire_packet_buffer; + + retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr, + rl_ib_size / sizeof(uint32_t), false); + if (retval) + goto fail_create_runlist; + + pm->priv_queue->ops.submit_packet(pm->priv_queue); + + mutex_unlock(&pm->lock); + + return retval; + +fail_create_runlist: + pm->priv_queue->ops.rollback_packet(pm->priv_queue); +fail_acquire_packet_buffer: + mutex_unlock(&pm->lock); +fail_create_runlist_ib: + pm_release_ib(pm); + return retval; +} + +int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + uint32_t fence_value) +{ + uint32_t *buffer, size; + int retval = 0; + + if (WARN_ON(!fence_address)) + return -EFAULT; + + size = pm->pmf->query_status_size; + mutex_lock(&pm->lock); + pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, + size / sizeof(uint32_t), (unsigned int **)&buffer); + if (!buffer) { + pr_err("Failed to allocate buffer on kernel queue\n"); + retval = -ENOMEM; + goto out; + } + + retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value); + if (!retval) + 
pm->priv_queue->ops.submit_packet(pm->priv_queue); + else + pm->priv_queue->ops.rollback_packet(pm->priv_queue); + +out: + mutex_unlock(&pm->lock); + return retval; +} + +int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) +{ + uint32_t *buffer, size; + int retval = 0; + + size = pm->pmf->unmap_queues_size; + mutex_lock(&pm->lock); + pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, + size / sizeof(uint32_t), (unsigned int **)&buffer); + if (!buffer) { + pr_err("Failed to allocate buffer on kernel queue\n"); + retval = -ENOMEM; + goto out; + } + + retval = pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param, + reset, sdma_engine); + if (!retval) + pm->priv_queue->ops.submit_packet(pm->priv_queue); + else + pm->priv_queue->ops.rollback_packet(pm->priv_queue); + +out: + mutex_unlock(&pm->lock); + return retval; +} + +void pm_release_ib(struct packet_manager *pm) +{ + mutex_lock(&pm->lock); + if (pm->allocated) { + kfd_gtt_sa_free(pm->dqm->dev, pm->ib_buffer_obj); + pm->allocated = false; + } + mutex_unlock(&pm->lock); +} + +#if defined(CONFIG_DEBUG_FS) + +int pm_debugfs_runlist(struct seq_file *m, void *data) +{ + struct packet_manager *pm = data; + + mutex_lock(&pm->lock); + + if (!pm->allocated) { + seq_puts(m, " No active runlist\n"); + goto out; + } + + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false); + +out: + mutex_unlock(&pm->lock); + return 0; +} + +int pm_debugfs_hang_hws(struct packet_manager *pm) +{ + uint32_t *buffer, size; + int r = 0; + + size = pm->pmf->query_status_size; + mutex_lock(&pm->lock); + pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, + size / sizeof(uint32_t), (unsigned int **)&buffer); + if (!buffer) { + pr_err("Failed to allocate buffer on kernel queue\n"); + r = -ENOMEM; + goto out; + } + memset(buffer, 0x55, size); + pm->priv_queue->ops.submit_packet(pm->priv_queue); + + pr_info("Submitting %x %x %x %x %x %x %x to HIQ to hang the HWS.", + buffer[0], buffer[1], buffer[2], buffer[3], + buffer[4], buffer[5], buffer[6]); +out: + mutex_unlock(&pm->lock); + return r; +} + + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c new file mode 100644 index 000000000..15fff4420 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c @@ -0,0 +1,83 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/types.h> +#include "kfd_priv.h" + +static unsigned int pasid_bits = 16; +static const struct kfd2kgd_calls *kfd2kgd; + +bool kfd_set_pasid_limit(unsigned int new_limit) +{ + if (new_limit < 2) + return false; + + if (new_limit < (1U << pasid_bits)) { + if (kfd2kgd) + /* We've already allocated user PASIDs, too late to + * change the limit + */ + return false; + + while (new_limit < (1U << pasid_bits)) + pasid_bits--; + } + + return true; +} + +unsigned int kfd_get_pasid_limit(void) +{ + return 1U << pasid_bits; +} + +unsigned int kfd_pasid_alloc(void) +{ + int r; + + /* Find the first best KFD device for calling KGD */ + if (!kfd2kgd) { + struct kfd_dev *dev = NULL; + unsigned int i = 0; + + while ((kfd_topology_enum_kfd_devices(i, &dev)) == 0) { + if (dev && dev->kfd2kgd) { + kfd2kgd = dev->kfd2kgd; + break; + } + i++; + } + + if (!kfd2kgd) + return false; + } + + r = kfd2kgd->alloc_pasid(pasid_bits); + + return r > 0 ? r : 0; +} + +void kfd_pasid_free(unsigned int pasid) +{ + if (kfd2kgd) + kfd2kgd->free_pasid(pasid); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h new file mode 100644 index 000000000..e50f73d25 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h @@ -0,0 +1,155 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_PM4_HEADERS_H_ +#define KFD_PM4_HEADERS_H_ + +#ifndef PM4_MES_HEADER_DEFINED +#define PM4_MES_HEADER_DEFINED +union PM4_MES_TYPE_3_HEADER { + struct { + /* reserved */ + uint32_t reserved1:8; + /* IT opcode */ + uint32_t opcode:8; + /* number of DWORDs - 1 in the information body */ + uint32_t count:14; + /* packet identifier. 
It should be 3 for type 3 packets */ + uint32_t type:2; + }; + uint32_t u32all; +}; +#endif /* PM4_MES_HEADER_DEFINED */ + + +/*--------------------MES_MAP_PROCESS-------------------- */ + +#ifndef PM4_MES_MAP_PROCESS_DEFINED +#define PM4_MES_MAP_PROCESS_DEFINED + +struct pm4_map_process { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:8; + uint32_t diq_enable:1; + uint32_t process_quantum:7; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t page_table_base:28; + uint32_t reserved3:4; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t sh_mem_bases; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; + uint32_t sh_mem_config; + uint32_t gds_addr_lo; + uint32_t gds_addr_hi; + + union { + struct { + uint32_t num_gws:6; + uint32_t reserved4:2; + uint32_t num_oac:4; + uint32_t reserved5:4; + uint32_t gds_size:6; + uint32_t num_queues:10; + } bitfields10; + uint32_t ordinal10; + }; + +}; +#endif + +#ifndef PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH +#define PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH + +struct pm4_map_process_scratch_kv { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:8; + uint32_t diq_enable:1; + uint32_t process_quantum:7; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t page_table_base:28; + uint32_t reserved2:4; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t reserved3; + uint32_t sh_mem_bases; + uint32_t sh_mem_config; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; + uint32_t sh_hidden_private_base_vmid; + uint32_t reserved4; + uint32_t reserved5; + uint32_t gds_addr_lo; + uint32_t gds_addr_hi; + + union { + struct { + uint32_t num_gws:6; + uint32_t reserved6:2; + uint32_t num_oac:4; + uint32_t reserved7:4; + uint32_t gds_size:6; + uint32_t num_queues:10; + } bitfields14; + uint32_t ordinal14; + }; + + uint32_t completion_signal_lo32; +uint32_t completion_signal_hi32; +}; +#endif + +enum { + CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 +}; + +#endif /* KFD_PM4_HEADERS_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h new file mode 100644 index 000000000..f2bcf5c09 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h @@ -0,0 +1,583 @@ +/* + * Copyright 2016 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef F32_MES_PM4_PACKETS_H +#define F32_MES_PM4_PACKETS_H + +#ifndef PM4_MES_HEADER_DEFINED +#define PM4_MES_HEADER_DEFINED +union PM4_MES_TYPE_3_HEADER { + struct { + uint32_t reserved1 : 8; /* < reserved */ + uint32_t opcode : 8; /* < IT opcode */ + uint32_t count : 14;/* < number of DWORDs - 1 in the + * information body. + */ + uint32_t type : 2; /* < packet identifier. + * It should be 3 for type 3 packets + */ + }; + uint32_t u32All; +}; +#endif /* PM4_MES_HEADER_DEFINED */ + +/*--------------------MES_SET_RESOURCES--------------------*/ + +#ifndef PM4_MES_SET_RESOURCES_DEFINED +#define PM4_MES_SET_RESOURCES_DEFINED +enum mes_set_resources_queue_type_enum { + queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, + queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, + queue_type__mes_set_resources__hsa_debug_interface_queue = 4 +}; + + +struct pm4_mes_set_resources { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t vmid_mask:16; + uint32_t unmap_latency:8; + uint32_t reserved1:5; + enum mes_set_resources_queue_type_enum queue_type:3; + } bitfields2; + uint32_t ordinal2; + }; + + uint32_t queue_mask_lo; + uint32_t queue_mask_hi; + uint32_t gws_mask_lo; + uint32_t gws_mask_hi; + + union { + struct { + uint32_t oac_mask:16; + uint32_t reserved2:16; + } bitfields7; + uint32_t ordinal7; + }; + + union { + struct { + uint32_t gds_heap_base:6; + uint32_t reserved3:5; + uint32_t gds_heap_size:6; + uint32_t reserved4:15; + } bitfields8; + uint32_t ordinal8; + }; + +}; +#endif + +/*--------------------MES_RUN_LIST--------------------*/ + +#ifndef PM4_MES_RUN_LIST_DEFINED +#define PM4_MES_RUN_LIST_DEFINED + +struct pm4_mes_runlist { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t reserved1:2; + uint32_t ib_base_lo:30; + } bitfields2; + uint32_t ordinal2; + }; + + uint32_t ib_base_hi; + + union { + struct { + uint32_t ib_size:20; + uint32_t chain:1; + uint32_t offload_polling:1; + uint32_t reserved2:1; + uint32_t valid:1; + uint32_t process_cnt:4; + uint32_t reserved3:4; + } bitfields4; + uint32_t ordinal4; + }; + +}; +#endif + +/*--------------------MES_MAP_PROCESS--------------------*/ + +#ifndef PM4_MES_MAP_PROCESS_DEFINED +#define PM4_MES_MAP_PROCESS_DEFINED + +struct pm4_mes_map_process { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:8; + uint32_t diq_enable:1; + uint32_t process_quantum:7; + } bitfields2; + uint32_t ordinal2; + }; + + uint32_t vm_context_page_table_base_addr_lo32; + + uint32_t vm_context_page_table_base_addr_hi32; + + uint32_t sh_mem_bases; + + uint32_t sh_mem_config; + + uint32_t sq_shader_tba_lo; + + uint32_t sq_shader_tba_hi; + + uint32_t sq_shader_tma_lo; + + uint32_t sq_shader_tma_hi; + + uint32_t reserved6; + + uint32_t gds_addr_lo; + + uint32_t gds_addr_hi; + + union { + struct { + uint32_t num_gws:6; + uint32_t reserved7:1; + uint32_t sdma_enable:1; + uint32_t num_oac:4; + uint32_t reserved8:4; + uint32_t gds_size:6; + uint32_t num_queues:10; + } bitfields14; + uint32_t ordinal14; + }; + + uint32_t 
completion_signal_lo; + + uint32_t completion_signal_hi; + +}; + +#endif + +/*--------------------MES_MAP_PROCESS_VM--------------------*/ + +#ifndef PM4_MES_MAP_PROCESS_VM_DEFINED +#define PM4_MES_MAP_PROCESS_VM_DEFINED + +struct PM4_MES_MAP_PROCESS_VM { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + uint32_t reserved1; + + uint32_t vm_context_cntl; + + uint32_t reserved2; + + uint32_t vm_context_page_table_end_addr_lo32; + + uint32_t vm_context_page_table_end_addr_hi32; + + uint32_t vm_context_page_table_start_addr_lo32; + + uint32_t vm_context_page_table_start_addr_hi32; + + uint32_t reserved3; + + uint32_t reserved4; + + uint32_t reserved5; + + uint32_t reserved6; + + uint32_t reserved7; + + uint32_t reserved8; + + uint32_t completion_signal_lo32; + + uint32_t completion_signal_hi32; + +}; +#endif + +/*--------------------MES_MAP_QUEUES--------------------*/ + +#ifndef PM4_MES_MAP_QUEUES_VI_DEFINED +#define PM4_MES_MAP_QUEUES_VI_DEFINED +enum mes_map_queues_queue_sel_enum { + queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0, +queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1 +}; + +enum mes_map_queues_queue_type_enum { + queue_type__mes_map_queues__normal_compute_vi = 0, + queue_type__mes_map_queues__debug_interface_queue_vi = 1, + queue_type__mes_map_queues__normal_latency_static_queue_vi = 2, +queue_type__mes_map_queues__low_latency_static_queue_vi = 3 +}; + +enum mes_map_queues_alloc_format_enum { + alloc_format__mes_map_queues__one_per_pipe_vi = 0, +alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 +}; + +enum mes_map_queues_engine_sel_enum { + engine_sel__mes_map_queues__compute_vi = 0, + engine_sel__mes_map_queues__sdma0_vi = 2, + engine_sel__mes_map_queues__sdma1_vi = 3 +}; + + +struct pm4_mes_map_queues { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t reserved1:4; + enum mes_map_queues_queue_sel_enum queue_sel:2; + uint32_t reserved2:15; + enum mes_map_queues_queue_type_enum queue_type:3; + enum mes_map_queues_alloc_format_enum alloc_format:2; + enum mes_map_queues_engine_sel_enum engine_sel:3; + uint32_t num_queues:3; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t reserved3:1; + uint32_t check_disable:1; + uint32_t doorbell_offset:26; + uint32_t reserved4:4; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t mqd_addr_lo; + uint32_t mqd_addr_hi; + uint32_t wptr_addr_lo; + uint32_t wptr_addr_hi; +}; +#endif + +/*--------------------MES_QUERY_STATUS--------------------*/ + +#ifndef PM4_MES_QUERY_STATUS_DEFINED +#define PM4_MES_QUERY_STATUS_DEFINED +enum mes_query_status_interrupt_sel_enum { + interrupt_sel__mes_query_status__completion_status = 0, + interrupt_sel__mes_query_status__process_status = 1, + interrupt_sel__mes_query_status__queue_status = 2 +}; + +enum mes_query_status_command_enum { + command__mes_query_status__interrupt_only = 0, + command__mes_query_status__fence_only_immediate = 1, + command__mes_query_status__fence_only_after_write_ack = 2, + command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 +}; + +enum mes_query_status_engine_sel_enum { + engine_sel__mes_query_status__compute = 0, + engine_sel__mes_query_status__sdma0_queue = 2, + engine_sel__mes_query_status__sdma1_queue = 3 +}; + +struct pm4_mes_query_status { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t context_id:28; + enum 
mes_query_status_interrupt_sel_enum interrupt_sel:2; + enum mes_query_status_command_enum command:2; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:16; + } bitfields3a; + struct { + uint32_t reserved2:2; + uint32_t doorbell_offset:26; + enum mes_query_status_engine_sel_enum engine_sel:3; + uint32_t reserved3:1; + } bitfields3b; + uint32_t ordinal3; + }; + + uint32_t addr_lo; + uint32_t addr_hi; + uint32_t data_lo; + uint32_t data_hi; +}; +#endif + +/*--------------------MES_UNMAP_QUEUES--------------------*/ + +#ifndef PM4_MES_UNMAP_QUEUES_DEFINED +#define PM4_MES_UNMAP_QUEUES_DEFINED +enum mes_unmap_queues_action_enum { + action__mes_unmap_queues__preempt_queues = 0, + action__mes_unmap_queues__reset_queues = 1, + action__mes_unmap_queues__disable_process_queues = 2, + action__mes_unmap_queues__reserved = 3 +}; + +enum mes_unmap_queues_queue_sel_enum { + queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, + queue_sel__mes_unmap_queues__unmap_all_queues = 2, + queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3 +}; + +enum mes_unmap_queues_engine_sel_enum { + engine_sel__mes_unmap_queues__compute = 0, + engine_sel__mes_unmap_queues__sdma0 = 2, + engine_sel__mes_unmap_queues__sdmal = 3 +}; + +struct pm4_mes_unmap_queues { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + enum mes_unmap_queues_action_enum action:2; + uint32_t reserved1:2; + enum mes_unmap_queues_queue_sel_enum queue_sel:2; + uint32_t reserved2:20; + enum mes_unmap_queues_engine_sel_enum engine_sel:3; + uint32_t num_queues:3; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved3:16; + } bitfields3a; + struct { + uint32_t reserved4:2; + uint32_t doorbell_offset0:26; + int32_t reserved5:4; + } bitfields3b; + uint32_t ordinal3; + }; + + union { + struct { + uint32_t reserved6:2; + uint32_t doorbell_offset1:26; + uint32_t reserved7:4; + } bitfields4; + uint32_t ordinal4; + }; + + union { + struct { + uint32_t reserved8:2; + uint32_t doorbell_offset2:26; + uint32_t reserved9:4; + } bitfields5; + uint32_t ordinal5; + }; + + union { + struct { + uint32_t reserved10:2; + uint32_t doorbell_offset3:26; + uint32_t reserved11:4; + } bitfields6; + uint32_t ordinal6; + }; +}; +#endif + +#ifndef PM4_MEC_RELEASE_MEM_DEFINED +#define PM4_MEC_RELEASE_MEM_DEFINED + +enum mec_release_mem_event_index_enum { + event_index__mec_release_mem__end_of_pipe = 5, + event_index__mec_release_mem__shader_done = 6 +}; + +enum mec_release_mem_cache_policy_enum { + cache_policy__mec_release_mem__lru = 0, + cache_policy__mec_release_mem__stream = 1 +}; + +enum mec_release_mem_pq_exe_status_enum { + pq_exe_status__mec_release_mem__default = 0, + pq_exe_status__mec_release_mem__phase_update = 1 +}; + +enum mec_release_mem_dst_sel_enum { + dst_sel__mec_release_mem__memory_controller = 0, + dst_sel__mec_release_mem__tc_l2 = 1, + dst_sel__mec_release_mem__queue_write_pointer_register = 2, + dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3 +}; + +enum mec_release_mem_int_sel_enum { + int_sel__mec_release_mem__none = 0, + int_sel__mec_release_mem__send_interrupt_only = 1, + int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2, + int_sel__mec_release_mem__send_data_after_write_confirm = 3, + int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4, + 
int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5, + int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6 +}; + +enum mec_release_mem_data_sel_enum { + data_sel__mec_release_mem__none = 0, + data_sel__mec_release_mem__send_32_bit_low = 1, + data_sel__mec_release_mem__send_64_bit_data = 2, + data_sel__mec_release_mem__send_gpu_clock_counter = 3, + data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4, + data_sel__mec_release_mem__store_gds_data_to_memory = 5 +}; + +struct pm4_mec_release_mem { + union { + union PM4_MES_TYPE_3_HEADER header; /*header */ + unsigned int ordinal1; + }; + + union { + struct { + unsigned int event_type:6; + unsigned int reserved1:2; + enum mec_release_mem_event_index_enum event_index:4; + unsigned int tcl1_vol_action_ena:1; + unsigned int tc_vol_action_ena:1; + unsigned int reserved2:1; + unsigned int tc_wb_action_ena:1; + unsigned int tcl1_action_ena:1; + unsigned int tc_action_ena:1; + uint32_t reserved3:1; + uint32_t tc_nc_action_ena:1; + uint32_t tc_wc_action_ena:1; + uint32_t tc_md_action_ena:1; + uint32_t reserved4:3; + enum mec_release_mem_cache_policy_enum cache_policy:2; + uint32_t reserved5:2; + enum mec_release_mem_pq_exe_status_enum pq_exe_status:1; + uint32_t reserved6:2; + } bitfields2; + unsigned int ordinal2; + }; + + union { + struct { + uint32_t reserved7:16; + enum mec_release_mem_dst_sel_enum dst_sel:2; + uint32_t reserved8:6; + enum mec_release_mem_int_sel_enum int_sel:3; + uint32_t reserved9:2; + enum mec_release_mem_data_sel_enum data_sel:3; + } bitfields3; + unsigned int ordinal3; + }; + + union { + struct { + uint32_t reserved10:2; + unsigned int address_lo_32b:30; + } bitfields4; + struct { + uint32_t reserved11:3; + uint32_t address_lo_64b:29; + } bitfields4b; + uint32_t reserved12; + unsigned int ordinal4; + }; + + union { + uint32_t address_hi; + uint32_t reserved13; + uint32_t ordinal5; + }; + + union { + uint32_t data_lo; + uint32_t cmp_data_lo; + struct { + uint32_t dw_offset:16; + uint32_t num_dwords:16; + } bitfields6c; + uint32_t reserved14; + uint32_t ordinal6; + }; + + union { + uint32_t data_hi; + uint32_t cmp_data_hi; + uint32_t reserved15; + uint32_t reserved16; + uint32_t ordinal7; + }; + + uint32_t int_ctxid; + +}; + +#endif + +enum { + CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 +}; +#endif + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h new file mode 100644 index 000000000..a0ff34878 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h @@ -0,0 +1,290 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_PM4_HEADERS_DIQ_H_ +#define KFD_PM4_HEADERS_DIQ_H_ + +/*--------------------_INDIRECT_BUFFER-------------------- */ + +#ifndef _PM4__INDIRECT_BUFFER_DEFINED +#define _PM4__INDIRECT_BUFFER_DEFINED +enum _INDIRECT_BUFFER_cache_policy_enum { + cache_policy___indirect_buffer__lru = 0, + cache_policy___indirect_buffer__stream = 1, + cache_policy___indirect_buffer__bypass = 2 +}; + +enum { + IT_INDIRECT_BUFFER_PASID = 0x5C +}; + +struct pm4__indirect_buffer_pasid { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + unsigned int ordinal1; + }; + + union { + struct { + unsigned int reserved1:2; + unsigned int ib_base_lo:30; + } bitfields2; + unsigned int ordinal2; + }; + + union { + struct { + unsigned int ib_base_hi:16; + unsigned int reserved2:16; + } bitfields3; + unsigned int ordinal3; + }; + + union { + unsigned int control; + unsigned int ordinal4; + }; + + union { + struct { + unsigned int pasid:10; + unsigned int reserved4:22; + } bitfields5; + unsigned int ordinal5; + }; + +}; + +#endif + +/*--------------------_RELEASE_MEM-------------------- */ + +#ifndef _PM4__RELEASE_MEM_DEFINED +#define _PM4__RELEASE_MEM_DEFINED +enum _RELEASE_MEM_event_index_enum { + event_index___release_mem__end_of_pipe = 5, + event_index___release_mem__shader_done = 6 +}; + +enum _RELEASE_MEM_cache_policy_enum { + cache_policy___release_mem__lru = 0, + cache_policy___release_mem__stream = 1, + cache_policy___release_mem__bypass = 2 +}; + +enum _RELEASE_MEM_dst_sel_enum { + dst_sel___release_mem__memory_controller = 0, + dst_sel___release_mem__tc_l2 = 1, + dst_sel___release_mem__queue_write_pointer_register = 2, + dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 +}; + +enum _RELEASE_MEM_int_sel_enum { + int_sel___release_mem__none = 0, + int_sel___release_mem__send_interrupt_only = 1, + int_sel___release_mem__send_interrupt_after_write_confirm = 2, + int_sel___release_mem__send_data_after_write_confirm = 3 +}; + +enum _RELEASE_MEM_data_sel_enum { + data_sel___release_mem__none = 0, + data_sel___release_mem__send_32_bit_low = 1, + data_sel___release_mem__send_64_bit_data = 2, + data_sel___release_mem__send_gpu_clock_counter = 3, + data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, + data_sel___release_mem__store_gds_data_to_memory = 5 +}; + +struct pm4__release_mem { + union { + union PM4_MES_TYPE_3_HEADER header; /*header */ + unsigned int ordinal1; + }; + + union { + struct { + unsigned int event_type:6; + unsigned int reserved1:2; + enum _RELEASE_MEM_event_index_enum event_index:4; + unsigned int tcl1_vol_action_ena:1; + unsigned int tc_vol_action_ena:1; + unsigned int reserved2:1; + unsigned int tc_wb_action_ena:1; + unsigned int tcl1_action_ena:1; + unsigned int tc_action_ena:1; + unsigned int reserved3:6; + unsigned int atc:1; + enum _RELEASE_MEM_cache_policy_enum cache_policy:2; + unsigned int reserved4:5; + } bitfields2; + unsigned int ordinal2; + }; + + union { + struct { + unsigned int reserved5:16; + enum _RELEASE_MEM_dst_sel_enum dst_sel:2; + unsigned int reserved6:6; + enum _RELEASE_MEM_int_sel_enum int_sel:3; + unsigned int reserved7:2; + enum _RELEASE_MEM_data_sel_enum data_sel:3; + } bitfields3; + unsigned int ordinal3; + }; + + union { + struct { + unsigned int reserved8:2; 
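+			/* two views of the low address word: bitfields4 holds a
+			 * dword-aligned address (2 reserved bits), bitfields5 a
+			 * qword-aligned one for 64-bit data (3 reserved bits)
+			 */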
+ unsigned int address_lo_32b:30; + } bitfields4; + struct { + unsigned int reserved9:3; + unsigned int address_lo_64b:29; + } bitfields5; + unsigned int ordinal4; + }; + + unsigned int address_hi; + + unsigned int data_lo; + + unsigned int data_hi; + +}; +#endif + + +/*--------------------_SET_CONFIG_REG-------------------- */ + +#ifndef _PM4__SET_CONFIG_REG_DEFINED +#define _PM4__SET_CONFIG_REG_DEFINED + +struct pm4__set_config_reg { + union { + union PM4_MES_TYPE_3_HEADER header; /*header */ + unsigned int ordinal1; + }; + + union { + struct { + unsigned int reg_offset:16; + unsigned int reserved1:7; + unsigned int vmid_shift:5; + unsigned int insert_vmid:1; + unsigned int reserved2:3; + } bitfields2; + unsigned int ordinal2; + }; + + unsigned int reg_data[1]; /*1..N of these fields */ + +}; +#endif + +/*--------------------_WAIT_REG_MEM-------------------- */ + +#ifndef _PM4__WAIT_REG_MEM_DEFINED +#define _PM4__WAIT_REG_MEM_DEFINED +enum _WAIT_REG_MEM_function_enum { + function___wait_reg_mem__always_pass = 0, + function___wait_reg_mem__less_than_ref_value = 1, + function___wait_reg_mem__less_than_equal_to_the_ref_value = 2, + function___wait_reg_mem__equal_to_the_reference_value = 3, + function___wait_reg_mem__not_equal_reference_value = 4, + function___wait_reg_mem__greater_than_or_equal_reference_value = 5, + function___wait_reg_mem__greater_than_reference_value = 6, + function___wait_reg_mem__reserved = 7 +}; + +enum _WAIT_REG_MEM_mem_space_enum { + mem_space___wait_reg_mem__register_space = 0, + mem_space___wait_reg_mem__memory_space = 1 +}; + +enum _WAIT_REG_MEM_operation_enum { + operation___wait_reg_mem__wait_reg_mem = 0, + operation___wait_reg_mem__wr_wait_wr_reg = 1 +}; + +struct pm4__wait_reg_mem { + union { + union PM4_MES_TYPE_3_HEADER header; /*header */ + unsigned int ordinal1; + }; + + union { + struct { + enum _WAIT_REG_MEM_function_enum function:3; + unsigned int reserved1:1; + enum _WAIT_REG_MEM_mem_space_enum mem_space:2; + enum _WAIT_REG_MEM_operation_enum operation:2; + unsigned int reserved2:24; + } bitfields2; + unsigned int ordinal2; + }; + + union { + struct { + unsigned int reserved3:2; + unsigned int memory_poll_addr_lo:30; + } bitfields3; + struct { + unsigned int register_poll_addr:16; + unsigned int reserved4:16; + } bitfields4; + struct { + unsigned int register_write_addr:16; + unsigned int reserved5:16; + } bitfields5; + unsigned int ordinal3; + }; + + union { + struct { + unsigned int poll_address_hi:16; + unsigned int reserved6:16; + } bitfields6; + struct { + unsigned int register_write_addr:16; + unsigned int reserved7:16; + } bitfields7; + unsigned int ordinal4; + }; + + unsigned int reference; + + unsigned int mask; + + union { + struct { + unsigned int poll_interval:16; + unsigned int reserved8:16; + } bitfields8; + unsigned int ordinal7; + }; + +}; +#endif + + +#endif /* KFD_PM4_HEADERS_DIQ_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h new file mode 100644 index 000000000..7c8d9b357 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h @@ -0,0 +1,510 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef F32_MES_PM4_PACKETS_H +#define F32_MES_PM4_PACKETS_H + +#ifndef PM4_MES_HEADER_DEFINED +#define PM4_MES_HEADER_DEFINED +union PM4_MES_TYPE_3_HEADER { + struct { + uint32_t reserved1 : 8; /* < reserved */ + uint32_t opcode : 8; /* < IT opcode */ + uint32_t count : 14;/* < Number of DWORDS - 1 in the + * information body + */ + uint32_t type : 2; /* < packet identifier + * It should be 3 for type 3 packets + */ + }; + uint32_t u32All; +}; +#endif /* PM4_MES_HEADER_DEFINED */ + +/*--------------------MES_SET_RESOURCES--------------------*/ + +#ifndef PM4_MES_SET_RESOURCES_DEFINED +#define PM4_MES_SET_RESOURCES_DEFINED +enum mes_set_resources_queue_type_enum { + queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, + queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, + queue_type__mes_set_resources__hsa_debug_interface_queue = 4 +}; + + +struct pm4_mes_set_resources { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t vmid_mask:16; + uint32_t unmap_latency:8; + uint32_t reserved1:5; + enum mes_set_resources_queue_type_enum queue_type:3; + } bitfields2; + uint32_t ordinal2; + }; + + uint32_t queue_mask_lo; + uint32_t queue_mask_hi; + uint32_t gws_mask_lo; + uint32_t gws_mask_hi; + + union { + struct { + uint32_t oac_mask:16; + uint32_t reserved2:16; + } bitfields7; + uint32_t ordinal7; + }; + + union { + struct { + uint32_t gds_heap_base:6; + uint32_t reserved3:5; + uint32_t gds_heap_size:6; + uint32_t reserved4:15; + } bitfields8; + uint32_t ordinal8; + }; + +}; +#endif + +/*--------------------MES_RUN_LIST--------------------*/ + +#ifndef PM4_MES_RUN_LIST_DEFINED +#define PM4_MES_RUN_LIST_DEFINED + +struct pm4_mes_runlist { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t reserved1:2; + uint32_t ib_base_lo:30; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t ib_base_hi:16; + uint32_t reserved2:16; + } bitfields3; + uint32_t ordinal3; + }; + + union { + struct { + uint32_t ib_size:20; + uint32_t chain:1; + uint32_t offload_polling:1; + uint32_t reserved2:1; + uint32_t valid:1; + uint32_t process_cnt:4; + uint32_t reserved3:4; + } bitfields4; + uint32_t ordinal4; + }; + +}; +#endif + +/*--------------------MES_MAP_PROCESS--------------------*/ + +#ifndef PM4_MES_MAP_PROCESS_DEFINED +#define 
PM4_MES_MAP_PROCESS_DEFINED + +struct pm4_mes_map_process { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:8; + uint32_t diq_enable:1; + uint32_t process_quantum:7; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t page_table_base:28; + uint32_t reserved3:4; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t reserved; + + uint32_t sh_mem_bases; + uint32_t sh_mem_config; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; + + uint32_t sh_hidden_private_base_vmid; + + uint32_t reserved2; + uint32_t reserved3; + + uint32_t gds_addr_lo; + uint32_t gds_addr_hi; + + union { + struct { + uint32_t num_gws:6; + uint32_t reserved4:2; + uint32_t num_oac:4; + uint32_t reserved5:4; + uint32_t gds_size:6; + uint32_t num_queues:10; + } bitfields10; + uint32_t ordinal10; + }; + + uint32_t completion_signal_lo; + uint32_t completion_signal_hi; + +}; + +#endif + +/*--------------------MES_MAP_QUEUES--------------------*/ + +#ifndef PM4_MES_MAP_QUEUES_VI_DEFINED +#define PM4_MES_MAP_QUEUES_VI_DEFINED +enum mes_map_queues_queue_sel_vi_enum { + queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0, +queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1 +}; + +enum mes_map_queues_queue_type_vi_enum { + queue_type__mes_map_queues__normal_compute_vi = 0, + queue_type__mes_map_queues__debug_interface_queue_vi = 1, + queue_type__mes_map_queues__normal_latency_static_queue_vi = 2, +queue_type__mes_map_queues__low_latency_static_queue_vi = 3 +}; + +enum mes_map_queues_alloc_format_vi_enum { + alloc_format__mes_map_queues__one_per_pipe_vi = 0, +alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 +}; + +enum mes_map_queues_engine_sel_vi_enum { + engine_sel__mes_map_queues__compute_vi = 0, + engine_sel__mes_map_queues__sdma0_vi = 2, + engine_sel__mes_map_queues__sdma1_vi = 3 +}; + + +struct pm4_mes_map_queues { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t reserved1:4; + enum mes_map_queues_queue_sel_vi_enum queue_sel:2; + uint32_t reserved2:15; + enum mes_map_queues_queue_type_vi_enum queue_type:3; + enum mes_map_queues_alloc_format_vi_enum alloc_format:2; + enum mes_map_queues_engine_sel_vi_enum engine_sel:3; + uint32_t num_queues:3; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t reserved3:1; + uint32_t check_disable:1; + uint32_t doorbell_offset:21; + uint32_t reserved4:3; + uint32_t queue:6; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t mqd_addr_lo; + uint32_t mqd_addr_hi; + uint32_t wptr_addr_lo; + uint32_t wptr_addr_hi; +}; +#endif + +/*--------------------MES_QUERY_STATUS--------------------*/ + +#ifndef PM4_MES_QUERY_STATUS_DEFINED +#define PM4_MES_QUERY_STATUS_DEFINED +enum mes_query_status_interrupt_sel_enum { + interrupt_sel__mes_query_status__completion_status = 0, + interrupt_sel__mes_query_status__process_status = 1, + interrupt_sel__mes_query_status__queue_status = 2 +}; + +enum mes_query_status_command_enum { + command__mes_query_status__interrupt_only = 0, + command__mes_query_status__fence_only_immediate = 1, + command__mes_query_status__fence_only_after_write_ack = 2, + command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 +}; + +enum mes_query_status_engine_sel_enum { + engine_sel__mes_query_status__compute = 0, + engine_sel__mes_query_status__sdma0_queue = 2, + engine_sel__mes_query_status__sdma1_queue = 3 
+}; + +struct pm4_mes_query_status { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t context_id:28; + enum mes_query_status_interrupt_sel_enum + interrupt_sel:2; + enum mes_query_status_command_enum command:2; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:16; + } bitfields3a; + struct { + uint32_t reserved2:2; + uint32_t doorbell_offset:21; + uint32_t reserved3:2; + enum mes_query_status_engine_sel_enum engine_sel:3; + uint32_t reserved4:4; + } bitfields3b; + uint32_t ordinal3; + }; + + uint32_t addr_lo; + uint32_t addr_hi; + uint32_t data_lo; + uint32_t data_hi; +}; +#endif + +/*--------------------MES_UNMAP_QUEUES--------------------*/ + +#ifndef PM4_MES_UNMAP_QUEUES_DEFINED +#define PM4_MES_UNMAP_QUEUES_DEFINED +enum mes_unmap_queues_action_enum { + action__mes_unmap_queues__preempt_queues = 0, + action__mes_unmap_queues__reset_queues = 1, + action__mes_unmap_queues__disable_process_queues = 2, + action__mes_unmap_queues__reserved = 3 +}; + +enum mes_unmap_queues_queue_sel_enum { + queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, + queue_sel__mes_unmap_queues__unmap_all_queues = 2, + queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3 +}; + +enum mes_unmap_queues_engine_sel_enum { + engine_sel__mes_unmap_queues__compute = 0, + engine_sel__mes_unmap_queues__sdma0 = 2, + engine_sel__mes_unmap_queues__sdmal = 3 +}; + +struct pm4_mes_unmap_queues { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + enum mes_unmap_queues_action_enum action:2; + uint32_t reserved1:2; + enum mes_unmap_queues_queue_sel_enum queue_sel:2; + uint32_t reserved2:20; + enum mes_unmap_queues_engine_sel_enum engine_sel:3; + uint32_t num_queues:3; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved3:16; + } bitfields3a; + struct { + uint32_t reserved4:2; + uint32_t doorbell_offset0:21; + uint32_t reserved5:9; + } bitfields3b; + uint32_t ordinal3; + }; + + union { + struct { + uint32_t reserved6:2; + uint32_t doorbell_offset1:21; + uint32_t reserved7:9; + } bitfields4; + uint32_t ordinal4; + }; + + union { + struct { + uint32_t reserved8:2; + uint32_t doorbell_offset2:21; + uint32_t reserved9:9; + } bitfields5; + uint32_t ordinal5; + }; + + union { + struct { + uint32_t reserved10:2; + uint32_t doorbell_offset3:21; + uint32_t reserved11:9; + } bitfields6; + uint32_t ordinal6; + }; +}; +#endif + +#ifndef PM4_MEC_RELEASE_MEM_DEFINED +#define PM4_MEC_RELEASE_MEM_DEFINED +enum RELEASE_MEM_event_index_enum { + event_index___release_mem__end_of_pipe = 5, + event_index___release_mem__shader_done = 6 +}; + +enum RELEASE_MEM_cache_policy_enum { + cache_policy___release_mem__lru = 0, + cache_policy___release_mem__stream = 1, + cache_policy___release_mem__bypass = 2 +}; + +enum RELEASE_MEM_dst_sel_enum { + dst_sel___release_mem__memory_controller = 0, + dst_sel___release_mem__tc_l2 = 1, + dst_sel___release_mem__queue_write_pointer_register = 2, + dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 +}; + +enum RELEASE_MEM_int_sel_enum { + int_sel___release_mem__none = 0, + int_sel___release_mem__send_interrupt_only = 1, + int_sel___release_mem__send_interrupt_after_write_confirm = 2, + int_sel___release_mem__send_data_after_write_confirm = 3 +}; + +enum RELEASE_MEM_data_sel_enum { + 
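+	/* selects what, if anything, RELEASE_MEM writes back to the packet's
+	 * address once the selected pipe event has completed
+	 */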
data_sel___release_mem__none = 0, + data_sel___release_mem__send_32_bit_low = 1, + data_sel___release_mem__send_64_bit_data = 2, + data_sel___release_mem__send_gpu_clock_counter = 3, + data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, + data_sel___release_mem__store_gds_data_to_memory = 5 +}; + +struct pm4_mec_release_mem { + union { + union PM4_MES_TYPE_3_HEADER header; /*header */ + unsigned int ordinal1; + }; + + union { + struct { + unsigned int event_type:6; + unsigned int reserved1:2; + enum RELEASE_MEM_event_index_enum event_index:4; + unsigned int tcl1_vol_action_ena:1; + unsigned int tc_vol_action_ena:1; + unsigned int reserved2:1; + unsigned int tc_wb_action_ena:1; + unsigned int tcl1_action_ena:1; + unsigned int tc_action_ena:1; + unsigned int reserved3:6; + unsigned int atc:1; + enum RELEASE_MEM_cache_policy_enum cache_policy:2; + unsigned int reserved4:5; + } bitfields2; + unsigned int ordinal2; + }; + + union { + struct { + unsigned int reserved5:16; + enum RELEASE_MEM_dst_sel_enum dst_sel:2; + unsigned int reserved6:6; + enum RELEASE_MEM_int_sel_enum int_sel:3; + unsigned int reserved7:2; + enum RELEASE_MEM_data_sel_enum data_sel:3; + } bitfields3; + unsigned int ordinal3; + }; + + union { + struct { + unsigned int reserved8:2; + unsigned int address_lo_32b:30; + } bitfields4; + struct { + unsigned int reserved9:3; + unsigned int address_lo_64b:29; + } bitfields5; + unsigned int ordinal4; + }; + + unsigned int address_hi; + + unsigned int data_lo; + + unsigned int data_hi; +}; +#endif + +enum { + CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 +}; + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h new file mode 100644 index 000000000..b72fa3b8c --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h @@ -0,0 +1,107 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + + +#ifndef KFD_PM4_OPCODES_H +#define KFD_PM4_OPCODES_H + +enum it_opcode_type { + IT_NOP = 0x10, + IT_SET_BASE = 0x11, + IT_CLEAR_STATE = 0x12, + IT_INDEX_BUFFER_SIZE = 0x13, + IT_DISPATCH_DIRECT = 0x15, + IT_DISPATCH_INDIRECT = 0x16, + IT_ATOMIC_GDS = 0x1D, + IT_OCCLUSION_QUERY = 0x1F, + IT_SET_PREDICATION = 0x20, + IT_REG_RMW = 0x21, + IT_COND_EXEC = 0x22, + IT_PRED_EXEC = 0x23, + IT_DRAW_INDIRECT = 0x24, + IT_DRAW_INDEX_INDIRECT = 0x25, + IT_INDEX_BASE = 0x26, + IT_DRAW_INDEX_2 = 0x27, + IT_CONTEXT_CONTROL = 0x28, + IT_INDEX_TYPE = 0x2A, + IT_DRAW_INDIRECT_MULTI = 0x2C, + IT_DRAW_INDEX_AUTO = 0x2D, + IT_NUM_INSTANCES = 0x2F, + IT_DRAW_INDEX_MULTI_AUTO = 0x30, + IT_INDIRECT_BUFFER_CNST = 0x33, + IT_STRMOUT_BUFFER_UPDATE = 0x34, + IT_DRAW_INDEX_OFFSET_2 = 0x35, + IT_DRAW_PREAMBLE = 0x36, + IT_WRITE_DATA = 0x37, + IT_DRAW_INDEX_INDIRECT_MULTI = 0x38, + IT_MEM_SEMAPHORE = 0x39, + IT_COPY_DW = 0x3B, + IT_WAIT_REG_MEM = 0x3C, + IT_INDIRECT_BUFFER = 0x3F, + IT_COPY_DATA = 0x40, + IT_PFP_SYNC_ME = 0x42, + IT_SURFACE_SYNC = 0x43, + IT_COND_WRITE = 0x45, + IT_EVENT_WRITE = 0x46, + IT_EVENT_WRITE_EOP = 0x47, + IT_EVENT_WRITE_EOS = 0x48, + IT_RELEASE_MEM = 0x49, + IT_PREAMBLE_CNTL = 0x4A, + IT_DMA_DATA = 0x50, + IT_ACQUIRE_MEM = 0x58, + IT_REWIND = 0x59, + IT_LOAD_UCONFIG_REG = 0x5E, + IT_LOAD_SH_REG = 0x5F, + IT_LOAD_CONFIG_REG = 0x60, + IT_LOAD_CONTEXT_REG = 0x61, + IT_SET_CONFIG_REG = 0x68, + IT_SET_CONTEXT_REG = 0x69, + IT_SET_CONTEXT_REG_INDIRECT = 0x73, + IT_SET_SH_REG = 0x76, + IT_SET_SH_REG_OFFSET = 0x77, + IT_SET_QUEUE_REG = 0x78, + IT_SET_UCONFIG_REG = 0x79, + IT_SCRATCH_RAM_WRITE = 0x7D, + IT_SCRATCH_RAM_READ = 0x7E, + IT_LOAD_CONST_RAM = 0x80, + IT_WRITE_CONST_RAM = 0x81, + IT_DUMP_CONST_RAM = 0x83, + IT_INCREMENT_CE_COUNTER = 0x84, + IT_INCREMENT_DE_COUNTER = 0x85, + IT_WAIT_ON_CE_COUNTER = 0x86, + IT_WAIT_ON_DE_COUNTER_DIFF = 0x88, + IT_SWITCH_BUFFER = 0x8B, + IT_SET_RESOURCES = 0xA0, + IT_MAP_PROCESS = 0xA1, + IT_MAP_QUEUES = 0xA2, + IT_UNMAP_QUEUES = 0xA3, + IT_QUERY_STATUS = 0xA4, + IT_RUN_LIST = 0xA5, +}; + +#define PM4_TYPE_0 0 +#define PM4_TYPE_2 2 +#define PM4_TYPE_3 3 + +#endif /* KFD_PM4_OPCODES_H */ + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h new file mode 100644 index 000000000..92b285ca7 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -0,0 +1,1021 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef KFD_PRIV_H_INCLUDED +#define KFD_PRIV_H_INCLUDED + +#include <linux/hashtable.h> +#include <linux/mmu_notifier.h> +#include <linux/mutex.h> +#include <linux/types.h> +#include <linux/atomic.h> +#include <linux/workqueue.h> +#include <linux/spinlock.h> +#include <linux/kfd_ioctl.h> +#include <linux/idr.h> +#include <linux/kfifo.h> +#include <linux/seq_file.h> +#include <linux/kref.h> +#include <kgd_kfd_interface.h> + +#include "amd_shared.h" + +#define KFD_MAX_RING_ENTRY_SIZE 8 + +#define KFD_SYSFS_FILE_MODE 0444 + +/* GPU ID hash width in bits */ +#define KFD_GPU_ID_HASH_WIDTH 16 + +/* Use upper bits of mmap offset to store KFD driver specific information. + * BITS[63:62] - Encode MMAP type + * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to + * BITS[45:0] - MMAP offset value + * + * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these + * defines are w.r.t to PAGE_SIZE + */ +#define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT) +#define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT) +#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) +#define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) +#define KFD_MMAP_TYPE_RESERVED_MEM (0x1ULL << KFD_MMAP_TYPE_SHIFT) + +#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT) +#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \ + << KFD_MMAP_GPU_ID_SHIFT) +#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\ + & KFD_MMAP_GPU_ID_MASK) +#define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \ + >> KFD_MMAP_GPU_ID_SHIFT) + +#define KFD_MMAP_OFFSET_VALUE_MASK (0x3FFFFFFFFFFFULL >> PAGE_SHIFT) +#define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK) + +/* + * When working with cp scheduler we should assign the HIQ manually or via + * the amdgpu driver to a fixed hqd slot, here are the fixed HIQ hqd slot + * definitions for Kaveri. In Kaveri only the first ME queues participates + * in the cp scheduling taking that in mind we set the HIQ slot in the + * second ME. + */ +#define KFD_CIK_HIQ_PIPE 4 +#define KFD_CIK_HIQ_QUEUE 0 + +/* Macro for allocating structures */ +#define kfd_alloc_struct(ptr_to_struct) \ + ((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL)) + +#define KFD_MAX_NUM_OF_PROCESSES 512 +#define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024 + +/* + * Size of the per-process TBA+TMA buffer: 2 pages + * + * The first page is the TBA used for the CWSR ISA code. The second + * page is used as TMA for daisy changing a user-mode trap handler. + */ +#define KFD_CWSR_TBA_TMA_SIZE (PAGE_SIZE * 2) +#define KFD_CWSR_TMA_OFFSET PAGE_SIZE + +/* + * Kernel module parameter to specify maximum number of supported queues per + * device + */ +extern int max_num_of_queues_per_device; + +#define KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT 4096 +#define KFD_MAX_NUM_OF_QUEUES_PER_DEVICE \ + (KFD_MAX_NUM_OF_PROCESSES * \ + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) + +#define KFD_KERNEL_QUEUE_SIZE 2048 + +/* Kernel module parameter to specify the scheduling policy */ +extern int sched_policy; + +/* + * Kernel module parameter to specify the maximum process + * number per HW scheduler + */ +extern int hws_max_conc_proc; + +extern int cwsr_enable; + +/* + * Kernel module parameter to specify whether to send sigterm to HSA process on + * unhandled exception + */ +extern int send_sigterm; + +/* + * This kernel module is used to simulate large bar machine on non-large bar + * enabled machines. 
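+ * (i.e. the debug_largebar module parameter below forces KFD to treat the
+ * device as large-BAR capable; it is intended for debugging only and does
+ * not change the actual PCI BAR configuration.)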
+ */ +extern int debug_largebar; + +/* + * Ignore CRAT table during KFD initialization, can be used to work around + * broken CRAT tables on some AMD systems + */ +extern int ignore_crat; + +/* + * Set sh_mem_config.retry_disable on Vega10 + */ +extern int noretry; + +/* + * Halt if HWS hang is detected + */ +extern int halt_if_hws_hang; + +/** + * enum kfd_sched_policy + * + * @KFD_SCHED_POLICY_HWS: H/W scheduling policy known as command processor (cp) + * scheduling. In this scheduling mode we're using the firmware code to + * schedule the user mode queues and kernel queues such as HIQ and DIQ. + * the HIQ queue is used as a special queue that dispatches the configuration + * to the cp and the user mode queues list that are currently running. + * the DIQ queue is a debugging queue that dispatches debugging commands to the + * firmware. + * in this scheduling mode user mode queues over subscription feature is + * enabled. + * + * @KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION: The same as above but the over + * subscription feature disabled. + * + * @KFD_SCHED_POLICY_NO_HWS: no H/W scheduling policy is a mode which directly + * set the command processor registers and sets the queues "manually". This + * mode is used *ONLY* for debugging proposes. + * + */ +enum kfd_sched_policy { + KFD_SCHED_POLICY_HWS = 0, + KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION, + KFD_SCHED_POLICY_NO_HWS +}; + +enum cache_policy { + cache_policy_coherent, + cache_policy_noncoherent +}; + +#define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10) + +struct kfd_event_interrupt_class { + bool (*interrupt_isr)(struct kfd_dev *dev, + const uint32_t *ih_ring_entry, uint32_t *patched_ihre, + bool *patched_flag); + void (*interrupt_wq)(struct kfd_dev *dev, + const uint32_t *ih_ring_entry); +}; + +struct kfd_device_info { + enum amd_asic_type asic_family; + const struct kfd_event_interrupt_class *event_interrupt_class; + unsigned int max_pasid_bits; + unsigned int max_no_of_hqd; + unsigned int doorbell_size; + size_t ih_ring_entry_size; + uint8_t num_of_watch_points; + uint16_t mqd_size_aligned; + bool supports_cwsr; + bool needs_iommu_device; + bool needs_pci_atomics; + unsigned int num_sdma_engines; +}; + +struct kfd_mem_obj { + uint32_t range_start; + uint32_t range_end; + uint64_t gpu_addr; + uint32_t *cpu_ptr; + void *gtt_mem; +}; + +struct kfd_vmid_info { + uint32_t first_vmid_kfd; + uint32_t last_vmid_kfd; + uint32_t vmid_num_kfd; +}; + +struct kfd_dev { + struct kgd_dev *kgd; + + const struct kfd_device_info *device_info; + struct pci_dev *pdev; + + unsigned int id; /* topology stub index */ + + phys_addr_t doorbell_base; /* Start of actual doorbells used by + * KFD. 
It is aligned for mapping + * into user mode + */ + size_t doorbell_id_offset; /* Doorbell offset (from KFD doorbell + * to HW doorbell, GFX reserved some + * at the start) + */ + u32 __iomem *doorbell_kernel_ptr; /* This is a pointer for a doorbells + * page used by kernel queue + */ + + struct kgd2kfd_shared_resources shared_resources; + struct kfd_vmid_info vm_info; + + const struct kfd2kgd_calls *kfd2kgd; + struct mutex doorbell_mutex; + DECLARE_BITMAP(doorbell_available_index, + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); + + void *gtt_mem; + uint64_t gtt_start_gpu_addr; + void *gtt_start_cpu_ptr; + void *gtt_sa_bitmap; + struct mutex gtt_sa_lock; + unsigned int gtt_sa_chunk_size; + unsigned int gtt_sa_num_of_chunks; + + /* Interrupts */ + struct kfifo ih_fifo; + struct workqueue_struct *ih_wq; + struct work_struct interrupt_work; + spinlock_t interrupt_lock; + + /* QCM Device instance */ + struct device_queue_manager *dqm; + + bool init_complete; + /* + * Interrupts of interest to KFD are copied + * from the HW ring into a SW ring. + */ + bool interrupts_active; + + /* Debug manager */ + struct kfd_dbgmgr *dbgmgr; + + /* Maximum process number mapped to HW scheduler */ + unsigned int max_proc_per_quantum; + + /* CWSR */ + bool cwsr_enabled; + const void *cwsr_isa; + unsigned int cwsr_isa_size; +}; + +/* KGD2KFD callbacks */ +void kgd2kfd_exit(void); +struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, + struct pci_dev *pdev, const struct kfd2kgd_calls *f2g); +bool kgd2kfd_device_init(struct kfd_dev *kfd, + const struct kgd2kfd_shared_resources *gpu_resources); +void kgd2kfd_device_exit(struct kfd_dev *kfd); + +enum kfd_mempool { + KFD_MEMPOOL_SYSTEM_CACHEABLE = 1, + KFD_MEMPOOL_SYSTEM_WRITECOMBINE = 2, + KFD_MEMPOOL_FRAMEBUFFER = 3, +}; + +/* Character device interface */ +int kfd_chardev_init(void); +void kfd_chardev_exit(void); +struct device *kfd_chardev(void); + +/** + * enum kfd_unmap_queues_filter + * + * @KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: Preempts single queue. + * + * @KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: Preempts all queues in the + * running queues list. + * + * @KFD_UNMAP_QUEUES_FILTER_BY_PASID: Preempts queues that belongs to + * specific process. + * + */ +enum kfd_unmap_queues_filter { + KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE, + KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, + KFD_UNMAP_QUEUES_FILTER_BY_PASID +}; + +/** + * enum kfd_queue_type + * + * @KFD_QUEUE_TYPE_COMPUTE: Regular user mode queue type. + * + * @KFD_QUEUE_TYPE_SDMA: Sdma user mode queue type. + * + * @KFD_QUEUE_TYPE_HIQ: HIQ queue type. + * + * @KFD_QUEUE_TYPE_DIQ: DIQ queue type. + */ +enum kfd_queue_type { + KFD_QUEUE_TYPE_COMPUTE, + KFD_QUEUE_TYPE_SDMA, + KFD_QUEUE_TYPE_HIQ, + KFD_QUEUE_TYPE_DIQ +}; + +enum kfd_queue_format { + KFD_QUEUE_FORMAT_PM4, + KFD_QUEUE_FORMAT_AQL +}; + +/** + * struct queue_properties + * + * @type: The queue type. + * + * @queue_id: Queue identifier. + * + * @queue_address: Queue ring buffer address. + * + * @queue_size: Queue ring buffer size. + * + * @priority: Defines the queue priority relative to other queues in the + * process. + * This is just an indication and HW scheduling may override the priority as + * necessary while keeping the relative prioritization. + * the priority granularity is from 0 to f which f is the highest priority. + * currently all queues are initialized with the highest priority. + * + * @queue_percent: This field is partially implemented and currently a zero in + * this field defines that the queue is non active. 
+ * + * @read_ptr: User space address which points to the number of dwords the + * cp read from the ring buffer. This field updates automatically by the H/W. + * + * @write_ptr: Defines the number of dwords written to the ring buffer. + * + * @doorbell_ptr: This field aim is to notify the H/W of new packet written to + * the queue ring buffer. This field should be similar to write_ptr and the + * user should update this field after he updated the write_ptr. + * + * @doorbell_off: The doorbell offset in the doorbell pci-bar. + * + * @is_interop: Defines if this is a interop queue. Interop queue means that + * the queue can access both graphics and compute resources. + * + * @is_evicted: Defines if the queue is evicted. Only active queues + * are evicted, rendering them inactive. + * + * @is_active: Defines if the queue is active or not. @is_active and + * @is_evicted are protected by the DQM lock. + * + * @vmid: If the scheduling mode is no cp scheduling the field defines the vmid + * of the queue. + * + * This structure represents the queue properties for each queue no matter if + * it's user mode or kernel mode queue. + * + */ +struct queue_properties { + enum kfd_queue_type type; + enum kfd_queue_format format; + unsigned int queue_id; + uint64_t queue_address; + uint64_t queue_size; + uint32_t priority; + uint32_t queue_percent; + uint32_t *read_ptr; + uint32_t *write_ptr; + void __iomem *doorbell_ptr; + uint32_t doorbell_off; + bool is_interop; + bool is_evicted; + bool is_active; + /* Not relevant for user mode queues in cp scheduling */ + unsigned int vmid; + /* Relevant only for sdma queues*/ + uint32_t sdma_engine_id; + uint32_t sdma_queue_id; + uint32_t sdma_vm_addr; + /* Relevant only for VI */ + uint64_t eop_ring_buffer_address; + uint32_t eop_ring_buffer_size; + uint64_t ctx_save_restore_area_address; + uint32_t ctx_save_restore_area_size; + uint32_t ctl_stack_size; + uint64_t tba_addr; + uint64_t tma_addr; + /* Relevant for CU */ + uint32_t cu_mask_count; /* Must be a multiple of 32 */ + uint32_t *cu_mask; +}; + +/** + * struct queue + * + * @list: Queue linked list. + * + * @mqd: The queue MQD. + * + * @mqd_mem_obj: The MQD local gpu memory object. + * + * @gart_mqd_addr: The MQD gart mc address. + * + * @properties: The queue properties. + * + * @mec: Used only in no cp scheduling mode and identifies to micro engine id + * that the queue should be execute on. + * + * @pipe: Used only in no cp scheduling mode and identifies the queue's pipe + * id. + * + * @queue: Used only in no cp scheduliong mode and identifies the queue's slot. + * + * @process: The kfd process that created this queue. + * + * @device: The kfd device that created this queue. + * + * This structure represents user mode compute queues. + * It contains all the necessary data to handle such queues. + * + */ + +struct queue { + struct list_head list; + void *mqd; + struct kfd_mem_obj *mqd_mem_obj; + uint64_t gart_mqd_addr; + struct queue_properties properties; + + uint32_t mec; + uint32_t pipe; + uint32_t queue; + + unsigned int sdma_id; + unsigned int doorbell_id; + + struct kfd_process *process; + struct kfd_dev *device; +}; + +/* + * Please read the kfd_mqd_manager.h description. 
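+ * In short: each type below selects the mqd_manager implementation used to
+ * initialize and load that queue's memory queue descriptor (MQD).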
+ */ +enum KFD_MQD_TYPE { + KFD_MQD_TYPE_COMPUTE = 0, /* for no cp scheduling */ + KFD_MQD_TYPE_HIQ, /* for hiq */ + KFD_MQD_TYPE_CP, /* for cp queues and diq */ + KFD_MQD_TYPE_SDMA, /* for sdma queues */ + KFD_MQD_TYPE_MAX +}; + +struct scheduling_resources { + unsigned int vmid_mask; + enum kfd_queue_type type; + uint64_t queue_mask; + uint64_t gws_mask; + uint32_t oac_mask; + uint32_t gds_heap_base; + uint32_t gds_heap_size; +}; + +struct process_queue_manager { + /* data */ + struct kfd_process *process; + struct list_head queues; + unsigned long *queue_slot_bitmap; +}; + +struct qcm_process_device { + /* The Device Queue Manager that owns this data */ + struct device_queue_manager *dqm; + struct process_queue_manager *pqm; + /* Queues list */ + struct list_head queues_list; + struct list_head priv_queue_list; + + unsigned int queue_count; + unsigned int vmid; + bool is_debug; + unsigned int evicted; /* eviction counter, 0=active */ + + /* This flag tells if we should reset all wavefronts on + * process termination + */ + bool reset_wavefronts; + + /* + * All the memory management data should be here too + */ + uint64_t gds_context_area; + uint32_t sh_mem_config; + uint32_t sh_mem_bases; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; + uint32_t page_table_base; + uint32_t gds_size; + uint32_t num_gws; + uint32_t num_oac; + uint32_t sh_hidden_private_base; + + /* CWSR memory */ + void *cwsr_kaddr; + uint64_t cwsr_base; + uint64_t tba_addr; + uint64_t tma_addr; + + /* IB memory */ + uint64_t ib_base; + void *ib_kaddr; + + /* doorbell resources per process per device */ + unsigned long *doorbell_bitmap; +}; + +/* KFD Memory Eviction */ + +/* Approx. wait time before attempting to restore evicted BOs */ +#define PROCESS_RESTORE_TIME_MS 100 +/* Approx. back off time if restore fails due to lack of memory */ +#define PROCESS_BACK_OFF_TIME_MS 100 +/* Approx. time before evicting the process again */ +#define PROCESS_ACTIVE_TIME_MS 10 + +int kgd2kfd_quiesce_mm(struct mm_struct *mm); +int kgd2kfd_resume_mm(struct mm_struct *mm); +int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, + struct dma_fence *fence); + +/* 8 byte handle containing GPU ID in the most significant 4 bytes and + * idr_handle in the least significant 4 bytes + */ +#define MAKE_HANDLE(gpu_id, idr_handle) \ + (((uint64_t)(gpu_id) << 32) + idr_handle) +#define GET_GPU_ID(handle) (handle >> 32) +#define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF) + +enum kfd_pdd_bound { + PDD_UNBOUND = 0, + PDD_BOUND, + PDD_BOUND_SUSPENDED, +}; + +/* Data that is per-process-per device. */ +struct kfd_process_device { + /* + * List of all per-device data for a process. + * Starts from kfd_process.per_device_data. + */ + struct list_head per_device_list; + + /* The device that owns this data. */ + struct kfd_dev *dev; + + /* The process that owns this kfd_process_device. */ + struct kfd_process *process; + + /* per-process-per device QCM data structure */ + struct qcm_process_device qpd; + + /*Apertures*/ + uint64_t lds_base; + uint64_t lds_limit; + uint64_t gpuvm_base; + uint64_t gpuvm_limit; + uint64_t scratch_base; + uint64_t scratch_limit; + + /* VM context for GPUVM allocations */ + struct file *drm_file; + void *vm; + + /* GPUVM allocations storage */ + struct idr alloc_idr; + + /* Flag used to tell the pdd has dequeued from the dqm. + * This is used to prevent dev->dqm->ops.process_termination() from + * being called twice when it is already called in IOMMU callback + * function. 
+ */ + bool already_dequeued; + + /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ + enum kfd_pdd_bound bound; +}; + +#define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) + +/* Process data */ +struct kfd_process { + /* + * kfd_process are stored in an mm_struct*->kfd_process* + * hash table (kfd_processes in kfd_process.c) + */ + struct hlist_node kfd_processes; + + /* + * Opaque pointer to mm_struct. We don't hold a reference to + * it so it should never be dereferenced from here. This is + * only used for looking up processes by their mm. + */ + void *mm; + + struct kref ref; + struct work_struct release_work; + + struct mutex mutex; + + /* + * In any process, the thread that started main() is the lead + * thread and outlives the rest. + * It is here because amd_iommu_bind_pasid wants a task_struct. + * It can also be used for safely getting a reference to the + * mm_struct of the process. + */ + struct task_struct *lead_thread; + + /* We want to receive a notification when the mm_struct is destroyed */ + struct mmu_notifier mmu_notifier; + + /* Use for delayed freeing of kfd_process structure */ + struct rcu_head rcu; + + unsigned int pasid; + unsigned int doorbell_index; + + /* + * List of kfd_process_device structures, + * one for each device the process is using. + */ + struct list_head per_device_data; + + struct process_queue_manager pqm; + + /*Is the user space process 32 bit?*/ + bool is_32bit_user_mode; + + /* Event-related data */ + struct mutex event_mutex; + /* Event ID allocator and lookup */ + struct idr event_idr; + /* Event page */ + struct kfd_signal_page *signal_page; + size_t signal_mapped_size; + size_t signal_event_count; + bool signal_event_limit_reached; + + /* Information used for memory eviction */ + void *kgd_process_info; + /* Eviction fence that is attached to all the BOs of this process. The + * fence will be triggered during eviction and new one will be created + * during restore + */ + struct dma_fence *ef; + + /* Work items for evicting and restoring BOs */ + struct delayed_work eviction_work; + struct delayed_work restore_work; + /* seqno of the last scheduled eviction */ + unsigned int last_eviction_seqno; + /* Approx. the last timestamp (in jiffies) when the process was + * restored after an eviction + */ + unsigned long last_restore_timestamp; +}; + +#define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */ +extern DECLARE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); +extern struct srcu_struct kfd_processes_srcu; + +/** + * Ioctl function type. + * + * \param filep pointer to file structure. + * \param p amdkfd process pointer. + * \param data pointer to arg that was copied from user. 
+ */ +typedef int amdkfd_ioctl_t(struct file *filep, struct kfd_process *p, + void *data); + +struct amdkfd_ioctl_desc { + unsigned int cmd; + int flags; + amdkfd_ioctl_t *func; + unsigned int cmd_drv; + const char *name; +}; + +int kfd_process_create_wq(void); +void kfd_process_destroy_wq(void); +struct kfd_process *kfd_create_process(struct file *filep); +struct kfd_process *kfd_get_process(const struct task_struct *); +struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); +struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); +void kfd_unref_process(struct kfd_process *p); +int kfd_process_evict_queues(struct kfd_process *p); +int kfd_process_restore_queues(struct kfd_process *p); +void kfd_suspend_all_processes(void); +int kfd_resume_all_processes(void); + +int kfd_process_device_init_vm(struct kfd_process_device *pdd, + struct file *drm_file); +struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + struct kfd_process *p); +struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + struct kfd_process *p); +struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + struct kfd_process *p); + +int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, + struct vm_area_struct *vma); + +/* KFD process API for creating and translating handles */ +int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, + void *mem); +void *kfd_process_device_translate_handle(struct kfd_process_device *p, + int handle); +void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, + int handle); + +/* Process device data iterator */ +struct kfd_process_device *kfd_get_first_process_device_data( + struct kfd_process *p); +struct kfd_process_device *kfd_get_next_process_device_data( + struct kfd_process *p, + struct kfd_process_device *pdd); +bool kfd_has_process_device_data(struct kfd_process *p); + +/* PASIDs */ +int kfd_pasid_init(void); +void kfd_pasid_exit(void); +bool kfd_set_pasid_limit(unsigned int new_limit); +unsigned int kfd_get_pasid_limit(void); +unsigned int kfd_pasid_alloc(void); +void kfd_pasid_free(unsigned int pasid); + +/* Doorbells */ +size_t kfd_doorbell_process_slice(struct kfd_dev *kfd); +int kfd_doorbell_init(struct kfd_dev *kfd); +void kfd_doorbell_fini(struct kfd_dev *kfd); +int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, + struct vm_area_struct *vma); +void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + unsigned int *doorbell_off); +void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); +u32 read_kernel_doorbell(u32 __iomem *db); +void write_kernel_doorbell(void __iomem *db, u32 value); +void write_kernel_doorbell64(void __iomem *db, u64 value); +unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, + struct kfd_process *process, + unsigned int doorbell_id); +phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process); +int kfd_alloc_process_doorbells(struct kfd_process *process); +void kfd_free_process_doorbells(struct kfd_process *process); + +/* GTT Sub-Allocator */ + +int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, + struct kfd_mem_obj **mem_obj); + +int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj); + +extern struct device *kfd_device; + +/* Topology */ +int kfd_topology_init(void); +void kfd_topology_shutdown(void); +int kfd_topology_add_device(struct kfd_dev *gpu); +int kfd_topology_remove_device(struct kfd_dev 
*gpu); +struct kfd_topology_device *kfd_topology_device_by_proximity_domain( + uint32_t proximity_domain); +struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id); +struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); +struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); +int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); +int kfd_numa_node_to_apic_id(int numa_node_id); + +/* Interrupts */ +int kfd_interrupt_init(struct kfd_dev *dev); +void kfd_interrupt_exit(struct kfd_dev *dev); +void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); +bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry); +bool interrupt_is_wanted(struct kfd_dev *dev, + const uint32_t *ih_ring_entry, + uint32_t *patched_ihre, bool *flag); + +/* Power Management */ +void kgd2kfd_suspend(struct kfd_dev *kfd); +int kgd2kfd_resume(struct kfd_dev *kfd); + +/* GPU reset */ +int kgd2kfd_pre_reset(struct kfd_dev *kfd); +int kgd2kfd_post_reset(struct kfd_dev *kfd); + +/* amdkfd Apertures */ +int kfd_init_apertures(struct kfd_process *process); + +/* Queue Context Management */ +int init_queue(struct queue **q, const struct queue_properties *properties); +void uninit_queue(struct queue *q); +void print_queue_properties(struct queue_properties *q); +void print_queue(struct queue *q); + +struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); +void device_queue_manager_uninit(struct device_queue_manager *dqm); +struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + enum kfd_queue_type type); +void kernel_queue_uninit(struct kernel_queue *kq); +int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid); + +/* Process Queue Manager */ +struct process_queue_node { + struct queue *q; + struct kernel_queue *kq; + struct list_head process_queue_list; +}; + +void kfd_process_dequeue_from_device(struct kfd_process_device *pdd); +void kfd_process_dequeue_from_all_devices(struct kfd_process *p); +int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p); +void pqm_uninit(struct process_queue_manager *pqm); +int pqm_create_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, + struct file *f, + struct queue_properties *properties, + unsigned int *qid); +int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid); +int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p); +int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p); +struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, + unsigned int qid); + +int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + unsigned int fence_value, + unsigned int timeout_ms); + +/* Packet Manager */ + +#define KFD_FENCE_COMPLETED (100) +#define KFD_FENCE_INIT (10) + +struct packet_manager { + struct device_queue_manager *dqm; + struct kernel_queue *priv_queue; + struct mutex lock; + 
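+	/* lock serializes packet submission through priv_queue; the fields
+	 * below track the runlist indirect buffer while a runlist is active
+	 * (descriptive note based on the pm_* helpers declared further down)
+	 */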
bool allocated; + struct kfd_mem_obj *ib_buffer_obj; + unsigned int ib_size_bytes; + + const struct packet_manager_funcs *pmf; +}; + +struct packet_manager_funcs { + /* Support ASIC-specific packet formats for PM4 packets */ + int (*map_process)(struct packet_manager *pm, uint32_t *buffer, + struct qcm_process_device *qpd); + int (*runlist)(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain); + int (*set_resources)(struct packet_manager *pm, uint32_t *buffer, + struct scheduling_resources *res); + int (*map_queues)(struct packet_manager *pm, uint32_t *buffer, + struct queue *q, bool is_static); + int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer, + enum kfd_queue_type type, + enum kfd_unmap_queues_filter mode, + uint32_t filter_param, bool reset, + unsigned int sdma_engine); + int (*query_status)(struct packet_manager *pm, uint32_t *buffer, + uint64_t fence_address, uint32_t fence_value); + int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer); + + /* Packet sizes */ + int map_process_size; + int runlist_size; + int set_resources_size; + int map_queues_size; + int unmap_queues_size; + int query_status_size; + int release_mem_size; +}; + +extern const struct packet_manager_funcs kfd_vi_pm_funcs; +extern const struct packet_manager_funcs kfd_v9_pm_funcs; + +int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); +void pm_uninit(struct packet_manager *pm); +int pm_send_set_resources(struct packet_manager *pm, + struct scheduling_resources *res); +int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues); +int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + uint32_t fence_value); + +int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + enum kfd_unmap_queues_filter mode, + uint32_t filter_param, bool reset, + unsigned int sdma_engine); + +void pm_release_ib(struct packet_manager *pm); + +/* Following PM funcs can be shared among VI and AI */ +unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); +int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, + struct scheduling_resources *res); + +uint64_t kfd_get_number_elems(struct kfd_dev *kfd); + +/* Events */ +extern const struct kfd_event_interrupt_class event_interrupt_class_cik; +extern const struct kfd_event_interrupt_class event_interrupt_class_v9; + +extern const struct kfd_device_global_init_class device_global_init_class_cik; + +void kfd_event_init_process(struct kfd_process *p); +void kfd_event_free_process(struct kfd_process *p); +int kfd_event_mmap(struct kfd_process *process, struct vm_area_struct *vma); +int kfd_wait_on_events(struct kfd_process *p, + uint32_t num_events, void __user *data, + bool all, uint32_t user_timeout_ms, + uint32_t *wait_result); +void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, + uint32_t valid_id_bits); +void kfd_signal_iommu_event(struct kfd_dev *dev, + unsigned int pasid, unsigned long address, + bool is_write_requested, bool is_execute_requested); +void kfd_signal_hw_exception_event(unsigned int pasid); +int kfd_set_event(struct kfd_process *p, uint32_t event_id); +int kfd_reset_event(struct kfd_process *p, uint32_t event_id); +int kfd_event_page_set(struct kfd_process *p, void *kernel_address, + uint64_t size); +int kfd_event_create(struct file *devkfd, struct kfd_process *p, + uint32_t event_type, bool auto_reset, uint32_t node_id, + uint32_t *event_id, uint32_t *event_trigger_data, + uint64_t 
*event_page_offset, uint32_t *event_slot_index); +int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); + +void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, + struct kfd_vm_fault_info *info); + +void kfd_signal_reset_event(struct kfd_dev *dev); + +void kfd_flush_tlb(struct kfd_process_device *pdd); + +int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); + +bool kfd_is_locked(void); + +/* Debugfs */ +#if defined(CONFIG_DEBUG_FS) + +void kfd_debugfs_init(void); +void kfd_debugfs_fini(void); +int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data); +int pqm_debugfs_mqds(struct seq_file *m, void *data); +int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data); +int dqm_debugfs_hqds(struct seq_file *m, void *data); +int kfd_debugfs_rls_by_device(struct seq_file *m, void *data); +int pm_debugfs_runlist(struct seq_file *m, void *data); + +int kfd_debugfs_hang_hws(struct kfd_dev *dev); +int pm_debugfs_hang_hws(struct packet_manager *pm); +int dqm_debugfs_execute_queues(struct device_queue_manager *dqm); + +#else + +static inline void kfd_debugfs_init(void) {} +static inline void kfd_debugfs_fini(void) {} + +#endif + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c new file mode 100644 index 000000000..4694386cc --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -0,0 +1,1101 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/mutex.h> +#include <linux/log2.h> +#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/task.h> +#include <linux/slab.h> +#include <linux/amd-iommu.h> +#include <linux/notifier.h> +#include <linux/compat.h> +#include <linux/mman.h> +#include <linux/file.h> + +struct mm_struct; + +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" +#include "kfd_dbgmgr.h" +#include "kfd_iommu.h" + +/* + * List of struct kfd_process (field kfd_process). + * Unique/indexed by mm_struct* + */ +DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); +static DEFINE_MUTEX(kfd_processes_mutex); + +DEFINE_SRCU(kfd_processes_srcu); + +/* For process termination handling */ +static struct workqueue_struct *kfd_process_wq; + +/* Ordered, single-threaded workqueue for restoring evicted + * processes. 
Restoring multiple processes concurrently under memory + * pressure can lead to processes blocking each other from validating + * their BOs and result in a live-lock situation where processes + * remain evicted indefinitely. + */ +static struct workqueue_struct *kfd_restore_wq; + +static struct kfd_process *find_process(const struct task_struct *thread); +static void kfd_process_ref_release(struct kref *ref); +static struct kfd_process *create_process(const struct task_struct *thread, + struct file *filep); + +static void evict_process_worker(struct work_struct *work); +static void restore_process_worker(struct work_struct *work); + + +int kfd_process_create_wq(void) +{ + if (!kfd_process_wq) + kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0); + if (!kfd_restore_wq) + kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0); + + if (!kfd_process_wq || !kfd_restore_wq) { + kfd_process_destroy_wq(); + return -ENOMEM; + } + + return 0; +} + +void kfd_process_destroy_wq(void) +{ + if (kfd_process_wq) { + destroy_workqueue(kfd_process_wq); + kfd_process_wq = NULL; + } + if (kfd_restore_wq) { + destroy_workqueue(kfd_restore_wq); + kfd_restore_wq = NULL; + } +} + +static void kfd_process_free_gpuvm(struct kgd_mem *mem, + struct kfd_process_device *pdd) +{ + struct kfd_dev *dev = pdd->dev; + + dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, mem, pdd->vm); + dev->kfd2kgd->free_memory_of_gpu(dev->kgd, mem); +} + +/* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process + * This function should be only called right after the process + * is created and when kfd_processes_mutex is still being held + * to avoid concurrency. Because of that exclusiveness, we do + * not need to take p->mutex. + */ +static int kfd_process_alloc_gpuvm(struct kfd_process_device *pdd, + uint64_t gpu_va, uint32_t size, + uint32_t flags, void **kptr) +{ + struct kfd_dev *kdev = pdd->dev; + struct kgd_mem *mem = NULL; + int handle; + int err; + + err = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, gpu_va, size, + pdd->vm, &mem, NULL, flags); + if (err) + goto err_alloc_mem; + + err = kdev->kfd2kgd->map_memory_to_gpu(kdev->kgd, mem, pdd->vm); + if (err) + goto err_map_mem; + + err = kdev->kfd2kgd->sync_memory(kdev->kgd, mem, true); + if (err) { + pr_debug("Sync memory failed, wait interrupted by user signal\n"); + goto sync_memory_failed; + } + + /* Create an obj handle so kfd_process_device_remove_obj_handle + * will take care of the bo removal when the process finishes. + * We do not need to take p->mutex, because the process is just + * created and the ioctls have not had the chance to run. + */ + handle = kfd_process_device_create_obj_handle(pdd, mem); + + if (handle < 0) { + err = handle; + goto free_gpuvm; + } + + if (kptr) { + err = kdev->kfd2kgd->map_gtt_bo_to_kernel(kdev->kgd, + (struct kgd_mem *)mem, kptr, NULL); + if (err) { + pr_debug("Map GTT BO to kernel failed\n"); + goto free_obj_handle; + } + } + + return err; + +free_obj_handle: + kfd_process_device_remove_obj_handle(pdd, handle); +free_gpuvm: +sync_memory_failed: + kfd_process_free_gpuvm(mem, pdd); + return err; + +err_map_mem: + kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, mem); +err_alloc_mem: + *kptr = NULL; + return err; +} + +/* kfd_process_device_reserve_ib_mem - Reserve memory inside the + * process for IB usage The memory reserved is for KFD to submit + * IB to AMDGPU from kernel. If the memory is reserved + * successfully, ib_kaddr will have the CPU/kernel + * address. Check ib_kaddr before accessing the memory. 
+ */ +static int kfd_process_device_reserve_ib_mem(struct kfd_process_device *pdd) +{ + struct qcm_process_device *qpd = &pdd->qpd; + uint32_t flags = ALLOC_MEM_FLAGS_GTT | + ALLOC_MEM_FLAGS_NO_SUBSTITUTE | + ALLOC_MEM_FLAGS_WRITABLE | + ALLOC_MEM_FLAGS_EXECUTABLE; + void *kaddr; + int ret; + + if (qpd->ib_kaddr || !qpd->ib_base) + return 0; + + /* ib_base is only set for dGPU */ + ret = kfd_process_alloc_gpuvm(pdd, qpd->ib_base, PAGE_SIZE, flags, + &kaddr); + if (ret) + return ret; + + qpd->ib_kaddr = kaddr; + + return 0; +} + +struct kfd_process *kfd_create_process(struct file *filep) +{ + struct kfd_process *process; + struct task_struct *thread = current; + + if (!thread->mm) + return ERR_PTR(-EINVAL); + + /* Only the pthreads threading model is supported. */ + if (thread->group_leader->mm != thread->mm) + return ERR_PTR(-EINVAL); + + /* + * take kfd processes mutex before starting of process creation + * so there won't be a case where two threads of the same process + * create two kfd_process structures + */ + mutex_lock(&kfd_processes_mutex); + + /* A prior open of /dev/kfd could have already created the process. */ + process = find_process(thread); + if (process) + pr_debug("Process already found\n"); + else + process = create_process(thread, filep); + + mutex_unlock(&kfd_processes_mutex); + + return process; +} + +struct kfd_process *kfd_get_process(const struct task_struct *thread) +{ + struct kfd_process *process; + + if (!thread->mm) + return ERR_PTR(-EINVAL); + + /* Only the pthreads threading model is supported. */ + if (thread->group_leader->mm != thread->mm) + return ERR_PTR(-EINVAL); + + process = find_process(thread); + if (!process) + return ERR_PTR(-EINVAL); + + return process; +} + +static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) +{ + struct kfd_process *process; + + hash_for_each_possible_rcu(kfd_processes_table, process, + kfd_processes, (uintptr_t)mm) + if (process->mm == mm) + return process; + + return NULL; +} + +static struct kfd_process *find_process(const struct task_struct *thread) +{ + struct kfd_process *p; + int idx; + + idx = srcu_read_lock(&kfd_processes_srcu); + p = find_process_by_mm(thread->mm); + srcu_read_unlock(&kfd_processes_srcu, idx); + + return p; +} + +void kfd_unref_process(struct kfd_process *p) +{ + kref_put(&p->ref, kfd_process_ref_release); +} + +static void kfd_process_device_free_bos(struct kfd_process_device *pdd) +{ + struct kfd_process *p = pdd->process; + void *mem; + int id; + + /* + * Remove all handles from idr and release appropriate + * local memory object + */ + idr_for_each_entry(&pdd->alloc_idr, mem, id) { + struct kfd_process_device *peer_pdd; + + list_for_each_entry(peer_pdd, &p->per_device_data, + per_device_list) { + if (!peer_pdd->vm) + continue; + peer_pdd->dev->kfd2kgd->unmap_memory_to_gpu( + peer_pdd->dev->kgd, mem, peer_pdd->vm); + } + + pdd->dev->kfd2kgd->free_memory_of_gpu(pdd->dev->kgd, mem); + kfd_process_device_remove_obj_handle(pdd, id); + } +} + +static void kfd_process_free_outstanding_kfd_bos(struct kfd_process *p) +{ + struct kfd_process_device *pdd; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) + kfd_process_device_free_bos(pdd); +} + +static void kfd_process_destroy_pdds(struct kfd_process *p) +{ + struct kfd_process_device *pdd, *temp; + + list_for_each_entry_safe(pdd, temp, &p->per_device_data, + per_device_list) { + pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n", + pdd->dev->id, p->pasid); + + if (pdd->drm_file) + fput(pdd->drm_file); + 
else if (pdd->vm) + pdd->dev->kfd2kgd->destroy_process_vm( + pdd->dev->kgd, pdd->vm); + + list_del(&pdd->per_device_list); + + if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base) + free_pages((unsigned long)pdd->qpd.cwsr_kaddr, + get_order(KFD_CWSR_TBA_TMA_SIZE)); + + kfree(pdd->qpd.doorbell_bitmap); + idr_destroy(&pdd->alloc_idr); + + kfree(pdd); + } +} + +/* No process locking is needed in this function, because the process + * is not findable any more. We must assume that no other thread is + * using it any more, otherwise we couldn't safely free the process + * structure in the end. + */ +static void kfd_process_wq_release(struct work_struct *work) +{ + struct kfd_process *p = container_of(work, struct kfd_process, + release_work); + + kfd_iommu_unbind_process(p); + + kfd_process_free_outstanding_kfd_bos(p); + + kfd_process_destroy_pdds(p); + dma_fence_put(p->ef); + + kfd_event_free_process(p); + + kfd_pasid_free(p->pasid); + kfd_free_process_doorbells(p); + + mutex_destroy(&p->mutex); + + put_task_struct(p->lead_thread); + + kfree(p); +} + +static void kfd_process_ref_release(struct kref *ref) +{ + struct kfd_process *p = container_of(ref, struct kfd_process, ref); + + INIT_WORK(&p->release_work, kfd_process_wq_release); + queue_work(kfd_process_wq, &p->release_work); +} + +static void kfd_process_destroy_delayed(struct rcu_head *rcu) +{ + struct kfd_process *p = container_of(rcu, struct kfd_process, rcu); + + kfd_unref_process(p); +} + +static void kfd_process_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct kfd_process *p; + struct kfd_process_device *pdd = NULL; + + /* + * The kfd_process structure can not be free because the + * mmu_notifier srcu is read locked + */ + p = container_of(mn, struct kfd_process, mmu_notifier); + if (WARN_ON(p->mm != mm)) + return; + + mutex_lock(&kfd_processes_mutex); + hash_del_rcu(&p->kfd_processes); + mutex_unlock(&kfd_processes_mutex); + synchronize_srcu(&kfd_processes_srcu); + + cancel_delayed_work_sync(&p->eviction_work); + cancel_delayed_work_sync(&p->restore_work); + + mutex_lock(&p->mutex); + + /* Iterate over all process device data structures and if the + * pdd is in debug mode, we should first force unregistration, + * then we will be able to destroy the queues + */ + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + struct kfd_dev *dev = pdd->dev; + + mutex_lock(kfd_get_dbgmgr_mutex()); + if (dev && dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) { + if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) { + kfd_dbgmgr_destroy(dev->dbgmgr); + dev->dbgmgr = NULL; + } + } + mutex_unlock(kfd_get_dbgmgr_mutex()); + } + + kfd_process_dequeue_from_all_devices(p); + pqm_uninit(&p->pqm); + + /* Indicate to other users that MM is no longer valid */ + p->mm = NULL; + + mutex_unlock(&p->mutex); + + mmu_notifier_unregister_no_release(&p->mmu_notifier, mm); + mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed); +} + +static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { + .release = kfd_process_notifier_release, +}; + +static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep) +{ + unsigned long offset; + struct kfd_process_device *pdd; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + struct kfd_dev *dev = pdd->dev; + struct qcm_process_device *qpd = &pdd->qpd; + + if (!dev->cwsr_enabled || qpd->cwsr_kaddr || qpd->cwsr_base) + continue; + + offset = (KFD_MMAP_TYPE_RESERVED_MEM | KFD_MMAP_GPU_ID(dev->id)) + << PAGE_SHIFT; + qpd->tba_addr = 
(int64_t)vm_mmap(filep, 0, + KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC, + MAP_SHARED, offset); + + if (IS_ERR_VALUE(qpd->tba_addr)) { + int err = qpd->tba_addr; + + pr_err("Failure to set tba address. error %d.\n", err); + qpd->tba_addr = 0; + qpd->cwsr_kaddr = NULL; + return err; + } + + memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size); + + qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET; + pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n", + qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr); + } + + return 0; +} + +static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd) +{ + struct kfd_dev *dev = pdd->dev; + struct qcm_process_device *qpd = &pdd->qpd; + uint32_t flags = ALLOC_MEM_FLAGS_GTT | + ALLOC_MEM_FLAGS_NO_SUBSTITUTE | ALLOC_MEM_FLAGS_EXECUTABLE; + void *kaddr; + int ret; + + if (!dev->cwsr_enabled || qpd->cwsr_kaddr || !qpd->cwsr_base) + return 0; + + /* cwsr_base is only set for dGPU */ + ret = kfd_process_alloc_gpuvm(pdd, qpd->cwsr_base, + KFD_CWSR_TBA_TMA_SIZE, flags, &kaddr); + if (ret) + return ret; + + qpd->cwsr_kaddr = kaddr; + qpd->tba_addr = qpd->cwsr_base; + + memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size); + + qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET; + pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n", + qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr); + + return 0; +} + +static struct kfd_process *create_process(const struct task_struct *thread, + struct file *filep) +{ + struct kfd_process *process; + int err = -ENOMEM; + + process = kzalloc(sizeof(*process), GFP_KERNEL); + + if (!process) + goto err_alloc_process; + + process->pasid = kfd_pasid_alloc(); + if (process->pasid == 0) + goto err_alloc_pasid; + + if (kfd_alloc_process_doorbells(process) < 0) + goto err_alloc_doorbells; + + kref_init(&process->ref); + + mutex_init(&process->mutex); + + process->mm = thread->mm; + + /* register notifier */ + process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; + err = mmu_notifier_register(&process->mmu_notifier, process->mm); + if (err) + goto err_mmu_notifier; + + hash_add_rcu(kfd_processes_table, &process->kfd_processes, + (uintptr_t)process->mm); + + process->lead_thread = thread->group_leader; + get_task_struct(process->lead_thread); + + INIT_LIST_HEAD(&process->per_device_data); + + kfd_event_init_process(process); + + err = pqm_init(&process->pqm, process); + if (err != 0) + goto err_process_pqm_init; + + /* init process apertures*/ + process->is_32bit_user_mode = in_compat_syscall(); + err = kfd_init_apertures(process); + if (err != 0) + goto err_init_apertures; + + INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker); + INIT_DELAYED_WORK(&process->restore_work, restore_process_worker); + process->last_restore_timestamp = get_jiffies_64(); + + err = kfd_process_init_cwsr_apu(process, filep); + if (err) + goto err_init_cwsr; + + return process; + +err_init_cwsr: + kfd_process_free_outstanding_kfd_bos(process); + kfd_process_destroy_pdds(process); +err_init_apertures: + pqm_uninit(&process->pqm); +err_process_pqm_init: + hash_del_rcu(&process->kfd_processes); + synchronize_rcu(); + mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm); +err_mmu_notifier: + mutex_destroy(&process->mutex); + kfd_free_process_doorbells(process); +err_alloc_doorbells: + kfd_pasid_free(process->pasid); +err_alloc_pasid: + kfree(process); +err_alloc_process: + return ERR_PTR(err); +} + +static int init_doorbell_bitmap(struct qcm_process_device *qpd, + struct kfd_dev 
*dev) +{ + unsigned int i; + + if (!KFD_IS_SOC15(dev->device_info->asic_family)) + return 0; + + qpd->doorbell_bitmap = + kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, + BITS_PER_BYTE), GFP_KERNEL); + if (!qpd->doorbell_bitmap) + return -ENOMEM; + + /* Mask out any reserved doorbells */ + for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS; i++) + if ((dev->shared_resources.reserved_doorbell_mask & i) == + dev->shared_resources.reserved_doorbell_val) { + set_bit(i, qpd->doorbell_bitmap); + pr_debug("reserved doorbell 0x%03x\n", i); + } + + return 0; +} + +struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + struct kfd_process *p) +{ + struct kfd_process_device *pdd = NULL; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) + if (pdd->dev == dev) + return pdd; + + return NULL; +} + +struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + struct kfd_process *p) +{ + struct kfd_process_device *pdd = NULL; + + pdd = kzalloc(sizeof(*pdd), GFP_KERNEL); + if (!pdd) + return NULL; + + if (init_doorbell_bitmap(&pdd->qpd, dev)) { + pr_err("Failed to init doorbell for process\n"); + kfree(pdd); + return NULL; + } + + pdd->dev = dev; + INIT_LIST_HEAD(&pdd->qpd.queues_list); + INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); + pdd->qpd.dqm = dev->dqm; + pdd->qpd.pqm = &p->pqm; + pdd->qpd.evicted = 0; + pdd->process = p; + pdd->bound = PDD_UNBOUND; + pdd->already_dequeued = false; + list_add(&pdd->per_device_list, &p->per_device_data); + + /* Init idr used for memory handle translation */ + idr_init(&pdd->alloc_idr); + + return pdd; +} + +/** + * kfd_process_device_init_vm - Initialize a VM for a process-device + * + * @pdd: The process-device + * @drm_file: Optional pointer to a DRM file descriptor + * + * If @drm_file is specified, it will be used to acquire the VM from + * that file descriptor. If successful, the @pdd takes ownership of + * the file descriptor. + * + * If @drm_file is NULL, a new VM is created. + * + * Returns 0 on success, -errno on failure. + */ +int kfd_process_device_init_vm(struct kfd_process_device *pdd, + struct file *drm_file) +{ + struct kfd_process *p; + struct kfd_dev *dev; + int ret; + + if (pdd->vm) + return drm_file ? -EBUSY : 0; + + p = pdd->process; + dev = pdd->dev; + + if (drm_file) + ret = dev->kfd2kgd->acquire_process_vm( + dev->kgd, drm_file, + &pdd->vm, &p->kgd_process_info, &p->ef); + else + ret = dev->kfd2kgd->create_process_vm( + dev->kgd, &pdd->vm, &p->kgd_process_info, &p->ef); + if (ret) { + pr_err("Failed to create process VM object\n"); + return ret; + } + + ret = kfd_process_device_reserve_ib_mem(pdd); + if (ret) + goto err_reserve_ib_mem; + ret = kfd_process_device_init_cwsr_dgpu(pdd); + if (ret) + goto err_init_cwsr; + + pdd->drm_file = drm_file; + + return 0; + +err_init_cwsr: +err_reserve_ib_mem: + kfd_process_device_free_bos(pdd); + if (!drm_file) + dev->kfd2kgd->destroy_process_vm(dev->kgd, pdd->vm); + pdd->vm = NULL; + + return ret; +} + +/* + * Direct the IOMMU to bind the process (specifically the pasid->mm) + * to the device. + * Unbinding occurs when the process dies or the device is removed. + * + * Assumes that the process lock is held. 
+ */ +struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + struct kfd_process *p) +{ + struct kfd_process_device *pdd; + int err; + + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + return ERR_PTR(-ENOMEM); + } + + err = kfd_iommu_bind_process_to_device(pdd); + if (err) + return ERR_PTR(err); + + err = kfd_process_device_init_vm(pdd, NULL); + if (err) + return ERR_PTR(err); + + return pdd; +} + +struct kfd_process_device *kfd_get_first_process_device_data( + struct kfd_process *p) +{ + return list_first_entry(&p->per_device_data, + struct kfd_process_device, + per_device_list); +} + +struct kfd_process_device *kfd_get_next_process_device_data( + struct kfd_process *p, + struct kfd_process_device *pdd) +{ + if (list_is_last(&pdd->per_device_list, &p->per_device_data)) + return NULL; + return list_next_entry(pdd, per_device_list); +} + +bool kfd_has_process_device_data(struct kfd_process *p) +{ + return !(list_empty(&p->per_device_data)); +} + +/* Create specific handle mapped to mem from process local memory idr + * Assumes that the process lock is held. + */ +int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, + void *mem) +{ + return idr_alloc(&pdd->alloc_idr, mem, 0, 0, GFP_KERNEL); +} + +/* Translate specific handle from process local memory idr + * Assumes that the process lock is held. + */ +void *kfd_process_device_translate_handle(struct kfd_process_device *pdd, + int handle) +{ + if (handle < 0) + return NULL; + + return idr_find(&pdd->alloc_idr, handle); +} + +/* Remove specific handle from process local memory idr + * Assumes that the process lock is held. + */ +void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, + int handle) +{ + if (handle >= 0) + idr_remove(&pdd->alloc_idr, handle); +} + +/* This increments the process->ref counter. */ +struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) +{ + struct kfd_process *p, *ret_p = NULL; + unsigned int temp; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + if (p->pasid == pasid) { + kref_get(&p->ref); + ret_p = p; + break; + } + } + + srcu_read_unlock(&kfd_processes_srcu, idx); + + return ret_p; +} + +/* This increments the process->ref counter. */ +struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) +{ + struct kfd_process *p; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + p = find_process_by_mm(mm); + if (p) + kref_get(&p->ref); + + srcu_read_unlock(&kfd_processes_srcu, idx); + + return p; +} + +/* process_evict_queues - Evict all user queues of a process + * + * Eviction is reference-counted per process-device. This means multiple + * evictions from different sources can be nested safely. 
+ */ +int kfd_process_evict_queues(struct kfd_process *p) +{ + struct kfd_process_device *pdd; + int r = 0; + unsigned int n_evicted = 0; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm, + &pdd->qpd); + if (r) { + pr_err("Failed to evict process queues\n"); + goto fail; + } + n_evicted++; + } + + return r; + +fail: + /* To keep state consistent, roll back partial eviction by + * restoring queues + */ + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + if (n_evicted == 0) + break; + if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, + &pdd->qpd)) + pr_err("Failed to restore queues\n"); + + n_evicted--; + } + + return r; +} + +/* process_restore_queues - Restore all user queues of a process */ +int kfd_process_restore_queues(struct kfd_process *p) +{ + struct kfd_process_device *pdd; + int r, ret = 0; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, + &pdd->qpd); + if (r) { + pr_err("Failed to restore process queues\n"); + if (!ret) + ret = r; + } + } + + return ret; +} + +static void evict_process_worker(struct work_struct *work) +{ + int ret; + struct kfd_process *p; + struct delayed_work *dwork; + + dwork = to_delayed_work(work); + + /* Process termination destroys this worker thread. So during the + * lifetime of this thread, kfd_process p will be valid + */ + p = container_of(dwork, struct kfd_process, eviction_work); + WARN_ONCE(p->last_eviction_seqno != p->ef->seqno, + "Eviction fence mismatch\n"); + + /* Narrow window of overlap between restore and evict work + * item is possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos + * unreserves KFD BOs, it is possible to evicted again. But + * restore has few more steps of finish. So lets wait for any + * previous restore work to complete + */ + flush_delayed_work(&p->restore_work); + + pr_debug("Started evicting pasid %d\n", p->pasid); + ret = kfd_process_evict_queues(p); + if (!ret) { + dma_fence_signal(p->ef); + dma_fence_put(p->ef); + p->ef = NULL; + queue_delayed_work(kfd_restore_wq, &p->restore_work, + msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); + + pr_debug("Finished evicting pasid %d\n", p->pasid); + } else + pr_err("Failed to evict queues of pasid %d\n", p->pasid); +} + +static void restore_process_worker(struct work_struct *work) +{ + struct delayed_work *dwork; + struct kfd_process *p; + struct kfd_process_device *pdd; + int ret = 0; + + dwork = to_delayed_work(work); + + /* Process termination destroys this worker thread. So during the + * lifetime of this thread, kfd_process p will be valid + */ + p = container_of(dwork, struct kfd_process, restore_work); + + /* Call restore_process_bos on the first KGD device. This function + * takes care of restoring the whole process including other devices. + * Restore can fail if enough memory is not available. If so, + * reschedule again. + */ + pdd = list_first_entry(&p->per_device_data, + struct kfd_process_device, + per_device_list); + + pr_debug("Started restoring pasid %d\n", p->pasid); + + /* Setting last_restore_timestamp before successful restoration. + * Otherwise this would have to be set by KGD (restore_process_bos) + * before KFD BOs are unreserved. If not, the process can be evicted + * again before the timestamp is set. + * If restore fails, the timestamp will be set again in the next + * attempt. 
This would mean that the minimum GPU quanta would be + * PROCESS_ACTIVE_TIME_MS - (time to execute the following two + * functions) + */ + + p->last_restore_timestamp = get_jiffies_64(); + ret = pdd->dev->kfd2kgd->restore_process_bos(p->kgd_process_info, + &p->ef); + if (ret) { + pr_debug("Failed to restore BOs of pasid %d, retry after %d ms\n", + p->pasid, PROCESS_BACK_OFF_TIME_MS); + ret = queue_delayed_work(kfd_restore_wq, &p->restore_work, + msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); + WARN(!ret, "reschedule restore work failed\n"); + return; + } + + ret = kfd_process_restore_queues(p); + if (!ret) + pr_debug("Finished restoring pasid %d\n", p->pasid); + else + pr_err("Failed to restore queues of pasid %d\n", p->pasid); +} + +void kfd_suspend_all_processes(void) +{ + struct kfd_process *p; + unsigned int temp; + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + cancel_delayed_work_sync(&p->eviction_work); + cancel_delayed_work_sync(&p->restore_work); + + if (kfd_process_evict_queues(p)) + pr_err("Failed to suspend process %d\n", p->pasid); + dma_fence_signal(p->ef); + dma_fence_put(p->ef); + p->ef = NULL; + } + srcu_read_unlock(&kfd_processes_srcu, idx); +} + +int kfd_resume_all_processes(void) +{ + struct kfd_process *p; + unsigned int temp; + int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) { + pr_err("Restore process %d failed during resume\n", + p->pasid); + ret = -EFAULT; + } + } + srcu_read_unlock(&kfd_processes_srcu, idx); + return ret; +} + +int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, + struct vm_area_struct *vma) +{ + struct kfd_process_device *pdd; + struct qcm_process_device *qpd; + + if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) { + pr_err("Incorrect CWSR mapping size.\n"); + return -EINVAL; + } + + pdd = kfd_get_process_device_data(dev, process); + if (!pdd) + return -EINVAL; + qpd = &pdd->qpd; + + qpd->cwsr_kaddr = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(KFD_CWSR_TBA_TMA_SIZE)); + if (!qpd->cwsr_kaddr) { + pr_err("Error allocating per process CWSR buffer.\n"); + return -ENOMEM; + } + + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND + | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; + /* Mapping pages to user process */ + return remap_pfn_range(vma, vma->vm_start, + PFN_DOWN(__pa(qpd->cwsr_kaddr)), + KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot); +} + +void kfd_flush_tlb(struct kfd_process_device *pdd) +{ + struct kfd_dev *dev = pdd->dev; + const struct kfd2kgd_calls *f2g = dev->kfd2kgd; + + if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) { + /* Nothing to flush until a VMID is assigned, which + * only happens when the first queue is created. 
+ */ + if (pdd->qpd.vmid) + f2g->invalidate_tlbs_vmid(dev->kgd, pdd->qpd.vmid); + } else { + f2g->invalidate_tlbs(dev->kgd, pdd->process->pasid); + } +} + +#if defined(CONFIG_DEBUG_FS) + +int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) +{ + struct kfd_process *p; + unsigned int temp; + int r = 0; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + seq_printf(m, "Process %d PASID %d:\n", + p->lead_thread->tgid, p->pasid); + + mutex_lock(&p->mutex); + r = pqm_debugfs_mqds(m, &p->pqm); + mutex_unlock(&p->mutex); + + if (r) + break; + } + + srcu_read_unlock(&kfd_processes_srcu, idx); + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c new file mode 100644 index 000000000..c8cad9c07 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -0,0 +1,474 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include <linux/slab.h> +#include <linux/list.h> +#include "kfd_device_queue_manager.h" +#include "kfd_priv.h" +#include "kfd_kernel_queue.h" + +static inline struct process_queue_node *get_queue_by_qid( + struct process_queue_manager *pqm, unsigned int qid) +{ + struct process_queue_node *pqn; + + list_for_each_entry(pqn, &pqm->queues, process_queue_list) { + if ((pqn->q && pqn->q->properties.queue_id == qid) || + (pqn->kq && pqn->kq->queue->properties.queue_id == qid)) + return pqn; + } + + return NULL; +} + +static int find_available_queue_slot(struct process_queue_manager *pqm, + unsigned int *qid) +{ + unsigned long found; + + found = find_first_zero_bit(pqm->queue_slot_bitmap, + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); + + pr_debug("The new slot id %lu\n", found); + + if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { + pr_info("Cannot open more queues for process with pasid %d\n", + pqm->process->pasid); + return -ENOMEM; + } + + set_bit(found, pqm->queue_slot_bitmap); + *qid = found; + + return 0; +} + +void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) +{ + struct kfd_dev *dev = pdd->dev; + + if (pdd->already_dequeued) + return; + + dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd); + pdd->already_dequeued = true; +} + +void kfd_process_dequeue_from_all_devices(struct kfd_process *p) +{ + struct kfd_process_device *pdd; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) + kfd_process_dequeue_from_device(pdd); +} + +int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p) +{ + INIT_LIST_HEAD(&pqm->queues); + pqm->queue_slot_bitmap = + kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, + BITS_PER_BYTE), GFP_KERNEL); + if (!pqm->queue_slot_bitmap) + return -ENOMEM; + pqm->process = p; + + return 0; +} + +void pqm_uninit(struct process_queue_manager *pqm) +{ + struct process_queue_node *pqn, *next; + + list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { + uninit_queue(pqn->q); + list_del(&pqn->process_queue_list); + kfree(pqn); + } + + kfree(pqm->queue_slot_bitmap); + pqm->queue_slot_bitmap = NULL; +} + +static int create_cp_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, struct queue **q, + struct queue_properties *q_properties, + struct file *f, unsigned int qid) +{ + int retval; + + /* Doorbell initialized in user space*/ + q_properties->doorbell_ptr = NULL; + + /* let DQM handle it*/ + q_properties->vmid = 0; + q_properties->queue_id = qid; + + retval = init_queue(q, q_properties); + if (retval != 0) + return retval; + + (*q)->device = dev; + (*q)->process = pqm->process; + + pr_debug("PQM After init queue"); + + return retval; +} + +int pqm_create_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, + struct file *f, + struct queue_properties *properties, + unsigned int *qid) +{ + int retval; + struct kfd_process_device *pdd; + struct queue *q; + struct process_queue_node *pqn; + struct kernel_queue *kq; + enum kfd_queue_type type = properties->type; + unsigned int max_queues = 127; /* HWS limit */ + + q = NULL; + kq = NULL; + + pdd = kfd_get_process_device_data(dev, pqm->process); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + return -1; + } + + /* + * for debug process, verify that it is within the static queues limit + * currently limit is set to half of the total avail HQD slots + * If we are just about to create DIQ, the is_debug flag is not set yet + * Hence we also check the type as well + */ + if ((pdd->qpd.is_debug) || (type == 
KFD_QUEUE_TYPE_DIQ)) + max_queues = dev->device_info->max_no_of_hqd/2; + + if (pdd->qpd.queue_count >= max_queues) + return -ENOSPC; + + retval = find_available_queue_slot(pqm, qid); + if (retval != 0) + return retval; + + if (list_empty(&pdd->qpd.queues_list) && + list_empty(&pdd->qpd.priv_queue_list)) + dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); + + pqn = kzalloc(sizeof(*pqn), GFP_KERNEL); + if (!pqn) { + retval = -ENOMEM; + goto err_allocate_pqn; + } + + switch (type) { + case KFD_QUEUE_TYPE_SDMA: + if (dev->dqm->queue_count >= get_num_sdma_queues(dev->dqm)) { + pr_err("Over-subscription is not allowed for SDMA.\n"); + retval = -EPERM; + goto err_create_queue; + } + + retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); + if (retval != 0) + goto err_create_queue; + pqn->q = q; + pqn->kq = NULL; + retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd); + pr_debug("DQM returned %d for create_queue\n", retval); + print_queue(q); + break; + + case KFD_QUEUE_TYPE_COMPUTE: + /* check if there is over subscription */ + if ((dev->dqm->sched_policy == + KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && + ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) || + (dev->dqm->queue_count >= get_queues_num(dev->dqm)))) { + pr_debug("Over-subscription is not allowed when amdkfd.sched_policy == 1\n"); + retval = -EPERM; + goto err_create_queue; + } + + retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); + if (retval != 0) + goto err_create_queue; + pqn->q = q; + pqn->kq = NULL; + retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd); + pr_debug("DQM returned %d for create_queue\n", retval); + print_queue(q); + break; + case KFD_QUEUE_TYPE_DIQ: + kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_DIQ); + if (!kq) { + retval = -ENOMEM; + goto err_create_queue; + } + kq->queue->properties.queue_id = *qid; + pqn->kq = kq; + pqn->q = NULL; + retval = dev->dqm->ops.create_kernel_queue(dev->dqm, + kq, &pdd->qpd); + break; + default: + WARN(1, "Invalid queue type %d", type); + retval = -EINVAL; + } + + if (retval != 0) { + pr_err("Pasid %d DQM create queue %d failed. ret %d\n", + pqm->process->pasid, type, retval); + goto err_create_queue; + } + + if (q) + /* Return the doorbell offset within the doorbell page + * to the caller so it can be passed up to user mode + * (in bytes). 
+ */ + properties->doorbell_off = + (q->properties.doorbell_off * sizeof(uint32_t)) & + (kfd_doorbell_process_slice(dev) - 1); + + pr_debug("PQM After DQM create queue\n"); + + list_add(&pqn->process_queue_list, &pqm->queues); + + if (q) { + pr_debug("PQM done creating queue\n"); + print_queue_properties(&q->properties); + } + + return retval; + +err_create_queue: + kfree(pqn); +err_allocate_pqn: + /* check if queues list is empty unregister process from device */ + clear_bit(*qid, pqm->queue_slot_bitmap); + if (list_empty(&pdd->qpd.queues_list) && + list_empty(&pdd->qpd.priv_queue_list)) + dev->dqm->ops.unregister_process(dev->dqm, &pdd->qpd); + return retval; +} + +int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) +{ + struct process_queue_node *pqn; + struct kfd_process_device *pdd; + struct device_queue_manager *dqm; + struct kfd_dev *dev; + int retval; + + dqm = NULL; + + retval = 0; + + pqn = get_queue_by_qid(pqm, qid); + if (!pqn) { + pr_err("Queue id does not match any known queue\n"); + return -EINVAL; + } + + dev = NULL; + if (pqn->kq) + dev = pqn->kq->dev; + if (pqn->q) + dev = pqn->q->device; + if (WARN_ON(!dev)) + return -ENODEV; + + pdd = kfd_get_process_device_data(dev, pqm->process); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + return -1; + } + + if (pqn->kq) { + /* destroy kernel queue (DIQ) */ + dqm = pqn->kq->dev->dqm; + dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd); + kernel_queue_uninit(pqn->kq); + } + + if (pqn->q) { + dqm = pqn->q->device->dqm; + retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); + if (retval) { + pr_err("Pasid %d destroy queue %d failed, ret %d\n", + pqm->process->pasid, + pqn->q->properties.queue_id, retval); + if (retval != -ETIME) + goto err_destroy_queue; + } + kfree(pqn->q->properties.cu_mask); + pqn->q->properties.cu_mask = NULL; + uninit_queue(pqn->q); + } + + list_del(&pqn->process_queue_list); + kfree(pqn); + clear_bit(qid, pqm->queue_slot_bitmap); + + if (list_empty(&pdd->qpd.queues_list) && + list_empty(&pdd->qpd.priv_queue_list)) + dqm->ops.unregister_process(dqm, &pdd->qpd); + +err_destroy_queue: + return retval; +} + +int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p) +{ + int retval; + struct process_queue_node *pqn; + + pqn = get_queue_by_qid(pqm, qid); + if (!pqn) { + pr_debug("No queue %d exists for update operation\n", qid); + return -EFAULT; + } + + pqn->q->properties.queue_address = p->queue_address; + pqn->q->properties.queue_size = p->queue_size; + pqn->q->properties.queue_percent = p->queue_percent; + pqn->q->properties.priority = p->priority; + + retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, + pqn->q); + if (retval != 0) + return retval; + + return 0; +} + +int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p) +{ + int retval; + struct process_queue_node *pqn; + + pqn = get_queue_by_qid(pqm, qid); + if (!pqn) { + pr_debug("No queue %d exists for update operation\n", qid); + return -EFAULT; + } + + /* Free the old CU mask memory if it is already allocated, then + * allocate memory for the new CU mask. 
+ */ + kfree(pqn->q->properties.cu_mask); + + pqn->q->properties.cu_mask_count = p->cu_mask_count; + pqn->q->properties.cu_mask = p->cu_mask; + + retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, + pqn->q); + if (retval != 0) + return retval; + + return 0; +} + +struct kernel_queue *pqm_get_kernel_queue( + struct process_queue_manager *pqm, + unsigned int qid) +{ + struct process_queue_node *pqn; + + pqn = get_queue_by_qid(pqm, qid); + if (pqn && pqn->kq) + return pqn->kq; + + return NULL; +} + +#if defined(CONFIG_DEBUG_FS) + +int pqm_debugfs_mqds(struct seq_file *m, void *data) +{ + struct process_queue_manager *pqm = data; + struct process_queue_node *pqn; + struct queue *q; + enum KFD_MQD_TYPE mqd_type; + struct mqd_manager *mqd_mgr; + int r = 0; + + list_for_each_entry(pqn, &pqm->queues, process_queue_list) { + if (pqn->q) { + q = pqn->q; + switch (q->properties.type) { + case KFD_QUEUE_TYPE_SDMA: + seq_printf(m, " SDMA queue on device %x\n", + q->device->id); + mqd_type = KFD_MQD_TYPE_SDMA; + break; + case KFD_QUEUE_TYPE_COMPUTE: + seq_printf(m, " Compute queue on device %x\n", + q->device->id); + mqd_type = KFD_MQD_TYPE_CP; + break; + default: + seq_printf(m, + " Bad user queue type %d on device %x\n", + q->properties.type, q->device->id); + continue; + } + mqd_mgr = q->device->dqm->ops.get_mqd_manager( + q->device->dqm, mqd_type); + } else if (pqn->kq) { + q = pqn->kq->queue; + mqd_mgr = pqn->kq->mqd_mgr; + switch (q->properties.type) { + case KFD_QUEUE_TYPE_DIQ: + seq_printf(m, " DIQ on device %x\n", + pqn->kq->dev->id); + mqd_type = KFD_MQD_TYPE_HIQ; + break; + default: + seq_printf(m, + " Bad kernel queue type %d on device %x\n", + q->properties.type, + pqn->kq->dev->id); + continue; + } + } else { + seq_printf(m, + " Weird: Queue node with neither kernel nor user queue\n"); + continue; + } + + r = mqd_mgr->debugfs_show_mqd(m, q->mqd); + if (r != 0) + break; + } + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c new file mode 100644 index 000000000..6dcd621e5 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -0,0 +1,83 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include <linux/slab.h> +#include "kfd_priv.h" + +void print_queue_properties(struct queue_properties *q) +{ + if (!q) + return; + + pr_debug("Printing queue properties:\n"); + pr_debug("Queue Type: %u\n", q->type); + pr_debug("Queue Size: %llu\n", q->queue_size); + pr_debug("Queue percent: %u\n", q->queue_percent); + pr_debug("Queue Address: 0x%llX\n", q->queue_address); + pr_debug("Queue Id: %u\n", q->queue_id); + pr_debug("Queue Process Vmid: %u\n", q->vmid); + pr_debug("Queue Read Pointer: 0x%px\n", q->read_ptr); + pr_debug("Queue Write Pointer: 0x%px\n", q->write_ptr); + pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr); + pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off); +} + +void print_queue(struct queue *q) +{ + if (!q) + return; + pr_debug("Printing queue:\n"); + pr_debug("Queue Type: %u\n", q->properties.type); + pr_debug("Queue Size: %llu\n", q->properties.queue_size); + pr_debug("Queue percent: %u\n", q->properties.queue_percent); + pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address); + pr_debug("Queue Id: %u\n", q->properties.queue_id); + pr_debug("Queue Process Vmid: %u\n", q->properties.vmid); + pr_debug("Queue Read Pointer: 0x%px\n", q->properties.read_ptr); + pr_debug("Queue Write Pointer: 0x%px\n", q->properties.write_ptr); + pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr); + pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off); + pr_debug("Queue MQD Address: 0x%p\n", q->mqd); + pr_debug("Queue MQD Gart: 0x%llX\n", q->gart_mqd_addr); + pr_debug("Queue Process Address: 0x%p\n", q->process); + pr_debug("Queue Device Address: 0x%p\n", q->device); +} + +int init_queue(struct queue **q, const struct queue_properties *properties) +{ + struct queue *tmp_q; + + tmp_q = kzalloc(sizeof(*tmp_q), GFP_KERNEL); + if (!tmp_q) + return -ENOMEM; + + memcpy(&tmp_q->properties, properties, sizeof(*properties)); + + *q = tmp_q; + return 0; +} + +void uninit_queue(struct queue *q) +{ + kfree(q); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c new file mode 100644 index 000000000..5cf499a07 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -0,0 +1,1441 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/errno.h> +#include <linux/acpi.h> +#include <linux/hash.h> +#include <linux/cpufreq.h> +#include <linux/log2.h> +#include <linux/dmi.h> +#include <linux/atomic.h> + +#include "kfd_priv.h" +#include "kfd_crat.h" +#include "kfd_topology.h" +#include "kfd_device_queue_manager.h" +#include "kfd_iommu.h" + +/* topology_device_list - Master list of all topology devices */ +static struct list_head topology_device_list; +static struct kfd_system_properties sys_props; + +static DECLARE_RWSEM(topology_lock); +static atomic_t topology_crat_proximity_domain; + +struct kfd_topology_device *kfd_topology_device_by_proximity_domain( + uint32_t proximity_domain) +{ + struct kfd_topology_device *top_dev; + struct kfd_topology_device *device = NULL; + + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) + if (top_dev->proximity_domain == proximity_domain) { + device = top_dev; + break; + } + + up_read(&topology_lock); + + return device; +} + +struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id) +{ + struct kfd_topology_device *top_dev = NULL; + struct kfd_topology_device *ret = NULL; + + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) + if (top_dev->gpu_id == gpu_id) { + ret = top_dev; + break; + } + + up_read(&topology_lock); + + return ret; +} + +struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) +{ + struct kfd_topology_device *top_dev; + + top_dev = kfd_topology_device_by_id(gpu_id); + if (!top_dev) + return NULL; + + return top_dev->gpu; +} + +struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) +{ + struct kfd_topology_device *top_dev; + struct kfd_dev *device = NULL; + + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) + if (top_dev->gpu->pdev == pdev) { + device = top_dev->gpu; + break; + } + + up_read(&topology_lock); + + return device; +} + +/* Called with write topology_lock acquired */ +static void kfd_release_topology_device(struct kfd_topology_device *dev) +{ + struct kfd_mem_properties *mem; + struct kfd_cache_properties *cache; + struct kfd_iolink_properties *iolink; + struct kfd_perf_properties *perf; + + list_del(&dev->list); + + while (dev->mem_props.next != &dev->mem_props) { + mem = container_of(dev->mem_props.next, + struct kfd_mem_properties, list); + list_del(&mem->list); + kfree(mem); + } + + while (dev->cache_props.next != &dev->cache_props) { + cache = container_of(dev->cache_props.next, + struct kfd_cache_properties, list); + list_del(&cache->list); + kfree(cache); + } + + while (dev->io_link_props.next != &dev->io_link_props) { + iolink = container_of(dev->io_link_props.next, + struct kfd_iolink_properties, list); + list_del(&iolink->list); + kfree(iolink); + } + + while (dev->perf_props.next != &dev->perf_props) { + perf = container_of(dev->perf_props.next, + struct kfd_perf_properties, list); + list_del(&perf->list); + kfree(perf); + } + + kfree(dev); +} + +void kfd_release_topology_device_list(struct list_head *device_list) +{ + struct kfd_topology_device *dev; + + while (!list_empty(device_list)) { + dev = list_first_entry(device_list, + struct kfd_topology_device, list); + kfd_release_topology_device(dev); + } +} + +static void kfd_release_live_view(void) +{ + kfd_release_topology_device_list(&topology_device_list); + memset(&sys_props, 0, sizeof(sys_props)); +} + +struct kfd_topology_device *kfd_create_topology_device( + struct 
list_head *device_list) +{ + struct kfd_topology_device *dev; + + dev = kfd_alloc_struct(dev); + if (!dev) { + pr_err("No memory to allocate a topology device"); + return NULL; + } + + INIT_LIST_HEAD(&dev->mem_props); + INIT_LIST_HEAD(&dev->cache_props); + INIT_LIST_HEAD(&dev->io_link_props); + INIT_LIST_HEAD(&dev->perf_props); + + list_add_tail(&dev->list, device_list); + + return dev; +} + + +#define sysfs_show_gen_prop(buffer, fmt, ...) \ + snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) +#define sysfs_show_32bit_prop(buffer, name, value) \ + sysfs_show_gen_prop(buffer, "%s %u\n", name, value) +#define sysfs_show_64bit_prop(buffer, name, value) \ + sysfs_show_gen_prop(buffer, "%s %llu\n", name, value) +#define sysfs_show_32bit_val(buffer, value) \ + sysfs_show_gen_prop(buffer, "%u\n", value) +#define sysfs_show_str_val(buffer, value) \ + sysfs_show_gen_prop(buffer, "%s\n", value) + +static ssize_t sysprops_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + if (attr == &sys_props.attr_genid) { + ret = sysfs_show_32bit_val(buffer, sys_props.generation_count); + } else if (attr == &sys_props.attr_props) { + sysfs_show_64bit_prop(buffer, "platform_oem", + sys_props.platform_oem); + sysfs_show_64bit_prop(buffer, "platform_id", + sys_props.platform_id); + ret = sysfs_show_64bit_prop(buffer, "platform_rev", + sys_props.platform_rev); + } else { + ret = -EINVAL; + } + + return ret; +} + +static void kfd_topology_kobj_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static const struct sysfs_ops sysprops_ops = { + .show = sysprops_show, +}; + +static struct kobj_type sysprops_type = { + .release = kfd_topology_kobj_release, + .sysfs_ops = &sysprops_ops, +}; + +static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + struct kfd_iolink_properties *iolink; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + iolink = container_of(attr, struct kfd_iolink_properties, attr); + sysfs_show_32bit_prop(buffer, "type", iolink->iolink_type); + sysfs_show_32bit_prop(buffer, "version_major", iolink->ver_maj); + sysfs_show_32bit_prop(buffer, "version_minor", iolink->ver_min); + sysfs_show_32bit_prop(buffer, "node_from", iolink->node_from); + sysfs_show_32bit_prop(buffer, "node_to", iolink->node_to); + sysfs_show_32bit_prop(buffer, "weight", iolink->weight); + sysfs_show_32bit_prop(buffer, "min_latency", iolink->min_latency); + sysfs_show_32bit_prop(buffer, "max_latency", iolink->max_latency); + sysfs_show_32bit_prop(buffer, "min_bandwidth", iolink->min_bandwidth); + sysfs_show_32bit_prop(buffer, "max_bandwidth", iolink->max_bandwidth); + sysfs_show_32bit_prop(buffer, "recommended_transfer_size", + iolink->rec_transfer_size); + ret = sysfs_show_32bit_prop(buffer, "flags", iolink->flags); + + return ret; +} + +static const struct sysfs_ops iolink_ops = { + .show = iolink_show, +}; + +static struct kobj_type iolink_type = { + .release = kfd_topology_kobj_release, + .sysfs_ops = &iolink_ops, +}; + +static ssize_t mem_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + struct kfd_mem_properties *mem; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + mem = container_of(attr, struct kfd_mem_properties, attr); + sysfs_show_32bit_prop(buffer, "heap_type", mem->heap_type); + sysfs_show_64bit_prop(buffer, "size_in_bytes", mem->size_in_bytes); + 
sysfs_show_32bit_prop(buffer, "flags", mem->flags); + sysfs_show_32bit_prop(buffer, "width", mem->width); + ret = sysfs_show_32bit_prop(buffer, "mem_clk_max", mem->mem_clk_max); + + return ret; +} + +static const struct sysfs_ops mem_ops = { + .show = mem_show, +}; + +static struct kobj_type mem_type = { + .release = kfd_topology_kobj_release, + .sysfs_ops = &mem_ops, +}; + +static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + uint32_t i, j; + struct kfd_cache_properties *cache; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + cache = container_of(attr, struct kfd_cache_properties, attr); + sysfs_show_32bit_prop(buffer, "processor_id_low", + cache->processor_id_low); + sysfs_show_32bit_prop(buffer, "level", cache->cache_level); + sysfs_show_32bit_prop(buffer, "size", cache->cache_size); + sysfs_show_32bit_prop(buffer, "cache_line_size", cache->cacheline_size); + sysfs_show_32bit_prop(buffer, "cache_lines_per_tag", + cache->cachelines_per_tag); + sysfs_show_32bit_prop(buffer, "association", cache->cache_assoc); + sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency); + sysfs_show_32bit_prop(buffer, "type", cache->cache_type); + snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer); + for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++) + for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) { + /* Check each bit */ + if (cache->sibling_map[i] & (1 << j)) + ret = snprintf(buffer, PAGE_SIZE, + "%s%d%s", buffer, 1, ","); + else + ret = snprintf(buffer, PAGE_SIZE, + "%s%d%s", buffer, 0, ","); + } + /* Replace the last "," with end of line */ + *(buffer + strlen(buffer) - 1) = 0xA; + return ret; +} + +static const struct sysfs_ops cache_ops = { + .show = kfd_cache_show, +}; + +static struct kobj_type cache_type = { + .release = kfd_topology_kobj_release, + .sysfs_ops = &cache_ops, +}; + +/****** Sysfs of Performance Counters ******/ + +struct kfd_perf_attr { + struct kobj_attribute attr; + uint32_t data; +}; + +static ssize_t perf_show(struct kobject *kobj, struct kobj_attribute *attrs, + char *buf) +{ + struct kfd_perf_attr *attr; + + buf[0] = 0; + attr = container_of(attrs, struct kfd_perf_attr, attr); + if (!attr->data) /* invalid data for PMC */ + return 0; + else + return sysfs_show_32bit_val(buf, attr->data); +} + +#define KFD_PERF_DESC(_name, _data) \ +{ \ + .attr = __ATTR(_name, 0444, perf_show, NULL), \ + .data = _data, \ +} + +static struct kfd_perf_attr perf_attr_iommu[] = { + KFD_PERF_DESC(max_concurrent, 0), + KFD_PERF_DESC(num_counters, 0), + KFD_PERF_DESC(counter_ids, 0), +}; +/****************************************/ + +static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + struct kfd_topology_device *dev; + char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; + uint32_t i; + uint32_t log_max_watch_addr; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + if (strcmp(attr->name, "gpu_id") == 0) { + dev = container_of(attr, struct kfd_topology_device, + attr_gpuid); + return sysfs_show_32bit_val(buffer, dev->gpu_id); + } + + if (strcmp(attr->name, "name") == 0) { + dev = container_of(attr, struct kfd_topology_device, + attr_name); + for (i = 0; i < KFD_TOPOLOGY_PUBLIC_NAME_SIZE; i++) { + public_name[i] = + (char)dev->node_props.marketing_name[i]; + if (dev->node_props.marketing_name[i] == 0) + break; + } + public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE-1] = 0x0; + return sysfs_show_str_val(buffer, public_name); + } + + dev = 
container_of(attr, struct kfd_topology_device, + attr_props); + sysfs_show_32bit_prop(buffer, "cpu_cores_count", + dev->node_props.cpu_cores_count); + sysfs_show_32bit_prop(buffer, "simd_count", + dev->node_props.simd_count); + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->node_props.mem_banks_count); + sysfs_show_32bit_prop(buffer, "caches_count", + dev->node_props.caches_count); + sysfs_show_32bit_prop(buffer, "io_links_count", + dev->node_props.io_links_count); + sysfs_show_32bit_prop(buffer, "cpu_core_id_base", + dev->node_props.cpu_core_id_base); + sysfs_show_32bit_prop(buffer, "simd_id_base", + dev->node_props.simd_id_base); + sysfs_show_32bit_prop(buffer, "max_waves_per_simd", + dev->node_props.max_waves_per_simd); + sysfs_show_32bit_prop(buffer, "lds_size_in_kb", + dev->node_props.lds_size_in_kb); + sysfs_show_32bit_prop(buffer, "gds_size_in_kb", + dev->node_props.gds_size_in_kb); + sysfs_show_32bit_prop(buffer, "wave_front_size", + dev->node_props.wave_front_size); + sysfs_show_32bit_prop(buffer, "array_count", + dev->node_props.array_count); + sysfs_show_32bit_prop(buffer, "simd_arrays_per_engine", + dev->node_props.simd_arrays_per_engine); + sysfs_show_32bit_prop(buffer, "cu_per_simd_array", + dev->node_props.cu_per_simd_array); + sysfs_show_32bit_prop(buffer, "simd_per_cu", + dev->node_props.simd_per_cu); + sysfs_show_32bit_prop(buffer, "max_slots_scratch_cu", + dev->node_props.max_slots_scratch_cu); + sysfs_show_32bit_prop(buffer, "vendor_id", + dev->node_props.vendor_id); + sysfs_show_32bit_prop(buffer, "device_id", + dev->node_props.device_id); + sysfs_show_32bit_prop(buffer, "location_id", + dev->node_props.location_id); + sysfs_show_32bit_prop(buffer, "drm_render_minor", + dev->node_props.drm_render_minor); + + if (dev->gpu) { + log_max_watch_addr = + __ilog2_u32(dev->gpu->device_info->num_of_watch_points); + + if (log_max_watch_addr) { + dev->node_props.capability |= + HSA_CAP_WATCH_POINTS_SUPPORTED; + + dev->node_props.capability |= + ((log_max_watch_addr << + HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT) & + HSA_CAP_WATCH_POINTS_TOTALBITS_MASK); + } + + if (dev->gpu->device_info->asic_family == CHIP_TONGA) + dev->node_props.capability |= + HSA_CAP_AQL_QUEUE_DOUBLE_MAP; + + sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", + dev->node_props.max_engine_clk_fcompute); + + sysfs_show_64bit_prop(buffer, "local_mem_size", + (unsigned long long int) 0); + + sysfs_show_32bit_prop(buffer, "fw_version", + dev->gpu->kfd2kgd->get_fw_version( + dev->gpu->kgd, + KGD_ENGINE_MEC1)); + sysfs_show_32bit_prop(buffer, "capability", + dev->node_props.capability); + } + + return sysfs_show_32bit_prop(buffer, "max_engine_clk_ccompute", + cpufreq_quick_get_max(0)/1000); +} + +static const struct sysfs_ops node_ops = { + .show = node_show, +}; + +static struct kobj_type node_type = { + .release = kfd_topology_kobj_release, + .sysfs_ops = &node_ops, +}; + +static void kfd_remove_sysfs_file(struct kobject *kobj, struct attribute *attr) +{ + sysfs_remove_file(kobj, attr); + kobject_del(kobj); + kobject_put(kobj); +} + +static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) +{ + struct kfd_iolink_properties *iolink; + struct kfd_cache_properties *cache; + struct kfd_mem_properties *mem; + struct kfd_perf_properties *perf; + + if (dev->kobj_iolink) { + list_for_each_entry(iolink, &dev->io_link_props, list) + if (iolink->kobj) { + kfd_remove_sysfs_file(iolink->kobj, + &iolink->attr); + iolink->kobj = NULL; + } + kobject_del(dev->kobj_iolink); + 
kobject_put(dev->kobj_iolink); + dev->kobj_iolink = NULL; + } + + if (dev->kobj_cache) { + list_for_each_entry(cache, &dev->cache_props, list) + if (cache->kobj) { + kfd_remove_sysfs_file(cache->kobj, + &cache->attr); + cache->kobj = NULL; + } + kobject_del(dev->kobj_cache); + kobject_put(dev->kobj_cache); + dev->kobj_cache = NULL; + } + + if (dev->kobj_mem) { + list_for_each_entry(mem, &dev->mem_props, list) + if (mem->kobj) { + kfd_remove_sysfs_file(mem->kobj, &mem->attr); + mem->kobj = NULL; + } + kobject_del(dev->kobj_mem); + kobject_put(dev->kobj_mem); + dev->kobj_mem = NULL; + } + + if (dev->kobj_perf) { + list_for_each_entry(perf, &dev->perf_props, list) { + kfree(perf->attr_group); + perf->attr_group = NULL; + } + kobject_del(dev->kobj_perf); + kobject_put(dev->kobj_perf); + dev->kobj_perf = NULL; + } + + if (dev->kobj_node) { + sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid); + sysfs_remove_file(dev->kobj_node, &dev->attr_name); + sysfs_remove_file(dev->kobj_node, &dev->attr_props); + kobject_del(dev->kobj_node); + kobject_put(dev->kobj_node); + dev->kobj_node = NULL; + } +} + +static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, + uint32_t id) +{ + struct kfd_iolink_properties *iolink; + struct kfd_cache_properties *cache; + struct kfd_mem_properties *mem; + struct kfd_perf_properties *perf; + int ret; + uint32_t i, num_attrs; + struct attribute **attrs; + + if (WARN_ON(dev->kobj_node)) + return -EEXIST; + + /* + * Creating the sysfs folders + */ + dev->kobj_node = kfd_alloc_struct(dev->kobj_node); + if (!dev->kobj_node) + return -ENOMEM; + + ret = kobject_init_and_add(dev->kobj_node, &node_type, + sys_props.kobj_nodes, "%d", id); + if (ret < 0) { + kobject_put(dev->kobj_node); + return ret; + } + + dev->kobj_mem = kobject_create_and_add("mem_banks", dev->kobj_node); + if (!dev->kobj_mem) + return -ENOMEM; + + dev->kobj_cache = kobject_create_and_add("caches", dev->kobj_node); + if (!dev->kobj_cache) + return -ENOMEM; + + dev->kobj_iolink = kobject_create_and_add("io_links", dev->kobj_node); + if (!dev->kobj_iolink) + return -ENOMEM; + + dev->kobj_perf = kobject_create_and_add("perf", dev->kobj_node); + if (!dev->kobj_perf) + return -ENOMEM; + + /* + * Creating sysfs files for node properties + */ + dev->attr_gpuid.name = "gpu_id"; + dev->attr_gpuid.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&dev->attr_gpuid); + dev->attr_name.name = "name"; + dev->attr_name.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&dev->attr_name); + dev->attr_props.name = "properties"; + dev->attr_props.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&dev->attr_props); + ret = sysfs_create_file(dev->kobj_node, &dev->attr_gpuid); + if (ret < 0) + return ret; + ret = sysfs_create_file(dev->kobj_node, &dev->attr_name); + if (ret < 0) + return ret; + ret = sysfs_create_file(dev->kobj_node, &dev->attr_props); + if (ret < 0) + return ret; + + i = 0; + list_for_each_entry(mem, &dev->mem_props, list) { + mem->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!mem->kobj) + return -ENOMEM; + ret = kobject_init_and_add(mem->kobj, &mem_type, + dev->kobj_mem, "%d", i); + if (ret < 0) { + kobject_put(mem->kobj); + return ret; + } + + mem->attr.name = "properties"; + mem->attr.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&mem->attr); + ret = sysfs_create_file(mem->kobj, &mem->attr); + if (ret < 0) + return ret; + i++; + } + + i = 0; + list_for_each_entry(cache, &dev->cache_props, list) { + cache->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!cache->kobj) + return -ENOMEM; + 
ret = kobject_init_and_add(cache->kobj, &cache_type, + dev->kobj_cache, "%d", i); + if (ret < 0) { + kobject_put(cache->kobj); + return ret; + } + + cache->attr.name = "properties"; + cache->attr.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&cache->attr); + ret = sysfs_create_file(cache->kobj, &cache->attr); + if (ret < 0) + return ret; + i++; + } + + i = 0; + list_for_each_entry(iolink, &dev->io_link_props, list) { + iolink->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!iolink->kobj) + return -ENOMEM; + ret = kobject_init_and_add(iolink->kobj, &iolink_type, + dev->kobj_iolink, "%d", i); + if (ret < 0) { + kobject_put(iolink->kobj); + return ret; + } + + iolink->attr.name = "properties"; + iolink->attr.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&iolink->attr); + ret = sysfs_create_file(iolink->kobj, &iolink->attr); + if (ret < 0) + return ret; + i++; + } + + /* All hardware blocks have the same number of attributes. */ + num_attrs = ARRAY_SIZE(perf_attr_iommu); + list_for_each_entry(perf, &dev->perf_props, list) { + perf->attr_group = kzalloc(sizeof(struct kfd_perf_attr) + * num_attrs + sizeof(struct attribute_group), + GFP_KERNEL); + if (!perf->attr_group) + return -ENOMEM; + + attrs = (struct attribute **)(perf->attr_group + 1); + if (!strcmp(perf->block_name, "iommu")) { + /* Information of IOMMU's num_counters and counter_ids is shown + * under /sys/bus/event_source/devices/amd_iommu. We don't + * duplicate here. + */ + perf_attr_iommu[0].data = perf->max_concurrent; + for (i = 0; i < num_attrs; i++) + attrs[i] = &perf_attr_iommu[i].attr.attr; + } + perf->attr_group->name = perf->block_name; + perf->attr_group->attrs = attrs; + ret = sysfs_create_group(dev->kobj_perf, perf->attr_group); + if (ret < 0) + return ret; + } + + return 0; +} + +/* Called with write topology lock acquired */ +static int kfd_build_sysfs_node_tree(void) +{ + struct kfd_topology_device *dev; + int ret; + uint32_t i = 0; + + list_for_each_entry(dev, &topology_device_list, list) { + ret = kfd_build_sysfs_node_entry(dev, i); + if (ret < 0) + return ret; + i++; + } + + return 0; +} + +/* Called with write topology lock acquired */ +static void kfd_remove_sysfs_node_tree(void) +{ + struct kfd_topology_device *dev; + + list_for_each_entry(dev, &topology_device_list, list) + kfd_remove_sysfs_node_entry(dev); +} + +static int kfd_topology_update_sysfs(void) +{ + int ret; + + pr_info("Creating topology SYSFS entries\n"); + if (!sys_props.kobj_topology) { + sys_props.kobj_topology = + kfd_alloc_struct(sys_props.kobj_topology); + if (!sys_props.kobj_topology) + return -ENOMEM; + + ret = kobject_init_and_add(sys_props.kobj_topology, + &sysprops_type, &kfd_device->kobj, + "topology"); + if (ret < 0) { + kobject_put(sys_props.kobj_topology); + return ret; + } + + sys_props.kobj_nodes = kobject_create_and_add("nodes", + sys_props.kobj_topology); + if (!sys_props.kobj_nodes) + return -ENOMEM; + + sys_props.attr_genid.name = "generation_id"; + sys_props.attr_genid.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&sys_props.attr_genid); + ret = sysfs_create_file(sys_props.kobj_topology, + &sys_props.attr_genid); + if (ret < 0) + return ret; + + sys_props.attr_props.name = "system_properties"; + sys_props.attr_props.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&sys_props.attr_props); + ret = sysfs_create_file(sys_props.kobj_topology, + &sys_props.attr_props); + if (ret < 0) + return ret; + } + + kfd_remove_sysfs_node_tree(); + + return kfd_build_sysfs_node_tree(); +} + +static void kfd_topology_release_sysfs(void) 
+{ + kfd_remove_sysfs_node_tree(); + if (sys_props.kobj_topology) { + sysfs_remove_file(sys_props.kobj_topology, + &sys_props.attr_genid); + sysfs_remove_file(sys_props.kobj_topology, + &sys_props.attr_props); + if (sys_props.kobj_nodes) { + kobject_del(sys_props.kobj_nodes); + kobject_put(sys_props.kobj_nodes); + sys_props.kobj_nodes = NULL; + } + kobject_del(sys_props.kobj_topology); + kobject_put(sys_props.kobj_topology); + sys_props.kobj_topology = NULL; + } +} + +/* Called with write topology_lock acquired */ +static void kfd_topology_update_device_list(struct list_head *temp_list, + struct list_head *master_list) +{ + while (!list_empty(temp_list)) { + list_move_tail(temp_list->next, master_list); + sys_props.num_devices++; + } +} + +static void kfd_debug_print_topology(void) +{ + struct kfd_topology_device *dev; + + down_read(&topology_lock); + + dev = list_last_entry(&topology_device_list, + struct kfd_topology_device, list); + if (dev) { + if (dev->node_props.cpu_cores_count && + dev->node_props.simd_count) { + pr_info("Topology: Add APU node [0x%0x:0x%0x]\n", + dev->node_props.device_id, + dev->node_props.vendor_id); + } else if (dev->node_props.cpu_cores_count) + pr_info("Topology: Add CPU node\n"); + else if (dev->node_props.simd_count) + pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n", + dev->node_props.device_id, + dev->node_props.vendor_id); + } + up_read(&topology_lock); +} + +/* Helper function for intializing platform_xx members of + * kfd_system_properties. Uses OEM info from the last CPU/APU node. + */ +static void kfd_update_system_properties(void) +{ + struct kfd_topology_device *dev; + + down_read(&topology_lock); + dev = list_last_entry(&topology_device_list, + struct kfd_topology_device, list); + if (dev) { + sys_props.platform_id = + (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK; + sys_props.platform_oem = *((uint64_t *)dev->oem_table_id); + sys_props.platform_rev = dev->oem_revision; + } + up_read(&topology_lock); +} + +static void find_system_memory(const struct dmi_header *dm, + void *private) +{ + struct kfd_mem_properties *mem; + u16 mem_width, mem_clock; + struct kfd_topology_device *kdev = + (struct kfd_topology_device *)private; + const u8 *dmi_data = (const u8 *)(dm + 1); + + if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) { + mem_width = (u16)(*(const u16 *)(dmi_data + 0x6)); + mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11)); + list_for_each_entry(mem, &kdev->mem_props, list) { + if (mem_width != 0xFFFF && mem_width != 0) + mem->width = mem_width; + if (mem_clock != 0) + mem->mem_clk_max = mem_clock; + } + } +} + +/* + * Performance counters information is not part of CRAT but we would like to + * put them in the sysfs under topology directory for Thunk to get the data. + * This function is called before updating the sysfs. + */ +static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev) +{ + /* These are the only counters supported so far */ + return kfd_iommu_add_perf_counters(kdev); +} + +/* kfd_add_non_crat_information - Add information that is not currently + * defined in CRAT but is necessary for KFD topology + * @dev - topology device to which addition info is added + */ +static void kfd_add_non_crat_information(struct kfd_topology_device *kdev) +{ + /* Check if CPU only node. 
*/ + if (!kdev->gpu) { + /* Add system memory information */ + dmi_walk(find_system_memory, kdev); + } + /* TODO: For GPU node, rearrange code from kfd_topology_add_device */ +} + +/* kfd_is_acpi_crat_invalid - CRAT from ACPI is valid only for AMD APU devices. + * Ignore CRAT for all other devices. AMD APU is identified if both CPU + * and GPU cores are present. + * @device_list - topology device list created by parsing ACPI CRAT table. + * @return - TRUE if invalid, FALSE is valid. + */ +static bool kfd_is_acpi_crat_invalid(struct list_head *device_list) +{ + struct kfd_topology_device *dev; + + list_for_each_entry(dev, device_list, list) { + if (dev->node_props.cpu_cores_count && + dev->node_props.simd_count) + return false; + } + pr_info("Ignoring ACPI CRAT on non-APU system\n"); + return true; +} + +int kfd_topology_init(void) +{ + void *crat_image = NULL; + size_t image_size = 0; + int ret; + struct list_head temp_topology_device_list; + int cpu_only_node = 0; + struct kfd_topology_device *kdev; + int proximity_domain; + + /* topology_device_list - Master list of all topology devices + * temp_topology_device_list - temporary list created while parsing CRAT + * or VCRAT. Once parsing is complete the contents of list is moved to + * topology_device_list + */ + + /* Initialize the head for the both the lists */ + INIT_LIST_HEAD(&topology_device_list); + INIT_LIST_HEAD(&temp_topology_device_list); + init_rwsem(&topology_lock); + + memset(&sys_props, 0, sizeof(sys_props)); + + /* Proximity domains in ACPI CRAT tables start counting at + * 0. The same should be true for virtual CRAT tables created + * at this stage. GPUs added later in kfd_topology_add_device + * use a counter. + */ + proximity_domain = 0; + + /* + * Get the CRAT image from the ACPI. If ACPI doesn't have one + * or if ACPI CRAT is invalid create a virtual CRAT. + * NOTE: The current implementation expects all AMD APUs to have + * CRAT. If no CRAT is available, it is assumed to be a CPU + */ + ret = kfd_create_crat_image_acpi(&crat_image, &image_size); + if (!ret) { + ret = kfd_parse_crat_table(crat_image, + &temp_topology_device_list, + proximity_domain); + if (ret || + kfd_is_acpi_crat_invalid(&temp_topology_device_list)) { + kfd_release_topology_device_list( + &temp_topology_device_list); + kfd_destroy_crat_image(crat_image); + crat_image = NULL; + } + } + + if (!crat_image) { + ret = kfd_create_crat_image_virtual(&crat_image, &image_size, + COMPUTE_UNIT_CPU, NULL, + proximity_domain); + cpu_only_node = 1; + if (ret) { + pr_err("Error creating VCRAT table for CPU\n"); + return ret; + } + + ret = kfd_parse_crat_table(crat_image, + &temp_topology_device_list, + proximity_domain); + if (ret) { + pr_err("Error parsing VCRAT table for CPU\n"); + goto err; + } + } + + kdev = list_first_entry(&temp_topology_device_list, + struct kfd_topology_device, list); + kfd_add_perf_to_topology(kdev); + + down_write(&topology_lock); + kfd_topology_update_device_list(&temp_topology_device_list, + &topology_device_list); + atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1); + ret = kfd_topology_update_sysfs(); + up_write(&topology_lock); + + if (!ret) { + sys_props.generation_count++; + kfd_update_system_properties(); + kfd_debug_print_topology(); + pr_info("Finished initializing topology\n"); + } else + pr_err("Failed to update topology in sysfs ret=%d\n", ret); + + /* For nodes with GPU, this information gets added + * when GPU is detected (kfd_topology_add_device). 
+ */ + if (cpu_only_node) { + /* Add additional information to CPU only node created above */ + down_write(&topology_lock); + kdev = list_first_entry(&topology_device_list, + struct kfd_topology_device, list); + up_write(&topology_lock); + kfd_add_non_crat_information(kdev); + } + +err: + kfd_destroy_crat_image(crat_image); + return ret; +} + +void kfd_topology_shutdown(void) +{ + down_write(&topology_lock); + kfd_topology_release_sysfs(); + kfd_release_live_view(); + up_write(&topology_lock); +} + +static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) +{ + uint32_t hashout; + uint32_t buf[7]; + uint64_t local_mem_size; + int i; + struct kfd_local_mem_info local_mem_info; + + if (!gpu) + return 0; + + gpu->kfd2kgd->get_local_mem_info(gpu->kgd, &local_mem_info); + + local_mem_size = local_mem_info.local_mem_size_private + + local_mem_info.local_mem_size_public; + + buf[0] = gpu->pdev->devfn; + buf[1] = gpu->pdev->subsystem_vendor; + buf[2] = gpu->pdev->subsystem_device; + buf[3] = gpu->pdev->device; + buf[4] = gpu->pdev->bus->number; + buf[5] = lower_32_bits(local_mem_size); + buf[6] = upper_32_bits(local_mem_size); + + for (i = 0, hashout = 0; i < 7; i++) + hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH); + + return hashout; +} +/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If + * the GPU device is not already present in the topology device + * list then return NULL. This means a new topology device has to + * be created for this GPU. + */ +static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) +{ + struct kfd_topology_device *dev; + struct kfd_topology_device *out_dev = NULL; + + down_write(&topology_lock); + list_for_each_entry(dev, &topology_device_list, list) { + /* Discrete GPUs need their own topology device list + * entries. Don't assign them to CPU/APU nodes. + */ + if (!gpu->device_info->needs_iommu_device && + dev->node_props.cpu_cores_count) + continue; + + if (!dev->gpu && (dev->node_props.simd_count > 0)) { + dev->gpu = gpu; + out_dev = dev; + break; + } + } + up_write(&topology_lock); + return out_dev; +} + +static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival) +{ + /* + * TODO: Generate an event for thunk about the arrival/removal + * of the GPU + */ +} + +/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info, + * patch this after CRAT parsing. + */ +static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev) +{ + struct kfd_mem_properties *mem; + struct kfd_local_mem_info local_mem_info; + + if (!dev) + return; + + /* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with + * single bank of VRAM local memory. 
+ * for dGPUs - VCRAT reports only one bank of Local Memory + * for APUs - If CRAT from ACPI reports more than one bank, then + * all the banks will report the same mem_clk_max information + */ + dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, + &local_mem_info); + + list_for_each_entry(mem, &dev->mem_props, list) + mem->mem_clk_max = local_mem_info.mem_clk_max; +} + +static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev) +{ + struct kfd_iolink_properties *link; + + if (!dev || !dev->gpu) + return; + + /* GPU only creates direck links so apply flags setting to all */ + if (dev->gpu->device_info->asic_family == CHIP_HAWAII) + list_for_each_entry(link, &dev->io_link_props, list) + link->flags = CRAT_IOLINK_FLAGS_ENABLED | + CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT | + CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT; +} + +int kfd_topology_add_device(struct kfd_dev *gpu) +{ + uint32_t gpu_id; + struct kfd_topology_device *dev; + struct kfd_cu_info cu_info; + int res = 0; + struct list_head temp_topology_device_list; + void *crat_image = NULL; + size_t image_size = 0; + int proximity_domain; + + INIT_LIST_HEAD(&temp_topology_device_list); + + gpu_id = kfd_generate_gpu_id(gpu); + + pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); + + proximity_domain = atomic_inc_return(&topology_crat_proximity_domain); + + /* Check to see if this gpu device exists in the topology_device_list. + * If so, assign the gpu to that device, + * else create a Virtual CRAT for this gpu device and then parse that + * CRAT to create a new topology device. Once created assign the gpu to + * that topology device + */ + dev = kfd_assign_gpu(gpu); + if (!dev) { + res = kfd_create_crat_image_virtual(&crat_image, &image_size, + COMPUTE_UNIT_GPU, gpu, + proximity_domain); + if (res) { + pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", + gpu_id); + return res; + } + res = kfd_parse_crat_table(crat_image, + &temp_topology_device_list, + proximity_domain); + if (res) { + pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", + gpu_id); + goto err; + } + + down_write(&topology_lock); + kfd_topology_update_device_list(&temp_topology_device_list, + &topology_device_list); + + /* Update the SYSFS tree, since we added another topology + * device + */ + res = kfd_topology_update_sysfs(); + up_write(&topology_lock); + + if (!res) + sys_props.generation_count++; + else + pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. 
res=%d\n", + gpu_id, res); + dev = kfd_assign_gpu(gpu); + if (WARN_ON(!dev)) { + res = -ENODEV; + goto err; + } + } + + dev->gpu_id = gpu_id; + gpu->id = gpu_id; + + /* TODO: Move the following lines to function + * kfd_add_non_crat_information + */ + + /* Fill-in additional information that is not available in CRAT but + * needed for the topology + */ + + dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info); + dev->node_props.simd_arrays_per_engine = + cu_info.num_shader_arrays_per_engine; + + dev->node_props.vendor_id = gpu->pdev->vendor; + dev->node_props.device_id = gpu->pdev->device; + dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number, + gpu->pdev->devfn); + dev->node_props.max_engine_clk_fcompute = + dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd); + dev->node_props.max_engine_clk_ccompute = + cpufreq_quick_get_max(0) / 1000; + dev->node_props.drm_render_minor = + gpu->shared_resources.drm_render_minor; + + kfd_fill_mem_clk_max_info(dev); + kfd_fill_iolink_non_crat_info(dev); + + switch (dev->gpu->device_info->asic_family) { + case CHIP_KAVERI: + case CHIP_HAWAII: + case CHIP_TONGA: + dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 << + HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & + HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); + break; + case CHIP_CARRIZO: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + pr_debug("Adding doorbell packet type capability\n"); + dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 << + HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & + HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); + break; + case CHIP_VEGA10: + case CHIP_RAVEN: + dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << + HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & + HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); + break; + default: + WARN(1, "Unexpected ASIC family %u", + dev->gpu->device_info->asic_family); + } + + /* Fix errors in CZ CRAT. + * simd_count: Carrizo CRAT reports wrong simd_count, probably + * because it doesn't consider masked out CUs + * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd + * capability flag: Carrizo CRAT doesn't report IOMMU flags + */ + if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { + dev->node_props.simd_count = + cu_info.simd_per_cu * cu_info.cu_active_number; + dev->node_props.max_waves_per_simd = 10; + dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + } + + kfd_debug_print_topology(); + + if (!res) + kfd_notify_gpu_change(gpu_id, 1); +err: + kfd_destroy_crat_image(crat_image); + return res; +} + +int kfd_topology_remove_device(struct kfd_dev *gpu) +{ + struct kfd_topology_device *dev, *tmp; + uint32_t gpu_id; + int res = -ENODEV; + + down_write(&topology_lock); + + list_for_each_entry_safe(dev, tmp, &topology_device_list, list) + if (dev->gpu == gpu) { + gpu_id = dev->gpu_id; + kfd_remove_sysfs_node_entry(dev); + kfd_release_topology_device(dev); + sys_props.num_devices--; + res = 0; + if (kfd_topology_update_sysfs() < 0) + kfd_topology_release_sysfs(); + break; + } + + up_write(&topology_lock); + + if (!res) + kfd_notify_gpu_change(gpu_id, 0); + + return res; +} + +/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD + * topology. 
If GPU device is found @idx, then valid kfd_dev pointer is + * returned through @kdev + * Return - 0: On success (@kdev will be NULL for non GPU nodes) + * -1: If end of list + */ +int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev) +{ + + struct kfd_topology_device *top_dev; + uint8_t device_idx = 0; + + *kdev = NULL; + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) { + if (device_idx == idx) { + *kdev = top_dev->gpu; + up_read(&topology_lock); + return 0; + } + + device_idx++; + } + + up_read(&topology_lock); + + return -1; + +} + +static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask) +{ + const struct cpuinfo_x86 *cpuinfo; + int first_cpu_of_numa_node; + + if (!cpumask || cpumask == cpu_none_mask) + return -1; + first_cpu_of_numa_node = cpumask_first(cpumask); + if (first_cpu_of_numa_node >= nr_cpu_ids) + return -1; + cpuinfo = &cpu_data(first_cpu_of_numa_node); + + return cpuinfo->apicid; +} + +/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical processor + * of the given NUMA node (numa_node_id) + * Return -1 on failure + */ +int kfd_numa_node_to_apic_id(int numa_node_id) +{ + if (numa_node_id == -1) { + pr_warn("Invalid NUMA Node. Use online CPU mask\n"); + return kfd_cpumask_to_apic_id(cpu_online_mask); + } + return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id)); +} + +#if defined(CONFIG_DEBUG_FS) + +int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data) +{ + struct kfd_topology_device *dev; + unsigned int i = 0; + int r = 0; + + down_read(&topology_lock); + + list_for_each_entry(dev, &topology_device_list, list) { + if (!dev->gpu) { + i++; + continue; + } + + seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); + r = dqm_debugfs_hqds(m, dev->gpu->dqm); + if (r) + break; + } + + up_read(&topology_lock); + + return r; +} + +int kfd_debugfs_rls_by_device(struct seq_file *m, void *data) +{ + struct kfd_topology_device *dev; + unsigned int i = 0; + int r = 0; + + down_read(&topology_lock); + + list_for_each_entry(dev, &topology_device_list, list) { + if (!dev->gpu) { + i++; + continue; + } + + seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); + r = pm_debugfs_runlist(m, &dev->gpu->dqm->packets); + if (r) + break; + } + + up_read(&topology_lock); + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h new file mode 100644 index 000000000..7d9c3f948 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -0,0 +1,188 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef __KFD_TOPOLOGY_H__ +#define __KFD_TOPOLOGY_H__ + +#include <linux/types.h> +#include <linux/list.h> +#include "kfd_crat.h" + +#define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 128 + +#define HSA_CAP_HOT_PLUGGABLE 0x00000001 +#define HSA_CAP_ATS_PRESENT 0x00000002 +#define HSA_CAP_SHARED_WITH_GRAPHICS 0x00000004 +#define HSA_CAP_QUEUE_SIZE_POW2 0x00000008 +#define HSA_CAP_QUEUE_SIZE_32BIT 0x00000010 +#define HSA_CAP_QUEUE_IDLE_EVENT 0x00000020 +#define HSA_CAP_VA_LIMIT 0x00000040 +#define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080 +#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 +#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 +#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK 0x00003000 +#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12 +#define HSA_CAP_RESERVED 0xffffc000 + +#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 +#define HSA_CAP_DOORBELL_TYPE_1_0 0x1 +#define HSA_CAP_DOORBELL_TYPE_2_0 0x2 +#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 + +struct kfd_node_properties { + uint32_t cpu_cores_count; + uint32_t simd_count; + uint32_t mem_banks_count; + uint32_t caches_count; + uint32_t io_links_count; + uint32_t cpu_core_id_base; + uint32_t simd_id_base; + uint32_t capability; + uint32_t max_waves_per_simd; + uint32_t lds_size_in_kb; + uint32_t gds_size_in_kb; + uint32_t wave_front_size; + uint32_t array_count; + uint32_t simd_arrays_per_engine; + uint32_t cu_per_simd_array; + uint32_t simd_per_cu; + uint32_t max_slots_scratch_cu; + uint32_t engine_id; + uint32_t vendor_id; + uint32_t device_id; + uint32_t location_id; + uint32_t max_engine_clk_fcompute; + uint32_t max_engine_clk_ccompute; + int32_t drm_render_minor; + uint16_t marketing_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; +}; + +#define HSA_MEM_HEAP_TYPE_SYSTEM 0 +#define HSA_MEM_HEAP_TYPE_FB_PUBLIC 1 +#define HSA_MEM_HEAP_TYPE_FB_PRIVATE 2 +#define HSA_MEM_HEAP_TYPE_GPU_GDS 3 +#define HSA_MEM_HEAP_TYPE_GPU_LDS 4 +#define HSA_MEM_HEAP_TYPE_GPU_SCRATCH 5 + +#define HSA_MEM_FLAGS_HOT_PLUGGABLE 0x00000001 +#define HSA_MEM_FLAGS_NON_VOLATILE 0x00000002 +#define HSA_MEM_FLAGS_RESERVED 0xfffffffc + +struct kfd_mem_properties { + struct list_head list; + uint32_t heap_type; + uint64_t size_in_bytes; + uint32_t flags; + uint32_t width; + uint32_t mem_clk_max; + struct kobject *kobj; + struct attribute attr; +}; + +#define HSA_CACHE_TYPE_DATA 0x00000001 +#define HSA_CACHE_TYPE_INSTRUCTION 0x00000002 +#define HSA_CACHE_TYPE_CPU 0x00000004 +#define HSA_CACHE_TYPE_HSACU 0x00000008 +#define HSA_CACHE_TYPE_RESERVED 0xfffffff0 + +struct kfd_cache_properties { + struct list_head list; + uint32_t processor_id_low; + uint32_t cache_level; + uint32_t cache_size; + uint32_t cacheline_size; + uint32_t cachelines_per_tag; + uint32_t cache_assoc; + uint32_t cache_latency; + uint32_t cache_type; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + struct kobject *kobj; + struct attribute attr; +}; + +struct kfd_iolink_properties { + struct list_head list; + uint32_t iolink_type; + uint32_t ver_maj; + uint32_t ver_min; + uint32_t node_from; + uint32_t node_to; + uint32_t weight; + uint32_t min_latency; + uint32_t max_latency; + uint32_t min_bandwidth; + uint32_t max_bandwidth; + uint32_t rec_transfer_size; + uint32_t flags; + struct kobject *kobj; + struct attribute attr; +}; + +struct 
kfd_perf_properties { + struct list_head list; + char block_name[16]; + uint32_t max_concurrent; + struct attribute_group *attr_group; +}; + +struct kfd_topology_device { + struct list_head list; + uint32_t gpu_id; + uint32_t proximity_domain; + struct kfd_node_properties node_props; + struct list_head mem_props; + uint32_t cache_count; + struct list_head cache_props; + uint32_t io_link_count; + struct list_head io_link_props; + struct list_head perf_props; + struct kfd_dev *gpu; + struct kobject *kobj_node; + struct kobject *kobj_mem; + struct kobject *kobj_cache; + struct kobject *kobj_iolink; + struct kobject *kobj_perf; + struct attribute attr_gpuid; + struct attribute attr_name; + struct attribute attr_props; + uint8_t oem_id[CRAT_OEMID_LENGTH]; + uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; + uint32_t oem_revision; +}; + +struct kfd_system_properties { + uint32_t num_devices; /* Number of H-NUMA nodes */ + uint32_t generation_count; + uint64_t platform_oem; + uint64_t platform_id; + uint64_t platform_rev; + struct kobject *kobj_topology; + struct kobject *kobj_nodes; + struct attribute attr_genid; + struct attribute attr_props; +}; + +struct kfd_topology_device *kfd_create_topology_device( + struct list_head *device_list); +void kfd_release_topology_device_list(struct list_head *device_list); + +#endif /* __KFD_TOPOLOGY_H__ */ diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h new file mode 100644 index 000000000..0bc0b25cb --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h @@ -0,0 +1,47 @@ +/* + * Copyright 2016-2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef HSA_SOC15_INT_H_INCLUDED +#define HSA_SOC15_INT_H_INCLUDED + +#include "soc15_ih_clientid.h" + +#define SOC15_INTSRC_CP_END_OF_PIPE 181 +#define SOC15_INTSRC_CP_BAD_OPCODE 183 +#define SOC15_INTSRC_SQ_INTERRUPT_MSG 239 +#define SOC15_INTSRC_VMC_FAULT 0 +#define SOC15_INTSRC_SDMA_TRAP 224 + + +#define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff) +#define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff) +#define SOC15_RING_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 16 & 0xff) +#define SOC15_VMID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 24 & 0xf) +#define SOC15_VMID_TYPE_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 31 & 0x1) +#define SOC15_PASID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[3]) & 0xffff) +#define SOC15_CONTEXT_ID0_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[4])) +#define SOC15_CONTEXT_ID1_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[5])) +#define SOC15_CONTEXT_ID2_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[6])) +#define SOC15_CONTEXT_ID3_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[7])) + +#endif + |
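As an illustration (not part of the patch itself): kfd_topology.c above publishes each node's attributes through sysfs as plain "name value" lines (see node_show() and the sysfs_show_32bit_prop() calls). A minimal user-space sketch of reading them follows; the /sys/class/kfd/kfd/topology path is an assumption based on the usual kfd character-device name and is not defined in this diff.

/* Sketch only: dump the first topology node's "properties" file.
 * The sysfs path is assumed, not taken from this patch; adjust it if
 * the kfd device is registered under a different name.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/class/kfd/kfd/topology/nodes/0/properties";
	char name[64];
	unsigned long long value;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	/* Each line has the form "<property_name> <decimal value>". */
	while (fscanf(f, "%63s %llu", name, &value) == 2)
		printf("%s = %llu\n", name, value);
	fclose(f);
	return 0;
}

This is roughly how the Thunk mentioned in the comments above consumes the topology data exposed by this file.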
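As a second illustration (again not part of the patch): the SOC15_*_FROM_IH_ENTRY macros above unpack fields from the first dwords of an 8-dword interrupt-ring entry. The standalone sketch below mirrors that bit layout on a little-endian host (where le32_to_cpu() is a no-op); all sample values are made up except the source id, which reuses SOC15_INTSRC_CP_END_OF_PIPE (181) from this header.

/* Sketch only: same field layout as the macros above, with fabricated
 * sample values, compiled as an ordinary user-space program.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t entry[8] = { 0 };

	/* dword 0 packs client_id[7:0], source_id[15:8], ring_id[23:16],
	 * vmid[27:24] and vmid_type[31].
	 */
	entry[0] = 0x16 | (181u << 8) | (0u << 16) | (8u << 24);
	entry[3] = 0x0042;		/* dword 3: pasid[15:0] */
	entry[4] = 0xdeadbeef;		/* dword 4: context id 0 */

	printf("client 0x%x source %u ring %u vmid %u pasid 0x%x ctx0 0x%x\n",
	       entry[0] & 0xff,		/* SOC15_CLIENT_ID_FROM_IH_ENTRY   */
	       (entry[0] >> 8) & 0xff,	/* SOC15_SOURCE_ID_FROM_IH_ENTRY   */
	       (entry[0] >> 16) & 0xff,	/* SOC15_RING_ID_FROM_IH_ENTRY     */
	       (entry[0] >> 24) & 0xf,	/* SOC15_VMID_FROM_IH_ENTRY        */
	       entry[3] & 0xffff,	/* SOC15_PASID_FROM_IH_ENTRY       */
	       entry[4]);		/* SOC15_CONTEXT_ID0_FROM_IH_ENTRY */
	return 0;
}

In the driver the same decoding is applied to the raw ih_ring_entry dwords handed to the KFD interrupt handlers, rather than to a local array as in this sketch.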