author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-05-06 01:02:30 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-05-06 01:02:30 +0000
commit     76cb841cb886eef6b3bee341a2266c76578724ad (patch)
tree       f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /drivers/gpu/drm/amd/amdkfd
parent     Initial commit. (diff)
download   linux-76cb841cb886eef6b3bee341a2266c76578724ad.tar.xz
           linux-76cb841cb886eef6b3bee341a2266c76578724ad.zip
Adding upstream version 4.19.249. (tag: upstream/4.19.249)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
55 files changed, 24022 insertions, 0 deletions
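
The largest conceptual piece in the diff below is the event interrupt path: cik_event_interrupt.c registers a struct kfd_event_interrupt_class whose interrupt_isr callback filters IH ring entries in interrupt context and whose interrupt_wq callback does the actual event signalling from a work queue. The following minimal sketch shows how a dispatcher might drive such a class, assuming the device_info->event_interrupt_class pointer this patch introduces; the helper name example_dispatch and the fixed buffer size are hypothetical, and the real dispatch code lives in kfd_interrupt.c within this patch.

/*
 * Illustrative sketch only (not part of the patch): the two-stage interrupt
 * handling pattern used by event_interrupt_class_cik.  interrupt_isr runs as
 * a cheap filter in interrupt context; entries it accepts are later handed
 * to interrupt_wq from a work queue.  example_dispatch() is hypothetical.
 */
static void example_dispatch(struct kfd_dev *dev, const uint32_t *ih_ring_entry)
{
	const struct kfd_event_interrupt_class *cls =
		dev->device_info->event_interrupt_class;
	uint32_t patched_ihre[4];	/* CIK IH entries are 4 dwords */
	bool patched = false;

	/* Stage 1: decide in the top half whether this entry concerns KFD. */
	if (!cls->interrupt_isr(dev, ih_ring_entry, patched_ihre, &patched))
		return;

	/* Stage 2: process the (possibly patched) entry off the fast path. */
	cls->interrupt_wq(dev, patched ? patched_ihre : ih_ring_entry);
}

In the actual driver the accepted entry is copied into a per-device ring and the work-queue stage runs asynchronously, but the ISR/WQ split shown above is the contract the CIK handlers in this patch implement.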
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig new file mode 100644 index 000000000..3858820a0 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/Kconfig @@ -0,0 +1,11 @@ +# +# Heterogenous system architecture configuration +# + +config HSA_AMD + tristate "HSA kernel driver for AMD GPU devices" + depends on DRM_AMDGPU && X86_64 + imply AMD_IOMMU_V2 + select MMU_NOTIFIER + help + Enable this if you want to use HSA features on AMD GPU devices. diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile new file mode 100644 index 000000000..ffd096fff --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -0,0 +1,48 @@ +# +# Copyright 2017 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# +# Makefile for Heterogenous System Architecture support for AMD GPU devices +# + +ccflags-y := -Idrivers/gpu/drm/amd/include/ \ + -Idrivers/gpu/drm/amd/include/asic_reg + +amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ + kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ + kfd_process.o kfd_queue.o kfd_mqd_manager.o \ + kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ + kfd_mqd_manager_v9.o \ + kfd_kernel_queue.o kfd_kernel_queue_cik.o \ + kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \ + kfd_packet_manager.o kfd_process_queue_manager.o \ + kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \ + kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \ + kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ + kfd_int_process_v9.o kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o + +ifneq ($(CONFIG_AMD_IOMMU_V2),) +amdkfd-y += kfd_iommu.o +endif + +amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o + +obj-$(CONFIG_HSA_AMD) += amdkfd.o diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c new file mode 100644 index 000000000..5d2475d53 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -0,0 +1,124 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "kfd_priv.h" +#include "kfd_events.h" +#include "cik_int.h" + +static bool cik_event_interrupt_isr(struct kfd_dev *dev, + const uint32_t *ih_ring_entry, + uint32_t *patched_ihre, + bool *patched_flag) +{ + const struct cik_ih_ring_entry *ihre = + (const struct cik_ih_ring_entry *)ih_ring_entry; + const struct kfd2kgd_calls *f2g = dev->kfd2kgd; + unsigned int vmid, pasid; + + /* This workaround is due to HW/FW limitation on Hawaii that + * VMID and PASID are not written into ih_ring_entry + */ + if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || + ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && + dev->device_info->asic_family == CHIP_HAWAII) { + struct cik_ih_ring_entry *tmp_ihre = + (struct cik_ih_ring_entry *)patched_ihre; + + *patched_flag = true; + *tmp_ihre = *ihre; + + vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd); + pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid); + + tmp_ihre->ring_id &= 0x000000ff; + tmp_ihre->ring_id |= vmid << 8; + tmp_ihre->ring_id |= pasid << 16; + + return (pasid != 0) && + vmid >= dev->vm_info.first_vmid_kfd && + vmid <= dev->vm_info.last_vmid_kfd; + } + + /* Only handle interrupts from KFD VMIDs */ + vmid = (ihre->ring_id & 0x0000ff00) >> 8; + if (vmid < dev->vm_info.first_vmid_kfd || + vmid > dev->vm_info.last_vmid_kfd) + return 0; + + /* If there is no valid PASID, it's likely a firmware bug */ + pasid = (ihre->ring_id & 0xffff0000) >> 16; + if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt")) + return 0; + + /* Interrupt types we care about: various signals and faults. + * They will be forwarded to a work queue (see below). 
+ */ + return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || + ihre->source_id == CIK_INTSRC_SDMA_TRAP || + ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || + ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE || + ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || + ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT; +} + +static void cik_event_interrupt_wq(struct kfd_dev *dev, + const uint32_t *ih_ring_entry) +{ + const struct cik_ih_ring_entry *ihre = + (const struct cik_ih_ring_entry *)ih_ring_entry; + uint32_t context_id = ihre->data & 0xfffffff; + unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8; + unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16; + + if (pasid == 0) + return; + + if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE) + kfd_signal_event_interrupt(pasid, context_id, 28); + else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP) + kfd_signal_event_interrupt(pasid, context_id, 28); + else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG) + kfd_signal_event_interrupt(pasid, context_id & 0xff, 8); + else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) + kfd_signal_hw_exception_event(pasid); + else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || + ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { + struct kfd_vm_fault_info info; + + kfd_process_vm_fault(dev->dqm, pasid); + + memset(&info, 0, sizeof(info)); + dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info); + if (!info.page_addr && !info.status) + return; + + if (info.vmid == vmid) + kfd_signal_vm_fault_event(dev, pasid, &info); + else + kfd_signal_vm_fault_event(dev, pasid, NULL); + } +} + +const struct kfd_event_interrupt_class event_interrupt_class_cik = { + .interrupt_isr = cik_event_interrupt_isr, + .interrupt_wq = cik_event_interrupt_wq, +}; diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h new file mode 100644 index 000000000..76f8677a7 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h @@ -0,0 +1,43 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef CIK_INT_H_INCLUDED +#define CIK_INT_H_INCLUDED + +#include <linux/types.h> + +struct cik_ih_ring_entry { + uint32_t source_id; + uint32_t data; + uint32_t ring_id; + uint32_t reserved; +}; + +#define CIK_INTSRC_CP_END_OF_PIPE 0xB5 +#define CIK_INTSRC_CP_BAD_OPCODE 0xB7 +#define CIK_INTSRC_SDMA_TRAP 0xE0 +#define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF +#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92 +#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93 + +#endif + diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h new file mode 100644 index 000000000..37ce6dd65 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h @@ -0,0 +1,73 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef CIK_REGS_H +#define CIK_REGS_H + +/* if PTR32, these are the bases for scratch and lds */ +#define PRIVATE_BASE(x) ((x) << 0) /* scratch */ +#define SHARED_BASE(x) ((x) << 16) /* LDS */ +#define PTR32 (1 << 0) +#define ALIGNMENT_MODE(x) ((x) << 2) +#define SH_MEM_ALIGNMENT_MODE_UNALIGNED 3 +#define DEFAULT_MTYPE(x) ((x) << 4) +#define APE1_MTYPE(x) ((x) << 7) + +/* valid for both DEFAULT_MTYPE and APE1_MTYPE */ +#define MTYPE_CACHED_NV 0 +#define MTYPE_CACHED 1 +#define MTYPE_NONCACHED 3 + +#define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) +#define PRELOAD_REQ (1 << 0) + +#define MQD_CONTROL_PRIV_STATE_EN (1U << 8) + +#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20) + +#define IB_ATC_EN (1U << 23) + +#define QUANTUM_EN 1U +#define QUANTUM_SCALE_1MS (1U << 4) +#define QUANTUM_DURATION(x) ((x) << 8) + +#define RPTR_BLOCK_SIZE(x) ((x) << 8) +#define MIN_AVAIL_SIZE(x) ((x) << 20) +#define DEFAULT_RPTR_BLOCK_SIZE RPTR_BLOCK_SIZE(5) +#define DEFAULT_MIN_AVAIL_SIZE MIN_AVAIL_SIZE(3) + +#define PQ_ATC_EN (1 << 23) +#define NO_UPDATE_RPTR (1 << 27) + +#define DOORBELL_OFFSET(x) ((x) << 2) +#define DOORBELL_EN (1 << 30) + +#define PRIV_STATE (1 << 30) +#define KMD_QUEUE (1 << 31) + +#define AQL_ENABLE 1 + +#define GRBM_GFX_INDEX 0x30800 + +#define ATC_VMID_PASID_MAPPING_VALID (1U << 31) + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h new file mode 100644 index 000000000..3621efbd5 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -0,0 +1,568 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +static const uint32_t cwsr_trap_gfx8_hex[] = { + 0xbf820001, 0xbf82012b, + 0xb8f4f802, 0x89748674, + 0xb8f5f803, 0x8675ff75, + 0x00000400, 0xbf850017, + 0xc00a1e37, 0x00000000, + 0xbf8c007f, 0x87777978, + 0xbf840005, 0x8f728374, + 0xb972e0c2, 0xbf800002, + 0xb9740002, 0xbe801d78, + 0xb8f5f803, 0x8675ff75, + 0x000001ff, 0xbf850002, + 0x80708470, 0x82718071, + 0x8671ff71, 0x0000ffff, + 0x8f728374, 0xb972e0c2, + 0xbf800002, 0xb9740002, + 0xbe801f70, 0xb8f5f803, + 0x8675ff75, 0x00000100, + 0xbf840006, 0xbefa0080, + 0xb97a0203, 0x8671ff71, + 0x0000ffff, 0x80f08870, + 0x82f18071, 0xbefa0080, + 0xb97a0283, 0xbef60068, + 0xbef70069, 0xb8fa1c07, + 0x8e7a9c7a, 0x87717a71, + 0xb8fa03c7, 0x8e7a9b7a, + 0x87717a71, 0xb8faf807, + 0x867aff7a, 0x00007fff, + 0xb97af807, 0xbef2007e, + 0xbef3007f, 0xbefe0180, + 0xbf900004, 0x877a8474, + 0xb97af802, 0xbf8e0002, + 0xbf88fffe, 0xbef8007e, + 0x8679ff7f, 0x0000ffff, + 0x8779ff79, 0x00040000, + 0xbefa0080, 0xbefb00ff, + 0x00807fac, 0x867aff7f, + 0x08000000, 0x8f7a837a, + 0x877b7a7b, 0x867aff7f, + 0x70000000, 0x8f7a817a, + 0x877b7a7b, 0xbeef007c, + 0xbeee0080, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x806e7a6e, + 0xbefa0084, 0xbefa00ff, + 0x01000000, 0xbefe007c, + 0xbefc006e, 0xc0611bfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611c3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611c7c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611cbc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611cfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611d3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xb8f5f803, + 0xbefe007c, 0xbefc006e, + 0xc0611d7c, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0611dbc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0611dfc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xb8eff801, 0xbefe007c, + 0xbefc006e, 0xc0611bfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611b3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611b7c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0x867aff7f, + 0x04000000, 0xbef30080, + 0x8773737a, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8f51605, 0x80758175, + 0x8e758475, 0x8e7a8275, + 0xbefa00ff, 0x01000000, + 0xbef60178, 0x80786e78, + 0x82798079, 0xbefc0080, + 0xbe802b00, 0xbe822b02, + 
0xbe842b04, 0xbe862b06, + 0xbe882b08, 0xbe8a2b0a, + 0xbe8c2b0c, 0xbe8e2b0e, + 0xc06b003c, 0x00000000, + 0xc06b013c, 0x00000010, + 0xc06b023c, 0x00000020, + 0xc06b033c, 0x00000030, + 0x8078c078, 0x82798079, + 0x807c907c, 0xbf0a757c, + 0xbf85ffeb, 0xbef80176, + 0xbeee0080, 0xbefe00c1, + 0xbeff00c1, 0xbefa00ff, + 0x01000000, 0xe0724000, + 0x6e1e0000, 0xe0724100, + 0x6e1e0100, 0xe0724200, + 0x6e1e0200, 0xe0724300, + 0x6e1e0300, 0xbefe00c1, + 0xbeff00c1, 0xb8f54306, + 0x8675c175, 0xbf84002c, + 0xbf8a0000, 0x867aff73, + 0x04000000, 0xbf840028, + 0x8e758675, 0x8e758275, + 0xbefa0075, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x806e7a6e, + 0x806eff6e, 0x00000080, + 0xbefa00ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0xd1060002, + 0x00011103, 0x7e0602ff, + 0x00000200, 0xbefc00ff, + 0x00010000, 0xbe80007b, + 0x867bff7b, 0xff7fffff, + 0x877bff7b, 0x00058000, + 0xd8ec0000, 0x00000002, + 0xbf8c007f, 0xe0765000, + 0x6e1e0002, 0x32040702, + 0xd0c9006a, 0x0000eb02, + 0xbf87fff7, 0xbefb0000, + 0xbeee00ff, 0x00000400, + 0xbefe00c1, 0xbeff00c1, + 0xb8f52a05, 0x80758175, + 0x8e758275, 0x8e7a8875, + 0xbefa00ff, 0x01000000, + 0xbefc0084, 0xbf0a757c, + 0xbf840015, 0xbf11017c, + 0x8075ff75, 0x00001000, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0xe0724000, 0x6e1e0000, + 0xe0724100, 0x6e1e0100, + 0xe0724200, 0x6e1e0200, + 0xe0724300, 0x6e1e0300, + 0x807c847c, 0x806eff6e, + 0x00000400, 0xbf0a757c, + 0xbf85ffef, 0xbf9c0000, + 0xbf8200cd, 0xbef8007e, + 0x8679ff7f, 0x0000ffff, + 0x8779ff79, 0x00040000, + 0xbefa0080, 0xbefb00ff, + 0x00807fac, 0x8676ff7f, + 0x08000000, 0x8f768376, + 0x877b767b, 0x8676ff7f, + 0x70000000, 0x8f768176, + 0x877b767b, 0x8676ff7f, + 0x04000000, 0xbf84001e, + 0xbefe00c1, 0xbeff00c1, + 0xb8f34306, 0x8673c173, + 0xbf840019, 0x8e738673, + 0x8e738273, 0xbefa0073, + 0xb8f22a05, 0x80728172, + 0x8e728a72, 0xb8f61605, + 0x80768176, 0x8e768676, + 0x80727672, 0x8072ff72, + 0x00000080, 0xbefa00ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x721e0000, + 0xe0510100, 0x721e0000, + 0x807cff7c, 0x00000200, + 0x8072ff72, 0x00000200, + 0xbf0a737c, 0xbf85fff6, + 0xbef20080, 0xbefe00c1, + 0xbeff00c1, 0xb8f32a05, + 0x80738173, 0x8e738273, + 0x8e7a8873, 0xbefa00ff, + 0x01000000, 0xbef60072, + 0x8072ff72, 0x00000400, + 0xbefc0084, 0xbf11087c, + 0x8073ff73, 0x00008000, + 0xe0524000, 0x721e0000, + 0xe0524100, 0x721e0100, + 0xe0524200, 0x721e0200, + 0xe0524300, 0x721e0300, + 0xbf8c0f70, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0x807c847c, + 0x8072ff72, 0x00000400, + 0xbf0a737c, 0xbf85ffee, + 0xbf9c0000, 0xe0524000, + 0x761e0000, 0xe0524100, + 0x761e0100, 0xe0524200, + 0x761e0200, 0xe0524300, + 0x761e0300, 0xb8f22a05, + 0x80728172, 0x8e728a72, + 0xb8f61605, 0x80768176, + 0x8e768676, 0x80727672, + 0x80f2c072, 0xb8f31605, + 0x80738173, 0x8e738473, + 0x8e7a8273, 0xbefa00ff, + 0x01000000, 0xbefc0073, + 0xc031003c, 0x00000072, + 0x80f2c072, 0xbf8c007f, + 0x80fc907c, 0xbe802d00, + 0xbe822d02, 0xbe842d04, + 0xbe862d06, 0xbe882d08, + 0xbe8a2d0a, 0xbe8c2d0c, + 0xbe8e2d0e, 0xbf06807c, + 0xbf84fff1, 0xb8f22a05, + 0x80728172, 0x8e728a72, + 0xb8f61605, 0x80768176, + 0x8e768676, 0x80727672, + 0xbefa0084, 0xbefa00ff, + 0x01000000, 0xc0211cfc, + 0x00000072, 0x80728472, + 0xc0211c3c, 0x00000072, + 0x80728472, 0xc0211c7c, + 0x00000072, 0x80728472, + 0xc0211bbc, 0x00000072, + 0x80728472, 0xc0211bfc, + 0x00000072, 0x80728472, + 0xc0211d3c, 0x00000072, + 0x80728472, 0xc0211d7c, + 0x00000072, 0x80728472, + 0xc0211a3c, 0x00000072, + 0x80728472, 
0xc0211a7c, + 0x00000072, 0x80728472, + 0xc0211dfc, 0x00000072, + 0x80728472, 0xc0211b3c, + 0x00000072, 0x80728472, + 0xc0211b7c, 0x00000072, + 0x80728472, 0xbf8c007f, + 0xbefc0073, 0xbefe006e, + 0xbeff006f, 0x867375ff, + 0x000003ff, 0xb9734803, + 0x867375ff, 0xfffff800, + 0x8f738b73, 0xb973a2c3, + 0xb977f801, 0x8673ff71, + 0xf0000000, 0x8f739c73, + 0x8e739073, 0xbef60080, + 0x87767376, 0x8673ff71, + 0x08000000, 0x8f739b73, + 0x8e738f73, 0x87767376, + 0x8673ff74, 0x00800000, + 0x8f739773, 0xb976f807, + 0x8671ff71, 0x0000ffff, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f768374, 0xb976e0c2, + 0xbf800002, 0xb9740002, + 0xbf8a0000, 0x95807370, + 0xbf810000, 0x00000000, +}; + + +static const uint32_t cwsr_trap_gfx9_hex[] = { + 0xbf820001, 0xbf82015d, + 0xb8f8f802, 0x89788678, + 0xb8f1f803, 0x866eff71, + 0x00000400, 0xbf850037, + 0x866eff71, 0x00000800, + 0xbf850003, 0x866eff71, + 0x00000100, 0xbf840008, + 0x866eff78, 0x00002000, + 0xbf840001, 0xbf810000, + 0x8778ff78, 0x00002000, + 0x80ec886c, 0x82ed806d, + 0xb8eef807, 0x866fff6e, + 0x001f8000, 0x8e6f8b6f, + 0x8977ff77, 0xfc000000, + 0x87776f77, 0x896eff6e, + 0x001f8000, 0xb96ef807, + 0xb8f0f812, 0xb8f1f813, + 0x8ef08870, 0xc0071bb8, + 0x00000000, 0xbf8cc07f, + 0xc0071c38, 0x00000008, + 0xbf8cc07f, 0x86ee6e6e, + 0xbf840001, 0xbe801d6e, + 0xb8f1f803, 0x8671ff71, + 0x000001ff, 0xbf850002, + 0x806c846c, 0x826d806d, + 0x866dff6d, 0x0000ffff, + 0x8f6e8b77, 0x866eff6e, + 0x001f8000, 0xb96ef807, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f6e8378, 0xb96ee0c2, + 0xbf800002, 0xb9780002, + 0xbe801f6c, 0x866dff6d, + 0x0000ffff, 0xbef00080, + 0xb9700283, 0xb8f02407, + 0x8e709c70, 0x876d706d, + 0xb8f003c7, 0x8e709b70, + 0x876d706d, 0xb8f0f807, + 0x8670ff70, 0x00007fff, + 0xb970f807, 0xbeee007e, + 0xbeef007f, 0xbefe0180, + 0xbf900004, 0x87708478, + 0xb970f802, 0xbf8e0002, + 0xbf88fffe, 0xb8f02a05, + 0x80708170, 0x8e708a70, + 0xb8f11605, 0x80718171, + 0x8e718671, 0x80707170, + 0x80707e70, 0x8271807f, + 0x8671ff71, 0x0000ffff, + 0xc0471cb8, 0x00000040, + 0xbf8cc07f, 0xc04b1d38, + 0x00000048, 0xbf8cc07f, + 0xc0431e78, 0x00000058, + 0xbf8cc07f, 0xc0471eb8, + 0x0000005c, 0xbf8cc07f, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0x8670ff7f, 0x08000000, + 0x8f708370, 0x87777077, + 0x8670ff7f, 0x70000000, + 0x8f708170, 0x87777077, + 0xbefb007c, 0xbefa0080, + 0xb8fa2a05, 0x807a817a, + 0x8e7a8a7a, 0xb8f01605, + 0x80708170, 0x8e708670, + 0x807a707a, 0xbef60084, + 0xbef600ff, 0x01000000, + 0xbefe007c, 0xbefc007a, + 0xc0611efa, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611b3a, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611b7a, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611bba, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611bfa, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611e3a, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xb8f1f803, 0xbefe007c, + 0xbefc007a, 0xc0611c7a, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611a3a, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611a7a, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xb8fbf801, 0xbefe007c, + 0xbefc007a, 0xc0611efa, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0x8670ff7f, 0x04000000, + 0xbeef0080, 0x876f6f70, + 0xb8fa2a05, 0x807a817a, + 0x8e7a8a7a, 0xb8f11605, + 0x80718171, 0x8e718471, + 
0x8e768271, 0xbef600ff, + 0x01000000, 0xbef20174, + 0x80747a74, 0x82758075, + 0xbefc0080, 0xbf800000, + 0xbe802b00, 0xbe822b02, + 0xbe842b04, 0xbe862b06, + 0xbe882b08, 0xbe8a2b0a, + 0xbe8c2b0c, 0xbe8e2b0e, + 0xc06b003a, 0x00000000, + 0xbf8cc07f, 0xc06b013a, + 0x00000010, 0xbf8cc07f, + 0xc06b023a, 0x00000020, + 0xbf8cc07f, 0xc06b033a, + 0x00000030, 0xbf8cc07f, + 0x8074c074, 0x82758075, + 0x807c907c, 0xbf0a717c, + 0xbf85ffe7, 0xbef40172, + 0xbefa0080, 0xbefe00c1, + 0xbeff00c1, 0xbee80080, + 0xbee90080, 0xbef600ff, + 0x01000000, 0xe0724000, + 0x7a1d0000, 0xe0724100, + 0x7a1d0100, 0xe0724200, + 0x7a1d0200, 0xe0724300, + 0x7a1d0300, 0xbefe00c1, + 0xbeff00c1, 0xb8f14306, + 0x8671c171, 0xbf84002c, + 0xbf8a0000, 0x8670ff6f, + 0x04000000, 0xbf840028, + 0x8e718671, 0x8e718271, + 0xbef60071, 0xb8fa2a05, + 0x807a817a, 0x8e7a8a7a, + 0xb8f01605, 0x80708170, + 0x8e708670, 0x807a707a, + 0x807aff7a, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0xd1060002, + 0x00011103, 0x7e0602ff, + 0x00000200, 0xbefc00ff, + 0x00010000, 0xbe800077, + 0x8677ff77, 0xff7fffff, + 0x8777ff77, 0x00058000, + 0xd8ec0000, 0x00000002, + 0xbf8cc07f, 0xe0765000, + 0x7a1d0002, 0x68040702, + 0xd0c9006a, 0x0000e302, + 0xbf87fff7, 0xbef70000, + 0xbefa00ff, 0x00000400, + 0xbefe00c1, 0xbeff00c1, + 0xb8f12a05, 0x80718171, + 0x8e718271, 0x8e768871, + 0xbef600ff, 0x01000000, + 0xbefc0084, 0xbf0a717c, + 0xbf840015, 0xbf11017c, + 0x8071ff71, 0x00001000, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0xe0724000, 0x7a1d0000, + 0xe0724100, 0x7a1d0100, + 0xe0724200, 0x7a1d0200, + 0xe0724300, 0x7a1d0300, + 0x807c847c, 0x807aff7a, + 0x00000400, 0xbf0a717c, + 0xbf85ffef, 0xbf9c0000, + 0xbf8200dc, 0xbef4007e, + 0x8675ff7f, 0x0000ffff, + 0x8775ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x00807fac, 0x866eff7f, + 0x08000000, 0x8f6e836e, + 0x87776e77, 0x866eff7f, + 0x70000000, 0x8f6e816e, + 0x87776e77, 0x866eff7f, + 0x04000000, 0xbf84001e, + 0xbefe00c1, 0xbeff00c1, + 0xb8ef4306, 0x866fc16f, + 0xbf840019, 0x8e6f866f, + 0x8e6f826f, 0xbef6006f, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x781d0000, + 0xe0510100, 0x781d0000, + 0x807cff7c, 0x00000200, + 0x8078ff78, 0x00000200, + 0xbf0a6f7c, 0xbf85fff6, + 0xbef80080, 0xbefe00c1, + 0xbeff00c1, 0xb8ef2a05, + 0x806f816f, 0x8e6f826f, + 0x8e76886f, 0xbef600ff, + 0x01000000, 0xbeee0078, + 0x8078ff78, 0x00000400, + 0xbefc0084, 0xbf11087c, + 0x806fff6f, 0x00008000, + 0xe0524000, 0x781d0000, + 0xe0524100, 0x781d0100, + 0xe0524200, 0x781d0200, + 0xe0524300, 0x781d0300, + 0xbf8c0f70, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffee, + 0xbf9c0000, 0xe0524000, + 0x6e1d0000, 0xe0524100, + 0x6e1d0100, 0xe0524200, + 0x6e1d0200, 0xe0524300, + 0x6e1d0300, 0xb8f82a05, + 0x80788178, 0x8e788a78, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0x80f8c078, 0xb8ef1605, + 0x806f816f, 0x8e6f846f, + 0x8e76826f, 0xbef600ff, + 0x01000000, 0xbefc006f, + 0xc031003a, 0x00000078, + 0x80f8c078, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe802d00, 0xbe822d02, + 0xbe842d04, 0xbe862d06, + 0xbe882d08, 0xbe8a2d0a, + 0xbe8c2d0c, 0xbe8e2d0e, + 0xbf06807c, 0xbf84fff0, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0xbef60084, + 0xbef600ff, 0x01000000, + 0xc0211bfa, 0x00000078, + 0x80788478, 0xc0211b3a, + 0x00000078, 0x80788478, + 0xc0211b7a, 
0x00000078, + 0x80788478, 0xc0211eba, + 0x00000078, 0x80788478, + 0xc0211efa, 0x00000078, + 0x80788478, 0xc0211c3a, + 0x00000078, 0x80788478, + 0xc0211c7a, 0x00000078, + 0x80788478, 0xc0211a3a, + 0x00000078, 0x80788478, + 0xc0211a7a, 0x00000078, + 0x80788478, 0xc0211cfa, + 0x00000078, 0x80788478, + 0xbf8cc07f, 0xbefc006f, + 0xbefe007a, 0xbeff007b, + 0x866f71ff, 0x000003ff, + 0xb96f4803, 0x866f71ff, + 0xfffff800, 0x8f6f8b6f, + 0xb96fa2c3, 0xb973f801, + 0xb8ee2a05, 0x806e816e, + 0x8e6e8a6e, 0xb8ef1605, + 0x806f816f, 0x8e6f866f, + 0x806e6f6e, 0x806e746e, + 0x826f8075, 0x866fff6f, + 0x0000ffff, 0xc0071cb7, + 0x00000040, 0xc00b1d37, + 0x00000048, 0xc0031e77, + 0x00000058, 0xc0071eb7, + 0x0000005c, 0xbf8cc07f, + 0x866fff6d, 0xf0000000, + 0x8f6f9c6f, 0x8e6f906f, + 0xbeee0080, 0x876e6f6e, + 0x866fff6d, 0x08000000, + 0x8f6f9b6f, 0x8e6f8f6f, + 0x876e6f6e, 0x866fff70, + 0x00800000, 0x8f6f976f, + 0xb96ef807, 0x866dff6d, + 0x0000ffff, 0x86fe7e7e, + 0x86ea6a6a, 0x8f6e8370, + 0xb96ee0c2, 0xbf800002, + 0xb9700002, 0xbf8a0000, + 0x95806f6c, 0xbf810000, +}; diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm new file mode 100644 index 000000000..abe1a5da2 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm @@ -0,0 +1,1148 @@ +/* + * Copyright 2015-2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* To compile this assembly code: + * PROJECT=vi ./sp3 cwsr_trap_handler_gfx8.asm -hex tmp.hex + */ + +/* HW (VI) source code for CWSR trap handler */ +/* Version 18 + multiple trap handler */ + +// this performance-optimal version was originally from Seven Xu at SRDC + +// Revison #18 --... +/* Rev History +** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) +** #4. SR Memory Layout: +** 1. VGPR-SGPR-HWREG-{LDS} +** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. +** #5. Update: 1. Accurate g8sr_ts_save_d timestamp +** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) +** #7. Update: 1. don't barrier if noLDS +** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version +** 2. Fix SQ issue by s_sleep 2 +** #9. Update: 1. 
Fix scc restore failed issue, restore wave_status at last +** 2. optimize s_buffer save by burst 16sgprs... +** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. +** #11. Update 1. Add 2 more timestamp for debug version +** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance +** #13. Integ 1. Always use MUBUF for PV trap shader... +** #14. Update 1. s_buffer_store soft clause... +** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. +** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree +** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] +** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... +** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 +** 2. FUNC - Handle non-CWSR traps +*/ + +var G8SR_WDMEM_HWREG_OFFSET = 0 +var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes + +// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. + +var G8SR_DEBUG_TIMESTAMP = 0 +var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset +var s_g8sr_ts_save_s = s[34:35] // save start +var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi +var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ +var s_g8sr_ts_save_d = s[40:41] // save end +var s_g8sr_ts_restore_s = s[42:43] // restore start +var s_g8sr_ts_restore_d = s[44:45] // restore end + +var G8SR_VGPR_SR_IN_DWX4 = 0 +var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes +var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 + + +/*************************************************************************/ +/* control on how to run the shader */ +/*************************************************************************/ +//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) +var EMU_RUN_HACK = 0 +var EMU_RUN_HACK_RESTORE_NORMAL = 0 +var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 +var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 +var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var SAVE_LDS = 1 +var WG_BASE_ADDR_LO = 0x9000a000 +var WG_BASE_ADDR_HI = 0x0 +var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem +var CTX_SAVE_CONTROL = 0x0 +var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL +var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) +var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write +var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes +var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing + +/**************************************************************************/ +/* variables */ +/**************************************************************************/ +var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 
+var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 +var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1 +var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 +var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT = 0 +var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE = 1 +var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT = 3 +var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE = 29 + +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits + +var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 +var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask +var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 +var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 +var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 + +var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME +var SQ_WAVE_IB_STS_RCNT_SIZE = 4 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME +var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME + +var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 +var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 + + +/* Save */ +var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes +var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE + +var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_SAVE_SPI_INIT_ATC_SHIFT = 27 +var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 +var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used +var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME + +var s_save_spi_init_lo = exec_lo +var s_save_spi_init_hi = exec_hi + + //tba_lo and tba_hi need to be saved/restored +var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} +var s_save_pc_hi = ttmp1 +var s_save_exec_lo = ttmp2 +var s_save_exec_hi = ttmp3 +var s_save_status = ttmp4 +var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine +var s_save_xnack_mask_lo = ttmp6 +var s_save_xnack_mask_hi = ttmp7 +var s_save_buf_rsrc0 = ttmp8 +var s_save_buf_rsrc1 = ttmp9 +var s_save_buf_rsrc2 = ttmp10 +var s_save_buf_rsrc3 = ttmp11 + +var s_save_mem_offset = tma_lo +var s_save_alloc_size = s_save_trapsts //conflict +var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) +var s_save_m0 = tma_hi + +/* Restore */ +var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE +var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC + +var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 +var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 +var 
S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT +var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK +var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT +var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK + +var s_restore_spi_init_lo = exec_lo +var s_restore_spi_init_hi = exec_hi + +var s_restore_mem_offset = ttmp2 +var s_restore_alloc_size = ttmp3 +var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored +var s_restore_mem_offset_save = s_restore_tmp //no conflict + +var s_restore_m0 = s_restore_alloc_size //no conflict + +var s_restore_mode = ttmp7 + +var s_restore_pc_lo = ttmp0 +var s_restore_pc_hi = ttmp1 +var s_restore_exec_lo = tma_lo //no conflict +var s_restore_exec_hi = tma_hi //no conflict +var s_restore_status = ttmp4 +var s_restore_trapsts = ttmp5 +var s_restore_xnack_mask_lo = xnack_mask_lo +var s_restore_xnack_mask_hi = xnack_mask_hi +var s_restore_buf_rsrc0 = ttmp8 +var s_restore_buf_rsrc1 = ttmp9 +var s_restore_buf_rsrc2 = ttmp10 +var s_restore_buf_rsrc3 = ttmp11 + +/**************************************************************************/ +/* trap handler entry points */ +/**************************************************************************/ +/* Shader Main*/ + +shader main + asic(VI) + type(CS) + + + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore + //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC + s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. + s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE + //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE + s_branch L_SKIP_RESTORE //NOT restore, SAVE actually + else + s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save + end + +L_JUMP_TO_RESTORE: + s_branch L_RESTORE //restore + +L_SKIP_RESTORE: + + s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save + s_cbranch_scc1 L_SAVE //this is the operation for save + + // ********* Handle non-CWSR traps ******************* +if (!EMU_RUN_HACK) + /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ + s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0 + s_waitcnt lgkmcnt(0) + s_or_b32 ttmp7, ttmp8, ttmp9 + s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set + set_status_without_spi_prio(s_save_status, ttmp2) //restore HW status(SCC) + s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler + +L_NO_NEXT_TRAP: + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception + s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. 
+ s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 + s_addc_u32 ttmp1, ttmp1, 0 +L_EXCP_CASE: + s_and_b32 ttmp1, ttmp1, 0xFFFF + set_status_without_spi_prio(s_save_status, ttmp2) //restore HW status(SCC) + s_rfe_b64 [ttmp0, ttmp1] +end + // ********* End handling of non-CWSR traps ******************* + +/**************************************************************************/ +/* save routine */ +/**************************************************************************/ + +L_SAVE: + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_s + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? +end + + //check whether there is mem_viol + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK + s_cbranch_scc0 L_NO_PC_REWIND + + //if so, need rewind PC assuming GDS operation gets NACKed + s_mov_b32 s_save_tmp, 0 //clear mem_viol bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 + s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc + +L_NO_PC_REWIND: + s_mov_b32 s_save_tmp, 0 //clear saveCtx bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit + + s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK + s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG + + s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp + + /* inform SPI the readiness and wait for SPI's go signal */ + s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI + s_mov_b32 s_save_exec_hi, exec_hi + s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_sq_save_msg + s_waitcnt lgkmcnt(0) +end + + if (EMU_RUN_HACK) + + else + s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC + end + + // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for. 
+ s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp + + L_SLEEP: + s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 + + if (EMU_RUN_HACK) + + else + s_cbranch_execz L_SLEEP + end + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_spi_wrexec + s_waitcnt lgkmcnt(0) +end + + /* setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_save_tmp, v9, 0 + s_lshr_b32 s_save_tmp, s_save_tmp, 6 + s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + + + s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo + s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited + s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE + + //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) + s_mov_b32 s_save_m0, m0 //save M0 + + /* global mem offset */ + s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 + + + + + /* save HW registers */ + ////////////////////////////// + + L_SAVE_HWREG: + // HWREG SR memory offset : size(VGPR)+size(SGPR) + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + + + s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 + + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO + s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI + end + + write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC + write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC + write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS + + //s_save_trapsts conflicts with s_save_alloc_size + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS + + write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO + write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI + + //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 + s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO + write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI + + + + /* the first wave in the threadgroup */ + // save fist_wave bits in tba_hi unused bit.26 + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit + //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26] + s_mov_b32 s_save_exec_hi, 0x0 + s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] + + + /* save SGPRs */ + // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... + ////////////////////////////// + + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + // TODO, change RSRC word to rearrange memory layout for SGPRS + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) + end + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 + //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 + s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 + s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset + s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 + + s_mov_b32 m0, 0x0 //SGPR initial index value =0 + L_SAVE_SGPR_LOOP: + // SGPR is allocated in 16 SGPR granularity + s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] + s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] + s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] + s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] + s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] + s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] + s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] + s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] + + write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 + s_add_u32 m0, m0, 16 //next sgpr index + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? + // restore s_save_buf_rsrc0,1 + //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo + s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo + + + + + /* save first 4 VGPR, then LDS save could use */ + // each wave will alloc 4 vgprs at least... + ///////////////////////////////////////////////////////////////////////////////////// + + s_mov_b32 s_save_mem_offset, 0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 +end + + + + /* save LDS */ + ////////////////////////////// + + L_SAVE_LDS: + + // Change EXEC to all threads... + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE + + s_barrier //LDS is used? 
wait for other waves in the same TG + //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here + s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here + s_cbranch_scc0 L_SAVE_LDS_DONE + + // first wave do LDS save; + + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() + + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + +var LDS_DMA_ENABLE = 0 +var UNROLL = 0 +if UNROLL==0 && LDS_DMA_ENABLE==1 + s_mov_b32 s3, 256*2 + s_nop 0 + s_nop 0 + s_nop 0 + L_SAVE_LDS_LOOP: + //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? + if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + end + + s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? + +elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss + // store from higest LDS address to lowest + s_mov_b32 s3, 256*2 + s_sub_u32 m0, s_save_alloc_size, s3 + s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 + s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... + s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest + s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction + s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc + s_nop 0 + s_nop 0 + s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes + s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved + s_add_u32 s0, s0,s_save_alloc_size + s_addc_u32 s1, s1, 0 + s_setpc_b64 s[0:1] + + + for var i =0; i< 128; i++ + // be careful to make here a 64Byte aligned address, which could improve performance... + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + + if i!=127 + s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. 
pack more LDS_DMA inst to one Cacheline + s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 + end + end + +else // BUFFER_STORE + v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 + v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid + v_mul_i32_i24 v2, v3, 8 // tid*8 + v_mov_b32 v3, 256*2 + s_mov_b32 m0, 0x10000 + s_mov_b32 s0, s_save_buf_rsrc3 + s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT + +L_SAVE_LDS_LOOP_VECTOR: + ds_read_b64 v[0:1], v2 //x =LDS[a], byte address + s_waitcnt lgkmcnt(0) + buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 +// s_waitcnt vmcnt(0) + v_add_u32 v2, vcc[0:1], v2, v3 + v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size + s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR + + // restore rsrc3 + s_mov_b32 s_save_buf_rsrc3, s0 + +end + +L_SAVE_LDS_DONE: + + + /* save VGPRs - set the Rest VGPRs */ + ////////////////////////////////////////////////////////////////////////////////////// + L_SAVE_VGPR: + // VGPR SR memory offset: 0 + // TODO rearrange the RSRC words to use swizzle for VGPR save... + + s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, 4 // skip first 4 VGPRs + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs + + s_set_gpr_idx_on m0, 0x1 // This will change M0 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 +L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 // v0 = v[0+m0] + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + s_add_u32 m0, m0, 4 + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? 
+ s_set_gpr_idx_off +L_SAVE_VGPR_LOOP_END: + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + // VGPR store using dw burst + s_mov_b32 m0, 0x4 //VGPR initial index value =0 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_END + + + s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later + + L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 //v0 = v[0+m0] + v_mov_b32 v1, v1 //v0 = v[0+m0] + v_mov_b32 v2, v2 //v0 = v[0+m0] + v_mov_b32 v3, v3 //v0 = v[0+m0] + + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + end + + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? + s_set_gpr_idx_off +end + +L_SAVE_VGPR_END: + + + + + + + /* S_PGM_END_SAVED */ //FIXME graphics ONLY + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + s_rfe_b64 s_save_pc_lo //Return to the main shader program + else + end + +// Save Done timestamp +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_d + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? + // Need reset rsrc2?? + s_mov_b32 m0, s_save_mem_offset + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 +end + + + s_branch L_END_PGM + + + +/**************************************************************************/ +/* restore routine */ +/**************************************************************************/ + +L_RESTORE: + /* Setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_restore_tmp, v9, 0 + s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 + s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE + s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL + else + end + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_s + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? + // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... + s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] + s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. 
+end + + + + s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo + s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) + s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE + + /* global mem offset */ +// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 + + /* the first wave in the threadgroup */ + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK + s_cbranch_scc0 L_RESTORE_VGPR + + /* restore LDS */ + ////////////////////////////// + L_RESTORE_LDS: + + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? + + + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + L_RESTORE_LDS_LOOP: + if (SAVE_LDS) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW + end + s_add_u32 m0, m0, 256*2 // 128 DW + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? 
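+	// Note (added for readability): once the LDS copy above finishes, the first
+	// wave simply falls through into the VGPR restore below; non-first waves
+	// already branched to L_RESTORE_VGPR before this LDS block.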
+ + + /* restore VGPRs */ + ////////////////////////////// + L_RESTORE_VGPR: + // VGPR SR memory offset : 0 + s_mov_b32 s_restore_mem_offset, 0x0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + +if G8SR_VGPR_SR_IN_DWX4 + get_vgpr_size_bytes(s_restore_mem_offset) + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, s_restore_alloc_size + s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0 + +L_RESTORE_VGPR_LOOP: + buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + s_waitcnt vmcnt(0) + s_sub_u32 m0, m0, 4 + v_mov_b32 v0, v0 // v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_cmp_eq_u32 m0, 0x8000 + s_cbranch_scc0 L_RESTORE_VGPR_LOOP + s_set_gpr_idx_off + + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes + +else + // VGPR load using dw burst + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_mov_b32 m0, 4 //VGPR initial index value = 1 + s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later + + L_RESTORE_VGPR_LOOP: + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 + end + s_waitcnt vmcnt(0) //ensure data ready + v_mov_b32 v0, v0 //v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
+ s_set_gpr_idx_off + /* VGPR restore on v0 */ + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 + end + +end + + /* restore SGPRs */ + ////////////////////////////// + + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group + // TODO, change RSRC word to rearrange memory layout for SGPRS + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) + end + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + /* If 112 SGPRs ar allocated, 4 sgprs are not used TBA(108,109),TMA(110,111), + However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG + */ + s_mov_b32 m0, s_restore_alloc_size + + L_RESTORE_SGPR_LOOP: + read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made + s_waitcnt lgkmcnt(0) //ensure data ready + + s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] + + s_movreld_b64 s0, s0 //s[0+m0] = s0 + s_movreld_b64 s2, s2 + s_movreld_b64 s4, s4 + s_movreld_b64 s6, s6 + s_movreld_b64 s8, s8 + s_movreld_b64 s10, s10 + s_movreld_b64 s12, s12 + s_movreld_b64 s14, s14 + + s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? + + /* restore HW registers */ + ////////////////////////////// + L_RESTORE_HWREG: + + +if G8SR_DEBUG_TIMESTAMP + s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo + s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi +end + + // HWREG SR memory offset : size(VGPR)+size(SGPR) + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + + + s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 + read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC + read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC + read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS + read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS + read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO + read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI + read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE + read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO + read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI + + s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + + //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + + s_mov_b32 m0, s_restore_m0 + s_mov_b32 exec_lo, s_restore_exec_lo + s_mov_b32 exec_hi, s_restore_exec_hi + + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 + //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore + s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode + //reuse s_restore_m0 as a temp register + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT + s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT + s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp + + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect 
STATUS + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + set_status_without_spi_prio(s_restore_status, s_restore_tmp) // SCC is included, which is changed by previous salu + + s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_d + s_waitcnt lgkmcnt(0) +end + +// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution + s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc + + +/**************************************************************************/ +/* the END */ +/**************************************************************************/ +L_END_PGM: + s_endpgm + +end + + +/**************************************************************************/ +/* the helper functions */ +/**************************************************************************/ + +//Only for save hwreg to mem +function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) + s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on + s_mov_b32 m0, s_mem_offset + s_buffer_store_dword s, s_rsrc, m0 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 + s_mov_b32 m0, exec_lo +end + + +// HWREG are saved before SGPRs, so all HWREG could be use. +function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) + + s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 + s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 + s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 + s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 + s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 + s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc +end + + +function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 +end + +function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 + s_sub_u32 s_mem_offset, s_mem_offset, 4*16 +end + + + +function get_lds_size_bytes(s_lds_size_byte) + // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW + s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size + s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW +end + +function get_vgpr_size_bytes(s_vgpr_size_byte) + s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible +end + +function get_sgpr_size_bytes(s_sgpr_size_byte) + s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 + s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) +end + +function get_hwreg_size_bytes + return 128 //HWREG size 128 bytes +end + +function set_status_without_spi_prio(status, tmp) + // Do not restore STATUS.SPI_PRIO since scheduler may have raised it. 
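+	// The two s_setreg_b32 writes below update STATUS in two pieces so that the
+	// SPI_PRIO field (bits 2:1) is skipped: first the bits above SPI_PRIO, then
+	// bit 0 below it, with an s_nop in between to avoid the setreg hazard.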
+ s_lshr_b32 tmp, status, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT + s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE), tmp + s_nop 0x2 // avoid S_SETREG => S_SETREG hazard + s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE), status +end diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm new file mode 100644 index 000000000..0bb9c577b --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -0,0 +1,1226 @@ +/* + * Copyright 2016 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* To compile this assembly code: + * PROJECT=greenland ./sp3 cwsr_trap_handler_gfx9.asm -hex tmp.hex + */ + +/* HW (GFX9) source code for CWSR trap handler */ +/* Version 18 + multiple trap handler */ + +// this performance-optimal version was originally from Seven Xu at SRDC + +// Revison #18 --... +/* Rev History +** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) +** #4. SR Memory Layout: +** 1. VGPR-SGPR-HWREG-{LDS} +** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. +** #5. Update: 1. Accurate g8sr_ts_save_d timestamp +** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) +** #7. Update: 1. don't barrier if noLDS +** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version +** 2. Fix SQ issue by s_sleep 2 +** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last +** 2. optimize s_buffer save by burst 16sgprs... +** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. +** #11. Update 1. Add 2 more timestamp for debug version +** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance +** #13. Integ 1. Always use MUBUF for PV trap shader... +** #14. Update 1. s_buffer_store soft clause... +** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. +** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree +** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] +** 2. 
PERF - Save LDS before save VGPR to cover LDS save long latency... +** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 +** 2. FUNC - Handle non-CWSR traps +*/ + +var G8SR_WDMEM_HWREG_OFFSET = 0 +var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes + +// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. + +var G8SR_DEBUG_TIMESTAMP = 0 +var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset +var s_g8sr_ts_save_s = s[34:35] // save start +var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi +var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ +var s_g8sr_ts_save_d = s[40:41] // save end +var s_g8sr_ts_restore_s = s[42:43] // restore start +var s_g8sr_ts_restore_d = s[44:45] // restore end + +var G8SR_VGPR_SR_IN_DWX4 = 0 +var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes +var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 + + +/*************************************************************************/ +/* control on how to run the shader */ +/*************************************************************************/ +//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) +var EMU_RUN_HACK = 0 +var EMU_RUN_HACK_RESTORE_NORMAL = 0 +var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 +var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 +var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var SAVE_LDS = 1 +var WG_BASE_ADDR_LO = 0x9000a000 +var WG_BASE_ADDR_HI = 0x0 +var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem +var CTX_SAVE_CONTROL = 0x0 +var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL +var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) +var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write +var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes +var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing +var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency + +/**************************************************************************/ +/* variables */ +/**************************************************************************/ +var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 +var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 +var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1 +var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 +var SQ_WAVE_STATUS_HALT_MASK = 0x2000 +var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT = 0 +var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE = 1 +var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT = 3 +var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE = 29 + +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits + +var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 +var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask +var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 +var 
SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 +var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 +var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 + +var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME +var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x1F8000 +var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME + +var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 +var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 + +var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 26 // bits [31:26] unused by SPI debug data +var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0xFC000000 + +/* Save */ +var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes +var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE + +var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_SAVE_SPI_INIT_ATC_SHIFT = 27 +var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 +var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used +var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME + +var s_save_spi_init_lo = exec_lo +var s_save_spi_init_hi = exec_hi + +var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} +var s_save_pc_hi = ttmp1 +var s_save_exec_lo = ttmp2 +var s_save_exec_hi = ttmp3 +var s_save_tmp = ttmp4 +var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine +var s_save_xnack_mask_lo = ttmp6 +var s_save_xnack_mask_hi = ttmp7 +var s_save_buf_rsrc0 = ttmp8 +var s_save_buf_rsrc1 = ttmp9 +var s_save_buf_rsrc2 = ttmp10 +var s_save_buf_rsrc3 = ttmp11 +var s_save_status = ttmp12 +var s_save_mem_offset = ttmp14 +var s_save_alloc_size = s_save_trapsts //conflict +var s_save_m0 = ttmp15 +var s_save_ttmps_lo = s_save_tmp //no conflict +var s_save_ttmps_hi = s_save_trapsts //no conflict + +/* Restore */ +var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE +var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC + +var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 +var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 +var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT +var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK +var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT +var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK + +var s_restore_spi_init_lo = exec_lo +var s_restore_spi_init_hi = exec_hi + +var s_restore_mem_offset = ttmp12 +var s_restore_alloc_size = ttmp3 +var s_restore_tmp = ttmp2 +var s_restore_mem_offset_save = s_restore_tmp //no conflict + +var s_restore_m0 = s_restore_alloc_size //no conflict + +var s_restore_mode = ttmp7 + +var 
s_restore_pc_lo = ttmp0 +var s_restore_pc_hi = ttmp1 +var s_restore_exec_lo = ttmp14 +var s_restore_exec_hi = ttmp15 +var s_restore_status = ttmp4 +var s_restore_trapsts = ttmp5 +var s_restore_xnack_mask_lo = xnack_mask_lo +var s_restore_xnack_mask_hi = xnack_mask_hi +var s_restore_buf_rsrc0 = ttmp8 +var s_restore_buf_rsrc1 = ttmp9 +var s_restore_buf_rsrc2 = ttmp10 +var s_restore_buf_rsrc3 = ttmp11 +var s_restore_ttmps_lo = s_restore_tmp //no conflict +var s_restore_ttmps_hi = s_restore_alloc_size //no conflict + +/**************************************************************************/ +/* trap handler entry points */ +/**************************************************************************/ +/* Shader Main*/ + +shader main + asic(GFX9) + type(CS) + + + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore + //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC + s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. + s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE + //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE + s_branch L_SKIP_RESTORE //NOT restore, SAVE actually + else + s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save + end + +L_JUMP_TO_RESTORE: + s_branch L_RESTORE //restore + +L_SKIP_RESTORE: + + s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save + s_cbranch_scc1 L_SAVE //this is the operation for save + + // ********* Handle non-CWSR traps ******************* +if (!EMU_RUN_HACK) + // Illegal instruction is a non-maskable exception which blocks context save. + // Halt the wavefront and return from the trap. + s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK + s_cbranch_scc1 L_HALT_WAVE + + // If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA. + // Instead, halt the wavefront and return from the trap. + s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK + s_cbranch_scc0 L_FETCH_2ND_TRAP + +L_HALT_WAVE: + // If STATUS.HALT is set then this fault must come from SQC instruction fetch. + // We cannot prevent further faults so just terminate the wavefront. + s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK + s_cbranch_scc0 L_NOT_ALREADY_HALTED + s_endpgm +L_NOT_ALREADY_HALTED: + s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK + + // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set. + // Rewind the PC to prevent this from occurring. The debugger compensates for this. + s_sub_u32 ttmp0, ttmp0, 0x8 + s_subb_u32 ttmp1, ttmp1, 0x0 + +L_FETCH_2ND_TRAP: + // Preserve and clear scalar XNACK state before issuing scalar reads. + // Save IB_STS.FIRST_REPLAY[15] and IB_STS.RCNT[20:16] into unused space ttmp11[31:26]. 
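+	// These bits are stashed in ttmp11 and cleared here so the scalar loads from
+	// the TMA below start with clean replay state; they are moved back into
+	// IB_STS at L_EXCP_CASE before returning to the program.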
+ s_getreg_b32 ttmp2, hwreg(HW_REG_IB_STS) + s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) + s_andn2_b32 ttmp11, ttmp11, TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK + s_or_b32 ttmp11, ttmp11, ttmp3 + + s_andn2_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 + + // Read second-level TBA/TMA from first-level TMA and jump if available. + // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) + // ttmp12 holds SQ_WAVE_STATUS + s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO) + s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI) + s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 + s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA + s_waitcnt lgkmcnt(0) + s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA + s_waitcnt lgkmcnt(0) + s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] + s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set + s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler + +L_NO_NEXT_TRAP: + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception + s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. + s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 + s_addc_u32 ttmp1, ttmp1, 0 +L_EXCP_CASE: + s_and_b32 ttmp1, ttmp1, 0xFFFF + + // Restore SQ_WAVE_IB_STS. + s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 + + // Restore SQ_WAVE_STATUS. + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + set_status_without_spi_prio(s_save_status, ttmp2) + + s_rfe_b64 [ttmp0, ttmp1] +end + // ********* End handling of non-CWSR traps ******************* + +/**************************************************************************/ +/* save routine */ +/**************************************************************************/ + +L_SAVE: + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_s + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? 
+end + + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + + s_mov_b32 s_save_tmp, 0 //clear saveCtx bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit + + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG + + s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp + + /* inform SPI the readiness and wait for SPI's go signal */ + s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI + s_mov_b32 s_save_exec_hi, exec_hi + s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_sq_save_msg + s_waitcnt lgkmcnt(0) +end + + if (EMU_RUN_HACK) + + else + s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC + end + + // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for. + s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp + + L_SLEEP: + s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 + + if (EMU_RUN_HACK) + + else + s_cbranch_execz L_SLEEP + end + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_spi_wrexec + s_waitcnt lgkmcnt(0) +end + + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_save_tmp, v9, 0 + s_lshr_b32 s_save_tmp, s_save_tmp, 6 + s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + + // Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic + // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 + get_vgpr_size_bytes(s_save_ttmps_lo) + get_sgpr_size_bytes(s_save_ttmps_hi) + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo + s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0 + s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF + s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1 + ack_sqc_store_workaround() + s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1 + ack_sqc_store_workaround() + s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1 + ack_sqc_store_workaround() + s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1 + ack_sqc_store_workaround() + + 
/* setup Resource Contants */ + s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo + s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited + s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE + + //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) + s_mov_b32 s_save_m0, m0 //save M0 + + /* global mem offset */ + s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 + + + + + /* save HW registers */ + ////////////////////////////// + + L_SAVE_HWREG: + // HWREG SR memory offset : size(VGPR)+size(SGPR) + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + + + s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 + + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + end + + write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC + write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC + write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS + + //s_save_trapsts conflicts with s_save_alloc_size + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS + + write_hwreg_to_mem(xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO + write_hwreg_to_mem(xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI + + //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 + s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) + + + + /* the first wave in the threadgroup */ + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit + s_mov_b32 s_save_exec_hi, 0x0 + s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] + + + /* save SGPRs */ + // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... 
+ ////////////////////////////// + + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + // TODO, change RSRC word to rearrange memory layout for SGPRS + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) + end + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 + //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 + s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 + s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset + s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 + + s_mov_b32 m0, 0x0 //SGPR initial index value =0 + s_nop 0x0 //Manually inserted wait states + L_SAVE_SGPR_LOOP: + // SGPR is allocated in 16 SGPR granularity + s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] + s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] + s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] + s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] + s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] + s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] + s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] + s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] + + write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 + s_add_u32 m0, m0, 16 //next sgpr index + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? + // restore s_save_buf_rsrc0,1 + //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo + s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo + + + + + /* save first 4 VGPR, then LDS save could use */ + // each wave will alloc 4 vgprs at least... + ///////////////////////////////////////////////////////////////////////////////////// + + s_mov_b32 s_save_mem_offset, 0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + s_mov_b32 xnack_mask_lo, 0x0 + s_mov_b32 xnack_mask_hi, 0x0 + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 +end + + + + /* save LDS */ + ////////////////////////////// + + L_SAVE_LDS: + + // Change EXEC to all threads... + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE + + s_barrier //LDS is used? wait for other waves in the same TG + s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here + s_cbranch_scc0 L_SAVE_LDS_DONE + + // first wave do LDS save; + + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() + + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + +var LDS_DMA_ENABLE = 0 +var UNROLL = 0 +if UNROLL==0 && LDS_DMA_ENABLE==1 + s_mov_b32 s3, 256*2 + s_nop 0 + s_nop 0 + s_nop 0 + L_SAVE_LDS_LOOP: + //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? + if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + end + + s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? 
+ +elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss + // store from higest LDS address to lowest + s_mov_b32 s3, 256*2 + s_sub_u32 m0, s_save_alloc_size, s3 + s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 + s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... + s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest + s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction + s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc + s_nop 0 + s_nop 0 + s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes + s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved + s_add_u32 s0, s0,s_save_alloc_size + s_addc_u32 s1, s1, 0 + s_setpc_b64 s[0:1] + + + for var i =0; i< 128; i++ + // be careful to make here a 64Byte aligned address, which could improve performance... + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + + if i!=127 + s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline + s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 + end + end + +else // BUFFER_STORE + v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 + v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid + v_mul_i32_i24 v2, v3, 8 // tid*8 + v_mov_b32 v3, 256*2 + s_mov_b32 m0, 0x10000 + s_mov_b32 s0, s_save_buf_rsrc3 + s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT + +L_SAVE_LDS_LOOP_VECTOR: + ds_read_b64 v[0:1], v2 //x =LDS[a], byte address + s_waitcnt lgkmcnt(0) + buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 +// s_waitcnt vmcnt(0) +// v_add_u32 v2, vcc[0:1], v2, v3 + v_add_u32 v2, v2, v3 + v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size + s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR + + // restore rsrc3 + s_mov_b32 s_save_buf_rsrc3, s0 + +end + +L_SAVE_LDS_DONE: + + + /* save VGPRs - set the Rest VGPRs */ + ////////////////////////////////////////////////////////////////////////////////////// + L_SAVE_VGPR: + // VGPR SR memory offset: 0 + // TODO rearrange the RSRC words to use swizzle for VGPR save... + + s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, 4 // skip first 4 VGPRs + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs + + s_set_gpr_idx_on m0, 0x1 // This will change M0 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 +L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 // v0 = v[0+m0] + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + s_add_u32 m0, m0, 4 + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? + s_set_gpr_idx_off +L_SAVE_VGPR_LOOP_END: + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + // VGPR store using dw burst + s_mov_b32 m0, 0x4 //VGPR initial index value =0 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_END + + + s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later + + L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 //v0 = v[0+m0] + v_mov_b32 v1, v1 //v0 = v[0+m0] + v_mov_b32 v2, v2 //v0 = v[0+m0] + v_mov_b32 v3, v3 //v0 = v[0+m0] + + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + end + + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? + s_set_gpr_idx_off +end + +L_SAVE_VGPR_END: + + + + + + + /* S_PGM_END_SAVED */ //FIXME graphics ONLY + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + s_rfe_b64 s_save_pc_lo //Return to the main shader program + else + end + +// Save Done timestamp +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_d + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? + // Need reset rsrc2?? 
+ s_mov_b32 m0, s_save_mem_offset + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 +end + + + s_branch L_END_PGM + + + +/**************************************************************************/ +/* restore routine */ +/**************************************************************************/ + +L_RESTORE: + /* Setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_restore_tmp, v9, 0 + s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 + s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE + s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL + else + end + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_s + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? + // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... + s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] + s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. +end + + + + s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo + s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) + s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE + + /* global mem offset */ +// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 + + /* the first wave in the threadgroup */ + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK + s_cbranch_scc0 L_RESTORE_VGPR + + /* restore LDS */ + ////////////////////////////// + L_RESTORE_LDS: + + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_RESTORE_VGPR //no lds used? 
jump to L_RESTORE_VGPR + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? + + + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + L_RESTORE_LDS_LOOP: + if (SAVE_LDS) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW + end + s_add_u32 m0, m0, 256*2 // 128 DW + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? + + + /* restore VGPRs */ + ////////////////////////////// + L_RESTORE_VGPR: + // VGPR SR memory offset : 0 + s_mov_b32 s_restore_mem_offset, 0x0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + +if G8SR_VGPR_SR_IN_DWX4 + get_vgpr_size_bytes(s_restore_mem_offset) + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, s_restore_alloc_size + s_set_gpr_idx_on m0, 0x8 // Note.. 
This will change m0 + +L_RESTORE_VGPR_LOOP: + buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + s_waitcnt vmcnt(0) + s_sub_u32 m0, m0, 4 + v_mov_b32 v0, v0 // v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_cmp_eq_u32 m0, 0x8000 + s_cbranch_scc0 L_RESTORE_VGPR_LOOP + s_set_gpr_idx_off + + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes + +else + // VGPR load using dw burst + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_mov_b32 m0, 4 //VGPR initial index value = 1 + s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later + + L_RESTORE_VGPR_LOOP: + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 + end + s_waitcnt vmcnt(0) //ensure data ready + v_mov_b32 v0, v0 //v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
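+ // v0-v3 served as staging registers for the indexed moves above, so indexing
+ // is switched off and their own contents are reloaded last from s_restore_mem_offset_save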
+ s_set_gpr_idx_off + /* VGPR restore on v0 */ + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 + end + +end + + /* restore SGPRs */ + ////////////////////////////// + + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group + // TODO, change RSRC word to rearrange memory layout for SGPRS + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) + end + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 m0, s_restore_alloc_size + + L_RESTORE_SGPR_LOOP: + read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made + s_waitcnt lgkmcnt(0) //ensure data ready + + s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] + s_nop 0 // hazard SALU M0=> S_MOVREL + + s_movreld_b64 s0, s0 //s[0+m0] = s0 + s_movreld_b64 s2, s2 + s_movreld_b64 s4, s4 + s_movreld_b64 s6, s6 + s_movreld_b64 s8, s8 + s_movreld_b64 s10, s10 + s_movreld_b64 s12, s12 + s_movreld_b64 s14, s14 + + s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? + + /* restore HW registers */ + ////////////////////////////// + L_RESTORE_HWREG: + + +if G8SR_DEBUG_TIMESTAMP + s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo + s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi +end + + // HWREG SR memory offset : size(VGPR)+size(SGPR) + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + + + s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 + read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC + read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC + read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS + read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS + read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO + read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI + read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE + + s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + + //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + + s_mov_b32 m0, s_restore_m0 + s_mov_b32 exec_lo, s_restore_exec_lo + s_mov_b32 exec_hi, s_restore_exec_hi + + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 + //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore + s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode + + // Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic + // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 + get_vgpr_size_bytes(s_restore_ttmps_lo) + get_sgpr_size_bytes(s_restore_ttmps_hi) + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 + s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 + s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF + s_load_dwordx2 [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1 + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1 + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1 + s_load_dwordx2 [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1 + s_waitcnt lgkmcnt(0) + + //reuse s_restore_m0 as a temp register + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, 
SQ_WAVE_IB_STS_RCNT_SHIFT + s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT + s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp + + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + set_status_without_spi_prio(s_restore_status, s_restore_tmp) // SCC is included, which is changed by previous salu + + s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_d + s_waitcnt lgkmcnt(0) +end + +// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution + s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc + + +/**************************************************************************/ +/* the END */ +/**************************************************************************/ +L_END_PGM: + s_endpgm + +end + + +/**************************************************************************/ +/* the helper functions */ +/**************************************************************************/ + +//Only for save hwreg to mem +function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) + s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on + s_mov_b32 m0, s_mem_offset + s_buffer_store_dword s, s_rsrc, m0 glc:1 + ack_sqc_store_workaround() + s_add_u32 s_mem_offset, s_mem_offset, 4 + s_mov_b32 m0, exec_lo +end + + +// HWREG are saved before SGPRs, so all HWREG could be use. 
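+// In memory the save image is laid out as VGPRs | SGPRs | HWREGs | LDS;
+// the get_*_size_bytes helpers below give the size of each region.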
+function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) + + s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 + ack_sqc_store_workaround() + s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 + ack_sqc_store_workaround() + s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 + ack_sqc_store_workaround() + s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 + ack_sqc_store_workaround() + s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 + s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc +end + + +function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 +end + +function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 + s_sub_u32 s_mem_offset, s_mem_offset, 4*16 +end + + + +function get_lds_size_bytes(s_lds_size_byte) + // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW + s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size + s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW +end + +function get_vgpr_size_bytes(s_vgpr_size_byte) + s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible +end + +function get_sgpr_size_bytes(s_sgpr_size_byte) + s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 + s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) +end + +function get_hwreg_size_bytes + return 128 //HWREG size 128 bytes +end + +function ack_sqc_store_workaround + if ACK_SQC_STORE + s_waitcnt lgkmcnt(0) + end +end + +function set_status_without_spi_prio(status, tmp) + // Do not restore STATUS.SPI_PRIO since scheduler may have raised it. + s_lshr_b32 tmp, status, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT + s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE), tmp + s_nop 0x2 // avoid S_SETREG => S_SETREG hazard + s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE), status +end diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c new file mode 100644 index 000000000..297b36c26 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -0,0 +1,1743 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/device.h> +#include <linux/export.h> +#include <linux/err.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/compat.h> +#include <uapi/linux/kfd_ioctl.h> +#include <linux/time.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <asm/processor.h> +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" +#include "kfd_dbgmgr.h" + +static long kfd_ioctl(struct file *, unsigned int, unsigned long); +static int kfd_open(struct inode *, struct file *); +static int kfd_mmap(struct file *, struct vm_area_struct *); + +static const char kfd_dev_name[] = "kfd"; + +static const struct file_operations kfd_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = kfd_ioctl, + .compat_ioctl = kfd_ioctl, + .open = kfd_open, + .mmap = kfd_mmap, +}; + +static int kfd_char_dev_major = -1; +static struct class *kfd_class; +struct device *kfd_device; + +int kfd_chardev_init(void) +{ + int err = 0; + + kfd_char_dev_major = register_chrdev(0, kfd_dev_name, &kfd_fops); + err = kfd_char_dev_major; + if (err < 0) + goto err_register_chrdev; + + kfd_class = class_create(THIS_MODULE, kfd_dev_name); + err = PTR_ERR(kfd_class); + if (IS_ERR(kfd_class)) + goto err_class_create; + + kfd_device = device_create(kfd_class, NULL, + MKDEV(kfd_char_dev_major, 0), + NULL, kfd_dev_name); + err = PTR_ERR(kfd_device); + if (IS_ERR(kfd_device)) + goto err_device_create; + + return 0; + +err_device_create: + class_destroy(kfd_class); +err_class_create: + unregister_chrdev(kfd_char_dev_major, kfd_dev_name); +err_register_chrdev: + return err; +} + +void kfd_chardev_exit(void) +{ + device_destroy(kfd_class, MKDEV(kfd_char_dev_major, 0)); + class_destroy(kfd_class); + unregister_chrdev(kfd_char_dev_major, kfd_dev_name); +} + +struct device *kfd_chardev(void) +{ + return kfd_device; +} + + +static int kfd_open(struct inode *inode, struct file *filep) +{ + struct kfd_process *process; + bool is_32bit_user_mode; + + if (iminor(inode) != 0) + return -ENODEV; + + is_32bit_user_mode = in_compat_syscall(); + + if (is_32bit_user_mode) { + dev_warn(kfd_device, + "Process %d (32-bit) failed to open /dev/kfd\n" + "32-bit processes are not supported by amdkfd\n", + current->pid); + return -EPERM; + } + + process = kfd_create_process(filep); + if (IS_ERR(process)) + return PTR_ERR(process); + + if (kfd_is_locked()) + return -EAGAIN; + + dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n", + process->pasid, process->is_32bit_user_mode); + + return 0; +} + +static int kfd_ioctl_get_version(struct file *filep, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_get_version_args *args = data; + + args->major_version = KFD_IOCTL_MAJOR_VERSION; + args->minor_version = KFD_IOCTL_MINOR_VERSION; + + return 0; +} + +static int set_queue_properties_from_user(struct queue_properties *q_properties, + struct kfd_ioctl_create_queue_args *args) +{ + if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) { + pr_err("Queue 
percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); + return -EINVAL; + } + + if (args->queue_priority > KFD_MAX_QUEUE_PRIORITY) { + pr_err("Queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); + return -EINVAL; + } + + if ((args->ring_base_address) && + (!access_ok(VERIFY_WRITE, + (const void __user *) args->ring_base_address, + sizeof(uint64_t)))) { + pr_err("Can't access ring base address\n"); + return -EFAULT; + } + + if (!is_power_of_2(args->ring_size) && (args->ring_size != 0)) { + pr_err("Ring size must be a power of 2 or 0\n"); + return -EINVAL; + } + + if (!access_ok(VERIFY_WRITE, + (const void __user *) args->read_pointer_address, + sizeof(uint32_t))) { + pr_err("Can't access read pointer\n"); + return -EFAULT; + } + + if (!access_ok(VERIFY_WRITE, + (const void __user *) args->write_pointer_address, + sizeof(uint32_t))) { + pr_err("Can't access write pointer\n"); + return -EFAULT; + } + + if (args->eop_buffer_address && + !access_ok(VERIFY_WRITE, + (const void __user *) args->eop_buffer_address, + sizeof(uint32_t))) { + pr_debug("Can't access eop buffer"); + return -EFAULT; + } + + if (args->ctx_save_restore_address && + !access_ok(VERIFY_WRITE, + (const void __user *) args->ctx_save_restore_address, + sizeof(uint32_t))) { + pr_debug("Can't access ctx save restore buffer"); + return -EFAULT; + } + + q_properties->is_interop = false; + q_properties->queue_percent = args->queue_percentage; + q_properties->priority = args->queue_priority; + q_properties->queue_address = args->ring_base_address; + q_properties->queue_size = args->ring_size; + q_properties->read_ptr = (uint32_t *) args->read_pointer_address; + q_properties->write_ptr = (uint32_t *) args->write_pointer_address; + q_properties->eop_ring_buffer_address = args->eop_buffer_address; + q_properties->eop_ring_buffer_size = args->eop_buffer_size; + q_properties->ctx_save_restore_area_address = + args->ctx_save_restore_address; + q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size; + q_properties->ctl_stack_size = args->ctl_stack_size; + if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE || + args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) + q_properties->type = KFD_QUEUE_TYPE_COMPUTE; + else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA) + q_properties->type = KFD_QUEUE_TYPE_SDMA; + else + return -ENOTSUPP; + + if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) + q_properties->format = KFD_QUEUE_FORMAT_AQL; + else + q_properties->format = KFD_QUEUE_FORMAT_PM4; + + pr_debug("Queue Percentage: %d, %d\n", + q_properties->queue_percent, args->queue_percentage); + + pr_debug("Queue Priority: %d, %d\n", + q_properties->priority, args->queue_priority); + + pr_debug("Queue Address: 0x%llX, 0x%llX\n", + q_properties->queue_address, args->ring_base_address); + + pr_debug("Queue Size: 0x%llX, %u\n", + q_properties->queue_size, args->ring_size); + + pr_debug("Queue r/w Pointers: %px, %px\n", + q_properties->read_ptr, + q_properties->write_ptr); + + pr_debug("Queue Format: %d\n", q_properties->format); + + pr_debug("Queue EOP: 0x%llX\n", q_properties->eop_ring_buffer_address); + + pr_debug("Queue CTX save area: 0x%llX\n", + q_properties->ctx_save_restore_area_address); + + return 0; +} + +static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_create_queue_args *args = data; + struct kfd_dev *dev; + int err = 0; + unsigned int queue_id; + struct kfd_process_device *pdd; + struct queue_properties q_properties; + + 
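+ /* q_properties is on the stack; zero it so any field not filled in from
+  * the ioctl args below defaults to 0.
+  */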
memset(&q_properties, 0, sizeof(struct queue_properties)); + + pr_debug("Creating queue ioctl\n"); + + err = set_queue_properties_from_user(&q_properties, args); + if (err) + return err; + + pr_debug("Looking for gpu id 0x%x\n", args->gpu_id); + dev = kfd_device_by_id(args->gpu_id); + if (!dev) { + pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); + return -EINVAL; + } + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = -ESRCH; + goto err_bind_process; + } + + pr_debug("Creating queue for PASID %d on gpu 0x%x\n", + p->pasid, + dev->id); + + err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id); + if (err != 0) + goto err_create_queue; + + args->queue_id = queue_id; + + + /* Return gpu_id as doorbell offset for mmap usage */ + args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL; + args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id); + args->doorbell_offset <<= PAGE_SHIFT; + if (KFD_IS_SOC15(dev->device_info->asic_family)) + /* On SOC15 ASICs, doorbell allocation must be + * per-device, and independent from the per-process + * queue_id. Return the doorbell offset within the + * doorbell aperture to user mode. + */ + args->doorbell_offset |= q_properties.doorbell_off; + + mutex_unlock(&p->mutex); + + pr_debug("Queue id %d was created successfully\n", args->queue_id); + + pr_debug("Ring buffer address == 0x%016llX\n", + args->ring_base_address); + + pr_debug("Read ptr address == 0x%016llX\n", + args->read_pointer_address); + + pr_debug("Write ptr address == 0x%016llX\n", + args->write_pointer_address); + + return 0; + +err_create_queue: +err_bind_process: + mutex_unlock(&p->mutex); + return err; +} + +static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p, + void *data) +{ + int retval; + struct kfd_ioctl_destroy_queue_args *args = data; + + pr_debug("Destroying queue id %d for pasid %d\n", + args->queue_id, + p->pasid); + + mutex_lock(&p->mutex); + + retval = pqm_destroy_queue(&p->pqm, args->queue_id); + + mutex_unlock(&p->mutex); + return retval; +} + +static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, + void *data) +{ + int retval; + struct kfd_ioctl_update_queue_args *args = data; + struct queue_properties properties; + + if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) { + pr_err("Queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); + return -EINVAL; + } + + if (args->queue_priority > KFD_MAX_QUEUE_PRIORITY) { + pr_err("Queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); + return -EINVAL; + } + + if ((args->ring_base_address) && + (!access_ok(VERIFY_WRITE, + (const void __user *) args->ring_base_address, + sizeof(uint64_t)))) { + pr_err("Can't access ring base address\n"); + return -EFAULT; + } + + if (!is_power_of_2(args->ring_size) && (args->ring_size != 0)) { + pr_err("Ring size must be a power of 2 or 0\n"); + return -EINVAL; + } + + properties.queue_address = args->ring_base_address; + properties.queue_size = args->ring_size; + properties.queue_percent = args->queue_percentage; + properties.priority = args->queue_priority; + + pr_debug("Updating queue id %d for pasid %d\n", + args->queue_id, p->pasid); + + mutex_lock(&p->mutex); + + retval = pqm_update_queue(&p->pqm, args->queue_id, &properties); + + mutex_unlock(&p->mutex); + + return retval; +} + +static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p, + void *data) +{ + int retval; + const int max_num_cus = 1024; + struct kfd_ioctl_set_cu_mask_args *args = data; + 
struct queue_properties properties; + uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr; + size_t cu_mask_size = sizeof(uint32_t) * (args->num_cu_mask / 32); + + if ((args->num_cu_mask % 32) != 0) { + pr_debug("num_cu_mask 0x%x must be a multiple of 32", + args->num_cu_mask); + return -EINVAL; + } + + properties.cu_mask_count = args->num_cu_mask; + if (properties.cu_mask_count == 0) { + pr_debug("CU mask cannot be 0"); + return -EINVAL; + } + + /* To prevent an unreasonably large CU mask size, set an arbitrary + * limit of max_num_cus bits. We can then just drop any CU mask bits + * past max_num_cus bits and just use the first max_num_cus bits. + */ + if (properties.cu_mask_count > max_num_cus) { + pr_debug("CU mask cannot be greater than 1024 bits"); + properties.cu_mask_count = max_num_cus; + cu_mask_size = sizeof(uint32_t) * (max_num_cus/32); + } + + properties.cu_mask = kzalloc(cu_mask_size, GFP_KERNEL); + if (!properties.cu_mask) + return -ENOMEM; + + retval = copy_from_user(properties.cu_mask, cu_mask_ptr, cu_mask_size); + if (retval) { + pr_debug("Could not copy CU mask from userspace"); + kfree(properties.cu_mask); + return -EFAULT; + } + + mutex_lock(&p->mutex); + + retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &properties); + + mutex_unlock(&p->mutex); + + if (retval) + kfree(properties.cu_mask); + + return retval; +} + +static int kfd_ioctl_set_memory_policy(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_set_memory_policy_args *args = data; + struct kfd_dev *dev; + int err = 0; + struct kfd_process_device *pdd; + enum cache_policy default_policy, alternate_policy; + + if (args->default_policy != KFD_IOC_CACHE_POLICY_COHERENT + && args->default_policy != KFD_IOC_CACHE_POLICY_NONCOHERENT) { + return -EINVAL; + } + + if (args->alternate_policy != KFD_IOC_CACHE_POLICY_COHERENT + && args->alternate_policy != KFD_IOC_CACHE_POLICY_NONCOHERENT) { + return -EINVAL; + } + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = -ESRCH; + goto out; + } + + default_policy = (args->default_policy == KFD_IOC_CACHE_POLICY_COHERENT) + ? cache_policy_coherent : cache_policy_noncoherent; + + alternate_policy = + (args->alternate_policy == KFD_IOC_CACHE_POLICY_COHERENT) + ? 
cache_policy_coherent : cache_policy_noncoherent; + + if (!dev->dqm->ops.set_cache_memory_policy(dev->dqm, + &pdd->qpd, + default_policy, + alternate_policy, + (void __user *)args->alternate_aperture_base, + args->alternate_aperture_size)) + err = -EINVAL; + +out: + mutex_unlock(&p->mutex); + + return err; +} + +static int kfd_ioctl_set_trap_handler(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_set_trap_handler_args *args = data; + struct kfd_dev *dev; + int err = 0; + struct kfd_process_device *pdd; + + dev = kfd_device_by_id(args->gpu_id); + if (dev == NULL) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = -ESRCH; + goto out; + } + + if (dev->dqm->ops.set_trap_handler(dev->dqm, + &pdd->qpd, + args->tba_addr, + args->tma_addr)) + err = -EINVAL; + +out: + mutex_unlock(&p->mutex); + + return err; +} + +static int kfd_ioctl_dbg_register(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_dbg_register_args *args = data; + struct kfd_dev *dev; + struct kfd_dbgmgr *dbgmgr_ptr; + struct kfd_process_device *pdd; + bool create_ok; + long status = 0; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + if (dev->device_info->asic_family == CHIP_CARRIZO) { + pr_debug("kfd_ioctl_dbg_register not supported on CZ\n"); + return -EINVAL; + } + + mutex_lock(&p->mutex); + mutex_lock(kfd_get_dbgmgr_mutex()); + + /* + * make sure that we have pdd, if this the first queue created for + * this process + */ + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + status = PTR_ERR(pdd); + goto out; + } + + if (!dev->dbgmgr) { + /* In case of a legal call, we have no dbgmgr yet */ + create_ok = kfd_dbgmgr_create(&dbgmgr_ptr, dev); + if (create_ok) { + status = kfd_dbgmgr_register(dbgmgr_ptr, p); + if (status != 0) + kfd_dbgmgr_destroy(dbgmgr_ptr); + else + dev->dbgmgr = dbgmgr_ptr; + } + } else { + pr_debug("debugger already registered\n"); + status = -EINVAL; + } + +out: + mutex_unlock(kfd_get_dbgmgr_mutex()); + mutex_unlock(&p->mutex); + + return status; +} + +static int kfd_ioctl_dbg_unregister(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_dbg_unregister_args *args = data; + struct kfd_dev *dev; + long status; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev || !dev->dbgmgr) + return -EINVAL; + + if (dev->device_info->asic_family == CHIP_CARRIZO) { + pr_debug("kfd_ioctl_dbg_unregister not supported on CZ\n"); + return -EINVAL; + } + + mutex_lock(kfd_get_dbgmgr_mutex()); + + status = kfd_dbgmgr_unregister(dev->dbgmgr, p); + if (!status) { + kfd_dbgmgr_destroy(dev->dbgmgr); + dev->dbgmgr = NULL; + } + + mutex_unlock(kfd_get_dbgmgr_mutex()); + + return status; +} + +/* + * Parse and generate variable size data structure for address watch. + * Total size of the buffer and # watch points is limited in order + * to prevent kernel abuse. (no bearing to the much smaller HW limitation + * which is enforced by dbgdev module) + * please also note that the watch address itself are not "copied from user", + * since it be set into the HW in user mode values. 
+ * + */ +static int kfd_ioctl_dbg_address_watch(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_dbg_address_watch_args *args = data; + struct kfd_dev *dev; + struct dbg_address_watch_info aw_info; + unsigned char *args_buff; + long status; + void __user *cmd_from_user; + uint64_t watch_mask_value = 0; + unsigned int args_idx = 0; + + memset((void *) &aw_info, 0, sizeof(struct dbg_address_watch_info)); + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + if (dev->device_info->asic_family == CHIP_CARRIZO) { + pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); + return -EINVAL; + } + + cmd_from_user = (void __user *) args->content_ptr; + + /* Validate arguments */ + + if ((args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) || + (args->buf_size_in_bytes <= sizeof(*args) + sizeof(int) * 2) || + (cmd_from_user == NULL)) + return -EINVAL; + + /* this is the actual buffer to work with */ + args_buff = memdup_user(cmd_from_user, + args->buf_size_in_bytes - sizeof(*args)); + if (IS_ERR(args_buff)) + return PTR_ERR(args_buff); + + aw_info.process = p; + + aw_info.num_watch_points = *((uint32_t *)(&args_buff[args_idx])); + args_idx += sizeof(aw_info.num_watch_points); + + aw_info.watch_mode = (enum HSA_DBG_WATCH_MODE *) &args_buff[args_idx]; + args_idx += sizeof(enum HSA_DBG_WATCH_MODE) * aw_info.num_watch_points; + + /* + * set watch address base pointer to point on the array base + * within args_buff + */ + aw_info.watch_address = (uint64_t *) &args_buff[args_idx]; + + /* skip over the addresses buffer */ + args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points; + + if (args_idx >= args->buf_size_in_bytes - sizeof(*args)) { + status = -EINVAL; + goto out; + } + + watch_mask_value = (uint64_t) args_buff[args_idx]; + + if (watch_mask_value > 0) { + /* + * There is an array of masks. 
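+ * (one 64-bit mask per watch point, packed contiguously in args_buff)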
+ * set watch mask base pointer to point on the array base + * within args_buff + */ + aw_info.watch_mask = (uint64_t *) &args_buff[args_idx]; + + /* skip over the masks buffer */ + args_idx += sizeof(aw_info.watch_mask) * + aw_info.num_watch_points; + } else { + /* just the NULL mask, set to NULL and skip over it */ + aw_info.watch_mask = NULL; + args_idx += sizeof(aw_info.watch_mask); + } + + if (args_idx >= args->buf_size_in_bytes - sizeof(args)) { + status = -EINVAL; + goto out; + } + + /* Currently HSA Event is not supported for DBG */ + aw_info.watch_event = NULL; + + mutex_lock(kfd_get_dbgmgr_mutex()); + + status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info); + + mutex_unlock(kfd_get_dbgmgr_mutex()); + +out: + kfree(args_buff); + + return status; +} + +/* Parse and generate fixed size data structure for wave control */ +static int kfd_ioctl_dbg_wave_control(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_dbg_wave_control_args *args = data; + struct kfd_dev *dev; + struct dbg_wave_control_info wac_info; + unsigned char *args_buff; + uint32_t computed_buff_size; + long status; + void __user *cmd_from_user; + unsigned int args_idx = 0; + + memset((void *) &wac_info, 0, sizeof(struct dbg_wave_control_info)); + + /* we use compact form, independent of the packing attribute value */ + computed_buff_size = sizeof(*args) + + sizeof(wac_info.mode) + + sizeof(wac_info.operand) + + sizeof(wac_info.dbgWave_msg.DbgWaveMsg) + + sizeof(wac_info.dbgWave_msg.MemoryVA) + + sizeof(wac_info.trapId); + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + if (dev->device_info->asic_family == CHIP_CARRIZO) { + pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); + return -EINVAL; + } + + /* input size must match the computed "compact" size */ + if (args->buf_size_in_bytes != computed_buff_size) { + pr_debug("size mismatch, computed : actual %u : %u\n", + args->buf_size_in_bytes, computed_buff_size); + return -EINVAL; + } + + cmd_from_user = (void __user *) args->content_ptr; + + if (cmd_from_user == NULL) + return -EINVAL; + + /* copy the entire buffer from user */ + + args_buff = memdup_user(cmd_from_user, + args->buf_size_in_bytes - sizeof(*args)); + if (IS_ERR(args_buff)) + return PTR_ERR(args_buff); + + /* move ptr to the start of the "pay-load" area */ + wac_info.process = p; + + wac_info.operand = *((enum HSA_DBG_WAVEOP *)(&args_buff[args_idx])); + args_idx += sizeof(wac_info.operand); + + wac_info.mode = *((enum HSA_DBG_WAVEMODE *)(&args_buff[args_idx])); + args_idx += sizeof(wac_info.mode); + + wac_info.trapId = *((uint32_t *)(&args_buff[args_idx])); + args_idx += sizeof(wac_info.trapId); + + wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = + *((uint32_t *)(&args_buff[args_idx])); + wac_info.dbgWave_msg.MemoryVA = NULL; + + mutex_lock(kfd_get_dbgmgr_mutex()); + + pr_debug("Calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n", + wac_info.process, wac_info.operand, + wac_info.mode, wac_info.trapId, + wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); + + status = kfd_dbgmgr_wave_control(dev->dbgmgr, &wac_info); + + pr_debug("Returned status of dbg manager is %ld\n", status); + + mutex_unlock(kfd_get_dbgmgr_mutex()); + + kfree(args_buff); + + return status; +} + +static int kfd_ioctl_get_clock_counters(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_get_clock_counters_args *args = data; + struct kfd_dev *dev; + + dev = kfd_device_by_id(args->gpu_id); + if (dev) + /* 
Reading GPU clock counter from KGD */ + args->gpu_clock_counter = + dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); + else + /* Node without GPU resource */ + args->gpu_clock_counter = 0; + + /* No access to rdtsc. Using raw monotonic time */ + args->cpu_clock_counter = ktime_get_raw_ns(); + args->system_clock_counter = ktime_get_boot_ns(); + + /* Since the counter is in nano-seconds we use 1GHz frequency */ + args->system_clock_freq = 1000000000; + + return 0; +} + + +static int kfd_ioctl_get_process_apertures(struct file *filp, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_get_process_apertures_args *args = data; + struct kfd_process_device_apertures *pAperture; + struct kfd_process_device *pdd; + + dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); + + args->num_of_nodes = 0; + + mutex_lock(&p->mutex); + + /*if the process-device list isn't empty*/ + if (kfd_has_process_device_data(p)) { + /* Run over all pdd of the process */ + pdd = kfd_get_first_process_device_data(p); + do { + pAperture = + &args->process_apertures[args->num_of_nodes]; + pAperture->gpu_id = pdd->dev->id; + pAperture->lds_base = pdd->lds_base; + pAperture->lds_limit = pdd->lds_limit; + pAperture->gpuvm_base = pdd->gpuvm_base; + pAperture->gpuvm_limit = pdd->gpuvm_limit; + pAperture->scratch_base = pdd->scratch_base; + pAperture->scratch_limit = pdd->scratch_limit; + + dev_dbg(kfd_device, + "node id %u\n", args->num_of_nodes); + dev_dbg(kfd_device, + "gpu id %u\n", pdd->dev->id); + dev_dbg(kfd_device, + "lds_base %llX\n", pdd->lds_base); + dev_dbg(kfd_device, + "lds_limit %llX\n", pdd->lds_limit); + dev_dbg(kfd_device, + "gpuvm_base %llX\n", pdd->gpuvm_base); + dev_dbg(kfd_device, + "gpuvm_limit %llX\n", pdd->gpuvm_limit); + dev_dbg(kfd_device, + "scratch_base %llX\n", pdd->scratch_base); + dev_dbg(kfd_device, + "scratch_limit %llX\n", pdd->scratch_limit); + + args->num_of_nodes++; + + pdd = kfd_get_next_process_device_data(p, pdd); + } while (pdd && (args->num_of_nodes < NUM_OF_SUPPORTED_GPUS)); + } + + mutex_unlock(&p->mutex); + + return 0; +} + +static int kfd_ioctl_get_process_apertures_new(struct file *filp, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_get_process_apertures_new_args *args = data; + struct kfd_process_device_apertures *pa; + struct kfd_process_device *pdd; + uint32_t nodes = 0; + int ret; + + dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); + + if (args->num_of_nodes == 0) { + /* Return number of nodes, so that user space can alloacate + * sufficient memory + */ + mutex_lock(&p->mutex); + + if (!kfd_has_process_device_data(p)) + goto out_unlock; + + /* Run over all pdd of the process */ + pdd = kfd_get_first_process_device_data(p); + do { + args->num_of_nodes++; + pdd = kfd_get_next_process_device_data(p, pdd); + } while (pdd); + + goto out_unlock; + } + + /* Fill in process-aperture information for all available + * nodes, but not more than args->num_of_nodes as that is + * the amount of memory allocated by user + */ + pa = kzalloc((sizeof(struct kfd_process_device_apertures) * + args->num_of_nodes), GFP_KERNEL); + if (!pa) + return -ENOMEM; + + mutex_lock(&p->mutex); + + if (!kfd_has_process_device_data(p)) { + args->num_of_nodes = 0; + kfree(pa); + goto out_unlock; + } + + /* Run over all pdd of the process */ + pdd = kfd_get_first_process_device_data(p); + do { + pa[nodes].gpu_id = pdd->dev->id; + pa[nodes].lds_base = pdd->lds_base; + pa[nodes].lds_limit = pdd->lds_limit; + pa[nodes].gpuvm_base = pdd->gpuvm_base; + pa[nodes].gpuvm_limit = 
pdd->gpuvm_limit; + pa[nodes].scratch_base = pdd->scratch_base; + pa[nodes].scratch_limit = pdd->scratch_limit; + + dev_dbg(kfd_device, + "gpu id %u\n", pdd->dev->id); + dev_dbg(kfd_device, + "lds_base %llX\n", pdd->lds_base); + dev_dbg(kfd_device, + "lds_limit %llX\n", pdd->lds_limit); + dev_dbg(kfd_device, + "gpuvm_base %llX\n", pdd->gpuvm_base); + dev_dbg(kfd_device, + "gpuvm_limit %llX\n", pdd->gpuvm_limit); + dev_dbg(kfd_device, + "scratch_base %llX\n", pdd->scratch_base); + dev_dbg(kfd_device, + "scratch_limit %llX\n", pdd->scratch_limit); + nodes++; + + pdd = kfd_get_next_process_device_data(p, pdd); + } while (pdd && (nodes < args->num_of_nodes)); + mutex_unlock(&p->mutex); + + args->num_of_nodes = nodes; + ret = copy_to_user( + (void __user *)args->kfd_process_device_apertures_ptr, + pa, + (nodes * sizeof(struct kfd_process_device_apertures))); + kfree(pa); + return ret ? -EFAULT : 0; + +out_unlock: + mutex_unlock(&p->mutex); + return 0; +} + +static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_create_event_args *args = data; + int err; + + /* For dGPUs the event page is allocated in user mode. The + * handle is passed to KFD with the first call to this IOCTL + * through the event_page_offset field. + */ + if (args->event_page_offset) { + struct kfd_dev *kfd; + struct kfd_process_device *pdd; + void *mem, *kern_addr; + uint64_t size; + + if (p->signal_page) { + pr_err("Event page is already set\n"); + return -EINVAL; + } + + kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset)); + if (!kfd) { + pr_err("Getting device by id failed in %s\n", __func__); + return -EINVAL; + } + + mutex_lock(&p->mutex); + pdd = kfd_bind_process_to_device(kfd, p); + if (IS_ERR(pdd)) { + err = PTR_ERR(pdd); + goto out_unlock; + } + + mem = kfd_process_device_translate_handle(pdd, + GET_IDR_HANDLE(args->event_page_offset)); + if (!mem) { + pr_err("Can't find BO, offset is 0x%llx\n", + args->event_page_offset); + err = -EINVAL; + goto out_unlock; + } + mutex_unlock(&p->mutex); + + err = kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd, + mem, &kern_addr, &size); + if (err) { + pr_err("Failed to map event page to kernel\n"); + return err; + } + + err = kfd_event_page_set(p, kern_addr, size); + if (err) { + pr_err("Failed to set event page\n"); + return err; + } + } + + err = kfd_event_create(filp, p, args->event_type, + args->auto_reset != 0, args->node_id, + &args->event_id, &args->event_trigger_data, + &args->event_page_offset, + &args->event_slot_index); + + return err; + +out_unlock: + mutex_unlock(&p->mutex); + return err; +} + +static int kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_destroy_event_args *args = data; + + return kfd_event_destroy(p, args->event_id); +} + +static int kfd_ioctl_set_event(struct file *filp, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_set_event_args *args = data; + + return kfd_set_event(p, args->event_id); +} + +static int kfd_ioctl_reset_event(struct file *filp, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_reset_event_args *args = data; + + return kfd_reset_event(p, args->event_id); +} + +static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_wait_events_args *args = data; + int err; + + err = kfd_wait_on_events(p, args->num_events, + (void __user *)args->events_ptr, + (args->wait_for_all != 0), + args->timeout, &args->wait_result); + + return err; +} +static int 
kfd_ioctl_set_scratch_backing_va(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_set_scratch_backing_va_args *args = data; + struct kfd_process_device *pdd; + struct kfd_dev *dev; + long err; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = PTR_ERR(pdd); + goto bind_process_to_device_fail; + } + + pdd->qpd.sh_hidden_private_base = args->va_addr; + + mutex_unlock(&p->mutex); + + if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && + pdd->qpd.vmid != 0) + dev->kfd2kgd->set_scratch_backing_va( + dev->kgd, args->va_addr, pdd->qpd.vmid); + + return 0; + +bind_process_to_device_fail: + mutex_unlock(&p->mutex); + return err; +} + +static int kfd_ioctl_get_tile_config(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_get_tile_config_args *args = data; + struct kfd_dev *dev; + struct tile_config config; + int err = 0; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + dev->kfd2kgd->get_tile_config(dev->kgd, &config); + + args->gb_addr_config = config.gb_addr_config; + args->num_banks = config.num_banks; + args->num_ranks = config.num_ranks; + + if (args->num_tile_configs > config.num_tile_configs) + args->num_tile_configs = config.num_tile_configs; + err = copy_to_user((void __user *)args->tile_config_ptr, + config.tile_config_ptr, + args->num_tile_configs * sizeof(uint32_t)); + if (err) { + args->num_tile_configs = 0; + return -EFAULT; + } + + if (args->num_macro_tile_configs > config.num_macro_tile_configs) + args->num_macro_tile_configs = + config.num_macro_tile_configs; + err = copy_to_user((void __user *)args->macro_tile_config_ptr, + config.macro_tile_config_ptr, + args->num_macro_tile_configs * sizeof(uint32_t)); + if (err) { + args->num_macro_tile_configs = 0; + return -EFAULT; + } + + return 0; +} + +static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_acquire_vm_args *args = data; + struct kfd_process_device *pdd; + struct kfd_dev *dev; + struct file *drm_file; + int ret; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + drm_file = fget(args->drm_fd); + if (!drm_file) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { + ret = -EINVAL; + goto err_unlock; + } + + if (pdd->drm_file) { + ret = pdd->drm_file == drm_file ? 
0 : -EBUSY; + goto err_unlock; + } + + ret = kfd_process_device_init_vm(pdd, drm_file); + if (ret) + goto err_unlock; + /* On success, the PDD keeps the drm_file reference */ + mutex_unlock(&p->mutex); + + return 0; + +err_unlock: + mutex_unlock(&p->mutex); + fput(drm_file); + return ret; +} + +static bool kfd_dev_is_large_bar(struct kfd_dev *dev) +{ + struct kfd_local_mem_info mem_info; + + if (debug_largebar) { + pr_debug("Simulate large-bar allocation on non large-bar machine\n"); + return true; + } + + if (dev->device_info->needs_iommu_device) + return false; + + dev->kfd2kgd->get_local_mem_info(dev->kgd, &mem_info); + if (mem_info.local_mem_size_private == 0 && + mem_info.local_mem_size_public > 0) + return true; + return false; +} + +static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_alloc_memory_of_gpu_args *args = data; + struct kfd_process_device *pdd; + void *mem; + struct kfd_dev *dev; + int idr_handle; + long err; + uint64_t offset = args->mmap_offset; + uint32_t flags = args->flags; + + if (args->size == 0) + return -EINVAL; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + if ((flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) && + (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) && + !kfd_dev_is_large_bar(dev)) { + pr_err("Alloc host visible vram on small bar is not allowed\n"); + return -EINVAL; + } + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = PTR_ERR(pdd); + goto err_unlock; + } + + err = dev->kfd2kgd->alloc_memory_of_gpu( + dev->kgd, args->va_addr, args->size, + pdd->vm, (struct kgd_mem **) &mem, &offset, + flags); + + if (err) + goto err_unlock; + + idr_handle = kfd_process_device_create_obj_handle(pdd, mem); + if (idr_handle < 0) { + err = -EFAULT; + goto err_free; + } + + mutex_unlock(&p->mutex); + + args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); + args->mmap_offset = offset; + + return 0; + +err_free: + dev->kfd2kgd->free_memory_of_gpu(dev->kgd, (struct kgd_mem *)mem); +err_unlock: + mutex_unlock(&p->mutex); + return err; +} + +static int kfd_ioctl_free_memory_of_gpu(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_free_memory_of_gpu_args *args = data; + struct kfd_process_device *pdd; + void *mem; + struct kfd_dev *dev; + int ret; + + dev = kfd_device_by_id(GET_GPU_ID(args->handle)); + if (!dev) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + ret = -EINVAL; + goto err_unlock; + } + + mem = kfd_process_device_translate_handle( + pdd, GET_IDR_HANDLE(args->handle)); + if (!mem) { + ret = -EINVAL; + goto err_unlock; + } + + ret = dev->kfd2kgd->free_memory_of_gpu(dev->kgd, (struct kgd_mem *)mem); + + /* If freeing the buffer failed, leave the handle in place for + * clean-up during process tear-down. 
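+ * (handles left in the per-device IDR are reclaimed when the process is
+ * torn down)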
+ */ + if (!ret) + kfd_process_device_remove_obj_handle( + pdd, GET_IDR_HANDLE(args->handle)); + +err_unlock: + mutex_unlock(&p->mutex); + return ret; +} + +static int kfd_ioctl_map_memory_to_gpu(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_map_memory_to_gpu_args *args = data; + struct kfd_process_device *pdd, *peer_pdd; + void *mem; + struct kfd_dev *dev, *peer; + long err = 0; + int i; + uint32_t *devices_arr = NULL; + + dev = kfd_device_by_id(GET_GPU_ID(args->handle)); + if (!dev) + return -EINVAL; + + if (!args->n_devices) { + pr_debug("Device IDs array empty\n"); + return -EINVAL; + } + if (args->n_success > args->n_devices) { + pr_debug("n_success exceeds n_devices\n"); + return -EINVAL; + } + + devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr), + GFP_KERNEL); + if (!devices_arr) + return -ENOMEM; + + err = copy_from_user(devices_arr, + (void __user *)args->device_ids_array_ptr, + args->n_devices * sizeof(*devices_arr)); + if (err != 0) { + err = -EFAULT; + goto copy_from_user_failed; + } + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = PTR_ERR(pdd); + goto bind_process_to_device_failed; + } + + mem = kfd_process_device_translate_handle(pdd, + GET_IDR_HANDLE(args->handle)); + if (!mem) { + err = -ENOMEM; + goto get_mem_obj_from_handle_failed; + } + + for (i = args->n_success; i < args->n_devices; i++) { + peer = kfd_device_by_id(devices_arr[i]); + if (!peer) { + pr_debug("Getting device by id failed for 0x%x\n", + devices_arr[i]); + err = -EINVAL; + goto get_mem_obj_from_handle_failed; + } + + peer_pdd = kfd_bind_process_to_device(peer, p); + if (IS_ERR(peer_pdd)) { + err = PTR_ERR(peer_pdd); + goto get_mem_obj_from_handle_failed; + } + err = peer->kfd2kgd->map_memory_to_gpu( + peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm); + if (err) { + pr_err("Failed to map to gpu %d/%d\n", + i, args->n_devices); + goto map_memory_to_gpu_failed; + } + args->n_success = i+1; + } + + mutex_unlock(&p->mutex); + + err = dev->kfd2kgd->sync_memory(dev->kgd, (struct kgd_mem *) mem, true); + if (err) { + pr_debug("Sync memory failed, wait interrupted by user signal\n"); + goto sync_memory_failed; + } + + /* Flush TLBs after waiting for the page table updates to complete */ + for (i = 0; i < args->n_devices; i++) { + peer = kfd_device_by_id(devices_arr[i]); + if (WARN_ON_ONCE(!peer)) + continue; + peer_pdd = kfd_get_process_device_data(peer, p); + if (WARN_ON_ONCE(!peer_pdd)) + continue; + kfd_flush_tlb(peer_pdd); + } + + kfree(devices_arr); + + return err; + +bind_process_to_device_failed: +get_mem_obj_from_handle_failed: +map_memory_to_gpu_failed: + mutex_unlock(&p->mutex); +copy_from_user_failed: +sync_memory_failed: + kfree(devices_arr); + + return err; +} + +static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_unmap_memory_from_gpu_args *args = data; + struct kfd_process_device *pdd, *peer_pdd; + void *mem; + struct kfd_dev *dev, *peer; + long err = 0; + uint32_t *devices_arr = NULL, i; + + dev = kfd_device_by_id(GET_GPU_ID(args->handle)); + if (!dev) + return -EINVAL; + + if (!args->n_devices) { + pr_debug("Device IDs array empty\n"); + return -EINVAL; + } + if (args->n_success > args->n_devices) { + pr_debug("n_success exceeds n_devices\n"); + return -EINVAL; + } + + devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr), + GFP_KERNEL); + if (!devices_arr) + return -ENOMEM; + + err = copy_from_user(devices_arr, + 
(void __user *)args->device_ids_array_ptr, + args->n_devices * sizeof(*devices_arr)); + if (err != 0) { + err = -EFAULT; + goto copy_from_user_failed; + } + + mutex_lock(&p->mutex); + + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { + err = -EINVAL; + goto bind_process_to_device_failed; + } + + mem = kfd_process_device_translate_handle(pdd, + GET_IDR_HANDLE(args->handle)); + if (!mem) { + err = -ENOMEM; + goto get_mem_obj_from_handle_failed; + } + + for (i = args->n_success; i < args->n_devices; i++) { + peer = kfd_device_by_id(devices_arr[i]); + if (!peer) { + err = -EINVAL; + goto get_mem_obj_from_handle_failed; + } + + peer_pdd = kfd_get_process_device_data(peer, p); + if (!peer_pdd) { + err = -ENODEV; + goto get_mem_obj_from_handle_failed; + } + err = dev->kfd2kgd->unmap_memory_to_gpu( + peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm); + if (err) { + pr_err("Failed to unmap from gpu %d/%d\n", + i, args->n_devices); + goto unmap_memory_from_gpu_failed; + } + args->n_success = i+1; + } + kfree(devices_arr); + + mutex_unlock(&p->mutex); + + return 0; + +bind_process_to_device_failed: +get_mem_obj_from_handle_failed: +unmap_memory_from_gpu_failed: + mutex_unlock(&p->mutex); +copy_from_user_failed: + kfree(devices_arr); + return err; +} + +#define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ + [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ + .cmd_drv = 0, .name = #ioctl} + +/** Ioctl table */ +static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_VERSION, + kfd_ioctl_get_version, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_CREATE_QUEUE, + kfd_ioctl_create_queue, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DESTROY_QUEUE, + kfd_ioctl_destroy_queue, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_MEMORY_POLICY, + kfd_ioctl_set_memory_policy, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_CLOCK_COUNTERS, + kfd_ioctl_get_clock_counters, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES, + kfd_ioctl_get_process_apertures, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_UPDATE_QUEUE, + kfd_ioctl_update_queue, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_CREATE_EVENT, + kfd_ioctl_create_event, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DESTROY_EVENT, + kfd_ioctl_destroy_event, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_EVENT, + kfd_ioctl_set_event, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_RESET_EVENT, + kfd_ioctl_reset_event, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_WAIT_EVENTS, + kfd_ioctl_wait_events, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_REGISTER, + kfd_ioctl_dbg_register, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_UNREGISTER, + kfd_ioctl_dbg_unregister, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_ADDRESS_WATCH, + kfd_ioctl_dbg_address_watch, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL, + kfd_ioctl_dbg_wave_control, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_SCRATCH_BACKING_VA, + kfd_ioctl_set_scratch_backing_va, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG, + kfd_ioctl_get_tile_config, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER, + kfd_ioctl_set_trap_handler, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW, + kfd_ioctl_get_process_apertures_new, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_ACQUIRE_VM, + kfd_ioctl_acquire_vm, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, + kfd_ioctl_alloc_memory_of_gpu, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_FREE_MEMORY_OF_GPU, + kfd_ioctl_free_memory_of_gpu, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU, + kfd_ioctl_map_memory_to_gpu, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, + 
kfd_ioctl_unmap_memory_from_gpu, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK, + kfd_ioctl_set_cu_mask, 0), + +}; + +#define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) + +static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct kfd_process *process; + amdkfd_ioctl_t *func; + const struct amdkfd_ioctl_desc *ioctl = NULL; + unsigned int nr = _IOC_NR(cmd); + char stack_kdata[128]; + char *kdata = NULL; + unsigned int usize, asize; + int retcode = -EINVAL; + + if (nr >= AMDKFD_CORE_IOCTL_COUNT) + goto err_i1; + + if ((nr >= AMDKFD_COMMAND_START) && (nr < AMDKFD_COMMAND_END)) { + u32 amdkfd_size; + + ioctl = &amdkfd_ioctls[nr]; + + amdkfd_size = _IOC_SIZE(ioctl->cmd); + usize = asize = _IOC_SIZE(cmd); + if (amdkfd_size > asize) + asize = amdkfd_size; + + cmd = ioctl->cmd; + } else + goto err_i1; + + dev_dbg(kfd_device, "ioctl cmd 0x%x (#%d), arg 0x%lx\n", cmd, nr, arg); + + process = kfd_get_process(current); + if (IS_ERR(process)) { + dev_dbg(kfd_device, "no process\n"); + goto err_i1; + } + + /* Do not trust userspace, use our own definition */ + func = ioctl->func; + + if (unlikely(!func)) { + dev_dbg(kfd_device, "no function\n"); + retcode = -EINVAL; + goto err_i1; + } + + if (cmd & (IOC_IN | IOC_OUT)) { + if (asize <= sizeof(stack_kdata)) { + kdata = stack_kdata; + } else { + kdata = kmalloc(asize, GFP_KERNEL); + if (!kdata) { + retcode = -ENOMEM; + goto err_i1; + } + } + if (asize > usize) + memset(kdata + usize, 0, asize - usize); + } + + if (cmd & IOC_IN) { + if (copy_from_user(kdata, (void __user *)arg, usize) != 0) { + retcode = -EFAULT; + goto err_i1; + } + } else if (cmd & IOC_OUT) { + memset(kdata, 0, usize); + } + + retcode = func(filep, process, kdata); + + if (cmd & IOC_OUT) + if (copy_to_user((void __user *)arg, kdata, usize) != 0) + retcode = -EFAULT; + +err_i1: + if (!ioctl) + dev_dbg(kfd_device, "invalid ioctl: pid=%d, cmd=0x%02x, nr=0x%02x\n", + task_pid_nr(current), cmd, nr); + + if (kdata != stack_kdata) + kfree(kdata); + + if (retcode) + dev_dbg(kfd_device, "ret = %d\n", retcode); + + return retcode; +} + +static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct kfd_process *process; + struct kfd_dev *dev = NULL; + unsigned long vm_pgoff; + unsigned int gpu_id; + + process = kfd_get_process(current); + if (IS_ERR(process)) + return PTR_ERR(process); + + vm_pgoff = vma->vm_pgoff; + vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vm_pgoff); + gpu_id = KFD_MMAP_GPU_ID_GET(vm_pgoff); + if (gpu_id) + dev = kfd_device_by_id(gpu_id); + + switch (vm_pgoff & KFD_MMAP_TYPE_MASK) { + case KFD_MMAP_TYPE_DOORBELL: + if (!dev) + return -ENODEV; + return kfd_doorbell_mmap(dev, process, vma); + + case KFD_MMAP_TYPE_EVENTS: + return kfd_event_mmap(process, vma); + + case KFD_MMAP_TYPE_RESERVED_MEM: + if (!dev) + return -ENODEV; + return kfd_reserved_mem_mmap(dev, process, vma); + } + + return -EFAULT; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c new file mode 100644 index 000000000..e2780643f --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -0,0 +1,1308 @@ +/* + * Copyright 2015-2017 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/pci.h> +#include <linux/acpi.h> +#include "kfd_crat.h" +#include "kfd_priv.h" +#include "kfd_topology.h" +#include "kfd_iommu.h" + +/* GPU Processor ID base for dGPUs for which VCRAT needs to be created. + * GPU processor ID are expressed with Bit[31]=1. + * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs + * used in the CRAT. + */ +static uint32_t gpu_processor_id_low = 0x80001000; + +/* Return the next available gpu_processor_id and increment it for next GPU + * @total_cu_count - Total CUs present in the GPU including ones + * masked off + */ +static inline unsigned int get_and_inc_gpu_processor_id( + unsigned int total_cu_count) +{ + int current_id = gpu_processor_id_low; + + gpu_processor_id_low += total_cu_count; + return current_id; +} + +/* Static table to describe GPU Cache information */ +struct kfd_gpu_cache_info { + uint32_t cache_size; + uint32_t cache_level; + uint32_t flags; + /* Indicates how many Compute Units share this cache + * Value = 1 indicates the cache is not shared + */ + uint32_t num_cu_shared; +}; + +static struct kfd_gpu_cache_info kaveri_cache_info[] = { + { + /* TCP L1 Cache per CU */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 1, + + }, + { + /* Scalar L1 Instruction Cache (in SQC module) per bank */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_INST_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + { + /* Scalar L1 Data Cache (in SQC module) per bank */ + .cache_size = 8, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + + /* TODO: Add L2 Cache information */ +}; + + +static struct kfd_gpu_cache_info carrizo_cache_info[] = { + { + /* TCP L1 Cache per CU */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 1, + }, + { + /* Scalar L1 Instruction Cache (in SQC module) per bank */ + .cache_size = 8, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_INST_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 4, + }, + { + /* Scalar L1 Data Cache (in SQC module) per bank. 
*/ + .cache_size = 4, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 4, + }, + + /* TODO: Add L2 Cache information */ +}; + +/* NOTE: In future if more information is added to struct kfd_gpu_cache_info + * the following ASICs may need a separate table. + */ +#define hawaii_cache_info kaveri_cache_info +#define tonga_cache_info carrizo_cache_info +#define fiji_cache_info carrizo_cache_info +#define polaris10_cache_info carrizo_cache_info +#define polaris11_cache_info carrizo_cache_info +/* TODO - check & update Vega10 cache details */ +#define vega10_cache_info carrizo_cache_info +#define raven_cache_info carrizo_cache_info + +static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + dev->node_props.cpu_cores_count = cu->num_cpu_cores; + dev->node_props.cpu_core_id_base = cu->processor_id_low; + if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) + dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + + pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, + cu->processor_id_low); +} + +static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + dev->node_props.simd_id_base = cu->processor_id_low; + dev->node_props.simd_count = cu->num_simd_cores; + dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; + dev->node_props.max_waves_per_simd = cu->max_waves_simd; + dev->node_props.wave_front_size = cu->wave_front_size; + dev->node_props.array_count = cu->array_count; + dev->node_props.cu_per_simd_array = cu->num_cu_per_array; + dev->node_props.simd_per_cu = cu->num_simd_per_cu; + dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; + if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) + dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; + pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low); +} + +/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, + struct list_head *device_list) +{ + struct kfd_topology_device *dev; + + pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", + cu->proximity_domain, cu->hsa_capability); + list_for_each_entry(dev, device_list, list) { + if (cu->proximity_domain == dev->proximity_domain) { + if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) + kfd_populated_cu_info_cpu(dev, cu); + + if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) + kfd_populated_cu_info_gpu(dev, cu); + break; + } + } + + return 0; +} + +static struct kfd_mem_properties * +find_subtype_mem(uint32_t heap_type, uint32_t flags, uint32_t width, + struct kfd_topology_device *dev) +{ + struct kfd_mem_properties *props; + + list_for_each_entry(props, &dev->mem_props, list) { + if (props->heap_type == heap_type + && props->flags == flags + && props->width == width) + return props; + } + + return NULL; +} +/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, + struct list_head *device_list) +{ + struct kfd_mem_properties *props; + struct kfd_topology_device *dev; + uint32_t heap_type; + uint64_t size_in_bytes; + uint32_t flags = 0; + uint32_t width; + + pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n", + mem->proximity_domain); + list_for_each_entry(dev, 
device_list, list) { + if (mem->proximity_domain == dev->proximity_domain) { + /* We're on GPU node */ + if (dev->node_props.cpu_cores_count == 0) { + /* APU */ + if (mem->visibility_type == 0) + heap_type = + HSA_MEM_HEAP_TYPE_FB_PRIVATE; + /* dGPU */ + else + heap_type = mem->visibility_type; + } else + heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; + + if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) + flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; + if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) + flags |= HSA_MEM_FLAGS_NON_VOLATILE; + + size_in_bytes = + ((uint64_t)mem->length_high << 32) + + mem->length_low; + width = mem->width; + + /* Multiple banks of the same type are aggregated into + * one. User mode doesn't care about multiple physical + * memory segments. It's managed as a single virtual + * heap for user mode. + */ + props = find_subtype_mem(heap_type, flags, width, dev); + if (props) { + props->size_in_bytes += size_in_bytes; + break; + } + + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + + props->heap_type = heap_type; + props->flags = flags; + props->size_in_bytes = size_in_bytes; + props->width = width; + + dev->node_props.mem_banks_count++; + list_add_tail(&props->list, &dev->mem_props); + + break; + } + } + + return 0; +} + +/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, + struct list_head *device_list) +{ + struct kfd_cache_properties *props; + struct kfd_topology_device *dev; + uint32_t id; + uint32_t total_num_of_cu; + + id = cache->processor_id_low; + + pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id); + list_for_each_entry(dev, device_list, list) { + total_num_of_cu = (dev->node_props.array_count * + dev->node_props.cu_per_simd_array); + + /* Cache infomration in CRAT doesn't have proximity_domain + * information as it is associated with a CPU core or GPU + * Compute Unit. So map the cache using CPU core Id or SIMD + * (GPU) ID. + * TODO: This works because currently we can safely assume that + * Compute Units are parsed before caches are parsed. 
In + * future, remove this dependency + */ + if ((id >= dev->node_props.cpu_core_id_base && + id <= dev->node_props.cpu_core_id_base + + dev->node_props.cpu_cores_count) || + (id >= dev->node_props.simd_id_base && + id < dev->node_props.simd_id_base + + total_num_of_cu)) { + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + + props->processor_id_low = id; + props->cache_level = cache->cache_level; + props->cache_size = cache->cache_size; + props->cacheline_size = cache->cache_line_size; + props->cachelines_per_tag = cache->lines_per_tag; + props->cache_assoc = cache->associativity; + props->cache_latency = cache->cache_latency; + memcpy(props->sibling_map, cache->sibling_map, + sizeof(props->sibling_map)); + + if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) + props->cache_type |= HSA_CACHE_TYPE_DATA; + if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) + props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; + if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) + props->cache_type |= HSA_CACHE_TYPE_CPU; + if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) + props->cache_type |= HSA_CACHE_TYPE_HSACU; + + dev->cache_count++; + dev->node_props.caches_count++; + list_add_tail(&props->list, &dev->cache_props); + + break; + } + } + + return 0; +} + +/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, + struct list_head *device_list) +{ + struct kfd_iolink_properties *props = NULL, *props2; + struct kfd_topology_device *dev, *cpu_dev; + uint32_t id_from; + uint32_t id_to; + + id_from = iolink->proximity_domain_from; + id_to = iolink->proximity_domain_to; + + pr_debug("Found IO link entry in CRAT table with id_from=%d\n", + id_from); + list_for_each_entry(dev, device_list, list) { + if (id_from == dev->proximity_domain) { + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + + props->node_from = id_from; + props->node_to = id_to; + props->ver_maj = iolink->version_major; + props->ver_min = iolink->version_minor; + props->iolink_type = iolink->io_interface_type; + + if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) + props->weight = 20; + else + props->weight = node_distance(id_from, id_to); + + props->min_latency = iolink->minimum_latency; + props->max_latency = iolink->maximum_latency; + props->min_bandwidth = iolink->minimum_bandwidth_mbs; + props->max_bandwidth = iolink->maximum_bandwidth_mbs; + props->rec_transfer_size = + iolink->recommended_transfer_size; + + dev->io_link_count++; + dev->node_props.io_links_count++; + list_add_tail(&props->list, &dev->io_link_props); + break; + } + } + + /* CPU topology is created before GPUs are detected, so CPU->GPU + * links are not built at that time. If a PCIe type is discovered, it + * means a GPU is detected and we are adding GPU->CPU to the topology. + * At this time, also add the corresponded CPU->GPU link. 
+ */ + if (props && props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) { + cpu_dev = kfd_topology_device_by_proximity_domain(id_to); + if (!cpu_dev) + return -ENODEV; + /* same everything but the other direction */ + props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL); + props2->node_from = id_to; + props2->node_to = id_from; + props2->kobj = NULL; + cpu_dev->io_link_count++; + cpu_dev->node_props.io_links_count++; + list_add_tail(&props2->list, &cpu_dev->io_link_props); + } + + return 0; +} + +/* kfd_parse_subtype - parse subtypes and attach it to correct topology device + * present in the device_list + * @sub_type_hdr - subtype section of crat_image + * @device_list - list of topology devices present in this crat_image + */ +static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, + struct list_head *device_list) +{ + struct crat_subtype_computeunit *cu; + struct crat_subtype_memory *mem; + struct crat_subtype_cache *cache; + struct crat_subtype_iolink *iolink; + int ret = 0; + + switch (sub_type_hdr->type) { + case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: + cu = (struct crat_subtype_computeunit *)sub_type_hdr; + ret = kfd_parse_subtype_cu(cu, device_list); + break; + case CRAT_SUBTYPE_MEMORY_AFFINITY: + mem = (struct crat_subtype_memory *)sub_type_hdr; + ret = kfd_parse_subtype_mem(mem, device_list); + break; + case CRAT_SUBTYPE_CACHE_AFFINITY: + cache = (struct crat_subtype_cache *)sub_type_hdr; + ret = kfd_parse_subtype_cache(cache, device_list); + break; + case CRAT_SUBTYPE_TLB_AFFINITY: + /* + * For now, nothing to do here + */ + pr_debug("Found TLB entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: + /* + * For now, nothing to do here + */ + pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_IOLINK_AFFINITY: + iolink = (struct crat_subtype_iolink *)sub_type_hdr; + ret = kfd_parse_subtype_iolink(iolink, device_list); + break; + default: + pr_warn("Unknown subtype %d in CRAT\n", + sub_type_hdr->type); + } + + return ret; +} + +/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT + * create a kfd_topology_device and add in to device_list. 
Also parse + * CRAT subtypes and attach it to appropriate kfd_topology_device + * @crat_image - input image containing CRAT + * @device_list - [OUT] list of kfd_topology_device generated after + * parsing crat_image + * @proximity_domain - Proximity domain of the first device in the table + * + * Return - 0 if successful else -ve value + */ +int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, + uint32_t proximity_domain) +{ + struct kfd_topology_device *top_dev = NULL; + struct crat_subtype_generic *sub_type_hdr; + uint16_t node_id; + int ret = 0; + struct crat_header *crat_table = (struct crat_header *)crat_image; + uint16_t num_nodes; + uint32_t image_len; + + if (!crat_image) + return -EINVAL; + + if (!list_empty(device_list)) { + pr_warn("Error device list should be empty\n"); + return -EINVAL; + } + + num_nodes = crat_table->num_domains; + image_len = crat_table->length; + + pr_info("Parsing CRAT table with %d nodes\n", num_nodes); + + for (node_id = 0; node_id < num_nodes; node_id++) { + top_dev = kfd_create_topology_device(device_list); + if (!top_dev) + break; + top_dev->proximity_domain = proximity_domain++; + } + + if (!top_dev) { + ret = -ENOMEM; + goto err; + } + + memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH); + memcpy(top_dev->oem_table_id, crat_table->oem_table_id, + CRAT_OEMTABLEID_LENGTH); + top_dev->oem_revision = crat_table->oem_revision; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); + while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < + ((char *)crat_image) + image_len) { + if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { + ret = kfd_parse_subtype(sub_type_hdr, device_list); + if (ret) + break; + } + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + } + +err: + if (ret) + kfd_release_topology_device_list(device_list); + + return ret; +} + +/* Helper function. See kfd_fill_gpu_cache_info for parameter description */ +static int fill_in_pcache(struct crat_subtype_cache *pcache, + struct kfd_gpu_cache_info *pcache_info, + struct kfd_cu_info *cu_info, + int mem_available, + int cu_bitmask, + int cache_type, unsigned int cu_processor_id, + int cu_block) +{ + unsigned int cu_sibling_map_mask; + int first_active_cu; + + /* First check if enough memory is available */ + if (sizeof(struct crat_subtype_cache) > mem_available) + return -ENOMEM; + + cu_sibling_map_mask = cu_bitmask; + cu_sibling_map_mask >>= cu_block; + cu_sibling_map_mask &= + ((1 << pcache_info[cache_type].num_cu_shared) - 1); + first_active_cu = ffs(cu_sibling_map_mask); + + /* CU could be inactive. In case of shared cache find the first active + * CU. and incase of non-shared cache check if the CU is inactive. 
If + * inactive active skip it + */ + if (first_active_cu) { + memset(pcache, 0, sizeof(struct crat_subtype_cache)); + pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY; + pcache->length = sizeof(struct crat_subtype_cache); + pcache->flags = pcache_info[cache_type].flags; + pcache->processor_id_low = cu_processor_id + + (first_active_cu - 1); + pcache->cache_level = pcache_info[cache_type].cache_level; + pcache->cache_size = pcache_info[cache_type].cache_size; + + /* Sibling map is w.r.t processor_id_low, so shift out + * inactive CU + */ + cu_sibling_map_mask = + cu_sibling_map_mask >> (first_active_cu - 1); + + pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF); + pcache->sibling_map[1] = + (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF); + pcache->sibling_map[2] = + (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF); + pcache->sibling_map[3] = + (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF); + return 0; + } + return 1; +} + +/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info + * tables + * + * @kdev - [IN] GPU device + * @gpu_processor_id - [IN] GPU processor ID to which these caches + * associate + * @available_size - [IN] Amount of memory available in pcache + * @cu_info - [IN] Compute Unit info obtained from KGD + * @pcache - [OUT] memory into which cache data is to be filled in. + * @size_filled - [OUT] amount of data used up in pcache. + * @num_of_entries - [OUT] number of caches added + */ +static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, + int gpu_processor_id, + int available_size, + struct kfd_cu_info *cu_info, + struct crat_subtype_cache *pcache, + int *size_filled, + int *num_of_entries) +{ + struct kfd_gpu_cache_info *pcache_info; + int num_of_cache_types = 0; + int i, j, k; + int ct = 0; + int mem_available = available_size; + unsigned int cu_processor_id; + int ret; + + switch (kdev->device_info->asic_family) { + case CHIP_KAVERI: + pcache_info = kaveri_cache_info; + num_of_cache_types = ARRAY_SIZE(kaveri_cache_info); + break; + case CHIP_HAWAII: + pcache_info = hawaii_cache_info; + num_of_cache_types = ARRAY_SIZE(hawaii_cache_info); + break; + case CHIP_CARRIZO: + pcache_info = carrizo_cache_info; + num_of_cache_types = ARRAY_SIZE(carrizo_cache_info); + break; + case CHIP_TONGA: + pcache_info = tonga_cache_info; + num_of_cache_types = ARRAY_SIZE(tonga_cache_info); + break; + case CHIP_FIJI: + pcache_info = fiji_cache_info; + num_of_cache_types = ARRAY_SIZE(fiji_cache_info); + break; + case CHIP_POLARIS10: + pcache_info = polaris10_cache_info; + num_of_cache_types = ARRAY_SIZE(polaris10_cache_info); + break; + case CHIP_POLARIS11: + pcache_info = polaris11_cache_info; + num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); + break; + case CHIP_VEGA10: + pcache_info = vega10_cache_info; + num_of_cache_types = ARRAY_SIZE(vega10_cache_info); + break; + case CHIP_RAVEN: + pcache_info = raven_cache_info; + num_of_cache_types = ARRAY_SIZE(raven_cache_info); + break; + default: + return -EINVAL; + } + + *size_filled = 0; + *num_of_entries = 0; + + /* For each type of cache listed in the kfd_gpu_cache_info table, + * go through all available Compute Units. 
+ * The [i,j,k] loop will + * if kfd_gpu_cache_info.num_cu_shared = 1 + * will parse through all available CU + * If (kfd_gpu_cache_info.num_cu_shared != 1) + * then it will consider only one CU from + * the shared unit + */ + + for (ct = 0; ct < num_of_cache_types; ct++) { + cu_processor_id = gpu_processor_id; + for (i = 0; i < cu_info->num_shader_engines; i++) { + for (j = 0; j < cu_info->num_shader_arrays_per_engine; + j++) { + for (k = 0; k < cu_info->num_cu_per_sh; + k += pcache_info[ct].num_cu_shared) { + + ret = fill_in_pcache(pcache, + pcache_info, + cu_info, + mem_available, + cu_info->cu_bitmap[i][j], + ct, + cu_processor_id, + k); + + if (ret < 0) + break; + + if (!ret) { + pcache++; + (*num_of_entries)++; + mem_available -= + sizeof(*pcache); + (*size_filled) += + sizeof(*pcache); + } + + /* Move to next CU block */ + cu_processor_id += + pcache_info[ct].num_cu_shared; + } + } + } + } + + pr_debug("Added [%d] GPU cache entries\n", *num_of_entries); + + return 0; +} + +/* + * kfd_create_crat_image_acpi - Allocates memory for CRAT image and + * copies CRAT from ACPI (if available). + * NOTE: Call kfd_destroy_crat_image to free CRAT image memory + * + * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then + * crat_image will be NULL + * @size: [OUT] size of crat_image + * + * Return 0 if successful else return error code + */ +int kfd_create_crat_image_acpi(void **crat_image, size_t *size) +{ + struct acpi_table_header *crat_table; + acpi_status status; + void *pcrat_image; + + if (!crat_image) + return -EINVAL; + + *crat_image = NULL; + + /* Fetch the CRAT table from ACPI */ + status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); + if (status == AE_NOT_FOUND) { + pr_info("CRAT table not found\n"); + return -ENODATA; + } else if (ACPI_FAILURE(status)) { + const char *err = acpi_format_exception(status); + + pr_err("CRAT table error: %s\n", err); + return -EINVAL; + } + + if (ignore_crat) { + pr_info("CRAT table disabled by module option\n"); + return -ENODATA; + } + + pcrat_image = kmalloc(crat_table->length, GFP_KERNEL); + if (!pcrat_image) + return -ENOMEM; + + memcpy(pcrat_image, crat_table, crat_table->length); + + *crat_image = pcrat_image; + *size = crat_table->length; + + return 0; +} + +/* Memory required to create Virtual CRAT. + * Since there is no easy way to predict the amount of memory required, the + * following amount are allocated for CPU and GPU Virtual CRAT. This is + * expected to cover all known conditions. But to be safe additional check + * is put in the code to ensure we don't overwrite. 
+ */ +#define VCRAT_SIZE_FOR_CPU (2 * PAGE_SIZE) +#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE) + +/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node + * + * @numa_node_id: CPU NUMA node id + * @avail_size: Available size in the memory + * @sub_type_hdr: Memory into which compute info will be filled in + * + * Return 0 if successful else return -ve value + */ +static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size, + int proximity_domain, + struct crat_subtype_computeunit *sub_type_hdr) +{ + const struct cpumask *cpumask; + + *avail_size -= sizeof(struct crat_subtype_computeunit); + if (*avail_size < 0) + return -ENOMEM; + + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + cpumask = cpumask_of_node(numa_node_id); + + /* Fill in CU data */ + sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT; + sub_type_hdr->proximity_domain = proximity_domain; + sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id); + if (sub_type_hdr->processor_id_low == -1) + return -EINVAL; + + sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask); + + return 0; +} + +/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node + * + * @numa_node_id: CPU NUMA node id + * @avail_size: Available size in the memory + * @sub_type_hdr: Memory into which compute info will be filled in + * + * Return 0 if successful else return -ve value + */ +static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, + int proximity_domain, + struct crat_subtype_memory *sub_type_hdr) +{ + uint64_t mem_in_bytes = 0; + pg_data_t *pgdat; + int zone_type; + + *avail_size -= sizeof(struct crat_subtype_memory); + if (*avail_size < 0) + return -ENOMEM; + + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_memory); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill in Memory Subunit data */ + + /* Unlike si_meminfo, si_meminfo_node is not exported. 
So + * the following lines are duplicated from si_meminfo_node + * function + */ + pgdat = NODE_DATA(numa_node_id); + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; + mem_in_bytes <<= PAGE_SHIFT; + + sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); + sub_type_hdr->length_high = upper_32_bits(mem_in_bytes); + sub_type_hdr->proximity_domain = proximity_domain; + + return 0; +} + +static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size, + uint32_t *num_entries, + struct crat_subtype_iolink *sub_type_hdr) +{ + int nid; + struct cpuinfo_x86 *c = &cpu_data(0); + uint8_t link_type; + + if (c->x86_vendor == X86_VENDOR_AMD) + link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT; + else + link_type = CRAT_IOLINK_TYPE_QPI_1_1; + + *num_entries = 0; + + /* Create IO links from this node to other CPU nodes */ + for_each_online_node(nid) { + if (nid == numa_node_id) /* node itself */ + continue; + + *avail_size -= sizeof(struct crat_subtype_iolink); + if (*avail_size < 0) + return -ENOMEM; + + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_iolink); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill in IO link data */ + sub_type_hdr->proximity_domain_from = numa_node_id; + sub_type_hdr->proximity_domain_to = nid; + sub_type_hdr->io_interface_type = link_type; + + (*num_entries)++; + sub_type_hdr++; + } + + return 0; +} + +/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU + * + * @pcrat_image: Fill in VCRAT for CPU + * @size: [IN] allocated size of crat_image. + * [OUT] actual size of data filled in crat_image + */ +static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) +{ + struct crat_header *crat_table = (struct crat_header *)pcrat_image; + struct acpi_table_header *acpi_table; + acpi_status status; + struct crat_subtype_generic *sub_type_hdr; + int avail_size = *size; + int numa_node_id; + uint32_t entries = 0; + int ret = 0; + + if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_CPU) + return -EINVAL; + + /* Fill in CRAT Header. + * Modify length and total_entries as subunits are added. 
+ */ + avail_size -= sizeof(struct crat_header); + if (avail_size < 0) + return -ENOMEM; + + memset(crat_table, 0, sizeof(struct crat_header)); + memcpy(&crat_table->signature, CRAT_SIGNATURE, + sizeof(crat_table->signature)); + crat_table->length = sizeof(struct crat_header); + + status = acpi_get_table("DSDT", 0, &acpi_table); + if (status != AE_OK) + pr_warn("DSDT table not found for OEM information\n"); + else { + crat_table->oem_revision = acpi_table->revision; + memcpy(crat_table->oem_id, acpi_table->oem_id, + CRAT_OEMID_LENGTH); + memcpy(crat_table->oem_table_id, acpi_table->oem_table_id, + CRAT_OEMTABLEID_LENGTH); + } + crat_table->total_entries = 0; + crat_table->num_domains = 0; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); + + for_each_online_node(numa_node_id) { + if (kfd_numa_node_to_apic_id(numa_node_id) == -1) + continue; + + /* Fill in Subtype: Compute Unit */ + ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size, + crat_table->num_domains, + (struct crat_subtype_computeunit *)sub_type_hdr); + if (ret < 0) + return ret; + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + + /* Fill in Subtype: Memory */ + ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size, + crat_table->num_domains, + (struct crat_subtype_memory *)sub_type_hdr); + if (ret < 0) + return ret; + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + + /* Fill in Subtype: IO Link */ + ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size, + &entries, + (struct crat_subtype_iolink *)sub_type_hdr); + if (ret < 0) + return ret; + crat_table->length += (sub_type_hdr->length * entries); + crat_table->total_entries += entries; + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length * entries); + + crat_table->num_domains++; + } + + /* TODO: Add cache Subtype for CPU. + * Currently, CPU cache information is available in function + * detect_cache_attributes(cpu) defined in the file + * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not + * exported and to get the same information the code needs to be + * duplicated. 
+ */ + + *size = crat_table->length; + pr_info("Virtual CRAT table created for CPU\n"); + + return 0; +} + +static int kfd_fill_gpu_memory_affinity(int *avail_size, + struct kfd_dev *kdev, uint8_t type, uint64_t size, + struct crat_subtype_memory *sub_type_hdr, + uint32_t proximity_domain, + const struct kfd_local_mem_info *local_mem_info) +{ + *avail_size -= sizeof(struct crat_subtype_memory); + if (*avail_size < 0) + return -ENOMEM; + + memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); + sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_memory); + sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; + + sub_type_hdr->proximity_domain = proximity_domain; + + pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n", + type, size); + + sub_type_hdr->length_low = lower_32_bits(size); + sub_type_hdr->length_high = upper_32_bits(size); + + sub_type_hdr->width = local_mem_info->vram_width; + sub_type_hdr->visibility_type = type; + + return 0; +} + +/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU + * to its NUMA node + * @avail_size: Available size in the memory + * @kdev - [IN] GPU device + * @sub_type_hdr: Memory into which io link info will be filled in + * @proximity_domain - proximity domain of the GPU node + * + * Return 0 if successful else return -ve value + */ +static int kfd_fill_gpu_direct_io_link(int *avail_size, + struct kfd_dev *kdev, + struct crat_subtype_iolink *sub_type_hdr, + uint32_t proximity_domain) +{ + *avail_size -= sizeof(struct crat_subtype_iolink); + if (*avail_size < 0) + return -ENOMEM; + + memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_iolink); + sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill in IOLINK subtype. + * TODO: Fill-in other fields of iolink subtype + */ + sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS; + sub_type_hdr->proximity_domain_from = proximity_domain; +#ifdef CONFIG_NUMA + if (kdev->pdev->dev.numa_node == NUMA_NO_NODE) + sub_type_hdr->proximity_domain_to = 0; + else + sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node; +#else + sub_type_hdr->proximity_domain_to = 0; +#endif + return 0; +} + +/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU + * + * @pcrat_image: Fill in VCRAT for GPU + * @size: [IN] allocated size of crat_image. + * [OUT] actual size of data filled in crat_image + */ +static int kfd_create_vcrat_image_gpu(void *pcrat_image, + size_t *size, struct kfd_dev *kdev, + uint32_t proximity_domain) +{ + struct crat_header *crat_table = (struct crat_header *)pcrat_image; + struct crat_subtype_generic *sub_type_hdr; + struct crat_subtype_computeunit *cu; + struct kfd_cu_info cu_info; + int avail_size = *size; + uint32_t total_num_of_cu; + int num_of_cache_entries = 0; + int cache_mem_filled = 0; + int ret = 0; + struct kfd_local_mem_info local_mem_info; + + if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU) + return -EINVAL; + + /* Fill the CRAT Header. + * Modify length and total_entries as subunits are added. 
+ */ + avail_size -= sizeof(struct crat_header); + if (avail_size < 0) + return -ENOMEM; + + memset(crat_table, 0, sizeof(struct crat_header)); + + memcpy(&crat_table->signature, CRAT_SIGNATURE, + sizeof(crat_table->signature)); + /* Change length as we add more subtypes*/ + crat_table->length = sizeof(struct crat_header); + crat_table->num_domains = 1; + crat_table->total_entries = 0; + + /* Fill in Subtype: Compute Unit + * First fill in the sub type header and then sub type data + */ + avail_size -= sizeof(struct crat_subtype_computeunit); + if (avail_size < 0) + return -ENOMEM; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); + + sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill CU subtype data */ + cu = (struct crat_subtype_computeunit *)sub_type_hdr; + cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT; + cu->proximity_domain = proximity_domain; + + kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info); + cu->num_simd_per_cu = cu_info.simd_per_cu; + cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; + cu->max_waves_simd = cu_info.max_waves_per_simd; + + cu->wave_front_size = cu_info.wave_front_size; + cu->array_count = cu_info.num_shader_arrays_per_engine * + cu_info.num_shader_engines; + total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh); + cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu); + cu->num_cu_per_array = cu_info.num_cu_per_sh; + cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu; + cu->num_banks = cu_info.num_shader_engines; + cu->lds_size_in_kb = cu_info.lds_size; + + cu->hsa_capability = 0; + + /* Check if this node supports IOMMU. During parsing this flag will + * translate to HSA_CAP_ATS_PRESENT + */ + if (!kfd_iommu_check_device(kdev)) + cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; + + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + /* Fill in Subtype: Memory. Only on systems with large BAR (no + * private FB), report memory as public. On other systems + * report the total FB size (public+private) as a single + * private heap. + */ + kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info); + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + + if (debug_largebar) + local_mem_info.local_mem_size_private = 0; + + if (local_mem_info.local_mem_size_private == 0) + ret = kfd_fill_gpu_memory_affinity(&avail_size, + kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC, + local_mem_info.local_mem_size_public, + (struct crat_subtype_memory *)sub_type_hdr, + proximity_domain, + &local_mem_info); + else + ret = kfd_fill_gpu_memory_affinity(&avail_size, + kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE, + local_mem_info.local_mem_size_public + + local_mem_info.local_mem_size_private, + (struct crat_subtype_memory *)sub_type_hdr, + proximity_domain, + &local_mem_info); + if (ret < 0) + return ret; + + crat_table->length += sizeof(struct crat_subtype_memory); + crat_table->total_entries++; + + /* TODO: Fill in cache information. 
This information is NOT readily + * available in KGD + */ + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low, + avail_size, + &cu_info, + (struct crat_subtype_cache *)sub_type_hdr, + &cache_mem_filled, + &num_of_cache_entries); + + if (ret < 0) + return ret; + + crat_table->length += cache_mem_filled; + crat_table->total_entries += num_of_cache_entries; + avail_size -= cache_mem_filled; + + /* Fill in Subtype: IO_LINKS + * Only direct links are added here which is Link from GPU to + * to its NUMA node. Indirect links are added by userspace. + */ + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + cache_mem_filled); + ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev, + (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain); + + if (ret < 0) + return ret; + + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + *size = crat_table->length; + pr_info("Virtual CRAT table created for GPU\n"); + + return ret; +} + +/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and + * creates a Virtual CRAT (VCRAT) image + * + * NOTE: Call kfd_destroy_crat_image to free CRAT image memory + * + * @crat_image: VCRAT image created because ACPI does not have a + * CRAT for this device + * @size: [OUT] size of virtual crat_image + * @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device + * COMPUTE_UNIT_GPU - Create VCRAT for GPU + * (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU + * -- this option is not currently implemented. + * The assumption is that all AMD APUs will have CRAT + * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU + * + * Return 0 if successful else return -ve value + */ +int kfd_create_crat_image_virtual(void **crat_image, size_t *size, + int flags, struct kfd_dev *kdev, + uint32_t proximity_domain) +{ + void *pcrat_image = NULL; + int ret = 0; + + if (!crat_image) + return -EINVAL; + + *crat_image = NULL; + + /* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and + * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover + * all the current conditions. A check is put not to overwrite beyond + * allocated size + */ + switch (flags) { + case COMPUTE_UNIT_CPU: + pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL); + if (!pcrat_image) + return -ENOMEM; + *size = VCRAT_SIZE_FOR_CPU; + ret = kfd_create_vcrat_image_cpu(pcrat_image, size); + break; + case COMPUTE_UNIT_GPU: + if (!kdev) + return -EINVAL; + pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL); + if (!pcrat_image) + return -ENOMEM; + *size = VCRAT_SIZE_FOR_GPU; + ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev, + proximity_domain); + break; + case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU): + /* TODO: */ + ret = -EINVAL; + pr_err("VCRAT not implemented for APU\n"); + break; + default: + ret = -EINVAL; + } + + if (!ret) + *crat_image = pcrat_image; + else + kfree(pcrat_image); + + return ret; +} + + +/* kfd_destroy_crat_image + * + * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..) + * + */ +void kfd_destroy_crat_image(void *crat_image) +{ + kfree(crat_image); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h new file mode 100644 index 000000000..b5cd182b9 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h @@ -0,0 +1,320 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef KFD_CRAT_H_INCLUDED +#define KFD_CRAT_H_INCLUDED + +#include <linux/types.h> + +#pragma pack(1) + +/* + * 4CC signature values for the CRAT and CDIT ACPI tables + */ + +#define CRAT_SIGNATURE "CRAT" +#define CDIT_SIGNATURE "CDIT" + +/* + * Component Resource Association Table (CRAT) + */ + +#define CRAT_OEMID_LENGTH 6 +#define CRAT_OEMTABLEID_LENGTH 8 +#define CRAT_RESERVED_LENGTH 6 + +#define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) + +/* Compute Unit flags */ +#define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */ +#define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */ + +struct crat_header { + uint32_t signature; + uint32_t length; + uint8_t revision; + uint8_t checksum; + uint8_t oem_id[CRAT_OEMID_LENGTH]; + uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; + uint32_t oem_revision; + uint32_t creator_id; + uint32_t creator_revision; + uint32_t total_entries; + uint16_t num_domains; + uint8_t reserved[CRAT_RESERVED_LENGTH]; +}; + +/* + * The header structure is immediately followed by total_entries of the + * data definitions + */ + +/* + * The currently defined subtype entries in the CRAT + */ +#define CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY 0 +#define CRAT_SUBTYPE_MEMORY_AFFINITY 1 +#define CRAT_SUBTYPE_CACHE_AFFINITY 2 +#define CRAT_SUBTYPE_TLB_AFFINITY 3 +#define CRAT_SUBTYPE_CCOMPUTE_AFFINITY 4 +#define CRAT_SUBTYPE_IOLINK_AFFINITY 5 +#define CRAT_SUBTYPE_MAX 6 + +#define CRAT_SIBLINGMAP_SIZE 32 + +/* + * ComputeUnit Affinity structure and definitions + */ +#define CRAT_CU_FLAGS_ENABLED 0x00000001 +#define CRAT_CU_FLAGS_HOT_PLUGGABLE 0x00000002 +#define CRAT_CU_FLAGS_CPU_PRESENT 0x00000004 +#define CRAT_CU_FLAGS_GPU_PRESENT 0x00000008 +#define CRAT_CU_FLAGS_IOMMU_PRESENT 0x00000010 +#define CRAT_CU_FLAGS_RESERVED 0xffffffe0 + +#define CRAT_COMPUTEUNIT_RESERVED_LENGTH 4 + +struct crat_subtype_computeunit { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t proximity_domain; + uint32_t processor_id_low; + uint16_t num_cpu_cores; + uint16_t num_simd_cores; + uint16_t max_waves_simd; + uint16_t io_count; + uint16_t hsa_capability; + uint16_t lds_size_in_kb; + uint8_t wave_front_size; + uint8_t num_banks; + uint16_t micro_engine_id; + uint8_t array_count; + uint8_t num_cu_per_array; + uint8_t num_simd_per_cu; + uint8_t max_slots_scatch_cu; + uint8_t reserved2[CRAT_COMPUTEUNIT_RESERVED_LENGTH]; +}; + +/* + * HSA Memory Affinity 
structure and definitions + */ +#define CRAT_MEM_FLAGS_ENABLED 0x00000001 +#define CRAT_MEM_FLAGS_HOT_PLUGGABLE 0x00000002 +#define CRAT_MEM_FLAGS_NON_VOLATILE 0x00000004 +#define CRAT_MEM_FLAGS_RESERVED 0xfffffff8 + +#define CRAT_MEMORY_RESERVED_LENGTH 8 + +struct crat_subtype_memory { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t proximity_domain; + uint32_t base_addr_low; + uint32_t base_addr_high; + uint32_t length_low; + uint32_t length_high; + uint32_t width; + uint8_t visibility_type; /* for virtual (dGPU) CRAT */ + uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1]; +}; + +/* + * HSA Cache Affinity structure and definitions + */ +#define CRAT_CACHE_FLAGS_ENABLED 0x00000001 +#define CRAT_CACHE_FLAGS_DATA_CACHE 0x00000002 +#define CRAT_CACHE_FLAGS_INST_CACHE 0x00000004 +#define CRAT_CACHE_FLAGS_CPU_CACHE 0x00000008 +#define CRAT_CACHE_FLAGS_SIMD_CACHE 0x00000010 +#define CRAT_CACHE_FLAGS_RESERVED 0xffffffe0 + +#define CRAT_CACHE_RESERVED_LENGTH 8 + +struct crat_subtype_cache { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t processor_id_low; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + uint32_t cache_size; + uint8_t cache_level; + uint8_t lines_per_tag; + uint16_t cache_line_size; + uint8_t associativity; + uint8_t cache_properties; + uint16_t cache_latency; + uint8_t reserved2[CRAT_CACHE_RESERVED_LENGTH]; +}; + +/* + * HSA TLB Affinity structure and definitions + */ +#define CRAT_TLB_FLAGS_ENABLED 0x00000001 +#define CRAT_TLB_FLAGS_DATA_TLB 0x00000002 +#define CRAT_TLB_FLAGS_INST_TLB 0x00000004 +#define CRAT_TLB_FLAGS_CPU_TLB 0x00000008 +#define CRAT_TLB_FLAGS_SIMD_TLB 0x00000010 +#define CRAT_TLB_FLAGS_RESERVED 0xffffffe0 + +#define CRAT_TLB_RESERVED_LENGTH 4 + +struct crat_subtype_tlb { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t processor_id_low; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + uint32_t tlb_level; + uint8_t data_tlb_associativity_2mb; + uint8_t data_tlb_size_2mb; + uint8_t instruction_tlb_associativity_2mb; + uint8_t instruction_tlb_size_2mb; + uint8_t data_tlb_associativity_4k; + uint8_t data_tlb_size_4k; + uint8_t instruction_tlb_associativity_4k; + uint8_t instruction_tlb_size_4k; + uint8_t data_tlb_associativity_1gb; + uint8_t data_tlb_size_1gb; + uint8_t instruction_tlb_associativity_1gb; + uint8_t instruction_tlb_size_1gb; + uint8_t reserved2[CRAT_TLB_RESERVED_LENGTH]; +}; + +/* + * HSA CCompute/APU Affinity structure and definitions + */ +#define CRAT_CCOMPUTE_FLAGS_ENABLED 0x00000001 +#define CRAT_CCOMPUTE_FLAGS_RESERVED 0xfffffffe + +#define CRAT_CCOMPUTE_RESERVED_LENGTH 16 + +struct crat_subtype_ccompute { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t processor_id_low; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + uint32_t apu_size; + uint8_t reserved2[CRAT_CCOMPUTE_RESERVED_LENGTH]; +}; + +/* + * HSA IO Link Affinity structure and definitions + */ +#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) +#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) +#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) +#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) +#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) +#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0 + +/* + * IO interface types + */ +#define CRAT_IOLINK_TYPE_UNDEFINED 0 +#define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1 +#define CRAT_IOLINK_TYPE_PCIEXPRESS 2 +#define CRAT_IOLINK_TYPE_AMBA 3 +#define CRAT_IOLINK_TYPE_MIPI 4 +#define CRAT_IOLINK_TYPE_QPI_1_1 5 
+#define CRAT_IOLINK_TYPE_RESERVED1 6 +#define CRAT_IOLINK_TYPE_RESERVED2 7 +#define CRAT_IOLINK_TYPE_RAPID_IO 8 +#define CRAT_IOLINK_TYPE_INFINIBAND 9 +#define CRAT_IOLINK_TYPE_RESERVED3 10 +#define CRAT_IOLINK_TYPE_OTHER 11 +#define CRAT_IOLINK_TYPE_MAX 255 + +#define CRAT_IOLINK_RESERVED_LENGTH 24 + +struct crat_subtype_iolink { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t proximity_domain_from; + uint32_t proximity_domain_to; + uint8_t io_interface_type; + uint8_t version_major; + uint16_t version_minor; + uint32_t minimum_latency; + uint32_t maximum_latency; + uint32_t minimum_bandwidth_mbs; + uint32_t maximum_bandwidth_mbs; + uint32_t recommended_transfer_size; + uint8_t reserved2[CRAT_IOLINK_RESERVED_LENGTH]; +}; + +/* + * HSA generic sub-type header + */ + +#define CRAT_SUBTYPE_FLAGS_ENABLED 0x00000001 + +struct crat_subtype_generic { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; +}; + +/* + * Component Locality Distance Information Table (CDIT) + */ +#define CDIT_OEMID_LENGTH 6 +#define CDIT_OEMTABLEID_LENGTH 8 + +struct cdit_header { + uint32_t signature; + uint32_t length; + uint8_t revision; + uint8_t checksum; + uint8_t oem_id[CDIT_OEMID_LENGTH]; + uint8_t oem_table_id[CDIT_OEMTABLEID_LENGTH]; + uint32_t oem_revision; + uint32_t creator_id; + uint32_t creator_revision; + uint32_t total_entries; + uint16_t num_domains; + uint8_t entry[1]; +}; + +#pragma pack() + +struct kfd_dev; + +int kfd_create_crat_image_acpi(void **crat_image, size_t *size); +void kfd_destroy_crat_image(void *crat_image); +int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, + uint32_t proximity_domain); +int kfd_create_crat_image_virtual(void **crat_image, size_t *size, + int flags, struct kfd_dev *kdev, + uint32_t proximity_domain); + +#endif /* KFD_CRAT_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c new file mode 100644 index 000000000..a3441b0e3 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c @@ -0,0 +1,845 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/log2.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/device.h> + +#include "kfd_pm4_headers.h" +#include "kfd_pm4_headers_diq.h" +#include "kfd_kernel_queue.h" +#include "kfd_priv.h" +#include "kfd_pm4_opcodes.h" +#include "cik_regs.h" +#include "kfd_dbgmgr.h" +#include "kfd_dbgdev.h" +#include "kfd_device_queue_manager.h" + +static void dbgdev_address_watch_disable_nodiq(struct kfd_dev *dev) +{ + dev->kfd2kgd->address_watch_disable(dev->kgd); +} + +static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + unsigned int pasid, uint64_t vmid0_address, + uint32_t *packet_buff, size_t size_in_bytes) +{ + struct pm4__release_mem *rm_packet; + struct pm4__indirect_buffer_pasid *ib_packet; + struct kfd_mem_obj *mem_obj; + size_t pq_packets_size_in_bytes; + union ULARGE_INTEGER *largep; + union ULARGE_INTEGER addr; + struct kernel_queue *kq; + uint64_t *rm_state; + unsigned int *ib_packet_buff; + int status; + + if (WARN_ON(!size_in_bytes)) + return -EINVAL; + + kq = dbgdev->kq; + + pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + + sizeof(struct pm4__indirect_buffer_pasid); + + /* + * We acquire a buffer from DIQ + * The receive packet buff will be sitting on the Indirect Buffer + * and in the PQ we put the IB packet + sync packet(s). + */ + status = kq->ops.acquire_packet_buffer(kq, + pq_packets_size_in_bytes / sizeof(uint32_t), + &ib_packet_buff); + if (status) { + pr_err("acquire_packet_buffer failed\n"); + return status; + } + + memset(ib_packet_buff, 0, pq_packets_size_in_bytes); + + ib_packet = (struct pm4__indirect_buffer_pasid *) (ib_packet_buff); + + ib_packet->header.count = 3; + ib_packet->header.opcode = IT_INDIRECT_BUFFER_PASID; + ib_packet->header.type = PM4_TYPE_3; + + largep = (union ULARGE_INTEGER *) &vmid0_address; + + ib_packet->bitfields2.ib_base_lo = largep->u.low_part >> 2; + ib_packet->bitfields3.ib_base_hi = largep->u.high_part; + + ib_packet->control = (1 << 23) | (1 << 31) | + ((size_in_bytes / 4) & 0xfffff); + + ib_packet->bitfields5.pasid = pasid; + + /* + * for now we use release mem for GPU-CPU synchronization + * Consider WaitRegMem + WriteData as a better alternative + * we get a GART allocations ( gpu/cpu mapping), + * for the sync variable, and wait until: + * (a) Sync with HW + * (b) Sync var is written by CP to mem. 
+ */ + rm_packet = (struct pm4__release_mem *) (ib_packet_buff + + (sizeof(struct pm4__indirect_buffer_pasid) / + sizeof(unsigned int))); + + status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), + &mem_obj); + + if (status) { + pr_err("Failed to allocate GART memory\n"); + kq->ops.rollback_packet(kq); + return status; + } + + rm_state = (uint64_t *) mem_obj->cpu_ptr; + + *rm_state = QUEUESTATE__ACTIVE_COMPLETION_PENDING; + + rm_packet->header.opcode = IT_RELEASE_MEM; + rm_packet->header.type = PM4_TYPE_3; + rm_packet->header.count = sizeof(struct pm4__release_mem) / 4 - 2; + + rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; + rm_packet->bitfields2.event_index = + event_index___release_mem__end_of_pipe; + + rm_packet->bitfields2.cache_policy = cache_policy___release_mem__lru; + rm_packet->bitfields2.atc = 0; + rm_packet->bitfields2.tc_wb_action_ena = 1; + + addr.quad_part = mem_obj->gpu_addr; + + rm_packet->bitfields4.address_lo_32b = addr.u.low_part >> 2; + rm_packet->address_hi = addr.u.high_part; + + rm_packet->bitfields3.data_sel = + data_sel___release_mem__send_64_bit_data; + + rm_packet->bitfields3.int_sel = + int_sel___release_mem__send_data_after_write_confirm; + + rm_packet->bitfields3.dst_sel = + dst_sel___release_mem__memory_controller; + + rm_packet->data_lo = QUEUESTATE__ACTIVE; + + kq->ops.submit_packet(kq); + + /* Wait till CP writes sync code: */ + status = amdkfd_fence_wait_timeout( + (unsigned int *) rm_state, + QUEUESTATE__ACTIVE, 1500); + + kfd_gtt_sa_free(dbgdev->dev, mem_obj); + + return status; +} + +static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev) +{ + /* + * no action is needed in this case, + * just make sure diq will not be used + */ + + dbgdev->kq = NULL; + + return 0; +} + +static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) +{ + struct queue_properties properties; + unsigned int qid; + struct kernel_queue *kq = NULL; + int status; + + properties.type = KFD_QUEUE_TYPE_DIQ; + + status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, + &properties, &qid); + + if (status) { + pr_err("Failed to create DIQ\n"); + return status; + } + + pr_debug("DIQ Created with queue id: %d\n", qid); + + kq = pqm_get_kernel_queue(dbgdev->pqm, qid); + + if (!kq) { + pr_err("Error getting DIQ\n"); + pqm_destroy_queue(dbgdev->pqm, qid); + return -EFAULT; + } + + dbgdev->kq = kq; + + return status; +} + +static int dbgdev_unregister_nodiq(struct kfd_dbgdev *dbgdev) +{ + /* disable watch address */ + dbgdev_address_watch_disable_nodiq(dbgdev->dev); + return 0; +} + +static int dbgdev_unregister_diq(struct kfd_dbgdev *dbgdev) +{ + /* todo - disable address watch */ + int status; + + status = pqm_destroy_queue(dbgdev->pqm, + dbgdev->kq->queue->properties.queue_id); + dbgdev->kq = NULL; + + return status; +} + +static void dbgdev_address_watch_set_registers( + const struct dbg_address_watch_info *adw_info, + union TCP_WATCH_ADDR_H_BITS *addrHi, + union TCP_WATCH_ADDR_L_BITS *addrLo, + union TCP_WATCH_CNTL_BITS *cntl, + unsigned int index, unsigned int vmid) +{ + union ULARGE_INTEGER addr; + + addr.quad_part = 0; + addrHi->u32All = 0; + addrLo->u32All = 0; + cntl->u32All = 0; + + if (adw_info->watch_mask) + cntl->bitfields.mask = + (uint32_t) (adw_info->watch_mask[index] & + ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK); + else + cntl->bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; + + addr.quad_part = (unsigned long long) adw_info->watch_address[index]; + + addrHi->bitfields.addr = addr.u.high_part & + ADDRESS_WATCH_REG_ADDHIGH_MASK; + 
addrLo->bitfields.addr =
+ (addr.u.low_part >> ADDRESS_WATCH_REG_ADDLOW_SHIFT);
+
+ cntl->bitfields.mode = adw_info->watch_mode[index];
+ cntl->bitfields.vmid = (uint32_t) vmid;
+ /* for now assume it is an ATC address */
+ cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT;
+
+ pr_debug("\t\t%20s %08x\n", "set reg mask :", cntl->bitfields.mask);
+ pr_debug("\t\t%20s %08x\n", "set reg add high :",
+ addrHi->bitfields.addr);
+ pr_debug("\t\t%20s %08x\n", "set reg add low :",
+ addrLo->bitfields.addr);
+}
+
+static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev,
+ struct dbg_address_watch_info *adw_info)
+{
+ union TCP_WATCH_ADDR_H_BITS addrHi;
+ union TCP_WATCH_ADDR_L_BITS addrLo;
+ union TCP_WATCH_CNTL_BITS cntl;
+ struct kfd_process_device *pdd;
+ unsigned int i;
+
+ /* taking the vmid for that process in a safe way using pdd */
+ pdd = kfd_get_process_device_data(dbgdev->dev,
+ adw_info->process);
+ if (!pdd) {
+ pr_err("Failed to get pdd for wave control no DIQ\n");
+ return -EFAULT;
+ }
+
+ addrHi.u32All = 0;
+ addrLo.u32All = 0;
+ cntl.u32All = 0;
+
+ if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) ||
+ (adw_info->num_watch_points == 0)) {
+ pr_err("num_watch_points is invalid\n");
+ return -EINVAL;
+ }
+
+ if (!adw_info->watch_mode || !adw_info->watch_address) {
+ pr_err("adw_info fields are not valid\n");
+ return -EINVAL;
+ }
+
+ for (i = 0; i < adw_info->num_watch_points; i++) {
+ dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo,
+ &cntl, i, pdd->qpd.vmid);
+
+ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");
+ pr_debug("\t\t%20s %08x\n", "register index :", i);
+ pr_debug("\t\t%20s %08x\n", "vmid is :", pdd->qpd.vmid);
+ pr_debug("\t\t%20s %08x\n", "Address Low is :",
+ addrLo.bitfields.addr);
+ pr_debug("\t\t%20s %08x\n", "Address high is :",
+ addrHi.bitfields.addr);
+ pr_debug("\t\t%20s %08x\n", "Control Mask is :",
+ cntl.bitfields.mask);
+ pr_debug("\t\t%20s %08x\n", "Control Mode is :",
+ cntl.bitfields.mode);
+ pr_debug("\t\t%20s %08x\n", "Control Vmid is :",
+ cntl.bitfields.vmid);
+ pr_debug("\t\t%20s %08x\n", "Control atc is :",
+ cntl.bitfields.atc);
+ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");
+
+ pdd->dev->kfd2kgd->address_watch_execute(
+ dbgdev->dev->kgd,
+ i,
+ cntl.u32All,
+ addrHi.u32All,
+ addrLo.u32All);
+ }
+
+ return 0;
+}
+
+static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev,
+ struct dbg_address_watch_info *adw_info)
+{
+ struct pm4__set_config_reg *packets_vec;
+ union TCP_WATCH_ADDR_H_BITS addrHi;
+ union TCP_WATCH_ADDR_L_BITS addrLo;
+ union TCP_WATCH_CNTL_BITS cntl;
+ struct kfd_mem_obj *mem_obj;
+ unsigned int aw_reg_add_dword;
+ uint32_t *packet_buff_uint;
+ unsigned int i;
+ int status;
+ size_t ib_size = sizeof(struct pm4__set_config_reg) * 4;
+ /* we do not control the vmid in DIQ mode, just a placeholder */
+ unsigned int vmid = 0;
+
+ addrHi.u32All = 0;
+ addrLo.u32All = 0;
+ cntl.u32All = 0;
+
+ if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) ||
+ (adw_info->num_watch_points == 0)) {
+ pr_err("num_watch_points is invalid\n");
+ return -EINVAL;
+ }
+
+ if (!adw_info->watch_mode || !adw_info->watch_address) {
+ pr_err("adw_info fields are not valid\n");
+ return -EINVAL;
+ }
+
+ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj);
+
+ if (status) {
+ pr_err("Failed to allocate GART memory\n");
+ return status;
+ }
+
+ packet_buff_uint = mem_obj->cpu_ptr;
+
+ memset(packet_buff_uint,
0, ib_size); + + packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); + + packets_vec[0].header.count = 1; + packets_vec[0].header.opcode = IT_SET_CONFIG_REG; + packets_vec[0].header.type = PM4_TYPE_3; + packets_vec[0].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; + packets_vec[0].bitfields2.insert_vmid = 1; + packets_vec[1].ordinal1 = packets_vec[0].ordinal1; + packets_vec[1].bitfields2.insert_vmid = 0; + packets_vec[2].ordinal1 = packets_vec[0].ordinal1; + packets_vec[2].bitfields2.insert_vmid = 0; + packets_vec[3].ordinal1 = packets_vec[0].ordinal1; + packets_vec[3].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; + packets_vec[3].bitfields2.insert_vmid = 1; + + for (i = 0; i < adw_info->num_watch_points; i++) { + dbgdev_address_watch_set_registers(adw_info, + &addrHi, + &addrLo, + &cntl, + i, + vmid); + + pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); + pr_debug("\t\t%20s %08x\n", "register index :", i); + pr_debug("\t\t%20s %08x\n", "vmid is :", vmid); + pr_debug("\t\t%20s %p\n", "Add ptr is :", + adw_info->watch_address); + pr_debug("\t\t%20s %08llx\n", "Add is :", + adw_info->watch_address[i]); + pr_debug("\t\t%20s %08x\n", "Address Low is :", + addrLo.bitfields.addr); + pr_debug("\t\t%20s %08x\n", "Address high is :", + addrHi.bitfields.addr); + pr_debug("\t\t%20s %08x\n", "Control Mask is :", + cntl.bitfields.mask); + pr_debug("\t\t%20s %08x\n", "Control Mode is :", + cntl.bitfields.mode); + pr_debug("\t\t%20s %08x\n", "Control Vmid is :", + cntl.bitfields.vmid); + pr_debug("\t\t%20s %08x\n", "Control atc is :", + cntl.bitfields.atc); + pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); + + aw_reg_add_dword = + dbgdev->dev->kfd2kgd->address_watch_get_offset( + dbgdev->dev->kgd, + i, + ADDRESS_WATCH_REG_CNTL); + + packets_vec[0].bitfields2.reg_offset = + aw_reg_add_dword - AMD_CONFIG_REG_BASE; + + packets_vec[0].reg_data[0] = cntl.u32All; + + aw_reg_add_dword = + dbgdev->dev->kfd2kgd->address_watch_get_offset( + dbgdev->dev->kgd, + i, + ADDRESS_WATCH_REG_ADDR_HI); + + packets_vec[1].bitfields2.reg_offset = + aw_reg_add_dword - AMD_CONFIG_REG_BASE; + packets_vec[1].reg_data[0] = addrHi.u32All; + + aw_reg_add_dword = + dbgdev->dev->kfd2kgd->address_watch_get_offset( + dbgdev->dev->kgd, + i, + ADDRESS_WATCH_REG_ADDR_LO); + + packets_vec[2].bitfields2.reg_offset = + aw_reg_add_dword - AMD_CONFIG_REG_BASE; + packets_vec[2].reg_data[0] = addrLo.u32All; + + /* enable watch flag if address is not zero*/ + if (adw_info->watch_address[i] > 0) + cntl.bitfields.valid = 1; + else + cntl.bitfields.valid = 0; + + aw_reg_add_dword = + dbgdev->dev->kfd2kgd->address_watch_get_offset( + dbgdev->dev->kgd, + i, + ADDRESS_WATCH_REG_CNTL); + + packets_vec[3].bitfields2.reg_offset = + aw_reg_add_dword - AMD_CONFIG_REG_BASE; + packets_vec[3].reg_data[0] = cntl.u32All; + + status = dbgdev_diq_submit_ib( + dbgdev, + adw_info->process->pasid, + mem_obj->gpu_addr, + packet_buff_uint, + ib_size); + + if (status) { + pr_err("Failed to submit IB to DIQ\n"); + break; + } + } + + kfd_gtt_sa_free(dbgdev->dev, mem_obj); + return status; +} + +static int dbgdev_wave_control_set_registers( + struct dbg_wave_control_info *wac_info, + union SQ_CMD_BITS *in_reg_sq_cmd, + union GRBM_GFX_INDEX_BITS *in_reg_gfx_index) +{ + int status = 0; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct HsaDbgWaveMsgAMDGen2 *pMsg; + + reg_sq_cmd.u32All = 0; + reg_gfx_index.u32All = 0; + pMsg = &wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2; + + switch 
(wac_info->mode) { + /* Send command to single wave */ + case HSA_DBG_WAVEMODE_SINGLE: + /* + * Limit access to the process waves only, + * by setting vmid check + */ + reg_sq_cmd.bits.check_vmid = 1; + reg_sq_cmd.bits.simd_id = pMsg->ui32.SIMD; + reg_sq_cmd.bits.wave_id = pMsg->ui32.WaveId; + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_SINGLE; + + reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; + reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; + reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; + + break; + + /* Send command to all waves with matching VMID */ + case HSA_DBG_WAVEMODE_BROADCAST_PROCESS: + + reg_gfx_index.bits.sh_broadcast_writes = 1; + reg_gfx_index.bits.se_broadcast_writes = 1; + reg_gfx_index.bits.instance_broadcast_writes = 1; + + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; + + break; + + /* Send command to all CU waves with matching VMID */ + case HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU: + + reg_sq_cmd.bits.check_vmid = 1; + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; + + reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; + reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; + reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; + + break; + + default: + return -EINVAL; + } + + switch (wac_info->operand) { + case HSA_DBG_WAVEOP_HALT: + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; + break; + + case HSA_DBG_WAVEOP_RESUME: + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; + break; + + case HSA_DBG_WAVEOP_KILL: + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_KILL; + break; + + case HSA_DBG_WAVEOP_DEBUG: + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_DEBUG; + break; + + case HSA_DBG_WAVEOP_TRAP: + if (wac_info->trapId < MAX_TRAPID) { + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_TRAP; + reg_sq_cmd.bits.trap_id = wac_info->trapId; + } else { + status = -EINVAL; + } + break; + + default: + status = -EINVAL; + break; + } + + if (status == 0) { + *in_reg_sq_cmd = reg_sq_cmd; + *in_reg_gfx_index = reg_gfx_index; + } + + return status; +} + +static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, + struct dbg_wave_control_info *wac_info) +{ + + int status; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct kfd_mem_obj *mem_obj; + uint32_t *packet_buff_uint; + struct pm4__set_config_reg *packets_vec; + size_t ib_size = sizeof(struct pm4__set_config_reg) * 3; + + reg_sq_cmd.u32All = 0; + + status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, + ®_gfx_index); + if (status) { + pr_err("Failed to set wave control registers\n"); + return status; + } + + /* we do not control the VMID in DIQ, so reset it to a known value */ + reg_sq_cmd.bits.vm_id = 0; + + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); + + pr_debug("\t\t mode is: %u\n", wac_info->mode); + pr_debug("\t\t operand is: %u\n", wac_info->operand); + pr_debug("\t\t trap id is: %u\n", wac_info->trapId); + pr_debug("\t\t msg value is: %u\n", + wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); + pr_debug("\t\t vmid is: N/A\n"); + + pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); + pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); + pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); + pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); + pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); + pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); + pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); + + pr_debug("\t\t ibw is : %u\n", + 
reg_gfx_index.bitfields.instance_broadcast_writes); + pr_debug("\t\t ii is : %u\n", + reg_gfx_index.bitfields.instance_index); + pr_debug("\t\t sebw is : %u\n", + reg_gfx_index.bitfields.se_broadcast_writes); + pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); + pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); + pr_debug("\t\t sbw is : %u\n", + reg_gfx_index.bitfields.sh_broadcast_writes); + + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); + + status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); + + if (status != 0) { + pr_err("Failed to allocate GART memory\n"); + return status; + } + + packet_buff_uint = mem_obj->cpu_ptr; + + memset(packet_buff_uint, 0, ib_size); + + packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; + packets_vec[0].header.count = 1; + packets_vec[0].header.opcode = IT_SET_UCONFIG_REG; + packets_vec[0].header.type = PM4_TYPE_3; + packets_vec[0].bitfields2.reg_offset = + GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE; + + packets_vec[0].bitfields2.insert_vmid = 0; + packets_vec[0].reg_data[0] = reg_gfx_index.u32All; + + packets_vec[1].header.count = 1; + packets_vec[1].header.opcode = IT_SET_CONFIG_REG; + packets_vec[1].header.type = PM4_TYPE_3; + packets_vec[1].bitfields2.reg_offset = SQ_CMD / 4 - AMD_CONFIG_REG_BASE; + + packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET; + packets_vec[1].bitfields2.insert_vmid = 1; + packets_vec[1].reg_data[0] = reg_sq_cmd.u32All; + + /* Restore the GRBM_GFX_INDEX register */ + + reg_gfx_index.u32All = 0; + reg_gfx_index.bits.sh_broadcast_writes = 1; + reg_gfx_index.bits.instance_broadcast_writes = 1; + reg_gfx_index.bits.se_broadcast_writes = 1; + + + packets_vec[2].ordinal1 = packets_vec[0].ordinal1; + packets_vec[2].bitfields2.reg_offset = + GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE; + + packets_vec[2].bitfields2.insert_vmid = 0; + packets_vec[2].reg_data[0] = reg_gfx_index.u32All; + + status = dbgdev_diq_submit_ib( + dbgdev, + wac_info->process->pasid, + mem_obj->gpu_addr, + packet_buff_uint, + ib_size); + + if (status) + pr_err("Failed to submit IB to DIQ\n"); + + kfd_gtt_sa_free(dbgdev->dev, mem_obj); + + return status; +} + +static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev, + struct dbg_wave_control_info *wac_info) +{ + int status; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct kfd_process_device *pdd; + + reg_sq_cmd.u32All = 0; + + /* taking the VMID for that process on the safe way using PDD */ + pdd = kfd_get_process_device_data(dbgdev->dev, wac_info->process); + + if (!pdd) { + pr_err("Failed to get pdd for wave control no DIQ\n"); + return -EFAULT; + } + status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, + ®_gfx_index); + if (status) { + pr_err("Failed to set wave control registers\n"); + return status; + } + + /* for non DIQ we need to patch the VMID: */ + + reg_sq_cmd.bits.vm_id = pdd->qpd.vmid; + + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); + + pr_debug("\t\t mode is: %u\n", wac_info->mode); + pr_debug("\t\t operand is: %u\n", wac_info->operand); + pr_debug("\t\t trap id is: %u\n", wac_info->trapId); + pr_debug("\t\t msg value is: %u\n", + wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); + pr_debug("\t\t vmid is: %u\n", pdd->qpd.vmid); + + pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); + pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); + pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); + 
pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); + pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); + pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); + pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); + + pr_debug("\t\t ibw is : %u\n", + reg_gfx_index.bitfields.instance_broadcast_writes); + pr_debug("\t\t ii is : %u\n", + reg_gfx_index.bitfields.instance_index); + pr_debug("\t\t sebw is : %u\n", + reg_gfx_index.bitfields.se_broadcast_writes); + pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); + pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); + pr_debug("\t\t sbw is : %u\n", + reg_gfx_index.bitfields.sh_broadcast_writes); + + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); + + return dbgdev->dev->kfd2kgd->wave_control_execute(dbgdev->dev->kgd, + reg_gfx_index.u32All, + reg_sq_cmd.u32All); +} + +int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) +{ + int status = 0; + unsigned int vmid; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct kfd_process_device *pdd; + struct dbg_wave_control_info wac_info; + int first_vmid_to_scan = dev->vm_info.first_vmid_kfd; + int last_vmid_to_scan = dev->vm_info.last_vmid_kfd; + + reg_sq_cmd.u32All = 0; + status = 0; + + wac_info.mode = HSA_DBG_WAVEMODE_BROADCAST_PROCESS; + wac_info.operand = HSA_DBG_WAVEOP_KILL; + + pr_debug("Killing all process wavefronts\n"); + + /* Scan all registers in the range ATC_VMID8_PASID_MAPPING .. + * ATC_VMID15_PASID_MAPPING + * to check which VMID the current process is mapped to. + */ + + for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) { + if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid + (dev->kgd, vmid)) { + if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid + (dev->kgd, vmid) == p->pasid) { + pr_debug("Killing wave fronts of vmid %d and pasid %d\n", + vmid, p->pasid); + break; + } + } + } + + if (vmid > last_vmid_to_scan) { + pr_err("Didn't find vmid for pasid %d\n", p->pasid); + return -EFAULT; + } + + /* taking the VMID for that process on the safe way using PDD */ + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) + return -EFAULT; + + status = dbgdev_wave_control_set_registers(&wac_info, ®_sq_cmd, + ®_gfx_index); + if (status != 0) + return -EINVAL; + + /* for non DIQ we need to patch the VMID: */ + reg_sq_cmd.bits.vm_id = vmid; + + dev->kfd2kgd->wave_control_execute(dev->kgd, + reg_gfx_index.u32All, + reg_sq_cmd.u32All); + + return 0; +} + +void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, + enum DBGDEV_TYPE type) +{ + pdbgdev->dev = pdev; + pdbgdev->kq = NULL; + pdbgdev->type = type; + pdbgdev->pqm = NULL; + + switch (type) { + case DBGDEV_TYPE_NODIQ: + pdbgdev->dbgdev_register = dbgdev_register_nodiq; + pdbgdev->dbgdev_unregister = dbgdev_unregister_nodiq; + pdbgdev->dbgdev_wave_control = dbgdev_wave_control_nodiq; + pdbgdev->dbgdev_address_watch = dbgdev_address_watch_nodiq; + break; + case DBGDEV_TYPE_DIQ: + default: + pdbgdev->dbgdev_register = dbgdev_register_diq; + pdbgdev->dbgdev_unregister = dbgdev_unregister_diq; + pdbgdev->dbgdev_wave_control = dbgdev_wave_control_diq; + pdbgdev->dbgdev_address_watch = dbgdev_address_watch_diq; + break; + } + +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h new file mode 100644 index 000000000..0619c777b --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h @@ -0,0 +1,230 @@ +/* + * Copyright 2014 
Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef KFD_DBGDEV_H_ +#define KFD_DBGDEV_H_ + +enum { + SQ_CMD_VMID_OFFSET = 28, + ADDRESS_WATCH_CNTL_OFFSET = 24 +}; + +enum { + PRIV_QUEUE_SYNC_TIME_MS = 200 +}; + +/* CONTEXT reg space definition */ +enum { + CONTEXT_REG_BASE = 0xA000, + CONTEXT_REG_END = 0xA400, + CONTEXT_REG_SIZE = CONTEXT_REG_END - CONTEXT_REG_BASE +}; + +/* USER CONFIG reg space definition */ +enum { + USERCONFIG_REG_BASE = 0xC000, + USERCONFIG_REG_END = 0x10000, + USERCONFIG_REG_SIZE = USERCONFIG_REG_END - USERCONFIG_REG_BASE +}; + +/* CONFIG reg space definition */ +enum { + AMD_CONFIG_REG_BASE = 0x2000, /* in dwords */ + AMD_CONFIG_REG_END = 0x2B00, + AMD_CONFIG_REG_SIZE = AMD_CONFIG_REG_END - AMD_CONFIG_REG_BASE +}; + +/* SH reg space definition */ +enum { + SH_REG_BASE = 0x2C00, + SH_REG_END = 0x3000, + SH_REG_SIZE = SH_REG_END - SH_REG_BASE +}; + +/* SQ_CMD definitions */ +#define SQ_CMD 0x8DEC + +enum SQ_IND_CMD_CMD { + SQ_IND_CMD_CMD_NULL = 0x00000000, + SQ_IND_CMD_CMD_HALT = 0x00000001, + SQ_IND_CMD_CMD_RESUME = 0x00000002, + SQ_IND_CMD_CMD_KILL = 0x00000003, + SQ_IND_CMD_CMD_DEBUG = 0x00000004, + SQ_IND_CMD_CMD_TRAP = 0x00000005, +}; + +enum SQ_IND_CMD_MODE { + SQ_IND_CMD_MODE_SINGLE = 0x00000000, + SQ_IND_CMD_MODE_BROADCAST = 0x00000001, + SQ_IND_CMD_MODE_BROADCAST_QUEUE = 0x00000002, + SQ_IND_CMD_MODE_BROADCAST_PIPE = 0x00000003, + SQ_IND_CMD_MODE_BROADCAST_ME = 0x00000004, +}; + +union SQ_IND_INDEX_BITS { + struct { + uint32_t wave_id:4; + uint32_t simd_id:2; + uint32_t thread_id:6; + uint32_t:1; + uint32_t force_read:1; + uint32_t read_timeout:1; + uint32_t unindexed:1; + uint32_t index:16; + + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +union SQ_IND_CMD_BITS { + struct { + uint32_t data:32; + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +union SQ_CMD_BITS { + struct { + uint32_t cmd:3; + uint32_t:1; + uint32_t mode:3; + uint32_t check_vmid:1; + uint32_t trap_id:3; + uint32_t:5; + uint32_t wave_id:4; + uint32_t simd_id:2; + uint32_t:2; + uint32_t queue_id:3; + uint32_t:1; + uint32_t vm_id:4; + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +union SQ_IND_DATA_BITS { + struct { + uint32_t data:32; + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +union GRBM_GFX_INDEX_BITS { + struct { + uint32_t instance_index:8; + uint32_t sh_index:8; + uint32_t 
se_index:8; + uint32_t:5; + uint32_t sh_broadcast_writes:1; + uint32_t instance_broadcast_writes:1; + uint32_t se_broadcast_writes:1; + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +union TCP_WATCH_ADDR_H_BITS { + struct { + uint32_t addr:16; + uint32_t:16; + + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +union TCP_WATCH_ADDR_L_BITS { + struct { + uint32_t:6; + uint32_t addr:26; + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +enum { + QUEUESTATE__INVALID = 0, /* so by default we'll get invalid state */ + QUEUESTATE__ACTIVE_COMPLETION_PENDING, + QUEUESTATE__ACTIVE +}; + +union ULARGE_INTEGER { + struct { + uint32_t low_part; + uint32_t high_part; + } u; + unsigned long long quad_part; +}; + + +#define KFD_CIK_VMID_START_OFFSET (8) +#define KFD_CIK_VMID_END_OFFSET (KFD_CIK_VMID_START_OFFSET + (8)) + + +void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, + enum DBGDEV_TYPE type); + +union TCP_WATCH_CNTL_BITS { + struct { + uint32_t mask:24; + uint32_t vmid:4; + uint32_t atc:1; + uint32_t mode:2; + uint32_t valid:1; + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +enum { + ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL, + ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF, + ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000, + /* extend the mask to 26 bits in order to match the low address field */ + ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6, + ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF +}; + +enum { + MAX_TRAPID = 8, /* 3 bits in the bitfield. */ + MAX_WATCH_ADDRESSES = 4 +}; + +enum { + ADDRESS_WATCH_REG_ADDR_HI = 0, + ADDRESS_WATCH_REG_ADDR_LO, + ADDRESS_WATCH_REG_CNTL, + ADDRESS_WATCH_REG_MAX +}; + +#endif /* KFD_DBGDEV_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c new file mode 100644 index 000000000..9d4af961c --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c @@ -0,0 +1,158 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+
+#include "kfd_priv.h"
+#include "cik_regs.h"
+#include "kfd_pm4_headers.h"
+#include "kfd_pm4_headers_diq.h"
+#include "kfd_dbgmgr.h"
+#include "kfd_dbgdev.h"
+#include "kfd_device_queue_manager.h"
+
+static DEFINE_MUTEX(kfd_dbgmgr_mutex);
+
+struct mutex *kfd_get_dbgmgr_mutex(void)
+{
+ return &kfd_dbgmgr_mutex;
+}
+
+
+static void kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr)
+{
+ kfree(pmgr->dbgdev);
+
+ pmgr->dbgdev = NULL;
+ pmgr->pasid = 0;
+ pmgr->dev = NULL;
+}
+
+void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr)
+{
+ if (pmgr) {
+ kfd_dbgmgr_uninitialize(pmgr);
+ kfree(pmgr);
+ }
+}
+
+bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev)
+{
+ enum DBGDEV_TYPE type = DBGDEV_TYPE_DIQ;
+ struct kfd_dbgmgr *new_buff;
+
+ if (WARN_ON(!pdev->init_complete))
+ return false;
+
+ new_buff = kfd_alloc_struct(new_buff);
+ if (!new_buff) {
+ pr_err("Failed to allocate dbgmgr instance\n");
+ return false;
+ }
+
+ new_buff->pasid = 0;
+ new_buff->dev = pdev;
+ new_buff->dbgdev = kfd_alloc_struct(new_buff->dbgdev);
+ if (!new_buff->dbgdev) {
+ pr_err("Failed to allocate dbgdev instance\n");
+ kfree(new_buff);
+ return false;
+ }
+
+ /* get actual type of DBGDevice cpsch or not */
+ if (pdev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
+ type = DBGDEV_TYPE_NODIQ;
+
+ kfd_dbgdev_init(new_buff->dbgdev, pdev, type);
+ *ppmgr = new_buff;
+
+ return true;
+}
+
+long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p)
+{
+ if (pmgr->pasid != 0) {
+ pr_debug("H/W debugger is already active using pasid %d\n",
+ pmgr->pasid);
+ return -EBUSY;
+ }
+
+ /* remember pasid */
+ pmgr->pasid = p->pasid;
+
+ /* provide the pqm for diq generation */
+ pmgr->dbgdev->pqm = &p->pqm;
+
+ /* activate the actual registering */
+ pmgr->dbgdev->dbgdev_register(pmgr->dbgdev);
+
+ return 0;
+}
+
+long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p)
+{
+ /* Is the request coming from the already registered process? */
+ if (pmgr->pasid != p->pasid) {
+ pr_debug("H/W debugger is not registered by calling pasid %d\n",
+ p->pasid);
+ return -EINVAL;
+ }
+
+ pmgr->dbgdev->dbgdev_unregister(pmgr->dbgdev);
+
+ pmgr->pasid = 0;
+
+ return 0;
+}
+
+long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr,
+ struct dbg_wave_control_info *wac_info)
+{
+ /* Is the request coming from the already registered process? */
+ if (pmgr->pasid != wac_info->process->pasid) {
+ pr_debug("H/W debugger support was not registered for requester pasid %d\n",
+ wac_info->process->pasid);
+ return -EINVAL;
+ }
+
+ return (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, wac_info);
+}
+
+long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr,
+ struct dbg_address_watch_info *adw_info)
+{
+ /* Is the request coming from the already registered process? */
+ if (pmgr->pasid != adw_info->process->pasid) {
+ pr_debug("H/W debugger support was not registered for requester pasid %d\n",
+ adw_info->process->pasid);
+ return -EINVAL;
+ }
+
+ return (long) pmgr->dbgdev->dbgdev_address_watch(pmgr->dbgdev,
+ adw_info);
+}
+
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h
new file mode 100644
index 000000000..a04a1fe1d
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h
@@ -0,0 +1,293 @@
+/*
+ * Copyright 2014 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_DBGMGR_H_ +#define KFD_DBGMGR_H_ + +#include "kfd_priv.h" + +/* must align with hsakmttypes definition */ +#pragma pack(push, 4) + +enum HSA_DBG_WAVEOP { + HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ + HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ + HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ + HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter dbg mode */ + HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take a trap */ + HSA_DBG_NUM_WAVEOP = 5, + HSA_DBG_MAX_WAVEOP = 0xFFFFFFFF +}; + +enum HSA_DBG_WAVEMODE { + /* send command to a single wave */ + HSA_DBG_WAVEMODE_SINGLE = 0, + /* + * Broadcast to all wavefronts of all processes is not + * supported for HSA user mode + */ + + /* send to waves within current process */ + HSA_DBG_WAVEMODE_BROADCAST_PROCESS = 2, + /* send to waves within current process on CU */ + HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU = 3, + HSA_DBG_NUM_WAVEMODE = 3, + HSA_DBG_MAX_WAVEMODE = 0xFFFFFFFF +}; + +enum HSA_DBG_WAVEMSG_TYPE { + HSA_DBG_WAVEMSG_AUTO = 0, + HSA_DBG_WAVEMSG_USER = 1, + HSA_DBG_WAVEMSG_ERROR = 2, + HSA_DBG_NUM_WAVEMSG, + HSA_DBG_MAX_WAVEMSG = 0xFFFFFFFF +}; + +enum HSA_DBG_WATCH_MODE { + HSA_DBG_WATCH_READ = 0, /* Read operations only */ + HSA_DBG_WATCH_NONREAD = 1, /* Write or Atomic operations only */ + HSA_DBG_WATCH_ATOMIC = 2, /* Atomic Operations only */ + HSA_DBG_WATCH_ALL = 3, /* Read, Write or Atomic operations */ + HSA_DBG_WATCH_NUM, + HSA_DBG_WATCH_SIZE = 0xFFFFFFFF +}; + +/* This structure is hardware specific and may change in the future */ +struct HsaDbgWaveMsgAMDGen2 { + union { + struct ui32 { + uint32_t UserData:8; /* user data */ + uint32_t ShaderArray:1; /* Shader array */ + uint32_t Priv:1; /* Privileged */ + uint32_t Reserved0:4; /* Reserved, should be 0 */ + uint32_t WaveId:4; /* wave id */ + uint32_t SIMD:2; /* SIMD id */ + uint32_t HSACU:4; /* Compute unit */ + uint32_t ShaderEngine:2;/* Shader engine */ + uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */ + uint32_t Reserved1:4; /* Reserved, should be 0 */ + } ui32; + uint32_t Value; + }; + uint32_t Reserved2; +}; + +union HsaDbgWaveMessageAMD { + struct HsaDbgWaveMsgAMDGen2 WaveMsgInfoGen2; + /* for future HsaDbgWaveMsgAMDGen3; */ +}; + +struct HsaDbgWaveMessage { + void *MemoryVA; /* ptr to associated host-accessible data */ + union HsaDbgWaveMessageAMD DbgWaveMsg; +}; + +/* + * TODO: This definitions to be MOVED to kfd_event, once it is implemented. 
+ * + * HSA sync primitive, Event and HW Exception notification API definitions. + * The API functions allow the runtime to define a so-called sync-primitive, + * a SW object combining a user-mode provided "syncvar" and a scheduler event + * that can be signaled through a defined GPU interrupt. A syncvar is + * a process virtual memory location of a certain size that can be accessed + * by CPU and GPU shader code within the process to set and query the content + * within that memory. The definition of the content is determined by the HSA + * runtime and potentially GPU shader code interfacing with the HSA runtime. + * The syncvar values may be commonly written through an PM4 WRITE_DATA packet + * in the user mode instruction stream. The OS scheduler event is typically + * associated and signaled by an interrupt issued by the GPU, but other HSA + * system interrupt conditions from other HW (e.g. IOMMUv2) may be surfaced + * by the KFD by this mechanism, too. + */ + +/* these are the new definitions for events */ +enum HSA_EVENTTYPE { + HSA_EVENTTYPE_SIGNAL = 0, /* user-mode generated GPU signal */ + HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */ + HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change + * (start/stop) + */ + HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */ + HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */ + HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */ + HSA_EVENTTYPE_PROFILE_EVENT = 6,/* GPU signal for profiling */ + HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state + * (EOP pm4) + */ + /* ... */ + HSA_EVENTTYPE_MAXID, + HSA_EVENTTYPE_TYPE_SIZE = 0xFFFFFFFF +}; + +/* Sub-definitions for various event types: Syncvar */ +struct HsaSyncVar { + union SyncVar { + void *UserData; /* pointer to user mode data */ + uint64_t UserDataPtrValue; /* 64bit compatibility of value */ + } SyncVar; + uint64_t SyncVarSize; +}; + +/* Sub-definitions for various event types: NodeChange */ + +enum HSA_EVENTTYPE_NODECHANGE_FLAGS { + HSA_EVENTTYPE_NODECHANGE_ADD = 0, + HSA_EVENTTYPE_NODECHANGE_REMOVE = 1, + HSA_EVENTTYPE_NODECHANGE_SIZE = 0xFFFFFFFF +}; + +struct HsaNodeChange { + /* HSA node added/removed on the platform */ + enum HSA_EVENTTYPE_NODECHANGE_FLAGS Flags; +}; + +/* Sub-definitions for various event types: DeviceStateChange */ +enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS { + /* device started (and available) */ + HSA_EVENTTYPE_DEVICESTATUSCHANGE_START = 0, + /* device stopped (i.e. 
unavailable) */ + HSA_EVENTTYPE_DEVICESTATUSCHANGE_STOP = 1, + HSA_EVENTTYPE_DEVICESTATUSCHANGE_SIZE = 0xFFFFFFFF +}; + +enum HSA_DEVICE { + HSA_DEVICE_CPU = 0, + HSA_DEVICE_GPU = 1, + MAX_HSA_DEVICE = 2 +}; + +struct HsaDeviceStateChange { + uint32_t NodeId; /* F-NUMA node that contains the device */ + enum HSA_DEVICE Device; /* device type: GPU or CPU */ + enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS Flags; /* event flags */ +}; + +struct HsaEventData { + enum HSA_EVENTTYPE EventType; /* event type */ + union EventData { + /* + * return data associated with HSA_EVENTTYPE_SIGNAL + * and other events + */ + struct HsaSyncVar SyncVar; + + /* data associated with HSA_EVENTTYPE_NODE_CHANGE */ + struct HsaNodeChange NodeChangeState; + + /* data associated with HSA_EVENTTYPE_DEVICE_STATE_CHANGE */ + struct HsaDeviceStateChange DeviceState; + } EventData; + + /* the following data entries are internal to the KFD & thunk itself */ + + /* internal thunk store for Event data (OsEventHandle) */ + uint64_t HWData1; + /* internal thunk store for Event data (HWAddress) */ + uint64_t HWData2; + /* internal thunk store for Event data (HWData) */ + uint32_t HWData3; +}; + +struct HsaEventDescriptor { + /* event type to allocate */ + enum HSA_EVENTTYPE EventType; + /* H-NUMA node containing GPU device that is event source */ + uint32_t NodeId; + /* pointer to user mode syncvar data, syncvar->UserDataPtrValue + * may be NULL + */ + struct HsaSyncVar SyncVar; +}; + +struct HsaEvent { + uint32_t EventId; + struct HsaEventData EventData; +}; + +#pragma pack(pop) + +enum DBGDEV_TYPE { + DBGDEV_TYPE_ILLEGAL = 0, + DBGDEV_TYPE_NODIQ = 1, + DBGDEV_TYPE_DIQ = 2, + DBGDEV_TYPE_TEST = 3 +}; + +struct dbg_address_watch_info { + struct kfd_process *process; + enum HSA_DBG_WATCH_MODE *watch_mode; + uint64_t *watch_address; + uint64_t *watch_mask; + struct HsaEvent *watch_event; + uint32_t num_watch_points; +}; + +struct dbg_wave_control_info { + struct kfd_process *process; + uint32_t trapId; + enum HSA_DBG_WAVEOP operand; + enum HSA_DBG_WAVEMODE mode; + struct HsaDbgWaveMessage dbgWave_msg; +}; + +struct kfd_dbgdev { + + /* The device that owns this data. */ + struct kfd_dev *dev; + + /* kernel queue for DIQ */ + struct kernel_queue *kq; + + /* a pointer to the pqm of the calling process */ + struct process_queue_manager *pqm; + + /* type of debug device ( DIQ, non DIQ, etc. 
) */ + enum DBGDEV_TYPE type; + + /* virtualized function pointers to device dbg */ + int (*dbgdev_register)(struct kfd_dbgdev *dbgdev); + int (*dbgdev_unregister)(struct kfd_dbgdev *dbgdev); + int (*dbgdev_address_watch)(struct kfd_dbgdev *dbgdev, + struct dbg_address_watch_info *adw_info); + int (*dbgdev_wave_control)(struct kfd_dbgdev *dbgdev, + struct dbg_wave_control_info *wac_info); + +}; + +struct kfd_dbgmgr { + unsigned int pasid; + struct kfd_dev *dev; + struct kfd_dbgdev *dbgdev; +}; + +/* prototypes for debug manager functions */ +struct mutex *kfd_get_dbgmgr_mutex(void); +void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr); +bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev); +long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p); +long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p); +long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, + struct dbg_wave_control_info *wac_info); +long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, + struct dbg_address_watch_info *adw_info); +#endif /* KFD_DBGMGR_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c new file mode 100644 index 000000000..ab37d36d9 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c @@ -0,0 +1,123 @@ +/* + * Copyright 2016-2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <linux/debugfs.h> +#include <linux/uaccess.h> + +#include "kfd_priv.h" + +static struct dentry *debugfs_root; + +static int kfd_debugfs_open(struct inode *inode, struct file *file) +{ + int (*show)(struct seq_file *, void *) = inode->i_private; + + return single_open(file, show, NULL); +} + +static ssize_t kfd_debugfs_hang_hws_write(struct file *file, + const char __user *user_buf, size_t size, loff_t *ppos) +{ + struct kfd_dev *dev; + char tmp[16]; + uint32_t gpu_id; + int ret = -EINVAL; + + memset(tmp, 0, 16); + if (size >= 16) { + pr_err("Invalid input for gpu id.\n"); + goto out; + } + if (copy_from_user(tmp, user_buf, size)) { + ret = -EFAULT; + goto out; + } + if (kstrtoint(tmp, 10, &gpu_id)) { + pr_err("Invalid input for gpu id.\n"); + goto out; + } + dev = kfd_device_by_id(gpu_id); + if (dev) { + kfd_debugfs_hang_hws(dev); + ret = size; + } else + pr_err("Cannot find device %d.\n", gpu_id); + +out: + return ret; +} + +static const struct file_operations kfd_debugfs_fops = { + .owner = THIS_MODULE, + .open = kfd_debugfs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations kfd_debugfs_hang_hws_fops = { + .owner = THIS_MODULE, + .open = kfd_debugfs_open, + .read = seq_read, + .write = kfd_debugfs_hang_hws_write, + .llseek = seq_lseek, + .release = single_release, +}; + +void kfd_debugfs_init(void) +{ + struct dentry *ent; + + debugfs_root = debugfs_create_dir("kfd", NULL); + if (!debugfs_root || debugfs_root == ERR_PTR(-ENODEV)) { + pr_warn("Failed to create kfd debugfs dir\n"); + return; + } + + ent = debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root, + kfd_debugfs_mqds_by_process, + &kfd_debugfs_fops); + if (!ent) + pr_warn("Failed to create mqds in kfd debugfs\n"); + + ent = debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root, + kfd_debugfs_hqds_by_device, + &kfd_debugfs_fops); + if (!ent) + pr_warn("Failed to create hqds in kfd debugfs\n"); + + ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root, + kfd_debugfs_rls_by_device, + &kfd_debugfs_fops); + + ent = debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root, + NULL, + &kfd_debugfs_hang_hws_fops); + + if (!ent) + pr_warn("Failed to create rls in kfd debugfs\n"); +} + +void kfd_debugfs_fini(void) +{ + debugfs_remove_recursive(debugfs_root); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c new file mode 100644 index 000000000..28022d1cb --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -0,0 +1,976 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/bsearch.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" +#include "kfd_pm4_headers_vi.h" +#include "cwsr_trap_handler.h" +#include "kfd_iommu.h" + +#define MQD_SIZE_ALIGNED 768 + +/* + * kfd_locked is used to lock the kfd driver during suspend or reset + * once locked, kfd driver will stop any further GPU execution. + * create process (open) will return -EAGAIN. + */ +static atomic_t kfd_locked = ATOMIC_INIT(0); + +#ifdef KFD_SUPPORT_IOMMU_V2 +static const struct kfd_device_info kaveri_device_info = { + .asic_family = CHIP_KAVERI, + .max_pasid_bits = 16, + /* max num of queues for KV.TODO should be a dynamic value */ + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = false, + .needs_iommu_device = true, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info carrizo_device_info = { + .asic_family = CHIP_CARRIZO, + .max_pasid_bits = 16, + /* max num of queues for CZ.TODO should be a dynamic value */ + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = true, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info raven_device_info = { + .asic_family = CHIP_RAVEN, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = true, + .needs_pci_atomics = true, + .num_sdma_engines = 1, +}; +#endif + +static const struct kfd_device_info hawaii_device_info = { + .asic_family = CHIP_HAWAII, + .max_pasid_bits = 16, + /* max num of queues for KV.TODO should be a dynamic value */ + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = false, + .needs_iommu_device = false, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info tonga_device_info = { + .asic_family = CHIP_TONGA, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = false, + .needs_iommu_device = false, + .needs_pci_atomics = true, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info tonga_vf_device_info = { + .asic_family = CHIP_TONGA, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = 
MQD_SIZE_ALIGNED, + .supports_cwsr = false, + .needs_iommu_device = false, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info fiji_device_info = { + .asic_family = CHIP_FIJI, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = true, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info fiji_vf_device_info = { + .asic_family = CHIP_FIJI, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + + +static const struct kfd_device_info polaris10_device_info = { + .asic_family = CHIP_POLARIS10, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = true, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info polaris10_vf_device_info = { + .asic_family = CHIP_POLARIS10, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info polaris11_device_info = { + .asic_family = CHIP_POLARIS11, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = true, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info vega10_device_info = { + .asic_family = CHIP_VEGA10, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + +static const struct kfd_device_info vega10_vf_device_info = { + .asic_family = CHIP_VEGA10, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = false, + .num_sdma_engines = 2, +}; + + +struct kfd_deviceid { + unsigned short did; + const struct kfd_device_info *device_info; +}; + +static const struct kfd_deviceid supported_devices[] = { +#ifdef KFD_SUPPORT_IOMMU_V2 + { 0x1304, &kaveri_device_info }, /* Kaveri */ + { 0x1305, &kaveri_device_info }, /* Kaveri */ + { 
0x1306, &kaveri_device_info }, /* Kaveri */ + { 0x1307, &kaveri_device_info }, /* Kaveri */ + { 0x1309, &kaveri_device_info }, /* Kaveri */ + { 0x130A, &kaveri_device_info }, /* Kaveri */ + { 0x130B, &kaveri_device_info }, /* Kaveri */ + { 0x130C, &kaveri_device_info }, /* Kaveri */ + { 0x130D, &kaveri_device_info }, /* Kaveri */ + { 0x130E, &kaveri_device_info }, /* Kaveri */ + { 0x130F, &kaveri_device_info }, /* Kaveri */ + { 0x1310, &kaveri_device_info }, /* Kaveri */ + { 0x1311, &kaveri_device_info }, /* Kaveri */ + { 0x1312, &kaveri_device_info }, /* Kaveri */ + { 0x1313, &kaveri_device_info }, /* Kaveri */ + { 0x1315, &kaveri_device_info }, /* Kaveri */ + { 0x1316, &kaveri_device_info }, /* Kaveri */ + { 0x1317, &kaveri_device_info }, /* Kaveri */ + { 0x1318, &kaveri_device_info }, /* Kaveri */ + { 0x131B, &kaveri_device_info }, /* Kaveri */ + { 0x131C, &kaveri_device_info }, /* Kaveri */ + { 0x131D, &kaveri_device_info }, /* Kaveri */ + { 0x9870, &carrizo_device_info }, /* Carrizo */ + { 0x9874, &carrizo_device_info }, /* Carrizo */ + { 0x9875, &carrizo_device_info }, /* Carrizo */ + { 0x9876, &carrizo_device_info }, /* Carrizo */ + { 0x9877, &carrizo_device_info }, /* Carrizo */ + { 0x15DD, &raven_device_info }, /* Raven */ +#endif + { 0x67A0, &hawaii_device_info }, /* Hawaii */ + { 0x67A1, &hawaii_device_info }, /* Hawaii */ + { 0x67A2, &hawaii_device_info }, /* Hawaii */ + { 0x67A8, &hawaii_device_info }, /* Hawaii */ + { 0x67A9, &hawaii_device_info }, /* Hawaii */ + { 0x67AA, &hawaii_device_info }, /* Hawaii */ + { 0x67B0, &hawaii_device_info }, /* Hawaii */ + { 0x67B1, &hawaii_device_info }, /* Hawaii */ + { 0x67B8, &hawaii_device_info }, /* Hawaii */ + { 0x67B9, &hawaii_device_info }, /* Hawaii */ + { 0x67BA, &hawaii_device_info }, /* Hawaii */ + { 0x67BE, &hawaii_device_info }, /* Hawaii */ + { 0x6920, &tonga_device_info }, /* Tonga */ + { 0x6921, &tonga_device_info }, /* Tonga */ + { 0x6928, &tonga_device_info }, /* Tonga */ + { 0x6929, &tonga_device_info }, /* Tonga */ + { 0x692B, &tonga_device_info }, /* Tonga */ + { 0x692F, &tonga_vf_device_info }, /* Tonga vf */ + { 0x6938, &tonga_device_info }, /* Tonga */ + { 0x6939, &tonga_device_info }, /* Tonga */ + { 0x7300, &fiji_device_info }, /* Fiji */ + { 0x730F, &fiji_vf_device_info }, /* Fiji vf*/ + { 0x67C0, &polaris10_device_info }, /* Polaris10 */ + { 0x67C1, &polaris10_device_info }, /* Polaris10 */ + { 0x67C2, &polaris10_device_info }, /* Polaris10 */ + { 0x67C4, &polaris10_device_info }, /* Polaris10 */ + { 0x67C7, &polaris10_device_info }, /* Polaris10 */ + { 0x67C8, &polaris10_device_info }, /* Polaris10 */ + { 0x67C9, &polaris10_device_info }, /* Polaris10 */ + { 0x67CA, &polaris10_device_info }, /* Polaris10 */ + { 0x67CC, &polaris10_device_info }, /* Polaris10 */ + { 0x67CF, &polaris10_device_info }, /* Polaris10 */ + { 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/ + { 0x67DF, &polaris10_device_info }, /* Polaris10 */ + { 0x6FDF, &polaris10_device_info }, /* Polaris10 */ + { 0x67E0, &polaris11_device_info }, /* Polaris11 */ + { 0x67E1, &polaris11_device_info }, /* Polaris11 */ + { 0x67E3, &polaris11_device_info }, /* Polaris11 */ + { 0x67E7, &polaris11_device_info }, /* Polaris11 */ + { 0x67E8, &polaris11_device_info }, /* Polaris11 */ + { 0x67E9, &polaris11_device_info }, /* Polaris11 */ + { 0x67EB, &polaris11_device_info }, /* Polaris11 */ + { 0x67EF, &polaris11_device_info }, /* Polaris11 */ + { 0x67FF, &polaris11_device_info }, /* Polaris11 */ + { 0x6860, &vega10_device_info }, /* Vega10 */ + { 
0x6861, &vega10_device_info }, /* Vega10 */ + { 0x6862, &vega10_device_info }, /* Vega10 */ + { 0x6863, &vega10_device_info }, /* Vega10 */ + { 0x6864, &vega10_device_info }, /* Vega10 */ + { 0x6867, &vega10_device_info }, /* Vega10 */ + { 0x6868, &vega10_device_info }, /* Vega10 */ + { 0x6869, &vega10_device_info }, /* Vega10 */ + { 0x686A, &vega10_device_info }, /* Vega10 */ + { 0x686B, &vega10_device_info }, /* Vega10 */ + { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/ + { 0x686D, &vega10_device_info }, /* Vega10 */ + { 0x686E, &vega10_device_info }, /* Vega10 */ + { 0x686F, &vega10_device_info }, /* Vega10 */ + { 0x687F, &vega10_device_info }, /* Vega10 */ +}; + +static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, + unsigned int chunk_size); +static void kfd_gtt_sa_fini(struct kfd_dev *kfd); + +static int kfd_resume(struct kfd_dev *kfd); + +static const struct kfd_device_info *lookup_device_info(unsigned short did) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(supported_devices); i++) { + if (supported_devices[i].did == did) { + WARN_ON(!supported_devices[i].device_info); + return supported_devices[i].device_info; + } + } + + dev_warn(kfd_device, "DID %04x is missing in supported_devices\n", + did); + + return NULL; +} + +struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, + struct pci_dev *pdev, const struct kfd2kgd_calls *f2g) +{ + struct kfd_dev *kfd; + int ret; + const struct kfd_device_info *device_info = + lookup_device_info(pdev->device); + + if (!device_info) { + dev_err(kfd_device, "kgd2kfd_probe failed\n"); + return NULL; + } + + /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps. + * 32 and 64-bit requests are possible and must be + * supported. + */ + ret = pci_enable_atomic_ops_to_root(pdev, + PCI_EXP_DEVCAP2_ATOMIC_COMP32 | + PCI_EXP_DEVCAP2_ATOMIC_COMP64); + if (device_info->needs_pci_atomics && ret < 0) { + dev_info(kfd_device, + "skipped device %x:%x, PCI rejects atomics\n", + pdev->vendor, pdev->device); + return NULL; + } + + kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); + if (!kfd) + return NULL; + + kfd->kgd = kgd; + kfd->device_info = device_info; + kfd->pdev = pdev; + kfd->init_complete = false; + kfd->kfd2kgd = f2g; + + mutex_init(&kfd->doorbell_mutex); + memset(&kfd->doorbell_available_index, 0, + sizeof(kfd->doorbell_available_index)); + + return kfd; +} + +static void kfd_cwsr_init(struct kfd_dev *kfd) +{ + if (cwsr_enable && kfd->device_info->supports_cwsr) { + if (kfd->device_info->asic_family < CHIP_VEGA10) { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); + kfd->cwsr_isa = cwsr_trap_gfx8_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); + } else { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE); + kfd->cwsr_isa = cwsr_trap_gfx9_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex); + } + + kfd->cwsr_enabled = true; + } +} + +bool kgd2kfd_device_init(struct kfd_dev *kfd, + const struct kgd2kfd_shared_resources *gpu_resources) +{ + unsigned int size; + + kfd->shared_resources = *gpu_resources; + + kfd->vm_info.first_vmid_kfd = ffs(gpu_resources->compute_vmid_bitmap)-1; + kfd->vm_info.last_vmid_kfd = fls(gpu_resources->compute_vmid_bitmap)-1; + kfd->vm_info.vmid_num_kfd = kfd->vm_info.last_vmid_kfd + - kfd->vm_info.first_vmid_kfd + 1; + + /* Verify module parameters regarding mapped process number*/ + if ((hws_max_conc_proc < 0) + || (hws_max_conc_proc > kfd->vm_info.vmid_num_kfd)) { + dev_err(kfd_device, + "hws_max_conc_proc %d must be between 0 and %d, use %d instead\n", + hws_max_conc_proc, 
kfd->vm_info.vmid_num_kfd, + kfd->vm_info.vmid_num_kfd); + kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd; + } else + kfd->max_proc_per_quantum = hws_max_conc_proc; + + /* calculate max size of mqds needed for queues */ + size = max_num_of_queues_per_device * + kfd->device_info->mqd_size_aligned; + + /* + * calculate max size of runlist packet. + * There can be only 2 packets at once + */ + size += (KFD_MAX_NUM_OF_PROCESSES * sizeof(struct pm4_mes_map_process) + + max_num_of_queues_per_device * sizeof(struct pm4_mes_map_queues) + + sizeof(struct pm4_mes_runlist)) * 2; + + /* Add size of HIQ & DIQ */ + size += KFD_KERNEL_QUEUE_SIZE * 2; + + /* add another 512KB for all other allocations on gart (HPD, fences) */ + size += 512 * 1024; + + if (kfd->kfd2kgd->init_gtt_mem_allocation( + kfd->kgd, size, &kfd->gtt_mem, + &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr, + false)) { + dev_err(kfd_device, "Could not allocate %d bytes\n", size); + goto out; + } + + dev_info(kfd_device, "Allocated %d bytes on gart\n", size); + + /* Initialize GTT sa with 512 byte chunk size */ + if (kfd_gtt_sa_init(kfd, size, 512) != 0) { + dev_err(kfd_device, "Error initializing gtt sub-allocator\n"); + goto kfd_gtt_sa_init_error; + } + + if (kfd_doorbell_init(kfd)) { + dev_err(kfd_device, + "Error initializing doorbell aperture\n"); + goto kfd_doorbell_error; + } + + if (kfd_topology_add_device(kfd)) { + dev_err(kfd_device, "Error adding device to topology\n"); + goto kfd_topology_add_device_error; + } + + if (kfd_interrupt_init(kfd)) { + dev_err(kfd_device, "Error initializing interrupts\n"); + goto kfd_interrupt_error; + } + + kfd->dqm = device_queue_manager_init(kfd); + if (!kfd->dqm) { + dev_err(kfd_device, "Error initializing queue manager\n"); + goto device_queue_manager_error; + } + + if (kfd_iommu_device_init(kfd)) { + dev_err(kfd_device, "Error initializing iommuv2\n"); + goto device_iommu_error; + } + + kfd_cwsr_init(kfd); + + if (kfd_resume(kfd)) + goto kfd_resume_error; + + kfd->dbgmgr = NULL; + + kfd->init_complete = true; + dev_info(kfd_device, "added device %x:%x\n", kfd->pdev->vendor, + kfd->pdev->device); + + pr_debug("Starting kfd with the following scheduling policy %d\n", + kfd->dqm->sched_policy); + + goto out; + +kfd_resume_error: +device_iommu_error: + device_queue_manager_uninit(kfd->dqm); +device_queue_manager_error: + kfd_interrupt_exit(kfd); +kfd_interrupt_error: + kfd_topology_remove_device(kfd); +kfd_topology_add_device_error: + kfd_doorbell_fini(kfd); +kfd_doorbell_error: + kfd_gtt_sa_fini(kfd); +kfd_gtt_sa_init_error: + kfd->kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem); + dev_err(kfd_device, + "device %x:%x NOT added due to errors\n", + kfd->pdev->vendor, kfd->pdev->device); +out: + return kfd->init_complete; +} + +void kgd2kfd_device_exit(struct kfd_dev *kfd) +{ + if (kfd->init_complete) { + kgd2kfd_suspend(kfd); + device_queue_manager_uninit(kfd->dqm); + kfd_interrupt_exit(kfd); + kfd_topology_remove_device(kfd); + kfd_doorbell_fini(kfd); + kfd_gtt_sa_fini(kfd); + kfd->kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem); + } + + kfree(kfd); +} + +int kgd2kfd_pre_reset(struct kfd_dev *kfd) +{ + if (!kfd->init_complete) + return 0; + kgd2kfd_suspend(kfd); + + /* hold dqm->lock to prevent further execution*/ + dqm_lock(kfd->dqm); + + kfd_signal_reset_event(kfd); + return 0; +} + +/* + * Fix me. KFD won't be able to resume existing process for now. + * We will keep all existing process in a evicted state and + * wait the process to be terminated. 
+ */ + +int kgd2kfd_post_reset(struct kfd_dev *kfd) +{ + int ret, count; + + if (!kfd->init_complete) + return 0; + + dqm_unlock(kfd->dqm); + + ret = kfd_resume(kfd); + if (ret) + return ret; + count = atomic_dec_return(&kfd_locked); + WARN_ONCE(count != 0, "KFD reset ref. error"); + return 0; +} + +bool kfd_is_locked(void) +{ + return (atomic_read(&kfd_locked) > 0); +} + +void kgd2kfd_suspend(struct kfd_dev *kfd) +{ + if (!kfd->init_complete) + return; + + /* For first KFD device suspend all the KFD processes */ + if (atomic_inc_return(&kfd_locked) == 1) + kfd_suspend_all_processes(); + + kfd->dqm->ops.stop(kfd->dqm); + + kfd_iommu_suspend(kfd); +} + +int kgd2kfd_resume(struct kfd_dev *kfd) +{ + int ret, count; + + if (!kfd->init_complete) + return 0; + + ret = kfd_resume(kfd); + if (ret) + return ret; + + count = atomic_dec_return(&kfd_locked); + WARN_ONCE(count < 0, "KFD suspend / resume ref. error"); + if (count == 0) + ret = kfd_resume_all_processes(); + + return ret; +} + +static int kfd_resume(struct kfd_dev *kfd) +{ + int err = 0; + + err = kfd_iommu_resume(kfd); + if (err) { + dev_err(kfd_device, + "Failed to resume IOMMU for device %x:%x\n", + kfd->pdev->vendor, kfd->pdev->device); + return err; + } + + err = kfd->dqm->ops.start(kfd->dqm); + if (err) { + dev_err(kfd_device, + "Error starting queue manager for device %x:%x\n", + kfd->pdev->vendor, kfd->pdev->device); + goto dqm_start_error; + } + + return err; + +dqm_start_error: + kfd_iommu_suspend(kfd); + return err; +} + +/* This is called directly from KGD at ISR. */ +void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) +{ + uint32_t patched_ihre[KFD_MAX_RING_ENTRY_SIZE]; + bool is_patched = false; + unsigned long flags; + + if (!kfd->init_complete) + return; + + if (kfd->device_info->ih_ring_entry_size > sizeof(patched_ihre)) { + dev_err_once(kfd_device, "Ring entry too small\n"); + return; + } + + spin_lock_irqsave(&kfd->interrupt_lock, flags); + + if (kfd->interrupts_active + && interrupt_is_wanted(kfd, ih_ring_entry, + patched_ihre, &is_patched) + && enqueue_ih_ring_entry(kfd, + is_patched ? patched_ihre : ih_ring_entry)) + queue_work(kfd->ih_wq, &kfd->interrupt_work); + + spin_unlock_irqrestore(&kfd->interrupt_lock, flags); +} + +int kgd2kfd_quiesce_mm(struct mm_struct *mm) +{ + struct kfd_process *p; + int r; + + /* Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are + * running so the lookup function increments the process ref count. + */ + p = kfd_lookup_process_by_mm(mm); + if (!p) + return -ESRCH; + + r = kfd_process_evict_queues(p); + + kfd_unref_process(p); + return r; +} + +int kgd2kfd_resume_mm(struct mm_struct *mm) +{ + struct kfd_process *p; + int r; + + /* Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are + * running so the lookup function increments the process ref count. + */ + p = kfd_lookup_process_by_mm(mm); + if (!p) + return -ESRCH; + + r = kfd_process_restore_queues(p); + + kfd_unref_process(p); + return r; +} + +/** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will + * prepare for safe eviction of KFD BOs that belong to the specified + * process. 
+ * + * @mm: mm_struct that identifies the specified KFD process + * @fence: eviction fence attached to KFD process BOs + * + */ +int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, + struct dma_fence *fence) +{ + struct kfd_process *p; + unsigned long active_time; + unsigned long delay_jiffies = msecs_to_jiffies(PROCESS_ACTIVE_TIME_MS); + + if (!fence) + return -EINVAL; + + if (dma_fence_is_signaled(fence)) + return 0; + + p = kfd_lookup_process_by_mm(mm); + if (!p) + return -ENODEV; + + if (fence->seqno == p->last_eviction_seqno) + goto out; + + p->last_eviction_seqno = fence->seqno; + + /* Avoid KFD process starvation. Wait for at least + * PROCESS_ACTIVE_TIME_MS before evicting the process again + */ + active_time = get_jiffies_64() - p->last_restore_timestamp; + if (delay_jiffies > active_time) + delay_jiffies -= active_time; + else + delay_jiffies = 0; + + /* During process initialization eviction_work.dwork is initialized + * to kfd_evict_bo_worker + */ + schedule_delayed_work(&p->eviction_work, delay_jiffies); +out: + kfd_unref_process(p); + return 0; +} + +static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, + unsigned int chunk_size) +{ + unsigned int num_of_longs; + + if (WARN_ON(buf_size < chunk_size)) + return -EINVAL; + if (WARN_ON(buf_size == 0)) + return -EINVAL; + if (WARN_ON(chunk_size == 0)) + return -EINVAL; + + kfd->gtt_sa_chunk_size = chunk_size; + kfd->gtt_sa_num_of_chunks = buf_size / chunk_size; + + num_of_longs = (kfd->gtt_sa_num_of_chunks + BITS_PER_LONG - 1) / + BITS_PER_LONG; + + kfd->gtt_sa_bitmap = kcalloc(num_of_longs, sizeof(long), GFP_KERNEL); + + if (!kfd->gtt_sa_bitmap) + return -ENOMEM; + + pr_debug("gtt_sa_num_of_chunks = %d, gtt_sa_bitmap = %p\n", + kfd->gtt_sa_num_of_chunks, kfd->gtt_sa_bitmap); + + mutex_init(&kfd->gtt_sa_lock); + + return 0; + +} + +static void kfd_gtt_sa_fini(struct kfd_dev *kfd) +{ + mutex_destroy(&kfd->gtt_sa_lock); + kfree(kfd->gtt_sa_bitmap); +} + +static inline uint64_t kfd_gtt_sa_calc_gpu_addr(uint64_t start_addr, + unsigned int bit_num, + unsigned int chunk_size) +{ + return start_addr + bit_num * chunk_size; +} + +static inline uint32_t *kfd_gtt_sa_calc_cpu_addr(void *start_addr, + unsigned int bit_num, + unsigned int chunk_size) +{ + return (uint32_t *) ((uint64_t) start_addr + bit_num * chunk_size); +} + +int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, + struct kfd_mem_obj **mem_obj) +{ + unsigned int found, start_search, cur_size; + + if (size == 0) + return -EINVAL; + + if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) + return -ENOMEM; + + *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + if (!(*mem_obj)) + return -ENOMEM; + + pr_debug("Allocated mem_obj = %p for size = %d\n", *mem_obj, size); + + start_search = 0; + + mutex_lock(&kfd->gtt_sa_lock); + +kfd_gtt_restart_search: + /* Find the first chunk that is free */ + found = find_next_zero_bit(kfd->gtt_sa_bitmap, + kfd->gtt_sa_num_of_chunks, + start_search); + + pr_debug("Found = %d\n", found); + + /* If there wasn't any free chunk, bail out */ + if (found == kfd->gtt_sa_num_of_chunks) + goto kfd_gtt_no_free_chunk; + + /* Update fields of mem_obj */ + (*mem_obj)->range_start = found; + (*mem_obj)->range_end = found; + (*mem_obj)->gpu_addr = kfd_gtt_sa_calc_gpu_addr( + kfd->gtt_start_gpu_addr, + found, + kfd->gtt_sa_chunk_size); + (*mem_obj)->cpu_ptr = kfd_gtt_sa_calc_cpu_addr( + kfd->gtt_start_cpu_ptr, + found, + kfd->gtt_sa_chunk_size); + + pr_debug("gpu_addr = %p, cpu_addr = %p\n", + 
(uint64_t *) (*mem_obj)->gpu_addr, (*mem_obj)->cpu_ptr); + + /* If we need only one chunk, mark it as allocated and get out */ + if (size <= kfd->gtt_sa_chunk_size) { + pr_debug("Single bit\n"); + set_bit(found, kfd->gtt_sa_bitmap); + goto kfd_gtt_out; + } + + /* Otherwise, try to see if we have enough contiguous chunks */ + cur_size = size - kfd->gtt_sa_chunk_size; + do { + (*mem_obj)->range_end = + find_next_zero_bit(kfd->gtt_sa_bitmap, + kfd->gtt_sa_num_of_chunks, ++found); + /* + * If next free chunk is not contiguous than we need to + * restart our search from the last free chunk we found (which + * wasn't contiguous to the previous ones + */ + if ((*mem_obj)->range_end != found) { + start_search = found; + goto kfd_gtt_restart_search; + } + + /* + * If we reached end of buffer, bail out with error + */ + if (found == kfd->gtt_sa_num_of_chunks) + goto kfd_gtt_no_free_chunk; + + /* Check if we don't need another chunk */ + if (cur_size <= kfd->gtt_sa_chunk_size) + cur_size = 0; + else + cur_size -= kfd->gtt_sa_chunk_size; + + } while (cur_size > 0); + + pr_debug("range_start = %d, range_end = %d\n", + (*mem_obj)->range_start, (*mem_obj)->range_end); + + /* Mark the chunks as allocated */ + for (found = (*mem_obj)->range_start; + found <= (*mem_obj)->range_end; + found++) + set_bit(found, kfd->gtt_sa_bitmap); + +kfd_gtt_out: + mutex_unlock(&kfd->gtt_sa_lock); + return 0; + +kfd_gtt_no_free_chunk: + pr_debug("Allocation failed with mem_obj = %p\n", *mem_obj); + mutex_unlock(&kfd->gtt_sa_lock); + kfree(*mem_obj); + return -ENOMEM; +} + +int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj) +{ + unsigned int bit; + + /* Act like kfree when trying to free a NULL object */ + if (!mem_obj) + return 0; + + pr_debug("Free mem_obj = %p, range_start = %d, range_end = %d\n", + mem_obj, mem_obj->range_start, mem_obj->range_end); + + mutex_lock(&kfd->gtt_sa_lock); + + /* Mark the chunks as free */ + for (bit = mem_obj->range_start; + bit <= mem_obj->range_end; + bit++) + clear_bit(bit, kfd->gtt_sa_bitmap); + + mutex_unlock(&kfd->gtt_sa_lock); + + kfree(mem_obj); + return 0; +} + +#if defined(CONFIG_DEBUG_FS) + +/* This function will send a package to HIQ to hang the HWS + * which will trigger a GPU reset and bring the HWS back to normal state + */ +int kfd_debugfs_hang_hws(struct kfd_dev *dev) +{ + int r = 0; + + if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) { + pr_err("HWS is not enabled"); + return -EINVAL; + } + + r = pm_debugfs_hang_hws(&dev->dqm->packets); + if (!r) + r = dqm_debugfs_execute_queues(dev->dqm); + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c new file mode 100644 index 000000000..bff39f561 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -0,0 +1,1864 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/ratelimit.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/types.h> +#include <linux/bitops.h> +#include <linux/sched.h> +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" +#include "kfd_mqd_manager.h" +#include "cik_regs.h" +#include "kfd_kernel_queue.h" + +/* Size of the per-pipe EOP queue */ +#define CIK_HPD_EOP_BYTES_LOG2 11 +#define CIK_HPD_EOP_BYTES (1U << CIK_HPD_EOP_BYTES_LOG2) + +static int set_pasid_vmid_mapping(struct device_queue_manager *dqm, + unsigned int pasid, unsigned int vmid); + +static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); + +static int execute_queues_cpsch(struct device_queue_manager *dqm, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param); +static int unmap_queues_cpsch(struct device_queue_manager *dqm, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param); + +static int map_queues_cpsch(struct device_queue_manager *dqm); + +static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); + +static void deallocate_sdma_queue(struct device_queue_manager *dqm, + unsigned int sdma_queue_id); + +static void kfd_process_hw_exception(struct work_struct *work); + +static inline +enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) +{ + if (type == KFD_QUEUE_TYPE_SDMA) + return KFD_MQD_TYPE_SDMA; + return KFD_MQD_TYPE_CP; +} + +static bool is_pipe_enabled(struct device_queue_manager *dqm, int mec, int pipe) +{ + int i; + int pipe_offset = mec * dqm->dev->shared_resources.num_pipe_per_mec + + pipe * dqm->dev->shared_resources.num_queue_per_pipe; + + /* queue is available for KFD usage if bit is 1 */ + for (i = 0; i < dqm->dev->shared_resources.num_queue_per_pipe; ++i) + if (test_bit(pipe_offset + i, + dqm->dev->shared_resources.queue_bitmap)) + return true; + return false; +} + +unsigned int get_queues_num(struct device_queue_manager *dqm) +{ + return bitmap_weight(dqm->dev->shared_resources.queue_bitmap, + KGD_MAX_QUEUES); +} + +unsigned int get_queues_per_pipe(struct device_queue_manager *dqm) +{ + return dqm->dev->shared_resources.num_queue_per_pipe; +} + +unsigned int get_pipes_per_mec(struct device_queue_manager *dqm) +{ + return dqm->dev->shared_resources.num_pipe_per_mec; +} + +static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm) +{ + return dqm->dev->device_info->num_sdma_engines; +} + +unsigned int get_num_sdma_queues(struct device_queue_manager *dqm) +{ + return dqm->dev->device_info->num_sdma_engines + * KFD_SDMA_QUEUES_PER_ENGINE; +} + +void program_sh_mem_settings(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + return dqm->dev->kfd2kgd->program_sh_mem_settings( + dqm->dev->kgd, qpd->vmid, + qpd->sh_mem_config, + qpd->sh_mem_ape1_base, + qpd->sh_mem_ape1_limit, + qpd->sh_mem_bases); +} + +static int allocate_doorbell(struct qcm_process_device *qpd, struct queue 
*q) +{ + struct kfd_dev *dev = qpd->dqm->dev; + + if (!KFD_IS_SOC15(dev->device_info->asic_family)) { + /* On pre-SOC15 chips we need to use the queue ID to + * preserve the user mode ABI. + */ + q->doorbell_id = q->properties.queue_id; + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + /* For SDMA queues on SOC15, use static doorbell + * assignments based on the engine and queue. + */ + q->doorbell_id = dev->shared_resources.sdma_doorbell + [q->properties.sdma_engine_id] + [q->properties.sdma_queue_id]; + } else { + /* For CP queues on SOC15 reserve a free doorbell ID */ + unsigned int found; + + found = find_first_zero_bit(qpd->doorbell_bitmap, + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); + if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { + pr_debug("No doorbells available"); + return -EBUSY; + } + set_bit(found, qpd->doorbell_bitmap); + q->doorbell_id = found; + } + + q->properties.doorbell_off = + kfd_doorbell_id_to_offset(dev, q->process, + q->doorbell_id); + + return 0; +} + +static void deallocate_doorbell(struct qcm_process_device *qpd, + struct queue *q) +{ + unsigned int old; + struct kfd_dev *dev = qpd->dqm->dev; + + if (!KFD_IS_SOC15(dev->device_info->asic_family) || + q->properties.type == KFD_QUEUE_TYPE_SDMA) + return; + + old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap); + WARN_ON(!old); +} + +static int allocate_vmid(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int bit, allocated_vmid; + + if (dqm->vmid_bitmap == 0) + return -ENOMEM; + + bit = ffs(dqm->vmid_bitmap) - 1; + dqm->vmid_bitmap &= ~(1 << bit); + + allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd; + pr_debug("vmid allocation %d\n", allocated_vmid); + qpd->vmid = allocated_vmid; + q->properties.vmid = allocated_vmid; + + set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid); + program_sh_mem_settings(dqm, qpd); + + /* qpd->page_table_base is set earlier when register_process() + * is called, i.e. when the first queue is created. 
+ */ + dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd, + qpd->vmid, + qpd->page_table_base); + /* invalidate the VM context after pasid and vmid mapping is set up */ + kfd_flush_tlb(qpd_to_pdd(qpd)); + + return 0; +} + +static int flush_texture_cache_nocpsch(struct kfd_dev *kdev, + struct qcm_process_device *qpd) +{ + const struct packet_manager_funcs *pmf = qpd->dqm->packets.pmf; + int ret; + + if (!qpd->ib_kaddr) + return -ENOMEM; + + ret = pmf->release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr); + if (ret) + return ret; + + return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid, + qpd->ib_base, (uint32_t *)qpd->ib_kaddr, + pmf->release_mem_size / sizeof(uint32_t)); +} + +static void deallocate_vmid(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd; + + /* On GFX v7, CP doesn't flush TC at dequeue */ + if (q->device->device_info->asic_family == CHIP_HAWAII) + if (flush_texture_cache_nocpsch(q->device, qpd)) + pr_err("Failed to flush TC\n"); + + kfd_flush_tlb(qpd_to_pdd(qpd)); + + /* Release the vmid mapping */ + set_pasid_vmid_mapping(dqm, 0, qpd->vmid); + + dqm->vmid_bitmap |= (1 << bit); + qpd->vmid = 0; + q->properties.vmid = 0; +} + +static int create_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd) +{ + int retval; + + print_queue(q); + + dqm_lock(dqm); + + if (dqm->total_queue_count >= max_num_of_queues_per_device) { + pr_warn("Can't create new usermode queue because %d queues were already created\n", + dqm->total_queue_count); + retval = -EPERM; + goto out_unlock; + } + + if (list_empty(&qpd->queues_list)) { + retval = allocate_vmid(dqm, qpd, q); + if (retval) + goto out_unlock; + } + q->properties.vmid = qpd->vmid; + /* + * Eviction state logic: we only mark active queues as evicted + * to avoid the overhead of restoring inactive queues later + */ + if (qpd->evicted) + q->properties.is_evicted = (q->properties.queue_size > 0 && + q->properties.queue_percent > 0 && + q->properties.queue_address != 0); + + q->properties.tba_addr = qpd->tba_addr; + q->properties.tma_addr = qpd->tma_addr; + + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) + retval = create_compute_queue_nocpsch(dqm, q, qpd); + else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + retval = create_sdma_queue_nocpsch(dqm, q, qpd); + else + retval = -EINVAL; + + if (retval) { + if (list_empty(&qpd->queues_list)) + deallocate_vmid(dqm, qpd, q); + goto out_unlock; + } + + list_add(&q->list, &qpd->queues_list); + qpd->queue_count++; + if (q->properties.is_active) + dqm->queue_count++; + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + dqm->sdma_queue_count++; + + /* + * Unconditionally increment this counter, regardless of the queue's + * type or whether the queue is active. 
+ */ + dqm->total_queue_count++; + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + +out_unlock: + dqm_unlock(dqm); + return retval; +} + +static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) +{ + bool set; + int pipe, bit, i; + + set = false; + + for (pipe = dqm->next_pipe_to_allocate, i = 0; + i < get_pipes_per_mec(dqm); + pipe = ((pipe + 1) % get_pipes_per_mec(dqm)), ++i) { + + if (!is_pipe_enabled(dqm, 0, pipe)) + continue; + + if (dqm->allocated_queues[pipe] != 0) { + bit = ffs(dqm->allocated_queues[pipe]) - 1; + dqm->allocated_queues[pipe] &= ~(1 << bit); + q->pipe = pipe; + q->queue = bit; + set = true; + break; + } + } + + if (!set) + return -EBUSY; + + pr_debug("hqd slot - pipe %d, queue %d\n", q->pipe, q->queue); + /* horizontal hqd allocation */ + dqm->next_pipe_to_allocate = (pipe + 1) % get_pipes_per_mec(dqm); + + return 0; +} + +static inline void deallocate_hqd(struct device_queue_manager *dqm, + struct queue *q) +{ + dqm->allocated_queues[q->pipe] |= (1 << q->queue); +} + +static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd) +{ + struct mqd_manager *mqd_mgr; + int retval; + + mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); + if (!mqd_mgr) + return -ENOMEM; + + retval = allocate_hqd(dqm, q); + if (retval) + return retval; + + retval = allocate_doorbell(qpd, q); + if (retval) + goto out_deallocate_hqd; + + retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval) + goto out_deallocate_doorbell; + + pr_debug("Loading mqd to hqd on pipe %d, queue %d\n", + q->pipe, q->queue); + + dqm->dev->kfd2kgd->set_scratch_backing_va( + dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid); + + if (!q->properties.is_active) + return 0; + + if (WARN(q->process->mm != current->mm, + "should only run in user thread")) + retval = -EFAULT; + else + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue, + &q->properties, current->mm); + if (retval) + goto out_uninit_mqd; + + return 0; + +out_uninit_mqd: + mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); +out_deallocate_doorbell: + deallocate_doorbell(qpd, q); +out_deallocate_hqd: + deallocate_hqd(dqm, q); + + return retval; +} + +/* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked + * to avoid asynchronized access + */ +static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int retval; + struct mqd_manager *mqd_mgr; + + mqd_mgr = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd_mgr) + return -ENOMEM; + + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) { + deallocate_hqd(dqm, q); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + dqm->sdma_queue_count--; + deallocate_sdma_queue(dqm, q->sdma_id); + } else { + pr_debug("q->properties.type %d is invalid\n", + q->properties.type); + return -EINVAL; + } + dqm->total_queue_count--; + + deallocate_doorbell(qpd, q); + + retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, + KFD_PREEMPT_TYPE_WAVEFRONT_RESET, + KFD_UNMAP_LATENCY_MS, + q->pipe, q->queue); + if (retval == -ETIME) + qpd->reset_wavefronts = true; + + mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + + list_del(&q->list); + if (list_empty(&qpd->queues_list)) { + if (qpd->reset_wavefronts) { + pr_warn("Resetting wave fronts (nocpsch) on dev %p\n", + dqm->dev); + /* 
dbgdev_wave_reset_wavefronts has to be called before + * deallocate_vmid(), i.e. when vmid is still in use. + */ + dbgdev_wave_reset_wavefronts(dqm->dev, + qpd->pqm->process); + qpd->reset_wavefronts = false; + } + + deallocate_vmid(dqm, qpd, q); + } + qpd->queue_count--; + if (q->properties.is_active) + dqm->queue_count--; + + return retval; +} + +static int destroy_queue_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int retval; + + dqm_lock(dqm); + retval = destroy_queue_nocpsch_locked(dqm, qpd, q); + dqm_unlock(dqm); + + return retval; +} + +static int update_queue(struct device_queue_manager *dqm, struct queue *q) +{ + int retval; + struct mqd_manager *mqd_mgr; + struct kfd_process_device *pdd; + bool prev_active = false; + + dqm_lock(dqm); + pdd = kfd_get_process_device_data(q->device, q->process); + if (!pdd) { + retval = -ENODEV; + goto out_unlock; + } + mqd_mgr = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd_mgr) { + retval = -ENOMEM; + goto out_unlock; + } + /* + * Eviction state logic: we only mark active queues as evicted + * to avoid the overhead of restoring inactive queues later + */ + if (pdd->qpd.evicted) + q->properties.is_evicted = (q->properties.queue_size > 0 && + q->properties.queue_percent > 0 && + q->properties.queue_address != 0); + + /* Save previous activity state for counters */ + prev_active = q->properties.is_active; + + /* Make sure the queue is unmapped before updating the MQD */ + if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) { + retval = unmap_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + if (retval) { + pr_err("unmap queue failed\n"); + goto out_unlock; + } + } else if (prev_active && + (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || + q->properties.type == KFD_QUEUE_TYPE_SDMA)) { + retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, + KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, + KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); + if (retval) { + pr_err("destroy mqd failed\n"); + goto out_unlock; + } + } + + retval = mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties); + + /* + * check active state vs. the previous state and modify + * counter accordingly. map_queues_cpsch uses the + * dqm->queue_count to determine whether a new runlist must be + * uploaded. 
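+	 * (As the code below shows, map_queues_cpsch() is a no-op when
+	 * there are no active queues or processes, or when a runlist is
+	 * already mapped.)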
+ */ + if (q->properties.is_active && !prev_active) + dqm->queue_count++; + else if (!q->properties.is_active && prev_active) + dqm->queue_count--; + + if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) + retval = map_queues_cpsch(dqm); + else if (q->properties.is_active && + (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || + q->properties.type == KFD_QUEUE_TYPE_SDMA)) { + if (WARN(q->process->mm != current->mm, + "should only run in user thread")) + retval = -EFAULT; + else + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, + q->pipe, q->queue, + &q->properties, current->mm); + } + +out_unlock: + dqm_unlock(dqm); + return retval; +} + +static struct mqd_manager *get_mqd_manager( + struct device_queue_manager *dqm, enum KFD_MQD_TYPE type) +{ + struct mqd_manager *mqd_mgr; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + pr_debug("mqd type %d\n", type); + + mqd_mgr = dqm->mqd_mgrs[type]; + if (!mqd_mgr) { + mqd_mgr = mqd_manager_init(type, dqm->dev); + if (!mqd_mgr) + pr_err("mqd manager is NULL"); + dqm->mqd_mgrs[type] = mqd_mgr; + } + + return mqd_mgr; +} + +static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct queue *q; + struct mqd_manager *mqd_mgr; + struct kfd_process_device *pdd; + int retval = 0; + + dqm_lock(dqm); + if (qpd->evicted++ > 0) /* already evicted, do nothing */ + goto out; + + pdd = qpd_to_pdd(qpd); + pr_info_ratelimited("Evicting PASID %u queues\n", + pdd->process->pasid); + + /* unactivate all active queues on the qpd */ + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_active) + continue; + mqd_mgr = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd_mgr) { /* should not be here */ + pr_err("Cannot evict queue, mqd mgr is NULL\n"); + retval = -ENOMEM; + goto out; + } + q->properties.is_evicted = true; + q->properties.is_active = false; + retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, + KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, + KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); + if (retval) + goto out; + dqm->queue_count--; + } + +out: + dqm_unlock(dqm); + return retval; +} + +static int evict_process_queues_cpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct queue *q; + struct kfd_process_device *pdd; + int retval = 0; + + dqm_lock(dqm); + if (qpd->evicted++ > 0) /* already evicted, do nothing */ + goto out; + + pdd = qpd_to_pdd(qpd); + pr_info_ratelimited("Evicting PASID %u queues\n", + pdd->process->pasid); + + /* unactivate all active queues on the qpd */ + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_active) + continue; + q->properties.is_evicted = true; + q->properties.is_active = false; + dqm->queue_count--; + } + retval = execute_queues_cpsch(dqm, + qpd->is_debug ? 
+ KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES : + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + +out: + dqm_unlock(dqm); + return retval; +} + +static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct mm_struct *mm = NULL; + struct queue *q; + struct mqd_manager *mqd_mgr; + struct kfd_process_device *pdd; + uint32_t pd_base; + int retval = 0; + + pdd = qpd_to_pdd(qpd); + /* Retrieve PD base */ + pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); + + dqm_lock(dqm); + if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */ + goto out; + if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */ + qpd->evicted--; + goto out; + } + + pr_info_ratelimited("Restoring PASID %u queues\n", + pdd->process->pasid); + + /* Update PD Base in QPD */ + qpd->page_table_base = pd_base; + pr_debug("Updated PD address to 0x%08x\n", pd_base); + + if (!list_empty(&qpd->queues_list)) { + dqm->dev->kfd2kgd->set_vm_context_page_table_base( + dqm->dev->kgd, + qpd->vmid, + qpd->page_table_base); + kfd_flush_tlb(pdd); + } + + /* Take a safe reference to the mm_struct, which may otherwise + * disappear even while the kfd_process is still referenced. + */ + mm = get_task_mm(pdd->process->lead_thread); + if (!mm) { + retval = -EFAULT; + goto out; + } + + /* activate all active queues on the qpd */ + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_evicted) + continue; + mqd_mgr = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd_mgr) { /* should not be here */ + pr_err("Cannot restore queue, mqd mgr is NULL\n"); + retval = -ENOMEM; + goto out; + } + q->properties.is_evicted = false; + q->properties.is_active = true; + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, + q->queue, &q->properties, mm); + if (retval) + goto out; + dqm->queue_count++; + } + qpd->evicted = 0; +out: + if (mm) + mmput(mm); + dqm_unlock(dqm); + return retval; +} + +static int restore_process_queues_cpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct queue *q; + struct kfd_process_device *pdd; + uint32_t pd_base; + int retval = 0; + + pdd = qpd_to_pdd(qpd); + /* Retrieve PD base */ + pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); + + dqm_lock(dqm); + if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */ + goto out; + if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */ + qpd->evicted--; + goto out; + } + + pr_info_ratelimited("Restoring PASID %u queues\n", + pdd->process->pasid); + + /* Update PD Base in QPD */ + qpd->page_table_base = pd_base; + pr_debug("Updated PD address to 0x%08x\n", pd_base); + + /* activate all active queues on the qpd */ + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_evicted) + continue; + q->properties.is_evicted = false; + q->properties.is_active = true; + dqm->queue_count++; + } + retval = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + if (!retval) + qpd->evicted = 0; +out: + dqm_unlock(dqm); + return retval; +} + +static int register_process(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct device_process_node *n; + struct kfd_process_device *pdd; + uint32_t pd_base; + int retval; + + n = kzalloc(sizeof(*n), GFP_KERNEL); + if (!n) + return -ENOMEM; + + n->qpd = qpd; + + pdd = qpd_to_pdd(qpd); + /* Retrieve PD base */ + pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); + + dqm_lock(dqm); + 
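+	/*
+	 * With the DQM lock held: track the new process on dqm->queues,
+	 * cache its page directory base in the QPD, and let the ASIC-
+	 * specific update_qpd() callback refresh the per-process settings
+	 * (e.g. the SH_MEM configuration).  The first registered process
+	 * also takes the GPU out of compute-idle via set_compute_idle().
+	 */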
list_add(&n->list, &dqm->queues); + + /* Update PD Base in QPD */ + qpd->page_table_base = pd_base; + + retval = dqm->asic_ops.update_qpd(dqm, qpd); + + if (dqm->processes_count++ == 0) + dqm->dev->kfd2kgd->set_compute_idle(dqm->dev->kgd, false); + + dqm_unlock(dqm); + + return retval; +} + +static int unregister_process(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + int retval; + struct device_process_node *cur, *next; + + pr_debug("qpd->queues_list is %s\n", + list_empty(&qpd->queues_list) ? "empty" : "not empty"); + + retval = 0; + dqm_lock(dqm); + + list_for_each_entry_safe(cur, next, &dqm->queues, list) { + if (qpd == cur->qpd) { + list_del(&cur->list); + kfree(cur); + if (--dqm->processes_count == 0) + dqm->dev->kfd2kgd->set_compute_idle( + dqm->dev->kgd, true); + goto out; + } + } + /* qpd not found in dqm list */ + retval = 1; +out: + dqm_unlock(dqm); + return retval; +} + +static int +set_pasid_vmid_mapping(struct device_queue_manager *dqm, unsigned int pasid, + unsigned int vmid) +{ + uint32_t pasid_mapping; + + pasid_mapping = (pasid == 0) ? 0 : + (uint32_t)pasid | + ATC_VMID_PASID_MAPPING_VALID; + + return dqm->dev->kfd2kgd->set_pasid_vmid_mapping( + dqm->dev->kgd, pasid_mapping, + vmid); +} + +static void init_interrupts(struct device_queue_manager *dqm) +{ + unsigned int i; + + for (i = 0 ; i < get_pipes_per_mec(dqm) ; i++) + if (is_pipe_enabled(dqm, 0, i)) + dqm->dev->kfd2kgd->init_interrupts(dqm->dev->kgd, i); +} + +static int initialize_nocpsch(struct device_queue_manager *dqm) +{ + int pipe, queue; + + pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm)); + + dqm->allocated_queues = kcalloc(get_pipes_per_mec(dqm), + sizeof(unsigned int), GFP_KERNEL); + if (!dqm->allocated_queues) + return -ENOMEM; + + mutex_init(&dqm->lock_hidden); + INIT_LIST_HEAD(&dqm->queues); + dqm->queue_count = dqm->next_pipe_to_allocate = 0; + dqm->sdma_queue_count = 0; + + for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { + int pipe_offset = pipe * get_queues_per_pipe(dqm); + + for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) + if (test_bit(pipe_offset + queue, + dqm->dev->shared_resources.queue_bitmap)) + dqm->allocated_queues[pipe] |= 1 << queue; + } + + dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1; + dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1; + + return 0; +} + +static void uninitialize(struct device_queue_manager *dqm) +{ + int i; + + WARN_ON(dqm->queue_count > 0 || dqm->processes_count > 0); + + kfree(dqm->allocated_queues); + for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++) + kfree(dqm->mqd_mgrs[i]); + mutex_destroy(&dqm->lock_hidden); + kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem); +} + +static int start_nocpsch(struct device_queue_manager *dqm) +{ + init_interrupts(dqm); + return pm_init(&dqm->packets, dqm); +} + +static int stop_nocpsch(struct device_queue_manager *dqm) +{ + pm_uninit(&dqm->packets); + return 0; +} + +static int allocate_sdma_queue(struct device_queue_manager *dqm, + unsigned int *sdma_queue_id) +{ + int bit; + + if (dqm->sdma_bitmap == 0) + return -ENOMEM; + + bit = ffs(dqm->sdma_bitmap) - 1; + dqm->sdma_bitmap &= ~(1 << bit); + *sdma_queue_id = bit; + + return 0; +} + +static void deallocate_sdma_queue(struct device_queue_manager *dqm, + unsigned int sdma_queue_id) +{ + if (sdma_queue_id >= get_num_sdma_queues(dqm)) + return; + dqm->sdma_bitmap |= (1 << sdma_queue_id); +} + +static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd) +{ + 
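+	/*
+	 * The sdma_id allocated below is split across the SDMA engines:
+	 * sdma_queue_id = sdma_id / num_engines and
+	 * sdma_engine_id = sdma_id % num_engines.  As an illustration,
+	 * with two engines and two queues per engine, sdma_id 0..3 maps
+	 * to (engine, queue) = (0, 0), (1, 0), (0, 1), (1, 1).
+	 */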
struct mqd_manager *mqd_mgr; + int retval; + + mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA); + if (!mqd_mgr) + return -ENOMEM; + + retval = allocate_sdma_queue(dqm, &q->sdma_id); + if (retval) + return retval; + + q->properties.sdma_queue_id = q->sdma_id / get_num_sdma_engines(dqm); + q->properties.sdma_engine_id = q->sdma_id % get_num_sdma_engines(dqm); + + retval = allocate_doorbell(qpd, q); + if (retval) + goto out_deallocate_sdma_queue; + + pr_debug("SDMA id is: %d\n", q->sdma_id); + pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); + pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); + + dqm->asic_ops.init_sdma_vm(dqm, q, qpd); + retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval) + goto out_deallocate_doorbell; + + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties, + NULL); + if (retval) + goto out_uninit_mqd; + + return 0; + +out_uninit_mqd: + mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); +out_deallocate_doorbell: + deallocate_doorbell(qpd, q); +out_deallocate_sdma_queue: + deallocate_sdma_queue(dqm, q->sdma_id); + + return retval; +} + +/* + * Device Queue Manager implementation for cp scheduler + */ + +static int set_sched_resources(struct device_queue_manager *dqm) +{ + int i, mec; + struct scheduling_resources res; + + res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap; + + res.queue_mask = 0; + for (i = 0; i < KGD_MAX_QUEUES; ++i) { + mec = (i / dqm->dev->shared_resources.num_queue_per_pipe) + / dqm->dev->shared_resources.num_pipe_per_mec; + + if (!test_bit(i, dqm->dev->shared_resources.queue_bitmap)) + continue; + + /* only acquire queues from the first MEC */ + if (mec > 0) + continue; + + /* This situation may be hit in the future if a new HW + * generation exposes more than 64 queues. 
If so, the + * definition of res.queue_mask needs updating + */ + if (WARN_ON(i >= (sizeof(res.queue_mask)*8))) { + pr_err("Invalid queue enabled by amdgpu: %d\n", i); + break; + } + + res.queue_mask |= (1ull << i); + } + res.gws_mask = res.oac_mask = res.gds_heap_base = + res.gds_heap_size = 0; + + pr_debug("Scheduling resources:\n" + "vmid mask: 0x%8X\n" + "queue mask: 0x%8llX\n", + res.vmid_mask, res.queue_mask); + + return pm_send_set_resources(&dqm->packets, &res); +} + +static int initialize_cpsch(struct device_queue_manager *dqm) +{ + pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm)); + + mutex_init(&dqm->lock_hidden); + INIT_LIST_HEAD(&dqm->queues); + dqm->queue_count = dqm->processes_count = 0; + dqm->sdma_queue_count = 0; + dqm->active_runlist = false; + dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1; + + INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception); + + return 0; +} + +static int start_cpsch(struct device_queue_manager *dqm) +{ + int retval; + + retval = 0; + + retval = pm_init(&dqm->packets, dqm); + if (retval) + goto fail_packet_manager_init; + + retval = set_sched_resources(dqm); + if (retval) + goto fail_set_sched_resources; + + pr_debug("Allocating fence memory\n"); + + /* allocate fence memory on the gart */ + retval = kfd_gtt_sa_allocate(dqm->dev, sizeof(*dqm->fence_addr), + &dqm->fence_mem); + + if (retval) + goto fail_allocate_vidmem; + + dqm->fence_addr = dqm->fence_mem->cpu_ptr; + dqm->fence_gpu_addr = dqm->fence_mem->gpu_addr; + + init_interrupts(dqm); + + dqm_lock(dqm); + /* clear hang status when driver try to start the hw scheduler */ + dqm->is_hws_hang = false; + execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + dqm_unlock(dqm); + + return 0; +fail_allocate_vidmem: +fail_set_sched_resources: + pm_uninit(&dqm->packets); +fail_packet_manager_init: + return retval; +} + +static int stop_cpsch(struct device_queue_manager *dqm) +{ + dqm_lock(dqm); + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); + dqm_unlock(dqm); + + pm_release_ib(&dqm->packets); + + kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); + pm_uninit(&dqm->packets); + + return 0; +} + +static int create_kernel_queue_cpsch(struct device_queue_manager *dqm, + struct kernel_queue *kq, + struct qcm_process_device *qpd) +{ + dqm_lock(dqm); + if (dqm->total_queue_count >= max_num_of_queues_per_device) { + pr_warn("Can't create new kernel queue because %d queues were already created\n", + dqm->total_queue_count); + dqm_unlock(dqm); + return -EPERM; + } + + /* + * Unconditionally increment this counter, regardless of the queue's + * type or whether the queue is active. + */ + dqm->total_queue_count++; + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + + list_add(&kq->list, &qpd->priv_queue_list); + dqm->queue_count++; + qpd->is_debug = true; + execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + dqm_unlock(dqm); + + return 0; +} + +static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, + struct kernel_queue *kq, + struct qcm_process_device *qpd) +{ + dqm_lock(dqm); + list_del(&kq->list); + dqm->queue_count--; + qpd->is_debug = false; + execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); + /* + * Unconditionally decrement this counter, regardless of the queue's + * type. 
+ */ + dqm->total_queue_count--; + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + dqm_unlock(dqm); +} + +static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + int retval; + struct mqd_manager *mqd_mgr; + + retval = 0; + + dqm_lock(dqm); + + if (dqm->total_queue_count >= max_num_of_queues_per_device) { + pr_warn("Can't create new usermode queue because %d queues were already created\n", + dqm->total_queue_count); + retval = -EPERM; + goto out_unlock; + } + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + retval = allocate_sdma_queue(dqm, &q->sdma_id); + if (retval) + goto out_unlock; + q->properties.sdma_queue_id = + q->sdma_id / get_num_sdma_engines(dqm); + q->properties.sdma_engine_id = + q->sdma_id % get_num_sdma_engines(dqm); + } + + retval = allocate_doorbell(qpd, q); + if (retval) + goto out_deallocate_sdma_queue; + + mqd_mgr = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + + if (!mqd_mgr) { + retval = -ENOMEM; + goto out_deallocate_doorbell; + } + /* + * Eviction state logic: we only mark active queues as evicted + * to avoid the overhead of restoring inactive queues later + */ + if (qpd->evicted) + q->properties.is_evicted = (q->properties.queue_size > 0 && + q->properties.queue_percent > 0 && + q->properties.queue_address != 0); + + dqm->asic_ops.init_sdma_vm(dqm, q, qpd); + + q->properties.tba_addr = qpd->tba_addr; + q->properties.tma_addr = qpd->tma_addr; + retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval) + goto out_deallocate_doorbell; + + list_add(&q->list, &qpd->queues_list); + qpd->queue_count++; + if (q->properties.is_active) { + dqm->queue_count++; + retval = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + } + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + dqm->sdma_queue_count++; + /* + * Unconditionally increment this counter, regardless of the queue's + * type or whether the queue is active. + */ + dqm->total_queue_count++; + + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + + dqm_unlock(dqm); + return retval; + +out_deallocate_doorbell: + deallocate_doorbell(qpd, q); +out_deallocate_sdma_queue: + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + deallocate_sdma_queue(dqm, q->sdma_id); +out_unlock: + dqm_unlock(dqm); + + return retval; +} + +int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + unsigned int fence_value, + unsigned int timeout_ms) +{ + unsigned long end_jiffies = msecs_to_jiffies(timeout_ms) + jiffies; + + while (*fence_addr != fence_value) { + if (time_after(jiffies, end_jiffies)) { + pr_err("qcm fence wait loop timeout expired\n"); + /* In HWS case, this is used to halt the driver thread + * in order not to mess up CP states before doing + * scandumps for FW debugging. 
+ */ + while (halt_if_hws_hang) + schedule(); + + return -ETIME; + } + schedule(); + } + + return 0; +} + +static int unmap_sdma_queues(struct device_queue_manager *dqm) +{ + int i, retval = 0; + + for (i = 0; i < dqm->dev->device_info->num_sdma_engines; i++) { + retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, i); + if (retval) + return retval; + } + return retval; +} + +/* dqm->lock mutex has to be locked before calling this function */ +static int map_queues_cpsch(struct device_queue_manager *dqm) +{ + int retval; + + if (dqm->queue_count <= 0 || dqm->processes_count <= 0) + return 0; + + if (dqm->active_runlist) + return 0; + + retval = pm_send_runlist(&dqm->packets, &dqm->queues); + if (retval) { + pr_err("failed to execute runlist\n"); + return retval; + } + dqm->active_runlist = true; + + return retval; +} + +/* dqm->lock mutex has to be locked before calling this function */ +static int unmap_queues_cpsch(struct device_queue_manager *dqm, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param) +{ + int retval = 0; + + if (dqm->is_hws_hang) + return -EIO; + if (!dqm->active_runlist) + return retval; + + pr_debug("Before destroying queues, sdma queue count is : %u\n", + dqm->sdma_queue_count); + + if (dqm->sdma_queue_count > 0) + unmap_sdma_queues(dqm); + + retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE, + filter, filter_param, false, 0); + if (retval) + return retval; + + *dqm->fence_addr = KFD_FENCE_INIT; + pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr, + KFD_FENCE_COMPLETED); + /* should be timed out */ + retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, + QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS); + if (retval) + return retval; + + pm_release_ib(&dqm->packets); + dqm->active_runlist = false; + + return retval; +} + +/* dqm->lock mutex has to be locked before calling this function */ +static int execute_queues_cpsch(struct device_queue_manager *dqm, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param) +{ + int retval; + + if (dqm->is_hws_hang) + return -EIO; + retval = unmap_queues_cpsch(dqm, filter, filter_param); + if (retval) { + pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n"); + dqm->is_hws_hang = true; + schedule_work(&dqm->hw_exception_work); + return retval; + } + + return map_queues_cpsch(dqm); +} + +static int destroy_queue_cpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int retval; + struct mqd_manager *mqd_mgr; + bool preempt_all_queues; + + preempt_all_queues = false; + + retval = 0; + + /* remove queue from list to prevent rescheduling after preemption */ + dqm_lock(dqm); + + if (qpd->is_debug) { + /* + * error, currently we do not allow to destroy a queue + * of a currently debugged process + */ + retval = -EBUSY; + goto failed_try_destroy_debugged_queue; + + } + + mqd_mgr = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd_mgr) { + retval = -ENOMEM; + goto failed; + } + + deallocate_doorbell(qpd, q); + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + dqm->sdma_queue_count--; + deallocate_sdma_queue(dqm, q->sdma_id); + } + + list_del(&q->list); + qpd->queue_count--; + if (q->properties.is_active) { + dqm->queue_count--; + retval = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + if (retval == -ETIME) + qpd->reset_wavefronts = true; + } + + mqd_mgr->uninit_mqd(mqd_mgr, 
q->mqd, q->mqd_mem_obj); + + /* + * Unconditionally decrement this counter, regardless of the queue's + * type + */ + dqm->total_queue_count--; + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + + dqm_unlock(dqm); + + return retval; + +failed: +failed_try_destroy_debugged_queue: + + dqm_unlock(dqm); + return retval; +} + +/* + * Low bits must be 0000/FFFF as required by HW, high bits must be 0 to + * stay in user mode. + */ +#define APE1_FIXED_BITS_MASK 0xFFFF80000000FFFFULL +/* APE1 limit is inclusive and 64K aligned. */ +#define APE1_LIMIT_ALIGNMENT 0xFFFF + +static bool set_cache_memory_policy(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) +{ + bool retval = true; + + if (!dqm->asic_ops.set_cache_memory_policy) + return retval; + + dqm_lock(dqm); + + if (alternate_aperture_size == 0) { + /* base > limit disables APE1 */ + qpd->sh_mem_ape1_base = 1; + qpd->sh_mem_ape1_limit = 0; + } else { + /* + * In FSA64, APE1_Base[63:0] = { 16{SH_MEM_APE1_BASE[31]}, + * SH_MEM_APE1_BASE[31:0], 0x0000 } + * APE1_Limit[63:0] = { 16{SH_MEM_APE1_LIMIT[31]}, + * SH_MEM_APE1_LIMIT[31:0], 0xFFFF } + * Verify that the base and size parameters can be + * represented in this format and convert them. + * Additionally restrict APE1 to user-mode addresses. + */ + + uint64_t base = (uintptr_t)alternate_aperture_base; + uint64_t limit = base + alternate_aperture_size - 1; + + if (limit <= base || (base & APE1_FIXED_BITS_MASK) != 0 || + (limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) { + retval = false; + goto out; + } + + qpd->sh_mem_ape1_base = base >> 16; + qpd->sh_mem_ape1_limit = limit >> 16; + } + + retval = dqm->asic_ops.set_cache_memory_policy( + dqm, + qpd, + default_policy, + alternate_policy, + alternate_aperture_base, + alternate_aperture_size); + + if ((dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0)) + program_sh_mem_settings(dqm, qpd); + + pr_debug("sh_mem_config: 0x%x, ape1_base: 0x%x, ape1_limit: 0x%x\n", + qpd->sh_mem_config, qpd->sh_mem_ape1_base, + qpd->sh_mem_ape1_limit); + +out: + dqm_unlock(dqm); + return retval; +} + +static int set_trap_handler(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + uint64_t tba_addr, + uint64_t tma_addr) +{ + uint64_t *tma; + + if (dqm->dev->cwsr_enabled) { + /* Jump from CWSR trap handler to user trap */ + tma = (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET); + tma[0] = tba_addr; + tma[1] = tma_addr; + } else { + qpd->tba_addr = tba_addr; + qpd->tma_addr = tma_addr; + } + + return 0; +} + +static int process_termination_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct queue *q, *next; + struct device_process_node *cur, *next_dpn; + int retval = 0; + + dqm_lock(dqm); + + /* Clear all user mode queues */ + list_for_each_entry_safe(q, next, &qpd->queues_list, list) { + int ret; + + ret = destroy_queue_nocpsch_locked(dqm, qpd, q); + if (ret) + retval = ret; + } + + /* Unregister process */ + list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { + if (qpd == cur->qpd) { + list_del(&cur->list); + kfree(cur); + dqm->processes_count--; + break; + } + } + + dqm_unlock(dqm); + return retval; +} + + +static int process_termination_cpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + int retval; + struct queue *q, *next; + struct kernel_queue *kq, 
*kq_next; + struct mqd_manager *mqd_mgr; + struct device_process_node *cur, *next_dpn; + enum kfd_unmap_queues_filter filter = + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES; + + retval = 0; + + dqm_lock(dqm); + + /* Clean all kernel queues */ + list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) { + list_del(&kq->list); + dqm->queue_count--; + qpd->is_debug = false; + dqm->total_queue_count--; + filter = KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES; + } + + /* Clear all user mode queues */ + list_for_each_entry(q, &qpd->queues_list, list) { + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + dqm->sdma_queue_count--; + deallocate_sdma_queue(dqm, q->sdma_id); + } + + if (q->properties.is_active) + dqm->queue_count--; + + dqm->total_queue_count--; + } + + /* Unregister process */ + list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) { + if (qpd == cur->qpd) { + list_del(&cur->list); + kfree(cur); + dqm->processes_count--; + break; + } + } + + retval = execute_queues_cpsch(dqm, filter, 0); + if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) { + pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev); + dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process); + qpd->reset_wavefronts = false; + } + + /* lastly, free mqd resources */ + list_for_each_entry_safe(q, next, &qpd->queues_list, list) { + mqd_mgr = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd_mgr) { + retval = -ENOMEM; + goto out; + } + list_del(&q->list); + qpd->queue_count--; + mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + } + +out: + dqm_unlock(dqm); + return retval; +} + +struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) +{ + struct device_queue_manager *dqm; + + pr_debug("Loading device queue manager\n"); + + dqm = kzalloc(sizeof(*dqm), GFP_KERNEL); + if (!dqm) + return NULL; + + switch (dev->device_info->asic_family) { + /* HWS is not available on Hawaii. */ + case CHIP_HAWAII: + /* HWS depends on CWSR for timely dequeue. CWSR is not + * available on Tonga. + * + * FIXME: This argument also applies to Kaveri. 
+ */ + case CHIP_TONGA: + dqm->sched_policy = KFD_SCHED_POLICY_NO_HWS; + break; + default: + dqm->sched_policy = sched_policy; + break; + } + + dqm->dev = dev; + switch (dqm->sched_policy) { + case KFD_SCHED_POLICY_HWS: + case KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION: + /* initialize dqm for cp scheduling */ + dqm->ops.create_queue = create_queue_cpsch; + dqm->ops.initialize = initialize_cpsch; + dqm->ops.start = start_cpsch; + dqm->ops.stop = stop_cpsch; + dqm->ops.destroy_queue = destroy_queue_cpsch; + dqm->ops.update_queue = update_queue; + dqm->ops.get_mqd_manager = get_mqd_manager; + dqm->ops.register_process = register_process; + dqm->ops.unregister_process = unregister_process; + dqm->ops.uninitialize = uninitialize; + dqm->ops.create_kernel_queue = create_kernel_queue_cpsch; + dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch; + dqm->ops.set_cache_memory_policy = set_cache_memory_policy; + dqm->ops.set_trap_handler = set_trap_handler; + dqm->ops.process_termination = process_termination_cpsch; + dqm->ops.evict_process_queues = evict_process_queues_cpsch; + dqm->ops.restore_process_queues = restore_process_queues_cpsch; + break; + case KFD_SCHED_POLICY_NO_HWS: + /* initialize dqm for no cp scheduling */ + dqm->ops.start = start_nocpsch; + dqm->ops.stop = stop_nocpsch; + dqm->ops.create_queue = create_queue_nocpsch; + dqm->ops.destroy_queue = destroy_queue_nocpsch; + dqm->ops.update_queue = update_queue; + dqm->ops.get_mqd_manager = get_mqd_manager; + dqm->ops.register_process = register_process; + dqm->ops.unregister_process = unregister_process; + dqm->ops.initialize = initialize_nocpsch; + dqm->ops.uninitialize = uninitialize; + dqm->ops.set_cache_memory_policy = set_cache_memory_policy; + dqm->ops.set_trap_handler = set_trap_handler; + dqm->ops.process_termination = process_termination_nocpsch; + dqm->ops.evict_process_queues = evict_process_queues_nocpsch; + dqm->ops.restore_process_queues = + restore_process_queues_nocpsch; + break; + default: + pr_err("Invalid scheduling policy %d\n", dqm->sched_policy); + goto out_free; + } + + switch (dev->device_info->asic_family) { + case CHIP_CARRIZO: + device_queue_manager_init_vi(&dqm->asic_ops); + break; + + case CHIP_KAVERI: + device_queue_manager_init_cik(&dqm->asic_ops); + break; + + case CHIP_HAWAII: + device_queue_manager_init_cik_hawaii(&dqm->asic_ops); + break; + + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + device_queue_manager_init_vi_tonga(&dqm->asic_ops); + break; + + case CHIP_VEGA10: + case CHIP_RAVEN: + device_queue_manager_init_v9(&dqm->asic_ops); + break; + default: + WARN(1, "Unexpected ASIC family %u", + dev->device_info->asic_family); + goto out_free; + } + + if (!dqm->ops.initialize(dqm)) + return dqm; + +out_free: + kfree(dqm); + return NULL; +} + +void device_queue_manager_uninit(struct device_queue_manager *dqm) +{ + dqm->ops.uninitialize(dqm); + kfree(dqm); +} + +int kfd_process_vm_fault(struct device_queue_manager *dqm, + unsigned int pasid) +{ + struct kfd_process_device *pdd; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + int ret = 0; + + if (!p) + return -EINVAL; + pdd = kfd_get_process_device_data(dqm->dev, p); + if (pdd) + ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); + kfd_unref_process(p); + + return ret; +} + +static void kfd_process_hw_exception(struct work_struct *work) +{ + struct device_queue_manager *dqm = container_of(work, + struct device_queue_manager, hw_exception_work); + dqm->dev->kfd2kgd->gpu_recover(dqm->dev->kgd); +} + 
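+/*
+ * The debugfs helpers below dump HQD and SDMA RLC registers obtained from the
+ * kfd2kgd hqd_dump()/hqd_sdma_dump() callbacks.  seq_reg_dump() coalesces
+ * registers at consecutive addresses onto one line (up to eight values per
+ * line), printing the address only at the start of each run, e.g.
+ * "00001234: 00000001 00000002 ...".
+ */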
+#if defined(CONFIG_DEBUG_FS) + +static void seq_reg_dump(struct seq_file *m, + uint32_t (*dump)[2], uint32_t n_regs) +{ + uint32_t i, count; + + for (i = 0, count = 0; i < n_regs; i++) { + if (count == 0 || + dump[i-1][0] + sizeof(uint32_t) != dump[i][0]) { + seq_printf(m, "%s %08x: %08x", + i ? "\n" : "", + dump[i][0], dump[i][1]); + count = 7; + } else { + seq_printf(m, " %08x", dump[i][1]); + count--; + } + } + + seq_puts(m, "\n"); +} + +int dqm_debugfs_hqds(struct seq_file *m, void *data) +{ + struct device_queue_manager *dqm = data; + uint32_t (*dump)[2], n_regs; + int pipe, queue; + int r = 0; + + r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd, + KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, &dump, &n_regs); + if (!r) { + seq_printf(m, " HIQ on MEC %d Pipe %d Queue %d\n", + KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1, + KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm), + KFD_CIK_HIQ_QUEUE); + seq_reg_dump(m, dump, n_regs); + + kfree(dump); + } + + for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { + int pipe_offset = pipe * get_queues_per_pipe(dqm); + + for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) { + if (!test_bit(pipe_offset + queue, + dqm->dev->shared_resources.queue_bitmap)) + continue; + + r = dqm->dev->kfd2kgd->hqd_dump( + dqm->dev->kgd, pipe, queue, &dump, &n_regs); + if (r) + break; + + seq_printf(m, " CP Pipe %d, Queue %d\n", + pipe, queue); + seq_reg_dump(m, dump, n_regs); + + kfree(dump); + } + } + + for (pipe = 0; pipe < get_num_sdma_engines(dqm); pipe++) { + for (queue = 0; queue < KFD_SDMA_QUEUES_PER_ENGINE; queue++) { + r = dqm->dev->kfd2kgd->hqd_sdma_dump( + dqm->dev->kgd, pipe, queue, &dump, &n_regs); + if (r) + break; + + seq_printf(m, " SDMA Engine %d, RLC %d\n", + pipe, queue); + seq_reg_dump(m, dump, n_regs); + + kfree(dump); + } + } + + return r; +} + +int dqm_debugfs_execute_queues(struct device_queue_manager *dqm) +{ + int r = 0; + + dqm_lock(dqm); + dqm->active_runlist = true; + r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); + dqm_unlock(dqm); + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h new file mode 100644 index 000000000..00da3169a --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -0,0 +1,242 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ *
+ */
+
+#ifndef KFD_DEVICE_QUEUE_MANAGER_H_
+#define KFD_DEVICE_QUEUE_MANAGER_H_
+
+#include <linux/rwsem.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/sched/mm.h>
+#include "kfd_priv.h"
+#include "kfd_mqd_manager.h"
+
+#define KFD_UNMAP_LATENCY_MS (4000)
+#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
+#define KFD_SDMA_QUEUES_PER_ENGINE (2)
+
+struct device_process_node {
+ struct qcm_process_device *qpd;
+ struct list_head list;
+};
+
+/**
+ * struct device_queue_manager_ops
+ *
+ * @create_queue: Queue creation routine.
+ *
+ * @destroy_queue: Queue destruction routine.
+ *
+ * @update_queue: Queue update routine.
+ *
+ * @get_mqd_manager: Returns the mqd manager according to the mqd type.
+ *
+ * @execute_queues: Dispatches the queues list to the H/W.
+ *
+ * @register_process: This routine associates a specific process with the
+ * device.
+ *
+ * @unregister_process: Destroys the association between a process and the
+ * device.
+ *
+ * @initialize: Initializes the pipelines and memory module for that device.
+ *
+ * @start: Initializes the resources/modules the device needs for queue
+ * execution. This function is called on device initialization and after the
+ * system wakes up from suspension.
+ *
+ * @stop: This routine stops execution of all the active queues running on the
+ * H/W; it is called on system suspend.
+ *
+ * @uninitialize: Destroys all the device queue manager resources allocated in
+ * initialize routine.
+ *
+ * @create_kernel_queue: Creates kernel queue. Used for debug queue.
+ *
+ * @destroy_kernel_queue: Destroys kernel queue. Used for debug queue.
+ *
+ * @set_cache_memory_policy: Sets memory policy (cached/non-cached) for the
+ * memory apertures.
+ *
+ * @process_termination: Clears all process queues belonging to that device.
+ *
+ * @evict_process_queues: Evict all active queues of a process
+ *
+ * @restore_process_queues: Restore all evicted queues of a process
+ *
+ */
+
+struct device_queue_manager_ops {
+ int (*create_queue)(struct device_queue_manager *dqm,
+    struct queue *q,
+    struct qcm_process_device *qpd);
+
+ int (*destroy_queue)(struct device_queue_manager *dqm,
+    struct qcm_process_device *qpd,
+    struct queue *q);
+
+ int (*update_queue)(struct device_queue_manager *dqm,
+    struct queue *q);
+
+ struct mqd_manager * (*get_mqd_manager)
+     (struct device_queue_manager *dqm,
+      enum KFD_MQD_TYPE type);
+
+ int (*register_process)(struct device_queue_manager *dqm,
+     struct qcm_process_device *qpd);
+
+ int (*unregister_process)(struct device_queue_manager *dqm,
+     struct qcm_process_device *qpd);
+
+ int (*initialize)(struct device_queue_manager *dqm);
+ int (*start)(struct device_queue_manager *dqm);
+ int (*stop)(struct device_queue_manager *dqm);
+ void (*uninitialize)(struct device_queue_manager *dqm);
+ int (*create_kernel_queue)(struct device_queue_manager *dqm,
+     struct kernel_queue *kq,
+     struct qcm_process_device *qpd);
+
+ void (*destroy_kernel_queue)(struct device_queue_manager *dqm,
+     struct kernel_queue *kq,
+     struct qcm_process_device *qpd);
+
+ bool (*set_cache_memory_policy)(struct device_queue_manager *dqm,
+        struct qcm_process_device *qpd,
+        enum cache_policy default_policy,
+        enum cache_policy alternate_policy,
+        void __user *alternate_aperture_base,
+        uint64_t alternate_aperture_size);
+
+ int (*set_trap_handler)(struct device_queue_manager *dqm,
+    struct qcm_process_device *qpd,
+    uint64_t tba_addr,
+    uint64_t tma_addr);
+
+ int (*process_termination)(struct device_queue_manager *dqm,
+   struct qcm_process_device *qpd);
+
+ int (*evict_process_queues)(struct device_queue_manager *dqm,
+        struct qcm_process_device *qpd);
+ int (*restore_process_queues)(struct device_queue_manager *dqm,
+          struct qcm_process_device *qpd);
+};
+
+struct device_queue_manager_asic_ops {
+ int (*update_qpd)(struct device_queue_manager *dqm,
+   struct qcm_process_device *qpd);
+ bool (*set_cache_memory_policy)(struct device_queue_manager *dqm,
+        struct qcm_process_device *qpd,
+        enum cache_policy default_policy,
+        enum cache_policy alternate_policy,
+        void __user *alternate_aperture_base,
+        uint64_t alternate_aperture_size);
+ void (*init_sdma_vm)(struct device_queue_manager *dqm,
+   struct queue *q,
+   struct qcm_process_device *qpd);
+};
+
+/**
+ * struct device_queue_manager
+ *
+ * This struct is a base class for the kfd queues scheduler at the
+ * device level. The device base class should expose the basic operations
+ * for queue creation and queue destruction. This base class hides the
+ * scheduling mode of the driver and the specific implementation of the
+ * concrete device. This class is the only class in the queues scheduler
+ * that configures the H/W.
+ * + */ + +struct device_queue_manager { + struct device_queue_manager_ops ops; + struct device_queue_manager_asic_ops asic_ops; + + struct mqd_manager *mqd_mgrs[KFD_MQD_TYPE_MAX]; + struct packet_manager packets; + struct kfd_dev *dev; + struct mutex lock_hidden; /* use dqm_lock/unlock(dqm) */ + struct list_head queues; + unsigned int saved_flags; + unsigned int processes_count; + unsigned int queue_count; + unsigned int sdma_queue_count; + unsigned int total_queue_count; + unsigned int next_pipe_to_allocate; + unsigned int *allocated_queues; + unsigned int sdma_bitmap; + unsigned int vmid_bitmap; + uint64_t pipelines_addr; + struct kfd_mem_obj *pipeline_mem; + uint64_t fence_gpu_addr; + unsigned int *fence_addr; + struct kfd_mem_obj *fence_mem; + bool active_runlist; + int sched_policy; + + /* hw exception */ + bool is_hws_hang; + struct work_struct hw_exception_work; +}; + +void device_queue_manager_init_cik( + struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_cik_hawaii( + struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_vi( + struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_vi_tonga( + struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_v9( + struct device_queue_manager_asic_ops *asic_ops); +void program_sh_mem_settings(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +unsigned int get_queues_num(struct device_queue_manager *dqm); +unsigned int get_queues_per_pipe(struct device_queue_manager *dqm); +unsigned int get_pipes_per_mec(struct device_queue_manager *dqm); +unsigned int get_num_sdma_queues(struct device_queue_manager *dqm); + +static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) +{ + return (pdd->lds_base >> 16) & 0xFF; +} + +static inline unsigned int +get_sh_mem_bases_nybble_64(struct kfd_process_device *pdd) +{ + return (pdd->lds_base >> 60) & 0x0E; +} + +/* The DQM lock can be taken in MMU notifiers. Make sure no reclaim-FS + * happens while holding this lock anywhere to prevent deadlocks when + * an MMU notifier runs in reclaim-FS context. + */ +static inline void dqm_lock(struct device_queue_manager *dqm) +{ + mutex_lock(&dqm->lock_hidden); + dqm->saved_flags = memalloc_nofs_save(); +} +static inline void dqm_unlock(struct device_queue_manager *dqm) +{ + memalloc_nofs_restore(dqm->saved_flags); + mutex_unlock(&dqm->lock_hidden); +} + +#endif /* KFD_DEVICE_QUEUE_MANAGER_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c new file mode 100644 index 000000000..aed4c2141 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c @@ -0,0 +1,205 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_device_queue_manager.h" +#include "cik_regs.h" +#include "oss/oss_2_4_sh_mask.h" +#include "gca/gfx_7_2_sh_mask.h" + +static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size); +static int update_qpd_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static int update_qpd_cik_hawaii(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); +static void init_sdma_vm_hawaii(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); + +void device_queue_manager_init_cik( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; + asic_ops->update_qpd = update_qpd_cik; + asic_ops->init_sdma_vm = init_sdma_vm; +} + +void device_queue_manager_init_cik_hawaii( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; + asic_ops->update_qpd = update_qpd_cik_hawaii; + asic_ops->init_sdma_vm = init_sdma_vm_hawaii; +} + +static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) +{ + /* In 64-bit mode, we can only control the top 3 bits of the LDS, + * scratch and GPUVM apertures. + * The hardware fills in the remaining 59 bits according to the + * following pattern: + * LDS: X0000000'00000000 - X0000001'00000000 (4GB) + * Scratch: X0000001'00000000 - X0000002'00000000 (4GB) + * GPUVM: Y0010000'00000000 - Y0020000'00000000 (1TB) + * + * (where X/Y is the configurable nybble with the low-bit 0) + * + * LDS and scratch will have the same top nybble programmed in the + * top 3 bits of SH_MEM_BASES.PRIVATE_BASE. + * GPUVM can have a different top nybble programmed in the + * top 3 bits of SH_MEM_BASES.SHARED_BASE. + * We don't bother to support different top nybbles + * for LDS/Scratch and GPUVM. + */ + + WARN_ON((top_address_nybble & 1) || top_address_nybble > 0xE || + top_address_nybble == 0); + + return PRIVATE_BASE(top_address_nybble << 12) | + SHARED_BASE(top_address_nybble << 12); +} + +static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) +{ + uint32_t default_mtype; + uint32_t ape1_mtype; + + default_mtype = (default_policy == cache_policy_coherent) ? + MTYPE_NONCACHED : + MTYPE_CACHED; + + ape1_mtype = (alternate_policy == cache_policy_coherent) ? 
+ MTYPE_NONCACHED : + MTYPE_CACHED; + + qpd->sh_mem_config = (qpd->sh_mem_config & PTR32) + | ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) + | DEFAULT_MTYPE(default_mtype) + | APE1_MTYPE(ape1_mtype); + + return true; +} + +static int update_qpd_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + unsigned int temp; + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) | + DEFAULT_MTYPE(MTYPE_NONCACHED) | + APE1_MTYPE(MTYPE_NONCACHED); + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + if (qpd->pqm->process->is_32bit_user_mode) { + temp = get_sh_mem_bases_32(pdd); + qpd->sh_mem_bases = SHARED_BASE(temp); + qpd->sh_mem_config |= PTR32; + } else { + temp = get_sh_mem_bases_nybble_64(pdd); + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); + qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; + } + + pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", + qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); + + return 0; +} + +static int update_qpd_cik_hawaii(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + unsigned int temp; + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) | + DEFAULT_MTYPE(MTYPE_NONCACHED) | + APE1_MTYPE(MTYPE_NONCACHED); + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + /* On dGPU we're always in GPUVM64 addressing mode with 64-bit + * aperture addresses. + */ + temp = get_sh_mem_bases_nybble_64(pdd); + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); + + pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", + qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); + + return 0; +} + +static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + uint32_t value = (1 << SDMA0_RLC0_VIRTUAL_ADDR__ATC__SHIFT); + + if (q->process->is_32bit_user_mode) + value |= (1 << SDMA0_RLC0_VIRTUAL_ADDR__PTR32__SHIFT) | + get_sh_mem_bases_32(qpd_to_pdd(qpd)); + else + value |= ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; + + q->properties.sdma_vm_addr = value; +} + +static void init_sdma_vm_hawaii(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd) +{ + /* On dGPU we're always in GPUVM64 addressing mode with 64-bit + * aperture addresses. + */ + q->properties.sdma_vm_addr = + ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c new file mode 100644 index 000000000..417515332 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c @@ -0,0 +1,84 @@ +/* + * Copyright 2016-2018 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_device_queue_manager.h" +#include "vega10_enum.h" +#include "gc/gc_9_0_offset.h" +#include "gc/gc_9_0_sh_mask.h" +#include "sdma0/sdma0_4_0_sh_mask.h" + +static int update_qpd_v9(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); + +void device_queue_manager_init_v9( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->update_qpd = update_qpd_v9; + asic_ops->init_sdma_vm = init_sdma_vm_v9; +} + +static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) +{ + uint32_t shared_base = pdd->lds_base >> 48; + uint32_t private_base = pdd->scratch_base >> 48; + + return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | + private_base; +} + +static int update_qpd_v9(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; + if (noretry && + !dqm->dev->device_info->needs_iommu_device) + qpd->sh_mem_config |= + 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; + + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); + + pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); + + return 0; +} + +static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + /* Not needed on SDMAv4 any more */ + q->properties.sdma_vm_addr = 0; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c new file mode 100644 index 000000000..fd60a116b --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c @@ -0,0 +1,254 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_device_queue_manager.h" +#include "gca/gfx_8_0_enum.h" +#include "gca/gfx_8_0_sh_mask.h" +#include "gca/gfx_8_0_enum.h" +#include "oss/oss_3_0_sh_mask.h" + +static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size); +static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size); +static int update_qpd_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static int update_qpd_vi_tonga(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); +static void init_sdma_vm_tonga(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); + +void device_queue_manager_init_vi( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi; + asic_ops->update_qpd = update_qpd_vi; + asic_ops->init_sdma_vm = init_sdma_vm; +} + +void device_queue_manager_init_vi_tonga( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga; + asic_ops->update_qpd = update_qpd_vi_tonga; + asic_ops->init_sdma_vm = init_sdma_vm_tonga; +} + +static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) +{ + /* In 64-bit mode, we can only control the top 3 bits of the LDS, + * scratch and GPUVM apertures. + * The hardware fills in the remaining 59 bits according to the + * following pattern: + * LDS: X0000000'00000000 - X0000001'00000000 (4GB) + * Scratch: X0000001'00000000 - X0000002'00000000 (4GB) + * GPUVM: Y0010000'00000000 - Y0020000'00000000 (1TB) + * + * (where X/Y is the configurable nybble with the low-bit 0) + * + * LDS and scratch will have the same top nybble programmed in the + * top 3 bits of SH_MEM_BASES.PRIVATE_BASE. + * GPUVM can have a different top nybble programmed in the + * top 3 bits of SH_MEM_BASES.SHARED_BASE. + * We don't bother to support different top nybbles + * for LDS/Scratch and GPUVM. 
+ */ + + WARN_ON((top_address_nybble & 1) || top_address_nybble > 0xE || + top_address_nybble == 0); + + return top_address_nybble << 12 | + (top_address_nybble << 12) << + SH_MEM_BASES__SHARED_BASE__SHIFT; +} + +static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) +{ + uint32_t default_mtype; + uint32_t ape1_mtype; + + default_mtype = (default_policy == cache_policy_coherent) ? + MTYPE_CC : + MTYPE_NC; + + ape1_mtype = (alternate_policy == cache_policy_coherent) ? + MTYPE_CC : + MTYPE_NC; + + qpd->sh_mem_config = (qpd->sh_mem_config & + SH_MEM_CONFIG__ADDRESS_MODE_MASK) | + SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | + default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | + ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT | + SH_MEM_CONFIG__PRIVATE_ATC_MASK; + + return true; +} + +static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) +{ + uint32_t default_mtype; + uint32_t ape1_mtype; + + default_mtype = (default_policy == cache_policy_coherent) ? + MTYPE_UC : + MTYPE_NC; + + ape1_mtype = (alternate_policy == cache_policy_coherent) ? + MTYPE_UC : + MTYPE_NC; + + qpd->sh_mem_config = + SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | + default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | + ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT; + + return true; +} + +static int update_qpd_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + unsigned int temp; + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | + MTYPE_CC << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | + MTYPE_CC << SH_MEM_CONFIG__APE1_MTYPE__SHIFT | + SH_MEM_CONFIG__PRIVATE_ATC_MASK; + + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + if (qpd->pqm->process->is_32bit_user_mode) { + temp = get_sh_mem_bases_32(pdd); + qpd->sh_mem_bases = temp << SH_MEM_BASES__SHARED_BASE__SHIFT; + qpd->sh_mem_config |= SH_MEM_ADDRESS_MODE_HSA32 << + SH_MEM_CONFIG__ADDRESS_MODE__SHIFT; + } else { + temp = get_sh_mem_bases_nybble_64(pdd); + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); + qpd->sh_mem_config |= SH_MEM_ADDRESS_MODE_HSA64 << + SH_MEM_CONFIG__ADDRESS_MODE__SHIFT; + qpd->sh_mem_config |= 1 << + SH_MEM_CONFIG__PRIVATE_ATC__SHIFT; + } + + pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", + qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); + + return 0; +} + +static int update_qpd_vi_tonga(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + unsigned int temp; + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | + MTYPE_UC << + SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | + MTYPE_UC << + SH_MEM_CONFIG__APE1_MTYPE__SHIFT; + + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + 
+ /* On dGPU we're always in GPUVM64 addressing mode with 64-bit + * aperture addresses. + */ + temp = get_sh_mem_bases_nybble_64(pdd); + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); + + pr_debug("sh_mem_bases nybble: 0x%X and register 0x%X\n", + temp, qpd->sh_mem_bases); + + return 0; +} + +static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + uint32_t value = (1 << SDMA0_RLC0_VIRTUAL_ADDR__ATC__SHIFT); + + if (q->process->is_32bit_user_mode) + value |= (1 << SDMA0_RLC0_VIRTUAL_ADDR__PTR32__SHIFT) | + get_sh_mem_bases_32(qpd_to_pdd(qpd)); + else + value |= ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; + + q->properties.sdma_vm_addr = value; +} + +static void init_sdma_vm_tonga(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd) +{ + /* On dGPU we're always in GPUVM64 addressing mode with 64-bit + * aperture addresses. + */ + q->properties.sdma_vm_addr = + ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & + SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c new file mode 100644 index 000000000..ebe79bf00 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -0,0 +1,275 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "kfd_priv.h" +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/idr.h> + +/* + * This extension supports a kernel level doorbells management for the + * kernel queues using the first doorbell page reserved for the kernel. + */ + +static DEFINE_IDA(doorbell_ida); +static unsigned int max_doorbell_slices; + +/* + * Each device exposes a doorbell aperture, a PCI MMIO aperture that + * receives 32-bit writes that are passed to queues as wptr values. + * The doorbells are intended to be written by applications as part + * of queueing work on user-mode queues. + * We assign doorbells to applications in PAGE_SIZE-sized and aligned chunks. + * We map the doorbell address space into user-mode when a process creates + * its first queue on each device. 
+ * Although the mapping is done by KFD, it is equivalent to an mmap of + * the /dev/kfd with the particular device encoded in the mmap offset. + * There will be other uses for mmap of /dev/kfd, so only a range of + * offsets (KFD_MMAP_DOORBELL_START-END) is used for doorbells. + */ + +/* # of doorbell bytes allocated for each process. */ +size_t kfd_doorbell_process_slice(struct kfd_dev *kfd) +{ + return roundup(kfd->device_info->doorbell_size * + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, + PAGE_SIZE); +} + +/* Doorbell calculations for device init. */ +int kfd_doorbell_init(struct kfd_dev *kfd) +{ + size_t doorbell_start_offset; + size_t doorbell_aperture_size; + size_t doorbell_process_limit; + + /* + * We start with calculations in bytes because the input data might + * only be byte-aligned. + * Only after we have done the rounding can we assume any alignment. + */ + + doorbell_start_offset = + roundup(kfd->shared_resources.doorbell_start_offset, + kfd_doorbell_process_slice(kfd)); + + doorbell_aperture_size = + rounddown(kfd->shared_resources.doorbell_aperture_size, + kfd_doorbell_process_slice(kfd)); + + if (doorbell_aperture_size > doorbell_start_offset) + doorbell_process_limit = + (doorbell_aperture_size - doorbell_start_offset) / + kfd_doorbell_process_slice(kfd); + else + return -ENOSPC; + + if (!max_doorbell_slices || + doorbell_process_limit < max_doorbell_slices) + max_doorbell_slices = doorbell_process_limit; + + kfd->doorbell_base = kfd->shared_resources.doorbell_physical_address + + doorbell_start_offset; + + kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32); + + kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, + kfd_doorbell_process_slice(kfd)); + + if (!kfd->doorbell_kernel_ptr) + return -ENOMEM; + + pr_debug("Doorbell initialization:\n"); + pr_debug("doorbell base == 0x%08lX\n", + (uintptr_t)kfd->doorbell_base); + + pr_debug("doorbell_id_offset == 0x%08lX\n", + kfd->doorbell_id_offset); + + pr_debug("doorbell_process_limit == 0x%08lX\n", + doorbell_process_limit); + + pr_debug("doorbell_kernel_offset == 0x%08lX\n", + (uintptr_t)kfd->doorbell_base); + + pr_debug("doorbell aperture size == 0x%08lX\n", + kfd->shared_resources.doorbell_aperture_size); + + pr_debug("doorbell kernel address == %p\n", kfd->doorbell_kernel_ptr); + + return 0; +} + +void kfd_doorbell_fini(struct kfd_dev *kfd) +{ + if (kfd->doorbell_kernel_ptr) + iounmap(kfd->doorbell_kernel_ptr); +} + +int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, + struct vm_area_struct *vma) +{ + phys_addr_t address; + + /* + * For simplicitly we only allow mapping of the entire doorbell + * allocation of a single device & process. 
+ */ + if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev)) + return -EINVAL; + + /* Calculate physical address of doorbell */ + address = kfd_get_process_doorbells(dev, process); + + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | + VM_DONTDUMP | VM_PFNMAP; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + pr_debug("Mapping doorbell page\n" + " target user address == 0x%08llX\n" + " physical address == 0x%08llX\n" + " vm_flags == 0x%04lX\n" + " size == 0x%04lX\n", + (unsigned long long) vma->vm_start, address, vma->vm_flags, + kfd_doorbell_process_slice(dev)); + + + return io_remap_pfn_range(vma, + vma->vm_start, + address >> PAGE_SHIFT, + kfd_doorbell_process_slice(dev), + vma->vm_page_prot); +} + + +/* get kernel iomem pointer for a doorbell */ +void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + unsigned int *doorbell_off) +{ + u32 inx; + + mutex_lock(&kfd->doorbell_mutex); + inx = find_first_zero_bit(kfd->doorbell_available_index, + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); + + __set_bit(inx, kfd->doorbell_available_index); + mutex_unlock(&kfd->doorbell_mutex); + + if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) + return NULL; + + inx *= kfd->device_info->doorbell_size / sizeof(u32); + + /* + * Calculating the kernel doorbell offset using the first + * doorbell page. + */ + *doorbell_off = kfd->doorbell_id_offset + inx; + + pr_debug("Get kernel queue doorbell\n" + " doorbell offset == 0x%08X\n" + " doorbell index == 0x%x\n", + *doorbell_off, inx); + + return kfd->doorbell_kernel_ptr + inx; +} + +void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) +{ + unsigned int inx; + + inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr) + * sizeof(u32) / kfd->device_info->doorbell_size; + + mutex_lock(&kfd->doorbell_mutex); + __clear_bit(inx, kfd->doorbell_available_index); + mutex_unlock(&kfd->doorbell_mutex); +} + +void write_kernel_doorbell(void __iomem *db, u32 value) +{ + if (db) { + writel(value, db); + pr_debug("Writing %d to doorbell address %p\n", value, db); + } +} + +void write_kernel_doorbell64(void __iomem *db, u64 value) +{ + if (db) { + WARN(((unsigned long)db & 7) != 0, + "Unaligned 64-bit doorbell"); + writeq(value, (u64 __iomem *)db); + pr_debug("writing %llu to doorbell address %p\n", value, db); + } +} + +unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, + struct kfd_process *process, + unsigned int doorbell_id) +{ + /* + * doorbell_id_offset accounts for doorbells taken by KGD. + * index * kfd_doorbell_process_slice/sizeof(u32) adjusts to + * the process's doorbells. The offset returned is in dword + * units regardless of the ASIC-dependent doorbell size. 
+ */ + return kfd->doorbell_id_offset + + process->doorbell_index + * kfd_doorbell_process_slice(kfd) / sizeof(u32) + + doorbell_id * kfd->device_info->doorbell_size / sizeof(u32); +} + +uint64_t kfd_get_number_elems(struct kfd_dev *kfd) +{ + uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - + kfd->shared_resources.doorbell_start_offset) / + kfd_doorbell_process_slice(kfd) + 1; + + return num_of_elems; + +} + +phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process) +{ + return dev->doorbell_base + + process->doorbell_index * kfd_doorbell_process_slice(dev); +} + +int kfd_alloc_process_doorbells(struct kfd_process *process) +{ + int r = ida_simple_get(&doorbell_ida, 1, max_doorbell_slices, + GFP_KERNEL); + if (r > 0) + process->doorbell_index = r; + + return r; +} + +void kfd_free_process_doorbells(struct kfd_process *process) +{ + if (process->doorbell_index) + ida_simple_remove(&doorbell_ida, process->doorbell_index); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c new file mode 100644 index 000000000..892077377 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -0,0 +1,1038 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/mm_types.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/sched/signal.h> +#include <linux/sched/mm.h> +#include <linux/uaccess.h> +#include <linux/mman.h> +#include <linux/memory.h> +#include "kfd_priv.h" +#include "kfd_events.h" +#include "kfd_iommu.h" +#include <linux/device.h> + +/* + * Wrapper around wait_queue_entry_t + */ +struct kfd_event_waiter { + wait_queue_entry_t wait; + struct kfd_event *event; /* Event to wait for */ + bool activated; /* Becomes true when event is signaled */ +}; + +/* + * Each signal event needs a 64-bit signal slot where the signaler will write + * a 1 before sending an interrupt. (This is needed because some interrupts + * do not contain enough spare data bits to identify an event.) + * We get whole pages and map them to the process VA. + * Individual signal events use their event_id as slot index. 
+ */ +struct kfd_signal_page { + uint64_t *kernel_address; + uint64_t __user *user_address; + bool need_to_free_pages; +}; + + +static uint64_t *page_slots(struct kfd_signal_page *page) +{ + return page->kernel_address; +} + +static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p) +{ + void *backing_store; + struct kfd_signal_page *page; + + page = kzalloc(sizeof(*page), GFP_KERNEL); + if (!page) + return NULL; + + backing_store = (void *) __get_free_pages(GFP_KERNEL, + get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); + if (!backing_store) + goto fail_alloc_signal_store; + + /* Initialize all events to unsignaled */ + memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT, + KFD_SIGNAL_EVENT_LIMIT * 8); + + page->kernel_address = backing_store; + page->need_to_free_pages = true; + pr_debug("Allocated new event signal page at %p, for process %p\n", + page, p); + + return page; + +fail_alloc_signal_store: + kfree(page); + return NULL; +} + +static int allocate_event_notification_slot(struct kfd_process *p, + struct kfd_event *ev) +{ + int id; + + if (!p->signal_page) { + p->signal_page = allocate_signal_page(p); + if (!p->signal_page) + return -ENOMEM; + /* Oldest user mode expects 256 event slots */ + p->signal_mapped_size = 256*8; + } + + /* + * Compatibility with old user mode: Only use signal slots + * user mode has mapped, may be less than + * KFD_SIGNAL_EVENT_LIMIT. This also allows future increase + * of the event limit without breaking user mode. + */ + id = idr_alloc(&p->event_idr, ev, 0, p->signal_mapped_size / 8, + GFP_KERNEL); + if (id < 0) + return id; + + ev->event_id = id; + page_slots(p->signal_page)[id] = UNSIGNALED_EVENT_SLOT; + + return 0; +} + +/* + * Assumes that p->event_mutex is held and of course that p is not going + * away (current or locked). + */ +static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) +{ + return idr_find(&p->event_idr, id); +} + +/** + * lookup_signaled_event_by_partial_id - Lookup signaled event from partial ID + * @p: Pointer to struct kfd_process + * @id: ID to look up + * @bits: Number of valid bits in @id + * + * Finds the first signaled event with a matching partial ID. If no + * matching signaled event is found, returns NULL. In that case the + * caller should assume that the partial ID is invalid and do an + * exhaustive search of all siglaned events. + * + * If multiple events with the same partial ID signal at the same + * time, they will be found one interrupt at a time, not necessarily + * in the same order the interrupts occurred. As long as the number of + * interrupts is correct, all signaled events will be seen by the + * driver. + */ +static struct kfd_event *lookup_signaled_event_by_partial_id( + struct kfd_process *p, uint32_t id, uint32_t bits) +{ + struct kfd_event *ev; + + if (!p->signal_page || id >= KFD_SIGNAL_EVENT_LIMIT) + return NULL; + + /* Fast path for the common case that @id is not a partial ID + * and we only need a single lookup. + */ + if (bits > 31 || (1U << bits) >= KFD_SIGNAL_EVENT_LIMIT) { + if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT) + return NULL; + + return idr_find(&p->event_idr, id); + } + + /* General case for partial IDs: Iterate over all matching IDs + * and find the first one that has signaled. 
+ */ + for (ev = NULL; id < KFD_SIGNAL_EVENT_LIMIT && !ev; id += 1U << bits) { + if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT) + continue; + + ev = idr_find(&p->event_idr, id); + } + + return ev; +} + +static int create_signal_event(struct file *devkfd, + struct kfd_process *p, + struct kfd_event *ev) +{ + int ret; + + if (p->signal_mapped_size && + p->signal_event_count == p->signal_mapped_size / 8) { + if (!p->signal_event_limit_reached) { + pr_warn("Signal event wasn't created because limit was reached\n"); + p->signal_event_limit_reached = true; + } + return -ENOSPC; + } + + ret = allocate_event_notification_slot(p, ev); + if (ret) { + pr_warn("Signal event wasn't created because out of kernel memory\n"); + return ret; + } + + p->signal_event_count++; + + ev->user_signal_address = &p->signal_page->user_address[ev->event_id]; + pr_debug("Signal event number %zu created with id %d, address %p\n", + p->signal_event_count, ev->event_id, + ev->user_signal_address); + + return 0; +} + +static int create_other_event(struct kfd_process *p, struct kfd_event *ev) +{ + /* Cast KFD_LAST_NONSIGNAL_EVENT to uint32_t. This allows an + * intentional integer overflow to -1 without a compiler + * warning. idr_alloc treats a negative value as "maximum + * signed integer". + */ + int id = idr_alloc(&p->event_idr, ev, KFD_FIRST_NONSIGNAL_EVENT_ID, + (uint32_t)KFD_LAST_NONSIGNAL_EVENT_ID + 1, + GFP_KERNEL); + + if (id < 0) + return id; + ev->event_id = id; + + return 0; +} + +void kfd_event_init_process(struct kfd_process *p) +{ + mutex_init(&p->event_mutex); + idr_init(&p->event_idr); + p->signal_page = NULL; + p->signal_event_count = 0; +} + +static void destroy_event(struct kfd_process *p, struct kfd_event *ev) +{ + struct kfd_event_waiter *waiter; + + /* Wake up pending waiters. They will return failure */ + list_for_each_entry(waiter, &ev->wq.head, wait.entry) + waiter->event = NULL; + wake_up_all(&ev->wq); + + if (ev->type == KFD_EVENT_TYPE_SIGNAL || + ev->type == KFD_EVENT_TYPE_DEBUG) + p->signal_event_count--; + + idr_remove(&p->event_idr, ev->event_id); + kfree(ev); +} + +static void destroy_events(struct kfd_process *p) +{ + struct kfd_event *ev; + uint32_t id; + + idr_for_each_entry(&p->event_idr, ev, id) + destroy_event(p, ev); + idr_destroy(&p->event_idr); +} + +/* + * We assume that the process is being destroyed and there is no need to + * unmap the pages or keep bookkeeping data in order. 
+ */ +static void shutdown_signal_page(struct kfd_process *p) +{ + struct kfd_signal_page *page = p->signal_page; + + if (page) { + if (page->need_to_free_pages) + free_pages((unsigned long)page->kernel_address, + get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); + kfree(page); + } +} + +void kfd_event_free_process(struct kfd_process *p) +{ + destroy_events(p); + shutdown_signal_page(p); +} + +static bool event_can_be_gpu_signaled(const struct kfd_event *ev) +{ + return ev->type == KFD_EVENT_TYPE_SIGNAL || + ev->type == KFD_EVENT_TYPE_DEBUG; +} + +static bool event_can_be_cpu_signaled(const struct kfd_event *ev) +{ + return ev->type == KFD_EVENT_TYPE_SIGNAL; +} + +int kfd_event_page_set(struct kfd_process *p, void *kernel_address, + uint64_t size) +{ + struct kfd_signal_page *page; + + if (p->signal_page) + return -EBUSY; + + page = kzalloc(sizeof(*page), GFP_KERNEL); + if (!page) + return -ENOMEM; + + /* Initialize all events to unsignaled */ + memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT, + KFD_SIGNAL_EVENT_LIMIT * 8); + + page->kernel_address = kernel_address; + + p->signal_page = page; + p->signal_mapped_size = size; + + return 0; +} + +int kfd_event_create(struct file *devkfd, struct kfd_process *p, + uint32_t event_type, bool auto_reset, uint32_t node_id, + uint32_t *event_id, uint32_t *event_trigger_data, + uint64_t *event_page_offset, uint32_t *event_slot_index) +{ + int ret = 0; + struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); + + if (!ev) + return -ENOMEM; + + ev->type = event_type; + ev->auto_reset = auto_reset; + ev->signaled = false; + + init_waitqueue_head(&ev->wq); + + *event_page_offset = 0; + + mutex_lock(&p->event_mutex); + + switch (event_type) { + case KFD_EVENT_TYPE_SIGNAL: + case KFD_EVENT_TYPE_DEBUG: + ret = create_signal_event(devkfd, p, ev); + if (!ret) { + *event_page_offset = KFD_MMAP_TYPE_EVENTS; + *event_page_offset <<= PAGE_SHIFT; + *event_slot_index = ev->event_id; + } + break; + default: + ret = create_other_event(p, ev); + break; + } + + if (!ret) { + *event_id = ev->event_id; + *event_trigger_data = ev->event_id; + } else { + kfree(ev); + } + + mutex_unlock(&p->event_mutex); + + return ret; +} + +/* Assumes that p is current. */ +int kfd_event_destroy(struct kfd_process *p, uint32_t event_id) +{ + struct kfd_event *ev; + int ret = 0; + + mutex_lock(&p->event_mutex); + + ev = lookup_event_by_id(p, event_id); + + if (ev) + destroy_event(p, ev); + else + ret = -EINVAL; + + mutex_unlock(&p->event_mutex); + return ret; +} + +static void set_event(struct kfd_event *ev) +{ + struct kfd_event_waiter *waiter; + + /* Auto reset if the list is non-empty and we're waking + * someone. waitqueue_active is safe here because we're + * protected by the p->event_mutex, which is also held when + * updating the wait queues in kfd_wait_on_events. + */ + ev->signaled = !ev->auto_reset || !waitqueue_active(&ev->wq); + + list_for_each_entry(waiter, &ev->wq.head, wait.entry) + waiter->activated = true; + + wake_up_all(&ev->wq); +} + +/* Assumes that p is current. */ +int kfd_set_event(struct kfd_process *p, uint32_t event_id) +{ + int ret = 0; + struct kfd_event *ev; + + mutex_lock(&p->event_mutex); + + ev = lookup_event_by_id(p, event_id); + + if (ev && event_can_be_cpu_signaled(ev)) + set_event(ev); + else + ret = -EINVAL; + + mutex_unlock(&p->event_mutex); + return ret; +} + +static void reset_event(struct kfd_event *ev) +{ + ev->signaled = false; +} + +/* Assumes that p is current. 
*/ +int kfd_reset_event(struct kfd_process *p, uint32_t event_id) +{ + int ret = 0; + struct kfd_event *ev; + + mutex_lock(&p->event_mutex); + + ev = lookup_event_by_id(p, event_id); + + if (ev && event_can_be_cpu_signaled(ev)) + reset_event(ev); + else + ret = -EINVAL; + + mutex_unlock(&p->event_mutex); + return ret; + +} + +static void acknowledge_signal(struct kfd_process *p, struct kfd_event *ev) +{ + page_slots(p->signal_page)[ev->event_id] = UNSIGNALED_EVENT_SLOT; +} + +static void set_event_from_interrupt(struct kfd_process *p, + struct kfd_event *ev) +{ + if (ev && event_can_be_gpu_signaled(ev)) { + acknowledge_signal(p, ev); + set_event(ev); + } +} + +void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, + uint32_t valid_id_bits) +{ + struct kfd_event *ev = NULL; + + /* + * Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are + * running so the lookup function increments the process ref count. + */ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + + if (!p) + return; /* Presumably process exited. */ + + mutex_lock(&p->event_mutex); + + if (valid_id_bits) + ev = lookup_signaled_event_by_partial_id(p, partial_id, + valid_id_bits); + if (ev) { + set_event_from_interrupt(p, ev); + } else if (p->signal_page) { + /* + * Partial ID lookup failed. Assume that the event ID + * in the interrupt payload was invalid and do an + * exhaustive search of signaled events. + */ + uint64_t *slots = page_slots(p->signal_page); + uint32_t id; + + if (valid_id_bits) + pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n", + partial_id, valid_id_bits); + + if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT / 64) { + /* With relatively few events, it's faster to + * iterate over the event IDR + */ + idr_for_each_entry(&p->event_idr, ev, id) { + if (id >= KFD_SIGNAL_EVENT_LIMIT) + break; + + if (slots[id] != UNSIGNALED_EVENT_SLOT) + set_event_from_interrupt(p, ev); + } + } else { + /* With relatively many events, it's faster to + * iterate over the signal slots and lookup + * only signaled events from the IDR. + */ + for (id = 0; id < KFD_SIGNAL_EVENT_LIMIT; id++) + if (slots[id] != UNSIGNALED_EVENT_SLOT) { + ev = lookup_event_by_id(p, id); + set_event_from_interrupt(p, ev); + } + } + } + + mutex_unlock(&p->event_mutex); + kfd_unref_process(p); +} + +static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) +{ + struct kfd_event_waiter *event_waiters; + uint32_t i; + + event_waiters = kmalloc_array(num_events, + sizeof(struct kfd_event_waiter), + GFP_KERNEL); + if (!event_waiters) + return NULL; + + for (i = 0; (event_waiters) && (i < num_events) ; i++) { + init_wait(&event_waiters[i].wait); + event_waiters[i].activated = false; + } + + return event_waiters; +} + +static int init_event_waiter_get_status(struct kfd_process *p, + struct kfd_event_waiter *waiter, + uint32_t event_id) +{ + struct kfd_event *ev = lookup_event_by_id(p, event_id); + + if (!ev) + return -EINVAL; + + waiter->event = ev; + waiter->activated = ev->signaled; + ev->signaled = ev->signaled && !ev->auto_reset; + + return 0; +} + +static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter) +{ + struct kfd_event *ev = waiter->event; + + /* Only add to the wait list if we actually need to + * wait on this event. 
+ */ + if (!waiter->activated) + add_wait_queue(&ev->wq, &waiter->wait); +} + +/* test_event_condition - Test condition of events being waited for + * @all: Return completion only if all events have signaled + * @num_events: Number of events to wait for + * @event_waiters: Array of event waiters, one per event + * + * Returns KFD_IOC_WAIT_RESULT_COMPLETE if all (or one) event(s) have + * signaled. Returns KFD_IOC_WAIT_RESULT_TIMEOUT if no (or not all) + * events have signaled. Returns KFD_IOC_WAIT_RESULT_FAIL if any of + * the events have been destroyed. + */ +static uint32_t test_event_condition(bool all, uint32_t num_events, + struct kfd_event_waiter *event_waiters) +{ + uint32_t i; + uint32_t activated_count = 0; + + for (i = 0; i < num_events; i++) { + if (!event_waiters[i].event) + return KFD_IOC_WAIT_RESULT_FAIL; + + if (event_waiters[i].activated) { + if (!all) + return KFD_IOC_WAIT_RESULT_COMPLETE; + + activated_count++; + } + } + + return activated_count == num_events ? + KFD_IOC_WAIT_RESULT_COMPLETE : KFD_IOC_WAIT_RESULT_TIMEOUT; +} + +/* + * Copy event specific data, if defined. + * Currently only memory exception events have additional data to copy to user + */ +static int copy_signaled_event_data(uint32_t num_events, + struct kfd_event_waiter *event_waiters, + struct kfd_event_data __user *data) +{ + struct kfd_hsa_memory_exception_data *src; + struct kfd_hsa_memory_exception_data __user *dst; + struct kfd_event_waiter *waiter; + struct kfd_event *event; + uint32_t i; + + for (i = 0; i < num_events; i++) { + waiter = &event_waiters[i]; + event = waiter->event; + if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) { + dst = &data[i].memory_exception_data; + src = &event->memory_exception_data; + if (copy_to_user(dst, src, + sizeof(struct kfd_hsa_memory_exception_data))) + return -EFAULT; + } + } + + return 0; + +} + + + +static long user_timeout_to_jiffies(uint32_t user_timeout_ms) +{ + if (user_timeout_ms == KFD_EVENT_TIMEOUT_IMMEDIATE) + return 0; + + if (user_timeout_ms == KFD_EVENT_TIMEOUT_INFINITE) + return MAX_SCHEDULE_TIMEOUT; + + /* + * msecs_to_jiffies interprets all values above 2^31-1 as infinite, + * but we consider them finite. + * This hack is wrong, but nobody is likely to notice. + */ + user_timeout_ms = min_t(uint32_t, user_timeout_ms, 0x7FFFFFFF); + + return msecs_to_jiffies(user_timeout_ms) + 1; +} + +static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters) +{ + uint32_t i; + + for (i = 0; i < num_events; i++) + if (waiters[i].event) + remove_wait_queue(&waiters[i].event->wq, + &waiters[i].wait); + + kfree(waiters); +} + +int kfd_wait_on_events(struct kfd_process *p, + uint32_t num_events, void __user *data, + bool all, uint32_t user_timeout_ms, + uint32_t *wait_result) +{ + struct kfd_event_data __user *events = + (struct kfd_event_data __user *) data; + uint32_t i; + int ret = 0; + + struct kfd_event_waiter *event_waiters = NULL; + long timeout = user_timeout_to_jiffies(user_timeout_ms); + + event_waiters = alloc_event_waiters(num_events); + if (!event_waiters) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&p->event_mutex); + + for (i = 0; i < num_events; i++) { + struct kfd_event_data event_data; + + if (copy_from_user(&event_data, &events[i], + sizeof(struct kfd_event_data))) { + ret = -EFAULT; + goto out_unlock; + } + + ret = init_event_waiter_get_status(p, &event_waiters[i], + event_data.event_id); + if (ret) + goto out_unlock; + } + + /* Check condition once. 
*/ + *wait_result = test_event_condition(all, num_events, event_waiters); + if (*wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) { + ret = copy_signaled_event_data(num_events, + event_waiters, events); + goto out_unlock; + } else if (WARN_ON(*wait_result == KFD_IOC_WAIT_RESULT_FAIL)) { + /* This should not happen. Events shouldn't be + * destroyed while we're holding the event_mutex + */ + goto out_unlock; + } + + /* Add to wait lists if we need to wait. */ + for (i = 0; i < num_events; i++) + init_event_waiter_add_to_waitlist(&event_waiters[i]); + + mutex_unlock(&p->event_mutex); + + while (true) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + if (signal_pending(current)) { + /* + * This is wrong when a nonzero, non-infinite timeout + * is specified. We need to use + * ERESTARTSYS_RESTARTBLOCK, but struct restart_block + * contains a union with data for each user and it's + * in generic kernel code that I don't want to + * touch yet. + */ + ret = -ERESTARTSYS; + break; + } + + /* Set task state to interruptible sleep before + * checking wake-up conditions. A concurrent wake-up + * will put the task back into runnable state. In that + * case schedule_timeout will not put the task to + * sleep and we'll get a chance to re-check the + * updated conditions almost immediately. Otherwise, + * this race condition would lead to a soft hang or a + * very long sleep. + */ + set_current_state(TASK_INTERRUPTIBLE); + + *wait_result = test_event_condition(all, num_events, + event_waiters); + if (*wait_result != KFD_IOC_WAIT_RESULT_TIMEOUT) + break; + + if (timeout <= 0) + break; + + timeout = schedule_timeout(timeout); + } + __set_current_state(TASK_RUNNING); + + /* copy_signaled_event_data may sleep. So this has to happen + * after the task state is set back to RUNNING. + */ + if (!ret && *wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) + ret = copy_signaled_event_data(num_events, + event_waiters, events); + + mutex_lock(&p->event_mutex); +out_unlock: + free_waiters(num_events, event_waiters); + mutex_unlock(&p->event_mutex); +out: + if (ret) + *wait_result = KFD_IOC_WAIT_RESULT_FAIL; + else if (*wait_result == KFD_IOC_WAIT_RESULT_FAIL) + ret = -EIO; + + return ret; +} + +int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) +{ + unsigned long pfn; + struct kfd_signal_page *page; + int ret; + + /* check required size doesn't exceed the allocated size */ + if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) < + get_order(vma->vm_end - vma->vm_start)) { + pr_err("Event page mmap requested illegal size\n"); + return -EINVAL; + } + + page = p->signal_page; + if (!page) { + /* Probably KFD bug, but mmap is user-accessible. 
*/ + pr_debug("Signal page could not be found\n"); + return -EINVAL; + } + + pfn = __pa(page->kernel_address); + pfn >>= PAGE_SHIFT; + + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE + | VM_DONTDUMP | VM_PFNMAP; + + pr_debug("Mapping signal page\n"); + pr_debug(" start user address == 0x%08lx\n", vma->vm_start); + pr_debug(" end user address == 0x%08lx\n", vma->vm_end); + pr_debug(" pfn == 0x%016lX\n", pfn); + pr_debug(" vm_flags == 0x%08lX\n", vma->vm_flags); + pr_debug(" size == 0x%08lX\n", + vma->vm_end - vma->vm_start); + + page->user_address = (uint64_t __user *)vma->vm_start; + + /* mapping the page to user process */ + ret = remap_pfn_range(vma, vma->vm_start, pfn, + vma->vm_end - vma->vm_start, vma->vm_page_prot); + if (!ret) + p->signal_mapped_size = vma->vm_end - vma->vm_start; + + return ret; +} + +/* + * Assumes that p->event_mutex is held and of course + * that p is not going away (current or locked). + */ +static void lookup_events_by_type_and_signal(struct kfd_process *p, + int type, void *event_data) +{ + struct kfd_hsa_memory_exception_data *ev_data; + struct kfd_event *ev; + uint32_t id; + bool send_signal = true; + + ev_data = (struct kfd_hsa_memory_exception_data *) event_data; + + id = KFD_FIRST_NONSIGNAL_EVENT_ID; + idr_for_each_entry_continue(&p->event_idr, ev, id) + if (ev->type == type) { + send_signal = false; + dev_dbg(kfd_device, + "Event found: id %X type %d", + ev->event_id, ev->type); + set_event(ev); + if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data) + ev->memory_exception_data = *ev_data; + } + + if (type == KFD_EVENT_TYPE_MEMORY) { + dev_warn(kfd_device, + "Sending SIGSEGV to HSA Process with PID %d ", + p->lead_thread->pid); + send_sig(SIGSEGV, p->lead_thread, 0); + } + + /* Send SIGTERM no event of type "type" has been found*/ + if (send_signal) { + if (send_sigterm) { + dev_warn(kfd_device, + "Sending SIGTERM to HSA Process with PID %d ", + p->lead_thread->pid); + send_sig(SIGTERM, p->lead_thread, 0); + } else { + dev_err(kfd_device, + "HSA Process (PID %d) got unhandled exception", + p->lead_thread->pid); + } + } +} + +#ifdef KFD_SUPPORT_IOMMU_V2 +void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, + unsigned long address, bool is_write_requested, + bool is_execute_requested) +{ + struct kfd_hsa_memory_exception_data memory_exception_data; + struct vm_area_struct *vma; + + /* + * Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are + * running so the lookup function increments the process ref count. + */ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct mm_struct *mm; + + if (!p) + return; /* Presumably process exited. */ + + /* Take a safe reference to the mm_struct, which may otherwise + * disappear even while the kfd_process is still referenced. 
+ */ + mm = get_task_mm(p->lead_thread); + if (!mm) { + kfd_unref_process(p); + return; /* Process is exiting */ + } + + memset(&memory_exception_data, 0, sizeof(memory_exception_data)); + + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); + + memory_exception_data.gpu_id = dev->id; + memory_exception_data.va = address; + /* Set failure reason */ + memory_exception_data.failure.NotPresent = 1; + memory_exception_data.failure.NoExecute = 0; + memory_exception_data.failure.ReadOnly = 0; + if (vma && address >= vma->vm_start) { + memory_exception_data.failure.NotPresent = 0; + + if (is_write_requested && !(vma->vm_flags & VM_WRITE)) + memory_exception_data.failure.ReadOnly = 1; + else + memory_exception_data.failure.ReadOnly = 0; + + if (is_execute_requested && !(vma->vm_flags & VM_EXEC)) + memory_exception_data.failure.NoExecute = 1; + else + memory_exception_data.failure.NoExecute = 0; + } + + up_read(&mm->mmap_sem); + mmput(mm); + + pr_debug("notpresent %d, noexecute %d, readonly %d\n", + memory_exception_data.failure.NotPresent, + memory_exception_data.failure.NoExecute, + memory_exception_data.failure.ReadOnly); + + /* Workaround on Raven to not kill the process when memory is freed + * before IOMMU is able to finish processing all the excessive PPRs + */ + if (dev->device_info->asic_family != CHIP_RAVEN) { + mutex_lock(&p->event_mutex); + + /* Lookup events by type and signal them */ + lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_MEMORY, + &memory_exception_data); + + mutex_unlock(&p->event_mutex); + } + + kfd_unref_process(p); +} +#endif /* KFD_SUPPORT_IOMMU_V2 */ + +void kfd_signal_hw_exception_event(unsigned int pasid) +{ + /* + * Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are + * running so the lookup function increments the process ref count. + */ + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + + if (!p) + return; /* Presumably process exited. */ + + mutex_lock(&p->event_mutex); + + /* Lookup events by type and signal them */ + lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL); + + mutex_unlock(&p->event_mutex); + kfd_unref_process(p); +} + +void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, + struct kfd_vm_fault_info *info) +{ + struct kfd_event *ev; + uint32_t id; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_hsa_memory_exception_data memory_exception_data; + + if (!p) + return; /* Presumably process exited. */ + memset(&memory_exception_data, 0, sizeof(memory_exception_data)); + memory_exception_data.gpu_id = dev->id; + memory_exception_data.failure.imprecise = 1; + /* Set failure reason */ + if (info) { + memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; + memory_exception_data.failure.NotPresent = + info->prot_valid ? 1 : 0; + memory_exception_data.failure.NoExecute = + info->prot_exec ? 1 : 0; + memory_exception_data.failure.ReadOnly = + info->prot_write ? 
1 : 0; + memory_exception_data.failure.imprecise = 0; + } + mutex_lock(&p->event_mutex); + + id = KFD_FIRST_NONSIGNAL_EVENT_ID; + idr_for_each_entry_continue(&p->event_idr, ev, id) + if (ev->type == KFD_EVENT_TYPE_MEMORY) { + ev->memory_exception_data = memory_exception_data; + set_event(ev); + } + + mutex_unlock(&p->event_mutex); + kfd_unref_process(p); +} + +void kfd_signal_reset_event(struct kfd_dev *dev) +{ + struct kfd_hsa_hw_exception_data hw_exception_data; + struct kfd_process *p; + struct kfd_event *ev; + unsigned int temp; + uint32_t id, idx; + + /* Whole gpu reset caused by GPU hang and memory is lost */ + memset(&hw_exception_data, 0, sizeof(hw_exception_data)); + hw_exception_data.gpu_id = dev->id; + hw_exception_data.memory_lost = 1; + + idx = srcu_read_lock(&kfd_processes_srcu); + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + mutex_lock(&p->event_mutex); + id = KFD_FIRST_NONSIGNAL_EVENT_ID; + idr_for_each_entry_continue(&p->event_idr, ev, id) + if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) { + ev->hw_exception_data = hw_exception_data; + set_event(ev); + } + mutex_unlock(&p->event_mutex); + } + srcu_read_unlock(&kfd_processes_srcu, idx); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h new file mode 100644 index 000000000..c7ac6c73a --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h @@ -0,0 +1,85 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef KFD_EVENTS_H_INCLUDED +#define KFD_EVENTS_H_INCLUDED + +#include <linux/kernel.h> +#include <linux/hashtable.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/wait.h> +#include "kfd_priv.h" +#include <uapi/linux/kfd_ioctl.h> + +/* + * IDR supports non-negative integer IDs. Small IDs are used for + * signal events to match their signal slot. Use the upper half of the + * ID space for non-signal events. + */ +#define KFD_FIRST_NONSIGNAL_EVENT_ID ((INT_MAX >> 1) + 1) +#define KFD_LAST_NONSIGNAL_EVENT_ID INT_MAX + +/* + * Written into kfd_signal_slot_t to indicate that the event is not signaled. + * Since the event protocol may need to write the event ID into memory, this + * must not be a valid event ID. + * For the sake of easy memset-ing, this must be a byte pattern. 
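+ * ((uint64_t)-1 is 0xFFFFFFFFFFFFFFFF, i.e. every byte is 0xFF, so an
+ * entire page of signal slots can be marked unsignaled with a single
+ * memset of 0xFF.)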
+ */ +#define UNSIGNALED_EVENT_SLOT ((uint64_t)-1) + +struct kfd_event_waiter; +struct signal_page; + +struct kfd_event { + u32 event_id; + + bool signaled; + bool auto_reset; + + int type; + + wait_queue_head_t wq; /* List of event waiters. */ + + /* Only for signal events. */ + uint64_t __user *user_signal_address; + + /* type specific data */ + union { + struct kfd_hsa_memory_exception_data memory_exception_data; + struct kfd_hsa_hw_exception_data hw_exception_data; + }; +}; + +#define KFD_EVENT_TIMEOUT_IMMEDIATE 0 +#define KFD_EVENT_TIMEOUT_INFINITE 0xFFFFFFFFu + +/* Matching HSA_EVENTTYPE */ +#define KFD_EVENT_TYPE_SIGNAL 0 +#define KFD_EVENT_TYPE_HW_EXCEPTION 3 +#define KFD_EVENT_TYPE_DEBUG 5 +#define KFD_EVENT_TYPE_MEMORY 8 + +extern void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, + uint32_t valid_id_bits); + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c new file mode 100644 index 000000000..97d5423c5 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -0,0 +1,436 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/device.h> +#include <linux/export.h> +#include <linux/err.h> +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/compat.h> +#include <uapi/linux/kfd_ioctl.h> +#include <linux/time.h> +#include "kfd_priv.h" +#include <linux/mm.h> +#include <linux/mman.h> +#include <asm/processor.h> + +/* + * The primary memory I/O features being added for revisions of gfxip + * beyond 7.0 (Kaveri) are: + * + * Access to ATC/IOMMU mapped memory w/ associated extension of VA to 48b + * + * “Flat” shader memory access – These are new shader vector memory + * operations that do not reference a T#/V# so a “pointer” is what is + * sourced from the vector gprs for direct access to memory. + * This pointer space has the Shared(LDS) and Private(Scratch) memory + * mapped into this pointer space as apertures. + * The hardware then determines how to direct the memory request + * based on what apertures the request falls in. + * + * Unaligned support and alignment check + * + * + * System Unified Address - SUA + * + * The standard usage for GPU virtual addresses are that they are mapped by + * a set of page tables we call GPUVM and these page tables are managed by + * a combination of vidMM/driver software components. 
The current virtual + * address (VA) range for GPUVM is 40b. + * + * As of gfxip7.1 and beyond we’re adding the ability for compute memory + * clients (CP/RLC, DMA, SHADER(ifetch, scalar, and vector ops)) to access + * the same page tables used by host x86 processors and that are managed by + * the operating system. This is via a technique and hardware called ATC/IOMMU. + * The GPU has the capability of accessing both the GPUVM and ATC address + * spaces for a given VMID (process) simultaneously and we call this feature + * system unified address (SUA). + * + * There are three fundamental address modes of operation for a given VMID + * (process) on the GPU: + * + * HSA64 – 64b pointers and the default address space is ATC + * HSA32 – 32b pointers and the default address space is ATC + * GPUVM – 64b pointers and the default address space is GPUVM (driver + * model mode) + * + * + * HSA64 - ATC/IOMMU 64b + * + * A 64b pointer in the AMD64/IA64 CPU architecture is not fully utilized + * by the CPU so an AMD CPU can only access the high area + * (VA[63:47] == 0x1FFFF) and low area (VA[63:47 == 0) of the address space + * so the actual VA carried to translation is 48b. There is a “hole” in + * the middle of the 64b VA space. + * + * The GPU not only has access to all of the CPU accessible address space via + * ATC/IOMMU, but it also has access to the GPUVM address space. The “system + * unified address” feature (SUA) is the mapping of GPUVM and ATC address + * spaces into a unified pointer space. The method we take for 64b mode is + * to map the full 40b GPUVM address space into the hole of the 64b address + * space. + + * The GPUVM_Base/GPUVM_Limit defines the aperture in the 64b space where we + * direct requests to be translated via GPUVM page tables instead of the + * IOMMU path. + * + * + * 64b to 49b Address conversion + * + * Note that there are still significant portions of unused regions (holes) + * in the 64b address space even for the GPU. There are several places in + * the pipeline (sw and hw), we wish to compress the 64b virtual address + * to a 49b address. This 49b address is constituted of an “ATC” bit + * plus a 48b virtual address. This 49b address is what is passed to the + * translation hardware. ATC==0 means the 48b address is a GPUVM address + * (max of 2^40 – 1) intended to be translated via GPUVM page tables. + * ATC==1 means the 48b address is intended to be translated via IOMMU + * page tables. + * + * A 64b pointer is compared to the apertures that are defined (Base/Limit), in + * this case the GPUVM aperture (red) is defined and if a pointer falls in this + * aperture, we subtract the GPUVM_Base address and set the ATC bit to zero + * as part of the 64b to 49b conversion. + * + * Where this 64b to 49b conversion is done is a function of the usage. + * Most GPU memory access is via memory objects where the driver builds + * a descriptor which consists of a base address and a memory access by + * the GPU usually consists of some kind of an offset or Cartesian coordinate + * that references this memory descriptor. This is the case for shader + * instructions that reference the T# or V# constants, or for specified + * locations of assets (ex. the shader program location). In these cases + * the driver is what handles the 64b to 49b conversion and the base + * address in the descriptor (ex. V# or T# or shader program location) + * is defined as a 48b address w/ an ATC bit. 
For this usage a given + * memory object cannot straddle multiple apertures in the 64b address + * space. For example a shader program cannot jump in/out between ATC + * and GPUVM space. + * + * In some cases we wish to pass a 64b pointer to the GPU hardware and + * the GPU hw does the 64b to 49b conversion before passing memory + * requests to the cache/memory system. This is the case for the + * S_LOAD and FLAT_* shader memory instructions where we have 64b pointers + * in scalar and vector GPRs respectively. + * + * In all cases (no matter where the 64b -> 49b conversion is done), the gfxip + * hardware sends a 48b address along w/ an ATC bit, to the memory controller + * on the memory request interfaces. + * + * <client>_MC_rdreq_atc // read request ATC bit + * + * 0 : <client>_MC_rdreq_addr is a GPUVM VA + * + * 1 : <client>_MC_rdreq_addr is a ATC VA + * + * + * “Spare” aperture (APE1) + * + * We use the GPUVM aperture to differentiate ATC vs. GPUVM, but we also use + * apertures to set the Mtype field for S_LOAD/FLAT_* ops which is input to the + * config tables for setting cache policies. The “spare” (APE1) aperture is + * motivated by getting a different Mtype from the default. + * The default aperture isn’t an actual base/limit aperture; it is just the + * address space that doesn’t hit any defined base/limit apertures. + * The following diagram is a complete picture of the gfxip7.x SUA apertures. + * The APE1 can be placed either below or above + * the hole (cannot be in the hole). + * + * + * General Aperture definitions and rules + * + * An aperture register definition consists of a Base, Limit, Mtype, and + * usually an ATC bit indicating which translation tables that aperture uses. + * In all cases (for SUA and DUA apertures discussed later), aperture base + * and limit definitions are 64KB aligned. + * + * <ape>_Base[63:0] = { <ape>_Base_register[63:16], 0x0000 } + * + * <ape>_Limit[63:0] = { <ape>_Limit_register[63:16], 0xFFFF } + * + * The base and limit are considered inclusive to an aperture so being + * inside an aperture means (address >= Base) AND (address <= Limit). + * + * In no case is a payload that straddles multiple apertures expected to work. + * For example a load_dword_x4 that starts in one aperture and ends in another, + * does not work. For the vector FLAT_* ops we have detection capability in + * the shader for reporting a “memory violation” back to the + * SQ block for use in traps. + * A memory violation results when an op falls into the hole, + * or a payload straddles multiple apertures. The S_LOAD instruction + * does not have this detection. + * + * Apertures cannot overlap. + * + * + * + * HSA32 - ATC/IOMMU 32b + * + * For HSA32 mode, the pointers are interpreted as 32 bits and use a single GPR + * instead of two for the S_LOAD and FLAT_* ops. The entire GPUVM space of 40b + * will not fit so there is only partial visibility to the GPUVM + * space (defined by the aperture) for S_LOAD and FLAT_* ops. + * There is no spare (APE1) aperture for HSA32 mode. + * + * + * GPUVM 64b mode (driver model) + * + * This mode is related to HSA64 in that the difference really is that + * the default aperture is GPUVM (ATC==0) and not ATC space. + * We have gfxip7.x hardware that has FLAT_* and S_LOAD support for + * SUA GPUVM mode, but does not support HSA32/HSA64. 
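+ *
+ * As a rough worked example of the 64b to 49b conversion described above
+ * (HSA64 mode; addresses chosen purely for illustration, using the value
+ * MAKE_GPUVM_APP_BASE_VI() below produces for gpu_num 1): with
+ * GPUVM_Base = 0x2001000000000000, the 64b pointer 0x2001000012345678
+ * falls inside the GPUVM aperture and is sent to translation as
+ * {ATC=0, VA=0x12345678} after the base is subtracted. A pointer that
+ * hits no defined aperture keeps its low 48 bits and is sent with ATC=1,
+ * since the HSA64 default aperture is ATC.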
+ * + * + * Device Unified Address - DUA + * + * Device unified address (DUA) is the name of the feature that maps the + * Shared(LDS) memory and Private(Scratch) memory into the overall address + * space for use by the new FLAT_* vector memory ops. The Shared and + * Private memories are mapped as apertures into the address space, + * and the hardware detects when a FLAT_* memory request is to be redirected + * to the LDS or Scratch memory when it falls into one of these apertures. + * Like the SUA apertures, the Shared/Private apertures are 64KB aligned and + * the base/limit is “in” the aperture. For both HSA64 and GPUVM SUA modes, + * the Shared/Private apertures are always placed in a limited selection of + * options in the hole of the 64b address space. For HSA32 mode, the + * Shared/Private apertures can be placed anywhere in the 32b space + * except at 0. + * + * + * HSA64 Apertures for FLAT_* vector ops + * + * For HSA64 SUA mode, the Shared and Private apertures are always placed + * in the hole w/ a limited selection of possible locations. The requests + * that fall in the private aperture are expanded as a function of the + * work-item id (tid) and redirected to the location of the + * “hidden private memory”. The hidden private can be placed in either GPUVM + * or ATC space. The addresses that fall in the shared aperture are + * re-directed to the on-chip LDS memory hardware. + * + * + * HSA32 Apertures for FLAT_* vector ops + * + * In HSA32 mode, the Private and Shared apertures can be placed anywhere + * in the 32b space except at 0 (Private or Shared Base at zero disables + * the apertures). If the base address of the apertures are non-zero + * (ie apertures exists), the size is always 64KB. + * + * + * GPUVM Apertures for FLAT_* vector ops + * + * In GPUVM mode, the Shared/Private apertures are specified identically + * to HSA64 mode where they are always in the hole at a limited selection + * of locations. + * + * + * Aperture Definitions for SUA and DUA + * + * The interpretation of the aperture register definitions for a given + * VMID is a function of the “SUA Mode” which is one of HSA64, HSA32, or + * GPUVM64 discussed in previous sections. The mode is first decoded, and + * then the remaining register decode is a function of the mode. + * + * + * SUA Mode Decode + * + * For the S_LOAD and FLAT_* shader operations, the SUA mode is decoded from + * the COMPUTE_DISPATCH_INITIATOR:DATA_ATC bit and + * the SH_MEM_CONFIG:PTR32 bits. + * + * COMPUTE_DISPATCH_INITIATOR:DATA_ATC SH_MEM_CONFIG:PTR32 Mode + * + * 1 0 HSA64 + * + * 1 1 HSA32 + * + * 0 X GPUVM64 + * + * In general the hardware will ignore the PTR32 bit and treat + * as “0” whenever DATA_ATC = “0”, but sw should set PTR32=0 + * when DATA_ATC=0. + * + * The DATA_ATC bit is only set for compute dispatches. + * All “Draw” dispatches are hardcoded to GPUVM64 mode + * for FLAT_* / S_LOAD operations. 
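+ *
+ * Putting the aperture encoding rules above into concrete numbers
+ * (register values invented purely for illustration): a Base register of
+ * 0x2001000000000000 and a Limit register of 0x200100FFFFFF0000 decode to
+ * Base = 0x2001000000000000 and Limit = 0x200100FFFFFFFFFF, i.e. an
+ * inclusive, 64KB-aligned aperture of 2^40 bytes.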
+ */ + +#define MAKE_GPUVM_APP_BASE_VI(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) + +#define MAKE_GPUVM_APP_LIMIT(base, size) \ + (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) + +#define MAKE_SCRATCH_APP_BASE_VI() \ + (((uint64_t)(0x1UL) << 61) + 0x100000000L) + +#define MAKE_SCRATCH_APP_LIMIT(base) \ + (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + +#define MAKE_LDS_APP_BASE_VI() \ + (((uint64_t)(0x1UL) << 61) + 0x0) +#define MAKE_LDS_APP_LIMIT(base) \ + (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + +/* On GFXv9 the LDS and scratch apertures are programmed independently + * using the high 16 bits of the 64-bit virtual address. They must be + * in the hole, which will be the case as long as the high 16 bits are + * not 0. + * + * The aperture sizes are still 4GB implicitly. + * + * A GPUVM aperture is not applicable on GFXv9. + */ +#define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48) +#define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48) + +/* User mode manages most of the SVM aperture address space. The low + * 16MB are reserved for kernel use (CWSR trap handler and kernel IB + * for now). + */ +#define SVM_USER_BASE 0x1000000ull +#define SVM_CWSR_BASE (SVM_USER_BASE - KFD_CWSR_TBA_TMA_SIZE) +#define SVM_IB_BASE (SVM_CWSR_BASE - PAGE_SIZE) + +static void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id) +{ + /* + * node id couldn't be 0 - the three MSB bits of + * aperture shoudn't be 0 + */ + pdd->lds_base = MAKE_LDS_APP_BASE_VI(); + pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); + + if (!pdd->dev->device_info->needs_iommu_device) { + /* dGPUs: SVM aperture starting at 0 + * with small reserved space for kernel. + * Set them to CANONICAL addresses. + */ + pdd->gpuvm_base = SVM_USER_BASE; + pdd->gpuvm_limit = + pdd->dev->shared_resources.gpuvm_size - 1; + } else { + /* set them to non CANONICAL addresses, and no SVM is + * allocated. + */ + pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1); + pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base, + pdd->dev->shared_resources.gpuvm_size); + } + + pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI(); + pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); +} + +static void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id) +{ + pdd->lds_base = MAKE_LDS_APP_BASE_V9(); + pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); + + /* Raven needs SVM to support graphic handle, etc. Leave the small + * reserved space before SVM on Raven as well, even though we don't + * have to. + * Set gpuvm_base and gpuvm_limit to CANONICAL addresses so that they + * are used in Thunk to reserve SVM. 
+ */ + pdd->gpuvm_base = SVM_USER_BASE; + pdd->gpuvm_limit = + pdd->dev->shared_resources.gpuvm_size - 1; + + pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9(); + pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); +} + +int kfd_init_apertures(struct kfd_process *process) +{ + uint8_t id = 0; + struct kfd_dev *dev; + struct kfd_process_device *pdd; + + /*Iterating over all devices*/ + while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { + if (!dev) { + id++; /* Skip non GPU devices */ + continue; + } + + pdd = kfd_create_process_device_data(dev, process); + if (!pdd) { + pr_err("Failed to create process device data\n"); + return -ENOMEM; + } + /* + * For 64 bit process apertures will be statically reserved in + * the x86_64 non canonical process address space + * amdkfd doesn't currently support apertures for 32 bit process + */ + if (process->is_32bit_user_mode) { + pdd->lds_base = pdd->lds_limit = 0; + pdd->gpuvm_base = pdd->gpuvm_limit = 0; + pdd->scratch_base = pdd->scratch_limit = 0; + } else { + switch (dev->device_info->asic_family) { + case CHIP_KAVERI: + case CHIP_HAWAII: + case CHIP_CARRIZO: + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + kfd_init_apertures_vi(pdd, id); + break; + case CHIP_VEGA10: + case CHIP_RAVEN: + kfd_init_apertures_v9(pdd, id); + break; + default: + WARN(1, "Unexpected ASIC family %u", + dev->device_info->asic_family); + return -EINVAL; + } + + if (!dev->device_info->needs_iommu_device) { + /* dGPUs: the reserved space for kernel + * before SVM + */ + pdd->qpd.cwsr_base = SVM_CWSR_BASE; + pdd->qpd.ib_base = SVM_IB_BASE; + } + } + + dev_dbg(kfd_device, "node id %u\n", id); + dev_dbg(kfd_device, "gpu id %u\n", pdd->dev->id); + dev_dbg(kfd_device, "lds_base %llX\n", pdd->lds_base); + dev_dbg(kfd_device, "lds_limit %llX\n", pdd->lds_limit); + dev_dbg(kfd_device, "gpuvm_base %llX\n", pdd->gpuvm_base); + dev_dbg(kfd_device, "gpuvm_limit %llX\n", pdd->gpuvm_limit); + dev_dbg(kfd_device, "scratch_base %llX\n", pdd->scratch_base); + dev_dbg(kfd_device, "scratch_limit %llX\n", pdd->scratch_limit); + + id++; + } + + return 0; +} + + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c new file mode 100644 index 000000000..f836897bb --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -0,0 +1,108 @@ +/* + * Copyright 2016-2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "kfd_priv.h" +#include "kfd_events.h" +#include "soc15_int.h" + + +static bool event_interrupt_isr_v9(struct kfd_dev *dev, + const uint32_t *ih_ring_entry, + uint32_t *patched_ihre, + bool *patched_flag) +{ + uint16_t source_id, client_id, pasid, vmid; + const uint32_t *data = ih_ring_entry; + + /* Only handle interrupts from KFD VMIDs */ + vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); + if (vmid < dev->vm_info.first_vmid_kfd || + vmid > dev->vm_info.last_vmid_kfd) + return 0; + + /* If there is no valid PASID, it's likely a firmware bug */ + pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); + if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt")) + return 0; + + source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); + client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); + + pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n", + client_id, source_id, pasid); + pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", + data[0], data[1], data[2], data[3], + data[4], data[5], data[6], data[7]); + + /* Interrupt types we care about: various signals and faults. + * They will be forwarded to a work queue (see below). + */ + return source_id == SOC15_INTSRC_CP_END_OF_PIPE || + source_id == SOC15_INTSRC_SDMA_TRAP || + source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || + source_id == SOC15_INTSRC_CP_BAD_OPCODE || + client_id == SOC15_IH_CLIENTID_VMC || + client_id == SOC15_IH_CLIENTID_UTCL2; +} + +static void event_interrupt_wq_v9(struct kfd_dev *dev, + const uint32_t *ih_ring_entry) +{ + uint16_t source_id, client_id, pasid, vmid; + uint32_t context_id; + + source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); + client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); + pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); + vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); + context_id = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry); + + if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) + kfd_signal_event_interrupt(pasid, context_id, 32); + else if (source_id == SOC15_INTSRC_SDMA_TRAP) + kfd_signal_event_interrupt(pasid, context_id & 0xfffffff, 28); + else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) + kfd_signal_event_interrupt(pasid, context_id & 0xffffff, 24); + else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) + kfd_signal_hw_exception_event(pasid); + else if (client_id == SOC15_IH_CLIENTID_VMC || + client_id == SOC15_IH_CLIENTID_UTCL2) { + struct kfd_vm_fault_info info = {0}; + uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); + + info.vmid = vmid; + info.mc_id = client_id; + info.page_addr = ih_ring_entry[4] | + (uint64_t)(ih_ring_entry[5] & 0xf) << 32; + info.prot_valid = ring_id & 0x08; + info.prot_read = ring_id & 0x10; + info.prot_write = ring_id & 0x20; + + kfd_process_vm_fault(dev->dqm, pasid); + kfd_signal_vm_fault_event(dev, pasid, &info); + } +} + +const struct kfd_event_interrupt_class event_interrupt_class_v9 = { + .interrupt_isr = event_interrupt_isr_v9, + .interrupt_wq = event_interrupt_wq_v9, +}; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c new file mode 100644 index 000000000..bc47f6a44 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -0,0 +1,170 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * KFD Interrupts. + * + * AMD GPUs deliver interrupts by pushing an interrupt description onto the + * interrupt ring and then sending an interrupt. KGD receives the interrupt + * in ISR and sends us a pointer to each new entry on the interrupt ring. + * + * We generally can't process interrupt-signaled events from ISR, so we call + * out to each interrupt client module (currently only the scheduler) to ask if + * each interrupt is interesting. If they return true, then it requires further + * processing so we copy it to an internal interrupt ring and call each + * interrupt client again from a work-queue. + * + * There's no acknowledgment for the interrupts we use. The hardware simply + * queues a new interrupt each time without waiting. + * + * The fixed-size internal queue means that it's possible for us to lose + * interrupts because we have no back-pressure to the hardware. + */ + +#include <linux/slab.h> +#include <linux/device.h> +#include <linux/kfifo.h> +#include "kfd_priv.h" + +#define KFD_IH_NUM_ENTRIES 8192 + +static void interrupt_wq(struct work_struct *); + +int kfd_interrupt_init(struct kfd_dev *kfd) +{ + int r; + + r = kfifo_alloc(&kfd->ih_fifo, + KFD_IH_NUM_ENTRIES * kfd->device_info->ih_ring_entry_size, + GFP_KERNEL); + if (r) { + dev_err(kfd_chardev(), "Failed to allocate IH fifo\n"); + return r; + } + + kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); + if (unlikely(!kfd->ih_wq)) { + kfifo_free(&kfd->ih_fifo); + dev_err(kfd_chardev(), "Failed to allocate KFD IH workqueue\n"); + return -ENOMEM; + } + spin_lock_init(&kfd->interrupt_lock); + + INIT_WORK(&kfd->interrupt_work, interrupt_wq); + + kfd->interrupts_active = true; + + /* + * After this function returns, the interrupt will be enabled. This + * barrier ensures that the interrupt running on a different processor + * sees all the above writes. + */ + smp_wmb(); + + return 0; +} + +void kfd_interrupt_exit(struct kfd_dev *kfd) +{ + /* + * Stop the interrupt handler from writing to the ring and scheduling + * workqueue items. The spinlock ensures that any interrupt running + * after we have unlocked sees interrupts_active = false. + */ + unsigned long flags; + + spin_lock_irqsave(&kfd->interrupt_lock, flags); + kfd->interrupts_active = false; + spin_unlock_irqrestore(&kfd->interrupt_lock, flags); + + /* + * flush_work ensures that there are no outstanding + * work-queue items that will access interrupt_ring. 
New work items + * can't be created because we stopped interrupt handling above. + */ + flush_workqueue(kfd->ih_wq); + + kfifo_free(&kfd->ih_fifo); +} + +/* + * Assumption: single reader/writer. This function is not re-entrant + */ +bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry) +{ + int count; + + count = kfifo_in(&kfd->ih_fifo, ih_ring_entry, + kfd->device_info->ih_ring_entry_size); + if (count != kfd->device_info->ih_ring_entry_size) { + dev_err_ratelimited(kfd_chardev(), + "Interrupt ring overflow, dropping interrupt %d\n", + count); + return false; + } + + return true; +} + +/* + * Assumption: single reader/writer. This function is not re-entrant + */ +static bool dequeue_ih_ring_entry(struct kfd_dev *kfd, void *ih_ring_entry) +{ + int count; + + count = kfifo_out(&kfd->ih_fifo, ih_ring_entry, + kfd->device_info->ih_ring_entry_size); + + WARN_ON(count && count != kfd->device_info->ih_ring_entry_size); + + return count == kfd->device_info->ih_ring_entry_size; +} + +static void interrupt_wq(struct work_struct *work) +{ + struct kfd_dev *dev = container_of(work, struct kfd_dev, + interrupt_work); + uint32_t ih_ring_entry[KFD_MAX_RING_ENTRY_SIZE]; + + if (dev->device_info->ih_ring_entry_size > sizeof(ih_ring_entry)) { + dev_err_once(kfd_chardev(), "Ring entry too small\n"); + return; + } + + while (dequeue_ih_ring_entry(dev, ih_ring_entry)) + dev->device_info->event_interrupt_class->interrupt_wq(dev, + ih_ring_entry); +} + +bool interrupt_is_wanted(struct kfd_dev *dev, + const uint32_t *ih_ring_entry, + uint32_t *patched_ihre, bool *flag) +{ + /* integer and bitwise OR so there is no boolean short-circuiting */ + unsigned int wanted = 0; + + wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, + ih_ring_entry, patched_ihre, flag); + + return wanted != 0; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c new file mode 100644 index 000000000..f3a526ed8 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c @@ -0,0 +1,374 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <linux/kconfig.h> + +#if IS_REACHABLE(CONFIG_AMD_IOMMU_V2) + +#include <linux/printk.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/amd-iommu.h> +#include "kfd_priv.h" +#include "kfd_dbgmgr.h" +#include "kfd_topology.h" +#include "kfd_iommu.h" + +static const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | + AMD_IOMMU_DEVICE_FLAG_PRI_SUP | + AMD_IOMMU_DEVICE_FLAG_PASID_SUP; + +/** kfd_iommu_check_device - Check whether IOMMU is available for device + */ +int kfd_iommu_check_device(struct kfd_dev *kfd) +{ + struct amd_iommu_device_info iommu_info; + int err; + + if (!kfd->device_info->needs_iommu_device) + return -ENODEV; + + iommu_info.flags = 0; + err = amd_iommu_device_info(kfd->pdev, &iommu_info); + if (err) + return err; + + if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) + return -ENODEV; + + return 0; +} + +/** kfd_iommu_device_init - Initialize IOMMU for device + */ +int kfd_iommu_device_init(struct kfd_dev *kfd) +{ + struct amd_iommu_device_info iommu_info; + unsigned int pasid_limit; + int err; + struct kfd_topology_device *top_dev; + + top_dev = kfd_topology_device_by_id(kfd->id); + + /* + * Overwrite ATS capability according to needs_iommu_device to fix + * potential missing corresponding bit in CRAT of BIOS. + */ + if (!kfd->device_info->needs_iommu_device) { + top_dev->node_props.capability &= ~HSA_CAP_ATS_PRESENT; + return 0; + } + + top_dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + + iommu_info.flags = 0; + err = amd_iommu_device_info(kfd->pdev, &iommu_info); + if (err < 0) { + dev_err(kfd_device, + "error getting iommu info. is the iommu enabled?\n"); + return -ENODEV; + } + + if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) { + dev_err(kfd_device, + "error required iommu flags ats %i, pri %i, pasid %i\n", + (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP) != 0, + (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) != 0, + (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) + != 0); + return -ENODEV; + } + + pasid_limit = min_t(unsigned int, + (unsigned int)(1 << kfd->device_info->max_pasid_bits), + iommu_info.max_pasids); + + if (!kfd_set_pasid_limit(pasid_limit)) { + dev_err(kfd_device, "error setting pasid limit\n"); + return -EBUSY; + } + + return 0; +} + +/** kfd_iommu_bind_process_to_device - Have the IOMMU bind a process + * + * Binds the given process to the given device using its PASID. This + * enables IOMMUv2 address translation for the process on the device. + * + * This function assumes that the process mutex is held. + */ +int kfd_iommu_bind_process_to_device(struct kfd_process_device *pdd) +{ + struct kfd_dev *dev = pdd->dev; + struct kfd_process *p = pdd->process; + int err; + + if (!dev->device_info->needs_iommu_device || pdd->bound == PDD_BOUND) + return 0; + + if (unlikely(pdd->bound == PDD_BOUND_SUSPENDED)) { + pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n"); + return -EINVAL; + } + + err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); + if (!err) + pdd->bound = PDD_BOUND; + + return err; +} + +/** kfd_iommu_unbind_process - Unbind process from all devices + * + * This removes all IOMMU device bindings of the process. To be used + * before process termination. 
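+ * A process may be bound to more than one IOMMUv2-capable device; each
+ * per-device binding is released in turn.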
+ */ +void kfd_iommu_unbind_process(struct kfd_process *p) +{ + struct kfd_process_device *pdd; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) + if (pdd->bound == PDD_BOUND) + amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); +} + +/* Callback for process shutdown invoked by the IOMMU driver */ +static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) +{ + struct kfd_dev *dev = kfd_device_by_pci_dev(pdev); + struct kfd_process *p; + struct kfd_process_device *pdd; + + if (!dev) + return; + + /* + * Look for the process that matches the pasid. If there is no such + * process, we either released it in amdkfd's own notifier, or there + * is a bug. Unfortunately, there is no way to tell... + */ + p = kfd_lookup_process_by_pasid(pasid); + if (!p) + return; + + pr_debug("Unbinding process %d from IOMMU\n", pasid); + + mutex_lock(kfd_get_dbgmgr_mutex()); + + if (dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) { + if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) { + kfd_dbgmgr_destroy(dev->dbgmgr); + dev->dbgmgr = NULL; + } + } + + mutex_unlock(kfd_get_dbgmgr_mutex()); + + mutex_lock(&p->mutex); + + pdd = kfd_get_process_device_data(dev, p); + if (pdd) + /* For GPU relying on IOMMU, we need to dequeue here + * when PASID is still bound. + */ + kfd_process_dequeue_from_device(pdd); + + mutex_unlock(&p->mutex); + + kfd_unref_process(p); +} + +/* This function called by IOMMU driver on PPR failure */ +static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, + unsigned long address, u16 flags) +{ + struct kfd_dev *dev; + + dev_warn_ratelimited(kfd_device, + "Invalid PPR device %x:%x.%x pasid %d address 0x%lX flags 0x%X", + PCI_BUS_NUM(pdev->devfn), + PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn), + pasid, + address, + flags); + + dev = kfd_device_by_pci_dev(pdev); + if (!WARN_ON(!dev)) + kfd_signal_iommu_event(dev, pasid, address, + flags & PPR_FAULT_WRITE, flags & PPR_FAULT_EXEC); + + return AMD_IOMMU_INV_PRI_RSP_INVALID; +} + +/* + * Bind processes do the device that have been temporarily unbound + * (PDD_BOUND_SUSPENDED) in kfd_unbind_processes_from_device. + */ +static int kfd_bind_processes_to_device(struct kfd_dev *kfd) +{ + struct kfd_process_device *pdd; + struct kfd_process *p; + unsigned int temp; + int err = 0; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + mutex_lock(&p->mutex); + pdd = kfd_get_process_device_data(kfd, p); + + if (WARN_ON(!pdd) || pdd->bound != PDD_BOUND_SUSPENDED) { + mutex_unlock(&p->mutex); + continue; + } + + err = amd_iommu_bind_pasid(kfd->pdev, p->pasid, + p->lead_thread); + if (err < 0) { + pr_err("Unexpected pasid %d binding failure\n", + p->pasid); + mutex_unlock(&p->mutex); + break; + } + + pdd->bound = PDD_BOUND; + mutex_unlock(&p->mutex); + } + + srcu_read_unlock(&kfd_processes_srcu, idx); + + return err; +} + +/* + * Mark currently bound processes as PDD_BOUND_SUSPENDED. These + * processes will be restored to PDD_BOUND state in + * kfd_bind_processes_to_device. 
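+ * Together with kfd_iommu_suspend()/kfd_iommu_resume() below this gives
+ * the sequence: suspend marks bound processes PDD_BOUND_SUSPENDED, and
+ * resume re-binds their PASIDs and returns them to PDD_BOUND.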
+ */ +static void kfd_unbind_processes_from_device(struct kfd_dev *kfd) +{ + struct kfd_process_device *pdd; + struct kfd_process *p; + unsigned int temp; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + mutex_lock(&p->mutex); + pdd = kfd_get_process_device_data(kfd, p); + + if (WARN_ON(!pdd)) { + mutex_unlock(&p->mutex); + continue; + } + + if (pdd->bound == PDD_BOUND) + pdd->bound = PDD_BOUND_SUSPENDED; + mutex_unlock(&p->mutex); + } + + srcu_read_unlock(&kfd_processes_srcu, idx); +} + +/** kfd_iommu_suspend - Prepare IOMMU for suspend + * + * This unbinds processes from the device and disables the IOMMU for + * the device. + */ +void kfd_iommu_suspend(struct kfd_dev *kfd) +{ + if (!kfd->device_info->needs_iommu_device) + return; + + kfd_unbind_processes_from_device(kfd); + + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); + amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); + amd_iommu_free_device(kfd->pdev); +} + +/** kfd_iommu_resume - Restore IOMMU after resume + * + * This reinitializes the IOMMU for the device and re-binds previously + * suspended processes to the device. + */ +int kfd_iommu_resume(struct kfd_dev *kfd) +{ + unsigned int pasid_limit; + int err; + + if (!kfd->device_info->needs_iommu_device) + return 0; + + pasid_limit = kfd_get_pasid_limit(); + + err = amd_iommu_init_device(kfd->pdev, pasid_limit); + if (err) + return -ENXIO; + + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, + iommu_pasid_shutdown_callback); + amd_iommu_set_invalid_ppr_cb(kfd->pdev, + iommu_invalid_ppr_cb); + + err = kfd_bind_processes_to_device(kfd); + if (err) { + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); + amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); + amd_iommu_free_device(kfd->pdev); + return err; + } + + return 0; +} + +extern bool amd_iommu_pc_supported(void); +extern u8 amd_iommu_pc_get_max_banks(u16 devid); +extern u8 amd_iommu_pc_get_max_counters(u16 devid); + +/** kfd_iommu_add_perf_counters - Add IOMMU performance counters to topology + */ +int kfd_iommu_add_perf_counters(struct kfd_topology_device *kdev) +{ + struct kfd_perf_properties *props; + + if (!(kdev->node_props.capability & HSA_CAP_ATS_PRESENT)) + return 0; + + if (!amd_iommu_pc_supported()) + return 0; + + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + strcpy(props->block_name, "iommu"); + props->max_concurrent = amd_iommu_pc_get_max_banks(0) * + amd_iommu_pc_get_max_counters(0); /* assume one iommu */ + list_add_tail(&props->list, &kdev->perf_props); + + return 0; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.h b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.h new file mode 100644 index 000000000..afd420b01 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.h @@ -0,0 +1,83 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef __KFD_IOMMU_H__ +#define __KFD_IOMMU_H__ + +#include <linux/kconfig.h> + +#if IS_REACHABLE(CONFIG_AMD_IOMMU_V2) + +#define KFD_SUPPORT_IOMMU_V2 + +int kfd_iommu_check_device(struct kfd_dev *kfd); +int kfd_iommu_device_init(struct kfd_dev *kfd); + +int kfd_iommu_bind_process_to_device(struct kfd_process_device *pdd); +void kfd_iommu_unbind_process(struct kfd_process *p); + +void kfd_iommu_suspend(struct kfd_dev *kfd); +int kfd_iommu_resume(struct kfd_dev *kfd); + +int kfd_iommu_add_perf_counters(struct kfd_topology_device *kdev); + +#else + +static inline int kfd_iommu_check_device(struct kfd_dev *kfd) +{ + return -ENODEV; +} +static inline int kfd_iommu_device_init(struct kfd_dev *kfd) +{ +#if IS_MODULE(CONFIG_AMD_IOMMU_V2) + WARN_ONCE(1, "iommu_v2 module is not usable by built-in KFD"); +#endif + return 0; +} + +static inline int kfd_iommu_bind_process_to_device( + struct kfd_process_device *pdd) +{ + return 0; +} +static inline void kfd_iommu_unbind_process(struct kfd_process *p) +{ + /* empty */ +} + +static inline void kfd_iommu_suspend(struct kfd_dev *kfd) +{ + /* empty */ +} +static inline int kfd_iommu_resume(struct kfd_dev *kfd) +{ + return 0; +} + +static inline int kfd_iommu_add_perf_counters(struct kfd_topology_device *kdev) +{ + return 0; +} + +#endif /* IS_REACHABLE(CONFIG_AMD_IOMMU_V2) */ + +#endif /* __KFD_IOMMU_H__ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c new file mode 100644 index 000000000..9f84b4d9f --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -0,0 +1,379 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include "kfd_kernel_queue.h" +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" +#include "kfd_pm4_headers.h" +#include "kfd_pm4_opcodes.h" + +#define PM4_COUNT_ZERO (((1 << 15) - 1) << 16) + +static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size) +{ + struct queue_properties prop; + int retval; + union PM4_MES_TYPE_3_HEADER nop; + + if (WARN_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ)) + return false; + + pr_debug("Initializing queue type %d size %d\n", KFD_QUEUE_TYPE_HIQ, + queue_size); + + memset(&prop, 0, sizeof(prop)); + memset(&nop, 0, sizeof(nop)); + + nop.opcode = IT_NOP; + nop.type = PM4_TYPE_3; + nop.u32all |= PM4_COUNT_ZERO; + + kq->dev = dev; + kq->nop_packet = nop.u32all; + switch (type) { + case KFD_QUEUE_TYPE_DIQ: + case KFD_QUEUE_TYPE_HIQ: + kq->mqd_mgr = dev->dqm->ops.get_mqd_manager(dev->dqm, + KFD_MQD_TYPE_HIQ); + break; + default: + pr_err("Invalid queue type %d\n", type); + return false; + } + + if (!kq->mqd_mgr) + return false; + + prop.doorbell_ptr = kfd_get_kernel_doorbell(dev, &prop.doorbell_off); + + if (!prop.doorbell_ptr) { + pr_err("Failed to initialize doorbell"); + goto err_get_kernel_doorbell; + } + + retval = kfd_gtt_sa_allocate(dev, queue_size, &kq->pq); + if (retval != 0) { + pr_err("Failed to init pq queues size %d\n", queue_size); + goto err_pq_allocate_vidmem; + } + + kq->pq_kernel_addr = kq->pq->cpu_ptr; + kq->pq_gpu_addr = kq->pq->gpu_addr; + + retval = kq->ops_asic_specific.initialize(kq, dev, type, queue_size); + if (!retval) + goto err_eop_allocate_vidmem; + + retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->rptr_kernel), + &kq->rptr_mem); + + if (retval != 0) + goto err_rptr_allocate_vidmem; + + kq->rptr_kernel = kq->rptr_mem->cpu_ptr; + kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; + + retval = kfd_gtt_sa_allocate(dev, dev->device_info->doorbell_size, + &kq->wptr_mem); + + if (retval != 0) + goto err_wptr_allocate_vidmem; + + kq->wptr_kernel = kq->wptr_mem->cpu_ptr; + kq->wptr_gpu_addr = kq->wptr_mem->gpu_addr; + + memset(kq->pq_kernel_addr, 0, queue_size); + memset(kq->rptr_kernel, 0, sizeof(*kq->rptr_kernel)); + memset(kq->wptr_kernel, 0, sizeof(*kq->wptr_kernel)); + + prop.queue_size = queue_size; + prop.is_interop = false; + prop.priority = 1; + prop.queue_percent = 100; + prop.type = type; + prop.vmid = 0; + prop.queue_address = kq->pq_gpu_addr; + prop.read_ptr = (uint32_t *) kq->rptr_gpu_addr; + prop.write_ptr = (uint32_t *) kq->wptr_gpu_addr; + prop.eop_ring_buffer_address = kq->eop_gpu_addr; + prop.eop_ring_buffer_size = PAGE_SIZE; + prop.cu_mask = NULL; + + if (init_queue(&kq->queue, &prop) != 0) + goto err_init_queue; + + kq->queue->device = dev; + kq->queue->process = kfd_get_process(current); + + retval = kq->mqd_mgr->init_mqd(kq->mqd_mgr, &kq->queue->mqd, + &kq->queue->mqd_mem_obj, + &kq->queue->gart_mqd_addr, + &kq->queue->properties); + if (retval != 0) + goto err_init_mqd; + + /* assign HIQ to HQD */ + if (type == KFD_QUEUE_TYPE_HIQ) { + pr_debug("Assigning hiq to hqd\n"); + kq->queue->pipe = KFD_CIK_HIQ_PIPE; + kq->queue->queue = KFD_CIK_HIQ_QUEUE; + kq->mqd_mgr->load_mqd(kq->mqd_mgr, kq->queue->mqd, + kq->queue->pipe, kq->queue->queue, + &kq->queue->properties, NULL); + } else { + /* allocate fence for DIQ */ + + retval = kfd_gtt_sa_allocate(dev, sizeof(uint32_t), + &kq->fence_mem_obj); + + if 
(retval != 0) + goto err_alloc_fence; + + kq->fence_kernel_address = kq->fence_mem_obj->cpu_ptr; + kq->fence_gpu_addr = kq->fence_mem_obj->gpu_addr; + } + + print_queue(kq->queue); + + return true; +err_alloc_fence: +err_init_mqd: + uninit_queue(kq->queue); +err_init_queue: + kfd_gtt_sa_free(dev, kq->wptr_mem); +err_wptr_allocate_vidmem: + kfd_gtt_sa_free(dev, kq->rptr_mem); +err_rptr_allocate_vidmem: + kfd_gtt_sa_free(dev, kq->eop_mem); +err_eop_allocate_vidmem: + kfd_gtt_sa_free(dev, kq->pq); +err_pq_allocate_vidmem: + kfd_release_kernel_doorbell(dev, prop.doorbell_ptr); +err_get_kernel_doorbell: + return false; + +} + +static void uninitialize(struct kernel_queue *kq) +{ + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) + kq->mqd_mgr->destroy_mqd(kq->mqd_mgr, + kq->queue->mqd, + KFD_PREEMPT_TYPE_WAVEFRONT_RESET, + KFD_UNMAP_LATENCY_MS, + kq->queue->pipe, + kq->queue->queue); + else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ) + kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj); + + kq->mqd_mgr->uninit_mqd(kq->mqd_mgr, kq->queue->mqd, + kq->queue->mqd_mem_obj); + + kfd_gtt_sa_free(kq->dev, kq->rptr_mem); + kfd_gtt_sa_free(kq->dev, kq->wptr_mem); + kq->ops_asic_specific.uninitialize(kq); + kfd_gtt_sa_free(kq->dev, kq->pq); + kfd_release_kernel_doorbell(kq->dev, + kq->queue->properties.doorbell_ptr); + uninit_queue(kq->queue); +} + +static int acquire_packet_buffer(struct kernel_queue *kq, + size_t packet_size_in_dwords, unsigned int **buffer_ptr) +{ + size_t available_size; + size_t queue_size_dwords; + uint32_t wptr, rptr; + uint64_t wptr64; + unsigned int *queue_address; + + /* When rptr == wptr, the buffer is empty. + * When rptr == wptr + 1, the buffer is full. + * It is always rptr that advances to the position of wptr, rather than + * the opposite. So we can only use up to queue_size_dwords - 1 dwords. + */ + rptr = *kq->rptr_kernel; + wptr = kq->pending_wptr; + wptr64 = kq->pending_wptr64; + queue_address = (unsigned int *)kq->pq_kernel_addr; + queue_size_dwords = kq->queue->properties.queue_size / 4; + + pr_debug("rptr: %d\n", rptr); + pr_debug("wptr: %d\n", wptr); + pr_debug("queue_address 0x%p\n", queue_address); + + available_size = (rptr + queue_size_dwords - 1 - wptr) % + queue_size_dwords; + + if (packet_size_in_dwords > available_size) { + /* + * make sure calling functions know + * acquire_packet_buffer() failed + */ + goto err_no_space; + } + + if (wptr + packet_size_in_dwords >= queue_size_dwords) { + /* make sure after rolling back to position 0, there is + * still enough space. 
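+		 * For example, with rptr == 8 a wrapped packet may occupy at
+		 * most dwords 0..6 (7 dwords); a packet of 8 or more dwords
+		 * would catch up with the reader, so we bail out instead.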
+ */ + if (packet_size_in_dwords >= rptr) + goto err_no_space; + + /* fill nops, roll back and start at position 0 */ + while (wptr > 0) { + queue_address[wptr] = kq->nop_packet; + wptr = (wptr + 1) % queue_size_dwords; + wptr64++; + } + } + + *buffer_ptr = &queue_address[wptr]; + kq->pending_wptr = wptr + packet_size_in_dwords; + kq->pending_wptr64 = wptr64 + packet_size_in_dwords; + + return 0; + +err_no_space: + *buffer_ptr = NULL; + return -ENOMEM; +} + +static void submit_packet(struct kernel_queue *kq) +{ +#ifdef DEBUG + int i; + + for (i = *kq->wptr_kernel; i < kq->pending_wptr; i++) { + pr_debug("0x%2X ", kq->pq_kernel_addr[i]); + if (i % 15 == 0) + pr_debug("\n"); + } + pr_debug("\n"); +#endif + + kq->ops_asic_specific.submit_packet(kq); +} + +static void rollback_packet(struct kernel_queue *kq) +{ + if (kq->dev->device_info->doorbell_size == 8) { + kq->pending_wptr64 = *kq->wptr64_kernel; + kq->pending_wptr = *kq->wptr_kernel % + (kq->queue->properties.queue_size / 4); + } else { + kq->pending_wptr = *kq->wptr_kernel; + } +} + +struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + enum kfd_queue_type type) +{ + struct kernel_queue *kq; + + kq = kzalloc(sizeof(*kq), GFP_KERNEL); + if (!kq) + return NULL; + + kq->ops.initialize = initialize; + kq->ops.uninitialize = uninitialize; + kq->ops.acquire_packet_buffer = acquire_packet_buffer; + kq->ops.submit_packet = submit_packet; + kq->ops.rollback_packet = rollback_packet; + + switch (dev->device_info->asic_family) { + case CHIP_CARRIZO: + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + kernel_queue_init_vi(&kq->ops_asic_specific); + break; + + case CHIP_KAVERI: + case CHIP_HAWAII: + kernel_queue_init_cik(&kq->ops_asic_specific); + break; + + case CHIP_VEGA10: + case CHIP_RAVEN: + kernel_queue_init_v9(&kq->ops_asic_specific); + break; + default: + WARN(1, "Unexpected ASIC family %u", + dev->device_info->asic_family); + goto out_free; + } + + if (kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) + return kq; + + pr_err("Failed to init kernel queue\n"); + +out_free: + kfree(kq); + return NULL; +} + +void kernel_queue_uninit(struct kernel_queue *kq) +{ + kq->ops.uninitialize(kq); + kfree(kq); +} + +/* FIXME: Can this test be removed? */ +static __attribute__((unused)) void test_kq(struct kfd_dev *dev) +{ + struct kernel_queue *kq; + uint32_t *buffer, i; + int retval; + + pr_err("Starting kernel queue test\n"); + + kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ); + if (unlikely(!kq)) { + pr_err(" Failed to initialize HIQ\n"); + pr_err("Kernel queue test failed\n"); + return; + } + + retval = kq->ops.acquire_packet_buffer(kq, 5, &buffer); + if (unlikely(retval != 0)) { + pr_err(" Failed to acquire packet buffer\n"); + pr_err("Kernel queue test failed\n"); + return; + } + for (i = 0; i < 5; i++) + buffer[i] = kq->nop_packet; + kq->ops.submit_packet(kq); + + pr_err("Ending kernel queue test\n"); +} + + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h new file mode 100644 index 000000000..a7116a939 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h @@ -0,0 +1,106 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_KERNEL_QUEUE_H_ +#define KFD_KERNEL_QUEUE_H_ + +#include <linux/list.h> +#include <linux/types.h> +#include "kfd_priv.h" + +/** + * struct kernel_queue_ops + * + * @initialize: Initialize a kernel queue, including allocations of GART memory + * needed for the queue. + * + * @uninitialize: Uninitialize a kernel queue and free all its memory usages. + * + * @acquire_packet_buffer: Returns a pointer to the location in the kernel + * queue ring buffer where the calling function can write its packet. It is + * guaranteed that there is enough space for that packet. It also updates the + * pending write pointer to that location so subsequent calls to + * acquire_packet_buffer will get a correct write pointer. + * + * @submit_packet: Update the write pointer and doorbell of a kernel queue. + * + * @sync_with_hw: Wait until the write pointer and the read pointer of a kernel + * queue are equal, which means the CP has read all the submitted packets. + * + * @rollback_packet: This routine is called if we failed to build an acquired + * packet for some reason.
It just overwrites the pending wptr with the current + * one + * + */ +struct kernel_queue_ops { + bool (*initialize)(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); + void (*uninitialize)(struct kernel_queue *kq); + int (*acquire_packet_buffer)(struct kernel_queue *kq, + size_t packet_size_in_dwords, + unsigned int **buffer_ptr); + + void (*submit_packet)(struct kernel_queue *kq); + void (*rollback_packet)(struct kernel_queue *kq); +}; + +struct kernel_queue { + struct kernel_queue_ops ops; + struct kernel_queue_ops ops_asic_specific; + + /* data */ + struct kfd_dev *dev; + struct mqd_manager *mqd_mgr; + struct queue *queue; + uint64_t pending_wptr64; + uint32_t pending_wptr; + unsigned int nop_packet; + + struct kfd_mem_obj *rptr_mem; + uint32_t *rptr_kernel; + uint64_t rptr_gpu_addr; + struct kfd_mem_obj *wptr_mem; + union { + uint64_t *wptr64_kernel; + uint32_t *wptr_kernel; + }; + uint64_t wptr_gpu_addr; + struct kfd_mem_obj *pq; + uint64_t pq_gpu_addr; + uint32_t *pq_kernel_addr; + struct kfd_mem_obj *eop_mem; + uint64_t eop_gpu_addr; + uint32_t *eop_kernel_addr; + + struct kfd_mem_obj *fence_mem_obj; + uint64_t fence_gpu_addr; + void *fence_kernel_address; + + struct list_head list; +}; + +void kernel_queue_init_cik(struct kernel_queue_ops *ops); +void kernel_queue_init_vi(struct kernel_queue_ops *ops); +void kernel_queue_init_v9(struct kernel_queue_ops *ops); + +#endif /* KFD_KERNEL_QUEUE_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c new file mode 100644 index 000000000..19e54acb4 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c @@ -0,0 +1,53 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include "kfd_kernel_queue.h" + +static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); +static void uninitialize_cik(struct kernel_queue *kq); +static void submit_packet_cik(struct kernel_queue *kq); + +void kernel_queue_init_cik(struct kernel_queue_ops *ops) +{ + ops->initialize = initialize_cik; + ops->uninitialize = uninitialize_cik; + ops->submit_packet = submit_packet_cik; +} + +static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size) +{ + return true; +} + +static void uninitialize_cik(struct kernel_queue *kq) +{ +} + +static void submit_packet_cik(struct kernel_queue *kq) +{ + *kq->wptr_kernel = kq->pending_wptr; + write_kernel_doorbell(kq->queue->properties.doorbell_ptr, + kq->pending_wptr); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c new file mode 100644 index 000000000..684a3bf07 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c @@ -0,0 +1,340 @@ +/* + * Copyright 2016-2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include "kfd_kernel_queue.h" +#include "kfd_device_queue_manager.h" +#include "kfd_pm4_headers_ai.h" +#include "kfd_pm4_opcodes.h" + +static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); +static void uninitialize_v9(struct kernel_queue *kq); +static void submit_packet_v9(struct kernel_queue *kq); + +void kernel_queue_init_v9(struct kernel_queue_ops *ops) +{ + ops->initialize = initialize_v9; + ops->uninitialize = uninitialize_v9; + ops->submit_packet = submit_packet_v9; +} + +static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size) +{ + int retval; + + retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); + if (retval) + return false; + + kq->eop_gpu_addr = kq->eop_mem->gpu_addr; + kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; + + memset(kq->eop_kernel_addr, 0, PAGE_SIZE); + + return true; +} + +static void uninitialize_v9(struct kernel_queue *kq) +{ + kfd_gtt_sa_free(kq->dev, kq->eop_mem); +} + +static void submit_packet_v9(struct kernel_queue *kq) +{ + *kq->wptr64_kernel = kq->pending_wptr64; + write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, + kq->pending_wptr64); +} + +static int pm_map_process_v9(struct packet_manager *pm, + uint32_t *buffer, struct qcm_process_device *qpd) +{ + struct pm4_mes_map_process *packet; + uint64_t vm_page_table_base_addr = + (uint64_t)(qpd->page_table_base) << 12; + + packet = (struct pm4_mes_map_process *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_map_process)); + + packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, + sizeof(struct pm4_mes_map_process)); + packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; + packet->bitfields2.process_quantum = 1; + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields14.gds_size = qpd->gds_size; + packet->bitfields14.num_gws = qpd->num_gws; + packet->bitfields14.num_oac = qpd->num_oac; + packet->bitfields14.sdma_enable = 1; + packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; + + packet->sh_mem_config = qpd->sh_mem_config; + packet->sh_mem_bases = qpd->sh_mem_bases; + packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8); + packet->sq_shader_tba_hi = upper_32_bits(qpd->tba_addr >> 8); + packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8); + packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8); + + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); + + packet->vm_context_page_table_base_addr_lo32 = + lower_32_bits(vm_page_table_base_addr); + packet->vm_context_page_table_base_addr_hi32 = + upper_32_bits(vm_page_table_base_addr); + + return 0; +} + +static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain) +{ + struct pm4_mes_runlist *packet; + + int concurrent_proc_cnt = 0; + struct kfd_dev *kfd = pm->dqm->dev; + + /* Determine the number of processes to map together to HW: + * it can not exceed the number of VMIDs available to the + * scheduler, and it is determined by the smaller of the number + * of processes in the runlist and kfd module parameter + * hws_max_conc_proc. + * Note: the arbitration between the number of VMIDs and + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). 
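+ * For example, with 16 processes in the runlist and max_proc_per_quantum + * of 8, the RUN_LIST packet built below reports a process_cnt of 8.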
+ */ + concurrent_proc_cnt = min(pm->dqm->processes_count, + kfd->max_proc_per_quantum); + + packet = (struct pm4_mes_runlist *)buffer; + + memset(buffer, 0, sizeof(struct pm4_mes_runlist)); + packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, + sizeof(struct pm4_mes_runlist)); + + packet->bitfields4.ib_size = ib_size_in_dwords; + packet->bitfields4.chain = chain ? 1 : 0; + packet->bitfields4.offload_polling = 0; + packet->bitfields4.valid = 1; + packet->bitfields4.process_cnt = concurrent_proc_cnt; + packet->ordinal2 = lower_32_bits(ib); + packet->ib_base_hi = upper_32_bits(ib); + + return 0; +} + +static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, + struct queue *q, bool is_static) +{ + struct pm4_mes_map_queues *packet; + bool use_static = is_static; + + packet = (struct pm4_mes_map_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); + + packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, + sizeof(struct pm4_mes_map_queues)); + packet->bitfields2.alloc_format = + alloc_format__mes_map_queues__one_per_pipe_vi; + packet->bitfields2.num_queues = 1; + packet->bitfields2.queue_sel = + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; + + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__compute_vi; + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_compute_vi; + + switch (q->properties.type) { + case KFD_QUEUE_TYPE_COMPUTE: + if (use_static) + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_latency_static_queue_vi; + break; + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.queue_type = + queue_type__mes_map_queues__debug_interface_queue_vi; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = q->properties.sdma_engine_id + + engine_sel__mes_map_queues__sdma0_vi; + use_static = false; /* no static queues under SDMA */ + break; + default: + WARN(1, "queue type %d", q->properties.type); + return -EINVAL; + } + packet->bitfields3.doorbell_offset = + q->properties.doorbell_off; + + packet->mqd_addr_lo = + lower_32_bits(q->gart_mqd_addr); + + packet->mqd_addr_hi = + upper_32_bits(q->gart_mqd_addr); + + packet->wptr_addr_lo = + lower_32_bits((uint64_t)q->properties.write_ptr); + + packet->wptr_addr_hi = + upper_32_bits((uint64_t)q->properties.write_ptr); + + return 0; +} + +static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer, + enum kfd_queue_type type, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) +{ + struct pm4_mes_unmap_queues *packet; + + packet = (struct pm4_mes_unmap_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); + + packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, + sizeof(struct pm4_mes_unmap_queues)); + switch (type) { + case KFD_QUEUE_TYPE_COMPUTE: + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__compute; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__sdma0 + sdma_engine; + break; + default: + WARN(1, "queue type %d", type); + return -EINVAL; + } + + if (reset) + packet->bitfields2.action = + action__mes_unmap_queues__reset_queues; + else + packet->bitfields2.action = + action__mes_unmap_queues__preempt_queues; + + switch (filter) { + case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_specified_queues; + packet->bitfields2.num_queues = 1; + 
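/* for a single-queue request, filter_param carries the doorbell offset + * of the queue to unmap + */ +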
packet->bitfields3b.doorbell_offset0 = filter_param; + break; + case KFD_UNMAP_QUEUES_FILTER_BY_PASID: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; + packet->bitfields3a.pasid = filter_param; + break; + case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__unmap_all_queues; + break; + case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: + /* in this case, we do not preempt static queues */ + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__unmap_all_non_static_queues; + break; + default: + WARN(1, "filter %d", filter); + return -EINVAL; + } + + return 0; + +} + +static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer, + uint64_t fence_address, uint32_t fence_value) +{ + struct pm4_mes_query_status *packet; + + packet = (struct pm4_mes_query_status *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_query_status)); + + + packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, + sizeof(struct pm4_mes_query_status)); + + packet->bitfields2.context_id = 0; + packet->bitfields2.interrupt_sel = + interrupt_sel__mes_query_status__completion_status; + packet->bitfields2.command = + command__mes_query_status__fence_only_after_write_ack; + + packet->addr_hi = upper_32_bits((uint64_t)fence_address); + packet->addr_lo = lower_32_bits((uint64_t)fence_address); + packet->data_hi = upper_32_bits((uint64_t)fence_value); + packet->data_lo = lower_32_bits((uint64_t)fence_value); + + return 0; +} + + +static int pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer) +{ + struct pm4_mec_release_mem *packet; + + packet = (struct pm4_mec_release_mem *)buffer; + memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); + + packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, + sizeof(struct pm4_mec_release_mem)); + + packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; + packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe; + packet->bitfields2.tcl1_action_ena = 1; + packet->bitfields2.tc_action_ena = 1; + packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru; + + packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low; + packet->bitfields3.int_sel = + int_sel__mec_release_mem__send_interrupt_after_write_confirm; + + packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; + packet->address_hi = upper_32_bits(gpu_addr); + + packet->data_lo = 0; + + return 0; +} + +const struct packet_manager_funcs kfd_v9_pm_funcs = { + .map_process = pm_map_process_v9, + .runlist = pm_runlist_v9, + .set_resources = pm_set_resources_vi, + .map_queues = pm_map_queues_v9, + .unmap_queues = pm_unmap_queues_v9, + .query_status = pm_query_status_v9, + .release_mem = pm_release_mem_v9, + .map_process_size = sizeof(struct pm4_mes_map_process), + .runlist_size = sizeof(struct pm4_mes_runlist), + .set_resources_size = sizeof(struct pm4_mes_set_resources), + .map_queues_size = sizeof(struct pm4_mes_map_queues), + .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), + .query_status_size = sizeof(struct pm4_mes_query_status), + .release_mem_size = sizeof(struct pm4_mec_release_mem) +}; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c new file mode 100644 index 000000000..bf20c6d32 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c @@ -0,0 +1,375 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_kernel_queue.h" +#include "kfd_device_queue_manager.h" +#include "kfd_pm4_headers_vi.h" +#include "kfd_pm4_opcodes.h" + +static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); +static void uninitialize_vi(struct kernel_queue *kq); +static void submit_packet_vi(struct kernel_queue *kq); + +void kernel_queue_init_vi(struct kernel_queue_ops *ops) +{ + ops->initialize = initialize_vi; + ops->uninitialize = uninitialize_vi; + ops->submit_packet = submit_packet_vi; +} + +static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size) +{ + int retval; + + retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); + if (retval != 0) + return false; + + kq->eop_gpu_addr = kq->eop_mem->gpu_addr; + kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; + + memset(kq->eop_kernel_addr, 0, PAGE_SIZE); + + return true; +} + +static void uninitialize_vi(struct kernel_queue *kq) +{ + kfd_gtt_sa_free(kq->dev, kq->eop_mem); +} + +static void submit_packet_vi(struct kernel_queue *kq) +{ + *kq->wptr_kernel = kq->pending_wptr; + write_kernel_doorbell(kq->queue->properties.doorbell_ptr, + kq->pending_wptr); +} + +unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size) +{ + union PM4_MES_TYPE_3_HEADER header; + + header.u32All = 0; + header.opcode = opcode; + header.count = packet_size / 4 - 2; + header.type = PM4_TYPE_3; + + return header.u32All; +} + +static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer, + struct qcm_process_device *qpd) +{ + struct pm4_mes_map_process *packet; + + packet = (struct pm4_mes_map_process *)buffer; + + memset(buffer, 0, sizeof(struct pm4_mes_map_process)); + + packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, + sizeof(struct pm4_mes_map_process)); + packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; + packet->bitfields2.process_quantum = 1; + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields3.page_table_base = qpd->page_table_base; + packet->bitfields10.gds_size = qpd->gds_size; + packet->bitfields10.num_gws = qpd->num_gws; + packet->bitfields10.num_oac = qpd->num_oac; + packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : qpd->queue_count; + + packet->sh_mem_config = qpd->sh_mem_config; + packet->sh_mem_bases = qpd->sh_mem_bases; + packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; + packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; + + packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; + + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); + + return 0; +} + +static int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain) +{ + struct pm4_mes_runlist *packet; + int concurrent_proc_cnt = 0; + struct kfd_dev *kfd = pm->dqm->dev; + + if (WARN_ON(!ib)) + return -EFAULT; + + /* Determine the number of processes to map together to HW: + * it can not exceed the number of VMIDs available to the + * scheduler, and it is determined by the smaller of the number + * of processes in the runlist and kfd module parameter + * hws_max_conc_proc. + * Note: the arbitration between the number of VMIDs and + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). + */ + concurrent_proc_cnt = min(pm->dqm->processes_count, + kfd->max_proc_per_quantum); + + packet = (struct pm4_mes_runlist *)buffer; + + memset(buffer, 0, sizeof(struct pm4_mes_runlist)); + packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, + sizeof(struct pm4_mes_runlist)); + + packet->bitfields4.ib_size = ib_size_in_dwords; + packet->bitfields4.chain = chain ? 1 : 0; + packet->bitfields4.offload_polling = 0; + packet->bitfields4.valid = 1; + packet->bitfields4.process_cnt = concurrent_proc_cnt; + packet->ordinal2 = lower_32_bits(ib); + packet->bitfields3.ib_base_hi = upper_32_bits(ib); + + return 0; +} + +int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, + struct scheduling_resources *res) +{ + struct pm4_mes_set_resources *packet; + + packet = (struct pm4_mes_set_resources *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_set_resources)); + + packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES, + sizeof(struct pm4_mes_set_resources)); + + packet->bitfields2.queue_type = + queue_type__mes_set_resources__hsa_interface_queue_hiq; + packet->bitfields2.vmid_mask = res->vmid_mask; + packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; + packet->bitfields7.oac_mask = res->oac_mask; + packet->bitfields8.gds_heap_base = res->gds_heap_base; + packet->bitfields8.gds_heap_size = res->gds_heap_size; + + packet->gws_mask_lo = lower_32_bits(res->gws_mask); + packet->gws_mask_hi = upper_32_bits(res->gws_mask); + + packet->queue_mask_lo = lower_32_bits(res->queue_mask); + packet->queue_mask_hi = upper_32_bits(res->queue_mask); + + return 0; +} + +static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, + struct queue *q, bool is_static) +{ + struct pm4_mes_map_queues *packet; + bool use_static = is_static; + + packet = (struct pm4_mes_map_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); + + packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, + sizeof(struct pm4_mes_map_queues)); + packet->bitfields2.alloc_format = + alloc_format__mes_map_queues__one_per_pipe_vi; + packet->bitfields2.num_queues = 1; + packet->bitfields2.queue_sel = + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; + + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__compute_vi; + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_compute_vi; + + switch (q->properties.type) { + case KFD_QUEUE_TYPE_COMPUTE: + if 
(use_static) + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_latency_static_queue_vi; + break; + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.queue_type = + queue_type__mes_map_queues__debug_interface_queue_vi; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = q->properties.sdma_engine_id + + engine_sel__mes_map_queues__sdma0_vi; + use_static = false; /* no static queues under SDMA */ + break; + default: + WARN(1, "queue type %d", q->properties.type); + return -EINVAL; + } + packet->bitfields3.doorbell_offset = + q->properties.doorbell_off; + + packet->mqd_addr_lo = + lower_32_bits(q->gart_mqd_addr); + + packet->mqd_addr_hi = + upper_32_bits(q->gart_mqd_addr); + + packet->wptr_addr_lo = + lower_32_bits((uint64_t)q->properties.write_ptr); + + packet->wptr_addr_hi = + upper_32_bits((uint64_t)q->properties.write_ptr); + + return 0; +} + +static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, + enum kfd_queue_type type, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) +{ + struct pm4_mes_unmap_queues *packet; + + packet = (struct pm4_mes_unmap_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); + + packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, + sizeof(struct pm4_mes_unmap_queues)); + switch (type) { + case KFD_QUEUE_TYPE_COMPUTE: + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__compute; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__sdma0 + sdma_engine; + break; + default: + WARN(1, "queue type %d", type); + return -EINVAL; + } + + if (reset) + packet->bitfields2.action = + action__mes_unmap_queues__reset_queues; + else + packet->bitfields2.action = + action__mes_unmap_queues__preempt_queues; + + switch (filter) { + case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_specified_queues; + packet->bitfields2.num_queues = 1; + packet->bitfields3b.doorbell_offset0 = filter_param; + break; + case KFD_UNMAP_QUEUES_FILTER_BY_PASID: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; + packet->bitfields3a.pasid = filter_param; + break; + case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__unmap_all_queues; + break; + case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: + /* in this case, we do not preempt static queues */ + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__unmap_all_non_static_queues; + break; + default: + WARN(1, "filter %d", filter); + return -EINVAL; + } + + return 0; + +} + +static int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, + uint64_t fence_address, uint32_t fence_value) +{ + struct pm4_mes_query_status *packet; + + packet = (struct pm4_mes_query_status *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_query_status)); + + packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, + sizeof(struct pm4_mes_query_status)); + + packet->bitfields2.context_id = 0; + packet->bitfields2.interrupt_sel = + interrupt_sel__mes_query_status__completion_status; + packet->bitfields2.command = + command__mes_query_status__fence_only_after_write_ack; + + packet->addr_hi = upper_32_bits((uint64_t)fence_address); + packet->addr_lo = lower_32_bits((uint64_t)fence_address); + packet->data_hi = upper_32_bits((uint64_t)fence_value); + packet->data_lo 
= lower_32_bits((uint64_t)fence_value); + + return 0; +} + +static int pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer) +{ + struct pm4_mec_release_mem *packet; + + packet = (struct pm4_mec_release_mem *)buffer; + memset(buffer, 0, sizeof(*packet)); + + packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, + sizeof(*packet)); + + packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; + packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; + packet->bitfields2.tcl1_action_ena = 1; + packet->bitfields2.tc_action_ena = 1; + packet->bitfields2.cache_policy = cache_policy___release_mem__lru; + packet->bitfields2.atc = 0; + + packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low; + packet->bitfields3.int_sel = + int_sel___release_mem__send_interrupt_after_write_confirm; + + packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; + packet->address_hi = upper_32_bits(gpu_addr); + + packet->data_lo = 0; + + return 0; +} + +const struct packet_manager_funcs kfd_vi_pm_funcs = { + .map_process = pm_map_process_vi, + .runlist = pm_runlist_vi, + .set_resources = pm_set_resources_vi, + .map_queues = pm_map_queues_vi, + .unmap_queues = pm_unmap_queues_vi, + .query_status = pm_query_status_vi, + .release_mem = pm_release_mem_vi, + .map_process_size = sizeof(struct pm4_mes_map_process), + .runlist_size = sizeof(struct pm4_mes_runlist), + .set_resources_size = sizeof(struct pm4_mes_set_resources), + .map_queues_size = sizeof(struct pm4_mes_map_queues), + .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), + .query_status_size = sizeof(struct pm4_mes_query_status), + .release_mem_size = sizeof(struct pm4_mec_release_mem) +}; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c new file mode 100644 index 000000000..6e1f5c7c2 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -0,0 +1,190 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/moduleparam.h> +#include <linux/device.h> +#include <linux/printk.h> +#include "kfd_priv.h" + +#define KFD_DRIVER_AUTHOR "AMD Inc. 
and others" + +#define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs" +#define KFD_DRIVER_DATE "20150421" +#define KFD_DRIVER_MAJOR 0 +#define KFD_DRIVER_MINOR 7 +#define KFD_DRIVER_PATCHLEVEL 2 + +static const struct kgd2kfd_calls kgd2kfd = { + .exit = kgd2kfd_exit, + .probe = kgd2kfd_probe, + .device_init = kgd2kfd_device_init, + .device_exit = kgd2kfd_device_exit, + .interrupt = kgd2kfd_interrupt, + .suspend = kgd2kfd_suspend, + .resume = kgd2kfd_resume, + .quiesce_mm = kgd2kfd_quiesce_mm, + .resume_mm = kgd2kfd_resume_mm, + .schedule_evict_and_restore_process = + kgd2kfd_schedule_evict_and_restore_process, + .pre_reset = kgd2kfd_pre_reset, + .post_reset = kgd2kfd_post_reset, +}; + +int sched_policy = KFD_SCHED_POLICY_HWS; +module_param(sched_policy, int, 0444); +MODULE_PARM_DESC(sched_policy, + "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); + +int hws_max_conc_proc = 8; +module_param(hws_max_conc_proc, int, 0444); +MODULE_PARM_DESC(hws_max_conc_proc, + "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))"); + +int cwsr_enable = 1; +module_param(cwsr_enable, int, 0444); +MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = off, 1 = on (default))"); + +int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; +module_param(max_num_of_queues_per_device, int, 0444); +MODULE_PARM_DESC(max_num_of_queues_per_device, + "Maximum number of supported queues per device (1 = Minimum, 4096 = default)"); + +int send_sigterm; +module_param(send_sigterm, int, 0444); +MODULE_PARM_DESC(send_sigterm, + "Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)"); + +int debug_largebar; +module_param(debug_largebar, int, 0444); +MODULE_PARM_DESC(debug_largebar, + "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); + +int ignore_crat; +module_param(ignore_crat, int, 0444); +MODULE_PARM_DESC(ignore_crat, + "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); + +int noretry; +module_param(noretry, int, 0644); +MODULE_PARM_DESC(noretry, + "Set sh_mem_config.retry_disable on GFXv9+ dGPUs (0 = retry enabled (default), 1 = retry disabled)"); + +int halt_if_hws_hang; +module_param(halt_if_hws_hang, int, 0644); +MODULE_PARM_DESC(halt_if_hws_hang, "Halt if HWS hang is detected (0 = off (default), 1 = on)"); + + +static int amdkfd_init_completed; + + +int kgd2kfd_init(unsigned int interface_version, + const struct kgd2kfd_calls **g2f) +{ + if (!amdkfd_init_completed) + return -EPROBE_DEFER; + + /* + * Only one interface version is supported, + * no kfd/kgd version skew allowed. 
+ */ + if (interface_version != KFD_INTERFACE_VERSION) + return -EINVAL; + + *g2f = &kgd2kfd; + + return 0; +} +EXPORT_SYMBOL(kgd2kfd_init); + +void kgd2kfd_exit(void) +{ +} + +static int __init kfd_module_init(void) +{ + int err; + + /* Verify module parameters */ + if ((sched_policy < KFD_SCHED_POLICY_HWS) || + (sched_policy > KFD_SCHED_POLICY_NO_HWS)) { + pr_err("sched_policy has invalid value\n"); + return -1; + } + + /* Verify module parameters */ + if ((max_num_of_queues_per_device < 1) || + (max_num_of_queues_per_device > + KFD_MAX_NUM_OF_QUEUES_PER_DEVICE)) { + pr_err("max_num_of_queues_per_device must be between 1 to KFD_MAX_NUM_OF_QUEUES_PER_DEVICE\n"); + return -1; + } + + err = kfd_chardev_init(); + if (err < 0) + goto err_ioctl; + + err = kfd_topology_init(); + if (err < 0) + goto err_topology; + + err = kfd_process_create_wq(); + if (err < 0) + goto err_create_wq; + + kfd_debugfs_init(); + + amdkfd_init_completed = 1; + + dev_info(kfd_device, "Initialized module\n"); + + return 0; + +err_create_wq: + kfd_topology_shutdown(); +err_topology: + kfd_chardev_exit(); +err_ioctl: + return err; +} + +static void __exit kfd_module_exit(void) +{ + amdkfd_init_completed = 0; + + kfd_debugfs_fini(); + kfd_process_destroy_wq(); + kfd_topology_shutdown(); + kfd_chardev_exit(); + pr_info("amdkfd: Removed module\n"); +} + +module_init(kfd_module_init); +module_exit(kfd_module_exit); + +MODULE_AUTHOR(KFD_DRIVER_AUTHOR); +MODULE_DESCRIPTION(KFD_DRIVER_DESC); +MODULE_LICENSE("GPL and additional rights"); +MODULE_VERSION(__stringify(KFD_DRIVER_MAJOR) "." + __stringify(KFD_DRIVER_MINOR) "." + __stringify(KFD_DRIVER_PATCHLEVEL)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c new file mode 100644 index 000000000..3bc25ab84 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -0,0 +1,89 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include "kfd_mqd_manager.h" + +struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + switch (dev->device_info->asic_family) { + case CHIP_KAVERI: + return mqd_manager_init_cik(type, dev); + case CHIP_HAWAII: + return mqd_manager_init_cik_hawaii(type, dev); + case CHIP_CARRIZO: + return mqd_manager_init_vi(type, dev); + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + return mqd_manager_init_vi_tonga(type, dev); + case CHIP_VEGA10: + case CHIP_RAVEN: + return mqd_manager_init_v9(type, dev); + default: + WARN(1, "Unexpected ASIC family %u", + dev->device_info->asic_family); + } + + return NULL; +} + +void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, + const uint32_t *cu_mask, uint32_t cu_mask_count, + uint32_t *se_mask) +{ + struct kfd_cu_info cu_info; + uint32_t cu_per_sh[4] = {0}; + int i, se, cu = 0; + + mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); + + if (cu_mask_count > cu_info.cu_active_number) + cu_mask_count = cu_info.cu_active_number; + + for (se = 0; se < cu_info.num_shader_engines; se++) + for (i = 0; i < 4; i++) + cu_per_sh[se] += hweight32(cu_info.cu_bitmap[se][i]); + + /* Symmetrically map cu_mask to all SEs: + * cu_mask[0] bit0 -> se_mask[0] bit0; + * cu_mask[0] bit1 -> se_mask[1] bit0; + * ... (if # SE is 4) + * cu_mask[0] bit4 -> se_mask[0] bit1; + * ... + */ + se = 0; + for (i = 0; i < cu_mask_count; i++) { + if (cu_mask[i / 32] & (1 << (i % 32))) + se_mask[se] |= 1 << cu; + + do { + se++; + if (se == cu_info.num_shader_engines) { + se = 0; + cu++; + } + } while (cu >= cu_per_sh[se] && cu < 32); + } +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h new file mode 100644 index 000000000..4e84052d4 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -0,0 +1,100 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_MQD_MANAGER_H_ +#define KFD_MQD_MANAGER_H_ + +#include "kfd_priv.h" + +/** + * struct mqd_manager + * + * @init_mqd: Allocates the mqd buffer on local gpu memory and initialize it. + * + * @load_mqd: Loads the mqd to a concrete hqd slot. Used only for no cp + * scheduling mode. + * + * @update_mqd: Handles a update call for the MQD + * + * @destroy_mqd: Destroys the HQD slot and by that preempt the relevant queue. + * Used only for no cp scheduling. 
+ * + * @uninit_mqd: Releases the mqd buffer from local gpu memory. + * + * @is_occupied: Checks if the relevant HQD slot is occupied. + * + * @mqd_mutex: Mqd manager mutex. + * + * @dev: The kfd device structure coupled with this module. + * + * MQD stands for Memory Queue Descriptor which represents the current queue + * state in the memory and initiate the HQD (Hardware Queue Descriptor) state. + * This structure is actually a base class for the different types of MQDs + * structures for the variant ASICs that should be supported in the future. + * This base class is also contains all the MQD specific operations. + * Another important thing to mention is that each queue has a MQD that keeps + * his state (or context) after each preemption or reassignment. + * Basically there are a instances of the mqd manager class per MQD type per + * ASIC. Currently the kfd driver supports only Kaveri so there are instances + * per KFD_MQD_TYPE for each device. + * + */ + +struct mqd_manager { + int (*init_mqd)(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q); + + int (*load_mqd)(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, + struct mm_struct *mms); + + int (*update_mqd)(struct mqd_manager *mm, void *mqd, + struct queue_properties *q); + + int (*destroy_mqd)(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id); + + void (*uninit_mqd)(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj); + + bool (*is_occupied)(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id); + +#if defined(CONFIG_DEBUG_FS) + int (*debugfs_show_mqd)(struct seq_file *m, void *data); +#endif + + struct mutex mqd_mutex; + struct kfd_dev *dev; +}; + +void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, + const uint32_t *cu_mask, uint32_t cu_mask_count, + uint32_t *se_mask); + +#endif /* KFD_MQD_MANAGER_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c new file mode 100644 index 000000000..ae90a9990 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -0,0 +1,448 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/mm_types.h> + +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "cik_regs.h" +#include "cik_structs.h" +#include "oss/oss_2_4_sh_mask.h" + +static inline struct cik_mqd *get_mqd(void *mqd) +{ + return (struct cik_mqd *)mqd; +} + +static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) +{ + return (struct cik_sdma_rlc_registers *)mqd; +} + +static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct cik_mqd *m; + uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ + + if (q->cu_mask_count == 0) + return; + + mqd_symmetrically_map_cu_mask(mm, + q->cu_mask, q->cu_mask_count, se_mask); + + m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = se_mask[0]; + m->compute_static_thread_mgmt_se1 = se_mask[1]; + m->compute_static_thread_mgmt_se2 = se_mask[2]; + m->compute_static_thread_mgmt_se3 = se_mask[3]; + + pr_debug("Update cu mask to %#x %#x %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3); +} + +static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + uint64_t addr; + struct cik_mqd *m; + int retval; + + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr; + addr = (*mqd_mem_obj)->gpu_addr; + + memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256)); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + /* + * Make sure to use the last queue state saved on mqd when the cp + * reassigns the queue, so when queue is switched on/off (e.g over + * subscription or quantum timeout) the context will be consistent + */ + m->cp_hqd_persistent_state = + DEFAULT_CP_HQD_PERSISTENT_STATE | PRELOAD_REQ; + + m->cp_mqd_control = MQD_CONTROL_PRIV_STATE_EN; + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | + QUANTUM_DURATION(10); + + /* + * Pipe Priority + * Identifies the pipe relative priority when this queue is connected + * to the pipeline. The pipe priority is against the GFX pipe and HP3D. + * In KFD we are using a fixed pipe priority set to CS_MEDIUM. 
+ * 0 = CS_LOW (typically below GFX) + * 1 = CS_MEDIUM (typically between HP3D and GFX + * 2 = CS_HIGH (typically above HP3D) + */ + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + + if (q->format == KFD_QUEUE_FORMAT_AQL) + m->cp_hqd_iq_rptr = AQL_ENABLE; + + *mqd = m; + if (gart_addr) + *gart_addr = addr; + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + struct cik_sdma_rlc_registers *m; + + retval = kfd_gtt_sa_allocate(mm->dev, + sizeof(struct cik_sdma_rlc_registers), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct cik_sdma_rlc_registers *) (*mqd_mem_obj)->cpu_ptr; + + memset(m, 0, sizeof(struct cik_sdma_rlc_registers)); + + *mqd = m; + if (gart_addr) + *gart_addr = (*mqd_mem_obj)->gpu_addr; + + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static void uninit_mqd(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, + uint32_t queue_id, struct queue_properties *p, + struct mm_struct *mms) +{ + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); + uint32_t wptr_mask = (uint32_t)((p->queue_size / 4) - 1); + + return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, + (uint32_t __user *)p->write_ptr, + wptr_shift, wptr_mask, mms); +} + +static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, + (uint32_t __user *)p->write_ptr, + mms); +} + +static int __update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, unsigned int atc_bit) +{ + struct cik_mqd *m; + + m = get_mqd(mqd); + m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | + DEFAULT_MIN_AVAIL_SIZE; + m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE; + if (atc_bit) { + m->cp_hqd_pq_control |= PQ_ATC_EN; + m->cp_hqd_ib_control |= IB_ATC_EN; + } + + /* + * Calculating queue size which is log base 2 of actual queue size -1 + * dwords and another -1 for ffs + */ + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_doorbell_control = DOORBELL_OFFSET(q->doorbell_off); + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) + m->cp_hqd_pq_control |= NO_UPDATE_RPTR; + + update_cu_mask(mm, mqd, q); + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + +static int update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + return __update_mqd(mm, mqd, q, 1); +} + +static int update_mqd_hawaii(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + return __update_mqd(mm, mqd, q, 0); +} + +static int update_mqd_sdma(struct 
mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct cik_sdma_rlc_registers *m; + + m = get_sdma_mqd(mqd); + m->sdma_rlc_rb_cntl = order_base_2(q->queue_size / 4) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + + m->sdma_rlc_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdma_rlc_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdma_rlc_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdma_rlc_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdma_rlc_doorbell = + q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; + + m->sdma_rlc_virtual_addr = q->sdma_vm_addr; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + +static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_destroy(mm->dev->kgd, mqd, type, timeout, + pipe_id, queue_id); +} + +/* + * preempt type here is ignored because there is only one way + * to preempt sdma queue + */ +static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +} + +static bool is_occupied(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + + return mm->dev->kfd2kgd->hqd_is_occupied(mm->dev->kgd, queue_address, + pipe_id, queue_id); + +} + +static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +} + +/* + * HIQ MQD Implementation, concrete implementation for HIQ MQD implementation. + * The HIQ queue in Kaveri is using the same MQD structure as all the user mode + * queues but with different initial values. 
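+ * Hence init_mqd_hiq() below simply reuses init_mqd(), and only + * update_mqd_hiq() differs, setting PRIV_STATE and KMD_QUEUE in + * cp_hqd_pq_control.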
+ */ + +static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + return init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); +} + +static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct cik_mqd *m; + + m = get_mqd(mqd); + m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | + DEFAULT_MIN_AVAIL_SIZE | + PRIV_STATE | + KMD_QUEUE; + + /* + * Calculating queue size which is log base 2 of actual queue + * size -1 dwords + */ + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_doorbell_control = DOORBELL_OFFSET(q->doorbell_off); + + m->cp_hqd_vmid = q->vmid; + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct cik_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct cik_sdma_rlc_registers), false); + return 0; +} + +#endif + + +struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case KFD_MQD_TYPE_CP: + case KFD_MQD_TYPE_COMPUTE: + mqd->init_mqd = init_mqd; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_HIQ: + mqd->init_mqd = init_mqd_hiq; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_SDMA: + mqd->init_mqd = init_mqd_sdma; + mqd->uninit_mqd = uninit_mqd_sdma; + mqd->load_mqd = load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} + +struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + mqd = mqd_manager_init_cik(type, dev); + if (!mqd) + return NULL; + if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE)) + mqd->update_mqd = update_mqd_hawaii; + return mqd; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c new file mode 100644 index 000000000..985bebde5 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -0,0 +1,473 @@ +/* + * Copyright 2016-2018 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "v9_structs.h" +#include "gc/gc_9_0_offset.h" +#include "gc/gc_9_0_sh_mask.h" +#include "sdma0/sdma0_4_0_sh_mask.h" + +static inline struct v9_mqd *get_mqd(void *mqd) +{ + return (struct v9_mqd *)mqd; +} + +static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct v9_sdma_mqd *)mqd; +} + +static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v9_mqd *m; + uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ + + if (q->cu_mask_count == 0) + return; + + mqd_symmetrically_map_cu_mask(mm, + q->cu_mask, q->cu_mask_count, se_mask); + + m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = se_mask[0]; + m->compute_static_thread_mgmt_se1 = se_mask[1]; + m->compute_static_thread_mgmt_se2 = se_mask[2]; + m->compute_static_thread_mgmt_se3 = se_mask[3]; + + pr_debug("update cu mask to %#x %#x %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3); +} + +static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + uint64_t addr; + struct v9_mqd *m; + struct kfd_dev *kfd = mm->dev; + + *mqd_mem_obj = NULL; + /* From V9, for CWSR, the control stack is located on the next page + * boundary after the mqd, we will use the gtt allocation function + * instead of sub-allocation function. 
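+ * The single GTT buffer allocated below therefore covers the control stack + * and the MQD, each rounded up to PAGE_SIZE.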
+ */ + if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { + *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + if (!*mqd_mem_obj) + return -ENOMEM; + retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd, + ALIGN(q->ctl_stack_size, PAGE_SIZE) + + ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), + &((*mqd_mem_obj)->gtt_mem), + &((*mqd_mem_obj)->gpu_addr), + (void *)&((*mqd_mem_obj)->cpu_ptr), true); + } else + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), + mqd_mem_obj); + if (retval) { + kfree(*mqd_mem_obj); + return -ENOMEM; + } + + m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr; + addr = (*mqd_mem_obj)->gpu_addr; + + memset(m, 0, sizeof(struct v9_mqd)); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | + 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; + + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | + 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_aql_control = + 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; + } + + if (q->tba_addr) { + m->compute_pgm_rsrc2 |= + (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); + } + + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; + m->cp_hqd_wg_state_offset = q->ctl_stack_size; + } + + *mqd = m; + if (gart_addr) + *gart_addr = addr; + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); + + return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, + (uint32_t __user *)p->write_ptr, + wptr_shift, 0, mms); +} + +static int update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v9_mqd *m; + + m = get_mqd(mqd); + + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); + + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + + m->cp_hqd_pq_doorbell_control = + q->doorbell_off << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; + pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", + m->cp_hqd_pq_doorbell_control); + + m->cp_hqd_ib_control = + 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT | + 1 << CP_HQD_IB_CONTROL__IB_EXE_DISABLE__SHIFT; + + /* + * HW does not clamp this field correctly. Maximum EOP queue size + * is constrained by per-SE EOP done signal count, which is 8-bit. + * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit + * more than (EOP entry count - 1) so a queue size of 0x800 dwords + * is safe, giving a maximum field value of 0xA. + */ + m->cp_hqd_eop_control = min(0xA, + order_base_2(q->eop_ring_buffer_size / 4) - 1); + m->cp_hqd_eop_base_addr_lo = + lower_32_bits(q->eop_ring_buffer_address >> 8); + m->cp_hqd_eop_base_addr_hi = + upper_32_bits(q->eop_ring_buffer_address >> 8); + + m->cp_hqd_iq_timer = 0; + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | + 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT | + 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT; + m->cp_hqd_pq_doorbell_control |= 1 << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; + } + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) + m->cp_hqd_ctx_save_control = 0; + + update_cu_mask(mm, mqd, q); + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + + +static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_destroy + (mm->dev->kgd, mqd, type, timeout, + pipe_id, queue_id); +} + +static void uninit_mqd(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + struct kfd_dev *kfd = mm->dev; + + if (mqd_mem_obj->gtt_mem) { + kfd->kfd2kgd->free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem); + kfree(mqd_mem_obj); + } else { + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); + } +} + +static bool is_occupied(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_is_occupied( + mm->dev->kgd, queue_address, + pipe_id, queue_id); +} + +static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v9_mqd *m; + int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); + + if (retval 
!= 0) + return retval; + + m = get_mqd(*mqd); + + m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | + 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; + + return retval; +} + +static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v9_mqd *m; + int retval = update_mqd(mm, mqd, q); + + if (retval != 0) + return retval; + + /* TODO: what's the point? update_mqd already does this. */ + m = get_mqd(mqd); + m->cp_hqd_vmid = q->vmid; + return retval; +} + +static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + struct v9_sdma_mqd *m; + + + retval = kfd_gtt_sa_allocate(mm->dev, + sizeof(struct v9_sdma_mqd), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; + + memset(m, 0, sizeof(struct v9_sdma_mqd)); + + *mqd = m; + if (gart_addr) + *gart_addr = (*mqd_mem_obj)->gpu_addr; + + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, + (uint32_t __user *)p->write_ptr, + mms); +} + +#define SDMA_RLC_DUMMY_DEFAULT 0xf + +static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v9_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + + m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_doorbell_offset = + q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + +/* + * * preempt type here is ignored because there is only one way + * * to preempt sdma queue + */ +static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +} + +static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v9_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v9_sdma_mqd), false); + return 0; +} + +#endif + +struct mqd_manager 
*mqd_manager_init_v9(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case KFD_MQD_TYPE_CP: + case KFD_MQD_TYPE_COMPUTE: + mqd->init_mqd = init_mqd; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_HIQ: + mqd->init_mqd = init_mqd_hiq; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_SDMA: + mqd->init_mqd = init_mqd_sdma; + mqd->uninit_mqd = uninit_mqd_sdma; + mqd->load_mqd = load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c new file mode 100644 index 000000000..b81fda375 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -0,0 +1,484 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/mm_types.h> + +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "vi_structs.h" +#include "gca/gfx_8_0_sh_mask.h" +#include "gca/gfx_8_0_enum.h" +#include "oss/oss_3_0_sh_mask.h" +#define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8 + +static inline struct vi_mqd *get_mqd(void *mqd) +{ + return (struct vi_mqd *)mqd; +} + +static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct vi_sdma_mqd *)mqd; +} + +static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct vi_mqd *m; + uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ + + if (q->cu_mask_count == 0) + return; + + mqd_symmetrically_map_cu_mask(mm, + q->cu_mask, q->cu_mask_count, se_mask); + + m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = se_mask[0]; + m->compute_static_thread_mgmt_se1 = se_mask[1]; + m->compute_static_thread_mgmt_se2 = se_mask[2]; + m->compute_static_thread_mgmt_se3 = se_mask[3]; + + pr_debug("Update cu mask to %#x %#x %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3); +} + +static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + uint64_t addr; + struct vi_mqd *m; + + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct vi_mqd), + mqd_mem_obj); + if (retval != 0) + return -ENOMEM; + + m = (struct vi_mqd *) (*mqd_mem_obj)->cpu_ptr; + addr = (*mqd_mem_obj)->gpu_addr; + + memset(m, 0, sizeof(struct vi_mqd)); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | + 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT | + MTYPE_UC << CP_MQD_CONTROL__MTYPE__SHIFT; + + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | + 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + + m->cp_hqd_eop_rptr = 1 << CP_HQD_EOP_RPTR__INIT_FETCHER__SHIFT; + + if (q->format == KFD_QUEUE_FORMAT_AQL) + m->cp_hqd_iq_rptr = 1; + + if (q->tba_addr) { + m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8); + m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8); + m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8); + m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8); + m->compute_pgm_rsrc2 |= + (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); + } + + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; + m->cp_hqd_wg_state_offset = q->ctl_stack_size; + } + + *mqd = m; + if (gart_addr) + *gart_addr = 
addr; + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); + uint32_t wptr_mask = (uint32_t)((p->queue_size / 4) - 1); + + return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, + (uint32_t __user *)p->write_ptr, + wptr_shift, wptr_mask, mms); +} + +static int __update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, unsigned int mtype, + unsigned int atc_bit) +{ + struct vi_mqd *m; + + m = get_mqd(mqd); + + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT | + atc_bit << CP_HQD_PQ_CONTROL__PQ_ATC__SHIFT | + mtype << CP_HQD_PQ_CONTROL__MTYPE__SHIFT; + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); + + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + + m->cp_hqd_pq_doorbell_control = + q->doorbell_off << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; + pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", + m->cp_hqd_pq_doorbell_control); + + m->cp_hqd_eop_control = atc_bit << CP_HQD_EOP_CONTROL__EOP_ATC__SHIFT | + mtype << CP_HQD_EOP_CONTROL__MTYPE__SHIFT; + + m->cp_hqd_ib_control = atc_bit << CP_HQD_IB_CONTROL__IB_ATC__SHIFT | + 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT | + mtype << CP_HQD_IB_CONTROL__MTYPE__SHIFT; + + /* + * HW does not clamp this field correctly. Maximum EOP queue size + * is constrained by per-SE EOP done signal count, which is 8-bit. + * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit + * more than (EOP entry count - 1) so a queue size of 0x800 dwords + * is safe, giving a maximum field value of 0xA. 
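+ * Worked example: an 8 KiB EOP buffer is 0x800 dwords and
+ * order_base_2(0x800) - 1 = 10 = 0xA, the largest safe encoding; any
+ * larger buffer would produce a bigger value and is clamped by the
+ * min() below.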
+ */ + m->cp_hqd_eop_control |= min(0xA, + order_base_2(q->eop_ring_buffer_size / 4) - 1); + m->cp_hqd_eop_base_addr_lo = + lower_32_bits(q->eop_ring_buffer_address >> 8); + m->cp_hqd_eop_base_addr_hi = + upper_32_bits(q->eop_ring_buffer_address >> 8); + + m->cp_hqd_iq_timer = atc_bit << CP_HQD_IQ_TIMER__IQ_ATC__SHIFT | + mtype << CP_HQD_IQ_TIMER__MTYPE__SHIFT; + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; + } + + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) + m->cp_hqd_ctx_save_control = + atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT | + mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; + + update_cu_mask(mm, mqd, q); + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + + +static int update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + return __update_mqd(mm, mqd, q, MTYPE_CC, 1); +} + +static int update_mqd_tonga(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + return __update_mqd(mm, mqd, q, MTYPE_UC, 0); +} + +static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_destroy + (mm->dev->kgd, mqd, type, timeout, + pipe_id, queue_id); +} + +static void uninit_mqd(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static bool is_occupied(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_is_occupied( + mm->dev->kgd, queue_address, + pipe_id, queue_id); +} + +static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct vi_mqd *m; + int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); + + if (retval != 0) + return retval; + + m = get_mqd(*mqd); + + m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | + 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; + + return retval; +} + +static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct vi_mqd *m; + int retval = __update_mqd(mm, mqd, q, MTYPE_UC, 0); + + if (retval != 0) + return retval; + + m = get_mqd(mqd); + m->cp_hqd_vmid = q->vmid; + return retval; +} + +static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + struct vi_sdma_mqd *m; + + + retval = kfd_gtt_sa_allocate(mm->dev, + sizeof(struct vi_sdma_mqd), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; + + memset(m, 0, sizeof(struct vi_sdma_mqd)); + + *mqd = m; + if (gart_addr != NULL) + *gart_addr = (*mqd_mem_obj)->gpu_addr; + + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, + (uint32_t __user *)p->write_ptr, + mms); +} + +static int 
update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct vi_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + + m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_doorbell = + q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; + + m->sdmax_rlcx_virtual_addr = q->sdma_vm_addr; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + +/* + * * preempt type here is ignored because there is only one way + * * to preempt sdma queue + */ +static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +} + +static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct vi_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct vi_sdma_mqd), false); + return 0; +} + +#endif + +struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case KFD_MQD_TYPE_CP: + case KFD_MQD_TYPE_COMPUTE: + mqd->init_mqd = init_mqd; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_HIQ: + mqd->init_mqd = init_mqd_hiq; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_SDMA: + mqd->init_mqd = init_mqd_sdma; + mqd->uninit_mqd = uninit_mqd_sdma; + mqd->load_mqd = load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} + +struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + mqd = mqd_manager_init_vi(type, dev); + if (!mqd) + return NULL; + if ((type == KFD_MQD_TYPE_CP) || (type == 
KFD_MQD_TYPE_COMPUTE)) + mqd->update_mqd = update_mqd_tonga; + return mqd; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c new file mode 100644 index 000000000..109263176 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -0,0 +1,447 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/slab.h> +#include <linux/mutex.h> +#include "kfd_device_queue_manager.h" +#include "kfd_kernel_queue.h" +#include "kfd_priv.h" + +static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, + unsigned int buffer_size_bytes) +{ + unsigned int temp = *wptr + increment_bytes / sizeof(uint32_t); + + WARN((temp * sizeof(uint32_t)) > buffer_size_bytes, + "Runlist IB overflow"); + *wptr = temp; +} + +static void pm_calc_rlib_size(struct packet_manager *pm, + unsigned int *rlib_size, + bool *over_subscription) +{ + unsigned int process_count, queue_count, compute_queue_count; + unsigned int map_queue_size; + unsigned int max_proc_per_quantum = 1; + struct kfd_dev *dev = pm->dqm->dev; + + process_count = pm->dqm->processes_count; + queue_count = pm->dqm->queue_count; + compute_queue_count = queue_count - pm->dqm->sdma_queue_count; + + /* check if there is over subscription + * Note: the arbitration between the number of VMIDs and + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). 
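+ * Illustration with hypothetical numbers: if max_proc_per_quantum is 1
+ * and get_queues_num() reports 24 HQD slots, then either a second
+ * process or a 25th compute queue makes the runlist over subscribed,
+ * and the IB is sized for one extra chained runlist packet.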
+ */ + *over_subscription = false; + + if (dev->max_proc_per_quantum > 1) + max_proc_per_quantum = dev->max_proc_per_quantum; + + if ((process_count > max_proc_per_quantum) || + compute_queue_count > get_queues_num(pm->dqm)) { + *over_subscription = true; + pr_debug("Over subscribed runlist\n"); + } + + map_queue_size = pm->pmf->map_queues_size; + /* calculate run list ib allocation size */ + *rlib_size = process_count * pm->pmf->map_process_size + + queue_count * map_queue_size; + + /* + * Increase the allocation size in case we need a chained run list + * when over subscription + */ + if (*over_subscription) + *rlib_size += pm->pmf->runlist_size; + + pr_debug("runlist ib size %d\n", *rlib_size); +} + +static int pm_allocate_runlist_ib(struct packet_manager *pm, + unsigned int **rl_buffer, + uint64_t *rl_gpu_buffer, + unsigned int *rl_buffer_size, + bool *is_over_subscription) +{ + int retval; + + if (WARN_ON(pm->allocated)) + return -EINVAL; + + pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); + + mutex_lock(&pm->lock); + + retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, + &pm->ib_buffer_obj); + + if (retval) { + pr_err("Failed to allocate runlist IB\n"); + goto out; + } + + *(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr; + *rl_gpu_buffer = pm->ib_buffer_obj->gpu_addr; + + memset(*rl_buffer, 0, *rl_buffer_size); + pm->allocated = true; + +out: + mutex_unlock(&pm->lock); + return retval; +} + +static int pm_create_runlist_ib(struct packet_manager *pm, + struct list_head *queues, + uint64_t *rl_gpu_addr, + size_t *rl_size_bytes) +{ + unsigned int alloc_size_bytes; + unsigned int *rl_buffer, rl_wptr, i; + int retval, proccesses_mapped; + struct device_process_node *cur; + struct qcm_process_device *qpd; + struct queue *q; + struct kernel_queue *kq; + bool is_over_subscription; + + rl_wptr = retval = proccesses_mapped = 0; + + retval = pm_allocate_runlist_ib(pm, &rl_buffer, rl_gpu_addr, + &alloc_size_bytes, &is_over_subscription); + if (retval) + return retval; + + *rl_size_bytes = alloc_size_bytes; + pm->ib_size_bytes = alloc_size_bytes; + + pr_debug("Building runlist ib process count: %d queues count %d\n", + pm->dqm->processes_count, pm->dqm->queue_count); + + /* build the run list ib packet */ + list_for_each_entry(cur, queues, list) { + qpd = cur->qpd; + /* build map process packet */ + if (proccesses_mapped >= pm->dqm->processes_count) { + pr_debug("Not enough space left in runlist IB\n"); + pm_release_ib(pm); + return -ENOMEM; + } + + retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd); + if (retval) + return retval; + + proccesses_mapped++; + inc_wptr(&rl_wptr, pm->pmf->map_process_size, + alloc_size_bytes); + + list_for_each_entry(kq, &qpd->priv_queue_list, list) { + if (!kq->queue->properties.is_active) + continue; + + pr_debug("static_queue, mapping kernel q %d, is debug status %d\n", + kq->queue->queue, qpd->is_debug); + + retval = pm->pmf->map_queues(pm, + &rl_buffer[rl_wptr], + kq->queue, + qpd->is_debug); + if (retval) + return retval; + + inc_wptr(&rl_wptr, + pm->pmf->map_queues_size, + alloc_size_bytes); + } + + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_active) + continue; + + pr_debug("static_queue, mapping user queue %d, is debug status %d\n", + q->queue, qpd->is_debug); + + retval = pm->pmf->map_queues(pm, + &rl_buffer[rl_wptr], + q, + qpd->is_debug); + + if (retval) + return retval; + + inc_wptr(&rl_wptr, + pm->pmf->map_queues_size, + alloc_size_bytes); + } + } + + pr_debug("Finished map process and queues 
to runlist\n"); + + if (is_over_subscription) + retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr], + *rl_gpu_addr, + alloc_size_bytes / sizeof(uint32_t), + true); + + for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++) + pr_debug("0x%2X ", rl_buffer[i]); + pr_debug("\n"); + + return retval; +} + +int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) +{ + switch (dqm->dev->device_info->asic_family) { + case CHIP_KAVERI: + case CHIP_HAWAII: + /* PM4 packet structures on CIK are the same as on VI */ + case CHIP_CARRIZO: + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + pm->pmf = &kfd_vi_pm_funcs; + break; + case CHIP_VEGA10: + case CHIP_RAVEN: + pm->pmf = &kfd_v9_pm_funcs; + break; + default: + WARN(1, "Unexpected ASIC family %u", + dqm->dev->device_info->asic_family); + return -EINVAL; + } + + pm->dqm = dqm; + mutex_init(&pm->lock); + pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ); + if (!pm->priv_queue) { + mutex_destroy(&pm->lock); + return -ENOMEM; + } + pm->allocated = false; + + return 0; +} + +void pm_uninit(struct packet_manager *pm) +{ + mutex_destroy(&pm->lock); + kernel_queue_uninit(pm->priv_queue); +} + +int pm_send_set_resources(struct packet_manager *pm, + struct scheduling_resources *res) +{ + uint32_t *buffer, size; + int retval = 0; + + size = pm->pmf->set_resources_size; + mutex_lock(&pm->lock); + pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, + size / sizeof(uint32_t), + (unsigned int **)&buffer); + if (!buffer) { + pr_err("Failed to allocate buffer on kernel queue\n"); + retval = -ENOMEM; + goto out; + } + + retval = pm->pmf->set_resources(pm, buffer, res); + if (!retval) + pm->priv_queue->ops.submit_packet(pm->priv_queue); + else + pm->priv_queue->ops.rollback_packet(pm->priv_queue); + +out: + mutex_unlock(&pm->lock); + + return retval; +} + +int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) +{ + uint64_t rl_gpu_ib_addr; + uint32_t *rl_buffer; + size_t rl_ib_size, packet_size_dwords; + int retval; + + retval = pm_create_runlist_ib(pm, dqm_queues, &rl_gpu_ib_addr, + &rl_ib_size); + if (retval) + goto fail_create_runlist_ib; + + pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr); + + packet_size_dwords = pm->pmf->runlist_size / sizeof(uint32_t); + mutex_lock(&pm->lock); + + retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, + packet_size_dwords, &rl_buffer); + if (retval) + goto fail_acquire_packet_buffer; + + retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr, + rl_ib_size / sizeof(uint32_t), false); + if (retval) + goto fail_create_runlist; + + pm->priv_queue->ops.submit_packet(pm->priv_queue); + + mutex_unlock(&pm->lock); + + return retval; + +fail_create_runlist: + pm->priv_queue->ops.rollback_packet(pm->priv_queue); +fail_acquire_packet_buffer: + mutex_unlock(&pm->lock); +fail_create_runlist_ib: + pm_release_ib(pm); + return retval; +} + +int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + uint32_t fence_value) +{ + uint32_t *buffer, size; + int retval = 0; + + if (WARN_ON(!fence_address)) + return -EFAULT; + + size = pm->pmf->query_status_size; + mutex_lock(&pm->lock); + pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, + size / sizeof(uint32_t), (unsigned int **)&buffer); + if (!buffer) { + pr_err("Failed to allocate buffer on kernel queue\n"); + retval = -ENOMEM; + goto out; + } + + retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value); + if (!retval) + 
pm->priv_queue->ops.submit_packet(pm->priv_queue); + else + pm->priv_queue->ops.rollback_packet(pm->priv_queue); + +out: + mutex_unlock(&pm->lock); + return retval; +} + +int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) +{ + uint32_t *buffer, size; + int retval = 0; + + size = pm->pmf->unmap_queues_size; + mutex_lock(&pm->lock); + pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, + size / sizeof(uint32_t), (unsigned int **)&buffer); + if (!buffer) { + pr_err("Failed to allocate buffer on kernel queue\n"); + retval = -ENOMEM; + goto out; + } + + retval = pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param, + reset, sdma_engine); + if (!retval) + pm->priv_queue->ops.submit_packet(pm->priv_queue); + else + pm->priv_queue->ops.rollback_packet(pm->priv_queue); + +out: + mutex_unlock(&pm->lock); + return retval; +} + +void pm_release_ib(struct packet_manager *pm) +{ + mutex_lock(&pm->lock); + if (pm->allocated) { + kfd_gtt_sa_free(pm->dqm->dev, pm->ib_buffer_obj); + pm->allocated = false; + } + mutex_unlock(&pm->lock); +} + +#if defined(CONFIG_DEBUG_FS) + +int pm_debugfs_runlist(struct seq_file *m, void *data) +{ + struct packet_manager *pm = data; + + mutex_lock(&pm->lock); + + if (!pm->allocated) { + seq_puts(m, " No active runlist\n"); + goto out; + } + + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false); + +out: + mutex_unlock(&pm->lock); + return 0; +} + +int pm_debugfs_hang_hws(struct packet_manager *pm) +{ + uint32_t *buffer, size; + int r = 0; + + size = pm->pmf->query_status_size; + mutex_lock(&pm->lock); + pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, + size / sizeof(uint32_t), (unsigned int **)&buffer); + if (!buffer) { + pr_err("Failed to allocate buffer on kernel queue\n"); + r = -ENOMEM; + goto out; + } + memset(buffer, 0x55, size); + pm->priv_queue->ops.submit_packet(pm->priv_queue); + + pr_info("Submitting %x %x %x %x %x %x %x to HIQ to hang the HWS.", + buffer[0], buffer[1], buffer[2], buffer[3], + buffer[4], buffer[5], buffer[6]); +out: + mutex_unlock(&pm->lock); + return r; +} + + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c new file mode 100644 index 000000000..15fff4420 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c @@ -0,0 +1,83 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/types.h> +#include "kfd_priv.h" + +static unsigned int pasid_bits = 16; +static const struct kfd2kgd_calls *kfd2kgd; + +bool kfd_set_pasid_limit(unsigned int new_limit) +{ + if (new_limit < 2) + return false; + + if (new_limit < (1U << pasid_bits)) { + if (kfd2kgd) + /* We've already allocated user PASIDs, too late to + * change the limit + */ + return false; + + while (new_limit < (1U << pasid_bits)) + pasid_bits--; + } + + return true; +} + +unsigned int kfd_get_pasid_limit(void) +{ + return 1U << pasid_bits; +} + +unsigned int kfd_pasid_alloc(void) +{ + int r; + + /* Find the first best KFD device for calling KGD */ + if (!kfd2kgd) { + struct kfd_dev *dev = NULL; + unsigned int i = 0; + + while ((kfd_topology_enum_kfd_devices(i, &dev)) == 0) { + if (dev && dev->kfd2kgd) { + kfd2kgd = dev->kfd2kgd; + break; + } + i++; + } + + if (!kfd2kgd) + return false; + } + + r = kfd2kgd->alloc_pasid(pasid_bits); + + return r > 0 ? r : 0; +} + +void kfd_pasid_free(unsigned int pasid) +{ + if (kfd2kgd) + kfd2kgd->free_pasid(pasid); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h new file mode 100644 index 000000000..e50f73d25 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h @@ -0,0 +1,155 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_PM4_HEADERS_H_ +#define KFD_PM4_HEADERS_H_ + +#ifndef PM4_MES_HEADER_DEFINED +#define PM4_MES_HEADER_DEFINED +union PM4_MES_TYPE_3_HEADER { + struct { + /* reserved */ + uint32_t reserved1:8; + /* IT opcode */ + uint32_t opcode:8; + /* number of DWORDs - 1 in the information body */ + uint32_t count:14; + /* packet identifier. 
It should be 3 for type 3 packets */ + uint32_t type:2; + }; + uint32_t u32all; +}; +#endif /* PM4_MES_HEADER_DEFINED */ + + +/*--------------------MES_MAP_PROCESS-------------------- */ + +#ifndef PM4_MES_MAP_PROCESS_DEFINED +#define PM4_MES_MAP_PROCESS_DEFINED + +struct pm4_map_process { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:8; + uint32_t diq_enable:1; + uint32_t process_quantum:7; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t page_table_base:28; + uint32_t reserved3:4; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t sh_mem_bases; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; + uint32_t sh_mem_config; + uint32_t gds_addr_lo; + uint32_t gds_addr_hi; + + union { + struct { + uint32_t num_gws:6; + uint32_t reserved4:2; + uint32_t num_oac:4; + uint32_t reserved5:4; + uint32_t gds_size:6; + uint32_t num_queues:10; + } bitfields10; + uint32_t ordinal10; + }; + +}; +#endif + +#ifndef PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH +#define PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH + +struct pm4_map_process_scratch_kv { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:8; + uint32_t diq_enable:1; + uint32_t process_quantum:7; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t page_table_base:28; + uint32_t reserved2:4; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t reserved3; + uint32_t sh_mem_bases; + uint32_t sh_mem_config; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; + uint32_t sh_hidden_private_base_vmid; + uint32_t reserved4; + uint32_t reserved5; + uint32_t gds_addr_lo; + uint32_t gds_addr_hi; + + union { + struct { + uint32_t num_gws:6; + uint32_t reserved6:2; + uint32_t num_oac:4; + uint32_t reserved7:4; + uint32_t gds_size:6; + uint32_t num_queues:10; + } bitfields14; + uint32_t ordinal14; + }; + + uint32_t completion_signal_lo32; +uint32_t completion_signal_hi32; +}; +#endif + +enum { + CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 +}; + +#endif /* KFD_PM4_HEADERS_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h new file mode 100644 index 000000000..f2bcf5c09 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h @@ -0,0 +1,583 @@ +/* + * Copyright 2016 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef F32_MES_PM4_PACKETS_H +#define F32_MES_PM4_PACKETS_H + +#ifndef PM4_MES_HEADER_DEFINED +#define PM4_MES_HEADER_DEFINED +union PM4_MES_TYPE_3_HEADER { + struct { + uint32_t reserved1 : 8; /* < reserved */ + uint32_t opcode : 8; /* < IT opcode */ + uint32_t count : 14;/* < number of DWORDs - 1 in the + * information body. + */ + uint32_t type : 2; /* < packet identifier. + * It should be 3 for type 3 packets + */ + }; + uint32_t u32All; +}; +#endif /* PM4_MES_HEADER_DEFINED */ + +/*--------------------MES_SET_RESOURCES--------------------*/ + +#ifndef PM4_MES_SET_RESOURCES_DEFINED +#define PM4_MES_SET_RESOURCES_DEFINED +enum mes_set_resources_queue_type_enum { + queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, + queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, + queue_type__mes_set_resources__hsa_debug_interface_queue = 4 +}; + + +struct pm4_mes_set_resources { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t vmid_mask:16; + uint32_t unmap_latency:8; + uint32_t reserved1:5; + enum mes_set_resources_queue_type_enum queue_type:3; + } bitfields2; + uint32_t ordinal2; + }; + + uint32_t queue_mask_lo; + uint32_t queue_mask_hi; + uint32_t gws_mask_lo; + uint32_t gws_mask_hi; + + union { + struct { + uint32_t oac_mask:16; + uint32_t reserved2:16; + } bitfields7; + uint32_t ordinal7; + }; + + union { + struct { + uint32_t gds_heap_base:6; + uint32_t reserved3:5; + uint32_t gds_heap_size:6; + uint32_t reserved4:15; + } bitfields8; + uint32_t ordinal8; + }; + +}; +#endif + +/*--------------------MES_RUN_LIST--------------------*/ + +#ifndef PM4_MES_RUN_LIST_DEFINED +#define PM4_MES_RUN_LIST_DEFINED + +struct pm4_mes_runlist { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t reserved1:2; + uint32_t ib_base_lo:30; + } bitfields2; + uint32_t ordinal2; + }; + + uint32_t ib_base_hi; + + union { + struct { + uint32_t ib_size:20; + uint32_t chain:1; + uint32_t offload_polling:1; + uint32_t reserved2:1; + uint32_t valid:1; + uint32_t process_cnt:4; + uint32_t reserved3:4; + } bitfields4; + uint32_t ordinal4; + }; + +}; +#endif + +/*--------------------MES_MAP_PROCESS--------------------*/ + +#ifndef PM4_MES_MAP_PROCESS_DEFINED +#define PM4_MES_MAP_PROCESS_DEFINED + +struct pm4_mes_map_process { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:8; + uint32_t diq_enable:1; + uint32_t process_quantum:7; + } bitfields2; + uint32_t ordinal2; + }; + + uint32_t vm_context_page_table_base_addr_lo32; + + uint32_t vm_context_page_table_base_addr_hi32; + + uint32_t sh_mem_bases; + + uint32_t sh_mem_config; + + uint32_t sq_shader_tba_lo; + + uint32_t sq_shader_tba_hi; + + uint32_t sq_shader_tma_lo; + + uint32_t sq_shader_tma_hi; + + uint32_t reserved6; + + uint32_t gds_addr_lo; + + uint32_t gds_addr_hi; + + union { + struct { + uint32_t num_gws:6; + uint32_t reserved7:1; + uint32_t sdma_enable:1; + uint32_t num_oac:4; + uint32_t reserved8:4; + uint32_t gds_size:6; + uint32_t num_queues:10; + } bitfields14; + uint32_t ordinal14; + }; + + uint32_t 
completion_signal_lo; + + uint32_t completion_signal_hi; + +}; + +#endif + +/*--------------------MES_MAP_PROCESS_VM--------------------*/ + +#ifndef PM4_MES_MAP_PROCESS_VM_DEFINED +#define PM4_MES_MAP_PROCESS_VM_DEFINED + +struct PM4_MES_MAP_PROCESS_VM { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + uint32_t reserved1; + + uint32_t vm_context_cntl; + + uint32_t reserved2; + + uint32_t vm_context_page_table_end_addr_lo32; + + uint32_t vm_context_page_table_end_addr_hi32; + + uint32_t vm_context_page_table_start_addr_lo32; + + uint32_t vm_context_page_table_start_addr_hi32; + + uint32_t reserved3; + + uint32_t reserved4; + + uint32_t reserved5; + + uint32_t reserved6; + + uint32_t reserved7; + + uint32_t reserved8; + + uint32_t completion_signal_lo32; + + uint32_t completion_signal_hi32; + +}; +#endif + +/*--------------------MES_MAP_QUEUES--------------------*/ + +#ifndef PM4_MES_MAP_QUEUES_VI_DEFINED +#define PM4_MES_MAP_QUEUES_VI_DEFINED +enum mes_map_queues_queue_sel_enum { + queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0, +queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1 +}; + +enum mes_map_queues_queue_type_enum { + queue_type__mes_map_queues__normal_compute_vi = 0, + queue_type__mes_map_queues__debug_interface_queue_vi = 1, + queue_type__mes_map_queues__normal_latency_static_queue_vi = 2, +queue_type__mes_map_queues__low_latency_static_queue_vi = 3 +}; + +enum mes_map_queues_alloc_format_enum { + alloc_format__mes_map_queues__one_per_pipe_vi = 0, +alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 +}; + +enum mes_map_queues_engine_sel_enum { + engine_sel__mes_map_queues__compute_vi = 0, + engine_sel__mes_map_queues__sdma0_vi = 2, + engine_sel__mes_map_queues__sdma1_vi = 3 +}; + + +struct pm4_mes_map_queues { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t reserved1:4; + enum mes_map_queues_queue_sel_enum queue_sel:2; + uint32_t reserved2:15; + enum mes_map_queues_queue_type_enum queue_type:3; + enum mes_map_queues_alloc_format_enum alloc_format:2; + enum mes_map_queues_engine_sel_enum engine_sel:3; + uint32_t num_queues:3; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t reserved3:1; + uint32_t check_disable:1; + uint32_t doorbell_offset:26; + uint32_t reserved4:4; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t mqd_addr_lo; + uint32_t mqd_addr_hi; + uint32_t wptr_addr_lo; + uint32_t wptr_addr_hi; +}; +#endif + +/*--------------------MES_QUERY_STATUS--------------------*/ + +#ifndef PM4_MES_QUERY_STATUS_DEFINED +#define PM4_MES_QUERY_STATUS_DEFINED +enum mes_query_status_interrupt_sel_enum { + interrupt_sel__mes_query_status__completion_status = 0, + interrupt_sel__mes_query_status__process_status = 1, + interrupt_sel__mes_query_status__queue_status = 2 +}; + +enum mes_query_status_command_enum { + command__mes_query_status__interrupt_only = 0, + command__mes_query_status__fence_only_immediate = 1, + command__mes_query_status__fence_only_after_write_ack = 2, + command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 +}; + +enum mes_query_status_engine_sel_enum { + engine_sel__mes_query_status__compute = 0, + engine_sel__mes_query_status__sdma0_queue = 2, + engine_sel__mes_query_status__sdma1_queue = 3 +}; + +struct pm4_mes_query_status { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t context_id:28; + enum 
mes_query_status_interrupt_sel_enum interrupt_sel:2; + enum mes_query_status_command_enum command:2; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:16; + } bitfields3a; + struct { + uint32_t reserved2:2; + uint32_t doorbell_offset:26; + enum mes_query_status_engine_sel_enum engine_sel:3; + uint32_t reserved3:1; + } bitfields3b; + uint32_t ordinal3; + }; + + uint32_t addr_lo; + uint32_t addr_hi; + uint32_t data_lo; + uint32_t data_hi; +}; +#endif + +/*--------------------MES_UNMAP_QUEUES--------------------*/ + +#ifndef PM4_MES_UNMAP_QUEUES_DEFINED +#define PM4_MES_UNMAP_QUEUES_DEFINED +enum mes_unmap_queues_action_enum { + action__mes_unmap_queues__preempt_queues = 0, + action__mes_unmap_queues__reset_queues = 1, + action__mes_unmap_queues__disable_process_queues = 2, + action__mes_unmap_queues__reserved = 3 +}; + +enum mes_unmap_queues_queue_sel_enum { + queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, + queue_sel__mes_unmap_queues__unmap_all_queues = 2, + queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3 +}; + +enum mes_unmap_queues_engine_sel_enum { + engine_sel__mes_unmap_queues__compute = 0, + engine_sel__mes_unmap_queues__sdma0 = 2, + engine_sel__mes_unmap_queues__sdmal = 3 +}; + +struct pm4_mes_unmap_queues { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + enum mes_unmap_queues_action_enum action:2; + uint32_t reserved1:2; + enum mes_unmap_queues_queue_sel_enum queue_sel:2; + uint32_t reserved2:20; + enum mes_unmap_queues_engine_sel_enum engine_sel:3; + uint32_t num_queues:3; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved3:16; + } bitfields3a; + struct { + uint32_t reserved4:2; + uint32_t doorbell_offset0:26; + int32_t reserved5:4; + } bitfields3b; + uint32_t ordinal3; + }; + + union { + struct { + uint32_t reserved6:2; + uint32_t doorbell_offset1:26; + uint32_t reserved7:4; + } bitfields4; + uint32_t ordinal4; + }; + + union { + struct { + uint32_t reserved8:2; + uint32_t doorbell_offset2:26; + uint32_t reserved9:4; + } bitfields5; + uint32_t ordinal5; + }; + + union { + struct { + uint32_t reserved10:2; + uint32_t doorbell_offset3:26; + uint32_t reserved11:4; + } bitfields6; + uint32_t ordinal6; + }; +}; +#endif + +#ifndef PM4_MEC_RELEASE_MEM_DEFINED +#define PM4_MEC_RELEASE_MEM_DEFINED + +enum mec_release_mem_event_index_enum { + event_index__mec_release_mem__end_of_pipe = 5, + event_index__mec_release_mem__shader_done = 6 +}; + +enum mec_release_mem_cache_policy_enum { + cache_policy__mec_release_mem__lru = 0, + cache_policy__mec_release_mem__stream = 1 +}; + +enum mec_release_mem_pq_exe_status_enum { + pq_exe_status__mec_release_mem__default = 0, + pq_exe_status__mec_release_mem__phase_update = 1 +}; + +enum mec_release_mem_dst_sel_enum { + dst_sel__mec_release_mem__memory_controller = 0, + dst_sel__mec_release_mem__tc_l2 = 1, + dst_sel__mec_release_mem__queue_write_pointer_register = 2, + dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3 +}; + +enum mec_release_mem_int_sel_enum { + int_sel__mec_release_mem__none = 0, + int_sel__mec_release_mem__send_interrupt_only = 1, + int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2, + int_sel__mec_release_mem__send_data_after_write_confirm = 3, + int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4, + 
int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5, + int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6 +}; + +enum mec_release_mem_data_sel_enum { + data_sel__mec_release_mem__none = 0, + data_sel__mec_release_mem__send_32_bit_low = 1, + data_sel__mec_release_mem__send_64_bit_data = 2, + data_sel__mec_release_mem__send_gpu_clock_counter = 3, + data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4, + data_sel__mec_release_mem__store_gds_data_to_memory = 5 +}; + +struct pm4_mec_release_mem { + union { + union PM4_MES_TYPE_3_HEADER header; /*header */ + unsigned int ordinal1; + }; + + union { + struct { + unsigned int event_type:6; + unsigned int reserved1:2; + enum mec_release_mem_event_index_enum event_index:4; + unsigned int tcl1_vol_action_ena:1; + unsigned int tc_vol_action_ena:1; + unsigned int reserved2:1; + unsigned int tc_wb_action_ena:1; + unsigned int tcl1_action_ena:1; + unsigned int tc_action_ena:1; + uint32_t reserved3:1; + uint32_t tc_nc_action_ena:1; + uint32_t tc_wc_action_ena:1; + uint32_t tc_md_action_ena:1; + uint32_t reserved4:3; + enum mec_release_mem_cache_policy_enum cache_policy:2; + uint32_t reserved5:2; + enum mec_release_mem_pq_exe_status_enum pq_exe_status:1; + uint32_t reserved6:2; + } bitfields2; + unsigned int ordinal2; + }; + + union { + struct { + uint32_t reserved7:16; + enum mec_release_mem_dst_sel_enum dst_sel:2; + uint32_t reserved8:6; + enum mec_release_mem_int_sel_enum int_sel:3; + uint32_t reserved9:2; + enum mec_release_mem_data_sel_enum data_sel:3; + } bitfields3; + unsigned int ordinal3; + }; + + union { + struct { + uint32_t reserved10:2; + unsigned int address_lo_32b:30; + } bitfields4; + struct { + uint32_t reserved11:3; + uint32_t address_lo_64b:29; + } bitfields4b; + uint32_t reserved12; + unsigned int ordinal4; + }; + + union { + uint32_t address_hi; + uint32_t reserved13; + uint32_t ordinal5; + }; + + union { + uint32_t data_lo; + uint32_t cmp_data_lo; + struct { + uint32_t dw_offset:16; + uint32_t num_dwords:16; + } bitfields6c; + uint32_t reserved14; + uint32_t ordinal6; + }; + + union { + uint32_t data_hi; + uint32_t cmp_data_hi; + uint32_t reserved15; + uint32_t reserved16; + uint32_t ordinal7; + }; + + uint32_t int_ctxid; + +}; + +#endif + +enum { + CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 +}; +#endif + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h new file mode 100644 index 000000000..a0ff34878 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h @@ -0,0 +1,290 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_PM4_HEADERS_DIQ_H_ +#define KFD_PM4_HEADERS_DIQ_H_ + +/*--------------------_INDIRECT_BUFFER-------------------- */ + +#ifndef _PM4__INDIRECT_BUFFER_DEFINED +#define _PM4__INDIRECT_BUFFER_DEFINED +enum _INDIRECT_BUFFER_cache_policy_enum { + cache_policy___indirect_buffer__lru = 0, + cache_policy___indirect_buffer__stream = 1, + cache_policy___indirect_buffer__bypass = 2 +}; + +enum { + IT_INDIRECT_BUFFER_PASID = 0x5C +}; + +struct pm4__indirect_buffer_pasid { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + unsigned int ordinal1; + }; + + union { + struct { + unsigned int reserved1:2; + unsigned int ib_base_lo:30; + } bitfields2; + unsigned int ordinal2; + }; + + union { + struct { + unsigned int ib_base_hi:16; + unsigned int reserved2:16; + } bitfields3; + unsigned int ordinal3; + }; + + union { + unsigned int control; + unsigned int ordinal4; + }; + + union { + struct { + unsigned int pasid:10; + unsigned int reserved4:22; + } bitfields5; + unsigned int ordinal5; + }; + +}; + +#endif + +/*--------------------_RELEASE_MEM-------------------- */ + +#ifndef _PM4__RELEASE_MEM_DEFINED +#define _PM4__RELEASE_MEM_DEFINED +enum _RELEASE_MEM_event_index_enum { + event_index___release_mem__end_of_pipe = 5, + event_index___release_mem__shader_done = 6 +}; + +enum _RELEASE_MEM_cache_policy_enum { + cache_policy___release_mem__lru = 0, + cache_policy___release_mem__stream = 1, + cache_policy___release_mem__bypass = 2 +}; + +enum _RELEASE_MEM_dst_sel_enum { + dst_sel___release_mem__memory_controller = 0, + dst_sel___release_mem__tc_l2 = 1, + dst_sel___release_mem__queue_write_pointer_register = 2, + dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 +}; + +enum _RELEASE_MEM_int_sel_enum { + int_sel___release_mem__none = 0, + int_sel___release_mem__send_interrupt_only = 1, + int_sel___release_mem__send_interrupt_after_write_confirm = 2, + int_sel___release_mem__send_data_after_write_confirm = 3 +}; + +enum _RELEASE_MEM_data_sel_enum { + data_sel___release_mem__none = 0, + data_sel___release_mem__send_32_bit_low = 1, + data_sel___release_mem__send_64_bit_data = 2, + data_sel___release_mem__send_gpu_clock_counter = 3, + data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, + data_sel___release_mem__store_gds_data_to_memory = 5 +}; + +struct pm4__release_mem { + union { + union PM4_MES_TYPE_3_HEADER header; /*header */ + unsigned int ordinal1; + }; + + union { + struct { + unsigned int event_type:6; + unsigned int reserved1:2; + enum _RELEASE_MEM_event_index_enum event_index:4; + unsigned int tcl1_vol_action_ena:1; + unsigned int tc_vol_action_ena:1; + unsigned int reserved2:1; + unsigned int tc_wb_action_ena:1; + unsigned int tcl1_action_ena:1; + unsigned int tc_action_ena:1; + unsigned int reserved3:6; + unsigned int atc:1; + enum _RELEASE_MEM_cache_policy_enum cache_policy:2; + unsigned int reserved4:5; + } bitfields2; + unsigned int ordinal2; + }; + + union { + struct { + unsigned int reserved5:16; + enum _RELEASE_MEM_dst_sel_enum dst_sel:2; + unsigned int reserved6:6; + enum _RELEASE_MEM_int_sel_enum int_sel:3; + unsigned int reserved7:2; + enum _RELEASE_MEM_data_sel_enum data_sel:3; + } bitfields3; + unsigned int ordinal3; + }; + + union { + struct { + unsigned int reserved8:2; 
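+			/* two views of the low address word: bitfields4 holds a
+			 * dword-aligned address (2 reserved bits), bitfields5 a
+			 * qword-aligned one for 64-bit data (3 reserved bits)
+			 */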
+ unsigned int address_lo_32b:30; + } bitfields4; + struct { + unsigned int reserved9:3; + unsigned int address_lo_64b:29; + } bitfields5; + unsigned int ordinal4; + }; + + unsigned int address_hi; + + unsigned int data_lo; + + unsigned int data_hi; + +}; +#endif + + +/*--------------------_SET_CONFIG_REG-------------------- */ + +#ifndef _PM4__SET_CONFIG_REG_DEFINED +#define _PM4__SET_CONFIG_REG_DEFINED + +struct pm4__set_config_reg { + union { + union PM4_MES_TYPE_3_HEADER header; /*header */ + unsigned int ordinal1; + }; + + union { + struct { + unsigned int reg_offset:16; + unsigned int reserved1:7; + unsigned int vmid_shift:5; + unsigned int insert_vmid:1; + unsigned int reserved2:3; + } bitfields2; + unsigned int ordinal2; + }; + + unsigned int reg_data[1]; /*1..N of these fields */ + +}; +#endif + +/*--------------------_WAIT_REG_MEM-------------------- */ + +#ifndef _PM4__WAIT_REG_MEM_DEFINED +#define _PM4__WAIT_REG_MEM_DEFINED +enum _WAIT_REG_MEM_function_enum { + function___wait_reg_mem__always_pass = 0, + function___wait_reg_mem__less_than_ref_value = 1, + function___wait_reg_mem__less_than_equal_to_the_ref_value = 2, + function___wait_reg_mem__equal_to_the_reference_value = 3, + function___wait_reg_mem__not_equal_reference_value = 4, + function___wait_reg_mem__greater_than_or_equal_reference_value = 5, + function___wait_reg_mem__greater_than_reference_value = 6, + function___wait_reg_mem__reserved = 7 +}; + +enum _WAIT_REG_MEM_mem_space_enum { + mem_space___wait_reg_mem__register_space = 0, + mem_space___wait_reg_mem__memory_space = 1 +}; + +enum _WAIT_REG_MEM_operation_enum { + operation___wait_reg_mem__wait_reg_mem = 0, + operation___wait_reg_mem__wr_wait_wr_reg = 1 +}; + +struct pm4__wait_reg_mem { + union { + union PM4_MES_TYPE_3_HEADER header; /*header */ + unsigned int ordinal1; + }; + + union { + struct { + enum _WAIT_REG_MEM_function_enum function:3; + unsigned int reserved1:1; + enum _WAIT_REG_MEM_mem_space_enum mem_space:2; + enum _WAIT_REG_MEM_operation_enum operation:2; + unsigned int reserved2:24; + } bitfields2; + unsigned int ordinal2; + }; + + union { + struct { + unsigned int reserved3:2; + unsigned int memory_poll_addr_lo:30; + } bitfields3; + struct { + unsigned int register_poll_addr:16; + unsigned int reserved4:16; + } bitfields4; + struct { + unsigned int register_write_addr:16; + unsigned int reserved5:16; + } bitfields5; + unsigned int ordinal3; + }; + + union { + struct { + unsigned int poll_address_hi:16; + unsigned int reserved6:16; + } bitfields6; + struct { + unsigned int register_write_addr:16; + unsigned int reserved7:16; + } bitfields7; + unsigned int ordinal4; + }; + + unsigned int reference; + + unsigned int mask; + + union { + struct { + unsigned int poll_interval:16; + unsigned int reserved8:16; + } bitfields8; + unsigned int ordinal7; + }; + +}; +#endif + + +#endif /* KFD_PM4_HEADERS_DIQ_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h new file mode 100644 index 000000000..7c8d9b357 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h @@ -0,0 +1,510 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef F32_MES_PM4_PACKETS_H +#define F32_MES_PM4_PACKETS_H + +#ifndef PM4_MES_HEADER_DEFINED +#define PM4_MES_HEADER_DEFINED +union PM4_MES_TYPE_3_HEADER { + struct { + uint32_t reserved1 : 8; /* < reserved */ + uint32_t opcode : 8; /* < IT opcode */ + uint32_t count : 14;/* < Number of DWORDS - 1 in the + * information body + */ + uint32_t type : 2; /* < packet identifier + * It should be 3 for type 3 packets + */ + }; + uint32_t u32All; +}; +#endif /* PM4_MES_HEADER_DEFINED */ + +/*--------------------MES_SET_RESOURCES--------------------*/ + +#ifndef PM4_MES_SET_RESOURCES_DEFINED +#define PM4_MES_SET_RESOURCES_DEFINED +enum mes_set_resources_queue_type_enum { + queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, + queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, + queue_type__mes_set_resources__hsa_debug_interface_queue = 4 +}; + + +struct pm4_mes_set_resources { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t vmid_mask:16; + uint32_t unmap_latency:8; + uint32_t reserved1:5; + enum mes_set_resources_queue_type_enum queue_type:3; + } bitfields2; + uint32_t ordinal2; + }; + + uint32_t queue_mask_lo; + uint32_t queue_mask_hi; + uint32_t gws_mask_lo; + uint32_t gws_mask_hi; + + union { + struct { + uint32_t oac_mask:16; + uint32_t reserved2:16; + } bitfields7; + uint32_t ordinal7; + }; + + union { + struct { + uint32_t gds_heap_base:6; + uint32_t reserved3:5; + uint32_t gds_heap_size:6; + uint32_t reserved4:15; + } bitfields8; + uint32_t ordinal8; + }; + +}; +#endif + +/*--------------------MES_RUN_LIST--------------------*/ + +#ifndef PM4_MES_RUN_LIST_DEFINED +#define PM4_MES_RUN_LIST_DEFINED + +struct pm4_mes_runlist { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t reserved1:2; + uint32_t ib_base_lo:30; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t ib_base_hi:16; + uint32_t reserved2:16; + } bitfields3; + uint32_t ordinal3; + }; + + union { + struct { + uint32_t ib_size:20; + uint32_t chain:1; + uint32_t offload_polling:1; + uint32_t reserved2:1; + uint32_t valid:1; + uint32_t process_cnt:4; + uint32_t reserved3:4; + } bitfields4; + uint32_t ordinal4; + }; + +}; +#endif + +/*--------------------MES_MAP_PROCESS--------------------*/ + +#ifndef PM4_MES_MAP_PROCESS_DEFINED +#define 
PM4_MES_MAP_PROCESS_DEFINED + +struct pm4_mes_map_process { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:8; + uint32_t diq_enable:1; + uint32_t process_quantum:7; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t page_table_base:28; + uint32_t reserved3:4; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t reserved; + + uint32_t sh_mem_bases; + uint32_t sh_mem_config; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; + + uint32_t sh_hidden_private_base_vmid; + + uint32_t reserved2; + uint32_t reserved3; + + uint32_t gds_addr_lo; + uint32_t gds_addr_hi; + + union { + struct { + uint32_t num_gws:6; + uint32_t reserved4:2; + uint32_t num_oac:4; + uint32_t reserved5:4; + uint32_t gds_size:6; + uint32_t num_queues:10; + } bitfields10; + uint32_t ordinal10; + }; + + uint32_t completion_signal_lo; + uint32_t completion_signal_hi; + +}; + +#endif + +/*--------------------MES_MAP_QUEUES--------------------*/ + +#ifndef PM4_MES_MAP_QUEUES_VI_DEFINED +#define PM4_MES_MAP_QUEUES_VI_DEFINED +enum mes_map_queues_queue_sel_vi_enum { + queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0, +queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1 +}; + +enum mes_map_queues_queue_type_vi_enum { + queue_type__mes_map_queues__normal_compute_vi = 0, + queue_type__mes_map_queues__debug_interface_queue_vi = 1, + queue_type__mes_map_queues__normal_latency_static_queue_vi = 2, +queue_type__mes_map_queues__low_latency_static_queue_vi = 3 +}; + +enum mes_map_queues_alloc_format_vi_enum { + alloc_format__mes_map_queues__one_per_pipe_vi = 0, +alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 +}; + +enum mes_map_queues_engine_sel_vi_enum { + engine_sel__mes_map_queues__compute_vi = 0, + engine_sel__mes_map_queues__sdma0_vi = 2, + engine_sel__mes_map_queues__sdma1_vi = 3 +}; + + +struct pm4_mes_map_queues { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t reserved1:4; + enum mes_map_queues_queue_sel_vi_enum queue_sel:2; + uint32_t reserved2:15; + enum mes_map_queues_queue_type_vi_enum queue_type:3; + enum mes_map_queues_alloc_format_vi_enum alloc_format:2; + enum mes_map_queues_engine_sel_vi_enum engine_sel:3; + uint32_t num_queues:3; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t reserved3:1; + uint32_t check_disable:1; + uint32_t doorbell_offset:21; + uint32_t reserved4:3; + uint32_t queue:6; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t mqd_addr_lo; + uint32_t mqd_addr_hi; + uint32_t wptr_addr_lo; + uint32_t wptr_addr_hi; +}; +#endif + +/*--------------------MES_QUERY_STATUS--------------------*/ + +#ifndef PM4_MES_QUERY_STATUS_DEFINED +#define PM4_MES_QUERY_STATUS_DEFINED +enum mes_query_status_interrupt_sel_enum { + interrupt_sel__mes_query_status__completion_status = 0, + interrupt_sel__mes_query_status__process_status = 1, + interrupt_sel__mes_query_status__queue_status = 2 +}; + +enum mes_query_status_command_enum { + command__mes_query_status__interrupt_only = 0, + command__mes_query_status__fence_only_immediate = 1, + command__mes_query_status__fence_only_after_write_ack = 2, + command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 +}; + +enum mes_query_status_engine_sel_enum { + engine_sel__mes_query_status__compute = 0, + engine_sel__mes_query_status__sdma0_queue = 2, + engine_sel__mes_query_status__sdma1_queue = 3 
+}; + +struct pm4_mes_query_status { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t context_id:28; + enum mes_query_status_interrupt_sel_enum + interrupt_sel:2; + enum mes_query_status_command_enum command:2; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:16; + } bitfields3a; + struct { + uint32_t reserved2:2; + uint32_t doorbell_offset:21; + uint32_t reserved3:2; + enum mes_query_status_engine_sel_enum engine_sel:3; + uint32_t reserved4:4; + } bitfields3b; + uint32_t ordinal3; + }; + + uint32_t addr_lo; + uint32_t addr_hi; + uint32_t data_lo; + uint32_t data_hi; +}; +#endif + +/*--------------------MES_UNMAP_QUEUES--------------------*/ + +#ifndef PM4_MES_UNMAP_QUEUES_DEFINED +#define PM4_MES_UNMAP_QUEUES_DEFINED +enum mes_unmap_queues_action_enum { + action__mes_unmap_queues__preempt_queues = 0, + action__mes_unmap_queues__reset_queues = 1, + action__mes_unmap_queues__disable_process_queues = 2, + action__mes_unmap_queues__reserved = 3 +}; + +enum mes_unmap_queues_queue_sel_enum { + queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, + queue_sel__mes_unmap_queues__unmap_all_queues = 2, + queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3 +}; + +enum mes_unmap_queues_engine_sel_enum { + engine_sel__mes_unmap_queues__compute = 0, + engine_sel__mes_unmap_queues__sdma0 = 2, + engine_sel__mes_unmap_queues__sdmal = 3 +}; + +struct pm4_mes_unmap_queues { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + enum mes_unmap_queues_action_enum action:2; + uint32_t reserved1:2; + enum mes_unmap_queues_queue_sel_enum queue_sel:2; + uint32_t reserved2:20; + enum mes_unmap_queues_engine_sel_enum engine_sel:3; + uint32_t num_queues:3; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved3:16; + } bitfields3a; + struct { + uint32_t reserved4:2; + uint32_t doorbell_offset0:21; + uint32_t reserved5:9; + } bitfields3b; + uint32_t ordinal3; + }; + + union { + struct { + uint32_t reserved6:2; + uint32_t doorbell_offset1:21; + uint32_t reserved7:9; + } bitfields4; + uint32_t ordinal4; + }; + + union { + struct { + uint32_t reserved8:2; + uint32_t doorbell_offset2:21; + uint32_t reserved9:9; + } bitfields5; + uint32_t ordinal5; + }; + + union { + struct { + uint32_t reserved10:2; + uint32_t doorbell_offset3:21; + uint32_t reserved11:9; + } bitfields6; + uint32_t ordinal6; + }; +}; +#endif + +#ifndef PM4_MEC_RELEASE_MEM_DEFINED +#define PM4_MEC_RELEASE_MEM_DEFINED +enum RELEASE_MEM_event_index_enum { + event_index___release_mem__end_of_pipe = 5, + event_index___release_mem__shader_done = 6 +}; + +enum RELEASE_MEM_cache_policy_enum { + cache_policy___release_mem__lru = 0, + cache_policy___release_mem__stream = 1, + cache_policy___release_mem__bypass = 2 +}; + +enum RELEASE_MEM_dst_sel_enum { + dst_sel___release_mem__memory_controller = 0, + dst_sel___release_mem__tc_l2 = 1, + dst_sel___release_mem__queue_write_pointer_register = 2, + dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 +}; + +enum RELEASE_MEM_int_sel_enum { + int_sel___release_mem__none = 0, + int_sel___release_mem__send_interrupt_only = 1, + int_sel___release_mem__send_interrupt_after_write_confirm = 2, + int_sel___release_mem__send_data_after_write_confirm = 3 +}; + +enum RELEASE_MEM_data_sel_enum { + 
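+	/* selects what, if anything, RELEASE_MEM writes back to the packet's
+	 * address once the selected pipe event has completed
+	 */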
data_sel___release_mem__none = 0, + data_sel___release_mem__send_32_bit_low = 1, + data_sel___release_mem__send_64_bit_data = 2, + data_sel___release_mem__send_gpu_clock_counter = 3, + data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, + data_sel___release_mem__store_gds_data_to_memory = 5 +}; + +struct pm4_mec_release_mem { + union { + union PM4_MES_TYPE_3_HEADER header; /*header */ + unsigned int ordinal1; + }; + + union { + struct { + unsigned int event_type:6; + unsigned int reserved1:2; + enum RELEASE_MEM_event_index_enum event_index:4; + unsigned int tcl1_vol_action_ena:1; + unsigned int tc_vol_action_ena:1; + unsigned int reserved2:1; + unsigned int tc_wb_action_ena:1; + unsigned int tcl1_action_ena:1; + unsigned int tc_action_ena:1; + unsigned int reserved3:6; + unsigned int atc:1; + enum RELEASE_MEM_cache_policy_enum cache_policy:2; + unsigned int reserved4:5; + } bitfields2; + unsigned int ordinal2; + }; + + union { + struct { + unsigned int reserved5:16; + enum RELEASE_MEM_dst_sel_enum dst_sel:2; + unsigned int reserved6:6; + enum RELEASE_MEM_int_sel_enum int_sel:3; + unsigned int reserved7:2; + enum RELEASE_MEM_data_sel_enum data_sel:3; + } bitfields3; + unsigned int ordinal3; + }; + + union { + struct { + unsigned int reserved8:2; + unsigned int address_lo_32b:30; + } bitfields4; + struct { + unsigned int reserved9:3; + unsigned int address_lo_64b:29; + } bitfields5; + unsigned int ordinal4; + }; + + unsigned int address_hi; + + unsigned int data_lo; + + unsigned int data_hi; +}; +#endif + +enum { + CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 +}; + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h new file mode 100644 index 000000000..b72fa3b8c --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h @@ -0,0 +1,107 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + + +#ifndef KFD_PM4_OPCODES_H +#define KFD_PM4_OPCODES_H + +enum it_opcode_type { + IT_NOP = 0x10, + IT_SET_BASE = 0x11, + IT_CLEAR_STATE = 0x12, + IT_INDEX_BUFFER_SIZE = 0x13, + IT_DISPATCH_DIRECT = 0x15, + IT_DISPATCH_INDIRECT = 0x16, + IT_ATOMIC_GDS = 0x1D, + IT_OCCLUSION_QUERY = 0x1F, + IT_SET_PREDICATION = 0x20, + IT_REG_RMW = 0x21, + IT_COND_EXEC = 0x22, + IT_PRED_EXEC = 0x23, + IT_DRAW_INDIRECT = 0x24, + IT_DRAW_INDEX_INDIRECT = 0x25, + IT_INDEX_BASE = 0x26, + IT_DRAW_INDEX_2 = 0x27, + IT_CONTEXT_CONTROL = 0x28, + IT_INDEX_TYPE = 0x2A, + IT_DRAW_INDIRECT_MULTI = 0x2C, + IT_DRAW_INDEX_AUTO = 0x2D, + IT_NUM_INSTANCES = 0x2F, + IT_DRAW_INDEX_MULTI_AUTO = 0x30, + IT_INDIRECT_BUFFER_CNST = 0x33, + IT_STRMOUT_BUFFER_UPDATE = 0x34, + IT_DRAW_INDEX_OFFSET_2 = 0x35, + IT_DRAW_PREAMBLE = 0x36, + IT_WRITE_DATA = 0x37, + IT_DRAW_INDEX_INDIRECT_MULTI = 0x38, + IT_MEM_SEMAPHORE = 0x39, + IT_COPY_DW = 0x3B, + IT_WAIT_REG_MEM = 0x3C, + IT_INDIRECT_BUFFER = 0x3F, + IT_COPY_DATA = 0x40, + IT_PFP_SYNC_ME = 0x42, + IT_SURFACE_SYNC = 0x43, + IT_COND_WRITE = 0x45, + IT_EVENT_WRITE = 0x46, + IT_EVENT_WRITE_EOP = 0x47, + IT_EVENT_WRITE_EOS = 0x48, + IT_RELEASE_MEM = 0x49, + IT_PREAMBLE_CNTL = 0x4A, + IT_DMA_DATA = 0x50, + IT_ACQUIRE_MEM = 0x58, + IT_REWIND = 0x59, + IT_LOAD_UCONFIG_REG = 0x5E, + IT_LOAD_SH_REG = 0x5F, + IT_LOAD_CONFIG_REG = 0x60, + IT_LOAD_CONTEXT_REG = 0x61, + IT_SET_CONFIG_REG = 0x68, + IT_SET_CONTEXT_REG = 0x69, + IT_SET_CONTEXT_REG_INDIRECT = 0x73, + IT_SET_SH_REG = 0x76, + IT_SET_SH_REG_OFFSET = 0x77, + IT_SET_QUEUE_REG = 0x78, + IT_SET_UCONFIG_REG = 0x79, + IT_SCRATCH_RAM_WRITE = 0x7D, + IT_SCRATCH_RAM_READ = 0x7E, + IT_LOAD_CONST_RAM = 0x80, + IT_WRITE_CONST_RAM = 0x81, + IT_DUMP_CONST_RAM = 0x83, + IT_INCREMENT_CE_COUNTER = 0x84, + IT_INCREMENT_DE_COUNTER = 0x85, + IT_WAIT_ON_CE_COUNTER = 0x86, + IT_WAIT_ON_DE_COUNTER_DIFF = 0x88, + IT_SWITCH_BUFFER = 0x8B, + IT_SET_RESOURCES = 0xA0, + IT_MAP_PROCESS = 0xA1, + IT_MAP_QUEUES = 0xA2, + IT_UNMAP_QUEUES = 0xA3, + IT_QUERY_STATUS = 0xA4, + IT_RUN_LIST = 0xA5, +}; + +#define PM4_TYPE_0 0 +#define PM4_TYPE_2 2 +#define PM4_TYPE_3 3 + +#endif /* KFD_PM4_OPCODES_H */ + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h new file mode 100644 index 000000000..92b285ca7 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -0,0 +1,1021 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef KFD_PRIV_H_INCLUDED +#define KFD_PRIV_H_INCLUDED + +#include <linux/hashtable.h> +#include <linux/mmu_notifier.h> +#include <linux/mutex.h> +#include <linux/types.h> +#include <linux/atomic.h> +#include <linux/workqueue.h> +#include <linux/spinlock.h> +#include <linux/kfd_ioctl.h> +#include <linux/idr.h> +#include <linux/kfifo.h> +#include <linux/seq_file.h> +#include <linux/kref.h> +#include <kgd_kfd_interface.h> + +#include "amd_shared.h" + +#define KFD_MAX_RING_ENTRY_SIZE 8 + +#define KFD_SYSFS_FILE_MODE 0444 + +/* GPU ID hash width in bits */ +#define KFD_GPU_ID_HASH_WIDTH 16 + +/* Use upper bits of mmap offset to store KFD driver specific information. + * BITS[63:62] - Encode MMAP type + * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to + * BITS[45:0] - MMAP offset value + * + * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these + * defines are w.r.t to PAGE_SIZE + */ +#define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT) +#define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT) +#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) +#define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) +#define KFD_MMAP_TYPE_RESERVED_MEM (0x1ULL << KFD_MMAP_TYPE_SHIFT) + +#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT) +#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \ + << KFD_MMAP_GPU_ID_SHIFT) +#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\ + & KFD_MMAP_GPU_ID_MASK) +#define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \ + >> KFD_MMAP_GPU_ID_SHIFT) + +#define KFD_MMAP_OFFSET_VALUE_MASK (0x3FFFFFFFFFFFULL >> PAGE_SHIFT) +#define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK) + +/* + * When working with cp scheduler we should assign the HIQ manually or via + * the amdgpu driver to a fixed hqd slot, here are the fixed HIQ hqd slot + * definitions for Kaveri. In Kaveri only the first ME queues participates + * in the cp scheduling taking that in mind we set the HIQ slot in the + * second ME. + */ +#define KFD_CIK_HIQ_PIPE 4 +#define KFD_CIK_HIQ_QUEUE 0 + +/* Macro for allocating structures */ +#define kfd_alloc_struct(ptr_to_struct) \ + ((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL)) + +#define KFD_MAX_NUM_OF_PROCESSES 512 +#define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024 + +/* + * Size of the per-process TBA+TMA buffer: 2 pages + * + * The first page is the TBA used for the CWSR ISA code. The second + * page is used as TMA for daisy changing a user-mode trap handler. + */ +#define KFD_CWSR_TBA_TMA_SIZE (PAGE_SIZE * 2) +#define KFD_CWSR_TMA_OFFSET PAGE_SIZE + +/* + * Kernel module parameter to specify maximum number of supported queues per + * device + */ +extern int max_num_of_queues_per_device; + +#define KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT 4096 +#define KFD_MAX_NUM_OF_QUEUES_PER_DEVICE \ + (KFD_MAX_NUM_OF_PROCESSES * \ + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) + +#define KFD_KERNEL_QUEUE_SIZE 2048 + +/* Kernel module parameter to specify the scheduling policy */ +extern int sched_policy; + +/* + * Kernel module parameter to specify the maximum process + * number per HW scheduler + */ +extern int hws_max_conc_proc; + +extern int cwsr_enable; + +/* + * Kernel module parameter to specify whether to send sigterm to HSA process on + * unhandled exception + */ +extern int send_sigterm; + +/* + * This kernel module is used to simulate large bar machine on non-large bar + * enabled machines. 
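+ * (i.e. the debug_largebar module parameter below forces KFD to treat the
+ * device as large-BAR capable; it is intended for debugging only and does
+ * not change the actual PCI BAR configuration.)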
+ */ +extern int debug_largebar; + +/* + * Ignore CRAT table during KFD initialization, can be used to work around + * broken CRAT tables on some AMD systems + */ +extern int ignore_crat; + +/* + * Set sh_mem_config.retry_disable on Vega10 + */ +extern int noretry; + +/* + * Halt if HWS hang is detected + */ +extern int halt_if_hws_hang; + +/** + * enum kfd_sched_policy + * + * @KFD_SCHED_POLICY_HWS: H/W scheduling policy known as command processor (cp) + * scheduling. In this scheduling mode we're using the firmware code to + * schedule the user mode queues and kernel queues such as HIQ and DIQ. + * the HIQ queue is used as a special queue that dispatches the configuration + * to the cp and the user mode queues list that are currently running. + * the DIQ queue is a debugging queue that dispatches debugging commands to the + * firmware. + * in this scheduling mode user mode queues over subscription feature is + * enabled. + * + * @KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION: The same as above but the over + * subscription feature disabled. + * + * @KFD_SCHED_POLICY_NO_HWS: no H/W scheduling policy is a mode which directly + * set the command processor registers and sets the queues "manually". This + * mode is used *ONLY* for debugging proposes. + * + */ +enum kfd_sched_policy { + KFD_SCHED_POLICY_HWS = 0, + KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION, + KFD_SCHED_POLICY_NO_HWS +}; + +enum cache_policy { + cache_policy_coherent, + cache_policy_noncoherent +}; + +#define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10) + +struct kfd_event_interrupt_class { + bool (*interrupt_isr)(struct kfd_dev *dev, + const uint32_t *ih_ring_entry, uint32_t *patched_ihre, + bool *patched_flag); + void (*interrupt_wq)(struct kfd_dev *dev, + const uint32_t *ih_ring_entry); +}; + +struct kfd_device_info { + enum amd_asic_type asic_family; + const struct kfd_event_interrupt_class *event_interrupt_class; + unsigned int max_pasid_bits; + unsigned int max_no_of_hqd; + unsigned int doorbell_size; + size_t ih_ring_entry_size; + uint8_t num_of_watch_points; + uint16_t mqd_size_aligned; + bool supports_cwsr; + bool needs_iommu_device; + bool needs_pci_atomics; + unsigned int num_sdma_engines; +}; + +struct kfd_mem_obj { + uint32_t range_start; + uint32_t range_end; + uint64_t gpu_addr; + uint32_t *cpu_ptr; + void *gtt_mem; +}; + +struct kfd_vmid_info { + uint32_t first_vmid_kfd; + uint32_t last_vmid_kfd; + uint32_t vmid_num_kfd; +}; + +struct kfd_dev { + struct kgd_dev *kgd; + + const struct kfd_device_info *device_info; + struct pci_dev *pdev; + + unsigned int id; /* topology stub index */ + + phys_addr_t doorbell_base; /* Start of actual doorbells used by + * KFD. 
It is aligned for mapping + * into user mode + */ + size_t doorbell_id_offset; /* Doorbell offset (from KFD doorbell + * to HW doorbell, GFX reserved some + * at the start) + */ + u32 __iomem *doorbell_kernel_ptr; /* This is a pointer for a doorbells + * page used by kernel queue + */ + + struct kgd2kfd_shared_resources shared_resources; + struct kfd_vmid_info vm_info; + + const struct kfd2kgd_calls *kfd2kgd; + struct mutex doorbell_mutex; + DECLARE_BITMAP(doorbell_available_index, + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); + + void *gtt_mem; + uint64_t gtt_start_gpu_addr; + void *gtt_start_cpu_ptr; + void *gtt_sa_bitmap; + struct mutex gtt_sa_lock; + unsigned int gtt_sa_chunk_size; + unsigned int gtt_sa_num_of_chunks; + + /* Interrupts */ + struct kfifo ih_fifo; + struct workqueue_struct *ih_wq; + struct work_struct interrupt_work; + spinlock_t interrupt_lock; + + /* QCM Device instance */ + struct device_queue_manager *dqm; + + bool init_complete; + /* + * Interrupts of interest to KFD are copied + * from the HW ring into a SW ring. + */ + bool interrupts_active; + + /* Debug manager */ + struct kfd_dbgmgr *dbgmgr; + + /* Maximum process number mapped to HW scheduler */ + unsigned int max_proc_per_quantum; + + /* CWSR */ + bool cwsr_enabled; + const void *cwsr_isa; + unsigned int cwsr_isa_size; +}; + +/* KGD2KFD callbacks */ +void kgd2kfd_exit(void); +struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, + struct pci_dev *pdev, const struct kfd2kgd_calls *f2g); +bool kgd2kfd_device_init(struct kfd_dev *kfd, + const struct kgd2kfd_shared_resources *gpu_resources); +void kgd2kfd_device_exit(struct kfd_dev *kfd); + +enum kfd_mempool { + KFD_MEMPOOL_SYSTEM_CACHEABLE = 1, + KFD_MEMPOOL_SYSTEM_WRITECOMBINE = 2, + KFD_MEMPOOL_FRAMEBUFFER = 3, +}; + +/* Character device interface */ +int kfd_chardev_init(void); +void kfd_chardev_exit(void); +struct device *kfd_chardev(void); + +/** + * enum kfd_unmap_queues_filter + * + * @KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: Preempts single queue. + * + * @KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: Preempts all queues in the + * running queues list. + * + * @KFD_UNMAP_QUEUES_FILTER_BY_PASID: Preempts queues that belongs to + * specific process. + * + */ +enum kfd_unmap_queues_filter { + KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE, + KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, + KFD_UNMAP_QUEUES_FILTER_BY_PASID +}; + +/** + * enum kfd_queue_type + * + * @KFD_QUEUE_TYPE_COMPUTE: Regular user mode queue type. + * + * @KFD_QUEUE_TYPE_SDMA: Sdma user mode queue type. + * + * @KFD_QUEUE_TYPE_HIQ: HIQ queue type. + * + * @KFD_QUEUE_TYPE_DIQ: DIQ queue type. + */ +enum kfd_queue_type { + KFD_QUEUE_TYPE_COMPUTE, + KFD_QUEUE_TYPE_SDMA, + KFD_QUEUE_TYPE_HIQ, + KFD_QUEUE_TYPE_DIQ +}; + +enum kfd_queue_format { + KFD_QUEUE_FORMAT_PM4, + KFD_QUEUE_FORMAT_AQL +}; + +/** + * struct queue_properties + * + * @type: The queue type. + * + * @queue_id: Queue identifier. + * + * @queue_address: Queue ring buffer address. + * + * @queue_size: Queue ring buffer size. + * + * @priority: Defines the queue priority relative to other queues in the + * process. + * This is just an indication and HW scheduling may override the priority as + * necessary while keeping the relative prioritization. + * the priority granularity is from 0 to f which f is the highest priority. + * currently all queues are initialized with the highest priority. + * + * @queue_percent: This field is partially implemented and currently a zero in + * this field defines that the queue is non active. 
+ * + * @read_ptr: User space address which points to the number of dwords the + * cp read from the ring buffer. This field updates automatically by the H/W. + * + * @write_ptr: Defines the number of dwords written to the ring buffer. + * + * @doorbell_ptr: This field aim is to notify the H/W of new packet written to + * the queue ring buffer. This field should be similar to write_ptr and the + * user should update this field after he updated the write_ptr. + * + * @doorbell_off: The doorbell offset in the doorbell pci-bar. + * + * @is_interop: Defines if this is a interop queue. Interop queue means that + * the queue can access both graphics and compute resources. + * + * @is_evicted: Defines if the queue is evicted. Only active queues + * are evicted, rendering them inactive. + * + * @is_active: Defines if the queue is active or not. @is_active and + * @is_evicted are protected by the DQM lock. + * + * @vmid: If the scheduling mode is no cp scheduling the field defines the vmid + * of the queue. + * + * This structure represents the queue properties for each queue no matter if + * it's user mode or kernel mode queue. + * + */ +struct queue_properties { + enum kfd_queue_type type; + enum kfd_queue_format format; + unsigned int queue_id; + uint64_t queue_address; + uint64_t queue_size; + uint32_t priority; + uint32_t queue_percent; + uint32_t *read_ptr; + uint32_t *write_ptr; + void __iomem *doorbell_ptr; + uint32_t doorbell_off; + bool is_interop; + bool is_evicted; + bool is_active; + /* Not relevant for user mode queues in cp scheduling */ + unsigned int vmid; + /* Relevant only for sdma queues*/ + uint32_t sdma_engine_id; + uint32_t sdma_queue_id; + uint32_t sdma_vm_addr; + /* Relevant only for VI */ + uint64_t eop_ring_buffer_address; + uint32_t eop_ring_buffer_size; + uint64_t ctx_save_restore_area_address; + uint32_t ctx_save_restore_area_size; + uint32_t ctl_stack_size; + uint64_t tba_addr; + uint64_t tma_addr; + /* Relevant for CU */ + uint32_t cu_mask_count; /* Must be a multiple of 32 */ + uint32_t *cu_mask; +}; + +/** + * struct queue + * + * @list: Queue linked list. + * + * @mqd: The queue MQD. + * + * @mqd_mem_obj: The MQD local gpu memory object. + * + * @gart_mqd_addr: The MQD gart mc address. + * + * @properties: The queue properties. + * + * @mec: Used only in no cp scheduling mode and identifies to micro engine id + * that the queue should be execute on. + * + * @pipe: Used only in no cp scheduling mode and identifies the queue's pipe + * id. + * + * @queue: Used only in no cp scheduliong mode and identifies the queue's slot. + * + * @process: The kfd process that created this queue. + * + * @device: The kfd device that created this queue. + * + * This structure represents user mode compute queues. + * It contains all the necessary data to handle such queues. + * + */ + +struct queue { + struct list_head list; + void *mqd; + struct kfd_mem_obj *mqd_mem_obj; + uint64_t gart_mqd_addr; + struct queue_properties properties; + + uint32_t mec; + uint32_t pipe; + uint32_t queue; + + unsigned int sdma_id; + unsigned int doorbell_id; + + struct kfd_process *process; + struct kfd_dev *device; +}; + +/* + * Please read the kfd_mqd_manager.h description. 
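+ * In short: each type below selects the mqd_manager implementation used to
+ * initialize and load that queue's memory queue descriptor (MQD).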
+ */ +enum KFD_MQD_TYPE { + KFD_MQD_TYPE_COMPUTE = 0, /* for no cp scheduling */ + KFD_MQD_TYPE_HIQ, /* for hiq */ + KFD_MQD_TYPE_CP, /* for cp queues and diq */ + KFD_MQD_TYPE_SDMA, /* for sdma queues */ + KFD_MQD_TYPE_MAX +}; + +struct scheduling_resources { + unsigned int vmid_mask; + enum kfd_queue_type type; + uint64_t queue_mask; + uint64_t gws_mask; + uint32_t oac_mask; + uint32_t gds_heap_base; + uint32_t gds_heap_size; +}; + +struct process_queue_manager { + /* data */ + struct kfd_process *process; + struct list_head queues; + unsigned long *queue_slot_bitmap; +}; + +struct qcm_process_device { + /* The Device Queue Manager that owns this data */ + struct device_queue_manager *dqm; + struct process_queue_manager *pqm; + /* Queues list */ + struct list_head queues_list; + struct list_head priv_queue_list; + + unsigned int queue_count; + unsigned int vmid; + bool is_debug; + unsigned int evicted; /* eviction counter, 0=active */ + + /* This flag tells if we should reset all wavefronts on + * process termination + */ + bool reset_wavefronts; + + /* + * All the memory management data should be here too + */ + uint64_t gds_context_area; + uint32_t sh_mem_config; + uint32_t sh_mem_bases; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; + uint32_t page_table_base; + uint32_t gds_size; + uint32_t num_gws; + uint32_t num_oac; + uint32_t sh_hidden_private_base; + + /* CWSR memory */ + void *cwsr_kaddr; + uint64_t cwsr_base; + uint64_t tba_addr; + uint64_t tma_addr; + + /* IB memory */ + uint64_t ib_base; + void *ib_kaddr; + + /* doorbell resources per process per device */ + unsigned long *doorbell_bitmap; +}; + +/* KFD Memory Eviction */ + +/* Approx. wait time before attempting to restore evicted BOs */ +#define PROCESS_RESTORE_TIME_MS 100 +/* Approx. back off time if restore fails due to lack of memory */ +#define PROCESS_BACK_OFF_TIME_MS 100 +/* Approx. time before evicting the process again */ +#define PROCESS_ACTIVE_TIME_MS 10 + +int kgd2kfd_quiesce_mm(struct mm_struct *mm); +int kgd2kfd_resume_mm(struct mm_struct *mm); +int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, + struct dma_fence *fence); + +/* 8 byte handle containing GPU ID in the most significant 4 bytes and + * idr_handle in the least significant 4 bytes + */ +#define MAKE_HANDLE(gpu_id, idr_handle) \ + (((uint64_t)(gpu_id) << 32) + idr_handle) +#define GET_GPU_ID(handle) (handle >> 32) +#define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF) + +enum kfd_pdd_bound { + PDD_UNBOUND = 0, + PDD_BOUND, + PDD_BOUND_SUSPENDED, +}; + +/* Data that is per-process-per device. */ +struct kfd_process_device { + /* + * List of all per-device data for a process. + * Starts from kfd_process.per_device_data. + */ + struct list_head per_device_list; + + /* The device that owns this data. */ + struct kfd_dev *dev; + + /* The process that owns this kfd_process_device. */ + struct kfd_process *process; + + /* per-process-per device QCM data structure */ + struct qcm_process_device qpd; + + /*Apertures*/ + uint64_t lds_base; + uint64_t lds_limit; + uint64_t gpuvm_base; + uint64_t gpuvm_limit; + uint64_t scratch_base; + uint64_t scratch_limit; + + /* VM context for GPUVM allocations */ + struct file *drm_file; + void *vm; + + /* GPUVM allocations storage */ + struct idr alloc_idr; + + /* Flag used to tell the pdd has dequeued from the dqm. + * This is used to prevent dev->dqm->ops.process_termination() from + * being called twice when it is already called in IOMMU callback + * function. 
+ */ + bool already_dequeued; + + /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ + enum kfd_pdd_bound bound; +}; + +#define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) + +/* Process data */ +struct kfd_process { + /* + * kfd_process are stored in an mm_struct*->kfd_process* + * hash table (kfd_processes in kfd_process.c) + */ + struct hlist_node kfd_processes; + + /* + * Opaque pointer to mm_struct. We don't hold a reference to + * it so it should never be dereferenced from here. This is + * only used for looking up processes by their mm. + */ + void *mm; + + struct kref ref; + struct work_struct release_work; + + struct mutex mutex; + + /* + * In any process, the thread that started main() is the lead + * thread and outlives the rest. + * It is here because amd_iommu_bind_pasid wants a task_struct. + * It can also be used for safely getting a reference to the + * mm_struct of the process. + */ + struct task_struct *lead_thread; + + /* We want to receive a notification when the mm_struct is destroyed */ + struct mmu_notifier mmu_notifier; + + /* Use for delayed freeing of kfd_process structure */ + struct rcu_head rcu; + + unsigned int pasid; + unsigned int doorbell_index; + + /* + * List of kfd_process_device structures, + * one for each device the process is using. + */ + struct list_head per_device_data; + + struct process_queue_manager pqm; + + /*Is the user space process 32 bit?*/ + bool is_32bit_user_mode; + + /* Event-related data */ + struct mutex event_mutex; + /* Event ID allocator and lookup */ + struct idr event_idr; + /* Event page */ + struct kfd_signal_page *signal_page; + size_t signal_mapped_size; + size_t signal_event_count; + bool signal_event_limit_reached; + + /* Information used for memory eviction */ + void *kgd_process_info; + /* Eviction fence that is attached to all the BOs of this process. The + * fence will be triggered during eviction and new one will be created + * during restore + */ + struct dma_fence *ef; + + /* Work items for evicting and restoring BOs */ + struct delayed_work eviction_work; + struct delayed_work restore_work; + /* seqno of the last scheduled eviction */ + unsigned int last_eviction_seqno; + /* Approx. the last timestamp (in jiffies) when the process was + * restored after an eviction + */ + unsigned long last_restore_timestamp; +}; + +#define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */ +extern DECLARE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); +extern struct srcu_struct kfd_processes_srcu; + +/** + * Ioctl function type. + * + * \param filep pointer to file structure. + * \param p amdkfd process pointer. + * \param data pointer to arg that was copied from user. 
+ */ +typedef int amdkfd_ioctl_t(struct file *filep, struct kfd_process *p, + void *data); + +struct amdkfd_ioctl_desc { + unsigned int cmd; + int flags; + amdkfd_ioctl_t *func; + unsigned int cmd_drv; + const char *name; +}; + +int kfd_process_create_wq(void); +void kfd_process_destroy_wq(void); +struct kfd_process *kfd_create_process(struct file *filep); +struct kfd_process *kfd_get_process(const struct task_struct *); +struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); +struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); +void kfd_unref_process(struct kfd_process *p); +int kfd_process_evict_queues(struct kfd_process *p); +int kfd_process_restore_queues(struct kfd_process *p); +void kfd_suspend_all_processes(void); +int kfd_resume_all_processes(void); + +int kfd_process_device_init_vm(struct kfd_process_device *pdd, + struct file *drm_file); +struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + struct kfd_process *p); +struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + struct kfd_process *p); +struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + struct kfd_process *p); + +int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, + struct vm_area_struct *vma); + +/* KFD process API for creating and translating handles */ +int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, + void *mem); +void *kfd_process_device_translate_handle(struct kfd_process_device *p, + int handle); +void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, + int handle); + +/* Process device data iterator */ +struct kfd_process_device *kfd_get_first_process_device_data( + struct kfd_process *p); +struct kfd_process_device *kfd_get_next_process_device_data( + struct kfd_process *p, + struct kfd_process_device *pdd); +bool kfd_has_process_device_data(struct kfd_process *p); + +/* PASIDs */ +int kfd_pasid_init(void); +void kfd_pasid_exit(void); +bool kfd_set_pasid_limit(unsigned int new_limit); +unsigned int kfd_get_pasid_limit(void); +unsigned int kfd_pasid_alloc(void); +void kfd_pasid_free(unsigned int pasid); + +/* Doorbells */ +size_t kfd_doorbell_process_slice(struct kfd_dev *kfd); +int kfd_doorbell_init(struct kfd_dev *kfd); +void kfd_doorbell_fini(struct kfd_dev *kfd); +int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, + struct vm_area_struct *vma); +void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + unsigned int *doorbell_off); +void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); +u32 read_kernel_doorbell(u32 __iomem *db); +void write_kernel_doorbell(void __iomem *db, u32 value); +void write_kernel_doorbell64(void __iomem *db, u64 value); +unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, + struct kfd_process *process, + unsigned int doorbell_id); +phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process); +int kfd_alloc_process_doorbells(struct kfd_process *process); +void kfd_free_process_doorbells(struct kfd_process *process); + +/* GTT Sub-Allocator */ + +int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, + struct kfd_mem_obj **mem_obj); + +int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj); + +extern struct device *kfd_device; + +/* Topology */ +int kfd_topology_init(void); +void kfd_topology_shutdown(void); +int kfd_topology_add_device(struct kfd_dev *gpu); +int kfd_topology_remove_device(struct kfd_dev 
*gpu); +struct kfd_topology_device *kfd_topology_device_by_proximity_domain( + uint32_t proximity_domain); +struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id); +struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); +struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); +int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); +int kfd_numa_node_to_apic_id(int numa_node_id); + +/* Interrupts */ +int kfd_interrupt_init(struct kfd_dev *dev); +void kfd_interrupt_exit(struct kfd_dev *dev); +void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); +bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry); +bool interrupt_is_wanted(struct kfd_dev *dev, + const uint32_t *ih_ring_entry, + uint32_t *patched_ihre, bool *flag); + +/* Power Management */ +void kgd2kfd_suspend(struct kfd_dev *kfd); +int kgd2kfd_resume(struct kfd_dev *kfd); + +/* GPU reset */ +int kgd2kfd_pre_reset(struct kfd_dev *kfd); +int kgd2kfd_post_reset(struct kfd_dev *kfd); + +/* amdkfd Apertures */ +int kfd_init_apertures(struct kfd_process *process); + +/* Queue Context Management */ +int init_queue(struct queue **q, const struct queue_properties *properties); +void uninit_queue(struct queue *q); +void print_queue_properties(struct queue_properties *q); +void print_queue(struct queue *q); + +struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); +void device_queue_manager_uninit(struct device_queue_manager *dqm); +struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + enum kfd_queue_type type); +void kernel_queue_uninit(struct kernel_queue *kq); +int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid); + +/* Process Queue Manager */ +struct process_queue_node { + struct queue *q; + struct kernel_queue *kq; + struct list_head process_queue_list; +}; + +void kfd_process_dequeue_from_device(struct kfd_process_device *pdd); +void kfd_process_dequeue_from_all_devices(struct kfd_process *p); +int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p); +void pqm_uninit(struct process_queue_manager *pqm); +int pqm_create_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, + struct file *f, + struct queue_properties *properties, + unsigned int *qid); +int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid); +int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p); +int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p); +struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, + unsigned int qid); + +int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + unsigned int fence_value, + unsigned int timeout_ms); + +/* Packet Manager */ + +#define KFD_FENCE_COMPLETED (100) +#define KFD_FENCE_INIT (10) + +struct packet_manager { + struct device_queue_manager *dqm; + struct kernel_queue *priv_queue; + struct mutex lock; + 
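+	/* lock serializes packet submission through priv_queue; the fields
+	 * below track the runlist indirect buffer while a runlist is active
+	 * (descriptive note based on the pm_* helpers declared further down)
+	 */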
bool allocated; + struct kfd_mem_obj *ib_buffer_obj; + unsigned int ib_size_bytes; + + const struct packet_manager_funcs *pmf; +}; + +struct packet_manager_funcs { + /* Support ASIC-specific packet formats for PM4 packets */ + int (*map_process)(struct packet_manager *pm, uint32_t *buffer, + struct qcm_process_device *qpd); + int (*runlist)(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain); + int (*set_resources)(struct packet_manager *pm, uint32_t *buffer, + struct scheduling_resources *res); + int (*map_queues)(struct packet_manager *pm, uint32_t *buffer, + struct queue *q, bool is_static); + int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer, + enum kfd_queue_type type, + enum kfd_unmap_queues_filter mode, + uint32_t filter_param, bool reset, + unsigned int sdma_engine); + int (*query_status)(struct packet_manager *pm, uint32_t *buffer, + uint64_t fence_address, uint32_t fence_value); + int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer); + + /* Packet sizes */ + int map_process_size; + int runlist_size; + int set_resources_size; + int map_queues_size; + int unmap_queues_size; + int query_status_size; + int release_mem_size; +}; + +extern const struct packet_manager_funcs kfd_vi_pm_funcs; +extern const struct packet_manager_funcs kfd_v9_pm_funcs; + +int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); +void pm_uninit(struct packet_manager *pm); +int pm_send_set_resources(struct packet_manager *pm, + struct scheduling_resources *res); +int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues); +int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + uint32_t fence_value); + +int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + enum kfd_unmap_queues_filter mode, + uint32_t filter_param, bool reset, + unsigned int sdma_engine); + +void pm_release_ib(struct packet_manager *pm); + +/* Following PM funcs can be shared among VI and AI */ +unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); +int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, + struct scheduling_resources *res); + +uint64_t kfd_get_number_elems(struct kfd_dev *kfd); + +/* Events */ +extern const struct kfd_event_interrupt_class event_interrupt_class_cik; +extern const struct kfd_event_interrupt_class event_interrupt_class_v9; + +extern const struct kfd_device_global_init_class device_global_init_class_cik; + +void kfd_event_init_process(struct kfd_process *p); +void kfd_event_free_process(struct kfd_process *p); +int kfd_event_mmap(struct kfd_process *process, struct vm_area_struct *vma); +int kfd_wait_on_events(struct kfd_process *p, + uint32_t num_events, void __user *data, + bool all, uint32_t user_timeout_ms, + uint32_t *wait_result); +void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, + uint32_t valid_id_bits); +void kfd_signal_iommu_event(struct kfd_dev *dev, + unsigned int pasid, unsigned long address, + bool is_write_requested, bool is_execute_requested); +void kfd_signal_hw_exception_event(unsigned int pasid); +int kfd_set_event(struct kfd_process *p, uint32_t event_id); +int kfd_reset_event(struct kfd_process *p, uint32_t event_id); +int kfd_event_page_set(struct kfd_process *p, void *kernel_address, + uint64_t size); +int kfd_event_create(struct file *devkfd, struct kfd_process *p, + uint32_t event_type, bool auto_reset, uint32_t node_id, + uint32_t *event_id, uint32_t *event_trigger_data, + uint64_t 
*event_page_offset, uint32_t *event_slot_index); +int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); + +void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, + struct kfd_vm_fault_info *info); + +void kfd_signal_reset_event(struct kfd_dev *dev); + +void kfd_flush_tlb(struct kfd_process_device *pdd); + +int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); + +bool kfd_is_locked(void); + +/* Debugfs */ +#if defined(CONFIG_DEBUG_FS) + +void kfd_debugfs_init(void); +void kfd_debugfs_fini(void); +int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data); +int pqm_debugfs_mqds(struct seq_file *m, void *data); +int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data); +int dqm_debugfs_hqds(struct seq_file *m, void *data); +int kfd_debugfs_rls_by_device(struct seq_file *m, void *data); +int pm_debugfs_runlist(struct seq_file *m, void *data); + +int kfd_debugfs_hang_hws(struct kfd_dev *dev); +int pm_debugfs_hang_hws(struct packet_manager *pm); +int dqm_debugfs_execute_queues(struct device_queue_manager *dqm); + +#else + +static inline void kfd_debugfs_init(void) {} +static inline void kfd_debugfs_fini(void) {} + +#endif + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c new file mode 100644 index 000000000..4694386cc --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -0,0 +1,1101 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/mutex.h> +#include <linux/log2.h> +#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/task.h> +#include <linux/slab.h> +#include <linux/amd-iommu.h> +#include <linux/notifier.h> +#include <linux/compat.h> +#include <linux/mman.h> +#include <linux/file.h> + +struct mm_struct; + +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" +#include "kfd_dbgmgr.h" +#include "kfd_iommu.h" + +/* + * List of struct kfd_process (field kfd_process). + * Unique/indexed by mm_struct* + */ +DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); +static DEFINE_MUTEX(kfd_processes_mutex); + +DEFINE_SRCU(kfd_processes_srcu); + +/* For process termination handling */ +static struct workqueue_struct *kfd_process_wq; + +/* Ordered, single-threaded workqueue for restoring evicted + * processes. 
Restoring multiple processes concurrently under memory + * pressure can lead to processes blocking each other from validating + * their BOs and result in a live-lock situation where processes + * remain evicted indefinitely. + */ +static struct workqueue_struct *kfd_restore_wq; + +static struct kfd_process *find_process(const struct task_struct *thread); +static void kfd_process_ref_release(struct kref *ref); +static struct kfd_process *create_process(const struct task_struct *thread, + struct file *filep); + +static void evict_process_worker(struct work_struct *work); +static void restore_process_worker(struct work_struct *work); + + +int kfd_process_create_wq(void) +{ + if (!kfd_process_wq) + kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0); + if (!kfd_restore_wq) + kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0); + + if (!kfd_process_wq || !kfd_restore_wq) { + kfd_process_destroy_wq(); + return -ENOMEM; + } + + return 0; +} + +void kfd_process_destroy_wq(void) +{ + if (kfd_process_wq) { + destroy_workqueue(kfd_process_wq); + kfd_process_wq = NULL; + } + if (kfd_restore_wq) { + destroy_workqueue(kfd_restore_wq); + kfd_restore_wq = NULL; + } +} + +static void kfd_process_free_gpuvm(struct kgd_mem *mem, + struct kfd_process_device *pdd) +{ + struct kfd_dev *dev = pdd->dev; + + dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, mem, pdd->vm); + dev->kfd2kgd->free_memory_of_gpu(dev->kgd, mem); +} + +/* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process + * This function should be only called right after the process + * is created and when kfd_processes_mutex is still being held + * to avoid concurrency. Because of that exclusiveness, we do + * not need to take p->mutex. + */ +static int kfd_process_alloc_gpuvm(struct kfd_process_device *pdd, + uint64_t gpu_va, uint32_t size, + uint32_t flags, void **kptr) +{ + struct kfd_dev *kdev = pdd->dev; + struct kgd_mem *mem = NULL; + int handle; + int err; + + err = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, gpu_va, size, + pdd->vm, &mem, NULL, flags); + if (err) + goto err_alloc_mem; + + err = kdev->kfd2kgd->map_memory_to_gpu(kdev->kgd, mem, pdd->vm); + if (err) + goto err_map_mem; + + err = kdev->kfd2kgd->sync_memory(kdev->kgd, mem, true); + if (err) { + pr_debug("Sync memory failed, wait interrupted by user signal\n"); + goto sync_memory_failed; + } + + /* Create an obj handle so kfd_process_device_remove_obj_handle + * will take care of the bo removal when the process finishes. + * We do not need to take p->mutex, because the process is just + * created and the ioctls have not had the chance to run. + */ + handle = kfd_process_device_create_obj_handle(pdd, mem); + + if (handle < 0) { + err = handle; + goto free_gpuvm; + } + + if (kptr) { + err = kdev->kfd2kgd->map_gtt_bo_to_kernel(kdev->kgd, + (struct kgd_mem *)mem, kptr, NULL); + if (err) { + pr_debug("Map GTT BO to kernel failed\n"); + goto free_obj_handle; + } + } + + return err; + +free_obj_handle: + kfd_process_device_remove_obj_handle(pdd, handle); +free_gpuvm: +sync_memory_failed: + kfd_process_free_gpuvm(mem, pdd); + return err; + +err_map_mem: + kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, mem); +err_alloc_mem: + *kptr = NULL; + return err; +} + +/* kfd_process_device_reserve_ib_mem - Reserve memory inside the + * process for IB usage The memory reserved is for KFD to submit + * IB to AMDGPU from kernel. If the memory is reserved + * successfully, ib_kaddr will have the CPU/kernel + * address. Check ib_kaddr before accessing the memory. 
+ */ +static int kfd_process_device_reserve_ib_mem(struct kfd_process_device *pdd) +{ + struct qcm_process_device *qpd = &pdd->qpd; + uint32_t flags = ALLOC_MEM_FLAGS_GTT | + ALLOC_MEM_FLAGS_NO_SUBSTITUTE | + ALLOC_MEM_FLAGS_WRITABLE | + ALLOC_MEM_FLAGS_EXECUTABLE; + void *kaddr; + int ret; + + if (qpd->ib_kaddr || !qpd->ib_base) + return 0; + + /* ib_base is only set for dGPU */ + ret = kfd_process_alloc_gpuvm(pdd, qpd->ib_base, PAGE_SIZE, flags, + &kaddr); + if (ret) + return ret; + + qpd->ib_kaddr = kaddr; + + return 0; +} + +struct kfd_process *kfd_create_process(struct file *filep) +{ + struct kfd_process *process; + struct task_struct *thread = current; + + if (!thread->mm) + return ERR_PTR(-EINVAL); + + /* Only the pthreads threading model is supported. */ + if (thread->group_leader->mm != thread->mm) + return ERR_PTR(-EINVAL); + + /* + * take kfd processes mutex before starting of process creation + * so there won't be a case where two threads of the same process + * create two kfd_process structures + */ + mutex_lock(&kfd_processes_mutex); + + /* A prior open of /dev/kfd could have already created the process. */ + process = find_process(thread); + if (process) + pr_debug("Process already found\n"); + else + process = create_process(thread, filep); + + mutex_unlock(&kfd_processes_mutex); + + return process; +} + +struct kfd_process *kfd_get_process(const struct task_struct *thread) +{ + struct kfd_process *process; + + if (!thread->mm) + return ERR_PTR(-EINVAL); + + /* Only the pthreads threading model is supported. */ + if (thread->group_leader->mm != thread->mm) + return ERR_PTR(-EINVAL); + + process = find_process(thread); + if (!process) + return ERR_PTR(-EINVAL); + + return process; +} + +static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) +{ + struct kfd_process *process; + + hash_for_each_possible_rcu(kfd_processes_table, process, + kfd_processes, (uintptr_t)mm) + if (process->mm == mm) + return process; + + return NULL; +} + +static struct kfd_process *find_process(const struct task_struct *thread) +{ + struct kfd_process *p; + int idx; + + idx = srcu_read_lock(&kfd_processes_srcu); + p = find_process_by_mm(thread->mm); + srcu_read_unlock(&kfd_processes_srcu, idx); + + return p; +} + +void kfd_unref_process(struct kfd_process *p) +{ + kref_put(&p->ref, kfd_process_ref_release); +} + +static void kfd_process_device_free_bos(struct kfd_process_device *pdd) +{ + struct kfd_process *p = pdd->process; + void *mem; + int id; + + /* + * Remove all handles from idr and release appropriate + * local memory object + */ + idr_for_each_entry(&pdd->alloc_idr, mem, id) { + struct kfd_process_device *peer_pdd; + + list_for_each_entry(peer_pdd, &p->per_device_data, + per_device_list) { + if (!peer_pdd->vm) + continue; + peer_pdd->dev->kfd2kgd->unmap_memory_to_gpu( + peer_pdd->dev->kgd, mem, peer_pdd->vm); + } + + pdd->dev->kfd2kgd->free_memory_of_gpu(pdd->dev->kgd, mem); + kfd_process_device_remove_obj_handle(pdd, id); + } +} + +static void kfd_process_free_outstanding_kfd_bos(struct kfd_process *p) +{ + struct kfd_process_device *pdd; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) + kfd_process_device_free_bos(pdd); +} + +static void kfd_process_destroy_pdds(struct kfd_process *p) +{ + struct kfd_process_device *pdd, *temp; + + list_for_each_entry_safe(pdd, temp, &p->per_device_data, + per_device_list) { + pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n", + pdd->dev->id, p->pasid); + + if (pdd->drm_file) + fput(pdd->drm_file); + 
else if (pdd->vm) + pdd->dev->kfd2kgd->destroy_process_vm( + pdd->dev->kgd, pdd->vm); + + list_del(&pdd->per_device_list); + + if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base) + free_pages((unsigned long)pdd->qpd.cwsr_kaddr, + get_order(KFD_CWSR_TBA_TMA_SIZE)); + + kfree(pdd->qpd.doorbell_bitmap); + idr_destroy(&pdd->alloc_idr); + + kfree(pdd); + } +} + +/* No process locking is needed in this function, because the process + * is not findable any more. We must assume that no other thread is + * using it any more, otherwise we couldn't safely free the process + * structure in the end. + */ +static void kfd_process_wq_release(struct work_struct *work) +{ + struct kfd_process *p = container_of(work, struct kfd_process, + release_work); + + kfd_iommu_unbind_process(p); + + kfd_process_free_outstanding_kfd_bos(p); + + kfd_process_destroy_pdds(p); + dma_fence_put(p->ef); + + kfd_event_free_process(p); + + kfd_pasid_free(p->pasid); + kfd_free_process_doorbells(p); + + mutex_destroy(&p->mutex); + + put_task_struct(p->lead_thread); + + kfree(p); +} + +static void kfd_process_ref_release(struct kref *ref) +{ + struct kfd_process *p = container_of(ref, struct kfd_process, ref); + + INIT_WORK(&p->release_work, kfd_process_wq_release); + queue_work(kfd_process_wq, &p->release_work); +} + +static void kfd_process_destroy_delayed(struct rcu_head *rcu) +{ + struct kfd_process *p = container_of(rcu, struct kfd_process, rcu); + + kfd_unref_process(p); +} + +static void kfd_process_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct kfd_process *p; + struct kfd_process_device *pdd = NULL; + + /* + * The kfd_process structure can not be free because the + * mmu_notifier srcu is read locked + */ + p = container_of(mn, struct kfd_process, mmu_notifier); + if (WARN_ON(p->mm != mm)) + return; + + mutex_lock(&kfd_processes_mutex); + hash_del_rcu(&p->kfd_processes); + mutex_unlock(&kfd_processes_mutex); + synchronize_srcu(&kfd_processes_srcu); + + cancel_delayed_work_sync(&p->eviction_work); + cancel_delayed_work_sync(&p->restore_work); + + mutex_lock(&p->mutex); + + /* Iterate over all process device data structures and if the + * pdd is in debug mode, we should first force unregistration, + * then we will be able to destroy the queues + */ + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + struct kfd_dev *dev = pdd->dev; + + mutex_lock(kfd_get_dbgmgr_mutex()); + if (dev && dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) { + if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) { + kfd_dbgmgr_destroy(dev->dbgmgr); + dev->dbgmgr = NULL; + } + } + mutex_unlock(kfd_get_dbgmgr_mutex()); + } + + kfd_process_dequeue_from_all_devices(p); + pqm_uninit(&p->pqm); + + /* Indicate to other users that MM is no longer valid */ + p->mm = NULL; + + mutex_unlock(&p->mutex); + + mmu_notifier_unregister_no_release(&p->mmu_notifier, mm); + mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed); +} + +static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { + .release = kfd_process_notifier_release, +}; + +static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep) +{ + unsigned long offset; + struct kfd_process_device *pdd; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + struct kfd_dev *dev = pdd->dev; + struct qcm_process_device *qpd = &pdd->qpd; + + if (!dev->cwsr_enabled || qpd->cwsr_kaddr || qpd->cwsr_base) + continue; + + offset = (KFD_MMAP_TYPE_RESERVED_MEM | KFD_MMAP_GPU_ID(dev->id)) + << PAGE_SHIFT; + qpd->tba_addr = 
(int64_t)vm_mmap(filep, 0, + KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC, + MAP_SHARED, offset); + + if (IS_ERR_VALUE(qpd->tba_addr)) { + int err = qpd->tba_addr; + + pr_err("Failure to set tba address. error %d.\n", err); + qpd->tba_addr = 0; + qpd->cwsr_kaddr = NULL; + return err; + } + + memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size); + + qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET; + pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n", + qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr); + } + + return 0; +} + +static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd) +{ + struct kfd_dev *dev = pdd->dev; + struct qcm_process_device *qpd = &pdd->qpd; + uint32_t flags = ALLOC_MEM_FLAGS_GTT | + ALLOC_MEM_FLAGS_NO_SUBSTITUTE | ALLOC_MEM_FLAGS_EXECUTABLE; + void *kaddr; + int ret; + + if (!dev->cwsr_enabled || qpd->cwsr_kaddr || !qpd->cwsr_base) + return 0; + + /* cwsr_base is only set for dGPU */ + ret = kfd_process_alloc_gpuvm(pdd, qpd->cwsr_base, + KFD_CWSR_TBA_TMA_SIZE, flags, &kaddr); + if (ret) + return ret; + + qpd->cwsr_kaddr = kaddr; + qpd->tba_addr = qpd->cwsr_base; + + memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size); + + qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET; + pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n", + qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr); + + return 0; +} + +static struct kfd_process *create_process(const struct task_struct *thread, + struct file *filep) +{ + struct kfd_process *process; + int err = -ENOMEM; + + process = kzalloc(sizeof(*process), GFP_KERNEL); + + if (!process) + goto err_alloc_process; + + process->pasid = kfd_pasid_alloc(); + if (process->pasid == 0) + goto err_alloc_pasid; + + if (kfd_alloc_process_doorbells(process) < 0) + goto err_alloc_doorbells; + + kref_init(&process->ref); + + mutex_init(&process->mutex); + + process->mm = thread->mm; + + /* register notifier */ + process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; + err = mmu_notifier_register(&process->mmu_notifier, process->mm); + if (err) + goto err_mmu_notifier; + + hash_add_rcu(kfd_processes_table, &process->kfd_processes, + (uintptr_t)process->mm); + + process->lead_thread = thread->group_leader; + get_task_struct(process->lead_thread); + + INIT_LIST_HEAD(&process->per_device_data); + + kfd_event_init_process(process); + + err = pqm_init(&process->pqm, process); + if (err != 0) + goto err_process_pqm_init; + + /* init process apertures*/ + process->is_32bit_user_mode = in_compat_syscall(); + err = kfd_init_apertures(process); + if (err != 0) + goto err_init_apertures; + + INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker); + INIT_DELAYED_WORK(&process->restore_work, restore_process_worker); + process->last_restore_timestamp = get_jiffies_64(); + + err = kfd_process_init_cwsr_apu(process, filep); + if (err) + goto err_init_cwsr; + + return process; + +err_init_cwsr: + kfd_process_free_outstanding_kfd_bos(process); + kfd_process_destroy_pdds(process); +err_init_apertures: + pqm_uninit(&process->pqm); +err_process_pqm_init: + hash_del_rcu(&process->kfd_processes); + synchronize_rcu(); + mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm); +err_mmu_notifier: + mutex_destroy(&process->mutex); + kfd_free_process_doorbells(process); +err_alloc_doorbells: + kfd_pasid_free(process->pasid); +err_alloc_pasid: + kfree(process); +err_alloc_process: + return ERR_PTR(err); +} + +static int init_doorbell_bitmap(struct qcm_process_device *qpd, + struct kfd_dev 
*dev) +{ + unsigned int i; + + if (!KFD_IS_SOC15(dev->device_info->asic_family)) + return 0; + + qpd->doorbell_bitmap = + kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, + BITS_PER_BYTE), GFP_KERNEL); + if (!qpd->doorbell_bitmap) + return -ENOMEM; + + /* Mask out any reserved doorbells */ + for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS; i++) + if ((dev->shared_resources.reserved_doorbell_mask & i) == + dev->shared_resources.reserved_doorbell_val) { + set_bit(i, qpd->doorbell_bitmap); + pr_debug("reserved doorbell 0x%03x\n", i); + } + + return 0; +} + +struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + struct kfd_process *p) +{ + struct kfd_process_device *pdd = NULL; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) + if (pdd->dev == dev) + return pdd; + + return NULL; +} + +struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + struct kfd_process *p) +{ + struct kfd_process_device *pdd = NULL; + + pdd = kzalloc(sizeof(*pdd), GFP_KERNEL); + if (!pdd) + return NULL; + + if (init_doorbell_bitmap(&pdd->qpd, dev)) { + pr_err("Failed to init doorbell for process\n"); + kfree(pdd); + return NULL; + } + + pdd->dev = dev; + INIT_LIST_HEAD(&pdd->qpd.queues_list); + INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); + pdd->qpd.dqm = dev->dqm; + pdd->qpd.pqm = &p->pqm; + pdd->qpd.evicted = 0; + pdd->process = p; + pdd->bound = PDD_UNBOUND; + pdd->already_dequeued = false; + list_add(&pdd->per_device_list, &p->per_device_data); + + /* Init idr used for memory handle translation */ + idr_init(&pdd->alloc_idr); + + return pdd; +} + +/** + * kfd_process_device_init_vm - Initialize a VM for a process-device + * + * @pdd: The process-device + * @drm_file: Optional pointer to a DRM file descriptor + * + * If @drm_file is specified, it will be used to acquire the VM from + * that file descriptor. If successful, the @pdd takes ownership of + * the file descriptor. + * + * If @drm_file is NULL, a new VM is created. + * + * Returns 0 on success, -errno on failure. + */ +int kfd_process_device_init_vm(struct kfd_process_device *pdd, + struct file *drm_file) +{ + struct kfd_process *p; + struct kfd_dev *dev; + int ret; + + if (pdd->vm) + return drm_file ? -EBUSY : 0; + + p = pdd->process; + dev = pdd->dev; + + if (drm_file) + ret = dev->kfd2kgd->acquire_process_vm( + dev->kgd, drm_file, + &pdd->vm, &p->kgd_process_info, &p->ef); + else + ret = dev->kfd2kgd->create_process_vm( + dev->kgd, &pdd->vm, &p->kgd_process_info, &p->ef); + if (ret) { + pr_err("Failed to create process VM object\n"); + return ret; + } + + ret = kfd_process_device_reserve_ib_mem(pdd); + if (ret) + goto err_reserve_ib_mem; + ret = kfd_process_device_init_cwsr_dgpu(pdd); + if (ret) + goto err_init_cwsr; + + pdd->drm_file = drm_file; + + return 0; + +err_init_cwsr: +err_reserve_ib_mem: + kfd_process_device_free_bos(pdd); + if (!drm_file) + dev->kfd2kgd->destroy_process_vm(dev->kgd, pdd->vm); + pdd->vm = NULL; + + return ret; +} + +/* + * Direct the IOMMU to bind the process (specifically the pasid->mm) + * to the device. + * Unbinding occurs when the process dies or the device is removed. + * + * Assumes that the process lock is held. 
+ */ +struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + struct kfd_process *p) +{ + struct kfd_process_device *pdd; + int err; + + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + return ERR_PTR(-ENOMEM); + } + + err = kfd_iommu_bind_process_to_device(pdd); + if (err) + return ERR_PTR(err); + + err = kfd_process_device_init_vm(pdd, NULL); + if (err) + return ERR_PTR(err); + + return pdd; +} + +struct kfd_process_device *kfd_get_first_process_device_data( + struct kfd_process *p) +{ + return list_first_entry(&p->per_device_data, + struct kfd_process_device, + per_device_list); +} + +struct kfd_process_device *kfd_get_next_process_device_data( + struct kfd_process *p, + struct kfd_process_device *pdd) +{ + if (list_is_last(&pdd->per_device_list, &p->per_device_data)) + return NULL; + return list_next_entry(pdd, per_device_list); +} + +bool kfd_has_process_device_data(struct kfd_process *p) +{ + return !(list_empty(&p->per_device_data)); +} + +/* Create specific handle mapped to mem from process local memory idr + * Assumes that the process lock is held. + */ +int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, + void *mem) +{ + return idr_alloc(&pdd->alloc_idr, mem, 0, 0, GFP_KERNEL); +} + +/* Translate specific handle from process local memory idr + * Assumes that the process lock is held. + */ +void *kfd_process_device_translate_handle(struct kfd_process_device *pdd, + int handle) +{ + if (handle < 0) + return NULL; + + return idr_find(&pdd->alloc_idr, handle); +} + +/* Remove specific handle from process local memory idr + * Assumes that the process lock is held. + */ +void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, + int handle) +{ + if (handle >= 0) + idr_remove(&pdd->alloc_idr, handle); +} + +/* This increments the process->ref counter. */ +struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) +{ + struct kfd_process *p, *ret_p = NULL; + unsigned int temp; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + if (p->pasid == pasid) { + kref_get(&p->ref); + ret_p = p; + break; + } + } + + srcu_read_unlock(&kfd_processes_srcu, idx); + + return ret_p; +} + +/* This increments the process->ref counter. */ +struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) +{ + struct kfd_process *p; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + p = find_process_by_mm(mm); + if (p) + kref_get(&p->ref); + + srcu_read_unlock(&kfd_processes_srcu, idx); + + return p; +} + +/* process_evict_queues - Evict all user queues of a process + * + * Eviction is reference-counted per process-device. This means multiple + * evictions from different sources can be nested safely. 
+ */ +int kfd_process_evict_queues(struct kfd_process *p) +{ + struct kfd_process_device *pdd; + int r = 0; + unsigned int n_evicted = 0; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm, + &pdd->qpd); + if (r) { + pr_err("Failed to evict process queues\n"); + goto fail; + } + n_evicted++; + } + + return r; + +fail: + /* To keep state consistent, roll back partial eviction by + * restoring queues + */ + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + if (n_evicted == 0) + break; + if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, + &pdd->qpd)) + pr_err("Failed to restore queues\n"); + + n_evicted--; + } + + return r; +} + +/* process_restore_queues - Restore all user queues of a process */ +int kfd_process_restore_queues(struct kfd_process *p) +{ + struct kfd_process_device *pdd; + int r, ret = 0; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, + &pdd->qpd); + if (r) { + pr_err("Failed to restore process queues\n"); + if (!ret) + ret = r; + } + } + + return ret; +} + +static void evict_process_worker(struct work_struct *work) +{ + int ret; + struct kfd_process *p; + struct delayed_work *dwork; + + dwork = to_delayed_work(work); + + /* Process termination destroys this worker thread. So during the + * lifetime of this thread, kfd_process p will be valid + */ + p = container_of(dwork, struct kfd_process, eviction_work); + WARN_ONCE(p->last_eviction_seqno != p->ef->seqno, + "Eviction fence mismatch\n"); + + /* Narrow window of overlap between restore and evict work + * item is possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos + * unreserves KFD BOs, it is possible to evicted again. But + * restore has few more steps of finish. So lets wait for any + * previous restore work to complete + */ + flush_delayed_work(&p->restore_work); + + pr_debug("Started evicting pasid %d\n", p->pasid); + ret = kfd_process_evict_queues(p); + if (!ret) { + dma_fence_signal(p->ef); + dma_fence_put(p->ef); + p->ef = NULL; + queue_delayed_work(kfd_restore_wq, &p->restore_work, + msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); + + pr_debug("Finished evicting pasid %d\n", p->pasid); + } else + pr_err("Failed to evict queues of pasid %d\n", p->pasid); +} + +static void restore_process_worker(struct work_struct *work) +{ + struct delayed_work *dwork; + struct kfd_process *p; + struct kfd_process_device *pdd; + int ret = 0; + + dwork = to_delayed_work(work); + + /* Process termination destroys this worker thread. So during the + * lifetime of this thread, kfd_process p will be valid + */ + p = container_of(dwork, struct kfd_process, restore_work); + + /* Call restore_process_bos on the first KGD device. This function + * takes care of restoring the whole process including other devices. + * Restore can fail if enough memory is not available. If so, + * reschedule again. + */ + pdd = list_first_entry(&p->per_device_data, + struct kfd_process_device, + per_device_list); + + pr_debug("Started restoring pasid %d\n", p->pasid); + + /* Setting last_restore_timestamp before successful restoration. + * Otherwise this would have to be set by KGD (restore_process_bos) + * before KFD BOs are unreserved. If not, the process can be evicted + * again before the timestamp is set. + * If restore fails, the timestamp will be set again in the next + * attempt. 
This would mean that the minimum GPU quanta would be + * PROCESS_ACTIVE_TIME_MS - (time to execute the following two + * functions) + */ + + p->last_restore_timestamp = get_jiffies_64(); + ret = pdd->dev->kfd2kgd->restore_process_bos(p->kgd_process_info, + &p->ef); + if (ret) { + pr_debug("Failed to restore BOs of pasid %d, retry after %d ms\n", + p->pasid, PROCESS_BACK_OFF_TIME_MS); + ret = queue_delayed_work(kfd_restore_wq, &p->restore_work, + msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); + WARN(!ret, "reschedule restore work failed\n"); + return; + } + + ret = kfd_process_restore_queues(p); + if (!ret) + pr_debug("Finished restoring pasid %d\n", p->pasid); + else + pr_err("Failed to restore queues of pasid %d\n", p->pasid); +} + +void kfd_suspend_all_processes(void) +{ + struct kfd_process *p; + unsigned int temp; + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + cancel_delayed_work_sync(&p->eviction_work); + cancel_delayed_work_sync(&p->restore_work); + + if (kfd_process_evict_queues(p)) + pr_err("Failed to suspend process %d\n", p->pasid); + dma_fence_signal(p->ef); + dma_fence_put(p->ef); + p->ef = NULL; + } + srcu_read_unlock(&kfd_processes_srcu, idx); +} + +int kfd_resume_all_processes(void) +{ + struct kfd_process *p; + unsigned int temp; + int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) { + pr_err("Restore process %d failed during resume\n", + p->pasid); + ret = -EFAULT; + } + } + srcu_read_unlock(&kfd_processes_srcu, idx); + return ret; +} + +int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, + struct vm_area_struct *vma) +{ + struct kfd_process_device *pdd; + struct qcm_process_device *qpd; + + if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) { + pr_err("Incorrect CWSR mapping size.\n"); + return -EINVAL; + } + + pdd = kfd_get_process_device_data(dev, process); + if (!pdd) + return -EINVAL; + qpd = &pdd->qpd; + + qpd->cwsr_kaddr = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(KFD_CWSR_TBA_TMA_SIZE)); + if (!qpd->cwsr_kaddr) { + pr_err("Error allocating per process CWSR buffer.\n"); + return -ENOMEM; + } + + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND + | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; + /* Mapping pages to user process */ + return remap_pfn_range(vma, vma->vm_start, + PFN_DOWN(__pa(qpd->cwsr_kaddr)), + KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot); +} + +void kfd_flush_tlb(struct kfd_process_device *pdd) +{ + struct kfd_dev *dev = pdd->dev; + const struct kfd2kgd_calls *f2g = dev->kfd2kgd; + + if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) { + /* Nothing to flush until a VMID is assigned, which + * only happens when the first queue is created. 
+ */ + if (pdd->qpd.vmid) + f2g->invalidate_tlbs_vmid(dev->kgd, pdd->qpd.vmid); + } else { + f2g->invalidate_tlbs(dev->kgd, pdd->process->pasid); + } +} + +#if defined(CONFIG_DEBUG_FS) + +int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) +{ + struct kfd_process *p; + unsigned int temp; + int r = 0; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + seq_printf(m, "Process %d PASID %d:\n", + p->lead_thread->tgid, p->pasid); + + mutex_lock(&p->mutex); + r = pqm_debugfs_mqds(m, &p->pqm); + mutex_unlock(&p->mutex); + + if (r) + break; + } + + srcu_read_unlock(&kfd_processes_srcu, idx); + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c new file mode 100644 index 000000000..c8cad9c07 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -0,0 +1,474 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include <linux/slab.h> +#include <linux/list.h> +#include "kfd_device_queue_manager.h" +#include "kfd_priv.h" +#include "kfd_kernel_queue.h" + +static inline struct process_queue_node *get_queue_by_qid( + struct process_queue_manager *pqm, unsigned int qid) +{ + struct process_queue_node *pqn; + + list_for_each_entry(pqn, &pqm->queues, process_queue_list) { + if ((pqn->q && pqn->q->properties.queue_id == qid) || + (pqn->kq && pqn->kq->queue->properties.queue_id == qid)) + return pqn; + } + + return NULL; +} + +static int find_available_queue_slot(struct process_queue_manager *pqm, + unsigned int *qid) +{ + unsigned long found; + + found = find_first_zero_bit(pqm->queue_slot_bitmap, + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); + + pr_debug("The new slot id %lu\n", found); + + if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { + pr_info("Cannot open more queues for process with pasid %d\n", + pqm->process->pasid); + return -ENOMEM; + } + + set_bit(found, pqm->queue_slot_bitmap); + *qid = found; + + return 0; +} + +void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) +{ + struct kfd_dev *dev = pdd->dev; + + if (pdd->already_dequeued) + return; + + dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd); + pdd->already_dequeued = true; +} + +void kfd_process_dequeue_from_all_devices(struct kfd_process *p) +{ + struct kfd_process_device *pdd; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) + kfd_process_dequeue_from_device(pdd); +} + +int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p) +{ + INIT_LIST_HEAD(&pqm->queues); + pqm->queue_slot_bitmap = + kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, + BITS_PER_BYTE), GFP_KERNEL); + if (!pqm->queue_slot_bitmap) + return -ENOMEM; + pqm->process = p; + + return 0; +} + +void pqm_uninit(struct process_queue_manager *pqm) +{ + struct process_queue_node *pqn, *next; + + list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { + uninit_queue(pqn->q); + list_del(&pqn->process_queue_list); + kfree(pqn); + } + + kfree(pqm->queue_slot_bitmap); + pqm->queue_slot_bitmap = NULL; +} + +static int create_cp_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, struct queue **q, + struct queue_properties *q_properties, + struct file *f, unsigned int qid) +{ + int retval; + + /* Doorbell initialized in user space*/ + q_properties->doorbell_ptr = NULL; + + /* let DQM handle it*/ + q_properties->vmid = 0; + q_properties->queue_id = qid; + + retval = init_queue(q, q_properties); + if (retval != 0) + return retval; + + (*q)->device = dev; + (*q)->process = pqm->process; + + pr_debug("PQM After init queue"); + + return retval; +} + +int pqm_create_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, + struct file *f, + struct queue_properties *properties, + unsigned int *qid) +{ + int retval; + struct kfd_process_device *pdd; + struct queue *q; + struct process_queue_node *pqn; + struct kernel_queue *kq; + enum kfd_queue_type type = properties->type; + unsigned int max_queues = 127; /* HWS limit */ + + q = NULL; + kq = NULL; + + pdd = kfd_get_process_device_data(dev, pqm->process); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + return -1; + } + + /* + * for debug process, verify that it is within the static queues limit + * currently limit is set to half of the total avail HQD slots + * If we are just about to create DIQ, the is_debug flag is not set yet + * Hence we also check the type as well + */ + if ((pdd->qpd.is_debug) || (type == 
KFD_QUEUE_TYPE_DIQ)) + max_queues = dev->device_info->max_no_of_hqd/2; + + if (pdd->qpd.queue_count >= max_queues) + return -ENOSPC; + + retval = find_available_queue_slot(pqm, qid); + if (retval != 0) + return retval; + + if (list_empty(&pdd->qpd.queues_list) && + list_empty(&pdd->qpd.priv_queue_list)) + dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); + + pqn = kzalloc(sizeof(*pqn), GFP_KERNEL); + if (!pqn) { + retval = -ENOMEM; + goto err_allocate_pqn; + } + + switch (type) { + case KFD_QUEUE_TYPE_SDMA: + if (dev->dqm->queue_count >= get_num_sdma_queues(dev->dqm)) { + pr_err("Over-subscription is not allowed for SDMA.\n"); + retval = -EPERM; + goto err_create_queue; + } + + retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); + if (retval != 0) + goto err_create_queue; + pqn->q = q; + pqn->kq = NULL; + retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd); + pr_debug("DQM returned %d for create_queue\n", retval); + print_queue(q); + break; + + case KFD_QUEUE_TYPE_COMPUTE: + /* check if there is over subscription */ + if ((dev->dqm->sched_policy == + KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && + ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) || + (dev->dqm->queue_count >= get_queues_num(dev->dqm)))) { + pr_debug("Over-subscription is not allowed when amdkfd.sched_policy == 1\n"); + retval = -EPERM; + goto err_create_queue; + } + + retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); + if (retval != 0) + goto err_create_queue; + pqn->q = q; + pqn->kq = NULL; + retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd); + pr_debug("DQM returned %d for create_queue\n", retval); + print_queue(q); + break; + case KFD_QUEUE_TYPE_DIQ: + kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_DIQ); + if (!kq) { + retval = -ENOMEM; + goto err_create_queue; + } + kq->queue->properties.queue_id = *qid; + pqn->kq = kq; + pqn->q = NULL; + retval = dev->dqm->ops.create_kernel_queue(dev->dqm, + kq, &pdd->qpd); + break; + default: + WARN(1, "Invalid queue type %d", type); + retval = -EINVAL; + } + + if (retval != 0) { + pr_err("Pasid %d DQM create queue %d failed. ret %d\n", + pqm->process->pasid, type, retval); + goto err_create_queue; + } + + if (q) + /* Return the doorbell offset within the doorbell page + * to the caller so it can be passed up to user mode + * (in bytes). 
+ */ + properties->doorbell_off = + (q->properties.doorbell_off * sizeof(uint32_t)) & + (kfd_doorbell_process_slice(dev) - 1); + + pr_debug("PQM After DQM create queue\n"); + + list_add(&pqn->process_queue_list, &pqm->queues); + + if (q) { + pr_debug("PQM done creating queue\n"); + print_queue_properties(&q->properties); + } + + return retval; + +err_create_queue: + kfree(pqn); +err_allocate_pqn: + /* check if queues list is empty unregister process from device */ + clear_bit(*qid, pqm->queue_slot_bitmap); + if (list_empty(&pdd->qpd.queues_list) && + list_empty(&pdd->qpd.priv_queue_list)) + dev->dqm->ops.unregister_process(dev->dqm, &pdd->qpd); + return retval; +} + +int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) +{ + struct process_queue_node *pqn; + struct kfd_process_device *pdd; + struct device_queue_manager *dqm; + struct kfd_dev *dev; + int retval; + + dqm = NULL; + + retval = 0; + + pqn = get_queue_by_qid(pqm, qid); + if (!pqn) { + pr_err("Queue id does not match any known queue\n"); + return -EINVAL; + } + + dev = NULL; + if (pqn->kq) + dev = pqn->kq->dev; + if (pqn->q) + dev = pqn->q->device; + if (WARN_ON(!dev)) + return -ENODEV; + + pdd = kfd_get_process_device_data(dev, pqm->process); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + return -1; + } + + if (pqn->kq) { + /* destroy kernel queue (DIQ) */ + dqm = pqn->kq->dev->dqm; + dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd); + kernel_queue_uninit(pqn->kq); + } + + if (pqn->q) { + dqm = pqn->q->device->dqm; + retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); + if (retval) { + pr_err("Pasid %d destroy queue %d failed, ret %d\n", + pqm->process->pasid, + pqn->q->properties.queue_id, retval); + if (retval != -ETIME) + goto err_destroy_queue; + } + kfree(pqn->q->properties.cu_mask); + pqn->q->properties.cu_mask = NULL; + uninit_queue(pqn->q); + } + + list_del(&pqn->process_queue_list); + kfree(pqn); + clear_bit(qid, pqm->queue_slot_bitmap); + + if (list_empty(&pdd->qpd.queues_list) && + list_empty(&pdd->qpd.priv_queue_list)) + dqm->ops.unregister_process(dqm, &pdd->qpd); + +err_destroy_queue: + return retval; +} + +int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p) +{ + int retval; + struct process_queue_node *pqn; + + pqn = get_queue_by_qid(pqm, qid); + if (!pqn) { + pr_debug("No queue %d exists for update operation\n", qid); + return -EFAULT; + } + + pqn->q->properties.queue_address = p->queue_address; + pqn->q->properties.queue_size = p->queue_size; + pqn->q->properties.queue_percent = p->queue_percent; + pqn->q->properties.priority = p->priority; + + retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, + pqn->q); + if (retval != 0) + return retval; + + return 0; +} + +int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p) +{ + int retval; + struct process_queue_node *pqn; + + pqn = get_queue_by_qid(pqm, qid); + if (!pqn) { + pr_debug("No queue %d exists for update operation\n", qid); + return -EFAULT; + } + + /* Free the old CU mask memory if it is already allocated, then + * allocate memory for the new CU mask. 
+ */ + kfree(pqn->q->properties.cu_mask); + + pqn->q->properties.cu_mask_count = p->cu_mask_count; + pqn->q->properties.cu_mask = p->cu_mask; + + retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, + pqn->q); + if (retval != 0) + return retval; + + return 0; +} + +struct kernel_queue *pqm_get_kernel_queue( + struct process_queue_manager *pqm, + unsigned int qid) +{ + struct process_queue_node *pqn; + + pqn = get_queue_by_qid(pqm, qid); + if (pqn && pqn->kq) + return pqn->kq; + + return NULL; +} + +#if defined(CONFIG_DEBUG_FS) + +int pqm_debugfs_mqds(struct seq_file *m, void *data) +{ + struct process_queue_manager *pqm = data; + struct process_queue_node *pqn; + struct queue *q; + enum KFD_MQD_TYPE mqd_type; + struct mqd_manager *mqd_mgr; + int r = 0; + + list_for_each_entry(pqn, &pqm->queues, process_queue_list) { + if (pqn->q) { + q = pqn->q; + switch (q->properties.type) { + case KFD_QUEUE_TYPE_SDMA: + seq_printf(m, " SDMA queue on device %x\n", + q->device->id); + mqd_type = KFD_MQD_TYPE_SDMA; + break; + case KFD_QUEUE_TYPE_COMPUTE: + seq_printf(m, " Compute queue on device %x\n", + q->device->id); + mqd_type = KFD_MQD_TYPE_CP; + break; + default: + seq_printf(m, + " Bad user queue type %d on device %x\n", + q->properties.type, q->device->id); + continue; + } + mqd_mgr = q->device->dqm->ops.get_mqd_manager( + q->device->dqm, mqd_type); + } else if (pqn->kq) { + q = pqn->kq->queue; + mqd_mgr = pqn->kq->mqd_mgr; + switch (q->properties.type) { + case KFD_QUEUE_TYPE_DIQ: + seq_printf(m, " DIQ on device %x\n", + pqn->kq->dev->id); + mqd_type = KFD_MQD_TYPE_HIQ; + break; + default: + seq_printf(m, + " Bad kernel queue type %d on device %x\n", + q->properties.type, + pqn->kq->dev->id); + continue; + } + } else { + seq_printf(m, + " Weird: Queue node with neither kernel nor user queue\n"); + continue; + } + + r = mqd_mgr->debugfs_show_mqd(m, q->mqd); + if (r != 0) + break; + } + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c new file mode 100644 index 000000000..6dcd621e5 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -0,0 +1,83 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include <linux/slab.h> +#include "kfd_priv.h" + +void print_queue_properties(struct queue_properties *q) +{ + if (!q) + return; + + pr_debug("Printing queue properties:\n"); + pr_debug("Queue Type: %u\n", q->type); + pr_debug("Queue Size: %llu\n", q->queue_size); + pr_debug("Queue percent: %u\n", q->queue_percent); + pr_debug("Queue Address: 0x%llX\n", q->queue_address); + pr_debug("Queue Id: %u\n", q->queue_id); + pr_debug("Queue Process Vmid: %u\n", q->vmid); + pr_debug("Queue Read Pointer: 0x%px\n", q->read_ptr); + pr_debug("Queue Write Pointer: 0x%px\n", q->write_ptr); + pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr); + pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off); +} + +void print_queue(struct queue *q) +{ + if (!q) + return; + pr_debug("Printing queue:\n"); + pr_debug("Queue Type: %u\n", q->properties.type); + pr_debug("Queue Size: %llu\n", q->properties.queue_size); + pr_debug("Queue percent: %u\n", q->properties.queue_percent); + pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address); + pr_debug("Queue Id: %u\n", q->properties.queue_id); + pr_debug("Queue Process Vmid: %u\n", q->properties.vmid); + pr_debug("Queue Read Pointer: 0x%px\n", q->properties.read_ptr); + pr_debug("Queue Write Pointer: 0x%px\n", q->properties.write_ptr); + pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr); + pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off); + pr_debug("Queue MQD Address: 0x%p\n", q->mqd); + pr_debug("Queue MQD Gart: 0x%llX\n", q->gart_mqd_addr); + pr_debug("Queue Process Address: 0x%p\n", q->process); + pr_debug("Queue Device Address: 0x%p\n", q->device); +} + +int init_queue(struct queue **q, const struct queue_properties *properties) +{ + struct queue *tmp_q; + + tmp_q = kzalloc(sizeof(*tmp_q), GFP_KERNEL); + if (!tmp_q) + return -ENOMEM; + + memcpy(&tmp_q->properties, properties, sizeof(*properties)); + + *q = tmp_q; + return 0; +} + +void uninit_queue(struct queue *q) +{ + kfree(q); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c new file mode 100644 index 000000000..5cf499a07 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -0,0 +1,1441 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/errno.h> +#include <linux/acpi.h> +#include <linux/hash.h> +#include <linux/cpufreq.h> +#include <linux/log2.h> +#include <linux/dmi.h> +#include <linux/atomic.h> + +#include "kfd_priv.h" +#include "kfd_crat.h" +#include "kfd_topology.h" +#include "kfd_device_queue_manager.h" +#include "kfd_iommu.h" + +/* topology_device_list - Master list of all topology devices */ +static struct list_head topology_device_list; +static struct kfd_system_properties sys_props; + +static DECLARE_RWSEM(topology_lock); +static atomic_t topology_crat_proximity_domain; + +struct kfd_topology_device *kfd_topology_device_by_proximity_domain( + uint32_t proximity_domain) +{ + struct kfd_topology_device *top_dev; + struct kfd_topology_device *device = NULL; + + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) + if (top_dev->proximity_domain == proximity_domain) { + device = top_dev; + break; + } + + up_read(&topology_lock); + + return device; +} + +struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id) +{ + struct kfd_topology_device *top_dev = NULL; + struct kfd_topology_device *ret = NULL; + + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) + if (top_dev->gpu_id == gpu_id) { + ret = top_dev; + break; + } + + up_read(&topology_lock); + + return ret; +} + +struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) +{ + struct kfd_topology_device *top_dev; + + top_dev = kfd_topology_device_by_id(gpu_id); + if (!top_dev) + return NULL; + + return top_dev->gpu; +} + +struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) +{ + struct kfd_topology_device *top_dev; + struct kfd_dev *device = NULL; + + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) + if (top_dev->gpu->pdev == pdev) { + device = top_dev->gpu; + break; + } + + up_read(&topology_lock); + + return device; +} + +/* Called with write topology_lock acquired */ +static void kfd_release_topology_device(struct kfd_topology_device *dev) +{ + struct kfd_mem_properties *mem; + struct kfd_cache_properties *cache; + struct kfd_iolink_properties *iolink; + struct kfd_perf_properties *perf; + + list_del(&dev->list); + + while (dev->mem_props.next != &dev->mem_props) { + mem = container_of(dev->mem_props.next, + struct kfd_mem_properties, list); + list_del(&mem->list); + kfree(mem); + } + + while (dev->cache_props.next != &dev->cache_props) { + cache = container_of(dev->cache_props.next, + struct kfd_cache_properties, list); + list_del(&cache->list); + kfree(cache); + } + + while (dev->io_link_props.next != &dev->io_link_props) { + iolink = container_of(dev->io_link_props.next, + struct kfd_iolink_properties, list); + list_del(&iolink->list); + kfree(iolink); + } + + while (dev->perf_props.next != &dev->perf_props) { + perf = container_of(dev->perf_props.next, + struct kfd_perf_properties, list); + list_del(&perf->list); + kfree(perf); + } + + kfree(dev); +} + +void kfd_release_topology_device_list(struct list_head *device_list) +{ + struct kfd_topology_device *dev; + + while (!list_empty(device_list)) { + dev = list_first_entry(device_list, + struct kfd_topology_device, list); + kfd_release_topology_device(dev); + } +} + +static void kfd_release_live_view(void) +{ + kfd_release_topology_device_list(&topology_device_list); + memset(&sys_props, 0, sizeof(sys_props)); +} + +struct kfd_topology_device *kfd_create_topology_device( + struct 
list_head *device_list) +{ + struct kfd_topology_device *dev; + + dev = kfd_alloc_struct(dev); + if (!dev) { + pr_err("No memory to allocate a topology device"); + return NULL; + } + + INIT_LIST_HEAD(&dev->mem_props); + INIT_LIST_HEAD(&dev->cache_props); + INIT_LIST_HEAD(&dev->io_link_props); + INIT_LIST_HEAD(&dev->perf_props); + + list_add_tail(&dev->list, device_list); + + return dev; +} + + +#define sysfs_show_gen_prop(buffer, fmt, ...) \ + snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) +#define sysfs_show_32bit_prop(buffer, name, value) \ + sysfs_show_gen_prop(buffer, "%s %u\n", name, value) +#define sysfs_show_64bit_prop(buffer, name, value) \ + sysfs_show_gen_prop(buffer, "%s %llu\n", name, value) +#define sysfs_show_32bit_val(buffer, value) \ + sysfs_show_gen_prop(buffer, "%u\n", value) +#define sysfs_show_str_val(buffer, value) \ + sysfs_show_gen_prop(buffer, "%s\n", value) + +static ssize_t sysprops_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + if (attr == &sys_props.attr_genid) { + ret = sysfs_show_32bit_val(buffer, sys_props.generation_count); + } else if (attr == &sys_props.attr_props) { + sysfs_show_64bit_prop(buffer, "platform_oem", + sys_props.platform_oem); + sysfs_show_64bit_prop(buffer, "platform_id", + sys_props.platform_id); + ret = sysfs_show_64bit_prop(buffer, "platform_rev", + sys_props.platform_rev); + } else { + ret = -EINVAL; + } + + return ret; +} + +static void kfd_topology_kobj_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static const struct sysfs_ops sysprops_ops = { + .show = sysprops_show, +}; + +static struct kobj_type sysprops_type = { + .release = kfd_topology_kobj_release, + .sysfs_ops = &sysprops_ops, +}; + +static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + struct kfd_iolink_properties *iolink; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + iolink = container_of(attr, struct kfd_iolink_properties, attr); + sysfs_show_32bit_prop(buffer, "type", iolink->iolink_type); + sysfs_show_32bit_prop(buffer, "version_major", iolink->ver_maj); + sysfs_show_32bit_prop(buffer, "version_minor", iolink->ver_min); + sysfs_show_32bit_prop(buffer, "node_from", iolink->node_from); + sysfs_show_32bit_prop(buffer, "node_to", iolink->node_to); + sysfs_show_32bit_prop(buffer, "weight", iolink->weight); + sysfs_show_32bit_prop(buffer, "min_latency", iolink->min_latency); + sysfs_show_32bit_prop(buffer, "max_latency", iolink->max_latency); + sysfs_show_32bit_prop(buffer, "min_bandwidth", iolink->min_bandwidth); + sysfs_show_32bit_prop(buffer, "max_bandwidth", iolink->max_bandwidth); + sysfs_show_32bit_prop(buffer, "recommended_transfer_size", + iolink->rec_transfer_size); + ret = sysfs_show_32bit_prop(buffer, "flags", iolink->flags); + + return ret; +} + +static const struct sysfs_ops iolink_ops = { + .show = iolink_show, +}; + +static struct kobj_type iolink_type = { + .release = kfd_topology_kobj_release, + .sysfs_ops = &iolink_ops, +}; + +static ssize_t mem_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + struct kfd_mem_properties *mem; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + mem = container_of(attr, struct kfd_mem_properties, attr); + sysfs_show_32bit_prop(buffer, "heap_type", mem->heap_type); + sysfs_show_64bit_prop(buffer, "size_in_bytes", mem->size_in_bytes); + 
sysfs_show_32bit_prop(buffer, "flags", mem->flags); + sysfs_show_32bit_prop(buffer, "width", mem->width); + ret = sysfs_show_32bit_prop(buffer, "mem_clk_max", mem->mem_clk_max); + + return ret; +} + +static const struct sysfs_ops mem_ops = { + .show = mem_show, +}; + +static struct kobj_type mem_type = { + .release = kfd_topology_kobj_release, + .sysfs_ops = &mem_ops, +}; + +static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + uint32_t i, j; + struct kfd_cache_properties *cache; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + cache = container_of(attr, struct kfd_cache_properties, attr); + sysfs_show_32bit_prop(buffer, "processor_id_low", + cache->processor_id_low); + sysfs_show_32bit_prop(buffer, "level", cache->cache_level); + sysfs_show_32bit_prop(buffer, "size", cache->cache_size); + sysfs_show_32bit_prop(buffer, "cache_line_size", cache->cacheline_size); + sysfs_show_32bit_prop(buffer, "cache_lines_per_tag", + cache->cachelines_per_tag); + sysfs_show_32bit_prop(buffer, "association", cache->cache_assoc); + sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency); + sysfs_show_32bit_prop(buffer, "type", cache->cache_type); + snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer); + for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++) + for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) { + /* Check each bit */ + if (cache->sibling_map[i] & (1 << j)) + ret = snprintf(buffer, PAGE_SIZE, + "%s%d%s", buffer, 1, ","); + else + ret = snprintf(buffer, PAGE_SIZE, + "%s%d%s", buffer, 0, ","); + } + /* Replace the last "," with end of line */ + *(buffer + strlen(buffer) - 1) = 0xA; + return ret; +} + +static const struct sysfs_ops cache_ops = { + .show = kfd_cache_show, +}; + +static struct kobj_type cache_type = { + .release = kfd_topology_kobj_release, + .sysfs_ops = &cache_ops, +}; + +/****** Sysfs of Performance Counters ******/ + +struct kfd_perf_attr { + struct kobj_attribute attr; + uint32_t data; +}; + +static ssize_t perf_show(struct kobject *kobj, struct kobj_attribute *attrs, + char *buf) +{ + struct kfd_perf_attr *attr; + + buf[0] = 0; + attr = container_of(attrs, struct kfd_perf_attr, attr); + if (!attr->data) /* invalid data for PMC */ + return 0; + else + return sysfs_show_32bit_val(buf, attr->data); +} + +#define KFD_PERF_DESC(_name, _data) \ +{ \ + .attr = __ATTR(_name, 0444, perf_show, NULL), \ + .data = _data, \ +} + +static struct kfd_perf_attr perf_attr_iommu[] = { + KFD_PERF_DESC(max_concurrent, 0), + KFD_PERF_DESC(num_counters, 0), + KFD_PERF_DESC(counter_ids, 0), +}; +/****************************************/ + +static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + struct kfd_topology_device *dev; + char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; + uint32_t i; + uint32_t log_max_watch_addr; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + if (strcmp(attr->name, "gpu_id") == 0) { + dev = container_of(attr, struct kfd_topology_device, + attr_gpuid); + return sysfs_show_32bit_val(buffer, dev->gpu_id); + } + + if (strcmp(attr->name, "name") == 0) { + dev = container_of(attr, struct kfd_topology_device, + attr_name); + for (i = 0; i < KFD_TOPOLOGY_PUBLIC_NAME_SIZE; i++) { + public_name[i] = + (char)dev->node_props.marketing_name[i]; + if (dev->node_props.marketing_name[i] == 0) + break; + } + public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE-1] = 0x0; + return sysfs_show_str_val(buffer, public_name); + } + + dev = 
container_of(attr, struct kfd_topology_device, + attr_props); + sysfs_show_32bit_prop(buffer, "cpu_cores_count", + dev->node_props.cpu_cores_count); + sysfs_show_32bit_prop(buffer, "simd_count", + dev->node_props.simd_count); + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->node_props.mem_banks_count); + sysfs_show_32bit_prop(buffer, "caches_count", + dev->node_props.caches_count); + sysfs_show_32bit_prop(buffer, "io_links_count", + dev->node_props.io_links_count); + sysfs_show_32bit_prop(buffer, "cpu_core_id_base", + dev->node_props.cpu_core_id_base); + sysfs_show_32bit_prop(buffer, "simd_id_base", + dev->node_props.simd_id_base); + sysfs_show_32bit_prop(buffer, "max_waves_per_simd", + dev->node_props.max_waves_per_simd); + sysfs_show_32bit_prop(buffer, "lds_size_in_kb", + dev->node_props.lds_size_in_kb); + sysfs_show_32bit_prop(buffer, "gds_size_in_kb", + dev->node_props.gds_size_in_kb); + sysfs_show_32bit_prop(buffer, "wave_front_size", + dev->node_props.wave_front_size); + sysfs_show_32bit_prop(buffer, "array_count", + dev->node_props.array_count); + sysfs_show_32bit_prop(buffer, "simd_arrays_per_engine", + dev->node_props.simd_arrays_per_engine); + sysfs_show_32bit_prop(buffer, "cu_per_simd_array", + dev->node_props.cu_per_simd_array); + sysfs_show_32bit_prop(buffer, "simd_per_cu", + dev->node_props.simd_per_cu); + sysfs_show_32bit_prop(buffer, "max_slots_scratch_cu", + dev->node_props.max_slots_scratch_cu); + sysfs_show_32bit_prop(buffer, "vendor_id", + dev->node_props.vendor_id); + sysfs_show_32bit_prop(buffer, "device_id", + dev->node_props.device_id); + sysfs_show_32bit_prop(buffer, "location_id", + dev->node_props.location_id); + sysfs_show_32bit_prop(buffer, "drm_render_minor", + dev->node_props.drm_render_minor); + + if (dev->gpu) { + log_max_watch_addr = + __ilog2_u32(dev->gpu->device_info->num_of_watch_points); + + if (log_max_watch_addr) { + dev->node_props.capability |= + HSA_CAP_WATCH_POINTS_SUPPORTED; + + dev->node_props.capability |= + ((log_max_watch_addr << + HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT) & + HSA_CAP_WATCH_POINTS_TOTALBITS_MASK); + } + + if (dev->gpu->device_info->asic_family == CHIP_TONGA) + dev->node_props.capability |= + HSA_CAP_AQL_QUEUE_DOUBLE_MAP; + + sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", + dev->node_props.max_engine_clk_fcompute); + + sysfs_show_64bit_prop(buffer, "local_mem_size", + (unsigned long long int) 0); + + sysfs_show_32bit_prop(buffer, "fw_version", + dev->gpu->kfd2kgd->get_fw_version( + dev->gpu->kgd, + KGD_ENGINE_MEC1)); + sysfs_show_32bit_prop(buffer, "capability", + dev->node_props.capability); + } + + return sysfs_show_32bit_prop(buffer, "max_engine_clk_ccompute", + cpufreq_quick_get_max(0)/1000); +} + +static const struct sysfs_ops node_ops = { + .show = node_show, +}; + +static struct kobj_type node_type = { + .release = kfd_topology_kobj_release, + .sysfs_ops = &node_ops, +}; + +static void kfd_remove_sysfs_file(struct kobject *kobj, struct attribute *attr) +{ + sysfs_remove_file(kobj, attr); + kobject_del(kobj); + kobject_put(kobj); +} + +static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) +{ + struct kfd_iolink_properties *iolink; + struct kfd_cache_properties *cache; + struct kfd_mem_properties *mem; + struct kfd_perf_properties *perf; + + if (dev->kobj_iolink) { + list_for_each_entry(iolink, &dev->io_link_props, list) + if (iolink->kobj) { + kfd_remove_sysfs_file(iolink->kobj, + &iolink->attr); + iolink->kobj = NULL; + } + kobject_del(dev->kobj_iolink); + 
kobject_put(dev->kobj_iolink); + dev->kobj_iolink = NULL; + } + + if (dev->kobj_cache) { + list_for_each_entry(cache, &dev->cache_props, list) + if (cache->kobj) { + kfd_remove_sysfs_file(cache->kobj, + &cache->attr); + cache->kobj = NULL; + } + kobject_del(dev->kobj_cache); + kobject_put(dev->kobj_cache); + dev->kobj_cache = NULL; + } + + if (dev->kobj_mem) { + list_for_each_entry(mem, &dev->mem_props, list) + if (mem->kobj) { + kfd_remove_sysfs_file(mem->kobj, &mem->attr); + mem->kobj = NULL; + } + kobject_del(dev->kobj_mem); + kobject_put(dev->kobj_mem); + dev->kobj_mem = NULL; + } + + if (dev->kobj_perf) { + list_for_each_entry(perf, &dev->perf_props, list) { + kfree(perf->attr_group); + perf->attr_group = NULL; + } + kobject_del(dev->kobj_perf); + kobject_put(dev->kobj_perf); + dev->kobj_perf = NULL; + } + + if (dev->kobj_node) { + sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid); + sysfs_remove_file(dev->kobj_node, &dev->attr_name); + sysfs_remove_file(dev->kobj_node, &dev->attr_props); + kobject_del(dev->kobj_node); + kobject_put(dev->kobj_node); + dev->kobj_node = NULL; + } +} + +static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, + uint32_t id) +{ + struct kfd_iolink_properties *iolink; + struct kfd_cache_properties *cache; + struct kfd_mem_properties *mem; + struct kfd_perf_properties *perf; + int ret; + uint32_t i, num_attrs; + struct attribute **attrs; + + if (WARN_ON(dev->kobj_node)) + return -EEXIST; + + /* + * Creating the sysfs folders + */ + dev->kobj_node = kfd_alloc_struct(dev->kobj_node); + if (!dev->kobj_node) + return -ENOMEM; + + ret = kobject_init_and_add(dev->kobj_node, &node_type, + sys_props.kobj_nodes, "%d", id); + if (ret < 0) { + kobject_put(dev->kobj_node); + return ret; + } + + dev->kobj_mem = kobject_create_and_add("mem_banks", dev->kobj_node); + if (!dev->kobj_mem) + return -ENOMEM; + + dev->kobj_cache = kobject_create_and_add("caches", dev->kobj_node); + if (!dev->kobj_cache) + return -ENOMEM; + + dev->kobj_iolink = kobject_create_and_add("io_links", dev->kobj_node); + if (!dev->kobj_iolink) + return -ENOMEM; + + dev->kobj_perf = kobject_create_and_add("perf", dev->kobj_node); + if (!dev->kobj_perf) + return -ENOMEM; + + /* + * Creating sysfs files for node properties + */ + dev->attr_gpuid.name = "gpu_id"; + dev->attr_gpuid.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&dev->attr_gpuid); + dev->attr_name.name = "name"; + dev->attr_name.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&dev->attr_name); + dev->attr_props.name = "properties"; + dev->attr_props.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&dev->attr_props); + ret = sysfs_create_file(dev->kobj_node, &dev->attr_gpuid); + if (ret < 0) + return ret; + ret = sysfs_create_file(dev->kobj_node, &dev->attr_name); + if (ret < 0) + return ret; + ret = sysfs_create_file(dev->kobj_node, &dev->attr_props); + if (ret < 0) + return ret; + + i = 0; + list_for_each_entry(mem, &dev->mem_props, list) { + mem->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!mem->kobj) + return -ENOMEM; + ret = kobject_init_and_add(mem->kobj, &mem_type, + dev->kobj_mem, "%d", i); + if (ret < 0) { + kobject_put(mem->kobj); + return ret; + } + + mem->attr.name = "properties"; + mem->attr.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&mem->attr); + ret = sysfs_create_file(mem->kobj, &mem->attr); + if (ret < 0) + return ret; + i++; + } + + i = 0; + list_for_each_entry(cache, &dev->cache_props, list) { + cache->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!cache->kobj) + return -ENOMEM; + 
ret = kobject_init_and_add(cache->kobj, &cache_type, + dev->kobj_cache, "%d", i); + if (ret < 0) { + kobject_put(cache->kobj); + return ret; + } + + cache->attr.name = "properties"; + cache->attr.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&cache->attr); + ret = sysfs_create_file(cache->kobj, &cache->attr); + if (ret < 0) + return ret; + i++; + } + + i = 0; + list_for_each_entry(iolink, &dev->io_link_props, list) { + iolink->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!iolink->kobj) + return -ENOMEM; + ret = kobject_init_and_add(iolink->kobj, &iolink_type, + dev->kobj_iolink, "%d", i); + if (ret < 0) { + kobject_put(iolink->kobj); + return ret; + } + + iolink->attr.name = "properties"; + iolink->attr.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&iolink->attr); + ret = sysfs_create_file(iolink->kobj, &iolink->attr); + if (ret < 0) + return ret; + i++; + } + + /* All hardware blocks have the same number of attributes. */ + num_attrs = ARRAY_SIZE(perf_attr_iommu); + list_for_each_entry(perf, &dev->perf_props, list) { + perf->attr_group = kzalloc(sizeof(struct kfd_perf_attr) + * num_attrs + sizeof(struct attribute_group), + GFP_KERNEL); + if (!perf->attr_group) + return -ENOMEM; + + attrs = (struct attribute **)(perf->attr_group + 1); + if (!strcmp(perf->block_name, "iommu")) { + /* Information of IOMMU's num_counters and counter_ids is shown + * under /sys/bus/event_source/devices/amd_iommu. We don't + * duplicate here. + */ + perf_attr_iommu[0].data = perf->max_concurrent; + for (i = 0; i < num_attrs; i++) + attrs[i] = &perf_attr_iommu[i].attr.attr; + } + perf->attr_group->name = perf->block_name; + perf->attr_group->attrs = attrs; + ret = sysfs_create_group(dev->kobj_perf, perf->attr_group); + if (ret < 0) + return ret; + } + + return 0; +} + +/* Called with write topology lock acquired */ +static int kfd_build_sysfs_node_tree(void) +{ + struct kfd_topology_device *dev; + int ret; + uint32_t i = 0; + + list_for_each_entry(dev, &topology_device_list, list) { + ret = kfd_build_sysfs_node_entry(dev, i); + if (ret < 0) + return ret; + i++; + } + + return 0; +} + +/* Called with write topology lock acquired */ +static void kfd_remove_sysfs_node_tree(void) +{ + struct kfd_topology_device *dev; + + list_for_each_entry(dev, &topology_device_list, list) + kfd_remove_sysfs_node_entry(dev); +} + +static int kfd_topology_update_sysfs(void) +{ + int ret; + + pr_info("Creating topology SYSFS entries\n"); + if (!sys_props.kobj_topology) { + sys_props.kobj_topology = + kfd_alloc_struct(sys_props.kobj_topology); + if (!sys_props.kobj_topology) + return -ENOMEM; + + ret = kobject_init_and_add(sys_props.kobj_topology, + &sysprops_type, &kfd_device->kobj, + "topology"); + if (ret < 0) { + kobject_put(sys_props.kobj_topology); + return ret; + } + + sys_props.kobj_nodes = kobject_create_and_add("nodes", + sys_props.kobj_topology); + if (!sys_props.kobj_nodes) + return -ENOMEM; + + sys_props.attr_genid.name = "generation_id"; + sys_props.attr_genid.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&sys_props.attr_genid); + ret = sysfs_create_file(sys_props.kobj_topology, + &sys_props.attr_genid); + if (ret < 0) + return ret; + + sys_props.attr_props.name = "system_properties"; + sys_props.attr_props.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&sys_props.attr_props); + ret = sysfs_create_file(sys_props.kobj_topology, + &sys_props.attr_props); + if (ret < 0) + return ret; + } + + kfd_remove_sysfs_node_tree(); + + return kfd_build_sysfs_node_tree(); +} + +static void kfd_topology_release_sysfs(void) 
+{ + kfd_remove_sysfs_node_tree(); + if (sys_props.kobj_topology) { + sysfs_remove_file(sys_props.kobj_topology, + &sys_props.attr_genid); + sysfs_remove_file(sys_props.kobj_topology, + &sys_props.attr_props); + if (sys_props.kobj_nodes) { + kobject_del(sys_props.kobj_nodes); + kobject_put(sys_props.kobj_nodes); + sys_props.kobj_nodes = NULL; + } + kobject_del(sys_props.kobj_topology); + kobject_put(sys_props.kobj_topology); + sys_props.kobj_topology = NULL; + } +} + +/* Called with write topology_lock acquired */ +static void kfd_topology_update_device_list(struct list_head *temp_list, + struct list_head *master_list) +{ + while (!list_empty(temp_list)) { + list_move_tail(temp_list->next, master_list); + sys_props.num_devices++; + } +} + +static void kfd_debug_print_topology(void) +{ + struct kfd_topology_device *dev; + + down_read(&topology_lock); + + dev = list_last_entry(&topology_device_list, + struct kfd_topology_device, list); + if (dev) { + if (dev->node_props.cpu_cores_count && + dev->node_props.simd_count) { + pr_info("Topology: Add APU node [0x%0x:0x%0x]\n", + dev->node_props.device_id, + dev->node_props.vendor_id); + } else if (dev->node_props.cpu_cores_count) + pr_info("Topology: Add CPU node\n"); + else if (dev->node_props.simd_count) + pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n", + dev->node_props.device_id, + dev->node_props.vendor_id); + } + up_read(&topology_lock); +} + +/* Helper function for intializing platform_xx members of + * kfd_system_properties. Uses OEM info from the last CPU/APU node. + */ +static void kfd_update_system_properties(void) +{ + struct kfd_topology_device *dev; + + down_read(&topology_lock); + dev = list_last_entry(&topology_device_list, + struct kfd_topology_device, list); + if (dev) { + sys_props.platform_id = + (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK; + sys_props.platform_oem = *((uint64_t *)dev->oem_table_id); + sys_props.platform_rev = dev->oem_revision; + } + up_read(&topology_lock); +} + +static void find_system_memory(const struct dmi_header *dm, + void *private) +{ + struct kfd_mem_properties *mem; + u16 mem_width, mem_clock; + struct kfd_topology_device *kdev = + (struct kfd_topology_device *)private; + const u8 *dmi_data = (const u8 *)(dm + 1); + + if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) { + mem_width = (u16)(*(const u16 *)(dmi_data + 0x6)); + mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11)); + list_for_each_entry(mem, &kdev->mem_props, list) { + if (mem_width != 0xFFFF && mem_width != 0) + mem->width = mem_width; + if (mem_clock != 0) + mem->mem_clk_max = mem_clock; + } + } +} + +/* + * Performance counters information is not part of CRAT but we would like to + * put them in the sysfs under topology directory for Thunk to get the data. + * This function is called before updating the sysfs. + */ +static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev) +{ + /* These are the only counters supported so far */ + return kfd_iommu_add_perf_counters(kdev); +} + +/* kfd_add_non_crat_information - Add information that is not currently + * defined in CRAT but is necessary for KFD topology + * @dev - topology device to which addition info is added + */ +static void kfd_add_non_crat_information(struct kfd_topology_device *kdev) +{ + /* Check if CPU only node. 
*/ + if (!kdev->gpu) { + /* Add system memory information */ + dmi_walk(find_system_memory, kdev); + } + /* TODO: For GPU node, rearrange code from kfd_topology_add_device */ +} + +/* kfd_is_acpi_crat_invalid - CRAT from ACPI is valid only for AMD APU devices. + * Ignore CRAT for all other devices. AMD APU is identified if both CPU + * and GPU cores are present. + * @device_list - topology device list created by parsing ACPI CRAT table. + * @return - TRUE if invalid, FALSE is valid. + */ +static bool kfd_is_acpi_crat_invalid(struct list_head *device_list) +{ + struct kfd_topology_device *dev; + + list_for_each_entry(dev, device_list, list) { + if (dev->node_props.cpu_cores_count && + dev->node_props.simd_count) + return false; + } + pr_info("Ignoring ACPI CRAT on non-APU system\n"); + return true; +} + +int kfd_topology_init(void) +{ + void *crat_image = NULL; + size_t image_size = 0; + int ret; + struct list_head temp_topology_device_list; + int cpu_only_node = 0; + struct kfd_topology_device *kdev; + int proximity_domain; + + /* topology_device_list - Master list of all topology devices + * temp_topology_device_list - temporary list created while parsing CRAT + * or VCRAT. Once parsing is complete the contents of list is moved to + * topology_device_list + */ + + /* Initialize the head for the both the lists */ + INIT_LIST_HEAD(&topology_device_list); + INIT_LIST_HEAD(&temp_topology_device_list); + init_rwsem(&topology_lock); + + memset(&sys_props, 0, sizeof(sys_props)); + + /* Proximity domains in ACPI CRAT tables start counting at + * 0. The same should be true for virtual CRAT tables created + * at this stage. GPUs added later in kfd_topology_add_device + * use a counter. + */ + proximity_domain = 0; + + /* + * Get the CRAT image from the ACPI. If ACPI doesn't have one + * or if ACPI CRAT is invalid create a virtual CRAT. + * NOTE: The current implementation expects all AMD APUs to have + * CRAT. If no CRAT is available, it is assumed to be a CPU + */ + ret = kfd_create_crat_image_acpi(&crat_image, &image_size); + if (!ret) { + ret = kfd_parse_crat_table(crat_image, + &temp_topology_device_list, + proximity_domain); + if (ret || + kfd_is_acpi_crat_invalid(&temp_topology_device_list)) { + kfd_release_topology_device_list( + &temp_topology_device_list); + kfd_destroy_crat_image(crat_image); + crat_image = NULL; + } + } + + if (!crat_image) { + ret = kfd_create_crat_image_virtual(&crat_image, &image_size, + COMPUTE_UNIT_CPU, NULL, + proximity_domain); + cpu_only_node = 1; + if (ret) { + pr_err("Error creating VCRAT table for CPU\n"); + return ret; + } + + ret = kfd_parse_crat_table(crat_image, + &temp_topology_device_list, + proximity_domain); + if (ret) { + pr_err("Error parsing VCRAT table for CPU\n"); + goto err; + } + } + + kdev = list_first_entry(&temp_topology_device_list, + struct kfd_topology_device, list); + kfd_add_perf_to_topology(kdev); + + down_write(&topology_lock); + kfd_topology_update_device_list(&temp_topology_device_list, + &topology_device_list); + atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1); + ret = kfd_topology_update_sysfs(); + up_write(&topology_lock); + + if (!ret) { + sys_props.generation_count++; + kfd_update_system_properties(); + kfd_debug_print_topology(); + pr_info("Finished initializing topology\n"); + } else + pr_err("Failed to update topology in sysfs ret=%d\n", ret); + + /* For nodes with GPU, this information gets added + * when GPU is detected (kfd_topology_add_device). 
+ */ + if (cpu_only_node) { + /* Add additional information to CPU only node created above */ + down_write(&topology_lock); + kdev = list_first_entry(&topology_device_list, + struct kfd_topology_device, list); + up_write(&topology_lock); + kfd_add_non_crat_information(kdev); + } + +err: + kfd_destroy_crat_image(crat_image); + return ret; +} + +void kfd_topology_shutdown(void) +{ + down_write(&topology_lock); + kfd_topology_release_sysfs(); + kfd_release_live_view(); + up_write(&topology_lock); +} + +static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) +{ + uint32_t hashout; + uint32_t buf[7]; + uint64_t local_mem_size; + int i; + struct kfd_local_mem_info local_mem_info; + + if (!gpu) + return 0; + + gpu->kfd2kgd->get_local_mem_info(gpu->kgd, &local_mem_info); + + local_mem_size = local_mem_info.local_mem_size_private + + local_mem_info.local_mem_size_public; + + buf[0] = gpu->pdev->devfn; + buf[1] = gpu->pdev->subsystem_vendor; + buf[2] = gpu->pdev->subsystem_device; + buf[3] = gpu->pdev->device; + buf[4] = gpu->pdev->bus->number; + buf[5] = lower_32_bits(local_mem_size); + buf[6] = upper_32_bits(local_mem_size); + + for (i = 0, hashout = 0; i < 7; i++) + hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH); + + return hashout; +} +/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If + * the GPU device is not already present in the topology device + * list then return NULL. This means a new topology device has to + * be created for this GPU. + */ +static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) +{ + struct kfd_topology_device *dev; + struct kfd_topology_device *out_dev = NULL; + + down_write(&topology_lock); + list_for_each_entry(dev, &topology_device_list, list) { + /* Discrete GPUs need their own topology device list + * entries. Don't assign them to CPU/APU nodes. + */ + if (!gpu->device_info->needs_iommu_device && + dev->node_props.cpu_cores_count) + continue; + + if (!dev->gpu && (dev->node_props.simd_count > 0)) { + dev->gpu = gpu; + out_dev = dev; + break; + } + } + up_write(&topology_lock); + return out_dev; +} + +static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival) +{ + /* + * TODO: Generate an event for thunk about the arrival/removal + * of the GPU + */ +} + +/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info, + * patch this after CRAT parsing. + */ +static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev) +{ + struct kfd_mem_properties *mem; + struct kfd_local_mem_info local_mem_info; + + if (!dev) + return; + + /* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with + * single bank of VRAM local memory. 
+ * for dGPUs - VCRAT reports only one bank of Local Memory + * for APUs - If CRAT from ACPI reports more than one bank, then + * all the banks will report the same mem_clk_max information + */ + dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, + &local_mem_info); + + list_for_each_entry(mem, &dev->mem_props, list) + mem->mem_clk_max = local_mem_info.mem_clk_max; +} + +static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev) +{ + struct kfd_iolink_properties *link; + + if (!dev || !dev->gpu) + return; + + /* GPU only creates direck links so apply flags setting to all */ + if (dev->gpu->device_info->asic_family == CHIP_HAWAII) + list_for_each_entry(link, &dev->io_link_props, list) + link->flags = CRAT_IOLINK_FLAGS_ENABLED | + CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT | + CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT; +} + +int kfd_topology_add_device(struct kfd_dev *gpu) +{ + uint32_t gpu_id; + struct kfd_topology_device *dev; + struct kfd_cu_info cu_info; + int res = 0; + struct list_head temp_topology_device_list; + void *crat_image = NULL; + size_t image_size = 0; + int proximity_domain; + + INIT_LIST_HEAD(&temp_topology_device_list); + + gpu_id = kfd_generate_gpu_id(gpu); + + pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); + + proximity_domain = atomic_inc_return(&topology_crat_proximity_domain); + + /* Check to see if this gpu device exists in the topology_device_list. + * If so, assign the gpu to that device, + * else create a Virtual CRAT for this gpu device and then parse that + * CRAT to create a new topology device. Once created assign the gpu to + * that topology device + */ + dev = kfd_assign_gpu(gpu); + if (!dev) { + res = kfd_create_crat_image_virtual(&crat_image, &image_size, + COMPUTE_UNIT_GPU, gpu, + proximity_domain); + if (res) { + pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", + gpu_id); + return res; + } + res = kfd_parse_crat_table(crat_image, + &temp_topology_device_list, + proximity_domain); + if (res) { + pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", + gpu_id); + goto err; + } + + down_write(&topology_lock); + kfd_topology_update_device_list(&temp_topology_device_list, + &topology_device_list); + + /* Update the SYSFS tree, since we added another topology + * device + */ + res = kfd_topology_update_sysfs(); + up_write(&topology_lock); + + if (!res) + sys_props.generation_count++; + else + pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. 
res=%d\n", + gpu_id, res); + dev = kfd_assign_gpu(gpu); + if (WARN_ON(!dev)) { + res = -ENODEV; + goto err; + } + } + + dev->gpu_id = gpu_id; + gpu->id = gpu_id; + + /* TODO: Move the following lines to function + * kfd_add_non_crat_information + */ + + /* Fill-in additional information that is not available in CRAT but + * needed for the topology + */ + + dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info); + dev->node_props.simd_arrays_per_engine = + cu_info.num_shader_arrays_per_engine; + + dev->node_props.vendor_id = gpu->pdev->vendor; + dev->node_props.device_id = gpu->pdev->device; + dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number, + gpu->pdev->devfn); + dev->node_props.max_engine_clk_fcompute = + dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd); + dev->node_props.max_engine_clk_ccompute = + cpufreq_quick_get_max(0) / 1000; + dev->node_props.drm_render_minor = + gpu->shared_resources.drm_render_minor; + + kfd_fill_mem_clk_max_info(dev); + kfd_fill_iolink_non_crat_info(dev); + + switch (dev->gpu->device_info->asic_family) { + case CHIP_KAVERI: + case CHIP_HAWAII: + case CHIP_TONGA: + dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 << + HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & + HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); + break; + case CHIP_CARRIZO: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + pr_debug("Adding doorbell packet type capability\n"); + dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 << + HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & + HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); + break; + case CHIP_VEGA10: + case CHIP_RAVEN: + dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << + HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & + HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); + break; + default: + WARN(1, "Unexpected ASIC family %u", + dev->gpu->device_info->asic_family); + } + + /* Fix errors in CZ CRAT. + * simd_count: Carrizo CRAT reports wrong simd_count, probably + * because it doesn't consider masked out CUs + * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd + * capability flag: Carrizo CRAT doesn't report IOMMU flags + */ + if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { + dev->node_props.simd_count = + cu_info.simd_per_cu * cu_info.cu_active_number; + dev->node_props.max_waves_per_simd = 10; + dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + } + + kfd_debug_print_topology(); + + if (!res) + kfd_notify_gpu_change(gpu_id, 1); +err: + kfd_destroy_crat_image(crat_image); + return res; +} + +int kfd_topology_remove_device(struct kfd_dev *gpu) +{ + struct kfd_topology_device *dev, *tmp; + uint32_t gpu_id; + int res = -ENODEV; + + down_write(&topology_lock); + + list_for_each_entry_safe(dev, tmp, &topology_device_list, list) + if (dev->gpu == gpu) { + gpu_id = dev->gpu_id; + kfd_remove_sysfs_node_entry(dev); + kfd_release_topology_device(dev); + sys_props.num_devices--; + res = 0; + if (kfd_topology_update_sysfs() < 0) + kfd_topology_release_sysfs(); + break; + } + + up_write(&topology_lock); + + if (!res) + kfd_notify_gpu_change(gpu_id, 0); + + return res; +} + +/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD + * topology. 
If GPU device is found @idx, then valid kfd_dev pointer is + * returned through @kdev + * Return - 0: On success (@kdev will be NULL for non GPU nodes) + * -1: If end of list + */ +int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev) +{ + + struct kfd_topology_device *top_dev; + uint8_t device_idx = 0; + + *kdev = NULL; + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) { + if (device_idx == idx) { + *kdev = top_dev->gpu; + up_read(&topology_lock); + return 0; + } + + device_idx++; + } + + up_read(&topology_lock); + + return -1; + +} + +static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask) +{ + const struct cpuinfo_x86 *cpuinfo; + int first_cpu_of_numa_node; + + if (!cpumask || cpumask == cpu_none_mask) + return -1; + first_cpu_of_numa_node = cpumask_first(cpumask); + if (first_cpu_of_numa_node >= nr_cpu_ids) + return -1; + cpuinfo = &cpu_data(first_cpu_of_numa_node); + + return cpuinfo->apicid; +} + +/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical processor + * of the given NUMA node (numa_node_id) + * Return -1 on failure + */ +int kfd_numa_node_to_apic_id(int numa_node_id) +{ + if (numa_node_id == -1) { + pr_warn("Invalid NUMA Node. Use online CPU mask\n"); + return kfd_cpumask_to_apic_id(cpu_online_mask); + } + return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id)); +} + +#if defined(CONFIG_DEBUG_FS) + +int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data) +{ + struct kfd_topology_device *dev; + unsigned int i = 0; + int r = 0; + + down_read(&topology_lock); + + list_for_each_entry(dev, &topology_device_list, list) { + if (!dev->gpu) { + i++; + continue; + } + + seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); + r = dqm_debugfs_hqds(m, dev->gpu->dqm); + if (r) + break; + } + + up_read(&topology_lock); + + return r; +} + +int kfd_debugfs_rls_by_device(struct seq_file *m, void *data) +{ + struct kfd_topology_device *dev; + unsigned int i = 0; + int r = 0; + + down_read(&topology_lock); + + list_for_each_entry(dev, &topology_device_list, list) { + if (!dev->gpu) { + i++; + continue; + } + + seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); + r = pm_debugfs_runlist(m, &dev->gpu->dqm->packets); + if (r) + break; + } + + up_read(&topology_lock); + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h new file mode 100644 index 000000000..7d9c3f948 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -0,0 +1,188 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef __KFD_TOPOLOGY_H__ +#define __KFD_TOPOLOGY_H__ + +#include <linux/types.h> +#include <linux/list.h> +#include "kfd_crat.h" + +#define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 128 + +#define HSA_CAP_HOT_PLUGGABLE 0x00000001 +#define HSA_CAP_ATS_PRESENT 0x00000002 +#define HSA_CAP_SHARED_WITH_GRAPHICS 0x00000004 +#define HSA_CAP_QUEUE_SIZE_POW2 0x00000008 +#define HSA_CAP_QUEUE_SIZE_32BIT 0x00000010 +#define HSA_CAP_QUEUE_IDLE_EVENT 0x00000020 +#define HSA_CAP_VA_LIMIT 0x00000040 +#define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080 +#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 +#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 +#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK 0x00003000 +#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12 +#define HSA_CAP_RESERVED 0xffffc000 + +#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 +#define HSA_CAP_DOORBELL_TYPE_1_0 0x1 +#define HSA_CAP_DOORBELL_TYPE_2_0 0x2 +#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 + +struct kfd_node_properties { + uint32_t cpu_cores_count; + uint32_t simd_count; + uint32_t mem_banks_count; + uint32_t caches_count; + uint32_t io_links_count; + uint32_t cpu_core_id_base; + uint32_t simd_id_base; + uint32_t capability; + uint32_t max_waves_per_simd; + uint32_t lds_size_in_kb; + uint32_t gds_size_in_kb; + uint32_t wave_front_size; + uint32_t array_count; + uint32_t simd_arrays_per_engine; + uint32_t cu_per_simd_array; + uint32_t simd_per_cu; + uint32_t max_slots_scratch_cu; + uint32_t engine_id; + uint32_t vendor_id; + uint32_t device_id; + uint32_t location_id; + uint32_t max_engine_clk_fcompute; + uint32_t max_engine_clk_ccompute; + int32_t drm_render_minor; + uint16_t marketing_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; +}; + +#define HSA_MEM_HEAP_TYPE_SYSTEM 0 +#define HSA_MEM_HEAP_TYPE_FB_PUBLIC 1 +#define HSA_MEM_HEAP_TYPE_FB_PRIVATE 2 +#define HSA_MEM_HEAP_TYPE_GPU_GDS 3 +#define HSA_MEM_HEAP_TYPE_GPU_LDS 4 +#define HSA_MEM_HEAP_TYPE_GPU_SCRATCH 5 + +#define HSA_MEM_FLAGS_HOT_PLUGGABLE 0x00000001 +#define HSA_MEM_FLAGS_NON_VOLATILE 0x00000002 +#define HSA_MEM_FLAGS_RESERVED 0xfffffffc + +struct kfd_mem_properties { + struct list_head list; + uint32_t heap_type; + uint64_t size_in_bytes; + uint32_t flags; + uint32_t width; + uint32_t mem_clk_max; + struct kobject *kobj; + struct attribute attr; +}; + +#define HSA_CACHE_TYPE_DATA 0x00000001 +#define HSA_CACHE_TYPE_INSTRUCTION 0x00000002 +#define HSA_CACHE_TYPE_CPU 0x00000004 +#define HSA_CACHE_TYPE_HSACU 0x00000008 +#define HSA_CACHE_TYPE_RESERVED 0xfffffff0 + +struct kfd_cache_properties { + struct list_head list; + uint32_t processor_id_low; + uint32_t cache_level; + uint32_t cache_size; + uint32_t cacheline_size; + uint32_t cachelines_per_tag; + uint32_t cache_assoc; + uint32_t cache_latency; + uint32_t cache_type; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + struct kobject *kobj; + struct attribute attr; +}; + +struct kfd_iolink_properties { + struct list_head list; + uint32_t iolink_type; + uint32_t ver_maj; + uint32_t ver_min; + uint32_t node_from; + uint32_t node_to; + uint32_t weight; + uint32_t min_latency; + uint32_t max_latency; + uint32_t min_bandwidth; + uint32_t max_bandwidth; + uint32_t rec_transfer_size; + uint32_t flags; + struct kobject *kobj; + struct attribute attr; +}; + +struct 
kfd_perf_properties { + struct list_head list; + char block_name[16]; + uint32_t max_concurrent; + struct attribute_group *attr_group; +}; + +struct kfd_topology_device { + struct list_head list; + uint32_t gpu_id; + uint32_t proximity_domain; + struct kfd_node_properties node_props; + struct list_head mem_props; + uint32_t cache_count; + struct list_head cache_props; + uint32_t io_link_count; + struct list_head io_link_props; + struct list_head perf_props; + struct kfd_dev *gpu; + struct kobject *kobj_node; + struct kobject *kobj_mem; + struct kobject *kobj_cache; + struct kobject *kobj_iolink; + struct kobject *kobj_perf; + struct attribute attr_gpuid; + struct attribute attr_name; + struct attribute attr_props; + uint8_t oem_id[CRAT_OEMID_LENGTH]; + uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; + uint32_t oem_revision; +}; + +struct kfd_system_properties { + uint32_t num_devices; /* Number of H-NUMA nodes */ + uint32_t generation_count; + uint64_t platform_oem; + uint64_t platform_id; + uint64_t platform_rev; + struct kobject *kobj_topology; + struct kobject *kobj_nodes; + struct attribute attr_genid; + struct attribute attr_props; +}; + +struct kfd_topology_device *kfd_create_topology_device( + struct list_head *device_list); +void kfd_release_topology_device_list(struct list_head *device_list); + +#endif /* __KFD_TOPOLOGY_H__ */ diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h new file mode 100644 index 000000000..0bc0b25cb --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h @@ -0,0 +1,47 @@ +/* + * Copyright 2016-2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef HSA_SOC15_INT_H_INCLUDED +#define HSA_SOC15_INT_H_INCLUDED + +#include "soc15_ih_clientid.h" + +#define SOC15_INTSRC_CP_END_OF_PIPE 181 +#define SOC15_INTSRC_CP_BAD_OPCODE 183 +#define SOC15_INTSRC_SQ_INTERRUPT_MSG 239 +#define SOC15_INTSRC_VMC_FAULT 0 +#define SOC15_INTSRC_SDMA_TRAP 224 + + +#define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff) +#define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff) +#define SOC15_RING_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 16 & 0xff) +#define SOC15_VMID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 24 & 0xf) +#define SOC15_VMID_TYPE_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 31 & 0x1) +#define SOC15_PASID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[3]) & 0xffff) +#define SOC15_CONTEXT_ID0_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[4])) +#define SOC15_CONTEXT_ID1_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[5])) +#define SOC15_CONTEXT_ID2_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[6])) +#define SOC15_CONTEXT_ID3_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[7])) + +#endif + |
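As an illustration (not part of the patch itself): kfd_topology.c above publishes each node's attributes through sysfs as plain "name value" lines (see node_show() and the sysfs_show_32bit_prop() calls). A minimal user-space sketch of reading them follows; the /sys/class/kfd/kfd/topology path is an assumption based on the usual kfd character-device name and is not defined in this diff.

/* Sketch only: dump the first topology node's "properties" file.
 * The sysfs path is assumed, not taken from this patch; adjust it if
 * the kfd device is registered under a different name.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/class/kfd/kfd/topology/nodes/0/properties";
	char name[64];
	unsigned long long value;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	/* Each line has the form "<property_name> <decimal value>". */
	while (fscanf(f, "%63s %llu", name, &value) == 2)
		printf("%s = %llu\n", name, value);
	fclose(f);
	return 0;
}

This is roughly how the Thunk mentioned in the comments above consumes the topology data exposed by this file.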
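As a second illustration (again not part of the patch): the SOC15_*_FROM_IH_ENTRY macros above unpack fields from the first dwords of an 8-dword interrupt-ring entry. The standalone sketch below mirrors that bit layout on a little-endian host (where le32_to_cpu() is a no-op); all sample values are made up except the source id, which reuses SOC15_INTSRC_CP_END_OF_PIPE (181) from this header.

/* Sketch only: same field layout as the macros above, with fabricated
 * sample values, compiled as an ordinary user-space program.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t entry[8] = { 0 };

	/* dword 0 packs client_id[7:0], source_id[15:8], ring_id[23:16],
	 * vmid[27:24] and vmid_type[31].
	 */
	entry[0] = 0x16 | (181u << 8) | (0u << 16) | (8u << 24);
	entry[3] = 0x0042;		/* dword 3: pasid[15:0] */
	entry[4] = 0xdeadbeef;		/* dword 4: context id 0 */

	printf("client 0x%x source %u ring %u vmid %u pasid 0x%x ctx0 0x%x\n",
	       entry[0] & 0xff,		/* SOC15_CLIENT_ID_FROM_IH_ENTRY   */
	       (entry[0] >> 8) & 0xff,	/* SOC15_SOURCE_ID_FROM_IH_ENTRY   */
	       (entry[0] >> 16) & 0xff,	/* SOC15_RING_ID_FROM_IH_ENTRY     */
	       (entry[0] >> 24) & 0xf,	/* SOC15_VMID_FROM_IH_ENTRY        */
	       entry[3] & 0xffff,	/* SOC15_PASID_FROM_IH_ENTRY       */
	       entry[4]);		/* SOC15_CONTEXT_ID0_FROM_IH_ENTRY */
	return 0;
}

In the driver the same decoding is applied to the raw ih_ring_entry dwords handed to the KFD interrupt handlers, rather than to a local array as in this sketch.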