Diffstat (limited to 'arch/powerpc/platforms/powernv/npu-dma.c')
-rw-r--r-- | arch/powerpc/platforms/powernv/npu-dma.c | 705
1 file changed, 705 insertions, 0 deletions
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
new file mode 100644
index 000000000..b711dc326
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -0,0 +1,705 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This file implements the DMA operations for NVLink devices. The NPU
+ * devices all point to the same iommu table as the parent PCI device.
+ *
+ * Copyright Alistair Popple, IBM Corporation 2015.
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/mmu_context.h>
+#include <linux/of.h>
+#include <linux/pci.h>
+#include <linux/memblock.h>
+#include <linux/sizes.h>
+
+#include <asm/debugfs.h>
+#include <asm/powernv.h>
+#include <asm/ppc-pci.h>
+#include <asm/opal.h>
+
+#include "pci.h"
+
+static struct pci_dev *get_pci_dev(struct device_node *dn)
+{
+	struct pci_dn *pdn = PCI_DN(dn);
+	struct pci_dev *pdev;
+
+	pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
+					   pdn->busno, pdn->devfn);
+
+	/*
+	 * pci_get_domain_bus_and_slot() increased the reference count of
+	 * the PCI device, but callers don't need that actually as the PE
+	 * already holds a reference to the device. Since callers aren't
+	 * aware of the reference count change, call pci_dev_put() now to
+	 * avoid leaks.
+	 */
+	if (pdev)
+		pci_dev_put(pdev);
+
+	return pdev;
+}
+
+/* Given an NPU device get the associated PCI device. */
+struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
+{
+	struct device_node *dn;
+	struct pci_dev *gpdev;
+
+	if (WARN_ON(!npdev))
+		return NULL;
+
+	if (WARN_ON(!npdev->dev.of_node))
+		return NULL;
+
+	/* Get associated PCI device */
+	dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
+	if (!dn)
+		return NULL;
+
+	gpdev = get_pci_dev(dn);
+	of_node_put(dn);
+
+	return gpdev;
+}
+EXPORT_SYMBOL(pnv_pci_get_gpu_dev);
+
+/* Given the real PCI device get a linked NPU device. */
+struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
+{
+	struct device_node *dn;
+	struct pci_dev *npdev;
+
+	if (WARN_ON(!gpdev))
+		return NULL;
+
+	/* Not all PCI devices have device-tree nodes */
+	if (!gpdev->dev.of_node)
+		return NULL;
+
+	/* Get associated PCI device */
+	dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
+	if (!dn)
+		return NULL;
+
+	npdev = get_pci_dev(dn);
+	of_node_put(dn);
+
+	return npdev;
+}
+EXPORT_SYMBOL(pnv_pci_get_npu_dev);
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * Returns the PE associated with the PCI device of the given
+ * NPU. Returns the linked pci device if pci_dev != NULL.
+ */
+static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
+						  struct pci_dev **gpdev)
+{
+	struct pnv_phb *phb;
+	struct pci_controller *hose;
+	struct pci_dev *pdev;
+	struct pnv_ioda_pe *pe;
+	struct pci_dn *pdn;
+
+	pdev = pnv_pci_get_gpu_dev(npe->pdev);
+	if (!pdev)
+		return NULL;
+
+	pdn = pci_get_pdn(pdev);
+	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+		return NULL;
+
+	hose = pci_bus_to_host(pdev->bus);
+	phb = hose->private_data;
+	pe = &phb->ioda.pe_array[pdn->pe_number];
+
+	if (gpdev)
+		*gpdev = pdev;
+
+	return pe;
+}
+
+static long pnv_npu_unset_window(struct iommu_table_group *table_group,
+				 int num);
+
+static long pnv_npu_set_window(struct iommu_table_group *table_group, int num,
+			       struct iommu_table *tbl)
+{
+	struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+					       table_group);
+	struct pnv_phb *phb = npe->phb;
+	int64_t rc;
+	const unsigned long size = tbl->it_indirect_levels ?
+		tbl->it_level_size : tbl->it_size;
+	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
+	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+	int num2 = (num == 0) ? 1 : 0;
+
+	/* NPU has just one TVE so if there is another table, remove it first */
+	if (npe->table_group.tables[num2])
+		pnv_npu_unset_window(&npe->table_group, num2);
+
+	pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
+		start_addr, start_addr + win_size - 1,
+		IOMMU_PAGE_SIZE(tbl));
+
+	rc = opal_pci_map_pe_dma_window(phb->opal_id,
+					npe->pe_number,
+					npe->pe_number,
+					tbl->it_indirect_levels + 1,
+					__pa(tbl->it_base),
+					size << 3,
+					IOMMU_PAGE_SIZE(tbl));
+	if (rc) {
+		pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
+		return rc;
+	}
+	pnv_pci_ioda2_tce_invalidate_entire(phb, false);
+
+	/* Add the table to the list so its TCE cache will get invalidated */
+	pnv_pci_link_table_and_group(phb->hose->node, num,
+				     tbl, &npe->table_group);
+
+	return 0;
+}
+
+static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
+{
+	struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+					       table_group);
+	struct pnv_phb *phb = npe->phb;
+	int64_t rc;
+
+	if (!npe->table_group.tables[num])
+		return 0;
+
+	pe_info(npe, "Removing DMA window\n");
+
+	rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
+					npe->pe_number,
+					0/* levels */, 0/* table address */,
+					0/* table size */, 0/* page size */);
+	if (rc) {
+		pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
+		return rc;
+	}
+	pnv_pci_ioda2_tce_invalidate_entire(phb, false);
+
+	pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
+				       &npe->table_group);
+
+	return 0;
+}
+
+/* Switch ownership from platform code to external user (e.g. VFIO) */
+static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
+{
+	struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+					       table_group);
+	struct pnv_phb *phb = npe->phb;
+	int64_t rc;
+	struct pci_dev *gpdev = NULL;
+
+	/*
+	 * Note: NPU has just a single TVE in the hardware, which means that
+	 * while used by the kernel it can have either a 32bit window or
+	 * DMA bypass but never both. So we deconfigure the 32bit window
+	 * only if it was enabled at the moment of ownership change.
+	 */
+	if (npe->table_group.tables[0]) {
+		pnv_npu_unset_window(&npe->table_group, 0);
+		return;
+	}
+
+	/* Disable bypass */
+	rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
+					     npe->pe_number, npe->pe_number,
+					     0 /* bypass base */, 0);
+	if (rc) {
+		pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
+		return;
+	}
+	pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
+
+	get_gpu_pci_dev_and_pe(npe, &gpdev);
+	if (gpdev)
+		pnv_npu2_unmap_lpar_dev(gpdev);
+}
+
+static void pnv_npu_release_ownership(struct iommu_table_group *table_group)
+{
+	struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+					       table_group);
+	struct pci_dev *gpdev = NULL;
+
+	get_gpu_pci_dev_and_pe(npe, &gpdev);
+	if (gpdev)
+		pnv_npu2_map_lpar_dev(gpdev, 0, MSR_DR | MSR_PR | MSR_HV);
+}
+
+static struct iommu_table_group_ops pnv_pci_npu_ops = {
+	.set_window = pnv_npu_set_window,
+	.unset_window = pnv_npu_unset_window,
+	.take_ownership = pnv_npu_take_ownership,
+	.release_ownership = pnv_npu_release_ownership,
+};
+#endif /* !CONFIG_IOMMU_API */
+
+/*
+ * NPU2 ATS
+ */
+/* Maximum possible number of ATSD MMIO registers per NPU */
+#define NV_NMMU_ATSD_REGS	8
+#define NV_NPU_MAX_PE_NUM	16
+
+/*
+ * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or
+ * up to 3 x (GPU + 2xNPUs) (POWER9).
+ */
+struct npu_comp {
+	struct iommu_table_group table_group;
+	int pe_num;
+	struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM];
+};
+
+/* An NPU descriptor, valid for POWER9 only */
+struct npu {
+	int index;
+	struct npu_comp npucomp;
+};
+
+#ifdef CONFIG_IOMMU_API
+static long pnv_npu_peers_create_table_userspace(
+		struct iommu_table_group *table_group,
+		int num, __u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table **ptbl)
+{
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+						table_group);
+
+	if (!npucomp->pe_num || !npucomp->pe[0] ||
+	    !npucomp->pe[0]->table_group.ops ||
+	    !npucomp->pe[0]->table_group.ops->create_table)
+		return -EFAULT;
+
+	return npucomp->pe[0]->table_group.ops->create_table(
+			&npucomp->pe[0]->table_group, num, page_shift,
+			window_size, levels, ptbl);
+}
+
+static long pnv_npu_peers_set_window(struct iommu_table_group *table_group,
+				     int num, struct iommu_table *tbl)
+{
+	int i, j;
+	long ret = 0;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+						table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		if (!pe->table_group.ops->set_window)
+			continue;
+
+		ret = pe->table_group.ops->set_window(&pe->table_group,
+				num, tbl);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		for (j = 0; j < i; ++j) {
+			struct pnv_ioda_pe *pe = npucomp->pe[j];
+
+			if (!pe->table_group.ops->unset_window)
+				continue;
+
+			ret = pe->table_group.ops->unset_window(
+					&pe->table_group, num);
+			if (ret)
+				break;
+		}
+	} else {
+		table_group->tables[num] = iommu_tce_table_get(tbl);
+	}
+
+	return ret;
+}
+
+static long pnv_npu_peers_unset_window(struct iommu_table_group *table_group,
+				       int num)
+{
+	int i, j;
+	long ret = 0;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+						table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		WARN_ON(npucomp->table_group.tables[num] !=
+			table_group->tables[num]);
+		if (!npucomp->table_group.tables[num])
+			continue;
+
+		if (!pe->table_group.ops->unset_window)
+			continue;
+
+		ret = pe->table_group.ops->unset_window(&pe->table_group, num);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		for (j = 0; j < i; ++j) {
+			struct pnv_ioda_pe *pe = npucomp->pe[j];
+
+			if (!npucomp->table_group.tables[num])
+				continue;
+
+			if (!pe->table_group.ops->set_window)
+				continue;
+
+			ret = pe->table_group.ops->set_window(&pe->table_group,
+					num, table_group->tables[num]);
+			if (ret)
+				break;
+		}
+	} else if (table_group->tables[num]) {
+		iommu_tce_table_put(table_group->tables[num]);
+		table_group->tables[num] = NULL;
+	}
+
+	return ret;
+}
+
+static void pnv_npu_peers_take_ownership(struct iommu_table_group *table_group)
+{
+	int i;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+						table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		if (!pe->table_group.ops ||
+		    !pe->table_group.ops->take_ownership)
+			continue;
+		pe->table_group.ops->take_ownership(&pe->table_group);
+	}
+}
+
+static void pnv_npu_peers_release_ownership(
+		struct iommu_table_group *table_group)
+{
+	int i;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+						table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		if (!pe->table_group.ops ||
+		    !pe->table_group.ops->release_ownership)
+			continue;
+		pe->table_group.ops->release_ownership(&pe->table_group);
+	}
+}
+
+static struct iommu_table_group_ops pnv_npu_peers_ops = {
+	.get_table_size = pnv_pci_ioda2_get_table_size,
+	.create_table = pnv_npu_peers_create_table_userspace,
+	.set_window = pnv_npu_peers_set_window,
+	.unset_window = pnv_npu_peers_unset_window,
+	.take_ownership = pnv_npu_peers_take_ownership,
+	.release_ownership = pnv_npu_peers_release_ownership,
+};
+
+static void pnv_comp_attach_table_group(struct npu_comp *npucomp,
+					struct pnv_ioda_pe *pe)
+{
+	if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM))
+		return;
+
+	npucomp->pe[npucomp->pe_num] = pe;
+	++npucomp->pe_num;
+}
+
+static struct iommu_table_group *
+	pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
+{
+	struct iommu_table_group *compound_group;
+	struct npu_comp *npucomp;
+	struct pci_dev *gpdev = NULL;
+	struct pci_controller *hose;
+	struct pci_dev *npdev = NULL;
+
+	list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
+		npdev = pnv_pci_get_npu_dev(gpdev, 0);
+		if (npdev)
+			break;
+	}
+
+	if (!npdev)
+		/* It is not an NPU attached device, skip */
+		return NULL;
+
+	hose = pci_bus_to_host(npdev->bus);
+
+	if (hose->npu) {
+		/* P9 case: compound group is per-NPU (all GPUs, all links) */
+		npucomp = &hose->npu->npucomp;
+	} else {
+		/* P8 case: compound group is per-GPU (1 GPU, 2 links) */
+		npucomp = pe->npucomp = kzalloc(sizeof(*npucomp), GFP_KERNEL);
+	}
+
+	compound_group = &npucomp->table_group;
+	if (!compound_group->group) {
+		compound_group->ops = &pnv_npu_peers_ops;
+		iommu_register_group(compound_group, hose->global_number,
+				     pe->pe_number);
+
+		/* Steal capabilities from a GPU PE */
+		compound_group->max_dynamic_windows_supported =
+			pe->table_group.max_dynamic_windows_supported;
+		compound_group->tce32_start = pe->table_group.tce32_start;
+		compound_group->tce32_size = pe->table_group.tce32_size;
+		compound_group->max_levels = pe->table_group.max_levels;
+		if (!compound_group->pgsizes)
+			compound_group->pgsizes = pe->table_group.pgsizes;
+	}
+
+	/*
+	 * The GPU would have been added to the iommu group that's created
+	 * for the PE. Pull it out now.
+	 */
+	iommu_del_device(&gpdev->dev);
+
+	/*
+	 * I'm not sure this is strictly required, but it's probably a good
+	 * idea since the table_group for the PE is going to be attached to
+	 * the compound table group. If we leave the PE's iommu group active
+	 * then we might have the same table_group being modifiable via two
+	 * separate iommu groups.
+	 */
+	iommu_group_put(pe->table_group.group);
+
+	/* Now put the GPU into the compound group */
+	pnv_comp_attach_table_group(npucomp, pe);
+	iommu_add_device(compound_group, &gpdev->dev);
+
+	return compound_group;
+}
+
+static struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
+{
+	struct iommu_table_group *table_group;
+	struct npu_comp *npucomp;
+	struct pci_dev *gpdev = NULL;
+	struct pci_dev *npdev;
+	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(pe, &gpdev);
+
+	WARN_ON(!(pe->flags & PNV_IODA_PE_DEV));
+	if (!gpe)
+		return NULL;
+
+	/*
+	 * IODA2 bridges get this set up from pci_controller_ops::setup_bridge
+	 * but NPU bridges do not have this hook defined so we do it here.
+	 * We do not set up other table group parameters as they won't be
+	 * used anyway - NVLink bridges are subordinate PEs.
+	 */
+	pe->table_group.ops = &pnv_pci_npu_ops;
+
+	table_group = iommu_group_get_iommudata(
+			iommu_group_get(&gpdev->dev));
+
+	/*
+	 * On P9 the NPU PHB and the PCI PHB support different page sizes,
+	 * so keep only the matching ones. We expect here that the NVLink
+	 * bridge PE's pgsizes is initialized by the caller.
+	 */
+	table_group->pgsizes &= pe->table_group.pgsizes;
+	npucomp = container_of(table_group, struct npu_comp, table_group);
+	pnv_comp_attach_table_group(npucomp, pe);
+
+	list_for_each_entry(npdev, &pe->phb->hose->bus->devices, bus_list) {
+		struct pci_dev *gpdevtmp = pnv_pci_get_gpu_dev(npdev);
+
+		if (gpdevtmp != gpdev)
+			continue;
+
+		iommu_add_device(table_group, &npdev->dev);
+	}
+
+	return table_group;
+}
+
+void pnv_pci_npu_setup_iommu_groups(void)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	struct pnv_ioda_pe *pe;
+
+	/*
+	 * For non-NVLink devices the IOMMU group is registered when the PE is
+	 * configured and devices are added to the group when the per-device
+	 * DMA setup is run. That's done in hose->ops.dma_dev_setup() which is
+	 * only initialised for "normal" IODA PHBs.
+	 *
+	 * For NVLink devices we need to ensure the NVLinks and the GPU end up
+	 * in the same IOMMU group, so that's handled here.
+	 */
+	list_for_each_entry(hose, &hose_list, list_node) {
+		phb = hose->private_data;
+
+		if (phb->type == PNV_PHB_IODA2)
+			list_for_each_entry(pe, &phb->ioda.pe_list, list)
+				pnv_try_setup_npu_table_group(pe);
+	}
+
+	/*
+	 * Now we have all PHBs discovered, time to add NPU devices to
+	 * the corresponding IOMMU groups.
+	 */
+	list_for_each_entry(hose, &hose_list, list_node) {
+		unsigned long pgsizes;
+
+		phb = hose->private_data;
+
+		if (phb->type != PNV_PHB_NPU_NVLINK)
+			continue;
+
+		pgsizes = pnv_ioda_parse_tce_sizes(phb);
+		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+			/*
+			 * IODA2 bridges get this set up from
+			 * pci_controller_ops::setup_bridge but NPU bridges
+			 * do not have this hook defined so we do it here.
+			 */
+			pe->table_group.pgsizes = pgsizes;
+			pnv_npu_compound_attach(pe);
+		}
+	}
+}
+#endif /* CONFIG_IOMMU_API */
+
+int pnv_npu2_init(struct pci_controller *hose)
+{
+	static int npu_index;
+	struct npu *npu;
+	int ret;
+
+	npu = kzalloc(sizeof(*npu), GFP_KERNEL);
+	if (!npu)
+		return -ENOMEM;
+
+	npu_index++;
+	if (WARN_ON(npu_index >= NV_MAX_NPUS)) {
+		ret = -ENOSPC;
+		goto fail_exit;
+	}
+	npu->index = npu_index;
+	hose->npu = npu;
+
+	return 0;
+
+fail_exit:
+	kfree(npu);
+	return ret;
+}
+
+int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
+			  unsigned long msr)
+{
+	int ret;
+	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+	struct pci_controller *hose;
+	struct pnv_phb *nphb;
+
+	if (!npdev)
+		return -ENODEV;
+
+	hose = pci_bus_to_host(npdev->bus);
+	if (hose->npu == NULL) {
+		dev_info_once(&npdev->dev, "Nvlink1 does not support contexts");
+		return 0;
+	}
+
+	nphb = hose->private_data;
+
+	dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=%u\n",
+		nphb->opal_id, lparid);
+	/*
+	 * Currently we only support radix, and a non-zero LPCR only makes
+	 * sense for hash tables, so skiboot expects the LPCR parameter to
+	 * be zero.
+	 */
+	ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), lparid,
+				0 /* LPCR bits */);
+	if (ret) {
+		dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
+		return ret;
+	}
+
+	dev_dbg(&gpdev->dev, "init context opalid=%llu msr=%lx\n",
+		nphb->opal_id, msr);
+	ret = opal_npu_init_context(nphb->opal_id, 0/*__unused*/, msr,
+				    pci_dev_id(gpdev));
+	if (ret < 0)
+		dev_err(&gpdev->dev, "Failed to init context: %d\n", ret);
+	else
+		ret = 0;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pnv_npu2_map_lpar_dev);
+
+void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr)
+{
+	struct pci_dev *gpdev;
+
+	list_for_each_entry(gpdev, &gpe->pbus->devices, bus_list)
+		pnv_npu2_map_lpar_dev(gpdev, 0, msr);
+}
+
+int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev)
+{
+	int ret;
+	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+	struct pci_controller *hose;
+	struct pnv_phb *nphb;
+
+	if (!npdev)
+		return -ENODEV;
+
+	hose = pci_bus_to_host(npdev->bus);
+	if (hose->npu == NULL) {
+		dev_info_once(&npdev->dev, "Nvlink1 does not support contexts");
+		return 0;
+	}
+
+	nphb = hose->private_data;
+
+	dev_dbg(&gpdev->dev, "destroy context opalid=%llu\n",
+		nphb->opal_id);
+	ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/,
+				       pci_dev_id(gpdev));
+	if (ret < 0) {
+		dev_err(&gpdev->dev, "Failed to destroy context: %d\n", ret);
+		return ret;
+	}
+
+	/* Set LPID to 0 anyway, just to be safe */
+	dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id);
+	ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), 0 /*LPID*/,
+				0 /* LPCR bits */);
+	if (ret)
+		dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pnv_npu2_unmap_lpar_dev);
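
A note on the TCE window logic above: the NPU exposes a single TVE, so pnv_npu_set_window() evicts whichever window currently occupies it before programming the requested one, and pnv_npu_take_ownership() likewise tears down either the 32-bit window or the bypass mapping, whichever is live. The stand-alone sketch below models just that eviction rule; it is illustrative only, not kernel code, and every name in it (fake_npu, fake_set_window, and so on) is a hypothetical stand-in for the kernel structures.

/*
 * Minimal user-space sketch of the single-TVE policy from
 * pnv_npu_set_window(): one TVE, so programming window `num`
 * must first remove the other window. Not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

#define NUM_WINDOWS 2

struct fake_npu {
	bool window_active[NUM_WINDOWS];	/* stands in for table_group.tables[] */
};

static void fake_unset_window(struct fake_npu *npu, int num)
{
	if (!npu->window_active[num])
		return;	/* nothing to tear down, like the tables[num] check */
	npu->window_active[num] = false;
	printf("window %d removed\n", num);
}

static void fake_set_window(struct fake_npu *npu, int num)
{
	int num2 = (num == 0) ? 1 : 0;	/* same "other window" trick as the source */

	/* One TVE: if the other window occupies it, remove that one first. */
	if (npu->window_active[num2])
		fake_unset_window(npu, num2);

	npu->window_active[num] = true;
	printf("window %d programmed\n", num);
}

int main(void)
{
	struct fake_npu npu = { { false, false } };

	fake_set_window(&npu, 0);	/* programs window 0 */
	fake_set_window(&npu, 1);	/* evicts window 0, then programs 1 */
	return 0;
}

Keeping the eviction inside set_window means callers such as the compound-group ops never need to track which of the two windows is currently programmed.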