diff options
Diffstat (limited to 'arch/powerpc/sysdev/xive')
-rw-r--r-- | arch/powerpc/sysdev/xive/Kconfig | 14 | ||||
-rw-r--r-- | arch/powerpc/sysdev/xive/Makefile | 5 | ||||
-rw-r--r-- | arch/powerpc/sysdev/xive/common.c | 1864 | ||||
-rw-r--r-- | arch/powerpc/sysdev/xive/native.c | 877 | ||||
-rw-r--r-- | arch/powerpc/sysdev/xive/spapr.c | 892 | ||||
-rw-r--r-- | arch/powerpc/sysdev/xive/xive-internal.h | 78 |
6 files changed, 3730 insertions, 0 deletions
diff --git a/arch/powerpc/sysdev/xive/Kconfig b/arch/powerpc/sysdev/xive/Kconfig new file mode 100644 index 0000000000..785c292d10 --- /dev/null +++ b/arch/powerpc/sysdev/xive/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 +config PPC_XIVE + bool + select PPC_SMP_MUXED_IPI + select HARDIRQS_SW_RESEND + +config PPC_XIVE_NATIVE + bool + select PPC_XIVE + depends on PPC_POWERNV + +config PPC_XIVE_SPAPR + bool + select PPC_XIVE diff --git a/arch/powerpc/sysdev/xive/Makefile b/arch/powerpc/sysdev/xive/Makefile new file mode 100644 index 0000000000..e510888389 --- /dev/null +++ b/arch/powerpc/sysdev/xive/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-y += common.o +obj-$(CONFIG_PPC_XIVE_NATIVE) += native.o +obj-$(CONFIG_PPC_XIVE_SPAPR) += spapr.o diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c new file mode 100644 index 0000000000..a289cb97c1 --- /dev/null +++ b/arch/powerpc/sysdev/xive/common.c @@ -0,0 +1,1864 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2016,2017 IBM Corporation. + */ + +#define pr_fmt(fmt) "xive: " fmt + +#include <linux/types.h> +#include <linux/threads.h> +#include <linux/kernel.h> +#include <linux/irq.h> +#include <linux/irqdomain.h> +#include <linux/debugfs.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/of.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/msi.h> +#include <linux/vmalloc.h> + +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/machdep.h> +#include <asm/irq.h> +#include <asm/errno.h> +#include <asm/xive.h> +#include <asm/xive-regs.h> +#include <asm/xmon.h> + +#include "xive-internal.h" + +#undef DEBUG_FLUSH +#undef DEBUG_ALL + +#ifdef DEBUG_ALL +#define DBG_VERBOSE(fmt, ...) pr_devel("cpu %d - " fmt, \ + smp_processor_id(), ## __VA_ARGS__) +#else +#define DBG_VERBOSE(fmt...) do { } while(0) +#endif + +bool __xive_enabled; +EXPORT_SYMBOL_GPL(__xive_enabled); +bool xive_cmdline_disabled; + +/* We use only one priority for now */ +static u8 xive_irq_priority; + +/* TIMA exported to KVM */ +void __iomem *xive_tima; +EXPORT_SYMBOL_GPL(xive_tima); +u32 xive_tima_offset; + +/* Backend ops */ +static const struct xive_ops *xive_ops; + +/* Our global interrupt domain */ +static struct irq_domain *xive_irq_domain; + +#ifdef CONFIG_SMP +/* The IPIs use the same logical irq number when on the same chip */ +static struct xive_ipi_desc { + unsigned int irq; + char name[16]; + atomic_t started; +} *xive_ipis; + +/* + * Use early_cpu_to_node() for hot-plugged CPUs + */ +static unsigned int xive_ipi_cpu_to_irq(unsigned int cpu) +{ + return xive_ipis[early_cpu_to_node(cpu)].irq; +} +#endif + +/* Xive state for each CPU */ +static DEFINE_PER_CPU(struct xive_cpu *, xive_cpu); + +/* An invalid CPU target */ +#define XIVE_INVALID_TARGET (-1) + +/* + * Global toggle to switch on/off StoreEOI + */ +static bool xive_store_eoi = true; + +static bool xive_is_store_eoi(struct xive_irq_data *xd) +{ + return xd->flags & XIVE_IRQ_FLAG_STORE_EOI && xive_store_eoi; +} + +/* + * Read the next entry in a queue, return its content if it's valid + * or 0 if there is no new entry. + * + * The queue pointer is moved forward unless "just_peek" is set + */ +static u32 xive_read_eq(struct xive_q *q, bool just_peek) +{ + u32 cur; + + if (!q->qpage) + return 0; + cur = be32_to_cpup(q->qpage + q->idx); + + /* Check valid bit (31) vs current toggle polarity */ + if ((cur >> 31) == q->toggle) + return 0; + + /* If consuming from the queue ... */ + if (!just_peek) { + /* Next entry */ + q->idx = (q->idx + 1) & q->msk; + + /* Wrap around: flip valid toggle */ + if (q->idx == 0) + q->toggle ^= 1; + } + /* Mask out the valid bit (31) */ + return cur & 0x7fffffff; +} + +/* + * Scans all the queue that may have interrupts in them + * (based on "pending_prio") in priority order until an + * interrupt is found or all the queues are empty. + * + * Then updates the CPPR (Current Processor Priority + * Register) based on the most favored interrupt found + * (0xff if none) and return what was found (0 if none). + * + * If just_peek is set, return the most favored pending + * interrupt if any but don't update the queue pointers. + * + * Note: This function can operate generically on any number + * of queues (up to 8). The current implementation of the XIVE + * driver only uses a single queue however. + * + * Note2: This will also "flush" "the pending_count" of a queue + * into the "count" when that queue is observed to be empty. + * This is used to keep track of the amount of interrupts + * targetting a queue. When an interrupt is moved away from + * a queue, we only decrement that queue count once the queue + * has been observed empty to avoid races. + */ +static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) +{ + u32 irq = 0; + u8 prio = 0; + + /* Find highest pending priority */ + while (xc->pending_prio != 0) { + struct xive_q *q; + + prio = ffs(xc->pending_prio) - 1; + DBG_VERBOSE("scan_irq: trying prio %d\n", prio); + + /* Try to fetch */ + irq = xive_read_eq(&xc->queue[prio], just_peek); + + /* Found something ? That's it */ + if (irq) { + if (just_peek || irq_to_desc(irq)) + break; + /* + * We should never get here; if we do then we must + * have failed to synchronize the interrupt properly + * when shutting it down. + */ + pr_crit("xive: got interrupt %d without descriptor, dropping\n", + irq); + WARN_ON(1); + continue; + } + + /* Clear pending bits */ + xc->pending_prio &= ~(1 << prio); + + /* + * Check if the queue count needs adjusting due to + * interrupts being moved away. See description of + * xive_dec_target_count() + */ + q = &xc->queue[prio]; + if (atomic_read(&q->pending_count)) { + int p = atomic_xchg(&q->pending_count, 0); + if (p) { + WARN_ON(p > atomic_read(&q->count)); + atomic_sub(p, &q->count); + } + } + } + + /* If nothing was found, set CPPR to 0xff */ + if (irq == 0) + prio = 0xff; + + /* Update HW CPPR to match if necessary */ + if (prio != xc->cppr) { + DBG_VERBOSE("scan_irq: adjusting CPPR to %d\n", prio); + xc->cppr = prio; + out_8(xive_tima + xive_tima_offset + TM_CPPR, prio); + } + + return irq; +} + +/* + * This is used to perform the magic loads from an ESB + * described in xive-regs.h + */ +static notrace u8 xive_esb_read(struct xive_irq_data *xd, u32 offset) +{ + u64 val; + + if (offset == XIVE_ESB_SET_PQ_10 && xive_is_store_eoi(xd)) + offset |= XIVE_ESB_LD_ST_MO; + + if ((xd->flags & XIVE_IRQ_FLAG_H_INT_ESB) && xive_ops->esb_rw) + val = xive_ops->esb_rw(xd->hw_irq, offset, 0, 0); + else + val = in_be64(xd->eoi_mmio + offset); + + return (u8)val; +} + +static void xive_esb_write(struct xive_irq_data *xd, u32 offset, u64 data) +{ + if ((xd->flags & XIVE_IRQ_FLAG_H_INT_ESB) && xive_ops->esb_rw) + xive_ops->esb_rw(xd->hw_irq, offset, data, 1); + else + out_be64(xd->eoi_mmio + offset, data); +} + +#if defined(CONFIG_XMON) || defined(CONFIG_DEBUG_FS) +static void xive_irq_data_dump(struct xive_irq_data *xd, char *buffer, size_t size) +{ + u64 val = xive_esb_read(xd, XIVE_ESB_GET); + + snprintf(buffer, size, "flags=%c%c%c PQ=%c%c 0x%016llx 0x%016llx", + xive_is_store_eoi(xd) ? 'S' : ' ', + xd->flags & XIVE_IRQ_FLAG_LSI ? 'L' : ' ', + xd->flags & XIVE_IRQ_FLAG_H_INT_ESB ? 'H' : ' ', + val & XIVE_ESB_VAL_P ? 'P' : '-', + val & XIVE_ESB_VAL_Q ? 'Q' : '-', + xd->trig_page, xd->eoi_page); +} +#endif + +#ifdef CONFIG_XMON +static notrace void xive_dump_eq(const char *name, struct xive_q *q) +{ + u32 i0, i1, idx; + + if (!q->qpage) + return; + idx = q->idx; + i0 = be32_to_cpup(q->qpage + idx); + idx = (idx + 1) & q->msk; + i1 = be32_to_cpup(q->qpage + idx); + xmon_printf("%s idx=%d T=%d %08x %08x ...", name, + q->idx, q->toggle, i0, i1); +} + +notrace void xmon_xive_do_dump(int cpu) +{ + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + + xmon_printf("CPU %d:", cpu); + if (xc) { + xmon_printf("pp=%02x CPPR=%02x ", xc->pending_prio, xc->cppr); + +#ifdef CONFIG_SMP + { + char buffer[128]; + + xive_irq_data_dump(&xc->ipi_data, buffer, sizeof(buffer)); + xmon_printf("IPI=0x%08x %s", xc->hw_ipi, buffer); + } +#endif + xive_dump_eq("EQ", &xc->queue[xive_irq_priority]); + } + xmon_printf("\n"); +} + +static struct irq_data *xive_get_irq_data(u32 hw_irq) +{ + unsigned int irq = irq_find_mapping(xive_irq_domain, hw_irq); + + return irq ? irq_get_irq_data(irq) : NULL; +} + +int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d) +{ + int rc; + u32 target; + u8 prio; + u32 lirq; + + rc = xive_ops->get_irq_config(hw_irq, &target, &prio, &lirq); + if (rc) { + xmon_printf("IRQ 0x%08x : no config rc=%d\n", hw_irq, rc); + return rc; + } + + xmon_printf("IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ", + hw_irq, target, prio, lirq); + + if (!d) + d = xive_get_irq_data(hw_irq); + + if (d) { + char buffer[128]; + + xive_irq_data_dump(irq_data_get_irq_handler_data(d), + buffer, sizeof(buffer)); + xmon_printf("%s", buffer); + } + + xmon_printf("\n"); + return 0; +} + +void xmon_xive_get_irq_all(void) +{ + unsigned int i; + struct irq_desc *desc; + + for_each_irq_desc(i, desc) { + struct irq_data *d = irq_domain_get_irq_data(xive_irq_domain, i); + + if (d) + xmon_xive_get_irq_config(irqd_to_hwirq(d), d); + } +} + +#endif /* CONFIG_XMON */ + +static unsigned int xive_get_irq(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + u32 irq; + + /* + * This can be called either as a result of a HW interrupt or + * as a "replay" because EOI decided there was still something + * in one of the queues. + * + * First we perform an ACK cycle in order to update our mask + * of pending priorities. This will also have the effect of + * updating the CPPR to the most favored pending interrupts. + * + * In the future, if we have a way to differentiate a first + * entry (on HW interrupt) from a replay triggered by EOI, + * we could skip this on replays unless we soft-mask tells us + * that a new HW interrupt occurred. + */ + xive_ops->update_pending(xc); + + DBG_VERBOSE("get_irq: pending=%02x\n", xc->pending_prio); + + /* Scan our queue(s) for interrupts */ + irq = xive_scan_interrupts(xc, false); + + DBG_VERBOSE("get_irq: got irq 0x%x, new pending=0x%02x\n", + irq, xc->pending_prio); + + /* Return pending interrupt if any */ + if (irq == XIVE_BAD_IRQ) + return 0; + return irq; +} + +/* + * After EOI'ing an interrupt, we need to re-check the queue + * to see if another interrupt is pending since multiple + * interrupts can coalesce into a single notification to the + * CPU. + * + * If we find that there is indeed more in there, we call + * force_external_irq_replay() to make Linux synthetize an + * external interrupt on the next call to local_irq_restore(). + */ +static void xive_do_queue_eoi(struct xive_cpu *xc) +{ + if (xive_scan_interrupts(xc, true) != 0) { + DBG_VERBOSE("eoi: pending=0x%02x\n", xc->pending_prio); + force_external_irq_replay(); + } +} + +/* + * EOI an interrupt at the source. There are several methods + * to do this depending on the HW version and source type + */ +static void xive_do_source_eoi(struct xive_irq_data *xd) +{ + u8 eoi_val; + + xd->stale_p = false; + + /* If the XIVE supports the new "store EOI facility, use it */ + if (xive_is_store_eoi(xd)) { + xive_esb_write(xd, XIVE_ESB_STORE_EOI, 0); + return; + } + + /* + * For LSIs, we use the "EOI cycle" special load rather than + * PQ bits, as they are automatically re-triggered in HW when + * still pending. + */ + if (xd->flags & XIVE_IRQ_FLAG_LSI) { + xive_esb_read(xd, XIVE_ESB_LOAD_EOI); + return; + } + + /* + * Otherwise, we use the special MMIO that does a clear of + * both P and Q and returns the old Q. This allows us to then + * do a re-trigger if Q was set rather than synthesizing an + * interrupt in software + */ + eoi_val = xive_esb_read(xd, XIVE_ESB_SET_PQ_00); + DBG_VERBOSE("eoi_val=%x\n", eoi_val); + + /* Re-trigger if needed */ + if ((eoi_val & XIVE_ESB_VAL_Q) && xd->trig_mmio) + out_be64(xd->trig_mmio, 0); +} + +/* irq_chip eoi callback, called with irq descriptor lock held */ +static void xive_irq_eoi(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + + DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n", + d->irq, irqd_to_hwirq(d), xc->pending_prio); + + /* + * EOI the source if it hasn't been disabled and hasn't + * been passed-through to a KVM guest + */ + if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d) && + !(xd->flags & XIVE_IRQ_FLAG_NO_EOI)) + xive_do_source_eoi(xd); + else + xd->stale_p = true; + + /* + * Clear saved_p to indicate that it's no longer occupying + * a queue slot on the target queue + */ + xd->saved_p = false; + + /* Check for more work in the queue */ + xive_do_queue_eoi(xc); +} + +/* + * Helper used to mask and unmask an interrupt source. + */ +static void xive_do_source_set_mask(struct xive_irq_data *xd, + bool mask) +{ + u64 val; + + pr_debug("%s: HW 0x%x %smask\n", __func__, xd->hw_irq, mask ? "" : "un"); + + /* + * If the interrupt had P set, it may be in a queue. + * + * We need to make sure we don't re-enable it until it + * has been fetched from that queue and EOId. We keep + * a copy of that P state and use it to restore the + * ESB accordingly on unmask. + */ + if (mask) { + val = xive_esb_read(xd, XIVE_ESB_SET_PQ_01); + if (!xd->stale_p && !!(val & XIVE_ESB_VAL_P)) + xd->saved_p = true; + xd->stale_p = false; + } else if (xd->saved_p) { + xive_esb_read(xd, XIVE_ESB_SET_PQ_10); + xd->saved_p = false; + } else { + xive_esb_read(xd, XIVE_ESB_SET_PQ_00); + xd->stale_p = false; + } +} + +/* + * Try to chose "cpu" as a new interrupt target. Increments + * the queue accounting for that target if it's not already + * full. + */ +static bool xive_try_pick_target(int cpu) +{ + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + struct xive_q *q = &xc->queue[xive_irq_priority]; + int max; + + /* + * Calculate max number of interrupts in that queue. + * + * We leave a gap of 1 just in case... + */ + max = (q->msk + 1) - 1; + return !!atomic_add_unless(&q->count, 1, max); +} + +/* + * Un-account an interrupt for a target CPU. We don't directly + * decrement q->count since the interrupt might still be present + * in the queue. + * + * Instead increment a separate counter "pending_count" which + * will be substracted from "count" later when that CPU observes + * the queue to be empty. + */ +static void xive_dec_target_count(int cpu) +{ + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + struct xive_q *q = &xc->queue[xive_irq_priority]; + + if (WARN_ON(cpu < 0 || !xc)) { + pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc); + return; + } + + /* + * We increment the "pending count" which will be used + * to decrement the target queue count whenever it's next + * processed and found empty. This ensure that we don't + * decrement while we still have the interrupt there + * occupying a slot. + */ + atomic_inc(&q->pending_count); +} + +/* Find a tentative CPU target in a CPU mask */ +static int xive_find_target_in_mask(const struct cpumask *mask, + unsigned int fuzz) +{ + int cpu, first, num, i; + + /* Pick up a starting point CPU in the mask based on fuzz */ + num = min_t(int, cpumask_weight(mask), nr_cpu_ids); + first = fuzz % num; + + /* Locate it */ + cpu = cpumask_first(mask); + for (i = 0; i < first && cpu < nr_cpu_ids; i++) + cpu = cpumask_next(cpu, mask); + + /* Sanity check */ + if (WARN_ON(cpu >= nr_cpu_ids)) + cpu = cpumask_first(cpu_online_mask); + + /* Remember first one to handle wrap-around */ + first = cpu; + + /* + * Now go through the entire mask until we find a valid + * target. + */ + do { + /* + * We re-check online as the fallback case passes us + * an untested affinity mask + */ + if (cpu_online(cpu) && xive_try_pick_target(cpu)) + return cpu; + cpu = cpumask_next(cpu, mask); + /* Wrap around */ + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(mask); + } while (cpu != first); + + return -1; +} + +/* + * Pick a target CPU for an interrupt. This is done at + * startup or if the affinity is changed in a way that + * invalidates the current target. + */ +static int xive_pick_irq_target(struct irq_data *d, + const struct cpumask *affinity) +{ + static unsigned int fuzz; + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + cpumask_var_t mask; + int cpu = -1; + + /* + * If we have chip IDs, first we try to build a mask of + * CPUs matching the CPU and find a target in there + */ + if (xd->src_chip != XIVE_INVALID_CHIP_ID && + zalloc_cpumask_var(&mask, GFP_ATOMIC)) { + /* Build a mask of matching chip IDs */ + for_each_cpu_and(cpu, affinity, cpu_online_mask) { + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + if (xc->chip_id == xd->src_chip) + cpumask_set_cpu(cpu, mask); + } + /* Try to find a target */ + if (cpumask_empty(mask)) + cpu = -1; + else + cpu = xive_find_target_in_mask(mask, fuzz++); + free_cpumask_var(mask); + if (cpu >= 0) + return cpu; + fuzz--; + } + + /* No chip IDs, fallback to using the affinity mask */ + return xive_find_target_in_mask(affinity, fuzz++); +} + +static unsigned int xive_irq_startup(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + int target, rc; + + xd->saved_p = false; + xd->stale_p = false; + + pr_debug("%s: irq %d [0x%x] data @%p\n", __func__, d->irq, hw_irq, d); + + /* Pick a target */ + target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d)); + if (target == XIVE_INVALID_TARGET) { + /* Try again breaking affinity */ + target = xive_pick_irq_target(d, cpu_online_mask); + if (target == XIVE_INVALID_TARGET) + return -ENXIO; + pr_warn("irq %d started with broken affinity\n", d->irq); + } + + /* Sanity check */ + if (WARN_ON(target == XIVE_INVALID_TARGET || + target >= nr_cpu_ids)) + target = smp_processor_id(); + + xd->target = target; + + /* + * Configure the logical number to be the Linux IRQ number + * and set the target queue + */ + rc = xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(target), + xive_irq_priority, d->irq); + if (rc) + return rc; + + /* Unmask the ESB */ + xive_do_source_set_mask(xd, false); + + return 0; +} + +/* called with irq descriptor lock held */ +static void xive_irq_shutdown(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + + pr_debug("%s: irq %d [0x%x] data @%p\n", __func__, d->irq, hw_irq, d); + + if (WARN_ON(xd->target == XIVE_INVALID_TARGET)) + return; + + /* Mask the interrupt at the source */ + xive_do_source_set_mask(xd, true); + + /* + * Mask the interrupt in HW in the IVT/EAS and set the number + * to be the "bad" IRQ number + */ + xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(xd->target), + 0xff, XIVE_BAD_IRQ); + + xive_dec_target_count(xd->target); + xd->target = XIVE_INVALID_TARGET; +} + +static void xive_irq_unmask(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + pr_debug("%s: irq %d data @%p\n", __func__, d->irq, xd); + + xive_do_source_set_mask(xd, false); +} + +static void xive_irq_mask(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + pr_debug("%s: irq %d data @%p\n", __func__, d->irq, xd); + + xive_do_source_set_mask(xd, true); +} + +static int xive_irq_set_affinity(struct irq_data *d, + const struct cpumask *cpumask, + bool force) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + u32 target, old_target; + int rc = 0; + + pr_debug("%s: irq %d/0x%x\n", __func__, d->irq, hw_irq); + + /* Is this valid ? */ + if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) + return -EINVAL; + + /* + * If existing target is already in the new mask, and is + * online then do nothing. + */ + if (xd->target != XIVE_INVALID_TARGET && + cpu_online(xd->target) && + cpumask_test_cpu(xd->target, cpumask)) + return IRQ_SET_MASK_OK; + + /* Pick a new target */ + target = xive_pick_irq_target(d, cpumask); + + /* No target found */ + if (target == XIVE_INVALID_TARGET) + return -ENXIO; + + /* Sanity check */ + if (WARN_ON(target >= nr_cpu_ids)) + target = smp_processor_id(); + + old_target = xd->target; + + /* + * Only configure the irq if it's not currently passed-through to + * a KVM guest + */ + if (!irqd_is_forwarded_to_vcpu(d)) + rc = xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(target), + xive_irq_priority, d->irq); + if (rc < 0) { + pr_err("Error %d reconfiguring irq %d\n", rc, d->irq); + return rc; + } + + pr_debug(" target: 0x%x\n", target); + xd->target = target; + + /* Give up previous target */ + if (old_target != XIVE_INVALID_TARGET) + xive_dec_target_count(old_target); + + return IRQ_SET_MASK_OK; +} + +static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + /* + * We only support these. This has really no effect other than setting + * the corresponding descriptor bits mind you but those will in turn + * affect the resend function when re-enabling an edge interrupt. + * + * Set the default to edge as explained in map(). + */ + if (flow_type == IRQ_TYPE_DEFAULT || flow_type == IRQ_TYPE_NONE) + flow_type = IRQ_TYPE_EDGE_RISING; + + if (flow_type != IRQ_TYPE_EDGE_RISING && + flow_type != IRQ_TYPE_LEVEL_LOW) + return -EINVAL; + + irqd_set_trigger_type(d, flow_type); + + /* + * Double check it matches what the FW thinks + * + * NOTE: We don't know yet if the PAPR interface will provide + * the LSI vs MSI information apart from the device-tree so + * this check might have to move into an optional backend call + * that is specific to the native backend + */ + if ((flow_type == IRQ_TYPE_LEVEL_LOW) != + !!(xd->flags & XIVE_IRQ_FLAG_LSI)) { + pr_warn("Interrupt %d (HW 0x%x) type mismatch, Linux says %s, FW says %s\n", + d->irq, (u32)irqd_to_hwirq(d), + (flow_type == IRQ_TYPE_LEVEL_LOW) ? "Level" : "Edge", + (xd->flags & XIVE_IRQ_FLAG_LSI) ? "Level" : "Edge"); + } + + return IRQ_SET_MASK_OK_NOCOPY; +} + +static int xive_irq_retrigger(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + /* This should be only for MSIs */ + if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI)) + return 0; + + /* + * To perform a retrigger, we first set the PQ bits to + * 11, then perform an EOI. + */ + xive_esb_read(xd, XIVE_ESB_SET_PQ_11); + xive_do_source_eoi(xd); + + return 1; +} + +/* + * Caller holds the irq descriptor lock, so this won't be called + * concurrently with xive_get_irqchip_state on the same interrupt. + */ +static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + int rc; + u8 pq; + + /* + * This is called by KVM with state non-NULL for enabling + * pass-through or NULL for disabling it + */ + if (state) { + irqd_set_forwarded_to_vcpu(d); + + /* Set it to PQ=10 state to prevent further sends */ + pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_10); + if (!xd->stale_p) { + xd->saved_p = !!(pq & XIVE_ESB_VAL_P); + xd->stale_p = !xd->saved_p; + } + + /* No target ? nothing to do */ + if (xd->target == XIVE_INVALID_TARGET) { + /* + * An untargetted interrupt should have been + * also masked at the source + */ + WARN_ON(xd->saved_p); + + return 0; + } + + /* + * If P was set, adjust state to PQ=11 to indicate + * that a resend is needed for the interrupt to reach + * the guest. Also remember the value of P. + * + * This also tells us that it's in flight to a host queue + * or has already been fetched but hasn't been EOIed yet + * by the host. This it's potentially using up a host + * queue slot. This is important to know because as long + * as this is the case, we must not hard-unmask it when + * "returning" that interrupt to the host. + * + * This saved_p is cleared by the host EOI, when we know + * for sure the queue slot is no longer in use. + */ + if (xd->saved_p) { + xive_esb_read(xd, XIVE_ESB_SET_PQ_11); + + /* + * Sync the XIVE source HW to ensure the interrupt + * has gone through the EAS before we change its + * target to the guest. That should guarantee us + * that we *will* eventually get an EOI for it on + * the host. Otherwise there would be a small window + * for P to be seen here but the interrupt going + * to the guest queue. + */ + if (xive_ops->sync_source) + xive_ops->sync_source(hw_irq); + } + } else { + irqd_clr_forwarded_to_vcpu(d); + + /* No host target ? hard mask and return */ + if (xd->target == XIVE_INVALID_TARGET) { + xive_do_source_set_mask(xd, true); + return 0; + } + + /* + * Sync the XIVE source HW to ensure the interrupt + * has gone through the EAS before we change its + * target to the host. + */ + if (xive_ops->sync_source) + xive_ops->sync_source(hw_irq); + + /* + * By convention we are called with the interrupt in + * a PQ=10 or PQ=11 state, ie, it won't fire and will + * have latched in Q whether there's a pending HW + * interrupt or not. + * + * First reconfigure the target. + */ + rc = xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(xd->target), + xive_irq_priority, d->irq); + if (rc) + return rc; + + /* + * Then if saved_p is not set, effectively re-enable the + * interrupt with an EOI. If it is set, we know there is + * still a message in a host queue somewhere that will be + * EOId eventually. + * + * Note: We don't check irqd_irq_disabled(). Effectively, + * we *will* let the irq get through even if masked if the + * HW is still firing it in order to deal with the whole + * saved_p business properly. If the interrupt triggers + * while masked, the generic code will re-mask it anyway. + */ + if (!xd->saved_p) + xive_do_source_eoi(xd); + + } + return 0; +} + +/* Called with irq descriptor lock held. */ +static int xive_get_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool *state) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(data); + u8 pq; + + switch (which) { + case IRQCHIP_STATE_ACTIVE: + pq = xive_esb_read(xd, XIVE_ESB_GET); + + /* + * The esb value being all 1's means we couldn't get + * the PQ state of the interrupt through mmio. It may + * happen, for example when querying a PHB interrupt + * while the PHB is in an error state. We consider the + * interrupt to be inactive in that case. + */ + *state = (pq != XIVE_ESB_INVALID) && !xd->stale_p && + (xd->saved_p || (!!(pq & XIVE_ESB_VAL_P) && + !irqd_irq_disabled(data))); + return 0; + default: + return -EINVAL; + } +} + +static struct irq_chip xive_irq_chip = { + .name = "XIVE-IRQ", + .irq_startup = xive_irq_startup, + .irq_shutdown = xive_irq_shutdown, + .irq_eoi = xive_irq_eoi, + .irq_mask = xive_irq_mask, + .irq_unmask = xive_irq_unmask, + .irq_set_affinity = xive_irq_set_affinity, + .irq_set_type = xive_irq_set_type, + .irq_retrigger = xive_irq_retrigger, + .irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity, + .irq_get_irqchip_state = xive_get_irqchip_state, +}; + +bool is_xive_irq(struct irq_chip *chip) +{ + return chip == &xive_irq_chip; +} +EXPORT_SYMBOL_GPL(is_xive_irq); + +void xive_cleanup_irq_data(struct xive_irq_data *xd) +{ + pr_debug("%s for HW 0x%x\n", __func__, xd->hw_irq); + + if (xd->eoi_mmio) { + iounmap(xd->eoi_mmio); + if (xd->eoi_mmio == xd->trig_mmio) + xd->trig_mmio = NULL; + xd->eoi_mmio = NULL; + } + if (xd->trig_mmio) { + iounmap(xd->trig_mmio); + xd->trig_mmio = NULL; + } +} +EXPORT_SYMBOL_GPL(xive_cleanup_irq_data); + +static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) +{ + struct xive_irq_data *xd; + int rc; + + xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL); + if (!xd) + return -ENOMEM; + rc = xive_ops->populate_irq_data(hw, xd); + if (rc) { + kfree(xd); + return rc; + } + xd->target = XIVE_INVALID_TARGET; + irq_set_handler_data(virq, xd); + + /* + * Turn OFF by default the interrupt being mapped. A side + * effect of this check is the mapping the ESB page of the + * interrupt in the Linux address space. This prevents page + * fault issues in the crash handler which masks all + * interrupts. + */ + xive_esb_read(xd, XIVE_ESB_SET_PQ_01); + + return 0; +} + +void xive_irq_free_data(unsigned int virq) +{ + struct xive_irq_data *xd = irq_get_handler_data(virq); + + if (!xd) + return; + irq_set_handler_data(virq, NULL); + xive_cleanup_irq_data(xd); + kfree(xd); +} +EXPORT_SYMBOL_GPL(xive_irq_free_data); + +#ifdef CONFIG_SMP + +static void xive_cause_ipi(int cpu) +{ + struct xive_cpu *xc; + struct xive_irq_data *xd; + + xc = per_cpu(xive_cpu, cpu); + + DBG_VERBOSE("IPI CPU %d -> %d (HW IRQ 0x%x)\n", + smp_processor_id(), cpu, xc->hw_ipi); + + xd = &xc->ipi_data; + if (WARN_ON(!xd->trig_mmio)) + return; + out_be64(xd->trig_mmio, 0); +} + +static irqreturn_t xive_muxed_ipi_action(int irq, void *dev_id) +{ + return smp_ipi_demux(); +} + +static void xive_ipi_eoi(struct irq_data *d) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + + /* Handle possible race with unplug and drop stale IPIs */ + if (!xc) + return; + + DBG_VERBOSE("IPI eoi: irq=%d [0x%lx] (HW IRQ 0x%x) pending=%02x\n", + d->irq, irqd_to_hwirq(d), xc->hw_ipi, xc->pending_prio); + + xive_do_source_eoi(&xc->ipi_data); + xive_do_queue_eoi(xc); +} + +static void xive_ipi_do_nothing(struct irq_data *d) +{ + /* + * Nothing to do, we never mask/unmask IPIs, but the callback + * has to exist for the struct irq_chip. + */ +} + +static struct irq_chip xive_ipi_chip = { + .name = "XIVE-IPI", + .irq_eoi = xive_ipi_eoi, + .irq_mask = xive_ipi_do_nothing, + .irq_unmask = xive_ipi_do_nothing, +}; + +/* + * IPIs are marked per-cpu. We use separate HW interrupts under the + * hood but associated with the same "linux" interrupt + */ +struct xive_ipi_alloc_info { + irq_hw_number_t hwirq; +}; + +static int xive_ipi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct xive_ipi_alloc_info *info = arg; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_domain_set_info(domain, virq + i, info->hwirq + i, &xive_ipi_chip, + domain->host_data, handle_percpu_irq, + NULL, NULL); + } + return 0; +} + +static const struct irq_domain_ops xive_ipi_irq_domain_ops = { + .alloc = xive_ipi_irq_domain_alloc, +}; + +static int __init xive_init_ipis(void) +{ + struct fwnode_handle *fwnode; + struct irq_domain *ipi_domain; + unsigned int node; + int ret = -ENOMEM; + + fwnode = irq_domain_alloc_named_fwnode("XIVE-IPI"); + if (!fwnode) + goto out; + + ipi_domain = irq_domain_create_linear(fwnode, nr_node_ids, + &xive_ipi_irq_domain_ops, NULL); + if (!ipi_domain) + goto out_free_fwnode; + + xive_ipis = kcalloc(nr_node_ids, sizeof(*xive_ipis), GFP_KERNEL | __GFP_NOFAIL); + if (!xive_ipis) + goto out_free_domain; + + for_each_node(node) { + struct xive_ipi_desc *xid = &xive_ipis[node]; + struct xive_ipi_alloc_info info = { node }; + + /* + * Map one IPI interrupt per node for all cpus of that node. + * Since the HW interrupt number doesn't have any meaning, + * simply use the node number. + */ + ret = irq_domain_alloc_irqs(ipi_domain, 1, node, &info); + if (ret < 0) + goto out_free_xive_ipis; + xid->irq = ret; + + snprintf(xid->name, sizeof(xid->name), "IPI-%d", node); + } + + return ret; + +out_free_xive_ipis: + kfree(xive_ipis); +out_free_domain: + irq_domain_remove(ipi_domain); +out_free_fwnode: + irq_domain_free_fwnode(fwnode); +out: + return ret; +} + +static int xive_request_ipi(unsigned int cpu) +{ + struct xive_ipi_desc *xid = &xive_ipis[early_cpu_to_node(cpu)]; + int ret; + + if (atomic_inc_return(&xid->started) > 1) + return 0; + + ret = request_irq(xid->irq, xive_muxed_ipi_action, + IRQF_NO_DEBUG | IRQF_PERCPU | IRQF_NO_THREAD, + xid->name, NULL); + + WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret); + return ret; +} + +static int xive_setup_cpu_ipi(unsigned int cpu) +{ + unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu); + struct xive_cpu *xc; + int rc; + + pr_debug("Setting up IPI for CPU %d\n", cpu); + + xc = per_cpu(xive_cpu, cpu); + + /* Check if we are already setup */ + if (xc->hw_ipi != XIVE_BAD_IRQ) + return 0; + + /* Register the IPI */ + xive_request_ipi(cpu); + + /* Grab an IPI from the backend, this will populate xc->hw_ipi */ + if (xive_ops->get_ipi(cpu, xc)) + return -EIO; + + /* + * Populate the IRQ data in the xive_cpu structure and + * configure the HW / enable the IPIs. + */ + rc = xive_ops->populate_irq_data(xc->hw_ipi, &xc->ipi_data); + if (rc) { + pr_err("Failed to populate IPI data on CPU %d\n", cpu); + return -EIO; + } + rc = xive_ops->configure_irq(xc->hw_ipi, + get_hard_smp_processor_id(cpu), + xive_irq_priority, xive_ipi_irq); + if (rc) { + pr_err("Failed to map IPI CPU %d\n", cpu); + return -EIO; + } + pr_debug("CPU %d HW IPI 0x%x, virq %d, trig_mmio=%p\n", cpu, + xc->hw_ipi, xive_ipi_irq, xc->ipi_data.trig_mmio); + + /* Unmask it */ + xive_do_source_set_mask(&xc->ipi_data, false); + + return 0; +} + +noinstr static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu); + + /* Disable the IPI and free the IRQ data */ + + /* Already cleaned up ? */ + if (xc->hw_ipi == XIVE_BAD_IRQ) + return; + + /* TODO: clear IPI mapping */ + + /* Mask the IPI */ + xive_do_source_set_mask(&xc->ipi_data, true); + + /* + * Note: We don't call xive_cleanup_irq_data() to free + * the mappings as this is called from an IPI on kexec + * which is not a safe environment to call iounmap() + */ + + /* Deconfigure/mask in the backend */ + xive_ops->configure_irq(xc->hw_ipi, hard_smp_processor_id(), + 0xff, xive_ipi_irq); + + /* Free the IPIs in the backend */ + xive_ops->put_ipi(cpu, xc); +} + +void __init xive_smp_probe(void) +{ + smp_ops->cause_ipi = xive_cause_ipi; + + /* Register the IPI */ + xive_init_ipis(); + + /* Allocate and setup IPI for the boot CPU */ + xive_setup_cpu_ipi(smp_processor_id()); +} + +#endif /* CONFIG_SMP */ + +static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq, + irq_hw_number_t hw) +{ + int rc; + + /* + * Mark interrupts as edge sensitive by default so that resend + * actually works. Will fix that up below if needed. + */ + irq_clear_status_flags(virq, IRQ_LEVEL); + + rc = xive_irq_alloc_data(virq, hw); + if (rc) + return rc; + + irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq); + + return 0; +} + +static void xive_irq_domain_unmap(struct irq_domain *d, unsigned int virq) +{ + xive_irq_free_data(virq); +} + +static int xive_irq_domain_xlate(struct irq_domain *h, struct device_node *ct, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_flags) + +{ + *out_hwirq = intspec[0]; + + /* + * If intsize is at least 2, we look for the type in the second cell, + * we assume the LSB indicates a level interrupt. + */ + if (intsize > 1) { + if (intspec[1] & 1) + *out_flags = IRQ_TYPE_LEVEL_LOW; + else + *out_flags = IRQ_TYPE_EDGE_RISING; + } else + *out_flags = IRQ_TYPE_LEVEL_LOW; + + return 0; +} + +static int xive_irq_domain_match(struct irq_domain *h, struct device_node *node, + enum irq_domain_bus_token bus_token) +{ + return xive_ops->match(node); +} + +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +static const char * const esb_names[] = { "RESET", "OFF", "PENDING", "QUEUED" }; + +static const struct { + u64 mask; + char *name; +} xive_irq_flags[] = { + { XIVE_IRQ_FLAG_STORE_EOI, "STORE_EOI" }, + { XIVE_IRQ_FLAG_LSI, "LSI" }, + { XIVE_IRQ_FLAG_H_INT_ESB, "H_INT_ESB" }, + { XIVE_IRQ_FLAG_NO_EOI, "NO_EOI" }, +}; + +static void xive_irq_domain_debug_show(struct seq_file *m, struct irq_domain *d, + struct irq_data *irqd, int ind) +{ + struct xive_irq_data *xd; + u64 val; + int i; + + /* No IRQ domain level information. To be done */ + if (!irqd) + return; + + if (!is_xive_irq(irq_data_get_irq_chip(irqd))) + return; + + seq_printf(m, "%*sXIVE:\n", ind, ""); + ind++; + + xd = irq_data_get_irq_handler_data(irqd); + if (!xd) { + seq_printf(m, "%*snot assigned\n", ind, ""); + return; + } + + val = xive_esb_read(xd, XIVE_ESB_GET); + seq_printf(m, "%*sESB: %s\n", ind, "", esb_names[val & 0x3]); + seq_printf(m, "%*sPstate: %s %s\n", ind, "", xd->stale_p ? "stale" : "", + xd->saved_p ? "saved" : ""); + seq_printf(m, "%*sTarget: %d\n", ind, "", xd->target); + seq_printf(m, "%*sChip: %d\n", ind, "", xd->src_chip); + seq_printf(m, "%*sTrigger: 0x%016llx\n", ind, "", xd->trig_page); + seq_printf(m, "%*sEOI: 0x%016llx\n", ind, "", xd->eoi_page); + seq_printf(m, "%*sFlags: 0x%llx\n", ind, "", xd->flags); + for (i = 0; i < ARRAY_SIZE(xive_irq_flags); i++) { + if (xd->flags & xive_irq_flags[i].mask) + seq_printf(m, "%*s%s\n", ind + 12, "", xive_irq_flags[i].name); + } +} +#endif + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +static int xive_irq_domain_translate(struct irq_domain *d, + struct irq_fwspec *fwspec, + unsigned long *hwirq, + unsigned int *type) +{ + return xive_irq_domain_xlate(d, to_of_node(fwspec->fwnode), + fwspec->param, fwspec->param_count, + hwirq, type); +} + +static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_fwspec *fwspec = arg; + irq_hw_number_t hwirq; + unsigned int type = IRQ_TYPE_NONE; + int i, rc; + + rc = xive_irq_domain_translate(domain, fwspec, &hwirq, &type); + if (rc) + return rc; + + pr_debug("%s %d/0x%lx #%d\n", __func__, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) { + /* TODO: call xive_irq_domain_map() */ + + /* + * Mark interrupts as edge sensitive by default so that resend + * actually works. Will fix that up below if needed. + */ + irq_clear_status_flags(virq, IRQ_LEVEL); + + /* allocates and sets handler data */ + rc = xive_irq_alloc_data(virq + i, hwirq + i); + if (rc) + return rc; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &xive_irq_chip, domain->host_data); + irq_set_handler(virq + i, handle_fasteoi_irq); + } + + return 0; +} + +static void xive_irq_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + int i; + + pr_debug("%s %d #%d\n", __func__, virq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) + xive_irq_free_data(virq + i); +} +#endif + +static const struct irq_domain_ops xive_irq_domain_ops = { +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + .alloc = xive_irq_domain_alloc, + .free = xive_irq_domain_free, + .translate = xive_irq_domain_translate, +#endif + .match = xive_irq_domain_match, + .map = xive_irq_domain_map, + .unmap = xive_irq_domain_unmap, + .xlate = xive_irq_domain_xlate, +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS + .debug_show = xive_irq_domain_debug_show, +#endif +}; + +static void __init xive_init_host(struct device_node *np) +{ + xive_irq_domain = irq_domain_add_tree(np, &xive_irq_domain_ops, NULL); + if (WARN_ON(xive_irq_domain == NULL)) + return; + irq_set_default_host(xive_irq_domain); +} + +static void xive_cleanup_cpu_queues(unsigned int cpu, struct xive_cpu *xc) +{ + if (xc->queue[xive_irq_priority].qpage) + xive_ops->cleanup_queue(cpu, xc, xive_irq_priority); +} + +static int xive_setup_cpu_queues(unsigned int cpu, struct xive_cpu *xc) +{ + int rc = 0; + + /* We setup 1 queues for now with a 64k page */ + if (!xc->queue[xive_irq_priority].qpage) + rc = xive_ops->setup_queue(cpu, xc, xive_irq_priority); + + return rc; +} + +static int xive_prepare_cpu(unsigned int cpu) +{ + struct xive_cpu *xc; + + xc = per_cpu(xive_cpu, cpu); + if (!xc) { + xc = kzalloc_node(sizeof(struct xive_cpu), + GFP_KERNEL, cpu_to_node(cpu)); + if (!xc) + return -ENOMEM; + xc->hw_ipi = XIVE_BAD_IRQ; + xc->chip_id = XIVE_INVALID_CHIP_ID; + if (xive_ops->prepare_cpu) + xive_ops->prepare_cpu(cpu, xc); + + per_cpu(xive_cpu, cpu) = xc; + } + + /* Setup EQs if not already */ + return xive_setup_cpu_queues(cpu, xc); +} + +static void xive_setup_cpu(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + + /* The backend might have additional things to do */ + if (xive_ops->setup_cpu) + xive_ops->setup_cpu(smp_processor_id(), xc); + + /* Set CPPR to 0xff to enable flow of interrupts */ + xc->cppr = 0xff; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff); +} + +#ifdef CONFIG_SMP +void xive_smp_setup_cpu(void) +{ + pr_debug("SMP setup CPU %d\n", smp_processor_id()); + + /* This will have already been done on the boot CPU */ + if (smp_processor_id() != boot_cpuid) + xive_setup_cpu(); + +} + +int xive_smp_prepare_cpu(unsigned int cpu) +{ + int rc; + + /* Allocate per-CPU data and queues */ + rc = xive_prepare_cpu(cpu); + if (rc) + return rc; + + /* Allocate and setup IPI for the new CPU */ + return xive_setup_cpu_ipi(cpu); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc) +{ + u32 irq; + + /* We assume local irqs are disabled */ + WARN_ON(!irqs_disabled()); + + /* Check what's already in the CPU queue */ + while ((irq = xive_scan_interrupts(xc, false)) != 0) { + /* + * We need to re-route that interrupt to its new destination. + * First get and lock the descriptor + */ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_data *d = irq_desc_get_irq_data(desc); + struct xive_irq_data *xd; + + /* + * Ignore anything that isn't a XIVE irq and ignore + * IPIs, so can just be dropped. + */ + if (d->domain != xive_irq_domain) + continue; + + /* + * The IRQ should have already been re-routed, it's just a + * stale in the old queue, so re-trigger it in order to make + * it reach is new destination. + */ +#ifdef DEBUG_FLUSH + pr_info("CPU %d: Got irq %d while offline, re-sending...\n", + cpu, irq); +#endif + raw_spin_lock(&desc->lock); + xd = irq_desc_get_handler_data(desc); + + /* + * Clear saved_p to indicate that it's no longer pending + */ + xd->saved_p = false; + + /* + * For LSIs, we EOI, this will cause a resend if it's + * still asserted. Otherwise do an MSI retrigger. + */ + if (xd->flags & XIVE_IRQ_FLAG_LSI) + xive_do_source_eoi(xd); + else + xive_irq_retrigger(d); + + raw_spin_unlock(&desc->lock); + } +} + +void xive_smp_disable_cpu(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + unsigned int cpu = smp_processor_id(); + + /* Migrate interrupts away from the CPU */ + irq_migrate_all_off_this_cpu(); + + /* Set CPPR to 0 to disable flow of interrupts */ + xc->cppr = 0; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0); + + /* Flush everything still in the queue */ + xive_flush_cpu_queue(cpu, xc); + + /* Re-enable CPPR */ + xc->cppr = 0xff; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff); +} + +void xive_flush_interrupt(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + unsigned int cpu = smp_processor_id(); + + /* Called if an interrupt occurs while the CPU is hot unplugged */ + xive_flush_cpu_queue(cpu, xc); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +#endif /* CONFIG_SMP */ + +noinstr void xive_teardown_cpu(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + unsigned int cpu = smp_processor_id(); + + /* Set CPPR to 0 to disable flow of interrupts */ + xc->cppr = 0; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0); + + if (xive_ops->teardown_cpu) + xive_ops->teardown_cpu(cpu, xc); + +#ifdef CONFIG_SMP + /* Get rid of IPI */ + xive_cleanup_cpu_ipi(cpu, xc); +#endif + + /* Disable and free the queues */ + xive_cleanup_cpu_queues(cpu, xc); +} + +void xive_shutdown(void) +{ + xive_ops->shutdown(); +} + +bool __init xive_core_init(struct device_node *np, const struct xive_ops *ops, + void __iomem *area, u32 offset, u8 max_prio) +{ + xive_tima = area; + xive_tima_offset = offset; + xive_ops = ops; + xive_irq_priority = max_prio; + + ppc_md.get_irq = xive_get_irq; + __xive_enabled = true; + + pr_debug("Initializing host..\n"); + xive_init_host(np); + + pr_debug("Initializing boot CPU..\n"); + + /* Allocate per-CPU data and queues */ + xive_prepare_cpu(smp_processor_id()); + + /* Get ready for interrupts */ + xive_setup_cpu(); + + pr_info("Interrupt handling initialized with %s backend\n", + xive_ops->name); + pr_info("Using priority %d for all interrupts\n", max_prio); + + return true; +} + +__be32 *xive_queue_page_alloc(unsigned int cpu, u32 queue_shift) +{ + unsigned int alloc_order; + struct page *pages; + __be32 *qpage; + + alloc_order = xive_alloc_order(queue_shift); + pages = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, alloc_order); + if (!pages) + return ERR_PTR(-ENOMEM); + qpage = (__be32 *)page_address(pages); + memset(qpage, 0, 1 << queue_shift); + + return qpage; +} + +static int __init xive_off(char *arg) +{ + xive_cmdline_disabled = true; + return 1; +} +__setup("xive=off", xive_off); + +static int __init xive_store_eoi_cmdline(char *arg) +{ + if (!arg) + return 1; + + if (strncmp(arg, "off", 3) == 0) { + pr_info("StoreEOI disabled on kernel command line\n"); + xive_store_eoi = false; + } + return 1; +} +__setup("xive.store-eoi=", xive_store_eoi_cmdline); + +#ifdef CONFIG_DEBUG_FS +static void xive_debug_show_ipi(struct seq_file *m, int cpu) +{ + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + + seq_printf(m, "CPU %d: ", cpu); + if (xc) { + seq_printf(m, "pp=%02x CPPR=%02x ", xc->pending_prio, xc->cppr); + +#ifdef CONFIG_SMP + { + char buffer[128]; + + xive_irq_data_dump(&xc->ipi_data, buffer, sizeof(buffer)); + seq_printf(m, "IPI=0x%08x %s", xc->hw_ipi, buffer); + } +#endif + } + seq_puts(m, "\n"); +} + +static void xive_debug_show_irq(struct seq_file *m, struct irq_data *d) +{ + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + int rc; + u32 target; + u8 prio; + u32 lirq; + char buffer[128]; + + rc = xive_ops->get_irq_config(hw_irq, &target, &prio, &lirq); + if (rc) { + seq_printf(m, "IRQ 0x%08x : no config rc=%d\n", hw_irq, rc); + return; + } + + seq_printf(m, "IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ", + hw_irq, target, prio, lirq); + + xive_irq_data_dump(irq_data_get_irq_handler_data(d), buffer, sizeof(buffer)); + seq_puts(m, buffer); + seq_puts(m, "\n"); +} + +static int xive_irq_debug_show(struct seq_file *m, void *private) +{ + unsigned int i; + struct irq_desc *desc; + + for_each_irq_desc(i, desc) { + struct irq_data *d = irq_domain_get_irq_data(xive_irq_domain, i); + + if (d) + xive_debug_show_irq(m, d); + } + return 0; +} +DEFINE_SHOW_ATTRIBUTE(xive_irq_debug); + +static int xive_ipi_debug_show(struct seq_file *m, void *private) +{ + int cpu; + + if (xive_ops->debug_show) + xive_ops->debug_show(m, private); + + for_each_online_cpu(cpu) + xive_debug_show_ipi(m, cpu); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(xive_ipi_debug); + +static void xive_eq_debug_show_one(struct seq_file *m, struct xive_q *q, u8 prio) +{ + int i; + + seq_printf(m, "EQ%d idx=%d T=%d\n", prio, q->idx, q->toggle); + if (q->qpage) { + for (i = 0; i < q->msk + 1; i++) { + if (!(i % 8)) + seq_printf(m, "%05d ", i); + seq_printf(m, "%08x%s", be32_to_cpup(q->qpage + i), + (i + 1) % 8 ? " " : "\n"); + } + } + seq_puts(m, "\n"); +} + +static int xive_eq_debug_show(struct seq_file *m, void *private) +{ + int cpu = (long)m->private; + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + + if (xc) + xive_eq_debug_show_one(m, &xc->queue[xive_irq_priority], + xive_irq_priority); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(xive_eq_debug); + +static void xive_core_debugfs_create(void) +{ + struct dentry *xive_dir; + struct dentry *xive_eq_dir; + long cpu; + char name[16]; + + xive_dir = debugfs_create_dir("xive", arch_debugfs_dir); + if (IS_ERR(xive_dir)) + return; + + debugfs_create_file("ipis", 0400, xive_dir, + NULL, &xive_ipi_debug_fops); + debugfs_create_file("interrupts", 0400, xive_dir, + NULL, &xive_irq_debug_fops); + xive_eq_dir = debugfs_create_dir("eqs", xive_dir); + for_each_possible_cpu(cpu) { + snprintf(name, sizeof(name), "cpu%ld", cpu); + debugfs_create_file(name, 0400, xive_eq_dir, (void *)cpu, + &xive_eq_debug_fops); + } + debugfs_create_bool("store-eoi", 0600, xive_dir, &xive_store_eoi); + + if (xive_ops->debug_create) + xive_ops->debug_create(xive_dir); +} +#else +static inline void xive_core_debugfs_create(void) { } +#endif /* CONFIG_DEBUG_FS */ + +int xive_core_debug_init(void) +{ + if (xive_enabled() && IS_ENABLED(CONFIG_DEBUG_FS)) + xive_core_debugfs_create(); + + return 0; +} diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c new file mode 100644 index 0000000000..f1c0fa6ece --- /dev/null +++ b/arch/powerpc/sysdev/xive/native.c @@ -0,0 +1,877 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2016,2017 IBM Corporation. + */ + +#define pr_fmt(fmt) "xive: " fmt + +#include <linux/types.h> +#include <linux/irq.h> +#include <linux/debugfs.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/cpumask.h> +#include <linux/mm.h> +#include <linux/kmemleak.h> + +#include <asm/machdep.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/irq.h> +#include <asm/errno.h> +#include <asm/xive.h> +#include <asm/xive-regs.h> +#include <asm/opal.h> +#include <asm/kvm_ppc.h> + +#include "xive-internal.h" + + +static u32 xive_provision_size; +static u32 *xive_provision_chips; +static u32 xive_provision_chip_count; +static u32 xive_queue_shift; +static u32 xive_pool_vps = XIVE_INVALID_VP; +static struct kmem_cache *xive_provision_cache; +static bool xive_has_single_esc; +bool xive_has_save_restore; + +int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) +{ + __be64 flags, eoi_page, trig_page; + __be32 esb_shift, src_chip; + u64 opal_flags; + s64 rc; + + memset(data, 0, sizeof(*data)); + + rc = opal_xive_get_irq_info(hw_irq, &flags, &eoi_page, &trig_page, + &esb_shift, &src_chip); + if (rc) { + pr_err("opal_xive_get_irq_info(0x%x) returned %lld\n", + hw_irq, rc); + return -EINVAL; + } + + opal_flags = be64_to_cpu(flags); + if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI) + data->flags |= XIVE_IRQ_FLAG_STORE_EOI; + if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI2) + data->flags |= XIVE_IRQ_FLAG_STORE_EOI; + if (opal_flags & OPAL_XIVE_IRQ_LSI) + data->flags |= XIVE_IRQ_FLAG_LSI; + data->eoi_page = be64_to_cpu(eoi_page); + data->trig_page = be64_to_cpu(trig_page); + data->esb_shift = be32_to_cpu(esb_shift); + data->src_chip = be32_to_cpu(src_chip); + + data->eoi_mmio = ioremap(data->eoi_page, 1u << data->esb_shift); + if (!data->eoi_mmio) { + pr_err("Failed to map EOI page for irq 0x%x\n", hw_irq); + return -ENOMEM; + } + + data->hw_irq = hw_irq; + + if (!data->trig_page) + return 0; + if (data->trig_page == data->eoi_page) { + data->trig_mmio = data->eoi_mmio; + return 0; + } + + data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift); + if (!data->trig_mmio) { + pr_err("Failed to map trigger page for irq 0x%x\n", hw_irq); + return -ENOMEM; + } + return 0; +} +EXPORT_SYMBOL_GPL(xive_native_populate_irq_data); + +int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq) +{ + s64 rc; + + for (;;) { + rc = opal_xive_set_irq_config(hw_irq, target, prio, sw_irq); + if (rc != OPAL_BUSY) + break; + msleep(OPAL_BUSY_DELAY_MS); + } + return rc == 0 ? 0 : -ENXIO; +} +EXPORT_SYMBOL_GPL(xive_native_configure_irq); + +static int xive_native_get_irq_config(u32 hw_irq, u32 *target, u8 *prio, + u32 *sw_irq) +{ + s64 rc; + __be64 vp; + __be32 lirq; + + rc = opal_xive_get_irq_config(hw_irq, &vp, prio, &lirq); + + *target = be64_to_cpu(vp); + *sw_irq = be32_to_cpu(lirq); + + return rc == 0 ? 0 : -ENXIO; +} + +#define vp_err(vp, fmt, ...) pr_err("VP[0x%x]: " fmt, vp, ##__VA_ARGS__) + +/* This can be called multiple time to change a queue configuration */ +int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, + __be32 *qpage, u32 order, bool can_escalate) +{ + s64 rc = 0; + __be64 qeoi_page_be; + __be32 esc_irq_be; + u64 flags, qpage_phys; + + /* If there's an actual queue page, clean it */ + if (order) { + if (WARN_ON(!qpage)) + return -EINVAL; + qpage_phys = __pa(qpage); + } else + qpage_phys = 0; + + /* Initialize the rest of the fields */ + q->msk = order ? ((1u << (order - 2)) - 1) : 0; + q->idx = 0; + q->toggle = 0; + + rc = opal_xive_get_queue_info(vp_id, prio, NULL, NULL, + &qeoi_page_be, + &esc_irq_be, + NULL); + if (rc) { + vp_err(vp_id, "Failed to get queue %d info : %lld\n", prio, rc); + rc = -EIO; + goto fail; + } + q->eoi_phys = be64_to_cpu(qeoi_page_be); + + /* Default flags */ + flags = OPAL_XIVE_EQ_ALWAYS_NOTIFY | OPAL_XIVE_EQ_ENABLED; + + /* Escalation needed ? */ + if (can_escalate) { + q->esc_irq = be32_to_cpu(esc_irq_be); + flags |= OPAL_XIVE_EQ_ESCALATE; + } + + /* Configure and enable the queue in HW */ + for (;;) { + rc = opal_xive_set_queue_info(vp_id, prio, qpage_phys, order, flags); + if (rc != OPAL_BUSY) + break; + msleep(OPAL_BUSY_DELAY_MS); + } + if (rc) { + vp_err(vp_id, "Failed to set queue %d info: %lld\n", prio, rc); + rc = -EIO; + } else { + /* + * KVM code requires all of the above to be visible before + * q->qpage is set due to how it manages IPI EOIs + */ + wmb(); + q->qpage = qpage; + } +fail: + return rc; +} +EXPORT_SYMBOL_GPL(xive_native_configure_queue); + +static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio) +{ + s64 rc; + + /* Disable the queue in HW */ + for (;;) { + rc = opal_xive_set_queue_info(vp_id, prio, 0, 0, 0); + if (rc != OPAL_BUSY) + break; + msleep(OPAL_BUSY_DELAY_MS); + } + if (rc) + vp_err(vp_id, "Failed to disable queue %d : %lld\n", prio, rc); +} + +void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio) +{ + __xive_native_disable_queue(vp_id, q, prio); +} +EXPORT_SYMBOL_GPL(xive_native_disable_queue); + +static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio) +{ + struct xive_q *q = &xc->queue[prio]; + __be32 *qpage; + + qpage = xive_queue_page_alloc(cpu, xive_queue_shift); + if (IS_ERR(qpage)) + return PTR_ERR(qpage); + + return xive_native_configure_queue(get_hard_smp_processor_id(cpu), + q, prio, qpage, xive_queue_shift, false); +} + +static void xive_native_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio) +{ + struct xive_q *q = &xc->queue[prio]; + unsigned int alloc_order; + + /* + * We use the variant with no iounmap as this is called on exec + * from an IPI and iounmap isn't safe + */ + __xive_native_disable_queue(get_hard_smp_processor_id(cpu), q, prio); + alloc_order = xive_alloc_order(xive_queue_shift); + free_pages((unsigned long)q->qpage, alloc_order); + q->qpage = NULL; +} + +static bool xive_native_match(struct device_node *node) +{ + return of_device_is_compatible(node, "ibm,opal-xive-vc"); +} + +static s64 opal_xive_allocate_irq(u32 chip_id) +{ + s64 irq = opal_xive_allocate_irq_raw(chip_id); + + /* + * Old versions of skiboot can incorrectly return 0xffffffff to + * indicate no space, fix it up here. + */ + return irq == 0xffffffff ? OPAL_RESOURCE : irq; +} + +#ifdef CONFIG_SMP +static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + s64 irq; + + /* Allocate an IPI and populate info about it */ + for (;;) { + irq = opal_xive_allocate_irq(xc->chip_id); + if (irq == OPAL_BUSY) { + msleep(OPAL_BUSY_DELAY_MS); + continue; + } + if (irq < 0) { + pr_err("Failed to allocate IPI on CPU %d\n", cpu); + return -ENXIO; + } + xc->hw_ipi = irq; + break; + } + return 0; +} +#endif /* CONFIG_SMP */ + +u32 xive_native_alloc_irq_on_chip(u32 chip_id) +{ + s64 rc; + + for (;;) { + rc = opal_xive_allocate_irq(chip_id); + if (rc != OPAL_BUSY) + break; + msleep(OPAL_BUSY_DELAY_MS); + } + if (rc < 0) + return 0; + return rc; +} +EXPORT_SYMBOL_GPL(xive_native_alloc_irq_on_chip); + +void xive_native_free_irq(u32 irq) +{ + for (;;) { + s64 rc = opal_xive_free_irq(irq); + if (rc != OPAL_BUSY) + break; + msleep(OPAL_BUSY_DELAY_MS); + } +} +EXPORT_SYMBOL_GPL(xive_native_free_irq); + +#ifdef CONFIG_SMP +static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + s64 rc; + + /* Free the IPI */ + if (xc->hw_ipi == XIVE_BAD_IRQ) + return; + for (;;) { + rc = opal_xive_free_irq(xc->hw_ipi); + if (rc == OPAL_BUSY) { + msleep(OPAL_BUSY_DELAY_MS); + continue; + } + xc->hw_ipi = XIVE_BAD_IRQ; + break; + } +} +#endif /* CONFIG_SMP */ + +static void xive_native_shutdown(void) +{ + /* Switch the XIVE to emulation mode */ + opal_xive_reset(OPAL_XIVE_MODE_EMU); +} + +/* + * Perform an "ack" cycle on the current thread, thus + * grabbing the pending active priorities and updating + * the CPPR to the most favored one. + */ +static void xive_native_update_pending(struct xive_cpu *xc) +{ + u8 he, cppr; + u16 ack; + + /* Perform the acknowledge hypervisor to register cycle */ + ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_HV_REG)); + + /* Synchronize subsequent queue accesses */ + mb(); + + /* + * Grab the CPPR and the "HE" field which indicates the source + * of the hypervisor interrupt (if any) + */ + cppr = ack & 0xff; + he = (ack >> 8) >> 6; + switch(he) { + case TM_QW3_NSR_HE_NONE: /* Nothing to see here */ + break; + case TM_QW3_NSR_HE_PHYS: /* Physical thread interrupt */ + if (cppr == 0xff) + return; + /* Mark the priority pending */ + xc->pending_prio |= 1 << cppr; + + /* + * A new interrupt should never have a CPPR less favored + * than our current one. + */ + if (cppr >= xc->cppr) + pr_err("CPU %d odd ack CPPR, got %d at %d\n", + smp_processor_id(), cppr, xc->cppr); + + /* Update our idea of what the CPPR is */ + xc->cppr = cppr; + break; + case TM_QW3_NSR_HE_POOL: /* HV Pool interrupt (unused) */ + case TM_QW3_NSR_HE_LSI: /* Legacy FW LSI (unused) */ + pr_err("CPU %d got unexpected interrupt type HE=%d\n", + smp_processor_id(), he); + return; + } +} + +static void xive_native_prepare_cpu(unsigned int cpu, struct xive_cpu *xc) +{ + xc->chip_id = cpu_to_chip_id(cpu); +} + +static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc) +{ + s64 rc; + u32 vp; + __be64 vp_cam_be; + u64 vp_cam; + + if (xive_pool_vps == XIVE_INVALID_VP) + return; + + /* Check if pool VP already active, if it is, pull it */ + if (in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2) & TM_QW2W2_VP) + in_be64(xive_tima + TM_SPC_PULL_POOL_CTX); + + /* Enable the pool VP */ + vp = xive_pool_vps + cpu; + for (;;) { + rc = opal_xive_set_vp_info(vp, OPAL_XIVE_VP_ENABLED, 0); + if (rc != OPAL_BUSY) + break; + msleep(OPAL_BUSY_DELAY_MS); + } + if (rc) { + pr_err("Failed to enable pool VP on CPU %d\n", cpu); + return; + } + + /* Grab it's CAM value */ + rc = opal_xive_get_vp_info(vp, NULL, &vp_cam_be, NULL, NULL); + if (rc) { + pr_err("Failed to get pool VP info CPU %d\n", cpu); + return; + } + vp_cam = be64_to_cpu(vp_cam_be); + + /* Push it on the CPU (set LSMFB to 0xff to skip backlog scan) */ + out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD0, 0xff); + out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2, TM_QW2W2_VP | vp_cam); +} + +static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc) +{ + s64 rc; + u32 vp; + + if (xive_pool_vps == XIVE_INVALID_VP) + return; + + /* Pull the pool VP from the CPU */ + in_be64(xive_tima + TM_SPC_PULL_POOL_CTX); + + /* Disable it */ + vp = xive_pool_vps + cpu; + for (;;) { + rc = opal_xive_set_vp_info(vp, 0, 0); + if (rc != OPAL_BUSY) + break; + msleep(OPAL_BUSY_DELAY_MS); + } +} + +void xive_native_sync_source(u32 hw_irq) +{ + opal_xive_sync(XIVE_SYNC_EAS, hw_irq); +} +EXPORT_SYMBOL_GPL(xive_native_sync_source); + +void xive_native_sync_queue(u32 hw_irq) +{ + opal_xive_sync(XIVE_SYNC_QUEUE, hw_irq); +} +EXPORT_SYMBOL_GPL(xive_native_sync_queue); + +#ifdef CONFIG_DEBUG_FS +static int xive_native_debug_create(struct dentry *xive_dir) +{ + debugfs_create_bool("save-restore", 0600, xive_dir, &xive_has_save_restore); + return 0; +} +#endif + +static const struct xive_ops xive_native_ops = { + .populate_irq_data = xive_native_populate_irq_data, + .configure_irq = xive_native_configure_irq, + .get_irq_config = xive_native_get_irq_config, + .setup_queue = xive_native_setup_queue, + .cleanup_queue = xive_native_cleanup_queue, + .match = xive_native_match, + .shutdown = xive_native_shutdown, + .update_pending = xive_native_update_pending, + .prepare_cpu = xive_native_prepare_cpu, + .setup_cpu = xive_native_setup_cpu, + .teardown_cpu = xive_native_teardown_cpu, + .sync_source = xive_native_sync_source, +#ifdef CONFIG_SMP + .get_ipi = xive_native_get_ipi, + .put_ipi = xive_native_put_ipi, +#endif /* CONFIG_SMP */ +#ifdef CONFIG_DEBUG_FS + .debug_create = xive_native_debug_create, +#endif /* CONFIG_DEBUG_FS */ + .name = "native", +}; + +static bool __init xive_parse_provisioning(struct device_node *np) +{ + int rc; + + if (of_property_read_u32(np, "ibm,xive-provision-page-size", + &xive_provision_size) < 0) + return true; + rc = of_property_count_elems_of_size(np, "ibm,xive-provision-chips", 4); + if (rc < 0) { + pr_err("Error %d getting provision chips array\n", rc); + return false; + } + xive_provision_chip_count = rc; + if (rc == 0) + return true; + + xive_provision_chips = kcalloc(4, xive_provision_chip_count, + GFP_KERNEL); + if (WARN_ON(!xive_provision_chips)) + return false; + + rc = of_property_read_u32_array(np, "ibm,xive-provision-chips", + xive_provision_chips, + xive_provision_chip_count); + if (rc < 0) { + pr_err("Error %d reading provision chips array\n", rc); + return false; + } + + xive_provision_cache = kmem_cache_create("xive-provision", + xive_provision_size, + xive_provision_size, + 0, NULL); + if (!xive_provision_cache) { + pr_err("Failed to allocate provision cache\n"); + return false; + } + return true; +} + +static void __init xive_native_setup_pools(void) +{ + /* Allocate a pool big enough */ + pr_debug("Allocating VP block for pool size %u\n", nr_cpu_ids); + + xive_pool_vps = xive_native_alloc_vp_block(nr_cpu_ids); + if (WARN_ON(xive_pool_vps == XIVE_INVALID_VP)) + pr_err("Failed to allocate pool VP, KVM might not function\n"); + + pr_debug("Pool VPs allocated at 0x%x for %u max CPUs\n", + xive_pool_vps, nr_cpu_ids); +} + +u32 xive_native_default_eq_shift(void) +{ + return xive_queue_shift; +} +EXPORT_SYMBOL_GPL(xive_native_default_eq_shift); + +unsigned long xive_tima_os; +EXPORT_SYMBOL_GPL(xive_tima_os); + +bool __init xive_native_init(void) +{ + struct device_node *np; + struct resource r; + void __iomem *tima; + struct property *prop; + u8 max_prio = 7; + const __be32 *p; + u32 val, cpu; + s64 rc; + + if (xive_cmdline_disabled) + return false; + + pr_devel("xive_native_init()\n"); + np = of_find_compatible_node(NULL, NULL, "ibm,opal-xive-pe"); + if (!np) { + pr_devel("not found !\n"); + return false; + } + pr_devel("Found %pOF\n", np); + + /* Resource 1 is HV window */ + if (of_address_to_resource(np, 1, &r)) { + pr_err("Failed to get thread mgmnt area resource\n"); + goto err_put; + } + tima = ioremap(r.start, resource_size(&r)); + if (!tima) { + pr_err("Failed to map thread mgmnt area\n"); + goto err_put; + } + + /* Read number of priorities */ + if (of_property_read_u32(np, "ibm,xive-#priorities", &val) == 0) + max_prio = val - 1; + + /* Iterate the EQ sizes and pick one */ + of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, p, val) { + xive_queue_shift = val; + if (val == PAGE_SHIFT) + break; + } + + /* Do we support single escalation */ + xive_has_single_esc = of_property_read_bool(np, "single-escalation-support"); + + xive_has_save_restore = of_property_read_bool(np, "vp-save-restore"); + + /* Configure Thread Management areas for KVM */ + for_each_possible_cpu(cpu) + kvmppc_set_xive_tima(cpu, r.start, tima); + + /* Resource 2 is OS window */ + if (of_address_to_resource(np, 2, &r)) { + pr_err("Failed to get thread mgmnt area resource\n"); + goto err_put; + } + + xive_tima_os = r.start; + + /* Grab size of provisioning pages */ + xive_parse_provisioning(np); + + /* Switch the XIVE to exploitation mode */ + rc = opal_xive_reset(OPAL_XIVE_MODE_EXPL); + if (rc) { + pr_err("Switch to exploitation mode failed with error %lld\n", rc); + goto err_put; + } + + /* Setup some dummy HV pool VPs */ + xive_native_setup_pools(); + + /* Initialize XIVE core with our backend */ + if (!xive_core_init(np, &xive_native_ops, tima, TM_QW3_HV_PHYS, + max_prio)) { + opal_xive_reset(OPAL_XIVE_MODE_EMU); + goto err_put; + } + of_node_put(np); + pr_info("Using %dkB queues\n", 1 << (xive_queue_shift - 10)); + return true; + +err_put: + of_node_put(np); + return false; +} + +static bool xive_native_provision_pages(void) +{ + u32 i; + void *p; + + for (i = 0; i < xive_provision_chip_count; i++) { + u32 chip = xive_provision_chips[i]; + + /* + * XXX TODO: Try to make the allocation local to the node where + * the chip resides. + */ + p = kmem_cache_alloc(xive_provision_cache, GFP_KERNEL); + if (!p) { + pr_err("Failed to allocate provisioning page\n"); + return false; + } + kmemleak_ignore(p); + opal_xive_donate_page(chip, __pa(p)); + } + return true; +} + +u32 xive_native_alloc_vp_block(u32 max_vcpus) +{ + s64 rc; + u32 order; + + order = fls(max_vcpus) - 1; + if (max_vcpus > (1 << order)) + order++; + + pr_debug("VP block alloc, for max VCPUs %d use order %d\n", + max_vcpus, order); + + for (;;) { + rc = opal_xive_alloc_vp_block(order); + switch (rc) { + case OPAL_BUSY: + msleep(OPAL_BUSY_DELAY_MS); + break; + case OPAL_XIVE_PROVISIONING: + if (!xive_native_provision_pages()) + return XIVE_INVALID_VP; + break; + default: + if (rc < 0) { + pr_err("OPAL failed to allocate VCPUs order %d, err %lld\n", + order, rc); + return XIVE_INVALID_VP; + } + return rc; + } + } +} +EXPORT_SYMBOL_GPL(xive_native_alloc_vp_block); + +void xive_native_free_vp_block(u32 vp_base) +{ + s64 rc; + + if (vp_base == XIVE_INVALID_VP) + return; + + rc = opal_xive_free_vp_block(vp_base); + if (rc < 0) + pr_warn("OPAL error %lld freeing VP block\n", rc); +} +EXPORT_SYMBOL_GPL(xive_native_free_vp_block); + +int xive_native_enable_vp(u32 vp_id, bool single_escalation) +{ + s64 rc; + u64 flags = OPAL_XIVE_VP_ENABLED; + + if (single_escalation) + flags |= OPAL_XIVE_VP_SINGLE_ESCALATION; + for (;;) { + rc = opal_xive_set_vp_info(vp_id, flags, 0); + if (rc != OPAL_BUSY) + break; + msleep(OPAL_BUSY_DELAY_MS); + } + if (rc) + vp_err(vp_id, "Failed to enable VP : %lld\n", rc); + return rc ? -EIO : 0; +} +EXPORT_SYMBOL_GPL(xive_native_enable_vp); + +int xive_native_disable_vp(u32 vp_id) +{ + s64 rc; + + for (;;) { + rc = opal_xive_set_vp_info(vp_id, 0, 0); + if (rc != OPAL_BUSY) + break; + msleep(OPAL_BUSY_DELAY_MS); + } + if (rc) + vp_err(vp_id, "Failed to disable VP : %lld\n", rc); + return rc ? -EIO : 0; +} +EXPORT_SYMBOL_GPL(xive_native_disable_vp); + +int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id) +{ + __be64 vp_cam_be; + __be32 vp_chip_id_be; + s64 rc; + + rc = opal_xive_get_vp_info(vp_id, NULL, &vp_cam_be, NULL, &vp_chip_id_be); + if (rc) { + vp_err(vp_id, "Failed to get VP info : %lld\n", rc); + return -EIO; + } + *out_cam_id = be64_to_cpu(vp_cam_be) & 0xffffffffu; + *out_chip_id = be32_to_cpu(vp_chip_id_be); + + return 0; +} +EXPORT_SYMBOL_GPL(xive_native_get_vp_info); + +bool xive_native_has_single_escalation(void) +{ + return xive_has_single_esc; +} +EXPORT_SYMBOL_GPL(xive_native_has_single_escalation); + +bool xive_native_has_save_restore(void) +{ + return xive_has_save_restore; +} +EXPORT_SYMBOL_GPL(xive_native_has_save_restore); + +int xive_native_get_queue_info(u32 vp_id, u32 prio, + u64 *out_qpage, + u64 *out_qsize, + u64 *out_qeoi_page, + u32 *out_escalate_irq, + u64 *out_qflags) +{ + __be64 qpage; + __be64 qsize; + __be64 qeoi_page; + __be32 escalate_irq; + __be64 qflags; + s64 rc; + + rc = opal_xive_get_queue_info(vp_id, prio, &qpage, &qsize, + &qeoi_page, &escalate_irq, &qflags); + if (rc) { + vp_err(vp_id, "failed to get queue %d info : %lld\n", prio, rc); + return -EIO; + } + + if (out_qpage) + *out_qpage = be64_to_cpu(qpage); + if (out_qsize) + *out_qsize = be64_to_cpu(qsize); + if (out_qeoi_page) + *out_qeoi_page = be64_to_cpu(qeoi_page); + if (out_escalate_irq) + *out_escalate_irq = be32_to_cpu(escalate_irq); + if (out_qflags) + *out_qflags = be64_to_cpu(qflags); + + return 0; +} +EXPORT_SYMBOL_GPL(xive_native_get_queue_info); + +int xive_native_get_queue_state(u32 vp_id, u32 prio, u32 *qtoggle, u32 *qindex) +{ + __be32 opal_qtoggle; + __be32 opal_qindex; + s64 rc; + + rc = opal_xive_get_queue_state(vp_id, prio, &opal_qtoggle, + &opal_qindex); + if (rc) { + vp_err(vp_id, "failed to get queue %d state : %lld\n", prio, rc); + return -EIO; + } + + if (qtoggle) + *qtoggle = be32_to_cpu(opal_qtoggle); + if (qindex) + *qindex = be32_to_cpu(opal_qindex); + + return 0; +} +EXPORT_SYMBOL_GPL(xive_native_get_queue_state); + +int xive_native_set_queue_state(u32 vp_id, u32 prio, u32 qtoggle, u32 qindex) +{ + s64 rc; + + rc = opal_xive_set_queue_state(vp_id, prio, qtoggle, qindex); + if (rc) { + vp_err(vp_id, "failed to set queue %d state : %lld\n", prio, rc); + return -EIO; + } + + return 0; +} +EXPORT_SYMBOL_GPL(xive_native_set_queue_state); + +bool xive_native_has_queue_state_support(void) +{ + return opal_check_token(OPAL_XIVE_GET_QUEUE_STATE) && + opal_check_token(OPAL_XIVE_SET_QUEUE_STATE); +} +EXPORT_SYMBOL_GPL(xive_native_has_queue_state_support); + +int xive_native_get_vp_state(u32 vp_id, u64 *out_state) +{ + __be64 state; + s64 rc; + + rc = opal_xive_get_vp_state(vp_id, &state); + if (rc) { + vp_err(vp_id, "failed to get vp state : %lld\n", rc); + return -EIO; + } + + if (out_state) + *out_state = be64_to_cpu(state); + return 0; +} +EXPORT_SYMBOL_GPL(xive_native_get_vp_state); + +machine_arch_initcall(powernv, xive_core_debug_init); diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c new file mode 100644 index 0000000000..e454192643 --- /dev/null +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -0,0 +1,892 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2016,2017 IBM Corporation. + */ + +#define pr_fmt(fmt) "xive: " fmt + +#include <linux/types.h> +#include <linux/irq.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_fdt.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/bitmap.h> +#include <linux/cpumask.h> +#include <linux/mm.h> +#include <linux/delay.h> +#include <linux/libfdt.h> + +#include <asm/machdep.h> +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/irq.h> +#include <asm/errno.h> +#include <asm/xive.h> +#include <asm/xive-regs.h> +#include <asm/hvcall.h> +#include <asm/svm.h> +#include <asm/ultravisor.h> + +#include "xive-internal.h" + +static u32 xive_queue_shift; + +struct xive_irq_bitmap { + unsigned long *bitmap; + unsigned int base; + unsigned int count; + spinlock_t lock; + struct list_head list; +}; + +static LIST_HEAD(xive_irq_bitmaps); + +static int __init xive_irq_bitmap_add(int base, int count) +{ + struct xive_irq_bitmap *xibm; + + xibm = kzalloc(sizeof(*xibm), GFP_KERNEL); + if (!xibm) + return -ENOMEM; + + spin_lock_init(&xibm->lock); + xibm->base = base; + xibm->count = count; + xibm->bitmap = bitmap_zalloc(xibm->count, GFP_KERNEL); + if (!xibm->bitmap) { + kfree(xibm); + return -ENOMEM; + } + list_add(&xibm->list, &xive_irq_bitmaps); + + pr_info("Using IRQ range [%x-%x]", xibm->base, + xibm->base + xibm->count - 1); + return 0; +} + +static void xive_irq_bitmap_remove_all(void) +{ + struct xive_irq_bitmap *xibm, *tmp; + + list_for_each_entry_safe(xibm, tmp, &xive_irq_bitmaps, list) { + list_del(&xibm->list); + bitmap_free(xibm->bitmap); + kfree(xibm); + } +} + +static int __xive_irq_bitmap_alloc(struct xive_irq_bitmap *xibm) +{ + int irq; + + irq = find_first_zero_bit(xibm->bitmap, xibm->count); + if (irq != xibm->count) { + set_bit(irq, xibm->bitmap); + irq += xibm->base; + } else { + irq = -ENOMEM; + } + + return irq; +} + +static int xive_irq_bitmap_alloc(void) +{ + struct xive_irq_bitmap *xibm; + unsigned long flags; + int irq = -ENOENT; + + list_for_each_entry(xibm, &xive_irq_bitmaps, list) { + spin_lock_irqsave(&xibm->lock, flags); + irq = __xive_irq_bitmap_alloc(xibm); + spin_unlock_irqrestore(&xibm->lock, flags); + if (irq >= 0) + break; + } + return irq; +} + +static void xive_irq_bitmap_free(int irq) +{ + unsigned long flags; + struct xive_irq_bitmap *xibm; + + list_for_each_entry(xibm, &xive_irq_bitmaps, list) { + if ((irq >= xibm->base) && (irq < xibm->base + xibm->count)) { + spin_lock_irqsave(&xibm->lock, flags); + clear_bit(irq - xibm->base, xibm->bitmap); + spin_unlock_irqrestore(&xibm->lock, flags); + break; + } + } +} + + +/* Based on the similar routines in RTAS */ +static unsigned int plpar_busy_delay_time(long rc) +{ + unsigned int ms = 0; + + if (H_IS_LONG_BUSY(rc)) { + ms = get_longbusy_msecs(rc); + } else if (rc == H_BUSY) { + ms = 10; /* seems appropriate for XIVE hcalls */ + } + + return ms; +} + +static unsigned int plpar_busy_delay(int rc) +{ + unsigned int ms; + + ms = plpar_busy_delay_time(rc); + if (ms) + mdelay(ms); + + return ms; +} + +/* + * Note: this call has a partition wide scope and can take a while to + * complete. If it returns H_LONG_BUSY_* it should be retried + * periodically. + */ +static long plpar_int_reset(unsigned long flags) +{ + long rc; + + do { + rc = plpar_hcall_norets(H_INT_RESET, flags); + } while (plpar_busy_delay(rc)); + + if (rc) + pr_err("H_INT_RESET failed %ld\n", rc); + + return rc; +} + +static long plpar_int_get_source_info(unsigned long flags, + unsigned long lisn, + unsigned long *src_flags, + unsigned long *eoi_page, + unsigned long *trig_page, + unsigned long *esb_shift) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; + + do { + rc = plpar_hcall(H_INT_GET_SOURCE_INFO, retbuf, flags, lisn); + } while (plpar_busy_delay(rc)); + + if (rc) { + pr_err("H_INT_GET_SOURCE_INFO lisn=0x%lx failed %ld\n", lisn, rc); + return rc; + } + + *src_flags = retbuf[0]; + *eoi_page = retbuf[1]; + *trig_page = retbuf[2]; + *esb_shift = retbuf[3]; + + pr_debug("H_INT_GET_SOURCE_INFO lisn=0x%lx flags=0x%lx eoi=0x%lx trig=0x%lx shift=0x%lx\n", + lisn, retbuf[0], retbuf[1], retbuf[2], retbuf[3]); + + return 0; +} + +#define XIVE_SRC_SET_EISN (1ull << (63 - 62)) +#define XIVE_SRC_MASK (1ull << (63 - 63)) /* unused */ + +static long plpar_int_set_source_config(unsigned long flags, + unsigned long lisn, + unsigned long target, + unsigned long prio, + unsigned long sw_irq) +{ + long rc; + + + pr_debug("H_INT_SET_SOURCE_CONFIG flags=0x%lx lisn=0x%lx target=%ld prio=%ld sw_irq=%ld\n", + flags, lisn, target, prio, sw_irq); + + + do { + rc = plpar_hcall_norets(H_INT_SET_SOURCE_CONFIG, flags, lisn, + target, prio, sw_irq); + } while (plpar_busy_delay(rc)); + + if (rc) { + pr_err("H_INT_SET_SOURCE_CONFIG lisn=0x%lx target=%ld prio=%ld failed %ld\n", + lisn, target, prio, rc); + return rc; + } + + return 0; +} + +static long plpar_int_get_source_config(unsigned long flags, + unsigned long lisn, + unsigned long *target, + unsigned long *prio, + unsigned long *sw_irq) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; + + pr_debug("H_INT_GET_SOURCE_CONFIG flags=0x%lx lisn=0x%lx\n", flags, lisn); + + do { + rc = plpar_hcall(H_INT_GET_SOURCE_CONFIG, retbuf, flags, lisn, + target, prio, sw_irq); + } while (plpar_busy_delay(rc)); + + if (rc) { + pr_err("H_INT_GET_SOURCE_CONFIG lisn=0x%lx failed %ld\n", + lisn, rc); + return rc; + } + + *target = retbuf[0]; + *prio = retbuf[1]; + *sw_irq = retbuf[2]; + + pr_debug("H_INT_GET_SOURCE_CONFIG target=%ld prio=%ld sw_irq=%ld\n", + retbuf[0], retbuf[1], retbuf[2]); + + return 0; +} + +static long plpar_int_get_queue_info(unsigned long flags, + unsigned long target, + unsigned long priority, + unsigned long *esn_page, + unsigned long *esn_size) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; + + do { + rc = plpar_hcall(H_INT_GET_QUEUE_INFO, retbuf, flags, target, + priority); + } while (plpar_busy_delay(rc)); + + if (rc) { + pr_err("H_INT_GET_QUEUE_INFO cpu=%ld prio=%ld failed %ld\n", + target, priority, rc); + return rc; + } + + *esn_page = retbuf[0]; + *esn_size = retbuf[1]; + + pr_debug("H_INT_GET_QUEUE_INFO cpu=%ld prio=%ld page=0x%lx size=0x%lx\n", + target, priority, retbuf[0], retbuf[1]); + + return 0; +} + +#define XIVE_EQ_ALWAYS_NOTIFY (1ull << (63 - 63)) + +static long plpar_int_set_queue_config(unsigned long flags, + unsigned long target, + unsigned long priority, + unsigned long qpage, + unsigned long qsize) +{ + long rc; + + pr_debug("H_INT_SET_QUEUE_CONFIG flags=0x%lx target=%ld priority=0x%lx qpage=0x%lx qsize=0x%lx\n", + flags, target, priority, qpage, qsize); + + do { + rc = plpar_hcall_norets(H_INT_SET_QUEUE_CONFIG, flags, target, + priority, qpage, qsize); + } while (plpar_busy_delay(rc)); + + if (rc) { + pr_err("H_INT_SET_QUEUE_CONFIG cpu=%ld prio=%ld qpage=0x%lx returned %ld\n", + target, priority, qpage, rc); + return rc; + } + + return 0; +} + +static long plpar_int_sync(unsigned long flags, unsigned long lisn) +{ + long rc; + + do { + rc = plpar_hcall_norets(H_INT_SYNC, flags, lisn); + } while (plpar_busy_delay(rc)); + + if (rc) { + pr_err("H_INT_SYNC lisn=0x%lx returned %ld\n", lisn, rc); + return rc; + } + + return 0; +} + +#define XIVE_ESB_FLAG_STORE (1ull << (63 - 63)) + +static long plpar_int_esb(unsigned long flags, + unsigned long lisn, + unsigned long offset, + unsigned long in_data, + unsigned long *out_data) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; + + pr_debug("H_INT_ESB flags=0x%lx lisn=0x%lx offset=0x%lx in=0x%lx\n", + flags, lisn, offset, in_data); + + do { + rc = plpar_hcall(H_INT_ESB, retbuf, flags, lisn, offset, + in_data); + } while (plpar_busy_delay(rc)); + + if (rc) { + pr_err("H_INT_ESB lisn=0x%lx offset=0x%lx returned %ld\n", + lisn, offset, rc); + return rc; + } + + *out_data = retbuf[0]; + + return 0; +} + +static u64 xive_spapr_esb_rw(u32 lisn, u32 offset, u64 data, bool write) +{ + unsigned long read_data; + long rc; + + rc = plpar_int_esb(write ? XIVE_ESB_FLAG_STORE : 0, + lisn, offset, data, &read_data); + if (rc) + return -1; + + return write ? 0 : read_data; +} + +#define XIVE_SRC_H_INT_ESB (1ull << (63 - 60)) +#define XIVE_SRC_LSI (1ull << (63 - 61)) +#define XIVE_SRC_TRIGGER (1ull << (63 - 62)) +#define XIVE_SRC_STORE_EOI (1ull << (63 - 63)) + +static int xive_spapr_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) +{ + long rc; + unsigned long flags; + unsigned long eoi_page; + unsigned long trig_page; + unsigned long esb_shift; + + memset(data, 0, sizeof(*data)); + + rc = plpar_int_get_source_info(0, hw_irq, &flags, &eoi_page, &trig_page, + &esb_shift); + if (rc) + return -EINVAL; + + if (flags & XIVE_SRC_H_INT_ESB) + data->flags |= XIVE_IRQ_FLAG_H_INT_ESB; + if (flags & XIVE_SRC_STORE_EOI) + data->flags |= XIVE_IRQ_FLAG_STORE_EOI; + if (flags & XIVE_SRC_LSI) + data->flags |= XIVE_IRQ_FLAG_LSI; + data->eoi_page = eoi_page; + data->esb_shift = esb_shift; + data->trig_page = trig_page; + + data->hw_irq = hw_irq; + + /* + * No chip-id for the sPAPR backend. This has an impact how we + * pick a target. See xive_pick_irq_target(). + */ + data->src_chip = XIVE_INVALID_CHIP_ID; + + /* + * When the H_INT_ESB flag is set, the H_INT_ESB hcall should + * be used for interrupt management. Skip the remapping of the + * ESB pages which are not available. + */ + if (data->flags & XIVE_IRQ_FLAG_H_INT_ESB) + return 0; + + data->eoi_mmio = ioremap(data->eoi_page, 1u << data->esb_shift); + if (!data->eoi_mmio) { + pr_err("Failed to map EOI page for irq 0x%x\n", hw_irq); + return -ENOMEM; + } + + /* Full function page supports trigger */ + if (flags & XIVE_SRC_TRIGGER) { + data->trig_mmio = data->eoi_mmio; + return 0; + } + + data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift); + if (!data->trig_mmio) { + iounmap(data->eoi_mmio); + pr_err("Failed to map trigger page for irq 0x%x\n", hw_irq); + return -ENOMEM; + } + return 0; +} + +static int xive_spapr_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq) +{ + long rc; + + rc = plpar_int_set_source_config(XIVE_SRC_SET_EISN, hw_irq, target, + prio, sw_irq); + + return rc == 0 ? 0 : -ENXIO; +} + +static int xive_spapr_get_irq_config(u32 hw_irq, u32 *target, u8 *prio, + u32 *sw_irq) +{ + long rc; + unsigned long h_target; + unsigned long h_prio; + unsigned long h_sw_irq; + + rc = plpar_int_get_source_config(0, hw_irq, &h_target, &h_prio, + &h_sw_irq); + + *target = h_target; + *prio = h_prio; + *sw_irq = h_sw_irq; + + return rc == 0 ? 0 : -ENXIO; +} + +/* This can be called multiple time to change a queue configuration */ +static int xive_spapr_configure_queue(u32 target, struct xive_q *q, u8 prio, + __be32 *qpage, u32 order) +{ + s64 rc = 0; + unsigned long esn_page; + unsigned long esn_size; + u64 flags, qpage_phys; + + /* If there's an actual queue page, clean it */ + if (order) { + if (WARN_ON(!qpage)) + return -EINVAL; + qpage_phys = __pa(qpage); + } else { + qpage_phys = 0; + } + + /* Initialize the rest of the fields */ + q->msk = order ? ((1u << (order - 2)) - 1) : 0; + q->idx = 0; + q->toggle = 0; + + rc = plpar_int_get_queue_info(0, target, prio, &esn_page, &esn_size); + if (rc) { + pr_err("Error %lld getting queue info CPU %d prio %d\n", rc, + target, prio); + rc = -EIO; + goto fail; + } + + /* TODO: add support for the notification page */ + q->eoi_phys = esn_page; + + /* Default is to always notify */ + flags = XIVE_EQ_ALWAYS_NOTIFY; + + /* Configure and enable the queue in HW */ + rc = plpar_int_set_queue_config(flags, target, prio, qpage_phys, order); + if (rc) { + pr_err("Error %lld setting queue for CPU %d prio %d\n", rc, + target, prio); + rc = -EIO; + } else { + q->qpage = qpage; + if (is_secure_guest()) + uv_share_page(PHYS_PFN(qpage_phys), + 1 << xive_alloc_order(order)); + } +fail: + return rc; +} + +static int xive_spapr_setup_queue(unsigned int cpu, struct xive_cpu *xc, + u8 prio) +{ + struct xive_q *q = &xc->queue[prio]; + __be32 *qpage; + + qpage = xive_queue_page_alloc(cpu, xive_queue_shift); + if (IS_ERR(qpage)) + return PTR_ERR(qpage); + + return xive_spapr_configure_queue(get_hard_smp_processor_id(cpu), + q, prio, qpage, xive_queue_shift); +} + +static void xive_spapr_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, + u8 prio) +{ + struct xive_q *q = &xc->queue[prio]; + unsigned int alloc_order; + long rc; + int hw_cpu = get_hard_smp_processor_id(cpu); + + rc = plpar_int_set_queue_config(0, hw_cpu, prio, 0, 0); + if (rc) + pr_err("Error %ld setting queue for CPU %d prio %d\n", rc, + hw_cpu, prio); + + alloc_order = xive_alloc_order(xive_queue_shift); + if (is_secure_guest()) + uv_unshare_page(PHYS_PFN(__pa(q->qpage)), 1 << alloc_order); + free_pages((unsigned long)q->qpage, alloc_order); + q->qpage = NULL; +} + +static bool xive_spapr_match(struct device_node *node) +{ + /* Ignore cascaded controllers for the moment */ + return true; +} + +#ifdef CONFIG_SMP +static int xive_spapr_get_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + int irq = xive_irq_bitmap_alloc(); + + if (irq < 0) { + pr_err("Failed to allocate IPI on CPU %d\n", cpu); + return -ENXIO; + } + + xc->hw_ipi = irq; + return 0; +} + +static void xive_spapr_put_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + if (xc->hw_ipi == XIVE_BAD_IRQ) + return; + + xive_irq_bitmap_free(xc->hw_ipi); + xc->hw_ipi = XIVE_BAD_IRQ; +} +#endif /* CONFIG_SMP */ + +static void xive_spapr_shutdown(void) +{ + plpar_int_reset(0); +} + +/* + * Perform an "ack" cycle on the current thread. Grab the pending + * active priorities and update the CPPR to the most favored one. + */ +static void xive_spapr_update_pending(struct xive_cpu *xc) +{ + u8 nsr, cppr; + u16 ack; + + /* + * Perform the "Acknowledge O/S to Register" cycle. + * + * Let's speedup the access to the TIMA using the raw I/O + * accessor as we don't need the synchronisation routine of + * the higher level ones + */ + ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_OS_REG)); + + /* Synchronize subsequent queue accesses */ + mb(); + + /* + * Grab the CPPR and the "NSR" field which indicates the source + * of the interrupt (if any) + */ + cppr = ack & 0xff; + nsr = ack >> 8; + + if (nsr & TM_QW1_NSR_EO) { + if (cppr == 0xff) + return; + /* Mark the priority pending */ + xc->pending_prio |= 1 << cppr; + + /* + * A new interrupt should never have a CPPR less favored + * than our current one. + */ + if (cppr >= xc->cppr) + pr_err("CPU %d odd ack CPPR, got %d at %d\n", + smp_processor_id(), cppr, xc->cppr); + + /* Update our idea of what the CPPR is */ + xc->cppr = cppr; + } +} + +static void xive_spapr_setup_cpu(unsigned int cpu, struct xive_cpu *xc) +{ + /* Only some debug on the TIMA settings */ + pr_debug("(HW value: %08x %08x %08x)\n", + in_be32(xive_tima + TM_QW1_OS + TM_WORD0), + in_be32(xive_tima + TM_QW1_OS + TM_WORD1), + in_be32(xive_tima + TM_QW1_OS + TM_WORD2)); +} + +static void xive_spapr_teardown_cpu(unsigned int cpu, struct xive_cpu *xc) +{ + /* Nothing to do */; +} + +static void xive_spapr_sync_source(u32 hw_irq) +{ + /* Specs are unclear on what this is doing */ + plpar_int_sync(0, hw_irq); +} + +static int xive_spapr_debug_show(struct seq_file *m, void *private) +{ + struct xive_irq_bitmap *xibm; + char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + + if (!buf) + return -ENOMEM; + + list_for_each_entry(xibm, &xive_irq_bitmaps, list) { + memset(buf, 0, PAGE_SIZE); + bitmap_print_to_pagebuf(true, buf, xibm->bitmap, xibm->count); + seq_printf(m, "bitmap #%d: %s", xibm->count, buf); + } + kfree(buf); + + return 0; +} + +static const struct xive_ops xive_spapr_ops = { + .populate_irq_data = xive_spapr_populate_irq_data, + .configure_irq = xive_spapr_configure_irq, + .get_irq_config = xive_spapr_get_irq_config, + .setup_queue = xive_spapr_setup_queue, + .cleanup_queue = xive_spapr_cleanup_queue, + .match = xive_spapr_match, + .shutdown = xive_spapr_shutdown, + .update_pending = xive_spapr_update_pending, + .setup_cpu = xive_spapr_setup_cpu, + .teardown_cpu = xive_spapr_teardown_cpu, + .sync_source = xive_spapr_sync_source, + .esb_rw = xive_spapr_esb_rw, +#ifdef CONFIG_SMP + .get_ipi = xive_spapr_get_ipi, + .put_ipi = xive_spapr_put_ipi, + .debug_show = xive_spapr_debug_show, +#endif /* CONFIG_SMP */ + .name = "spapr", +}; + +/* + * get max priority from "/ibm,plat-res-int-priorities" + */ +static bool __init xive_get_max_prio(u8 *max_prio) +{ + struct device_node *rootdn; + const __be32 *reg; + u32 len; + int prio, found; + + rootdn = of_find_node_by_path("/"); + if (!rootdn) { + pr_err("not root node found !\n"); + return false; + } + + reg = of_get_property(rootdn, "ibm,plat-res-int-priorities", &len); + of_node_put(rootdn); + if (!reg) { + pr_err("Failed to read 'ibm,plat-res-int-priorities' property\n"); + return false; + } + + if (len % (2 * sizeof(u32)) != 0) { + pr_err("invalid 'ibm,plat-res-int-priorities' property\n"); + return false; + } + + /* HW supports priorities in the range [0-7] and 0xFF is a + * wildcard priority used to mask. We scan the ranges reserved + * by the hypervisor to find the lowest priority we can use. + */ + found = 0xFF; + for (prio = 0; prio < 8; prio++) { + int reserved = 0; + int i; + + for (i = 0; i < len / (2 * sizeof(u32)); i++) { + int base = be32_to_cpu(reg[2 * i]); + int range = be32_to_cpu(reg[2 * i + 1]); + + if (prio >= base && prio < base + range) + reserved++; + } + + if (!reserved) + found = prio; + } + + if (found == 0xFF) { + pr_err("no valid priority found in 'ibm,plat-res-int-priorities'\n"); + return false; + } + + *max_prio = found; + return true; +} + +static const u8 *__init get_vec5_feature(unsigned int index) +{ + unsigned long root, chosen; + int size; + const u8 *vec5; + + root = of_get_flat_dt_root(); + chosen = of_get_flat_dt_subnode_by_name(root, "chosen"); + if (chosen == -FDT_ERR_NOTFOUND) + return NULL; + + vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size); + if (!vec5) + return NULL; + + if (size <= index) + return NULL; + + return vec5 + index; +} + +static bool __init xive_spapr_disabled(void) +{ + const u8 *vec5_xive; + + vec5_xive = get_vec5_feature(OV5_INDX(OV5_XIVE_SUPPORT)); + if (vec5_xive) { + u8 val; + + val = *vec5_xive & OV5_FEAT(OV5_XIVE_SUPPORT); + switch (val) { + case OV5_FEAT(OV5_XIVE_EITHER): + case OV5_FEAT(OV5_XIVE_LEGACY): + break; + case OV5_FEAT(OV5_XIVE_EXPLOIT): + /* Hypervisor only supports XIVE */ + if (xive_cmdline_disabled) + pr_warn("WARNING: Ignoring cmdline option xive=off\n"); + return false; + default: + pr_warn("%s: Unknown xive support option: 0x%x\n", + __func__, val); + break; + } + } + + return xive_cmdline_disabled; +} + +bool __init xive_spapr_init(void) +{ + struct device_node *np; + struct resource r; + void __iomem *tima; + struct property *prop; + u8 max_prio; + u32 val; + u32 len; + const __be32 *reg; + int i, err; + + if (xive_spapr_disabled()) + return false; + + pr_devel("%s()\n", __func__); + np = of_find_compatible_node(NULL, NULL, "ibm,power-ivpe"); + if (!np) { + pr_devel("not found !\n"); + return false; + } + pr_devel("Found %s\n", np->full_name); + + /* Resource 1 is the OS ring TIMA */ + if (of_address_to_resource(np, 1, &r)) { + pr_err("Failed to get thread mgmnt area resource\n"); + goto err_put; + } + tima = ioremap(r.start, resource_size(&r)); + if (!tima) { + pr_err("Failed to map thread mgmnt area\n"); + goto err_put; + } + + if (!xive_get_max_prio(&max_prio)) + goto err_unmap; + + /* Feed the IRQ number allocator with the ranges given in the DT */ + reg = of_get_property(np, "ibm,xive-lisn-ranges", &len); + if (!reg) { + pr_err("Failed to read 'ibm,xive-lisn-ranges' property\n"); + goto err_unmap; + } + + if (len % (2 * sizeof(u32)) != 0) { + pr_err("invalid 'ibm,xive-lisn-ranges' property\n"); + goto err_unmap; + } + + for (i = 0; i < len / (2 * sizeof(u32)); i++, reg += 2) { + err = xive_irq_bitmap_add(be32_to_cpu(reg[0]), + be32_to_cpu(reg[1])); + if (err < 0) + goto err_mem_free; + } + + /* Iterate the EQ sizes and pick one */ + of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, reg, val) { + xive_queue_shift = val; + if (val == PAGE_SHIFT) + break; + } + + /* Initialize XIVE core with our backend */ + if (!xive_core_init(np, &xive_spapr_ops, tima, TM_QW1_OS, max_prio)) + goto err_mem_free; + + of_node_put(np); + pr_info("Using %dkB queues\n", 1 << (xive_queue_shift - 10)); + return true; + +err_mem_free: + xive_irq_bitmap_remove_all(); +err_unmap: + iounmap(tima); +err_put: + of_node_put(np); + return false; +} + +machine_arch_initcall(pseries, xive_core_debug_init); diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h new file mode 100644 index 0000000000..fe6d95d54a --- /dev/null +++ b/arch/powerpc/sysdev/xive/xive-internal.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2016,2017 IBM Corporation. + */ +#ifndef __XIVE_INTERNAL_H +#define __XIVE_INTERNAL_H + +/* + * A "disabled" interrupt should never fire, to catch problems + * we set its logical number to this + */ +#define XIVE_BAD_IRQ 0x7fffffff +#define XIVE_MAX_IRQ (XIVE_BAD_IRQ - 1) + +/* Each CPU carry one of these with various per-CPU state */ +struct xive_cpu { +#ifdef CONFIG_SMP + /* HW irq number and data of IPI */ + u32 hw_ipi; + struct xive_irq_data ipi_data; +#endif /* CONFIG_SMP */ + + int chip_id; + + /* Queue datas. Only one is populated */ +#define XIVE_MAX_QUEUES 8 + struct xive_q queue[XIVE_MAX_QUEUES]; + + /* + * Pending mask. Each bit corresponds to a priority that + * potentially has pending interrupts. + */ + u8 pending_prio; + + /* Cache of HW CPPR */ + u8 cppr; +}; + +/* Backend ops */ +struct xive_ops { + int (*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data); + int (*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); + int (*get_irq_config)(u32 hw_irq, u32 *target, u8 *prio, + u32 *sw_irq); + int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); + void (*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); + void (*prepare_cpu)(unsigned int cpu, struct xive_cpu *xc); + void (*setup_cpu)(unsigned int cpu, struct xive_cpu *xc); + void (*teardown_cpu)(unsigned int cpu, struct xive_cpu *xc); + bool (*match)(struct device_node *np); + void (*shutdown)(void); + + void (*update_pending)(struct xive_cpu *xc); + void (*sync_source)(u32 hw_irq); + u64 (*esb_rw)(u32 hw_irq, u32 offset, u64 data, bool write); +#ifdef CONFIG_SMP + int (*get_ipi)(unsigned int cpu, struct xive_cpu *xc); + void (*put_ipi)(unsigned int cpu, struct xive_cpu *xc); +#endif + int (*debug_show)(struct seq_file *m, void *private); + int (*debug_create)(struct dentry *xive_dir); + const char *name; +}; + +bool xive_core_init(struct device_node *np, const struct xive_ops *ops, + void __iomem *area, u32 offset, u8 max_prio); +__be32 *xive_queue_page_alloc(unsigned int cpu, u32 queue_shift); +int xive_core_debug_init(void); + +static inline u32 xive_alloc_order(u32 queue_shift) +{ + return (queue_shift > PAGE_SHIFT) ? (queue_shift - PAGE_SHIFT) : 0; +} + +extern bool xive_cmdline_disabled; +extern bool xive_has_save_restore; + +#endif /* __XIVE_INTERNAL_H */ |