diff options
Diffstat (limited to 'arch/sparc/kernel/smp_64.c')
-rw-r--r-- | arch/sparc/kernel/smp_64.c | 1568 |
1 files changed, 1568 insertions, 0 deletions
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c new file mode 100644 index 000000000..a55295d1b --- /dev/null +++ b/arch/sparc/kernel/smp_64.c @@ -0,0 +1,1568 @@ +// SPDX-License-Identifier: GPL-2.0 +/* smp.c: Sparc64 SMP support. + * + * Copyright (C) 1997, 2007, 2008 David S. Miller (davem@davemloft.net) + */ + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/sched/mm.h> +#include <linux/sched/hotplug.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/threads.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/cache.h> +#include <linux/jiffies.h> +#include <linux/profile.h> +#include <linux/memblock.h> +#include <linux/vmalloc.h> +#include <linux/ftrace.h> +#include <linux/cpu.h> +#include <linux/slab.h> +#include <linux/kgdb.h> + +#include <asm/head.h> +#include <asm/ptrace.h> +#include <linux/atomic.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/cpudata.h> +#include <asm/hvtramp.h> +#include <asm/io.h> +#include <asm/timer.h> +#include <asm/setup.h> + +#include <asm/irq.h> +#include <asm/irq_regs.h> +#include <asm/page.h> +#include <asm/oplib.h> +#include <linux/uaccess.h> +#include <asm/starfire.h> +#include <asm/tlb.h> +#include <asm/pgalloc.h> +#include <asm/sections.h> +#include <asm/prom.h> +#include <asm/mdesc.h> +#include <asm/ldc.h> +#include <asm/hypervisor.h> +#include <asm/pcr.h> + +#include "cpumap.h" +#include "kernel.h" + +DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE; +cpumask_t cpu_core_map[NR_CPUS] __read_mostly = + { [0 ... NR_CPUS-1] = CPU_MASK_NONE }; + +cpumask_t cpu_core_sib_map[NR_CPUS] __read_mostly = { + [0 ... NR_CPUS-1] = CPU_MASK_NONE }; + +cpumask_t cpu_core_sib_cache_map[NR_CPUS] __read_mostly = { + [0 ... 
NR_CPUS - 1] = CPU_MASK_NONE }; + +EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); +EXPORT_SYMBOL(cpu_core_map); +EXPORT_SYMBOL(cpu_core_sib_map); +EXPORT_SYMBOL(cpu_core_sib_cache_map); + +static cpumask_t smp_commenced_mask; + +static DEFINE_PER_CPU(bool, poke); +static bool cpu_poke; + +void smp_info(struct seq_file *m) +{ + int i; + + seq_printf(m, "State:\n"); + for_each_online_cpu(i) + seq_printf(m, "CPU%d:\t\tonline\n", i); +} + +void smp_bogo(struct seq_file *m) +{ + int i; + + for_each_online_cpu(i) + seq_printf(m, + "Cpu%dClkTck\t: %016lx\n", + i, cpu_data(i).clock_tick); +} + +extern void setup_sparc64_timer(void); + +static volatile unsigned long callin_flag = 0; + +void smp_callin(void) +{ + int cpuid = hard_smp_processor_id(); + + __local_per_cpu_offset = __per_cpu_offset(cpuid); + + if (tlb_type == hypervisor) + sun4v_ktsb_register(); + + __flush_tlb_all(); + + setup_sparc64_timer(); + + if (cheetah_pcache_forced_on) + cheetah_enable_pcache(); + + callin_flag = 1; + __asm__ __volatile__("membar #Sync\n\t" + "flush %%g6" : : : "memory"); + + /* Clear this or we will die instantly when we + * schedule back to this idler... + */ + current_thread_info()->new_child = 0; + + /* Attach to the address space of init_task. */ + mmgrab(&init_mm); + current->active_mm = &init_mm; + + /* inform the notifiers about the new cpu */ + notify_cpu_starting(cpuid); + + while (!cpumask_test_cpu(cpuid, &smp_commenced_mask)) + rmb(); + + set_cpu_online(cpuid, true); + + local_irq_enable(); + + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); +} + +void cpu_panic(void) +{ + printk("CPU[%d]: Returns from cpu_idle!\n", smp_processor_id()); + panic("SMP bolixed\n"); +} + +/* This tick register synchronization scheme is taken entirely from + * the ia64 port, see arch/ia64/kernel/smpboot.c for details and credit. + * + * The only change I've made is to rework it so that the master + * initiates the synchonization instead of the slave. 
-DaveM + */ + +#define MASTER 0 +#define SLAVE (SMP_CACHE_BYTES/sizeof(unsigned long)) + +#define NUM_ROUNDS 64 /* magic value */ +#define NUM_ITERS 5 /* likewise */ + +static DEFINE_RAW_SPINLOCK(itc_sync_lock); +static unsigned long go[SLAVE + 1]; + +#define DEBUG_TICK_SYNC 0 + +static inline long get_delta (long *rt, long *master) +{ + unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; + unsigned long tcenter, t0, t1, tm; + unsigned long i; + + for (i = 0; i < NUM_ITERS; i++) { + t0 = tick_ops->get_tick(); + go[MASTER] = 1; + membar_safe("#StoreLoad"); + while (!(tm = go[SLAVE])) + rmb(); + go[SLAVE] = 0; + wmb(); + t1 = tick_ops->get_tick(); + + if (t1 - t0 < best_t1 - best_t0) + best_t0 = t0, best_t1 = t1, best_tm = tm; + } + + *rt = best_t1 - best_t0; + *master = best_tm - best_t0; + + /* average best_t0 and best_t1 without overflow: */ + tcenter = (best_t0/2 + best_t1/2); + if (best_t0 % 2 + best_t1 % 2 == 2) + tcenter++; + return tcenter - best_tm; +} + +void smp_synchronize_tick_client(void) +{ + long i, delta, adj, adjust_latency = 0, done = 0; + unsigned long flags, rt, master_time_stamp; +#if DEBUG_TICK_SYNC + struct { + long rt; /* roundtrip time */ + long master; /* master's timestamp */ + long diff; /* difference between midpoint and master's timestamp */ + long lat; /* estimate of itc adjustment latency */ + } t[NUM_ROUNDS]; +#endif + + go[MASTER] = 1; + + while (go[MASTER]) + rmb(); + + local_irq_save(flags); + { + for (i = 0; i < NUM_ROUNDS; i++) { + delta = get_delta(&rt, &master_time_stamp); + if (delta == 0) + done = 1; /* let's lock on to this... 
*/ + + if (!done) { + if (i > 0) { + adjust_latency += -delta; + adj = -delta + adjust_latency/4; + } else + adj = -delta; + + tick_ops->add_tick(adj); + } +#if DEBUG_TICK_SYNC + t[i].rt = rt; + t[i].master = master_time_stamp; + t[i].diff = delta; + t[i].lat = adjust_latency/4; +#endif + } + } + local_irq_restore(flags); + +#if DEBUG_TICK_SYNC + for (i = 0; i < NUM_ROUNDS; i++) + printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", + t[i].rt, t[i].master, t[i].diff, t[i].lat); +#endif + + printk(KERN_INFO "CPU %d: synchronized TICK with master CPU " + "(last diff %ld cycles, maxerr %lu cycles)\n", + smp_processor_id(), delta, rt); +} + +static void smp_start_sync_tick_client(int cpu); + +static void smp_synchronize_one_tick(int cpu) +{ + unsigned long flags, i; + + go[MASTER] = 0; + + smp_start_sync_tick_client(cpu); + + /* wait for client to be ready */ + while (!go[MASTER]) + rmb(); + + /* now let the client proceed into his loop */ + go[MASTER] = 0; + membar_safe("#StoreLoad"); + + raw_spin_lock_irqsave(&itc_sync_lock, flags); + { + for (i = 0; i < NUM_ROUNDS*NUM_ITERS; i++) { + while (!go[MASTER]) + rmb(); + go[MASTER] = 0; + wmb(); + go[SLAVE] = tick_ops->get_tick(); + membar_safe("#StoreLoad"); + } + } + raw_spin_unlock_irqrestore(&itc_sync_lock, flags); +} + +#if defined(CONFIG_SUN_LDOMS) && defined(CONFIG_HOTPLUG_CPU) +static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg, + void **descrp) +{ + extern unsigned long sparc64_ttable_tl0; + extern unsigned long kern_locked_tte_data; + struct hvtramp_descr *hdesc; + unsigned long trampoline_ra; + struct trap_per_cpu *tb; + u64 tte_vaddr, tte_data; + unsigned long hv_err; + int i; + + hdesc = kzalloc(sizeof(*hdesc) + + (sizeof(struct hvtramp_mapping) * + num_kernel_image_mappings - 1), + GFP_KERNEL); + if (!hdesc) { + printk(KERN_ERR "ldom_startcpu_cpuid: Cannot allocate " + "hvtramp_descr.\n"); + return; + } + *descrp = hdesc; + + hdesc->cpu = cpu; + hdesc->num_mappings = 
num_kernel_image_mappings; + + tb = &trap_block[cpu]; + + hdesc->fault_info_va = (unsigned long) &tb->fault_info; + hdesc->fault_info_pa = kimage_addr_to_ra(&tb->fault_info); + + hdesc->thread_reg = thread_reg; + + tte_vaddr = (unsigned long) KERNBASE; + tte_data = kern_locked_tte_data; + + for (i = 0; i < hdesc->num_mappings; i++) { + hdesc->maps[i].vaddr = tte_vaddr; + hdesc->maps[i].tte = tte_data; + tte_vaddr += 0x400000; + tte_data += 0x400000; + } + + trampoline_ra = kimage_addr_to_ra(hv_cpu_startup); + + hv_err = sun4v_cpu_start(cpu, trampoline_ra, + kimage_addr_to_ra(&sparc64_ttable_tl0), + __pa(hdesc)); + if (hv_err) + printk(KERN_ERR "ldom_startcpu_cpuid: sun4v_cpu_start() " + "gives error %lu\n", hv_err); +} +#endif + +extern unsigned long sparc64_cpu_startup; + +/* The OBP cpu startup callback truncates the 3rd arg cookie to + * 32-bits (I think) so to be safe we have it read the pointer + * contained here so we work on >4GB machines. -DaveM + */ +static struct thread_info *cpu_new_thread = NULL; + +static int smp_boot_one_cpu(unsigned int cpu, struct task_struct *idle) +{ + unsigned long entry = + (unsigned long)(&sparc64_cpu_startup); + unsigned long cookie = + (unsigned long)(&cpu_new_thread); + void *descr = NULL; + int timeout, ret; + + callin_flag = 0; + cpu_new_thread = task_thread_info(idle); + + if (tlb_type == hypervisor) { +#if defined(CONFIG_SUN_LDOMS) && defined(CONFIG_HOTPLUG_CPU) + if (ldom_domaining_enabled) + ldom_startcpu_cpuid(cpu, + (unsigned long) cpu_new_thread, + &descr); + else +#endif + prom_startcpu_cpuid(cpu, entry, cookie); + } else { + struct device_node *dp = of_find_node_by_cpuid(cpu); + + prom_startcpu(dp->phandle, entry, cookie); + } + + for (timeout = 0; timeout < 50000; timeout++) { + if (callin_flag) + break; + udelay(100); + } + + if (callin_flag) { + ret = 0; + } else { + printk("Processor %d is stuck.\n", cpu); + ret = -ENODEV; + } + cpu_new_thread = NULL; + + kfree(descr); + + return ret; +} + +static void 
spitfire_xcall_helper(u64 data0, u64 data1, u64 data2, u64 pstate, unsigned long cpu) +{ + u64 result, target; + int stuck, tmp; + + if (this_is_starfire) { + /* map to real upaid */ + cpu = (((cpu & 0x3c) << 1) | + ((cpu & 0x40) >> 4) | + (cpu & 0x3)); + } + + target = (cpu << 14) | 0x70; +again: + /* Ok, this is the real Spitfire Errata #54. + * One must read back from a UDB internal register + * after writes to the UDB interrupt dispatch, but + * before the membar Sync for that write. + * So we use the high UDB control register (ASI 0x7f, + * ADDR 0x20) for the dummy read. -DaveM + */ + tmp = 0x40; + __asm__ __volatile__( + "wrpr %1, %2, %%pstate\n\t" + "stxa %4, [%0] %3\n\t" + "stxa %5, [%0+%8] %3\n\t" + "add %0, %8, %0\n\t" + "stxa %6, [%0+%8] %3\n\t" + "membar #Sync\n\t" + "stxa %%g0, [%7] %3\n\t" + "membar #Sync\n\t" + "mov 0x20, %%g1\n\t" + "ldxa [%%g1] 0x7f, %%g0\n\t" + "membar #Sync" + : "=r" (tmp) + : "r" (pstate), "i" (PSTATE_IE), "i" (ASI_INTR_W), + "r" (data0), "r" (data1), "r" (data2), "r" (target), + "r" (0x10), "0" (tmp) + : "g1"); + + /* NOTE: PSTATE_IE is still clear. 
*/ + stuck = 100000; + do { + __asm__ __volatile__("ldxa [%%g0] %1, %0" + : "=r" (result) + : "i" (ASI_INTR_DISPATCH_STAT)); + if (result == 0) { + __asm__ __volatile__("wrpr %0, 0x0, %%pstate" + : : "r" (pstate)); + return; + } + stuck -= 1; + if (stuck == 0) + break; + } while (result & 0x1); + __asm__ __volatile__("wrpr %0, 0x0, %%pstate" + : : "r" (pstate)); + if (stuck == 0) { + printk("CPU[%d]: mondo stuckage result[%016llx]\n", + smp_processor_id(), result); + } else { + udelay(2); + goto again; + } +} + +static void spitfire_xcall_deliver(struct trap_per_cpu *tb, int cnt) +{ + u64 *mondo, data0, data1, data2; + u16 *cpu_list; + u64 pstate; + int i; + + __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate)); + cpu_list = __va(tb->cpu_list_pa); + mondo = __va(tb->cpu_mondo_block_pa); + data0 = mondo[0]; + data1 = mondo[1]; + data2 = mondo[2]; + for (i = 0; i < cnt; i++) + spitfire_xcall_helper(data0, data1, data2, pstate, cpu_list[i]); +} + +/* Cheetah now allows to send the whole 64-bytes of data in the interrupt + * packet, but we have no use for that. However we do take advantage of + * the new pipelining feature (ie. dispatch to multiple cpus simultaneously). + */ +static void cheetah_xcall_deliver(struct trap_per_cpu *tb, int cnt) +{ + int nack_busy_id, is_jbus, need_more; + u64 *mondo, pstate, ver, busy_mask; + u16 *cpu_list; + + cpu_list = __va(tb->cpu_list_pa); + mondo = __va(tb->cpu_mondo_block_pa); + + /* Unfortunately, someone at Sun had the brilliant idea to make the + * busy/nack fields hard-coded by ITID number for this Ultra-III + * derivative processor. + */ + __asm__ ("rdpr %%ver, %0" : "=r" (ver)); + is_jbus = ((ver >> 32) == __JALAPENO_ID || + (ver >> 32) == __SERRANO_ID); + + __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate)); + +retry: + need_more = 0; + __asm__ __volatile__("wrpr %0, %1, %%pstate\n\t" + : : "r" (pstate), "i" (PSTATE_IE)); + + /* Setup the dispatch data registers. 
*/ + __asm__ __volatile__("stxa %0, [%3] %6\n\t" + "stxa %1, [%4] %6\n\t" + "stxa %2, [%5] %6\n\t" + "membar #Sync\n\t" + : /* no outputs */ + : "r" (mondo[0]), "r" (mondo[1]), "r" (mondo[2]), + "r" (0x40), "r" (0x50), "r" (0x60), + "i" (ASI_INTR_W)); + + nack_busy_id = 0; + busy_mask = 0; + { + int i; + + for (i = 0; i < cnt; i++) { + u64 target, nr; + + nr = cpu_list[i]; + if (nr == 0xffff) + continue; + + target = (nr << 14) | 0x70; + if (is_jbus) { + busy_mask |= (0x1UL << (nr * 2)); + } else { + target |= (nack_busy_id << 24); + busy_mask |= (0x1UL << + (nack_busy_id * 2)); + } + __asm__ __volatile__( + "stxa %%g0, [%0] %1\n\t" + "membar #Sync\n\t" + : /* no outputs */ + : "r" (target), "i" (ASI_INTR_W)); + nack_busy_id++; + if (nack_busy_id == 32) { + need_more = 1; + break; + } + } + } + + /* Now, poll for completion. */ + { + u64 dispatch_stat, nack_mask; + long stuck; + + stuck = 100000 * nack_busy_id; + nack_mask = busy_mask << 1; + do { + __asm__ __volatile__("ldxa [%%g0] %1, %0" + : "=r" (dispatch_stat) + : "i" (ASI_INTR_DISPATCH_STAT)); + if (!(dispatch_stat & (busy_mask | nack_mask))) { + __asm__ __volatile__("wrpr %0, 0x0, %%pstate" + : : "r" (pstate)); + if (unlikely(need_more)) { + int i, this_cnt = 0; + for (i = 0; i < cnt; i++) { + if (cpu_list[i] == 0xffff) + continue; + cpu_list[i] = 0xffff; + this_cnt++; + if (this_cnt == 32) + break; + } + goto retry; + } + return; + } + if (!--stuck) + break; + } while (dispatch_stat & busy_mask); + + __asm__ __volatile__("wrpr %0, 0x0, %%pstate" + : : "r" (pstate)); + + if (dispatch_stat & busy_mask) { + /* Busy bits will not clear, continue instead + * of freezing up on this cpu. + */ + printk("CPU[%d]: mondo stuckage result[%016llx]\n", + smp_processor_id(), dispatch_stat); + } else { + int i, this_busy_nack = 0; + + /* Delay some random time with interrupts enabled + * to prevent deadlock. + */ + udelay(2 * nack_busy_id); + + /* Clear out the mask bits for cpus which did not + * NACK us. 
+ */ + for (i = 0; i < cnt; i++) { + u64 check_mask, nr; + + nr = cpu_list[i]; + if (nr == 0xffff) + continue; + + if (is_jbus) + check_mask = (0x2UL << (2*nr)); + else + check_mask = (0x2UL << + this_busy_nack); + if ((dispatch_stat & check_mask) == 0) + cpu_list[i] = 0xffff; + this_busy_nack += 2; + if (this_busy_nack == 64) + break; + } + + goto retry; + } + } +} + +#define CPU_MONDO_COUNTER(cpuid) (cpu_mondo_counter[cpuid]) +#define MONDO_USEC_WAIT_MIN 2 +#define MONDO_USEC_WAIT_MAX 100 +#define MONDO_RETRY_LIMIT 500000 + +/* Multi-cpu list version. + * + * Deliver xcalls to 'cnt' number of cpus in 'cpu_list'. + * Sometimes not all cpus receive the mondo, requiring us to re-send + * the mondo until all cpus have received, or cpus are truly stuck + * unable to receive mondo, and we timeout. + * Occasionally a target cpu strand is borrowed briefly by hypervisor to + * perform guest service, such as PCIe error handling. Consider the + * service time, 1 second overall wait is reasonable for 1 cpu. + * Here two in-between mondo check wait time are defined: 2 usec for + * single cpu quick turn around and up to 100usec for large cpu count. + * Deliver mondo to large number of cpus could take longer, we adjusts + * the retry count as long as target cpus are making forward progress. 
+ */ +static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt) +{ + int this_cpu, tot_cpus, prev_sent, i, rem; + int usec_wait, retries, tot_retries; + u16 first_cpu = 0xffff; + unsigned long xc_rcvd = 0; + unsigned long status; + int ecpuerror_id = 0; + int enocpu_id = 0; + u16 *cpu_list; + u16 cpu; + + this_cpu = smp_processor_id(); + cpu_list = __va(tb->cpu_list_pa); + usec_wait = cnt * MONDO_USEC_WAIT_MIN; + if (usec_wait > MONDO_USEC_WAIT_MAX) + usec_wait = MONDO_USEC_WAIT_MAX; + retries = tot_retries = 0; + tot_cpus = cnt; + prev_sent = 0; + + do { + int n_sent, mondo_delivered, target_cpu_busy; + + status = sun4v_cpu_mondo_send(cnt, + tb->cpu_list_pa, + tb->cpu_mondo_block_pa); + + /* HV_EOK means all cpus received the xcall, we're done. */ + if (likely(status == HV_EOK)) + goto xcall_done; + + /* If not these non-fatal errors, panic */ + if (unlikely((status != HV_EWOULDBLOCK) && + (status != HV_ECPUERROR) && + (status != HV_ENOCPU))) + goto fatal_errors; + + /* First, see if we made any forward progress. + * + * Go through the cpu_list, count the target cpus that have + * received our mondo (n_sent), and those that did not (rem). + * Re-pack cpu_list with the cpus remain to be retried in the + * front - this simplifies tracking the truly stalled cpus. + * + * The hypervisor indicates successful sends by setting + * cpu list entries to the value 0xffff. + * + * EWOULDBLOCK means some target cpus did not receive the + * mondo and retry usually helps. + * + * ECPUERROR means at least one target cpu is in error state, + * it's usually safe to skip the faulty cpu and retry. + * + * ENOCPU means one of the target cpu doesn't belong to the + * domain, perhaps offlined which is unexpected, but not + * fatal and it's okay to skip the offlined cpu. 
+ */ + rem = 0; + n_sent = 0; + for (i = 0; i < cnt; i++) { + cpu = cpu_list[i]; + if (likely(cpu == 0xffff)) { + n_sent++; + } else if ((status == HV_ECPUERROR) && + (sun4v_cpu_state(cpu) == HV_CPU_STATE_ERROR)) { + ecpuerror_id = cpu + 1; + } else if (status == HV_ENOCPU && !cpu_online(cpu)) { + enocpu_id = cpu + 1; + } else { + cpu_list[rem++] = cpu; + } + } + + /* No cpu remained, we're done. */ + if (rem == 0) + break; + + /* Otherwise, update the cpu count for retry. */ + cnt = rem; + + /* Record the overall number of mondos received by the + * first of the remaining cpus. + */ + if (first_cpu != cpu_list[0]) { + first_cpu = cpu_list[0]; + xc_rcvd = CPU_MONDO_COUNTER(first_cpu); + } + + /* Was any mondo delivered successfully? */ + mondo_delivered = (n_sent > prev_sent); + prev_sent = n_sent; + + /* or, was any target cpu busy processing other mondos? */ + target_cpu_busy = (xc_rcvd < CPU_MONDO_COUNTER(first_cpu)); + xc_rcvd = CPU_MONDO_COUNTER(first_cpu); + + /* Retry count is for no progress. If we're making progress, + * reset the retry count. + */ + if (likely(mondo_delivered || target_cpu_busy)) { + tot_retries += retries; + retries = 0; + } else if (unlikely(retries > MONDO_RETRY_LIMIT)) { + goto fatal_mondo_timeout; + } + + /* Delay a little bit to let other cpus catch up on + * their cpu mondo queue work. 
+ */ + if (!mondo_delivered) + udelay(usec_wait); + + retries++; + } while (1); + +xcall_done: + if (unlikely(ecpuerror_id > 0)) { + pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) was in error state\n", + this_cpu, ecpuerror_id - 1); + } else if (unlikely(enocpu_id > 0)) { + pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) does not belong to the domain\n", + this_cpu, enocpu_id - 1); + } + return; + +fatal_errors: + /* fatal errors include bad alignment, etc */ + pr_crit("CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) mondo_block_pa(%lx)\n", + this_cpu, tot_cpus, tb->cpu_list_pa, tb->cpu_mondo_block_pa); + panic("Unexpected SUN4V mondo error %lu\n", status); + +fatal_mondo_timeout: + /* some cpus being non-responsive to the cpu mondo */ + pr_crit("CPU[%d]: SUN4V mondo timeout, cpu(%d) made no forward progress after %d retries. Total target cpus(%d).\n", + this_cpu, first_cpu, (tot_retries + retries), tot_cpus); + panic("SUN4V mondo timeout panic\n"); +} + +static void (*xcall_deliver_impl)(struct trap_per_cpu *, int); + +static void xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask) +{ + struct trap_per_cpu *tb; + int this_cpu, i, cnt; + unsigned long flags; + u16 *cpu_list; + u64 *mondo; + + /* We have to do this whole thing with interrupts fully disabled. + * Otherwise if we send an xcall from interrupt context it will + * corrupt both our mondo block and cpu list state. + * + * One consequence of this is that we cannot use timeout mechanisms + * that depend upon interrupts being delivered locally. So, for + * example, we cannot sample jiffies and expect it to advance. + * + * Fortunately, udelay() uses %stick/%tick so we can use that. + */ + local_irq_save(flags); + + this_cpu = smp_processor_id(); + tb = &trap_block[this_cpu]; + + mondo = __va(tb->cpu_mondo_block_pa); + mondo[0] = data0; + mondo[1] = data1; + mondo[2] = data2; + wmb(); + + cpu_list = __va(tb->cpu_list_pa); + + /* Setup the initial cpu list. 
*/ + cnt = 0; + for_each_cpu(i, mask) { + if (i == this_cpu || !cpu_online(i)) + continue; + cpu_list[cnt++] = i; + } + + if (cnt) + xcall_deliver_impl(tb, cnt); + + local_irq_restore(flags); +} + +/* Send cross call to all processors mentioned in MASK_P + * except self. Really, there are only two cases currently, + * "cpu_online_mask" and "mm_cpumask(mm)". + */ +static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 data2, const cpumask_t *mask) +{ + u64 data0 = (((u64)ctx)<<32 | (((u64)func) & 0xffffffff)); + + xcall_deliver(data0, data1, data2, mask); +} + +/* Send cross call to all processors except self. */ +static void smp_cross_call(unsigned long *func, u32 ctx, u64 data1, u64 data2) +{ + smp_cross_call_masked(func, ctx, data1, data2, cpu_online_mask); +} + +extern unsigned long xcall_sync_tick; + +static void smp_start_sync_tick_client(int cpu) +{ + xcall_deliver((u64) &xcall_sync_tick, 0, 0, + cpumask_of(cpu)); +} + +extern unsigned long xcall_call_function; + +void arch_send_call_function_ipi_mask(const struct cpumask *mask) +{ + xcall_deliver((u64) &xcall_call_function, 0, 0, mask); +} + +extern unsigned long xcall_call_function_single; + +void arch_send_call_function_single_ipi(int cpu) +{ + xcall_deliver((u64) &xcall_call_function_single, 0, 0, + cpumask_of(cpu)); +} + +void __irq_entry smp_call_function_client(int irq, struct pt_regs *regs) +{ + clear_softint(1 << irq); + irq_enter(); + generic_smp_call_function_interrupt(); + irq_exit(); +} + +void __irq_entry smp_call_function_single_client(int irq, struct pt_regs *regs) +{ + clear_softint(1 << irq); + irq_enter(); + generic_smp_call_function_single_interrupt(); + irq_exit(); +} + +static void tsb_sync(void *info) +{ + struct trap_per_cpu *tp = &trap_block[raw_smp_processor_id()]; + struct mm_struct *mm = info; + + /* It is not valid to test "current->active_mm == mm" here. + * + * The value of "current" is not changed atomically with + * switch_mm(). 
But that's OK, we just need to check the + * current cpu's trap block PGD physical address. + */ + if (tp->pgd_paddr == __pa(mm->pgd)) + tsb_context_switch(mm); +} + +void smp_tsb_sync(struct mm_struct *mm) +{ + smp_call_function_many(mm_cpumask(mm), tsb_sync, mm, 1); +} + +extern unsigned long xcall_flush_tlb_mm; +extern unsigned long xcall_flush_tlb_page; +extern unsigned long xcall_flush_tlb_kernel_range; +extern unsigned long xcall_fetch_glob_regs; +extern unsigned long xcall_fetch_glob_pmu; +extern unsigned long xcall_fetch_glob_pmu_n4; +extern unsigned long xcall_receive_signal; +extern unsigned long xcall_new_mmu_context_version; +#ifdef CONFIG_KGDB +extern unsigned long xcall_kgdb_capture; +#endif + +#ifdef DCACHE_ALIASING_POSSIBLE +extern unsigned long xcall_flush_dcache_page_cheetah; +#endif +extern unsigned long xcall_flush_dcache_page_spitfire; + +static inline void __local_flush_dcache_page(struct page *page) +{ +#ifdef DCACHE_ALIASING_POSSIBLE + __flush_dcache_page(page_address(page), + ((tlb_type == spitfire) && + page_mapping_file(page) != NULL)); +#else + if (page_mapping_file(page) != NULL && + tlb_type == spitfire) + __flush_icache_page(__pa(page_address(page))); +#endif +} + +void smp_flush_dcache_page_impl(struct page *page, int cpu) +{ + int this_cpu; + + if (tlb_type == hypervisor) + return; + +#ifdef CONFIG_DEBUG_DCFLUSH + atomic_inc(&dcpage_flushes); +#endif + + this_cpu = get_cpu(); + + if (cpu == this_cpu) { + __local_flush_dcache_page(page); + } else if (cpu_online(cpu)) { + void *pg_addr = page_address(page); + u64 data0 = 0; + + if (tlb_type == spitfire) { + data0 = ((u64)&xcall_flush_dcache_page_spitfire); + if (page_mapping_file(page) != NULL) + data0 |= ((u64)1 << 32); + } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { +#ifdef DCACHE_ALIASING_POSSIBLE + data0 = ((u64)&xcall_flush_dcache_page_cheetah); +#endif + } + if (data0) { + xcall_deliver(data0, __pa(pg_addr), + (u64) pg_addr, cpumask_of(cpu)); +#ifdef 
CONFIG_DEBUG_DCFLUSH + atomic_inc(&dcpage_flushes_xcall); +#endif + } + } + + put_cpu(); +} + +void flush_dcache_page_all(struct mm_struct *mm, struct page *page) +{ + void *pg_addr; + u64 data0; + + if (tlb_type == hypervisor) + return; + + preempt_disable(); + +#ifdef CONFIG_DEBUG_DCFLUSH + atomic_inc(&dcpage_flushes); +#endif + data0 = 0; + pg_addr = page_address(page); + if (tlb_type == spitfire) { + data0 = ((u64)&xcall_flush_dcache_page_spitfire); + if (page_mapping_file(page) != NULL) + data0 |= ((u64)1 << 32); + } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { +#ifdef DCACHE_ALIASING_POSSIBLE + data0 = ((u64)&xcall_flush_dcache_page_cheetah); +#endif + } + if (data0) { + xcall_deliver(data0, __pa(pg_addr), + (u64) pg_addr, cpu_online_mask); +#ifdef CONFIG_DEBUG_DCFLUSH + atomic_inc(&dcpage_flushes_xcall); +#endif + } + __local_flush_dcache_page(page); + + preempt_enable(); +} + +#ifdef CONFIG_KGDB +void kgdb_roundup_cpus(void) +{ + smp_cross_call(&xcall_kgdb_capture, 0, 0, 0); +} +#endif + +void smp_fetch_global_regs(void) +{ + smp_cross_call(&xcall_fetch_glob_regs, 0, 0, 0); +} + +void smp_fetch_global_pmu(void) +{ + if (tlb_type == hypervisor && + sun4v_chip_type >= SUN4V_CHIP_NIAGARA4) + smp_cross_call(&xcall_fetch_glob_pmu_n4, 0, 0, 0); + else + smp_cross_call(&xcall_fetch_glob_pmu, 0, 0, 0); +} + +/* We know that the window frames of the user have been flushed + * to the stack before we get here because all callers of us + * are flush_tlb_*() routines, and these run after flush_cache_*() + * which performs the flushw. + * + * mm->cpu_vm_mask is a bit mask of which cpus an address + * space has (potentially) executed on, this is the heuristic + * we use to limit cross calls. + */ + +/* This currently is only used by the hugetlb arch pre-fault + * hook on UltraSPARC-III+ and later when changing the pagesize + * bits of the context register for an address space. 
+ */ +void smp_flush_tlb_mm(struct mm_struct *mm) +{ + u32 ctx = CTX_HWBITS(mm->context); + + get_cpu(); + + smp_cross_call_masked(&xcall_flush_tlb_mm, + ctx, 0, 0, + mm_cpumask(mm)); + + __flush_tlb_mm(ctx, SECONDARY_CONTEXT); + + put_cpu(); +} + +struct tlb_pending_info { + unsigned long ctx; + unsigned long nr; + unsigned long *vaddrs; +}; + +static void tlb_pending_func(void *info) +{ + struct tlb_pending_info *t = info; + + __flush_tlb_pending(t->ctx, t->nr, t->vaddrs); +} + +void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long *vaddrs) +{ + u32 ctx = CTX_HWBITS(mm->context); + struct tlb_pending_info info; + + get_cpu(); + + info.ctx = ctx; + info.nr = nr; + info.vaddrs = vaddrs; + + smp_call_function_many(mm_cpumask(mm), tlb_pending_func, + &info, 1); + + __flush_tlb_pending(ctx, nr, vaddrs); + + put_cpu(); +} + +void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr) +{ + unsigned long context = CTX_HWBITS(mm->context); + + get_cpu(); + + smp_cross_call_masked(&xcall_flush_tlb_page, + context, vaddr, 0, + mm_cpumask(mm)); + + __flush_tlb_page(context, vaddr); + + put_cpu(); +} + +void smp_flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + start &= PAGE_MASK; + end = PAGE_ALIGN(end); + if (start != end) { + smp_cross_call(&xcall_flush_tlb_kernel_range, + 0, start, end); + + __flush_tlb_kernel_range(start, end); + } +} + +/* CPU capture. 
*/ +/* #define CAPTURE_DEBUG */ +extern unsigned long xcall_capture; + +static atomic_t smp_capture_depth = ATOMIC_INIT(0); +static atomic_t smp_capture_registry = ATOMIC_INIT(0); +static unsigned long penguins_are_doing_time; + +void smp_capture(void) +{ + int result = atomic_add_return(1, &smp_capture_depth); + + if (result == 1) { + int ncpus = num_online_cpus(); + +#ifdef CAPTURE_DEBUG + printk("CPU[%d]: Sending penguins to jail...", + smp_processor_id()); +#endif + penguins_are_doing_time = 1; + atomic_inc(&smp_capture_registry); + smp_cross_call(&xcall_capture, 0, 0, 0); + while (atomic_read(&smp_capture_registry) != ncpus) + rmb(); +#ifdef CAPTURE_DEBUG + printk("done\n"); +#endif + } +} + +void smp_release(void) +{ + if (atomic_dec_and_test(&smp_capture_depth)) { +#ifdef CAPTURE_DEBUG + printk("CPU[%d]: Giving pardon to " + "imprisoned penguins\n", + smp_processor_id()); +#endif + penguins_are_doing_time = 0; + membar_safe("#StoreLoad"); + atomic_dec(&smp_capture_registry); + } +} + +/* Imprisoned penguins run with %pil == PIL_NORMAL_MAX, but PSTATE_IE + * set, so they can service tlb flush xcalls... 
+ */ +extern void prom_world(int); + /* Softint handler. Clears softint bit (1 << irq), then with preemption disabled: executes flushw, calls prom_world(1), increments smp_capture_registry and spins (rmb() on every pass) until penguins_are_doing_time reads zero; afterwards it decrements the registry and calls prom_world(0). @regs is not used. */ +void __irq_entry smp_penguin_jailcell(int irq, struct pt_regs *regs) +{ + clear_softint(1 << irq); + + preempt_disable(); + + __asm__ __volatile__("flushw"); + prom_world(1); + atomic_inc(&smp_capture_registry); + membar_safe("#StoreLoad"); + while (penguins_are_doing_time) + rmb(); + atomic_dec(&smp_capture_registry); + prom_world(0); + + preempt_enable(); +} + /* Empty: no work is done here and @max_cpus is ignored. */ +void __init smp_prepare_cpus(unsigned int max_cpus) +{ +} + /* Empty: nothing is done here. */ +void smp_prepare_boot_cpu(void) +{ +} + /* Selects the routine stored in xcall_deliver_impl from tlb_type: spitfire, cheetah or cheetah_plus, and the hypervisor variant for every other value. */ +void __init smp_setup_processor_id(void) +{ + if (tlb_type == spitfire) + xcall_deliver_impl = spitfire_xcall_deliver; + else if (tlb_type == cheetah || tlb_type == cheetah_plus) + xcall_deliver_impl = cheetah_xcall_deliver; + else + xcall_deliver_impl = hypervisor_xcall_deliver; +} + /* Marks cpu ids 0 .. min(num_possible_cpus(), nr_cpu_ids) - 1 as possible and every remaining id below NR_CPUS as not possible. The count is sampled once, before any bit is changed. */ +void __init smp_fill_in_cpu_possible_map(void) +{ + int possible_cpus = num_possible_cpus(); + int i; + + if (possible_cpus > nr_cpu_ids) + possible_cpus = nr_cpu_ids; + + for (i = 0; i < possible_cpus; i++) + set_cpu_possible(i, true); + for (; i < NR_CPUS; i++) + set_cpu_possible(i, false); +} + /* Rebuilds the topology masks of every present cpu in three passes over the present cpus (core, then cache/socket, then thread siblings). */ +void smp_fill_in_sib_core_maps(void) +{ + unsigned int i; + /* Pass 1: cpu_core_map[i] is cleared, then filled with every present cpu that has the same core_id. A core_id of 0 short-circuits to a mask holding only cpu i. */ + for_each_present_cpu(i) { + unsigned int j; + + cpumask_clear(&cpu_core_map[i]); + if (cpu_data(i).core_id == 0) { + cpumask_set_cpu(i, &cpu_core_map[i]); + continue; + } + + for_each_present_cpu(j) { + if (cpu_data(i).core_id == + cpu_data(j).core_id) + cpumask_set_cpu(j, &cpu_core_map[i]); + } + } + /* Pass 2: sets bits in cpu_core_sib_cache_map[i] (same max_cache_id) and cpu_core_sib_map[i] (same sock_id). Unlike the other passes, these masks are not cleared first in this function. */ + for_each_present_cpu(i) { + unsigned int j; + + for_each_present_cpu(j) { + if (cpu_data(i).max_cache_id == + cpu_data(j).max_cache_id) + cpumask_set_cpu(j, &cpu_core_sib_cache_map[i]); + + if (cpu_data(i).sock_id == cpu_data(j).sock_id) + cpumask_set_cpu(j, &cpu_core_sib_map[i]); + } + } + /* Pass 3: the per-cpu cpu_sibling_map is cleared, then filled with every present cpu that has the same proc_id. A proc_id of -1 short-circuits to a mask holding only cpu i. */ + for_each_present_cpu(i) { + unsigned int j; + + cpumask_clear(&per_cpu(cpu_sibling_map, i)); + if (cpu_data(i).proc_id == -1) { + cpumask_set_cpu(i, &per_cpu(cpu_sibling_map, i)); + continue; + } + + for_each_present_cpu(j) { + if
(cpu_data(i).proc_id == + cpu_data(j).proc_id) + cpumask_set_cpu(j, &per_cpu(cpu_sibling_map, i)); + } + } +} + /* Boots @cpu through smp_boot_one_cpu(cpu, tidle). On success the cpu is added to smp_commenced_mask and the caller spins, with no timeout and an mb() per pass, until cpu_online(cpu) is true; unless tlb_type is hypervisor it then calls smp_synchronize_one_tick(cpu). Returns the smp_boot_one_cpu() result. NOTE(review): the -ENODEV branch follows a loop that only exits once the cpu is online, so it looks unreachable as written - confirm. */ +int __cpu_up(unsigned int cpu, struct task_struct *tidle) +{ + int ret = smp_boot_one_cpu(cpu, tidle); + + if (!ret) { + cpumask_set_cpu(cpu, &smp_commenced_mask); + while (!cpu_online(cpu)) + mb(); + if (!cpu_online(cpu)) { + ret = -ENODEV; + } else { + /* On SUN4V, writes to %tick and %stick are + * not allowed. + */ + if (tlb_type != hypervisor) + smp_synchronize_one_tick(cpu); + } + } + return ret; +} + /* cpu_play_dead() (CONFIG_HOTPLUG_CPU only) acts on the cpu returned by smp_processor_id(). After idle_task_exit(), when tlb_type is hypervisor it calls sun4v_cpu_qconf() with a final argument of 0 for the cpu mondo, device mondo, resumable error and non-resumable error queues (presumably unconfiguring them - confirm against the hypervisor API). It then removes the cpu from smp_commenced_mask, the mask __cpu_die() polls, disables local interrupts, rewrites %pstate with rdpr/wrpr using PSTATE_IE (presumably turning the IE bit off - confirm against the SPARC V9 wrpr semantics) and spins forever. */ +#ifdef CONFIG_HOTPLUG_CPU +void cpu_play_dead(void) +{ + int cpu = smp_processor_id(); + unsigned long pstate; + + idle_task_exit(); + + if (tlb_type == hypervisor) { + struct trap_per_cpu *tb = &trap_block[cpu]; + + sun4v_cpu_qconf(HV_CPU_QUEUE_CPU_MONDO, + tb->cpu_mondo_pa, 0); + sun4v_cpu_qconf(HV_CPU_QUEUE_DEVICE_MONDO, + tb->dev_mondo_pa, 0); + sun4v_cpu_qconf(HV_CPU_QUEUE_RES_ERROR, + tb->resum_mondo_pa, 0); + sun4v_cpu_qconf(HV_CPU_QUEUE_NONRES_ERROR, + tb->nonresum_mondo_pa, 0); + } + + cpumask_clear_cpu(cpu, &smp_commenced_mask); + membar_safe("#Sync"); + + local_irq_disable(); + + __asm__ __volatile__( + "rdpr %%pstate, %0\n\t" + "wrpr %0, %1, %%pstate" + : "=r" (pstate) + : "i" (PSTATE_IE)); + + while (1) + barrier(); +} + /* Acts on the cpu returned by smp_processor_id(). Removes it from the core and sibling masks of every cpu listed in its own masks, clears its own masks, and resets core_id to 0 and proc_id to -1 (the values smp_fill_in_sib_core_maps() maps to a self-only mask). After smp_wmb() it calls fixup_irqs(), opens a window of mdelay(1) with local interrupts enabled (presumably to let pending interrupts drain - confirm), marks the cpu offline and calls cpu_map_rebuild(). Always returns 0. */ +int __cpu_disable(void) +{ + int cpu = smp_processor_id(); + cpuinfo_sparc *c; + int i; + + for_each_cpu(i, &cpu_core_map[cpu]) + cpumask_clear_cpu(cpu, &cpu_core_map[i]); + cpumask_clear(&cpu_core_map[cpu]); + + for_each_cpu(i, &per_cpu(cpu_sibling_map, cpu)) + cpumask_clear_cpu(cpu, &per_cpu(cpu_sibling_map, i)); + cpumask_clear(&per_cpu(cpu_sibling_map, cpu)); + + c = &cpu_data(cpu); + + c->core_id = 0; + c->proc_id = -1; + + smp_wmb(); + + /* Make sure no interrupts point to this cpu.
*/ + fixup_irqs(); + + local_irq_enable(); + mdelay(1); + local_irq_disable(); + + set_cpu_online(cpu, false); + + cpu_map_rebuild(); + + return 0; +} + /* Polls up to 100 times, sleeping msleep(100) between polls, for @cpu to disappear from smp_commenced_mask (cpu_play_dead() clears that bit). If the bit is still set an error is logged. Otherwise, when CONFIG_SUN_LDOMS is defined, sun4v_cpu_stop(cpu) is attempted up to 100 times; the first HV_EOK marks the cpu not present, and if every attempt fails the last error code is logged. */ +void __cpu_die(unsigned int cpu) +{ + int i; + + for (i = 0; i < 100; i++) { + smp_rmb(); + if (!cpumask_test_cpu(cpu, &smp_commenced_mask)) + break; + msleep(100); + } + if (cpumask_test_cpu(cpu, &smp_commenced_mask)) { + printk(KERN_ERR "CPU %u didn't die...\n", cpu); + } else { +#if defined(CONFIG_SUN_LDOMS) + unsigned long hv_err; + int limit = 100; + + do { + hv_err = sun4v_cpu_stop(cpu); + if (hv_err == HV_EOK) { + set_cpu_present(cpu, false); + break; + } + } while (--limit > 0); + if (limit <= 0) { + printk(KERN_ERR "sun4v_cpu_stop() fails err=%lu\n", + hv_err); + } +#endif + } +} +#endif /* CONFIG_HOTPLUG_CPU */ + /* Empty: no work is done here and @max_cpus is ignored. */ +void __init smp_cpus_done(unsigned int max_cpus) +{ +} + /* Delivers the xcall_receive_signal cross call to @cpu alone via xcall_deliver(), passing 0 for both data arguments. */ +static void send_cpu_ipi(int cpu) +{ + xcall_deliver((u64) &xcall_receive_signal, + 0, 0, cpumask_of(cpu)); +} + /* Does nothing unless cpu_poke is set and this cpu's per-cpu poke flag is set; in that case it clears the flag and sets the PIL_SMP_RECEIVE_SIGNAL softint bit on the local cpu. */ +void scheduler_poke(void) +{ + if (!cpu_poke) + return; + + if (!__this_cpu_read(poke)) + return; + + __this_cpu_write(poke, false); + set_softint(1 << PIL_SMP_RECEIVE_SIGNAL); +} + /* Sets @cpu's per-cpu poke flag before calling sun4v_cpu_poke(cpu). If the call does not return HV_EOK the flag is cleared again and a rate-limited error is logged. Returns the sun4v_cpu_poke() status unchanged. */ +static unsigned long send_cpu_poke(int cpu) +{ + unsigned long hv_err; + + per_cpu(poke, cpu) = true; + hv_err = sun4v_cpu_poke(cpu); + if (hv_err != HV_EOK) { + per_cpu(poke, cpu) = false; + pr_err_ratelimited("%s: sun4v_cpu_poke() fails err=%lu\n", + __func__, hv_err); + } + + return hv_err; +} + /* Requests a reschedule on @cpu. For the local cpu the PIL_SMP_RECEIVE_SIGNAL softint is set directly (warning once if the caller is preemptible). For any other cpu, send_cpu_poke() is tried when cpu_poke is set and idle_cpu(cpu) is true; in all remaining cases send_cpu_ipi() is used. */ +void smp_send_reschedule(int cpu) +{ + if (cpu == smp_processor_id()) { + WARN_ON_ONCE(preemptible()); + set_softint(1 << PIL_SMP_RECEIVE_SIGNAL); + return; + } + + /* Use cpu poke to resume idle cpu if supported.
*/ + if (cpu_poke && idle_cpu(cpu)) { + unsigned long ret; + + ret = send_cpu_poke(cpu); + if (ret == HV_EOK) + return; + } + + /* Use IPI in following cases: + * - cpu poke not supported + * - cpu not idle + * - send_cpu_poke() returns with error + */ + send_cpu_ipi(cpu); +} + /* Sets cpu_poke to true only when tlb_type is hypervisor and sun4v_hvapi_get(HV_GRP_CORE, ...) succeeds with major == 1 and minor >= 6. In every other case cpu_poke is left untouched; the two hypervisor-side failure paths emit a pr_debug message. */ +void smp_init_cpu_poke(void) +{ + unsigned long major; + unsigned long minor; + int ret; + + if (tlb_type != hypervisor) + return; + + ret = sun4v_hvapi_get(HV_GRP_CORE, &major, &minor); + if (ret) { + pr_debug("HV_GRP_CORE is not registered\n"); + return; + } + + if (major == 1 && minor >= 6) { + /* CPU POKE is registered. */ + cpu_poke = true; + return; + } + + pr_debug("CPU_POKE not supported\n"); +} + /* Softint handler: clears softint bit (1 << irq) and calls scheduler_ipi(). @regs is not used. */ +void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) +{ + clear_softint(1 << irq); + scheduler_ipi(); +} + /* Callback handed to smp_call_function() by smp_send_stop(): marks the executing cpu offline and calls prom_stopself(). @dummy is not used. */ +static void stop_this_cpu(void *dummy) +{ + set_cpu_online(smp_processor_id(), false); + prom_stopself(); +} + /* Stops the other cpus. When tlb_type is hypervisor: with CONFIG_SERIAL_SUNHV it first calls sunhv_migrate_hvcons_irq(this_cpu); then every other online cpu is marked offline and stopped, through sun4v_cpu_stop() when CONFIG_SUN_LDOMS is defined and ldom_domaining_enabled is set (logging a non-zero status), and through prom_stopcpu_cpuid() otherwise. For any other tlb_type, stop_this_cpu() is dispatched with smp_call_function() and a final argument of 0. */ +void smp_send_stop(void) +{ + int cpu; + + if (tlb_type == hypervisor) { + int this_cpu = smp_processor_id(); +#ifdef CONFIG_SERIAL_SUNHV + sunhv_migrate_hvcons_irq(this_cpu); +#endif + for_each_online_cpu(cpu) { + if (cpu == this_cpu) + continue; + + set_cpu_online(cpu, false); +#ifdef CONFIG_SUN_LDOMS + if (ldom_domaining_enabled) { + unsigned long hv_err; + hv_err = sun4v_cpu_stop(cpu); + if (hv_err) + printk(KERN_ERR "sun4v_cpu_stop() " + "failed err=%lu\n", hv_err); + } else +#endif + prom_stopcpu_cpuid(cpu); + } + } else + smp_call_function(stop_this_cpu, NULL, 0); +} + /* Distance callback passed to pcpu_embed_first_chunk(): LOCAL_DISTANCE when cpu_to_node() agrees for @from and @to, REMOTE_DISTANCE otherwise. */ +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) +{ + if (cpu_to_node(from) == cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; +} + /* Wrapper returning cpu_to_node(cpu); passed as a callback to both first-chunk setup calls in setup_per_cpu_areas(). */ +static int __init pcpu_cpu_to_node(int cpu) +{ + return cpu_to_node(cpu); +} + /* Sets up the percpu first chunk and the per-cpu offsets. Unless pcpu_chosen_fc is PCPU_FC_PAGE it tries pcpu_embed_first_chunk() (with 4 << 20 as its third argument) and warns on a non-zero result; a negative rc then falls back to pcpu_page_first_chunk(), and a negative result from that panics. Afterwards __per_cpu_offset() is computed for every possible cpu, the boot cpu's offset is stored in __local_per_cpu_offset, and cpu data is filled in by of_fill_in_cpu_data() plus, when tlb_type is hypervisor, mdesc_fill_in_cpu_data(cpu_all_mask). NOTE(review): the warning tests rc != 0 while the fallback tests rc < 0; this presumes pcpu_embed_first_chunk() never returns a positive value - confirm. */ +void __init setup_per_cpu_areas(void) +{ + unsigned long delta; + unsigned int cpu; + int rc = -EINVAL; /* stays negative when the embed attempt is skipped, so the page fallback below runs */ + + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, +
PERCPU_DYNAMIC_RESERVE, 4 << 20, + pcpu_cpu_distance, + pcpu_cpu_to_node); + if (rc) + pr_warn("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); + } + if (rc < 0) + rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, + pcpu_cpu_to_node); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); + /* delta is the distance from the linked percpu section start to the allocated base; each cpu adds its own unit offset. */ + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; + + /* Setup %g5 for the boot cpu. */ + __local_per_cpu_offset = __per_cpu_offset(smp_processor_id()); + + of_fill_in_cpu_data(); + if (tlb_type == hypervisor) + mdesc_fill_in_cpu_data(cpu_all_mask); +} |