// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
/*
 * Code to manage and manipulate CPUs
 *
 * Copyright 2013-2019 IBM Corp.
 */

#include <skiboot.h>
#include <cpu.h>
#include <device.h>
#include <mem_region.h>
#include <opal.h>
#include <stack.h>
#include <trace.h>
#include <affinity.h>
#include <chip.h>
#include <timebase.h>
#include <interrupts.h>
#include <ccan/str/str.h>
#include <ccan/container_of/container_of.h>
#include <xscom.h>

/* The cpu_threads array is static and indexed by PIR in
 * order to speed up lookup from asm entry points
 */
struct cpu_stack {
	union {
		uint8_t	stack[STACK_SIZE];
		struct cpu_thread cpu;
	};
} __align(STACK_SIZE);

static struct cpu_stack * const cpu_stacks = (struct cpu_stack *)CPU_STACKS_BASE;
static unsigned int cpu_threads_max;
unsigned int cpu_thread_count;
unsigned int cpu_max_pir;
struct cpu_thread *boot_cpu;
static struct lock reinit_lock = LOCK_UNLOCKED;
static bool radix_supported;
static unsigned long hid0_hile;
static unsigned long hid0_attn;
static unsigned long hid0_icache;
static bool reconfigure_idle = false;
static bool sreset_enabled;
static bool ipi_enabled;
static bool pm_enabled;
static bool current_hile_mode = HAVE_LITTLE_ENDIAN;
static bool current_radix_mode = true;
static bool tm_suspend_enabled;

unsigned long cpu_secondary_start __force_data = 0;

struct cpu_job {
	struct list_node	link;
	void			(*func)(void *data);
	void			*data;
	const char		*name;
	bool			complete;
	bool			no_return;
};

/* attribute const as cpu_stacks is constant. */
unsigned long __attrconst cpu_stack_bottom(unsigned int pir)
{
	return ((unsigned long)&cpu_stacks[pir]) +
		sizeof(struct cpu_thread) + STACK_SAFETY_GAP;
}

unsigned long __attrconst cpu_stack_top(unsigned int pir)
{
	/* This is the top of the normal stack. */
	return ((unsigned long)&cpu_stacks[pir]) +
		NORMAL_STACK_SIZE - STACK_TOP_GAP;
}

unsigned long __attrconst cpu_emergency_stack_top(unsigned int pir)
{
	/* This is the top of the emergency stack, above the normal stack. */
	return ((unsigned long)&cpu_stacks[pir]) +
		NORMAL_STACK_SIZE + EMERGENCY_STACK_SIZE - STACK_TOP_GAP;
}

void __nomcount cpu_relax(void)
{
	if ((mfspr(SPR_PPR32) >> 18) != 0x4) {
		printf("cpu_relax called when not at medium SMT priority: "
		       "PPR[PRI]=0x%lx\n", mfspr(SPR_PPR32) >> 18);
		backtrace();
	}

	/* Relax a bit to give sibling threads some breathing space */
	smt_lowest();
	asm volatile("nop; nop; nop; nop;\n"
		     "nop; nop; nop; nop;\n"
		     "nop; nop; nop; nop;\n"
		     "nop; nop; nop; nop;\n");
	smt_medium();
	barrier();
}
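
/*
 * Poke a CPU so it re-evaluates its wake conditions: an ICP (XICS) IPI
 * on P8, a hypervisor doorbell on P9 and later.
 */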
static void cpu_send_ipi(struct cpu_thread *cpu)
{
	if (proc_gen == proc_gen_p8) {
		/* Poke IPI */
		icp_kick_cpu(cpu);
	} else if (proc_gen == proc_gen_p9 || proc_gen == proc_gen_p10 ||
		   proc_gen == proc_gen_p11) {
		p9_dbell_send(cpu->pir);
	}
}

/*
 * If chip_id is >= 0, schedule the job on that node.
 * Otherwise schedule the job anywhere.
 */
static struct cpu_thread *cpu_find_job_target(int32_t chip_id)
{
	struct cpu_thread *cpu, *best, *me = this_cpu();
	uint32_t best_count;

	/* We try to find a target to run a job. We need to avoid
	 * a CPU that has a "no return" job on its queue as it might
	 * never be able to process anything.
	 *
	 * Additionally we don't check the list but the job count
	 * on the target CPUs, since that is decremented *after*
	 * a job has been completed.
	 */

	/* First we scan all available primary threads
	 */
	for_each_available_cpu(cpu) {
		if (chip_id >= 0 && cpu->chip_id != chip_id)
			continue;
		if (cpu == me || !cpu_is_thread0(cpu) || cpu->job_has_no_return)
			continue;
		if (cpu->job_count)
			continue;
		lock(&cpu->job_lock);
		if (!cpu->job_count)
			return cpu;
		unlock(&cpu->job_lock);
	}

	/* Now try again with secondary threads included and keep
	 * track of the one with the fewest jobs queued up. This is
	 * done in a racy way, but it's just an optimization in case
	 * we are overcommitted on jobs. We could also just pick
	 * a random one...
	 */
	best = NULL;
	best_count = -1u;
	for_each_available_cpu(cpu) {
		if (chip_id >= 0 && cpu->chip_id != chip_id)
			continue;
		if (cpu == me || cpu->job_has_no_return)
			continue;
		if (!best || cpu->job_count < best_count) {
			best = cpu;
			best_count = cpu->job_count;
		}
		if (cpu->job_count)
			continue;
		lock(&cpu->job_lock);
		if (!cpu->job_count)
			return cpu;
		unlock(&cpu->job_lock);
	}

	/* We haven't found anybody, do we have a bestie ? */
	if (best) {
		lock(&best->job_lock);
		return best;
	}

	/* Go away */
	return NULL;
}

/* job_lock is held, returns with it released */
static void queue_job_on_cpu(struct cpu_thread *cpu, struct cpu_job *job)
{
	/* That's bad, the job will never run */
	if (cpu->job_has_no_return) {
		prlog(PR_WARNING, "WARNING ! Job %s scheduled on CPU 0x%x"
		      " which has a no-return job on its queue !\n",
		      job->name, cpu->pir);
		backtrace();
	}
	list_add_tail(&cpu->job_queue, &job->link);
	if (job->no_return)
		cpu->job_has_no_return = true;
	else
		cpu->job_count++;
	unlock(&cpu->job_lock);

	/* Is it idle waiting for jobs? If so, must send an IPI. */
	sync();
	if (cpu->in_job_sleep)
		cpu_send_ipi(cpu);
}
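
/*
 * Allocate and queue a job. If no target CPU is given, pick one with
 * cpu_find_job_target(); if the job cannot be queued remotely (or the
 * target would be the calling CPU) it is run synchronously before
 * returning.
 */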
struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
				const char *name,
				void (*func)(void *data), void *data,
				bool no_return)
{
	struct cpu_job *job;

#ifdef DEBUG_SERIALIZE_CPU_JOBS
	if (cpu == NULL)
		cpu = this_cpu();
#endif

	if (cpu && !cpu_is_available(cpu)) {
		prerror("CPU: Tried to queue job on unavailable CPU 0x%04x\n",
			cpu->pir);
		return NULL;
	}

	job = zalloc(sizeof(struct cpu_job));
	if (!job)
		return NULL;
	job->func = func;
	job->data = data;
	job->name = name;
	job->complete = false;
	job->no_return = no_return;

	/* Pick a candidate. Returns with target queue locked */
	if (cpu == NULL)
		cpu = cpu_find_job_target(-1);
	else if (cpu != this_cpu())
		lock(&cpu->job_lock);
	else
		cpu = NULL;

	/* Can't be scheduled, run it now */
	if (cpu == NULL) {
		if (!this_cpu()->job_has_no_return)
			this_cpu()->job_has_no_return = no_return;
		func(data);
		job->complete = true;
		return job;
	}

	queue_job_on_cpu(cpu, job);

	return job;
}

struct cpu_job *cpu_queue_job_on_node(uint32_t chip_id,
				       const char *name,
				       void (*func)(void *data), void *data)
{
	struct cpu_thread *cpu;
	struct cpu_job *job;

	job = zalloc(sizeof(struct cpu_job));
	if (!job)
		return NULL;
	job->func = func;
	job->data = data;
	job->name = name;
	job->complete = false;
	job->no_return = false;

	/* Pick a candidate. Returns with target queue locked */
	cpu = cpu_find_job_target(chip_id);

	/* Can't be scheduled... */
	if (cpu == NULL) {
		cpu = this_cpu();
		if (cpu->chip_id == chip_id) {
			/* Run it now if we're the right node. */
			func(data);
			job->complete = true;
			return job;
		}
		/* Otherwise fail. */
		free(job);
		return NULL;
	}

	queue_job_on_cpu(cpu, job);

	return job;
}

bool cpu_poll_job(struct cpu_job *job)
{
	lwsync();
	return job->complete;
}

void cpu_wait_job(struct cpu_job *job, bool free_it)
{
	unsigned long time_waited = 0;

	if (!job)
		return;

	while (!job->complete) {
		/* This will call OPAL pollers for us */
		time_wait_ms(10);
		time_waited += 10;
		lwsync();
		if ((time_waited % 30000) == 0) {
			prlog(PR_INFO, "cpu_wait_job(%s) for %lums\n",
			      job->name, time_waited);
			backtrace();
		}
	}
	lwsync();

	if (time_waited > 1000)
		prlog(PR_DEBUG, "cpu_wait_job(%s) for %lums\n",
		      job->name, time_waited);

	if (free_it)
		free(job);
}

bool cpu_check_jobs(struct cpu_thread *cpu)
{
	return !list_empty_nocheck(&cpu->job_queue);
}
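
/*
 * Run the jobs queued on the calling CPU. A no-return job is freed
 * before its function is invoked; for normal jobs, job_count is
 * decremented and job->complete is set once the function returns.
 */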
void cpu_process_jobs(void)
{
	struct cpu_thread *cpu = this_cpu();
	struct cpu_job *job = NULL;
	void (*func)(void *);
	void *data;

	sync();
	if (!cpu_check_jobs(cpu))
		return;

	lock(&cpu->job_lock);
	while (true) {
		bool no_return;

		job = list_pop(&cpu->job_queue, struct cpu_job, link);
		if (!job)
			break;

		func = job->func;
		data = job->data;
		no_return = job->no_return;
		unlock(&cpu->job_lock);
		prlog(PR_TRACE, "running job %s on %x\n", job->name, cpu->pir);
		if (no_return)
			free(job);
		func(data);
		if (!list_empty(&cpu->locks_held)) {
			if (no_return)
				prlog(PR_ERR, "OPAL no-return job returned with"
				      " locks held!\n");
			else
				prlog(PR_ERR, "OPAL job %s returning with locks held\n",
				      job->name);
			drop_my_locks(true);
		}
		lock(&cpu->job_lock);
		if (!no_return) {
			cpu->job_count--;
			lwsync();
			job->complete = true;
		}
	}
	unlock(&cpu->job_lock);
}

enum cpu_wake_cause {
	cpu_wake_on_job,
	cpu_wake_on_dec,
};
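
/*
 * Processor-specific idle entry points. P8 uses nap with the wake cause
 * selected via the LPCR PECE bits; P9 and later use stop, either with
 * ESL/EC set (woken via sreset) or the lite variant when sreset support
 * is not available.
 */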
static unsigned int cpu_idle_p8(enum cpu_wake_cause wake_on)
{
	uint64_t lpcr = mfspr(SPR_LPCR) & ~SPR_LPCR_P8_PECE;
	unsigned int vec;

	/* Clean up ICP, be ready for IPIs */
	icp_prep_for_pm();

	/* Setup wakeup cause in LPCR: EE (for IPI) */
	lpcr |= SPR_LPCR_P8_PECE2;
	if (wake_on == cpu_wake_on_dec)
		lpcr |= SPR_LPCR_P8_PECE3; /* DEC */
	mtspr(SPR_LPCR, lpcr);
	isync();

	/* Enter nap */
	vec = enter_p8_pm_state(false);

	reset_cpu_icp();

	return vec;
}

static unsigned int cpu_idle_p9(enum cpu_wake_cause wake_on)
{
	uint64_t lpcr = mfspr(SPR_LPCR) & ~SPR_LPCR_P9_PECE;
	uint64_t psscr;
	unsigned int vec;

	lpcr |= SPR_LPCR_P9_PECEL1; /* HV DBELL for IPI */
	if (wake_on == cpu_wake_on_dec)
		lpcr |= SPR_LPCR_P9_PECEL3; /* DEC */
	mtspr(SPR_LPCR, lpcr);
	isync();

	if (sreset_enabled) {
		/* stop with EC=1 (sreset) and ESL=1 (enable thread switch). */
		/* PSSCR SD=0 ESL=1 EC=1 PSSL=0 TR=3 MTL=0 RL=1 */
		psscr = PPC_BIT(42) | PPC_BIT(43) |
			PPC_BITMASK(54, 55) | PPC_BIT(63);
		vec = enter_p9_pm_state(psscr);
	} else {
		/* stop with EC=0 (resumes) which does not require sreset. */
		/* PSSCR SD=0 ESL=0 EC=0 PSSL=0 TR=3 MTL=0 RL=1 */
		psscr = PPC_BITMASK(54, 55) | PPC_BIT(63);
		enter_p9_pm_lite_state(psscr);
		vec = 0;
	}

	/* Clear doorbell */
	p9_dbell_receive();

	return vec;
}

static void cpu_idle_pm(enum cpu_wake_cause wake_on)
{
	struct cpu_thread *cpu = this_cpu();
	unsigned int vec;

	if (!pm_enabled) {
		prlog_once(PR_DEBUG, "cpu_idle_pm called with PM disabled\n");
		return;
	}

	/*
	 * Mark ourselves in sleep so other CPUs know to send an IPI,
	 * then re-check the wake conditions. This is ordered against
	 * queue_job_on_cpu() and reconfigure_idle_start() which first
	 * set the wake conditions (either queue a job or set
	 * reconfigure_idle = true), issue a sync(), then test if the
	 * target is in_sleep / in_job_sleep.
	 */
	cpu->in_sleep = true;
	if (wake_on == cpu_wake_on_job)
		cpu->in_job_sleep = true;
	sync();
	if (reconfigure_idle)
		goto skip_sleep;
	if (wake_on == cpu_wake_on_job && cpu_check_jobs(cpu))
		goto skip_sleep;

	if (proc_gen == proc_gen_p8)
		vec = cpu_idle_p8(wake_on);
	else
		vec = cpu_idle_p9(wake_on);

	if (vec == 0x100) {
		unsigned long srr1 = mfspr(SPR_SRR1);

		switch (srr1 & SPR_SRR1_PM_WAKE_MASK) {
		case SPR_SRR1_PM_WAKE_SRESET:
			exception_entry_pm_sreset();
			break;
		default:
			break;
		}
		mtmsrd(MSR_RI, 1);

	} else if (vec == 0x200) {
		exception_entry_pm_mce();
		enable_machine_check();
		mtmsrd(MSR_RI, 1);
	}

skip_sleep:
	sync();
	cpu->in_sleep = false;
	if (wake_on == cpu_wake_on_job)
		cpu->in_job_sleep = false;
}
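
/*
 * idle_lock and nr_cpus_idle track how many CPUs are currently inside
 * the idle loop, so reconfigure_idle_start() can kick them out and wait
 * for them all to leave before the idle/PM configuration is changed.
 */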
static struct lock idle_lock = LOCK_UNLOCKED;
static int nr_cpus_idle = 0;

static void enter_idle(void)
{
	struct cpu_thread *cpu = this_cpu();

	assert(!cpu->in_idle);
	assert(!cpu->in_sleep);
	assert(!cpu->in_job_sleep);

	for (;;) {
		lock(&idle_lock);
		if (!reconfigure_idle) {
			nr_cpus_idle++;
			cpu->in_idle = true;
			break;
		}
		unlock(&idle_lock);

		/* Another CPU is reconfiguring idle */
		smt_lowest();
		while (reconfigure_idle)
			barrier();
		smt_medium();
	}

	unlock(&idle_lock);
}

static void exit_idle(void)
{
	struct cpu_thread *cpu = this_cpu();

	assert(cpu->in_idle);
	assert(!cpu->in_sleep);
	assert(!cpu->in_job_sleep);

	lock(&idle_lock);
	assert(nr_cpus_idle > 0);
	nr_cpus_idle--;
	cpu->in_idle = false;
	unlock(&idle_lock);
}

static void reconfigure_idle_start(void)
{
	struct cpu_thread *cpu;

	/*
	 * First, make sure we are exclusive in reconfiguring by taking
	 * reconfigure_idle from false to true.
	 */
	for (;;) {
		lock(&idle_lock);
		if (!reconfigure_idle) {
			reconfigure_idle = true;
			break;
		}
		unlock(&idle_lock);

		/* Someone else is reconfiguring */
		smt_lowest();
		while (reconfigure_idle)
			barrier();
		smt_medium();
	}

	unlock(&idle_lock);

	/*
	 * Then kick everyone out of idle.
	 */

	/*
	 * Order earlier store to reconfigure_idle=true vs load from
	 * cpu->in_sleep.
	 */
	sync();

	for_each_available_cpu(cpu) {
		if (cpu->in_sleep)
			cpu_send_ipi(cpu);
	}

	/*
	 * Then wait for all other CPUs to leave idle. Now they will see
	 * reconfigure_idle==true and not re-enter idle.
	 */
	smt_lowest();
	while (nr_cpus_idle != 0)
		barrier();
	smt_medium();

	/*
	 * Order load of nr_cpus_idle with later loads of data that other
	 * CPUs might have stored-to before coming out of idle.
	 */
	lwsync();
}

static void reconfigure_idle_end(void)
{
	assert(reconfigure_idle);
	lock(&idle_lock);
	reconfigure_idle = false;
	unlock(&idle_lock);
}
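
/*
 * Idle loop for a CPU with nothing to do: sleep via cpu_idle_pm() when
 * power management is enabled, otherwise spin at low SMT priority until
 * a job is queued or idle is being reconfigured.
 */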
void cpu_idle_job(void)
{
	struct cpu_thread *cpu = this_cpu();

	do {
		enter_idle();

		if (pm_enabled) {
			cpu_idle_pm(cpu_wake_on_job);
		} else {
			smt_lowest();
			for (;;) {
				if (cpu_check_jobs(cpu))
					break;
				if (reconfigure_idle)
					break;
				barrier();
			}
			smt_medium();
		}

		exit_idle();

	} while (!cpu_check_jobs(cpu));
}

void cpu_idle_delay(unsigned long delay)
{
	unsigned long now = mftb();
	unsigned long end = now + delay;
	unsigned long min_pm = usecs_to_tb(10);

	do {
		enter_idle();

		delay = end - now;

		if (pm_enabled && delay > min_pm) {
			if (delay >= 0x7fffffff)
				delay = 0x7fffffff;
			mtspr(SPR_DEC, delay);

			cpu_idle_pm(cpu_wake_on_dec);
		} else {
			smt_lowest();
			for (;;) {
				if (tb_compare(mftb(), end) == TB_AAFTERB)
					break;
				if (reconfigure_idle)
					break;
				barrier();
			}
			smt_medium();
		}

		exit_idle();

		now = mftb();

	} while (tb_compare(now, end) != TB_AAFTERB);
}
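
/*
 * PM states can only be used once an IPI mechanism is available to wake
 * sleeping threads; P8 nap additionally requires sreset support.
 */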
static void recalc_pm_enabled(void)
{
	if (chip_quirk(QUIRK_AWAN))
		return;

	if (proc_gen == proc_gen_p8)
		pm_enabled = ipi_enabled && sreset_enabled;
	else
		pm_enabled = ipi_enabled;
}

void cpu_set_sreset_enable(bool enabled)
{
	if (sreset_enabled == enabled)
		return;

	if (proc_gen == proc_gen_p8) {
		/* Public P8 Mambo has broken NAP */
		if (chip_quirk(QUIRK_MAMBO_CALLOUTS))
			return;
	}

	reconfigure_idle_start();

	sreset_enabled = enabled;

	recalc_pm_enabled();

	reconfigure_idle_end();
}

void cpu_set_ipi_enable(bool enabled)
{
	if (ipi_enabled == enabled)
		return;

	reconfigure_idle_start();

	ipi_enabled = enabled;

	recalc_pm_enabled();

	reconfigure_idle_end();
}

void cpu_process_local_jobs(void)
{
	struct cpu_thread *cpu = first_available_cpu();

	while (cpu) {
		if (cpu != this_cpu())
			return;

		cpu = next_available_cpu(cpu);
	}

	if (!cpu)
		cpu = first_available_cpu();

	/* No CPU to run on, just run synchro */
	if (cpu == this_cpu()) {
		prlog_once(PR_DEBUG, "Processing jobs synchronously\n");
		cpu_process_jobs();
		opal_run_pollers();
	}
}

struct dt_node *get_cpu_node(u32 pir)
{
	struct cpu_thread *t = find_cpu_by_pir(pir);

	return t ? t->node : NULL;
}

/* This only covers primary, active cpus */
struct cpu_thread *find_cpu_by_chip_id(u32 chip_id)
{
	struct cpu_thread *t;

	for_each_available_cpu(t) {
		if (t->is_secondary)
			continue;
		if (t->chip_id == chip_id)
			return t;
	}
	return NULL;
}

struct cpu_thread *find_cpu_by_node(struct dt_node *cpu)
{
	struct cpu_thread *t;

	for_each_available_cpu(t) {
		if (t->node == cpu)
			return t;
	}
	return NULL;
}

struct cpu_thread *find_cpu_by_pir(u32 pir)
{
	if (pir > cpu_max_pir)
		return NULL;
	return &cpu_stacks[pir].cpu;
}

struct cpu_thread __nomcount *find_cpu_by_pir_nomcount(u32 pir)
{
	if (pir > cpu_max_pir)
		return NULL;
	return &cpu_stacks[pir].cpu;
}

struct cpu_thread *find_cpu_by_server(u32 server_no)
{
	struct cpu_thread *t;

	for_each_cpu(t) {
		if (t->server_no == server_no)
			return t;
	}
	return NULL;
}
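
/* Iterate over all populated cpu_thread structures, in PIR order. */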
struct cpu_thread *next_cpu(struct cpu_thread *cpu)
{
	struct cpu_stack *s;
	unsigned int index = 0;

	if (cpu != NULL) {
		s = container_of(cpu, struct cpu_stack, cpu);
		index = s - cpu_stacks + 1;
	}
	for (; index <= cpu_max_pir; index++) {
		cpu = &cpu_stacks[index].cpu;
		if (cpu->state != cpu_state_no_cpu)
			return cpu;
	}
	return NULL;
}

struct cpu_thread *first_cpu(void)
{
	return next_cpu(NULL);
}

struct cpu_thread *next_available_cpu(struct cpu_thread *cpu)
{
	do {
		cpu = next_cpu(cpu);
	} while(cpu && !cpu_is_available(cpu));

	return cpu;
}

struct cpu_thread *first_available_cpu(void)
{
	return next_available_cpu(NULL);
}

struct cpu_thread *next_present_cpu(struct cpu_thread *cpu)
{
	do {
		cpu = next_cpu(cpu);
	} while(cpu && !cpu_is_present(cpu));

	return cpu;
}

struct cpu_thread *first_present_cpu(void)
{
	return next_present_cpu(NULL);
}

struct cpu_thread *next_ungarded_cpu(struct cpu_thread *cpu)
{
	do {
		cpu = next_cpu(cpu);
	} while(cpu && cpu->state == cpu_state_unavailable);

	return cpu;
}

struct cpu_thread *first_ungarded_cpu(void)
{
	return next_ungarded_cpu(NULL);
}

struct cpu_thread *next_ungarded_primary(struct cpu_thread *cpu)
{
	do {
		cpu = next_ungarded_cpu(cpu);
	} while (cpu && !(cpu == cpu->primary || cpu == cpu->ec_primary));

	return cpu;
}

struct cpu_thread *first_ungarded_primary(void)
{
	return next_ungarded_primary(NULL);
}

u8 get_available_nr_cores_in_chip(u32 chip_id)
{
	struct cpu_thread *core;
	u8 nr_cores = 0;

	for_each_available_core_in_chip(core, chip_id)
		nr_cores++;

	return nr_cores;
}

struct cpu_thread *next_available_core_in_chip(struct cpu_thread *core,
					       u32 chip_id)
{
	do {
		core = next_cpu(core);
	} while(core && (!cpu_is_available(core) ||
			 core->chip_id != chip_id ||
			 core->is_secondary));
	return core;
}

struct cpu_thread *first_available_core_in_chip(u32 chip_id)
{
	return next_available_core_in_chip(NULL, chip_id);
}

uint32_t cpu_get_core_index(struct cpu_thread *cpu)
{
	return pir_to_fused_core_id(cpu->pir);
}

void cpu_remove_node(const struct cpu_thread *t)
{
	struct dt_node *i;

	/* Find this cpu node */
	dt_for_each_node(dt_root, i) {
		const struct dt_property *p;

		if (!dt_has_node_property(i, "device_type", "cpu"))
			continue;
		p = dt_find_property(i, "ibm,pir");
		if (!p)
			continue;
		if (dt_property_get_cell(p, 0) == t->pir) {
			dt_free(i);
			return;
		}
	}
	prerror("CPU: Could not find cpu node %i to remove!\n", t->pir);
	abort();
}

void cpu_disable_all_threads(struct cpu_thread *cpu)
{
	unsigned int i;
	struct dt_property *p;

	for (i = 0; i <= cpu_max_pir; i++) {
		struct cpu_thread *t = &cpu_stacks[i].cpu;

		if (t->primary == cpu->primary)
			t->state = cpu_state_disabled;
	}

	/* Mark this core as bad so that the Linux kernel doesn't use this CPU. */
	prlog(PR_DEBUG, "CPU: Mark CPU bad (PIR 0x%04x)...\n", cpu->pir);
	p = __dt_find_property(cpu->node, "status");
	if (p)
		dt_del_property(cpu->node, p);

	dt_add_property_string(cpu->node, "status", "bad");

	/* XXX Do something to actually stop the core */
}
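
/*
 * (Re)initialise a cpu_thread: clear everything past the stack guard,
 * then set up its locks, job queue and identity fields.
 */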
static void init_cpu_thread(struct cpu_thread *t,
			    enum cpu_thread_state state,
			    unsigned int pir)
{
	/* offset within cpu_thread to prevent stack_guard clobber */
	const size_t guard_skip = container_off_var(t, stack_guard) +
				  sizeof(t->stack_guard);

	memset(((void *)t) + guard_skip, 0, sizeof(struct cpu_thread) - guard_skip);
	init_lock(&t->dctl_lock);
	init_lock(&t->job_lock);
	list_head_init(&t->job_queue);
	list_head_init(&t->locks_held);
	t->stack_guard = STACK_CHECK_GUARD_BASE ^ pir;
	t->state = state;
	t->pir = pir;
#ifdef STACK_CHECK_ENABLED
	t->stack_bot_mark = LONG_MAX;
#endif
	t->is_fused_core = is_fused_core(mfspr(SPR_PVR));
	assert(pir == container_of(t, struct cpu_stack, cpu) - cpu_stacks);
}

static void enable_attn(void)
{
	unsigned long hid0;

	hid0 = mfspr(SPR_HID0);
	hid0 |= hid0_attn;
	set_hid0(hid0);
	if (hid0_icache) {
		if (hid0 & hid0_icache) {
			prlog(PR_WARNING, "enable_attn found hid0_icache bit set unexpectedly\n");
			hid0 &= ~hid0_icache;
		}
		/* icache is flushed on hid0_icache 0->1 */
		set_hid0(hid0 | hid0_icache);
		set_hid0(hid0);
	}
}

static void disable_attn(void)
{
	unsigned long hid0;

	hid0 = mfspr(SPR_HID0);
	hid0 &= ~hid0_attn;
	set_hid0(hid0);
	if (hid0_icache) {
		if (hid0 & hid0_icache) {
			prlog(PR_WARNING, "disable_attn found hid0_icache bit set unexpectedly\n");
			hid0 &= ~hid0_icache;
		}
		/* icache is flushed on hid0_icache 0->1 */
		set_hid0(hid0 | hid0_icache);
		set_hid0(hid0);
	}
}

extern void __trigger_attn(void);
void trigger_attn(void)
{
	enable_attn();
	__trigger_attn();
}

static void init_hid(void)
{
	/* attn is enabled even when HV=0, so make sure it's off */
	disable_attn();
}

void __nomcount pre_init_boot_cpu(void)
{
	struct cpu_thread *cpu = this_cpu();

	/* We skip the stack guard ! */
	memset(((void *)cpu) + 8, 0, sizeof(struct cpu_thread) - 8);
}
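
/*
 * Identify the boot processor from its PVR: set proc_gen, the
 * generation-specific HID0 masks and the maximum threads per core, then
 * initialise the boot CPU's cpu_thread and stack.
 */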
void init_boot_cpu(void)
{
	unsigned int pir, pvr;

	pir = mfspr(SPR_PIR);
	pvr = mfspr(SPR_PVR);

	/* Get CPU family and other flags based on PVR */
	switch(PVR_TYPE(pvr)) {
	case PVR_TYPE_P8E:
	case PVR_TYPE_P8:
		proc_gen = proc_gen_p8;
		hid0_hile = SPR_HID0_POWER8_HILE;
		hid0_attn = SPR_HID0_POWER8_ENABLE_ATTN;
		break;
	case PVR_TYPE_P8NVL:
		proc_gen = proc_gen_p8;
		hid0_hile = SPR_HID0_POWER8_HILE;
		hid0_attn = SPR_HID0_POWER8_ENABLE_ATTN;
		break;
	case PVR_TYPE_P9:
	case PVR_TYPE_P9P:
		proc_gen = proc_gen_p9;
		radix_supported = true;
		hid0_hile = SPR_HID0_POWER9_HILE;
		hid0_attn = SPR_HID0_POWER9_ENABLE_ATTN;
		hid0_icache = SPR_HID0_POWER9_FLUSH_ICACHE;
		break;
	case PVR_TYPE_P10:
		proc_gen = proc_gen_p10;
		radix_supported = true;
		hid0_hile = SPR_HID0_POWER10_HILE;
		hid0_attn = SPR_HID0_POWER10_ENABLE_ATTN;
		hid0_icache = SPR_HID0_POWER10_FLUSH_ICACHE;
		break;
	case PVR_TYPE_P11:
		proc_gen = proc_gen_p11;
		radix_supported = true;
		hid0_hile = SPR_HID0_POWER10_HILE;
		hid0_attn = SPR_HID0_POWER10_ENABLE_ATTN;
		hid0_icache = SPR_HID0_POWER10_FLUSH_ICACHE;
		break;
	default:
		proc_gen = proc_gen_unknown;
	}

	/* Get a CPU thread count based on family */
	switch(proc_gen) {
	case proc_gen_p8:
		cpu_threads_max = 8;
		prlog(PR_INFO, "CPU: P8 generation processor"
		      " (max %d threads/core)\n", cpu_threads_max);
		break;
	case proc_gen_p9:
		if (is_fused_core(pvr))
			cpu_threads_max = 8;
		else
			cpu_threads_max = 4;
		prlog(PR_INFO, "CPU: P9 generation processor"
		      " (max %d threads/core)\n", cpu_threads_max);
		break;
	case proc_gen_p10:
		if (is_fused_core(pvr))
			cpu_threads_max = 8;
		else
			cpu_threads_max = 4;
		prlog(PR_INFO, "CPU: P10 generation processor"
		      " (max %d threads/core)\n", cpu_threads_max);
		break;
	case proc_gen_p11:
		if (is_fused_core(pvr))
			cpu_threads_max = 8;
		else
			cpu_threads_max = 4;
		prlog(PR_INFO, "CPU: Power11 generation processor"
		      " (max %d threads/core)\n", cpu_threads_max);
		break;
	default:
		prerror("CPU: Unknown PVR, assuming 1 thread\n");
		cpu_threads_max = 1;
	}

	if (proc_gen == proc_gen_p8) {
#ifdef CONFIG_P8
		if (PVR_VERS_MAJ(mfspr(SPR_PVR)) == 1) {
			prerror("CPU: POWER8 DD1 is not supported\n");
			abort();
		}
#else
		prerror("CPU: POWER8 detected but CONFIG_P8 not set\n");
		abort();
#endif
	}

	if (is_power9n(pvr) && (PVR_VERS_MAJ(pvr) == 1)) {
		prerror("CPU: POWER9N DD1 is not supported\n");
		abort();
	}

	prlog(PR_DEBUG, "CPU: Boot CPU PIR is 0x%04x PVR is 0x%08x\n",
	      pir, pvr);

	/*
	 * Adjust top of RAM to include the boot CPU stack. If we have less
	 * RAM than this, it's not possible to boot.
	 */
	cpu_max_pir = pir;
	top_of_ram += (cpu_max_pir + 1) * STACK_SIZE;

	/* Setup boot CPU state */
	boot_cpu = &cpu_stacks[pir].cpu;
	init_cpu_thread(boot_cpu, cpu_state_active, pir);
	init_boot_tracebuf(boot_cpu);
	assert(this_cpu() == boot_cpu);
	init_hid();
}

static void enable_large_dec(bool on)
{
	u64 lpcr = mfspr(SPR_LPCR);

	if (on)
		lpcr |= SPR_LPCR_P9_LD;
	else
		lpcr &= ~SPR_LPCR_P9_LD;

	mtspr(SPR_LPCR, lpcr);
	isync();
}

#define HIGH_BIT (1ull << 63)

static int find_dec_bits(void)
{
	int bits = 65; /* we always decrement once */
	u64 mask = ~0ull;

	if (proc_gen < proc_gen_p9)
		return 32;

	/* The ISA doesn't specify the width of the decrementer register so we
	 * need to discover it. When in large mode (LPCR.LD = 1) reads from the
	 * DEC SPR are sign extended to 64 bits and writes are truncated to the
	 * physical register width. We can use this behaviour to detect the
	 * width by starting from an all 1s value and right shifting until we
	 * read a value from the DEC with its high bit cleared.
	 */

	enable_large_dec(true);

	do {
		bits--;
		mask = mask >> 1;
		mtspr(SPR_DEC, mask);
	} while (mfspr(SPR_DEC) & HIGH_BIT);

	enable_large_dec(false);

	prlog(PR_DEBUG, "CPU: decrementer bits %d\n", bits);
	return bits;
}

static void init_tm_suspend_mode_property(void)
{
	struct dt_node *node;

	/* If we don't find anything, assume TM suspend is enabled */
	tm_suspend_enabled = true;

	node = dt_find_by_path(dt_root, "/ibm,opal/fw-features/tm-suspend-mode");
	if (!node)
		return;

	if (dt_find_property(node, "disabled"))
		tm_suspend_enabled = false;
}
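
/*
 * Scan /cpus in the device-tree to compute cpu_max_pir and the common
 * per-core thread count (cpu_thread_count).
 */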
void init_cpu_max_pir(void)
{
	struct dt_node *cpus, *cpu;

	cpus = dt_find_by_path(dt_root, "/cpus");
	assert(cpus);

	/* Iterate all CPUs in the device-tree */
	dt_for_each_child(cpus, cpu) {
		unsigned int pir, server_no, threads;
		const struct dt_property *p;

		/* Skip cache nodes */
		if (strcmp(dt_prop_get(cpu, "device_type"), "cpu"))
			continue;

		server_no = dt_prop_get_u32(cpu, "reg");

		/* If PIR property is absent, assume it's the same as the
		 * server number
		 */
		pir = dt_prop_get_u32_def(cpu, "ibm,pir", server_no);

		p = dt_find_property(cpu, "ibm,ppc-interrupt-server#s");
		if (!p)
			continue;
		threads = p->len / 4;
		assert(threads > 0);
		if (threads > cpu_threads_max) {
			prlog(PR_WARNING, "CPU: Threads out of range for PIR 0x%04x"
			      " threads=%d max=%d\n",
			      pir, threads, cpu_threads_max);
			threads = cpu_threads_max;
		}
		if (!cpu_thread_count) {
			cpu_thread_count = threads;
		} else {
			/* Do not support asymmetric SMT topologies */
			assert(cpu_thread_count == threads);
		}

		if (cpu_max_pir < pir + threads - 1)
			cpu_max_pir = pir + threads - 1;
	}

	prlog(PR_DEBUG, "CPU: New max PIR set to 0x%x\n", cpu_max_pir);
}

/*
 * Set cpu->state to cpu_state_no_cpu for all secondaries, before the dt is
 * parsed and they will be flipped to present as populated CPUs are found.
 *
 * Some configurations (e.g., with memory encryption) will not zero system
 * memory at boot, so can't rely on cpu->state to be zero (== cpu_state_no_cpu).
 */
static void mark_all_secondary_cpus_absent(void)
{
	unsigned int pir;
	struct cpu_thread *cpu;

	for (pir = 0; pir <= cpu_max_pir; pir++) {
		cpu = &cpu_stacks[pir].cpu;
		if (cpu == boot_cpu)
			continue;
		cpu->state = cpu_state_no_cpu;
	}
}

void init_all_cpus(void)
{
	struct dt_node *cpus, *cpu;
	unsigned int pir, thread;
	int dec_bits = find_dec_bits();

	cpus = dt_find_by_path(dt_root, "/cpus");
	assert(cpus);

	init_tm_suspend_mode_property();

	mark_all_secondary_cpus_absent();

	/* Iterate all CPUs in the device-tree */
	dt_for_each_child(cpus, cpu) {
		unsigned int server_no, chip_id, threads;
		enum cpu_thread_state state;
		const struct dt_property *p;
		struct cpu_thread *t, *pt0, *pt1;

		/* Skip cache nodes */
		if (strcmp(dt_prop_get(cpu, "device_type"), "cpu"))
			continue;

		server_no = dt_prop_get_u32(cpu, "reg");

		/* If PIR property is absent, assume it's the same as the
		 * server number
		 */
		pir = dt_prop_get_u32_def(cpu, "ibm,pir", server_no);

		/* We should always have an ibm,chip-id property */
		chip_id = dt_get_chip_id(cpu);

		/* Only use operational CPUs */
		if (!strcmp(dt_prop_get(cpu, "status"), "okay")) {
			state = cpu_state_present;
			get_chip(chip_id)->ex_present = true;
		} else {
			state = cpu_state_unavailable;
		}

		prlog(PR_INFO, "CPU: CPU from DT PIR=0x%04x Server#=0x%x"
		      " State=%d\n", pir, server_no, state);

		/* Setup thread 0 */
		assert(pir <= cpu_max_pir);
		t = pt0 = &cpu_stacks[pir].cpu;

		if (t != boot_cpu) {
			init_cpu_thread(t, state, pir);
			/* Each cpu gets its own later in init_trace_buffers */
			t->trace = boot_cpu->trace;
		}
		if (t->is_fused_core)
			pt1 = &cpu_stacks[pir + 1].cpu;
		else
			pt1 = pt0;
		t->server_no = server_no;
		t->primary = t->ec_primary = t;
		t->node = cpu;
		t->chip_id = chip_id;
		t->icp_regs = NULL; /* Will be set later */
#ifdef DEBUG_LOCKS
		t->requested_lock = NULL;
#endif
		t->core_hmi_state = 0;
		t->core_hmi_state_ptr = &t->core_hmi_state;

		/* Add associativity properties */
		add_core_associativity(t);

		/* Add the decrementer width property */
		dt_add_property_cells(cpu, "ibm,dec-bits", dec_bits);

		if (t->is_fused_core)
			dt_add_property(t->node, "ibm,fused-core", NULL, 0);

		/* Iterate threads */
		p = dt_find_property(cpu, "ibm,ppc-interrupt-server#s");
		if (!p)
			continue;
		threads = p->len / 4;
		for (thread = 1; thread < threads; thread++) {
			prlog(PR_TRACE, "CPU: secondary thread %d found\n",
			      thread);
			t = &cpu_stacks[pir + thread].cpu;
			init_cpu_thread(t, state, pir + thread);
			t->trace = boot_cpu->trace;
			t->server_no = dt_property_get_cell(p, thread);
			t->is_secondary = true;
			t->is_fused_core = pt0->is_fused_core;
			t->primary = pt0;
			t->ec_primary = (thread & 1) ? pt1 : pt0;
			t->node = cpu;
			t->chip_id = chip_id;
			t->core_hmi_state_ptr = &pt0->core_hmi_state;
		}
		prlog(PR_INFO, "CPU: %d secondary threads\n", thread);
	}
}
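
/*
 * Release the secondaries spinning in the asm entry path by setting
 * cpu_secondary_start, then wait for every present CPU to mark itself
 * active via cpu_callin().
 */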
void cpu_bringup(void)
{
	struct cpu_thread *t;
	uint32_t count = 0;

	prlog(PR_INFO, "CPU: Setting up secondary CPU state\n");

	op_display(OP_LOG, OP_MOD_CPU, 0x0000);

	/* Tell everybody to chime in ! */
	prlog(PR_INFO, "CPU: Calling in all processors...\n");
	cpu_secondary_start = 1;
	sync();

	op_display(OP_LOG, OP_MOD_CPU, 0x0002);

	for_each_cpu(t) {
		if (t->state != cpu_state_present &&
		    t->state != cpu_state_active)
			continue;

		/* Add a callin timeout ? If so, call cpu_remove_node(t). */
		while (t->state != cpu_state_active) {
			smt_lowest();
			sync();
		}
		smt_medium();
		count++;
	}

	prlog(PR_NOTICE, "CPU: All %d processors called in...\n", count);

	op_display(OP_LOG, OP_MOD_CPU, 0x0003);
}

void cpu_callin(struct cpu_thread *cpu)
{
	sync();
	cpu->state = cpu_state_active;
	sync();

	cpu->job_has_no_return = false;
	if (cpu_is_thread0(cpu))
		init_hid();
}

static void opal_start_thread_job(void *data)
{
	cpu_give_self_os();

	/* We do not return, so let's mark the job as
	 * complete
	 */
	start_kernel_secondary((uint64_t)data);
}

static int64_t opal_start_cpu_thread(uint64_t server_no, uint64_t start_address)
{
	struct cpu_thread *cpu;
	struct cpu_job *job;

	if (!opal_addr_valid((void *)start_address))
		return OPAL_PARAMETER;

	cpu = find_cpu_by_server(server_no);
	if (!cpu) {
		prerror("OPAL: Start invalid CPU 0x%04llx !\n", server_no);
		return OPAL_PARAMETER;
	}
	prlog(PR_DEBUG, "OPAL: Start CPU 0x%04llx (PIR 0x%04x) -> 0x%016llx\n",
	      server_no, cpu->pir, start_address);

	lock(&reinit_lock);
	if (!cpu_is_available(cpu)) {
		unlock(&reinit_lock);
		prerror("OPAL: CPU not active in OPAL !\n");
		return OPAL_WRONG_STATE;
	}
	if (cpu->in_reinit) {
		unlock(&reinit_lock);
		prerror("OPAL: CPU being reinitialized !\n");
		return OPAL_WRONG_STATE;
	}
	job = __cpu_queue_job(cpu, "start_thread",
			      opal_start_thread_job, (void *)start_address,
			      true);
	unlock(&reinit_lock);
	if (!job) {
		prerror("OPAL: Failed to create CPU start job !\n");
		return OPAL_INTERNAL_ERROR;
	}
	return OPAL_SUCCESS;
}
opal_call(OPAL_START_CPU, opal_start_cpu_thread, 2);

static int64_t opal_query_cpu_status(uint64_t server_no, uint8_t *thread_status)
{
	struct cpu_thread *cpu;

	if (!opal_addr_valid(thread_status))
		return OPAL_PARAMETER;

	cpu = find_cpu_by_server(server_no);
	if (!cpu) {
		prerror("OPAL: Query invalid CPU 0x%04llx !\n", server_no);
		return OPAL_PARAMETER;
	}
	if (!cpu_is_available(cpu) && cpu->state != cpu_state_os) {
		prerror("OPAL: CPU not active in OPAL nor OS !\n");
		return OPAL_PARAMETER;
	}
	switch(cpu->state) {
	case cpu_state_os:
		*thread_status = OPAL_THREAD_STARTED;
		break;
	case cpu_state_active:
		/* Active in skiboot -> inactive in OS */
		*thread_status = OPAL_THREAD_INACTIVE;
		break;
	default:
		*thread_status = OPAL_THREAD_UNAVAILABLE;
	}

	return OPAL_SUCCESS;
}
opal_call(OPAL_QUERY_CPU_STATUS, opal_query_cpu_status, 2);

static int64_t opal_return_cpu(void)
{
	prlog(PR_DEBUG, "OPAL: Returning CPU 0x%04x\n", this_cpu()->pir);

	this_cpu()->in_opal_call--;
	if (this_cpu()->in_opal_call != 0) {
		printf("OPAL in_opal_call=%u\n", this_cpu()->in_opal_call);
	}

	__secondary_cpu_entry();

	return OPAL_HARDWARE; /* Should not happen */
}
opal_call(OPAL_RETURN_CPU, opal_return_cpu, 0);

struct hid0_change_req {
	uint64_t clr_bits;
	uint64_t set_bits;
};

static void cpu_change_hid0(void *__req)
{
	struct hid0_change_req *req = __req;
	unsigned long hid0, new_hid0;

	hid0 = new_hid0 = mfspr(SPR_HID0);
	new_hid0 &= ~req->clr_bits;
	new_hid0 |= req->set_bits;
	prlog(PR_DEBUG, "CPU: [%08x] HID0 change 0x%016lx -> 0x%016lx\n",
	      this_cpu()->pir, hid0, new_hid0);
	set_hid0(new_hid0);
}

static int64_t cpu_change_all_hid0(struct hid0_change_req *req)
{
	struct cpu_thread *cpu;
	struct cpu_job **jobs;

	jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1));
	assert(jobs);

	for_each_available_cpu(cpu) {
		if (!cpu_is_thread0(cpu) && !cpu_is_core_chiplet_primary(cpu))
			continue;
		if (cpu == this_cpu())
			continue;
		jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_change_hid0",
					       cpu_change_hid0, req);
	}

	/* this cpu */
	cpu_change_hid0(req);

	for_each_available_cpu(cpu) {
		if (jobs[cpu->pir])
			cpu_wait_job(jobs[cpu->pir], true);
	}

	free(jobs);

	return OPAL_SUCCESS;
}

void cpu_set_hile_mode(bool hile)
{
	struct hid0_change_req req;

	if (hile == current_hile_mode)
		return;

	if (hile) {
		req.clr_bits = 0;
		req.set_bits = hid0_hile;
	} else {
		req.clr_bits = hid0_hile;
		req.set_bits = 0;
	}
	cpu_change_all_hid0(&req);
	current_hile_mode = hile;
}

static void cpu_cleanup_one(void *param __unused)
{
	mtspr(SPR_AMR, 0);
	mtspr(SPR_IAMR, 0);
	mtspr(SPR_PCR, 0);
}

static int64_t cpu_cleanup_all(void)
{
	struct cpu_thread *cpu;
	struct cpu_job **jobs;

	jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1));
	assert(jobs);

	for_each_available_cpu(cpu) {
		if (cpu == this_cpu())
			continue;
		jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_cleanup",
					       cpu_cleanup_one, NULL);
	}

	/* this cpu */
	cpu_cleanup_one(NULL);

	for_each_available_cpu(cpu) {
		if (jobs[cpu->pir])
			cpu_wait_job(jobs[cpu->pir], true);
	}

	free(jobs);

	return OPAL_SUCCESS;
}

void cpu_fast_reboot_complete(void)
{
	/* Fast reboot will have set HID0:HILE to skiboot endian */
	current_hile_mode = HAVE_LITTLE_ENDIAN;

	/* and set HID0:RADIX */
	if (proc_gen == proc_gen_p9)
		current_radix_mode = true;

	/* P8 clears TLBs in cleanup_cpu_state() */
	if (proc_gen >= proc_gen_p9)
		cleanup_global_tlb();
}
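
/*
 * OPAL_REINIT_CPUS: called by the OS with all other CPUs returned to
 * OPAL. Handles the HILE (interrupt endian) switch and the hash/radix
 * HID mode on P9, applying any HID0 changes on all cores, and reports
 * whether TM suspend is already disabled in this firmware build.
 */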
static int64_t opal_reinit_cpus(uint64_t flags)
{
	struct hid0_change_req req = { 0, 0 };
	struct cpu_thread *cpu;
	int64_t rc = OPAL_SUCCESS;
	int i;

	prlog(PR_DEBUG, "OPAL: CPU re-init with flags: 0x%llx\n", flags);

	if (flags & OPAL_REINIT_CPUS_HILE_LE)
		prlog(PR_INFO, "OPAL: Switch to little-endian OS\n");
	else if (flags & OPAL_REINIT_CPUS_HILE_BE)
		prlog(PR_INFO, "OPAL: Switch to big-endian OS\n");

again:
	lock(&reinit_lock);

	for (cpu = first_cpu(); cpu; cpu = next_cpu(cpu)) {
		if (cpu == this_cpu() || cpu->in_reinit)
			continue;
		if (cpu->state == cpu_state_os) {
			unlock(&reinit_lock);
			/*
			 * This might be a race with a CPU still returning
			 * to OPAL during kexec; wait a bit and try again.
			 */
			for (i = 0; (i < 1000) &&
			     (cpu->state == cpu_state_os); i++) {
				time_wait_ms(1);
			}
			if (cpu->state == cpu_state_os) {
				prerror("OPAL: CPU 0x%x not in OPAL !\n", cpu->pir);
				return OPAL_WRONG_STATE;
			}
			goto again;
		}
		cpu->in_reinit = true;
	}
	/*
	 * Now we need to mark ourselves "active" or we'll be skipped
	 * by the various "for_each_active_..."
	 */
	this_cpu()->state = cpu_state_active;
	this_cpu()->in_reinit = true;
	unlock(&reinit_lock);

	/*
	 * This cleans up a few things left over by Linux
	 * that can cause problems in cases such as radix->hash
	 * transitions. Ideally Linux should do it but doing it
	 * here works around existing broken kernels.
	 */
	cpu_cleanup_all();

	if (flags & (OPAL_REINIT_CPUS_HILE_BE |
		     OPAL_REINIT_CPUS_HILE_LE)) {
		bool hile = !!(flags & OPAL_REINIT_CPUS_HILE_LE);

		flags &= ~(OPAL_REINIT_CPUS_HILE_BE | OPAL_REINIT_CPUS_HILE_LE);
		if (hile != current_hile_mode) {
			if (hile)
				req.set_bits |= hid0_hile;
			else
				req.clr_bits |= hid0_hile;
			current_hile_mode = hile;
		}
	}

	/* If MMU mode change is supported */
	if (radix_supported &&
	    (flags & (OPAL_REINIT_CPUS_MMU_HASH |
		      OPAL_REINIT_CPUS_MMU_RADIX))) {
		bool radix = !!(flags & OPAL_REINIT_CPUS_MMU_RADIX);

		flags &= ~(OPAL_REINIT_CPUS_MMU_HASH |
			   OPAL_REINIT_CPUS_MMU_RADIX);

		if (proc_gen == proc_gen_p9 && radix != current_radix_mode) {
			if (radix)
				req.set_bits |= SPR_HID0_POWER9_RADIX;
			else
				req.clr_bits |= SPR_HID0_POWER9_RADIX;

			current_radix_mode = radix;
		}
	}

	/* Cleanup the TLB. We do that unconditionally, this works
	 * around issues where OSes fail to invalidate the PWC in Radix
	 * mode for example. This only works on P9 and later, but we
	 * also know we don't have a problem with Linux cleanups on
	 * P8 so this isn't a problem. If we wanted to cleanup the
	 * TLB on P8 as well, we'd have to use jobs to do it locally
	 * on each CPU.
	 */
	cleanup_global_tlb();

	/* Apply HID bits changes if any */
	if (req.set_bits || req.clr_bits)
		cpu_change_all_hid0(&req);

	if (flags & OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED) {
		flags &= ~OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED;

		if (tm_suspend_enabled)
			rc = OPAL_UNSUPPORTED;
		else
			rc = OPAL_SUCCESS;
	}

	if (flags != 0)
		rc = OPAL_UNSUPPORTED;

	/* And undo the above */
	lock(&reinit_lock);
	this_cpu()->state = cpu_state_os;
	for (cpu = first_cpu(); cpu; cpu = next_cpu(cpu))
		cpu->in_reinit = false;
	unlock(&reinit_lock);

	return rc;
}
opal_call(OPAL_REINIT_CPUS, opal_reinit_cpus, 1);

#define NMMU_XLAT_CTL_PTCR 0xb
static int64_t nmmu_set_ptcr(uint64_t chip_id, struct dt_node *node, uint64_t ptcr)
{
	uint32_t nmmu_base_addr;

	nmmu_base_addr = dt_get_address(node, 0, NULL);
	return xscom_write(chip_id, nmmu_base_addr + NMMU_XLAT_CTL_PTCR, ptcr);
}

/*
 * Set up the Nest MMU PTCR register for all chips in the system or
 * the specified chip id.
 *
 * The PTCR value may be overwritten so long as all users have been
 * quiesced. If it is set to an invalid memory address the system will
 * checkstop if anything attempts to use it.
 *
 * Returns OPAL_UNSUPPORTED if no nest mmu was found.
 */
static int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr)
{
	struct dt_node *node;
	int64_t rc = OPAL_UNSUPPORTED;

	if (chip_id == -1ULL)
		dt_for_each_compatible(dt_root, node, "ibm,power9-nest-mmu") {
			chip_id = dt_get_chip_id(node);
			if ((rc = nmmu_set_ptcr(chip_id, node, ptcr)))
				return rc;
		}
	else
		dt_for_each_compatible_on_chip(dt_root, node, "ibm,power9-nest-mmu", chip_id)
			if ((rc = nmmu_set_ptcr(chip_id, node, ptcr)))
				return rc;

	return rc;
}
opal_call(OPAL_NMMU_SET_PTCR, opal_nmmu_set_ptcr, 2);

static void _exit_uv_mode(void *data __unused)
{
	prlog(PR_DEBUG, "Exit uv mode on cpu pir 0x%04x\n", this_cpu()->pir);
	/* HW has smfctrl shared between threads but on Mambo it is per-thread */
	if (chip_quirk(QUIRK_MAMBO_CALLOUTS))
		exit_uv_mode(1);
	else
		exit_uv_mode(cpu_is_thread0(this_cpu()));
}
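
/*
 * Take all threads out of ultravisor (secure) mode: secondaries first,
 * then the thread-0 threads that own SMFCTRL, and finally the calling
 * CPU itself.
 */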
void cpu_disable_pef(void)
{
	struct cpu_thread *cpu;
	struct cpu_job **jobs;

	if (!(mfmsr() & MSR_S)) {
		prlog(PR_DEBUG, "UV mode off on cpu pir 0x%04x\n", this_cpu()->pir);
		return;
	}

	jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1));
	assert(jobs);

	/* Exit uv mode on all secondary threads before touching
	 * smfctrl on thread 0 */
	for_each_available_cpu(cpu) {
		if (cpu == this_cpu())
			continue;

		if (!cpu_is_thread0(cpu))
			jobs[cpu->pir] = cpu_queue_job(cpu, "exit_uv_mode",
						       _exit_uv_mode, NULL);
	}

	for_each_available_cpu(cpu)
		if (jobs[cpu->pir]) {
			cpu_wait_job(jobs[cpu->pir], true);
			jobs[cpu->pir] = NULL;
		}

	/* Exit uv mode and disable smfctrl on primary threads */
	for_each_available_cpu(cpu) {
		if (cpu == this_cpu())
			continue;

		if (cpu_is_thread0(cpu))
			jobs[cpu->pir] = cpu_queue_job(cpu, "exit_uv_mode",
						       _exit_uv_mode, NULL);
	}

	for_each_available_cpu(cpu)
		if (jobs[cpu->pir])
			cpu_wait_job(jobs[cpu->pir], true);

	free(jobs);

	_exit_uv_mode(NULL);
}