Adding upstream version 6.6.15.upstream/6.6.15

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-11 08:27:49 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-11 08:27:49 +0000
commit: ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
tree: b2d64bc10158fdd5497876388cd68142ca374ed3 /drivers/cpuidle
parent: Initial commit. (diff)
download: linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz
linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip
43 files changed, 9728 insertions, 0 deletions
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
new file mode 100644
index 0000000000..cac5997dca
--- /dev/null
+++ b/drivers/cpuidle/Kconfig
@@ -0,0 +1,88 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menu "CPU Idle"
+
+config CPU_IDLE
+	bool "CPU idle PM support"
+	default y if ACPI || PPC_PSERIES
+	select CPU_IDLE_GOV_LADDER if (!NO_HZ && !NO_HZ_IDLE)
+	select CPU_IDLE_GOV_MENU if (NO_HZ || NO_HZ_IDLE) && !CPU_IDLE_GOV_TEO
+	help
+	  CPU idle is a generic framework for supporting software-controlled
+	  idle processor power management.  It includes modular cross-platform
+	  governors that can be swapped during runtime.
+
+	  If you're using an ACPI-enabled platform, you should say Y here.
+
+if CPU_IDLE
+
+config CPU_IDLE_MULTIPLE_DRIVERS
+	bool
+
+config CPU_IDLE_GOV_LADDER
+	bool "Ladder governor (for periodic timer tick)"
+
+config CPU_IDLE_GOV_MENU
+	bool "Menu governor (for tickless system)"
+
+config CPU_IDLE_GOV_TEO
+	bool "Timer events oriented (TEO) governor (for tickless systems)"
+	help
+	  This governor implements a simplified idle state selection method
+	  focused on timer events and does not do any interactivity boosting.
+
+	  Some workloads benefit from using it and it generally should be safe
+	  to use.  Say Y here if you are not happy with the alternatives.
+
+config CPU_IDLE_GOV_HALTPOLL
+	bool "Haltpoll governor (for virtualized systems)"
+	depends on KVM_GUEST
+	help
+	  This governor implements haltpoll idle state selection, to be
+	  used in conjunction with the haltpoll cpuidle driver, allowing
+	  for polling for a certain amount of time before entering idle
+	  state.
+
+	  Some virtualized workloads benefit from using it.
+
+config DT_IDLE_STATES
+	bool
+
+config DT_IDLE_GENPD
+	depends on PM_GENERIC_DOMAINS_OF
+	bool
+
+menu "ARM CPU Idle Drivers"
+depends on ARM || ARM64
+source "drivers/cpuidle/Kconfig.arm"
+endmenu
+
+menu "MIPS CPU Idle Drivers"
+depends on MIPS
+source "drivers/cpuidle/Kconfig.mips"
+endmenu
+
+menu "POWERPC CPU Idle Drivers"
+depends on PPC
+source "drivers/cpuidle/Kconfig.powerpc"
+endmenu
+
+menu "RISC-V CPU Idle Drivers"
+depends on RISCV
+source "drivers/cpuidle/Kconfig.riscv"
+endmenu
+
+config HALTPOLL_CPUIDLE
+	tristate "Halt poll cpuidle driver"
+	depends on X86 && KVM_GUEST
+	select CPU_IDLE_GOV_HALTPOLL
+	default y
+	help
+	 This option enables the halt poll cpuidle driver, which allows polling
+	 before halting in the guest (more efficient than polling in the
+	 host via halt_poll_ns for some scenarios).
+
+endif
+
+config ARCH_NEEDS_CPU_IDLE_COUPLED
+	def_bool n
+endmenu
diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm
new file mode 100644
index 0000000000..a1ee475d18
--- /dev/null
+++ b/drivers/cpuidle/Kconfig.arm
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# ARM CPU Idle drivers
+#
+config ARM_CPUIDLE
+	bool "Generic ARM CPU idle Driver"
+	depends on ARM
+	select DT_IDLE_STATES
+	select CPU_IDLE_MULTIPLE_DRIVERS
+	help
+	  Select this to enable generic cpuidle driver for ARM.
+	  It provides a generic idle driver whose idle states are configured
+	  at run-time through DT nodes. The CPUidle suspend backend is
+	  initialized by calling the CPU operations init idle hook
+	  provided by architecture code.
+
+config ARM_PSCI_CPUIDLE
+	bool "PSCI CPU idle Driver"
+	depends on ARM_PSCI_FW
+	select DT_IDLE_STATES
+	select CPU_IDLE_MULTIPLE_DRIVERS
+	help
+	  Select this to enable PSCI firmware based CPUidle driver for ARM.
+	  It provides an idle driver that is capable of detecting and
+	  managing idle states through the PSCI firmware interface.
+
+	  The driver has limitations when used with PREEMPT_RT:
+	  - If the idle states are described with the non-hierarchical layout,
+	    all idle states are still available.
+
+	  - If the idle states are described with the hierarchical layout,
+	    only the idle states defined per CPU are available, but not the ones
+	    being shared among a group of CPUs (aka cluster idle states).
+
+config ARM_PSCI_CPUIDLE_DOMAIN
+	bool "PSCI CPU idle Domain"
+	depends on ARM_PSCI_CPUIDLE
+	depends on PM_GENERIC_DOMAINS_OF
+	select DT_IDLE_GENPD
+	default y
+	help
+	  Select this to enable the PSCI based CPUidle driver to use PM domains,
+	  which is needed to support the hierarchical DT based layout of the
+	  idle states.
+
+config ARM_BIG_LITTLE_CPUIDLE
+	bool "Support for ARM big.LITTLE processors"
+	depends on ARCH_VEXPRESS_TC2_PM || ARCH_EXYNOS || COMPILE_TEST
+	depends on MCPM && !ARM64
+	select ARM_CPU_SUSPEND
+	select CPU_IDLE_MULTIPLE_DRIVERS
+	select DT_IDLE_STATES
+	help
+	  Select this option to enable CPU idle driver for big.LITTLE based
+	  ARM systems. The driver manages CPU coordination through MCPM and
+	  defines different C-states for little and big cores through the
+	  multiple CPU idle drivers infrastructure.
+
+config ARM_CLPS711X_CPUIDLE
+	bool "CPU Idle Driver for CLPS711X processors"
+	depends on ARCH_CLPS711X && !ARM64 || COMPILE_TEST
+	help
+	  Select this to enable cpuidle on Cirrus Logic CLPS711X SOCs.
+
+config ARM_HIGHBANK_CPUIDLE
+	bool "CPU Idle Driver for Calxeda processors"
+	depends on ARM_PSCI && !ARM64
+	select ARM_CPU_SUSPEND
+	help
+	  Select this to enable cpuidle on Calxeda processors.
+
+config ARM_KIRKWOOD_CPUIDLE
+	bool "CPU Idle Driver for Marvell Kirkwood SoCs"
+	depends on (MACH_KIRKWOOD || COMPILE_TEST) && !ARM64
+	help
+	  This adds the CPU Idle driver for Marvell Kirkwood SoCs.
+
+config ARM_ZYNQ_CPUIDLE
+	bool "CPU Idle Driver for Xilinx Zynq processors"
+	depends on (ARCH_ZYNQ || COMPILE_TEST) && !ARM64
+	help
+	  Select this to enable cpuidle on Xilinx Zynq processors.
+
+config ARM_U8500_CPUIDLE
+	bool "Cpu Idle Driver for the ST-E u8500 processors"
+	depends on ARCH_U8500 && !ARM64
+	help
+	  Select this to enable cpuidle for ST-E u8500 processors.
+
+config ARM_AT91_CPUIDLE
+	bool "Cpu Idle Driver for the AT91 processors"
+	default y
+	depends on (ARCH_AT91 || COMPILE_TEST) && !ARM64
+	help
+	  Select this to enable cpuidle for AT91 processors.
+
+config ARM_EXYNOS_CPUIDLE
+	bool "Cpu Idle Driver for the Exynos processors"
+	depends on (ARCH_EXYNOS || COMPILE_TEST) && !ARM64
+	select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP
+	help
+	  Select this to enable cpuidle for Exynos processors.
+
+config ARM_MVEBU_V7_CPUIDLE
+	bool "CPU Idle Driver for mvebu v7 family processors"
+	depends on (ARCH_MVEBU || COMPILE_TEST) && !ARM64
+	help
+	  Select this to enable cpuidle on Armada 370, 38x and XP processors.
+
+config ARM_TEGRA_CPUIDLE
+	bool "CPU Idle Driver for NVIDIA Tegra SoCs"
+	depends on (ARCH_TEGRA || COMPILE_TEST) && !ARM64 && MMU
+	depends on ARCH_SUSPEND_POSSIBLE
+	select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP
+	select ARM_CPU_SUSPEND
+	help
+	  Select this to enable cpuidle for NVIDIA Tegra20/30/114/124 SoCs.
+
+config ARM_QCOM_SPM_CPUIDLE
+	bool "CPU Idle Driver for Qualcomm Subsystem Power Manager (SPM)"
+	depends on (ARCH_QCOM || COMPILE_TEST) && !ARM64 && MMU
+	depends on ARCH_SUSPEND_POSSIBLE
+	select ARM_CPU_SUSPEND
+	select CPU_IDLE_MULTIPLE_DRIVERS
+	select DT_IDLE_STATES
+	select QCOM_SCM
+	select QCOM_SPM
+	help
+	  Select this to enable cpuidle for Qualcomm processors.
+	  The Subsystem Power Manager (SPM) controls low power modes for the
+	  CPU and L2 cores. It interfaces with various system drivers to put
+	  the cores in low power modes.
diff --git a/drivers/cpuidle/Kconfig.mips b/drivers/cpuidle/Kconfig.mips
new file mode 100644
index 0000000000..c3c011af4a
--- /dev/null
+++ b/drivers/cpuidle/Kconfig.mips
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# MIPS CPU Idle Drivers
+#
+config MIPS_CPS_CPUIDLE
+	bool "CPU Idle driver for MIPS CPS platforms"
+	depends on CPU_IDLE && MIPS_CPS
+	depends on SYS_SUPPORTS_MIPS_CPS
+	select ARCH_NEEDS_CPU_IDLE_COUPLED if MIPS_MT || CPU_MIPSR6
+	select GENERIC_CLOCKEVENTS_BROADCAST if SMP
+	select MIPS_CPS_PM
+	default y
+	help
+	  Select this option to enable processor idle state management
+	  through cpuidle for systems built around the MIPS Coherent
+	  Processing System (CPS) architecture. In order to make use of
+	  the deepest idle states you will need to ensure that you are
+	  also using the CONFIG_MIPS_CPS SMP implementation.
diff --git a/drivers/cpuidle/Kconfig.powerpc b/drivers/cpuidle/Kconfig.powerpc
new file mode 100644
index 0000000000..a797a02b7b
--- /dev/null
+++ b/drivers/cpuidle/Kconfig.powerpc
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# POWERPC CPU Idle Drivers
+#
+config PSERIES_CPUIDLE
+	bool "Cpuidle driver for pSeries platforms"
+	depends on CPU_IDLE
+	depends on PPC_PSERIES
+	default y
+	help
+	  Select this option to enable processor idle state management
+	  through cpuidle subsystem.
+
+config POWERNV_CPUIDLE
+	bool "Cpuidle driver for powernv platforms"
+	depends on CPU_IDLE
+	depends on PPC_POWERNV
+	default y
+	help
+	  Select this option to enable processor idle state management
+	  through cpuidle subsystem.
diff --git a/drivers/cpuidle/Kconfig.riscv b/drivers/cpuidle/Kconfig.riscv
new file mode 100644
index 0000000000..78518c26af
--- /dev/null
+++ b/drivers/cpuidle/Kconfig.riscv
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# RISC-V CPU Idle drivers
+#
+
+config RISCV_SBI_CPUIDLE
+	bool "RISC-V SBI CPU idle Driver"
+	depends on RISCV_SBI
+	select DT_IDLE_STATES
+	select CPU_IDLE_MULTIPLE_DRIVERS
+	select DT_IDLE_GENPD if PM_GENERIC_DOMAINS_OF
+	help
+	  Select this option to enable RISC-V SBI firmware based CPU idle
+	  driver for RISC-V systems. This driver also supports hierarchical
+	  DT based layout of the idle state.
diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
new file mode 100644
index 0000000000..d103342b7c
--- /dev/null
+++ b/drivers/cpuidle/Makefile
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for cpuidle.
+#
+
+obj-y += cpuidle.o driver.o governor.o sysfs.o governors/
+obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o
+obj-$(CONFIG_DT_IDLE_STATES)		  += dt_idle_states.o
+obj-$(CONFIG_DT_IDLE_GENPD)		  += dt_idle_genpd.o
+obj-$(CONFIG_ARCH_HAS_CPU_RELAX)	  += poll_state.o
+obj-$(CONFIG_HALTPOLL_CPUIDLE)		  += cpuidle-haltpoll.o
+
+##################################################################################
+# ARM SoC drivers
+obj-$(CONFIG_ARM_MVEBU_V7_CPUIDLE) += cpuidle-mvebu-v7.o
+obj-$(CONFIG_ARM_BIG_LITTLE_CPUIDLE)	+= cpuidle-big_little.o
+obj-$(CONFIG_ARM_CLPS711X_CPUIDLE)	+= cpuidle-clps711x.o
+obj-$(CONFIG_ARM_HIGHBANK_CPUIDLE)	+= cpuidle-calxeda.o
+obj-$(CONFIG_ARM_KIRKWOOD_CPUIDLE)	+= cpuidle-kirkwood.o
+obj-$(CONFIG_ARM_ZYNQ_CPUIDLE)		+= cpuidle-zynq.o
+obj-$(CONFIG_ARM_U8500_CPUIDLE)         += cpuidle-ux500.o
+obj-$(CONFIG_ARM_AT91_CPUIDLE)          += cpuidle-at91.o
+obj-$(CONFIG_ARM_EXYNOS_CPUIDLE)        += cpuidle-exynos.o
+obj-$(CONFIG_ARM_CPUIDLE)		+= cpuidle-arm.o
+obj-$(CONFIG_ARM_PSCI_CPUIDLE)		+= cpuidle-psci.o
+obj-$(CONFIG_ARM_PSCI_CPUIDLE_DOMAIN)	+= cpuidle-psci-domain.o
+obj-$(CONFIG_ARM_TEGRA_CPUIDLE)		+= cpuidle-tegra.o
+obj-$(CONFIG_ARM_QCOM_SPM_CPUIDLE)	+= cpuidle-qcom-spm.o
+
+###############################################################################
+# MIPS drivers
+obj-$(CONFIG_MIPS_CPS_CPUIDLE)		+= cpuidle-cps.o
+
+###############################################################################
+# POWERPC drivers
+obj-$(CONFIG_PSERIES_CPUIDLE)		+= cpuidle-pseries.o
+obj-$(CONFIG_POWERNV_CPUIDLE)		+= cpuidle-powernv.o
+
+###############################################################################
+# RISC-V drivers
+obj-$(CONFIG_RISCV_SBI_CPUIDLE)		+= cpuidle-riscv-sbi.o
diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
new file mode 100644
index 0000000000..9acde71558
--- /dev/null
+++ b/drivers/cpuidle/coupled.c
@@ -0,0 +1,791 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * coupled.c - helper functions to enter the same idle state on multiple cpus
+ *
+ * Copyright (c) 2011 Google, Inc.
+ *
+ * Author: Colin Cross <ccross@android.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "cpuidle.h"
+
+/**
+ * DOC: Coupled cpuidle states
+ *
+ * On some ARM SMP SoCs (OMAP4460, Tegra 2, and probably more), the
+ * cpus cannot be independently powered down, either due to
+ * sequencing restrictions (on Tegra 2, cpu 0 must be the last to
+ * power down), or due to HW bugs (on OMAP4460, a cpu powering up
+ * will corrupt the gic state unless the other cpu runs a work
+ * around).  Each cpu has a power state that it can enter without
+ * coordinating with the other cpu (usually Wait For Interrupt, or
+ * WFI), and one or more "coupled" power states that affect blocks
+ * shared between the cpus (L2 cache, interrupt controller, and
+ * sometimes the whole SoC).  Entering a coupled power state must
+ * be tightly controlled on both cpus.
+ *
+ * This file implements a solution, where each cpu will wait in the
+ * WFI state until all cpus are ready to enter a coupled state, at
+ * which point the coupled state function will be called on all
+ * cpus at approximately the same time.
+ *
+ * Once all cpus are ready to enter idle, they are woken by an smp
+ * cross call.  At this point, there is a chance that one of the
+ * cpus will find work to do, and choose not to enter idle.  A
+ * final pass is needed to guarantee that all cpus will call the
+ * power state enter function at the same time.  During this pass,
+ * each cpu will increment the ready counter, and continue once the
+ * ready counter matches the number of online coupled cpus.  If any
+ * cpu exits idle, the other cpus will decrement their counter and
+ * retry.
+ *
+ * requested_state stores the deepest coupled idle state each cpu
+ * is ready for.  It is assumed that the states are indexed from
+ * shallowest (highest power, lowest exit latency) to deepest
+ * (lowest power, highest exit latency).  The requested_state
+ * variable is not locked.  It is only written from the cpu that
+ * it stores (or by the on/offlining cpu if that cpu is offline),
+ * and only read after all the cpus are ready for the coupled idle
+ * state and are no longer updating it.
+ *
+ * A single atomic, ready_waiting_counts, packs two counters.  The
+ * waiting count (low WAITING_BITS bits) tracks the cpus that are in
+ * the waiting loop, in the ready loop, or in the coupled idle state.
+ * The ready count (upper bits) tracks the cpus that are in the ready
+ * loop or in the coupled idle state.  online_count tracks the cpus in
+ * the coupled set that are currently or soon will be online.
+ *
+ * To use coupled cpuidle states, a cpuidle driver must:
+ *
+ *    Set struct cpuidle_device.coupled_cpus to the mask of all
+ *    coupled cpus, usually the same as cpu_possible_mask if all cpus
+ *    are part of the same cluster.  The coupled_cpus mask must be
+ *    set in the struct cpuidle_device for each cpu.
+ *
+ *    Set struct cpuidle_device.safe_state to a state that is not a
+ *    coupled state.  This is usually WFI.
+ *
+ *    Set CPUIDLE_FLAG_COUPLED in struct cpuidle_state.flags for each
+ *    state that affects multiple cpus.
+ *
+ *    Provide a struct cpuidle_state.enter function for each state
+ *    that affects multiple cpus.  This function is guaranteed to be
+ *    called on all cpus at approximately the same time.  The driver
+ *    should ensure that the cpus all abort together if any cpu tries
+ *    to abort once the function is called.  The function should return
+ *    with interrupts still disabled.
+ */
+
+/**
+ * struct cpuidle_coupled - data for set of cpus that share a coupled idle state
+ * @coupled_cpus: mask of cpus that are part of the coupled set
+ * @requested_state: array of requested states for cpus in the coupled set
+ * @ready_waiting_counts: combined count of cpus in ready or waiting loops
+ * @abort_barrier: synchronisation point for abort cases
+ * @online_count: count of cpus that are online
+ * @refcnt: reference count of cpuidle devices that are using this struct
+ * @prevent: flag to prevent coupled idle while a cpu is hotplugging
+ */
+struct cpuidle_coupled {
+	cpumask_t coupled_cpus;
+	int requested_state[NR_CPUS];
+	atomic_t ready_waiting_counts;
+	atomic_t abort_barrier;
+	int online_count;
+	int refcnt;
+	int prevent;
+};
+
+#define WAITING_BITS 16
+#define MAX_WAITING_CPUS (1 << WAITING_BITS)
+#define WAITING_MASK (MAX_WAITING_CPUS - 1)
+#define READY_MASK (~WAITING_MASK)
+
+#define CPUIDLE_COUPLED_NOT_IDLE	(-1)
+
+static DEFINE_PER_CPU(call_single_data_t, cpuidle_coupled_poke_cb);
+
+/*
+ * The cpuidle_coupled_poke_pending mask is used to avoid calling
+ * smp_call_function_single_async with the per cpu call_single_data_t struct
+ * already in use.  This prevents a deadlock where two cpus are waiting for each
+ * other's call_single_data_t struct to be available.
+ */
+static cpumask_t cpuidle_coupled_poke_pending;
+
+/*
+ * The cpuidle_coupled_poked mask is used to ensure that each cpu has been poked
+ * once to minimize entering the ready loop with a poke pending, which would
+ * require aborting and retrying.
+ */
+static cpumask_t cpuidle_coupled_poked;
+
+/**
+ * cpuidle_coupled_parallel_barrier - synchronize all online coupled cpus
+ * @dev: cpuidle_device of the calling cpu
+ * @a:   atomic variable to hold the barrier
+ *
+ * No caller to this function will return from this function until all online
+ * cpus in the same coupled group have called this function.  Once any caller
+ * has returned from this function, the barrier is immediately available for
+ * reuse.
+ *
+ * The atomic variable must be initialized to 0 before any cpu calls this
+ * function, and will be reset to 0 before any cpu returns from this function.
+ *
+ * Must only be called from within a coupled idle state handler
+ * (state.enter when state.flags has CPUIDLE_FLAG_COUPLED set).
+ *
+ * Provides full smp barrier semantics before and after calling.
+ */
+void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a)
+{
+	int n = dev->coupled->online_count;
+
+	smp_mb__before_atomic();
+	atomic_inc(a);
+
+	while (atomic_read(a) < n)
+		cpu_relax();
+
+	if (atomic_inc_return(a) == n * 2) {
+		atomic_set(a, 0);
+		return;
+	}
+
+	while (atomic_read(a) > n)
+		cpu_relax();
+}
+
+/**
+ * cpuidle_state_is_coupled - check if a state is part of a coupled set
+ * @drv: struct cpuidle_driver for the platform
+ * @state: index of the target state in drv->states
+ *
+ * Returns true if the target state is coupled with cpus besides this one
+ */
+bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state)
+{
+	return drv->states[state].flags & CPUIDLE_FLAG_COUPLED;
+}
+
+/**
+ * cpuidle_coupled_state_verify - check if the coupled states are correctly set.
+ * @drv: struct cpuidle_driver for the platform
+ *
+ * Returns 0 for valid state values, a negative error code otherwise:
+ *  * -EINVAL if any coupled state(safe_state_index) is wrongly set.
+ */
+int cpuidle_coupled_state_verify(struct cpuidle_driver *drv)
+{
+	int i;
+
+	for (i = drv->state_count - 1; i >= 0; i--) {
+		if (cpuidle_state_is_coupled(drv, i) &&
+		    (drv->safe_state_index == i ||
+		     drv->safe_state_index < 0 ||
+		     drv->safe_state_index >= drv->state_count))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * cpuidle_coupled_set_ready - mark a cpu as ready
+ * @coupled: the struct coupled that contains the current cpu
+ */
+static inline void cpuidle_coupled_set_ready(struct cpuidle_coupled *coupled)
+{
+	atomic_add(MAX_WAITING_CPUS, &coupled->ready_waiting_counts);
+}
+
+/**
+ * cpuidle_coupled_set_not_ready - mark a cpu as not ready
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Decrements the ready counter, unless the ready (and thus the waiting) counter
+ * is equal to the number of online cpus.  Prevents a race where one cpu
+ * decrements the waiting counter and then re-increments it just before another
+ * cpu has decremented its ready counter, leading to the ready counter going
+ * down from the number of online cpus without going through the coupled idle
+ * state.
+ *
+ * Returns 0 if the counter was decremented successfully, -EINVAL if the ready
+ * counter was equal to the number of online cpus.
+ */
+static
+inline int cpuidle_coupled_set_not_ready(struct cpuidle_coupled *coupled)
+{
+	int all;
+	int ret;
+
+	all = coupled->online_count | (coupled->online_count << WAITING_BITS);
+	ret = atomic_add_unless(&coupled->ready_waiting_counts,
+		-MAX_WAITING_CPUS, all);
+
+	return ret ? 0 : -EINVAL;
+}
+
+/**
+ * cpuidle_coupled_no_cpus_ready - check if no cpus in a coupled set are ready
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Returns true if all of the cpus in a coupled set are out of the ready loop.
+ */
+static inline int cpuidle_coupled_no_cpus_ready(struct cpuidle_coupled *coupled)
+{
+	int r = atomic_read(&coupled->ready_waiting_counts) >> WAITING_BITS;
+	return r == 0;
+}
+
+/**
+ * cpuidle_coupled_cpus_ready - check if all cpus in a coupled set are ready
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Returns true if all cpus coupled to this target state are in the ready loop
+ */
+static inline bool cpuidle_coupled_cpus_ready(struct cpuidle_coupled *coupled)
+{
+	int r = atomic_read(&coupled->ready_waiting_counts) >> WAITING_BITS;
+	return r == coupled->online_count;
+}
+
+/**
+ * cpuidle_coupled_cpus_waiting - check if all cpus in a coupled set are waiting
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Returns true if all cpus coupled to this target state are in the wait loop
+ */
+static inline bool cpuidle_coupled_cpus_waiting(struct cpuidle_coupled *coupled)
+{
+	int w = atomic_read(&coupled->ready_waiting_counts) & WAITING_MASK;
+	return w == coupled->online_count;
+}
+
+/**
+ * cpuidle_coupled_no_cpus_waiting - check if no cpus in coupled set are waiting
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Returns true if all of the cpus in a coupled set are out of the waiting loop.
+ */
+static inline int cpuidle_coupled_no_cpus_waiting(struct cpuidle_coupled *coupled)
+{
+	int w = atomic_read(&coupled->ready_waiting_counts) & WAITING_MASK;
+	return w == 0;
+}
+
+/**
+ * cpuidle_coupled_get_state - determine the deepest idle state
+ * @dev: struct cpuidle_device for this cpu
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Returns the deepest idle state that all coupled cpus can enter
+ */
+static inline int cpuidle_coupled_get_state(struct cpuidle_device *dev,
+		struct cpuidle_coupled *coupled)
+{
+	int i;
+	int state = INT_MAX;
+
+	/*
+	 * Read barrier ensures that read of requested_state is ordered after
+	 * reads of ready_count.  Matches the write barrier in
+	 * cpuidle_coupled_set_waiting.
+	 */
+	smp_rmb();
+
+	for_each_cpu(i, &coupled->coupled_cpus)
+		if (cpu_online(i) && coupled->requested_state[i] < state)
+			state = coupled->requested_state[i];
+
+	return state;
+}
+
+static void cpuidle_coupled_handle_poke(void *info)
+{
+	int cpu = (unsigned long)info;
+	cpumask_set_cpu(cpu, &cpuidle_coupled_poked);
+	cpumask_clear_cpu(cpu, &cpuidle_coupled_poke_pending);
+}
+
+/**
+ * cpuidle_coupled_poke - wake up a cpu that may be waiting
+ * @cpu: target cpu
+ *
+ * Ensures that the target cpu exits its waiting idle state (if it is in it)
+ * and will see updates to waiting_count before it re-enters its waiting idle
+ * state.
+ *
+ * If cpuidle_coupled_poke_pending is already set for the target cpu, that cpu
+ * either has or will soon have a pending IPI that will wake it out of idle,
+ * or it is currently processing the IPI and is not in idle.
+ */
+static void cpuidle_coupled_poke(int cpu)
+{
+	call_single_data_t *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);
+
+	if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poke_pending))
+		smp_call_function_single_async(cpu, csd);
+}
+
+/**
+ * cpuidle_coupled_poke_others - wake up all other cpus that may be waiting
+ * @this_cpu: the current cpu, which is not poked
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Calls cpuidle_coupled_poke on all other online cpus.
+ */
+static void cpuidle_coupled_poke_others(int this_cpu,
+		struct cpuidle_coupled *coupled)
+{
+	int cpu;
+
+	for_each_cpu(cpu, &coupled->coupled_cpus)
+		if (cpu != this_cpu && cpu_online(cpu))
+			cpuidle_coupled_poke(cpu);
+}
+
+/**
+ * cpuidle_coupled_set_waiting - mark this cpu as in the wait loop
+ * @cpu: target cpu
+ * @coupled: the struct coupled that contains the current cpu
+ * @next_state: the index in drv->states of the requested state for this cpu
+ *
+ * Updates the requested idle state for the specified cpuidle device.
+ * Returns the number of waiting cpus.
+ */
+static int cpuidle_coupled_set_waiting(int cpu,
+		struct cpuidle_coupled *coupled, int next_state)
+{
+	coupled->requested_state[cpu] = next_state;
+
+	/*
+	 * The atomic_inc_return provides a write barrier to order the write
+	 * to requested_state with the later write that increments ready_count.
+	 */
+	return atomic_inc_return(&coupled->ready_waiting_counts) & WAITING_MASK;
+}
+
+/**
+ * cpuidle_coupled_set_not_waiting - mark this cpu as leaving the wait loop
+ * @cpu: target cpu
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Removes the requested idle state for the specified cpuidle device.
+ */
+static void cpuidle_coupled_set_not_waiting(int cpu,
+		struct cpuidle_coupled *coupled)
+{
+	/*
+	 * Decrementing waiting count can race with incrementing it in
+	 * cpuidle_coupled_set_waiting, but that's OK.  Worst case, some
+	 * cpus will increment ready_count and then spin until they
+	 * notice that this cpu has cleared its requested_state.
+	 */
+	atomic_dec(&coupled->ready_waiting_counts);
+
+	coupled->requested_state[cpu] = CPUIDLE_COUPLED_NOT_IDLE;
+}
+
+/**
+ * cpuidle_coupled_set_done - mark this cpu as leaving the ready loop
+ * @cpu: the current cpu
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Marks this cpu as no longer in the ready and waiting loops.  Decrements
+ * the waiting count first to prevent another cpu looping back in and seeing
+ * this cpu as waiting just before it exits idle.
+ */
+static void cpuidle_coupled_set_done(int cpu, struct cpuidle_coupled *coupled)
+{
+	cpuidle_coupled_set_not_waiting(cpu, coupled);
+	atomic_sub(MAX_WAITING_CPUS, &coupled->ready_waiting_counts);
+}
+
+/**
+ * cpuidle_coupled_clear_pokes - spin until the poke interrupt is processed
+ * @cpu: this cpu
+ *
+ * Turns on interrupts and spins until any outstanding poke interrupts have
+ * been processed and the poke bit has been cleared.
+ *
+ * Other interrupts may also be processed while interrupts are enabled, so
+ * need_resched() must be tested after this function returns to make sure
+ * the interrupt didn't schedule work that should take the cpu out of idle.
+ *
+ * Returns 0 if no poke was pending, 1 if a poke was cleared.
+ */
+static int cpuidle_coupled_clear_pokes(int cpu)
+{
+	if (!cpumask_test_cpu(cpu, &cpuidle_coupled_poke_pending))
+		return 0;
+
+	local_irq_enable();
+	while (cpumask_test_cpu(cpu, &cpuidle_coupled_poke_pending))
+		cpu_relax();
+	local_irq_disable();
+
+	return 1;
+}
+
+static bool cpuidle_coupled_any_pokes_pending(struct cpuidle_coupled *coupled)
+{
+	cpumask_t cpus;
+	int ret;
+
+	cpumask_and(&cpus, cpu_online_mask, &coupled->coupled_cpus);
+	ret = cpumask_and(&cpus, &cpuidle_coupled_poke_pending, &cpus);
+
+	return ret;
+}
+
+/**
+ * cpuidle_enter_state_coupled - attempt to enter a state with coupled cpus
+ * @dev: struct cpuidle_device for the current cpu
+ * @drv: struct cpuidle_driver for the platform
+ * @next_state: index of the requested state in drv->states
+ *
+ * Coordinate with coupled cpus to enter the target state.  This is a two
+ * stage process.  In the first stage, the cpus are operating independently,
+ * and may call into cpuidle_enter_state_coupled at completely different times.
+ * To save as much power as possible, the first cpus to call this function will
+ * go to an intermediate state (the cpuidle_device's safe state), and wait for
+ * all the other cpus to call this function.  Once all coupled cpus are idle,
+ * the second stage will start.  Each coupled cpu will spin until all cpus have
+ * guaranteed that they will call the target_state.
+ *
+ * This function must be called with interrupts disabled.  It may enable
+ * interrupts while preparing for idle, and it will always return with
+ * interrupts enabled.
+ */
+int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
+		struct cpuidle_driver *drv, int next_state)
+{
+	int entered_state = -1;
+	struct cpuidle_coupled *coupled = dev->coupled;
+	int w;
+
+	if (!coupled)
+		return -EINVAL;
+
+	while (coupled->prevent) {
+		cpuidle_coupled_clear_pokes(dev->cpu);
+		if (need_resched()) {
+			local_irq_enable();
+			return entered_state;
+		}
+		entered_state = cpuidle_enter_state(dev, drv,
+			drv->safe_state_index);
+		local_irq_disable();
+	}
+
+	/* Read barrier ensures online_count is read after prevent is cleared */
+	smp_rmb();
+
+reset:
+	cpumask_clear_cpu(dev->cpu, &cpuidle_coupled_poked);
+
+	w = cpuidle_coupled_set_waiting(dev->cpu, coupled, next_state);
+	/*
+	 * If this is the last cpu to enter the waiting state, poke
+	 * all the other cpus out of their waiting state so they can
+	 * enter a deeper state.  This can race with one of the cpus
+	 * exiting the waiting state due to an interrupt and
+	 * decrementing waiting_count, see comment below.
+	 */
+	if (w == coupled->online_count) {
+		cpumask_set_cpu(dev->cpu, &cpuidle_coupled_poked);
+		cpuidle_coupled_poke_others(dev->cpu, coupled);
+	}
+
+retry:
+	/*
+	 * Wait for all coupled cpus to be idle, using the deepest state
+	 * allowed for a single cpu.  If this was not the poking cpu, wait
+	 * for at least one poke before leaving to avoid a race where
+	 * two cpus could arrive at the waiting loop at the same time,
+	 * but the first of the two to arrive could skip the loop without
+	 * processing the pokes from the last to arrive.
+	 */
+	while (!cpuidle_coupled_cpus_waiting(coupled) ||
+			!cpumask_test_cpu(dev->cpu, &cpuidle_coupled_poked)) {
+		if (cpuidle_coupled_clear_pokes(dev->cpu))
+			continue;
+
+		if (need_resched()) {
+			cpuidle_coupled_set_not_waiting(dev->cpu, coupled);
+			goto out;
+		}
+
+		if (coupled->prevent) {
+			cpuidle_coupled_set_not_waiting(dev->cpu, coupled);
+			goto out;
+		}
+
+		entered_state = cpuidle_enter_state(dev, drv,
+			drv->safe_state_index);
+		local_irq_disable();
+	}
+
+	cpuidle_coupled_clear_pokes(dev->cpu);
+	if (need_resched()) {
+		cpuidle_coupled_set_not_waiting(dev->cpu, coupled);
+		goto out;
+	}
+
+	/*
+	 * Make sure final poke status for this cpu is visible before setting
+	 * cpu as ready.
+	 */
+	smp_wmb();
+
+	/*
+	 * All coupled cpus are probably idle.  There is a small chance that
+	 * one of the other cpus just became active.  Increment the ready count,
+	 * and spin until all coupled cpus have incremented the counter. Once a
+	 * cpu has incremented the ready counter, it cannot abort idle and must
+	 * spin until either all cpus have incremented the ready counter, or
+	 * another cpu leaves idle and decrements the waiting counter.
+	 */
+
+	cpuidle_coupled_set_ready(coupled);
+	while (!cpuidle_coupled_cpus_ready(coupled)) {
+		/* Check if any other cpus bailed out of idle. */
+		if (!cpuidle_coupled_cpus_waiting(coupled))
+			if (!cpuidle_coupled_set_not_ready(coupled))
+				goto retry;
+
+		cpu_relax();
+	}
+
+	/*
+	 * Make sure read of all cpus ready is done before reading pending pokes
+	 */
+	smp_rmb();
+
+	/*
+	 * There is a small chance that a cpu left and reentered idle after this
+	 * cpu saw that all cpus were waiting.  The cpu that reentered idle will
+	 * have sent this cpu a poke, which will still be pending after the
+	 * ready loop.  The pending interrupt may be lost by the interrupt
+	 * controller when entering the deep idle state.  It's not possible to
+	 * clear a pending interrupt without turning interrupts on and handling
+	 * it, and it's too late to turn on interrupts here, so reset the
+	 * coupled idle state of all cpus and retry.
+	 */
+	if (cpuidle_coupled_any_pokes_pending(coupled)) {
+		cpuidle_coupled_set_done(dev->cpu, coupled);
+		/* Wait for all cpus to see the pending pokes */
+		cpuidle_coupled_parallel_barrier(dev, &coupled->abort_barrier);
+		goto reset;
+	}
+
+	/* all cpus have acked the coupled state */
+	next_state = cpuidle_coupled_get_state(dev, coupled);
+
+	entered_state = cpuidle_enter_state(dev, drv, next_state);
+
+	cpuidle_coupled_set_done(dev->cpu, coupled);
+
+out:
+	/*
+	 * Normal cpuidle states are expected to return with irqs enabled.
+	 * That leads to an inefficiency where a cpu receiving an interrupt
+	 * that brings it out of idle will process that interrupt before
+	 * exiting the idle enter function and decrementing ready_count.  All
+	 * other cpus will need to spin waiting for the cpu that is processing
+	 * the interrupt.  If the driver returns with interrupts disabled,
+	 * all other cpus will loop back into the safe idle state instead of
+	 * spinning, saving power.
+	 *
+	 * Calling local_irq_enable here allows coupled states to return with
+	 * interrupts disabled, but won't cause problems for drivers that
+	 * exit with interrupts enabled.
+	 */
+	local_irq_enable();
+
+	/*
+	 * Wait until all coupled cpus have exited idle.  There is no risk that
+	 * a cpu exits and re-enters the ready state because this cpu has
+	 * already decremented its waiting_count.
+	 */
+	while (!cpuidle_coupled_no_cpus_ready(coupled))
+		cpu_relax();
+
+	return entered_state;
+}
+
+/*
+ * Recompute coupled->online_count as the number of cpus that are both online
+ * and members of this coupled set.
+ *
+ * cpumask_weight_and() counts the intersection directly, so no temporary
+ * cpumask_t has to be placed on the stack.
+ */
+static void cpuidle_coupled_update_online_cpus(struct cpuidle_coupled *coupled)
+{
+	coupled->online_count = cpumask_weight_and(cpu_online_mask,
+						   &coupled->coupled_cpus);
+}
+
+/**
+ * cpuidle_coupled_register_device - register a coupled cpuidle device
+ * @dev: struct cpuidle_device for the current cpu
+ *
+ * Called from cpuidle_register_device to handle coupled idle init.  Reuses the
+ * cpuidle_coupled struct already attached to another device of the same
+ * coupled set, or allocates a fresh one when no such device exists yet.
+ *
+ * Return: 0 on success (including when @dev has no coupled cpus), -ENOMEM if
+ * a new cpuidle_coupled struct could not be allocated.
+ */
+int cpuidle_coupled_register_device(struct cpuidle_device *dev)
+{
+	struct cpuidle_coupled *coupled = NULL;
+	struct cpuidle_device *peer;
+	call_single_data_t *csd;
+	int cpu;
+
+	if (cpumask_empty(&dev->coupled_cpus))
+		return 0;
+
+	/* Look for a device in the set that already carries coupled info. */
+	for_each_cpu(cpu, &dev->coupled_cpus) {
+		peer = per_cpu(cpuidle_devices, cpu);
+		if (peer && peer->coupled) {
+			coupled = peer->coupled;
+			break;
+		}
+	}
+
+	/* First device of the set: create the shared struct. */
+	if (!coupled) {
+		coupled = kzalloc(sizeof(*coupled), GFP_KERNEL);
+		if (!coupled)
+			return -ENOMEM;
+
+		coupled->coupled_cpus = dev->coupled_cpus;
+	}
+
+	dev->coupled = coupled;
+
+	/* A device whose mask disagrees with the set blocks coupled idle. */
+	if (WARN_ON(!cpumask_equal(&dev->coupled_cpus, &coupled->coupled_cpus)))
+		coupled->prevent++;
+
+	cpuidle_coupled_update_online_cpus(coupled);
+	coupled->refcnt++;
+
+	/* Prepare this cpu's poke callback; the cpu number is its argument. */
+	csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu);
+	INIT_CSD(csd, cpuidle_coupled_handle_poke, (void *)(unsigned long)dev->cpu);
+
+	return 0;
+}
+
+/**
+ * cpuidle_coupled_unregister_device - unregister a coupled cpuidle device
+ * @dev: struct cpuidle_device for the current cpu
+ *
+ * Called from cpuidle_unregister_device to tear down coupled idle.  Drops this
+ * device's reference on the shared struct cpuidle_coupled and frees the struct
+ * once the last device in the set has been unregistered.
+ */
+void cpuidle_coupled_unregister_device(struct cpuidle_device *dev)
+{
+	struct cpuidle_coupled *coupled = dev->coupled;
+
+	if (cpumask_empty(&dev->coupled_cpus))
+		return;
+
+	/*
+	 * Free only when the count drops to zero: while refcnt is non-zero
+	 * other registered devices of the set still point at the struct.
+	 */
+	if (!--coupled->refcnt)
+		kfree(coupled);
+	dev->coupled = NULL;
+}
+
+/**
+ * cpuidle_coupled_prevent_idle - prevent cpus from entering a coupled state
+ * @coupled: the struct coupled that contains the cpu that is changing state
+ *
+ * Raises the prevent count, pokes the other cpus of the set and then spins
+ * until no cpu of the set is waiting any more.  Used to ensure that
+ * cpu_online_mask doesn't change while cpus are coordinating coupled idle.
+ */
+static void cpuidle_coupled_prevent_idle(struct cpuidle_coupled *coupled)
+{
+	int this_cpu;
+
+	/* Force all cpus out of the waiting loop. */
+	this_cpu = get_cpu();
+	coupled->prevent++;
+	cpuidle_coupled_poke_others(this_cpu, coupled);
+	put_cpu();
+
+	/* Busy-wait until nobody is left in the waiting state. */
+	for (;;) {
+		if (cpuidle_coupled_no_cpus_waiting(coupled))
+			break;
+		cpu_relax();
+	}
+}
+
+/**
+ * cpuidle_coupled_allow_idle - allows cpus to enter a coupled state
+ * @coupled: the struct coupled that contains the cpu that is changing state
+ *
+ * Drops the prevent count taken by cpuidle_coupled_prevent_idle() and pokes
+ * the other cpus of the set.  Used to ensure that cpu_online_mask doesn't
+ * change while cpus are coordinating coupled idle.
+ */
+static void cpuidle_coupled_allow_idle(struct cpuidle_coupled *coupled)
+{
+	int this_cpu;
+
+	this_cpu = get_cpu();
+
+	/*
+	 * The barrier comes before the decrement so that a reader observing
+	 * prevent == 0 also observes the new online_count.
+	 */
+	smp_wmb();
+	coupled->prevent--;
+
+	/* Force cpus out of the prevent loop. */
+	cpuidle_coupled_poke_others(this_cpu, coupled);
+
+	put_cpu();
+}
+
+/*
+ * Hotplug callback installed by cpuidle_coupled_init().  With cpuidle_lock
+ * held, refreshes the online count of the coupled set @cpu belongs to (if it
+ * has one) and re-allows coupled idle for that set.  Always returns 0.
+ */
+static int coupled_cpu_online(unsigned int cpu)
+{
+	struct cpuidle_coupled *coupled;
+	struct cpuidle_device *dev;
+
+	mutex_lock(&cpuidle_lock);
+
+	dev = per_cpu(cpuidle_devices, cpu);
+	coupled = dev ? dev->coupled : NULL;
+	if (coupled) {
+		cpuidle_coupled_update_online_cpus(coupled);
+		cpuidle_coupled_allow_idle(coupled);
+	}
+
+	mutex_unlock(&cpuidle_lock);
+
+	return 0;
+}
+
+/*
+ * Hotplug callback installed by cpuidle_coupled_init().  With cpuidle_lock
+ * held, blocks coupled idle on the coupled set @cpu belongs to (if it has
+ * one).  Always returns 0.
+ */
+static int coupled_cpu_up_prepare(unsigned int cpu)
+{
+	struct cpuidle_coupled *coupled;
+	struct cpuidle_device *dev;
+
+	mutex_lock(&cpuidle_lock);
+
+	dev = per_cpu(cpuidle_devices, cpu);
+	coupled = dev ? dev->coupled : NULL;
+	if (coupled)
+		cpuidle_coupled_prevent_idle(coupled);
+
+	mutex_unlock(&cpuidle_lock);
+
+	return 0;
+}
+
+/*
+ * Install the two cpu hotplug states used by coupled cpuidle.  Each state
+ * uses one of the two callbacks as startup and the other as teardown; the
+ * prepare state is removed again if the online state cannot be set up.
+ *
+ * Returns the result of the last cpuhp_setup_state_nocalls() call.
+ */
+static int __init cpuidle_coupled_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_COUPLED_PREPARE,
+					"cpuidle/coupled:prepare",
+					coupled_cpu_up_prepare,
+					coupled_cpu_online);
+	if (ret != 0)
+		return ret;
+
+	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+					"cpuidle/coupled:online",
+					coupled_cpu_online,
+					coupled_cpu_up_prepare);
+	if (ret >= 0)
+		return ret;
+
+	/* Second state failed: undo the first one. */
+	cpuhp_remove_state_nocalls(CPUHP_CPUIDLE_COUPLED_PREPARE);
+
+	return ret;
+}
+core_initcall(cpuidle_coupled_init);
diff --git a/drivers/cpuidle/cpuidle-arm.c b/drivers/cpuidle/cpuidle-arm.c
new file mode 100644
index 0000000000..7cfb980a35
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-arm.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ARM/ARM64 generic CPU idle driver.
+ *
+ * Copyright (C) 2014 ARM Ltd.
+ * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+ */
+
+#define pr_fmt(fmt) "CPUidle arm: " fmt
+
+#include <linux/cpu_cooling.h>
+#include <linux/cpuidle.h>
+#include <linux/cpumask.h>
+#include <linux/cpu_pm.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+
+#include <asm/cpuidle.h>
+
+#include "dt_idle_states.h"
+
+/*
+ * arm_enter_idle_state - Programs CPU to enter the specified state
+ *
+ * dev: cpuidle device (not used by this function)
+ * drv: cpuidle driver (not used by this function)
+ * idx: state index
+ *
+ * Called from the CPUidle framework to program the device to the
+ * specified target state selected by the governor.
+ *
+ * Returns whatever CPU_PM_CPU_IDLE_ENTER() evaluates to for this index.
+ */
+static __cpuidle int arm_enter_idle_state(struct cpuidle_device *dev,
+					  struct cpuidle_driver *drv, int idx)
+{
+	/*
+	 * Pass idle state index to arm_cpuidle_suspend which in turn
+	 * will call the CPU ops suspend protocol with idle index as a
+	 * parameter.
+	 */
+	return CPU_PM_CPU_IDLE_ENTER(arm_cpuidle_suspend, idx);
+}
+
+/*
+ * Driver template.  arm_idle_init_cpu() duplicates it with kmemdup() for each
+ * CPU and registers the copy; the template itself is marked __initdata.
+ */
+static struct cpuidle_driver arm_idle_driver __initdata = {
+	.name = "arm_idle",
+	.owner = THIS_MODULE,
+	/*
+	 * State at index 0 is standby wfi and considered standard
+	 * on all ARM platforms. If in some platforms simple wfi
+	 * can't be used as "state 0", DT bindings must be implemented
+	 * to work around this issue and allow installing a special
+	 * handler for idle state index 0.
+	 */
+	.states[0] = {
+		.enter                  = arm_enter_idle_state,
+		.exit_latency           = 1,
+		.target_residency       = 1,
+		.power_usage		= UINT_MAX,
+		.name                   = "WFI",
+		.desc                   = "ARM WFI",
+	}
+};
+
+/*
+ * DT match table handed to dt_init_idle_driver() by arm_idle_init_cpu().
+ * The .data member carries the enter callback; presumably it is installed
+ * for every matching idle-state node - confirm in dt_idle_states.c.
+ */
+static const struct of_device_id arm_idle_state_match[] __initconst = {
+	{ .compatible = "arm,idle-state",
+	  .data = arm_enter_idle_state },
+	{ },
+};
+
+/*
+ * arm_idle_init_cpu
+ *
+ * Registers the arm specific cpuidle driver with the cpuidle
+ * framework for one CPU. It relies on core code to parse the idle
+ * states and initialize them using driver data structures accordingly.
+ *
+ * Returns 0 when the driver was registered, and also when the arch idle
+ * ops failed with -ENXIO (nothing is registered in that case). Any other
+ * failure is returned as a negative errno.
+ */
+static int __init arm_idle_init_cpu(int cpu)
+{
+	struct cpuidle_driver *drv;
+	int ret;
+
+	/* Work on a private copy of the template driver for this CPU. */
+	drv = kmemdup(&arm_idle_driver, sizeof(*drv), GFP_KERNEL);
+	if (!drv)
+		return -ENOMEM;
+
+	drv->cpumask = (struct cpumask *)cpumask_of(cpu);
+
+	/*
+	 * Fill in the idle states from DT, beginning at index 1.  The
+	 * driver is DT only: a result of 0 means no DT idle states were
+	 * found, and with only wfi available there is no point in
+	 * initializing the idle driver, so that case becomes -ENODEV.
+	 */
+	ret = dt_init_idle_driver(drv, arm_idle_state_match, 1);
+	if (ret == 0)
+		ret = -ENODEV;
+	if (ret < 0)
+		goto free_drv;
+
+	/*
+	 * Let the arch CPU operations set up the suspend back-end
+	 * specific data for the idle states.
+	 *
+	 * -ENXIO (HW misconfiguration/breakage) is logged but turned
+	 * into 0 so that initialization continues for the other CPUs.
+	 * -EOPNOTSUPP means the platform does not support idle
+	 * operations; that is a valid configuration, so it is passed
+	 * on without an error message.
+	 */
+	ret = arm_cpuidle_init(cpu);
+	if (ret) {
+		if (ret != -EOPNOTSUPP)
+			pr_err("CPU %d failed to init idle CPU ops\n", cpu);
+		if (ret == -ENXIO)
+			ret = 0;
+		goto free_drv;
+	}
+
+	ret = cpuidle_register(drv, NULL);
+	if (ret)
+		goto free_drv;
+
+	cpuidle_cooling_register(drv);
+
+	return 0;
+
+free_drv:
+	kfree(drv);
+	return ret;
+}
+
+/*
+ * arm_idle_init - Initializes arm cpuidle driver
+ *
+ * Initializes arm cpuidle driver for all CPUs, if any CPU fails
+ * to register cpuidle driver then rollback to cancel all CPUs
+ * registration.
+ *
+ * Returns 0 on success or the error reported by arm_idle_init_cpu().
+ */
+static int __init arm_idle_init(void)
+{
+	int cpu, ret;
+	struct cpuidle_driver *drv;
+	struct cpuidle_device *dev;
+
+	for_each_possible_cpu(cpu) {
+		ret = arm_idle_init_cpu(cpu);
+		if (ret)
+			goto out_fail;
+	}
+
+	return 0;
+
+out_fail:
+	while (--cpu >= 0) {
+		dev = per_cpu(cpuidle_devices, cpu);
+		drv = cpuidle_get_cpu_driver(dev);
+		/*
+		 * arm_idle_init_cpu() reports success for a CPU whose idle
+		 * ops failed with -ENXIO without registering anything for
+		 * it, so there may be no driver to roll back here.
+		 */
+		if (!drv)
+			continue;
+		cpuidle_unregister(drv);
+		kfree(drv);
+	}
+
+	return ret;
+}
+device_initcall(arm_idle_init);
diff --git a/drivers/cpuidle/cpuidle-at91.c b/drivers/cpuidle/cpuidle-at91.c
new file mode 100644
index 0000000000..45ee8e1e71
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-at91.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * based on arch/arm/mach-kirkwood/cpuidle.c
+ *
+ * CPU idle support for AT91 SoC
+ *
+ * The cpu idle uses wait-for-interrupt and RAM self refresh in order
+ * to implement two idle states -
+ * #1 wait-for-interrupt
+ * #2 wait-for-interrupt and RAM self refresh
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/cpuidle.h>
+#include <linux/io.h>
+#include <linux/export.h>
+#include <asm/cpuidle.h>
+
+/* Number of idle states described by at91_idle_driver */
+#define AT91_MAX_STATES	2
+
+/* Standby hook; set from the platform data in at91_cpuidle_probe() */
+static void (*at91_standby)(void);
+
+/*
+ * Enter callback of idle state 1: runs the standby hook stored in
+ * at91_standby and reports the requested index as the state entered.
+ */
+static int at91_enter_idle(struct cpuidle_device *dev,
+			   struct cpuidle_driver *drv, int index)
+{
+	at91_standby();
+
+	return index;
+}
+
+/*
+ * Two states: the generic ARM WFI state at index 0 and a "RAM_SR" state at
+ * index 1 that is entered through at91_enter_idle().
+ */
+static struct cpuidle_driver at91_idle_driver = {
+	.name			= "at91_idle",
+	.owner			= THIS_MODULE,
+	.states[0]		= ARM_CPUIDLE_WFI_STATE,
+	.states[1]		= {
+		.enter			= at91_enter_idle,
+		.exit_latency		= 10,
+		.target_residency	= 10000,
+		.name			= "RAM_SR",
+		.desc			= "WFI and DDR Self Refresh",
+	},
+	.state_count = AT91_MAX_STATES,
+};
+
+/*
+ * Probe: take the standby hook from the device's platform data, then
+ * register the idle states.  Returns the result of cpuidle_register().
+ */
+static int at91_cpuidle_probe(struct platform_device *dev)
+{
+	at91_standby = (void *)(dev->dev.platform_data);
+
+	return cpuidle_register(&at91_idle_driver, NULL);
+}
+
+/* Platform driver bound by name to "cpuidle-at91"; built-in only */
+static struct platform_driver at91_cpuidle_driver = {
+	.driver = {
+		.name = "cpuidle-at91",
+	},
+	.probe = at91_cpuidle_probe,
+};
+builtin_platform_driver(at91_cpuidle_driver);
diff --git a/drivers/cpuidle/cpuidle-big_little.c b/drivers/cpuidle/cpuidle-big_little.c
new file mode 100644
index 0000000000..74972deda0
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-big_little.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2013 ARM/Linaro
+ *
+ * Authors: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *          Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+ *          Nicolas Pitre <nicolas.pitre@linaro.org>
+ *
+ * Maintainer: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+ * Maintainer: Daniel Lezcano <daniel.lezcano@linaro.org>
+ */
+#include <linux/cpuidle.h>
+#include <linux/cpu_pm.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+
+#include <asm/cpu.h>
+#include <asm/cputype.h>
+#include <asm/cpuidle.h>
+#include <asm/mcpm.h>
+#include <asm/smp_plat.h>
+#include <asm/suspend.h>
+
+#include "dt_idle_states.h"
+
+/* Forward declaration: referenced by the driver tables defined below */
+static int bl_enter_powerdown(struct cpuidle_device *dev,
+			      struct cpuidle_driver *drv, int idx);
+
+/*
+ * NB: Owing to current menu governor behaviour big and LITTLE
+ * index 1 states have to define exit_latency and target_residency for
+ * cluster state since, when all CPUs in a cluster hit it, the cluster
+ * can be shutdown. This means that when a single CPU enters this state
+ * the exit_latency and target_residency values are somewhat overkill.
+ * There is no notion of cluster states in the menu governor, so CPUs
+ * have to define CPU states where possibly the cluster will be shutdown
+ * depending on the state of other CPUs. idle states entry and exit happen
+ * at random times; however the cluster state provides target_residency
+ * values as if all CPUs in a cluster enter the state at once; this is
+ * somewhat optimistic and behaviour should be fixed either in the governor
+ * or in the MCPM back-ends.
+ * To make this driver 100% generic the number of states and the exit_latency
+ * target_residency values must be obtained from device tree bindings.
+ *
+ * exit_latency: refers to the TC2 vexpress test chip and depends on the
+ * current cluster operating point. It is the time it takes to get the CPU
+ * up and running when the CPU is powered up on cluster wake-up from shutdown.
+ * Current values for big and LITTLE clusters are provided for clusters
+ * running at default operating points.
+ *
+ * target_residency: it is the minimum amount of time the cluster has
+ * to be down to break even in terms of power consumption. cluster
+ * shutdown has inherent dynamic power costs (L2 writebacks to DRAM
+ * being the main factor) that depend on the current operating points.
+ * The current values for both clusters are provided for a CPU whose half
+ * of L2 lines are dirty and require cleaning to DRAM, and takes into
+ * account leakage static power values related to the vexpress TC2 testchip.
+ */
+static struct cpuidle_driver bl_idle_little_driver = {
+	.name = "little_idle",
+	.owner = THIS_MODULE,
+	/* index 0: generic ARM WFI */
+	.states[0] = ARM_CPUIDLE_WFI_STATE,
+	/*
+	 * index 1: power down state; bl_idle_init() runs
+	 * dt_init_idle_driver() on this driver starting at index 1.
+	 */
+	.states[1] = {
+		.enter			= bl_enter_powerdown,
+		.exit_latency		= 700,
+		.target_residency	= 2500,
+		.flags			= CPUIDLE_FLAG_TIMER_STOP |
+					  CPUIDLE_FLAG_RCU_IDLE,
+		.name			= "C1",
+		.desc			= "ARM little-cluster power down",
+	},
+	.state_count = 2,
+};
+
+/*
+ * DT match table passed to dt_init_idle_driver() for both the big and the
+ * LITTLE driver; .data carries the enter callback.
+ */
+static const struct of_device_id bl_idle_state_match[] __initconst = {
+	{ .compatible = "arm,idle-state",
+	  .data = bl_enter_powerdown },
+	{ },
+};
+
+/* Same layout as bl_idle_little_driver, with the big-cluster timings */
+static struct cpuidle_driver bl_idle_big_driver = {
+	.name = "big_idle",
+	.owner = THIS_MODULE,
+	/* index 0: generic ARM WFI */
+	.states[0] = ARM_CPUIDLE_WFI_STATE,
+	/*
+	 * index 1: power down state; bl_idle_init() runs
+	 * dt_init_idle_driver() on this driver starting at index 1.
+	 */
+	.states[1] = {
+		.enter			= bl_enter_powerdown,
+		.exit_latency		= 500,
+		.target_residency	= 2000,
+		.flags			= CPUIDLE_FLAG_TIMER_STOP |
+					  CPUIDLE_FLAG_RCU_IDLE,
+		.name			= "C1",
+		.desc			= "ARM big-cluster power down",
+	},
+	.state_count = 2,
+};
+
+/*
+ * Finisher handed to cpu_suspend() by bl_enter_powerdown().  Derives the
+ * cpu and cluster numbers from the MPIDR, sets cpu_resume as the MCPM entry
+ * vector for them and calls mcpm_cpu_suspend().  @arg is not used.
+ *
+ * notrace prevents trace shims from getting inserted where they
+ * should not. Global jumps and ldrex/strex must not be inserted
+ * in power down sequences where caches and MMU may be turned off.
+ */
+static int notrace bl_powerdown_finisher(unsigned long arg)
+{
+	/* MCPM works with HW CPU identifiers */
+	unsigned int mpidr = read_cpuid_mpidr();
+	unsigned int cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+	unsigned int cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+
+	mcpm_set_entry_vector(cpu, cluster, cpu_resume);
+	mcpm_cpu_suspend();
+
+	/*
+	 * Only reached if mcpm_cpu_suspend() returned.
+	 * return value != 0 means failure
+	 */
+	return 1;
+}
+
+/**
+ * bl_enter_powerdown - Programs CPU to enter the specified state
+ * @dev: cpuidle device
+ * @drv: cpuidle driver
+ * @idx: state index
+ *
+ * Called from the CPUidle framework to program the device to the
+ * specified target state selected by the governor.
+ *
+ * Return: always @idx; the result of cpu_suspend() is not checked.
+ */
+static __cpuidle int bl_enter_powerdown(struct cpuidle_device *dev,
+					struct cpuidle_driver *drv, int idx)
+{
+	cpu_pm_enter();
+	ct_cpuidle_enter();
+
+	/* bl_powerdown_finisher() performs the actual MCPM suspend */
+	cpu_suspend(0, bl_powerdown_finisher);
+
+	/* signals the MCPM core that CPU is out of low power state */
+	mcpm_cpu_powered_up();
+	ct_cpuidle_exit();
+
+	cpu_pm_exit();
+
+	return idx;
+}
+
+/*
+ * Allocate a cpumask for @drv and set in it every possible cpu whose part
+ * number equals @part_id.  The mask is stored in drv->cpumask.
+ *
+ * Returns 0 on success or -ENOMEM if the mask cannot be allocated.
+ */
+static int __init bl_idle_driver_init(struct cpuidle_driver *drv, int part_id)
+{
+	struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+	int cpu;
+
+	if (!mask)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		if (smp_cpuid_part(cpu) != part_id)
+			continue;
+
+		cpumask_set_cpu(cpu, mask);
+	}
+
+	drv->cpumask = mask;
+
+	return 0;
+}
+
+/* Machines (matched against the DT root node) this driver initializes on */
+static const struct of_device_id compatible_machine_match[] = {
+	{ .compatible = "arm,vexpress,v2p-ca15_a7" },
+	{ .compatible = "google,peach" },
+	{},
+};
+
+/*
+ * Initcall: on a supported machine with MCPM available, build the cpumasks
+ * of the LITTLE and big drivers, let DT fill in their idle states from
+ * index 1 on, and register both drivers.  On failure everything set up so
+ * far is undone and the error is returned.
+ */
+static int __init bl_idle_init(void)
+{
+	const struct of_device_id *match_id;
+	struct device_node *root;
+	int ret;
+
+	root = of_find_node_by_path("/");
+	if (!root)
+		return -ENODEV;
+
+	/* Only a known set of machines is handled by this driver. */
+	match_id = of_match_node(compatible_machine_match, root);
+	of_node_put(root);
+	if (!match_id)
+		return -ENODEV;
+
+	if (!mcpm_is_available())
+		return -EUNATCH;
+
+	/*
+	 * Little and big cores are currently told apart by part number
+	 * only: A7 counts as little, A15 as big.  A more generic matching
+	 * approach may replace this in the future.
+	 */
+	ret = bl_idle_driver_init(&bl_idle_little_driver,
+				  ARM_CPU_PART_CORTEX_A7);
+	if (ret)
+		return ret;
+
+	ret = bl_idle_driver_init(&bl_idle_big_driver, ARM_CPU_PART_CORTEX_A15);
+	if (ret)
+		goto free_little_mask;
+
+	/* DT states start at index 1; index 0 stays the standard WFI. */
+	ret = dt_init_idle_driver(&bl_idle_big_driver, bl_idle_state_match, 1);
+	if (ret < 0)
+		goto free_big_mask;
+
+	ret = dt_init_idle_driver(&bl_idle_little_driver,
+				  bl_idle_state_match, 1);
+	if (ret < 0)
+		goto free_big_mask;
+
+	ret = cpuidle_register(&bl_idle_little_driver, NULL);
+	if (ret)
+		goto free_big_mask;
+
+	ret = cpuidle_register(&bl_idle_big_driver, NULL);
+	if (ret)
+		goto unregister_little;
+
+	return 0;
+
+unregister_little:
+	cpuidle_unregister(&bl_idle_little_driver);
+free_big_mask:
+	kfree(bl_idle_big_driver.cpumask);
+free_little_mask:
+	kfree(bl_idle_little_driver.cpumask);
+
+	return ret;
+}
+device_initcall(bl_idle_init);
diff --git a/drivers/cpuidle/cpuidle-calxeda.c b/drivers/cpuidle/cpuidle-calxeda.c
new file mode 100644
index 0000000000..b17d9a8418
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-calxeda.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2012 Calxeda, Inc.
+ *
+ * Based on arch/arm/plat-mxc/cpuidle.c: #v3.7
+ * Copyright 2012 Freescale Semiconductor, Inc.
+ * Copyright 2012 Linaro Ltd.
+ *
+ * Maintainer: Rob Herring <rob.herring@calxeda.com>
+ */
+
+#include <linux/cpuidle.h>
+#include <linux/cpu_pm.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/platform_device.h>
+#include <linux/psci.h>
+
+#include <asm/cpuidle.h>
+#include <asm/suspend.h>
+
+#include <uapi/linux/psci.h>
+
+/*
+ * PSCI power_state argument used by calxeda_idle_finish(): id field 0,
+ * affinity-level field 0, type field PSCI_POWER_STATE_TYPE_POWER_DOWN.
+ */
+#define CALXEDA_IDLE_PARAM \
+	((0 << PSCI_0_2_POWER_STATE_ID_SHIFT) | \
+	 (0 << PSCI_0_2_POWER_STATE_AFFL_SHIFT) | \
+	 (PSCI_POWER_STATE_TYPE_POWER_DOWN << PSCI_0_2_POWER_STATE_TYPE_SHIFT))
+
+/*
+ * Finisher handed to cpu_suspend(): asks PSCI to suspend the cpu with
+ * CALXEDA_IDLE_PARAM, giving the physical address of cpu_resume as the
+ * entry point.  @val is not used.  Returns the PSCI call's result.
+ */
+static int calxeda_idle_finish(unsigned long val)
+{
+	return psci_ops.cpu_suspend(CALXEDA_IDLE_PARAM, __pa(cpu_resume));
+}
+
+/*
+ * Enter callback of the "PG" state: wraps cpu_suspend() with
+ * calxeda_idle_finish() between cpu_pm_enter() and cpu_pm_exit().
+ * Always returns @index; the result of cpu_suspend() is not checked.
+ */
+static int calxeda_pwrdown_idle(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv,
+				int index)
+{
+	cpu_pm_enter();
+	cpu_suspend(0, calxeda_idle_finish);
+	cpu_pm_exit();
+
+	return index;
+}
+
+/* Two states: generic ARM WFI at index 0, "PG" (Power Gate) at index 1 */
+static struct cpuidle_driver calxeda_idle_driver = {
+	.name = "calxeda_idle",
+	.state_count = 2,
+	.states = {
+		[0] = ARM_CPUIDLE_WFI_STATE,
+		[1] = {
+			.enter = calxeda_pwrdown_idle,
+			.exit_latency = 30,
+			.target_residency = 200,
+			.power_usage = 50,
+			.name = "PG",
+			.desc = "Power Gate",
+		},
+	},
+};
+
+/* Probe: register calxeda_idle_driver; returns cpuidle_register()'s result */
+static int calxeda_cpuidle_probe(struct platform_device *pdev)
+{
+	return cpuidle_register(&calxeda_idle_driver, NULL);
+}
+
+/* Platform driver bound by name to "cpuidle-calxeda"; built-in only */
+static struct platform_driver calxeda_cpuidle_plat_driver = {
+	.probe = calxeda_cpuidle_probe,
+	.driver = {
+		.name = "cpuidle-calxeda",
+	},
+};
+builtin_platform_driver(calxeda_cpuidle_plat_driver);
diff --git a/drivers/cpuidle/cpuidle-clps711x.c b/drivers/cpuidle/cpuidle-clps711x.c
new file mode 100644
index 0000000000..fc22c59b6c
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-clps711x.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *  CLPS711X CPU idle driver
+ *
+ *  Copyright (C) 2014 Alexander Shiyan <shc_work@mail.ru>
+ */
+
+#include <linux/cpuidle.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+
+/* Name shared by the cpuidle driver and the platform driver */
+#define CLPS711X_CPUIDLE_NAME	"clps711x-cpuidle"
+
+/* Mapped MMIO resource 0 of the platform device; set in probe */
+static void __iomem *clps711x_halt;
+
+/*
+ * Enter callback of the "HALT" state: writes 0xaa to the mapped register
+ * and returns @index.
+ * NOTE(review): presumably any write to this register halts the core until
+ * the next interrupt - confirm against the CLPS711X documentation.
+ */
+static int clps711x_cpuidle_halt(struct cpuidle_device *dev,
+				 struct cpuidle_driver *drv, int index)
+{
+	writel(0xaa, clps711x_halt);
+
+	return index;
+}
+
+/* Single-state driver: only the "HALT" state at index 0 */
+static struct cpuidle_driver clps711x_idle_driver = {
+	.name		= CLPS711X_CPUIDLE_NAME,
+	.owner		= THIS_MODULE,
+	.states[0]	= {
+		.name		= "HALT",
+		.desc		= "CLPS711X HALT",
+		.enter		= clps711x_cpuidle_halt,
+		.exit_latency	= 1,
+	},
+	.state_count	= 1,
+};
+
+/*
+ * Probe: map MMIO resource 0 of @pdev into clps711x_halt and register the
+ * cpuidle driver.  Returns the mapping error, or cpuidle_register()'s result.
+ */
+static int __init clps711x_cpuidle_probe(struct platform_device *pdev)
+{
+	/* devm-managed mapping: no explicit unmap in this file */
+	clps711x_halt = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(clps711x_halt))
+		return PTR_ERR(clps711x_halt);
+
+	return cpuidle_register(&clps711x_idle_driver, NULL);
+}
+
+/*
+ * Platform driver without a .probe member: the __init probe function is
+ * supplied through builtin_platform_driver_probe() instead.
+ */
+static struct platform_driver clps711x_cpuidle_driver = {
+	.driver	= {
+		.name	= CLPS711X_CPUIDLE_NAME,
+	},
+};
+builtin_platform_driver_probe(clps711x_cpuidle_driver, clps711x_cpuidle_probe);
diff --git a/drivers/cpuidle/cpuidle-cps.c b/drivers/cpuidle/cpuidle-cps.c
new file mode 100644
index 0000000000..dff0ff4cc2
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-cps.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2014 Imagination Technologies
+ * Author: Paul Burton <paul.burton@mips.com>
+ */
+
+#include <linux/cpu_pm.h>
+#include <linux/cpuidle.h>
+#include <linux/init.h>
+
+#include <asm/idle.h>
+#include <asm/pm-cps.h>
+
+/*
+ * Enumeration of the various idle states this driver may enter.  The values
+ * are used as indices into cps_driver.states[]; STATE_COUNT is the number of
+ * states, not a state itself.
+ */
+enum cps_idle_state {
+	STATE_WAIT = 0,		/* MIPS wait instruction, coherent */
+	STATE_NC_WAIT,		/* MIPS wait instruction, non-coherent */
+	STATE_CLOCK_GATED,	/* Core clock gated */
+	STATE_POWER_GATED,	/* Core power gated */
+	STATE_COUNT
+};
+
+/*
+ * Enter callback shared by the non-coherent states.  Maps @index to a
+ * cps_pm_state, enters it through cps_pm_enter_state() and, for the power
+ * gated state only, brackets the entry with cpu_pm_enter()/cpu_pm_exit().
+ *
+ * Returns the index of the state actually entered, -EINTR if cpu_pm_enter()
+ * failed, or the error reported by cps_pm_enter_state().
+ */
+static int cps_nc_enter(struct cpuidle_device *dev,
+			struct cpuidle_driver *drv, int index)
+{
+	enum cps_pm_state pm_state;
+	bool power_gated;
+	int err;
+
+	/*
+	 * At least one core must remain powered up & clocked in order for the
+	 * system to have any hope of functioning, so siblings of cpu 0 never
+	 * go deeper than non-coherent wait.
+	 *
+	 * TODO: don't treat core 0 specially, just prevent the final core
+	 * TODO: remap interrupt affinity temporarily
+	 */
+	if (cpus_are_siblings(0, dev->cpu) && (index > STATE_NC_WAIT))
+		index = STATE_NC_WAIT;
+
+	/* Translate the cpuidle index into the matching cps_pm_state */
+	switch (index) {
+	case STATE_NC_WAIT:
+		pm_state = CPS_PM_NC_WAIT;
+		break;
+	case STATE_CLOCK_GATED:
+		pm_state = CPS_PM_CLOCK_GATED;
+		break;
+	case STATE_POWER_GATED:
+		pm_state = CPS_PM_POWER_GATED;
+		break;
+	default:
+		BUG();
+		return -EINVAL;
+	}
+
+	power_gated = (pm_state == CPS_PM_POWER_GATED);
+
+	/* Listeners are only told about a CPU that really powers down */
+	if (power_gated && cpu_pm_enter())
+		return -EINTR;
+
+	err = cps_pm_enter_state(pm_state);
+
+	/* ... and that it is back up again afterwards */
+	if (power_gated)
+		cpu_pm_exit();
+
+	if (err)
+		return err;
+
+	return index;
+}
+
+/*
+ * Driver description, indexed by enum cps_idle_state.  cps_cpuidle_init()
+ * may lower state_count when deeper states are unsupported and may OR
+ * CPUIDLE_FLAG_COUPLED into the non-coherent states before registering.
+ */
+static struct cpuidle_driver cps_driver = {
+	.name			= "cpc_cpuidle",
+	.owner			= THIS_MODULE,
+	.states = {
+		[STATE_WAIT] = MIPS_CPUIDLE_WAIT_STATE,
+		[STATE_NC_WAIT] = {
+			.enter	= cps_nc_enter,
+			.exit_latency		= 200,
+			.target_residency	= 450,
+			.name	= "nc-wait",
+			.desc	= "non-coherent MIPS wait",
+		},
+		[STATE_CLOCK_GATED] = {
+			.enter	= cps_nc_enter,
+			.exit_latency		= 300,
+			.target_residency	= 700,
+			.flags	= CPUIDLE_FLAG_TIMER_STOP,
+			.name	= "clock-gated",
+			.desc	= "core clock gated",
+		},
+		[STATE_POWER_GATED] = {
+			.enter	= cps_nc_enter,
+			.exit_latency		= 600,
+			.target_residency	= 1000,
+			.flags	= CPUIDLE_FLAG_TIMER_STOP,
+			.name	= "power-gated",
+			.desc	= "core power gated",
+		},
+	},
+	.state_count		= STATE_COUNT,
+	/* index 0 is STATE_WAIT, the coherent wait state */
+	.safe_state_index	= 0,
+};
+
+/*
+ * Undo cps_cpuidle_init(): unregister the cpuidle device of every possible
+ * cpu, then the driver itself.
+ */
+static void __init cps_cpuidle_unregister(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		cpuidle_unregister_device(&per_cpu(cpuidle_dev, cpu));
+
+	cpuidle_unregister_driver(&cps_driver);
+}
+
+/*
+ * Initcall: trim the driver to the states the platform supports, mark the
+ * non-coherent states as coupled when required, then register the driver
+ * and one cpuidle device per possible cpu.
+ *
+ * Returns 0 on success or the error of the failing registration; a device
+ * registration failure unregisters everything again first.
+ */
+static int __init cps_cpuidle_init(void)
+{
+	struct cpuidle_device *device;
+	int err, cpu, i;
+
+	/*
+	 * Detect supported states.  The checks run from deepest to
+	 * shallowest, so a later match overrides an earlier one.
+	 */
+	if (!cps_pm_support_state(CPS_PM_POWER_GATED))
+		cps_driver.state_count = STATE_CLOCK_GATED + 1;
+	if (!cps_pm_support_state(CPS_PM_CLOCK_GATED))
+		cps_driver.state_count = STATE_NC_WAIT + 1;
+	if (!cps_pm_support_state(CPS_PM_NC_WAIT))
+		cps_driver.state_count = STATE_WAIT + 1;
+
+	/* Inform the user if some states are unavailable */
+	if (cps_driver.state_count < STATE_COUNT) {
+		int deepest = cps_driver.state_count - 1;
+
+		pr_info("cpuidle-cps: limited to ");
+		if (deepest == STATE_WAIT)
+			pr_cont("coherent wait\n");
+		else if (deepest == STATE_NC_WAIT)
+			pr_cont("non-coherent wait\n");
+		else if (deepest == STATE_CLOCK_GATED)
+			pr_cont("clock gating\n");
+	}
+
+	/*
+	 * Systems that need it get the coupled flag on every state from
+	 * non-coherent wait upwards.
+	 */
+	if (coupled_coherence) {
+		for (i = STATE_NC_WAIT; i < cps_driver.state_count; i++)
+			cps_driver.states[i].flags |= CPUIDLE_FLAG_COUPLED;
+	}
+
+	err = cpuidle_register_driver(&cps_driver);
+	if (err) {
+		pr_err("Failed to register CPS cpuidle driver\n");
+		return err;
+	}
+
+	for_each_possible_cpu(cpu) {
+		device = &per_cpu(cpuidle_dev, cpu);
+		device->cpu = cpu;
+#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
+		cpumask_copy(&device->coupled_cpus, &cpu_sibling_map[cpu]);
+#endif
+
+		err = cpuidle_register_device(device);
+		if (!err)
+			continue;
+
+		pr_err("Failed to register CPU%d cpuidle device\n", cpu);
+		cps_cpuidle_unregister();
+		return err;
+	}
+
+	return 0;
+}
+device_initcall(cps_cpuidle_init);
diff --git a/drivers/cpuidle/cpuidle-exynos.c b/drivers/cpuidle/cpuidle-exynos.c
new file mode 100644
index 0000000000..b2b5666e05
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-exynos.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2011-2014 Samsung Electronics Co., Ltd.
+ *		http://www.samsung.com
+ *
+ * Coupled cpuidle support based on the work of:
+ *	Colin Cross <ccross@android.com>
+ *	Daniel Lezcano <daniel.lezcano@linaro.org>
+ */
+
+#include <linux/cpuidle.h>
+#include <linux/cpu_pm.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/of.h>
+#include <linux/platform_data/cpuidle-exynos.h>
+
+#include <asm/suspend.h>
+#include <asm/cpuidle.h>
+
+/* Barrier counter used by exynos_enter_coupled_lowpower() */
+static atomic_t exynos_idle_barrier;
+
+/*
+ * Exactly one of these is set by exynos_cpuidle_probe() from the platform
+ * data: the callback table for the coupled driver, or the AFTR entry hook
+ * for the non-coupled driver.
+ */
+static struct cpuidle_exynos_data *exynos_cpuidle_pdata;
+static void (*exynos_enter_aftr)(void);
+
+/*
+ * Enter callback of the coupled "C1" state.  Runs the platform's pre-AFTR
+ * hook, synchronizes with the other cpus, runs the per-cpu power sequence
+ * (cpu1_powerdown() on a non-zero cpu, cpu0_enter_aftr() on cpu 0),
+ * synchronizes again and runs the post-AFTR hook.
+ *
+ * Returns @index, or the non-zero result of the power sequence.
+ */
+static int exynos_enter_coupled_lowpower(struct cpuidle_device *dev,
+					 struct cpuidle_driver *drv,
+					 int index)
+{
+	struct cpuidle_exynos_data *pdata = exynos_cpuidle_pdata;
+	int ret;
+
+	pdata->pre_enter_aftr();
+
+	/* Wait for every cpu to get here before starting the sequence */
+	cpuidle_coupled_parallel_barrier(dev, &exynos_idle_barrier);
+
+	/* From here on both cpus proceed at the same time */
+	if (dev->cpu)
+		ret = pdata->cpu1_powerdown();
+	else
+		ret = pdata->cpu0_enter_aftr();
+
+	if (ret)
+		index = ret;
+
+	/* Wait for every cpu to finish the power sequence before going on */
+	cpuidle_coupled_parallel_barrier(dev, &exynos_idle_barrier);
+
+	pdata->post_enter_aftr();
+
+	return index;
+}
+
+/*
+ * Enter callback of the non-coupled "C1" state.  Falls back to the driver's
+ * safe state unless this is cpu 0 and it is the only cpu online.
+ *
+ * Returns the index of the state that was entered.
+ */
+static int exynos_enter_lowpower(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv,
+				int index)
+{
+	bool aftr_allowed;
+	int target;
+
+	/* AFTR can only be entered when cores other than CPU0 are offline */
+	aftr_allowed = num_online_cpus() <= 1 && dev->cpu == 0;
+	target = aftr_allowed ? index : drv->safe_state_index;
+
+	/* Index 0 is plain WFI and is handled by the generic ARM helper */
+	if (target == 0)
+		return arm_cpuidle_simple_enter(dev, drv, target);
+
+	exynos_enter_aftr();
+
+	return target;
+}
+
+/*
+ * Non-coupled driver, registered by exynos_cpuidle_probe() when the coupled
+ * driver is not used: generic ARM WFI plus a "C1" state entered through
+ * exynos_enter_lowpower().
+ */
+static struct cpuidle_driver exynos_idle_driver = {
+	.name			= "exynos_idle",
+	.owner			= THIS_MODULE,
+	.states = {
+		[0] = ARM_CPUIDLE_WFI_STATE,
+		[1] = {
+			.enter			= exynos_enter_lowpower,
+			.exit_latency		= 300,
+			.target_residency	= 10000,
+			.name			= "C1",
+			.desc			= "ARM power down",
+		},
+	},
+	.state_count = 2,
+	/* WFI; exynos_enter_lowpower() falls back to this index */
+	.safe_state_index = 0,
+};
+
+/*
+ * Coupled driver, registered by exynos_cpuidle_probe() on SMP builds for
+ * exynos4210/exynos3250 machines: generic ARM WFI plus a coupled "C1" state
+ * entered through exynos_enter_coupled_lowpower().
+ */
+static struct cpuidle_driver exynos_coupled_idle_driver = {
+	.name			= "exynos_coupled_idle",
+	.owner			= THIS_MODULE,
+	.states = {
+		[0] = ARM_CPUIDLE_WFI_STATE,
+		[1] = {
+			.enter			= exynos_enter_coupled_lowpower,
+			.exit_latency		= 5000,
+			.target_residency	= 10000,
+			.flags			= CPUIDLE_FLAG_COUPLED |
+						  CPUIDLE_FLAG_TIMER_STOP,
+			.name			= "C1",
+			.desc			= "ARM power down",
+		},
+	},
+	.state_count = 2,
+	/* WFI is the safe state for this driver */
+	.safe_state_index = 0,
+};
+
+/*
+ * Probe: pick the coupled driver on SMP builds for exynos4210/exynos3250
+ * machines and the plain driver otherwise.  The platform data is taken as
+ * the callback table in the first case and as the AFTR entry hook in the
+ * second.
+ *
+ * Returns 0 on success or the error from cpuidle_register().
+ */
+static int exynos_cpuidle_probe(struct platform_device *pdev)
+{
+	bool use_coupled;
+	int ret;
+
+	use_coupled = IS_ENABLED(CONFIG_SMP) &&
+		      (of_machine_is_compatible("samsung,exynos4210") ||
+		       of_machine_is_compatible("samsung,exynos3250"));
+
+	if (use_coupled) {
+		exynos_cpuidle_pdata = pdev->dev.platform_data;
+		ret = cpuidle_register(&exynos_coupled_idle_driver,
+				       cpu_possible_mask);
+	} else {
+		exynos_enter_aftr = (void *)(pdev->dev.platform_data);
+		ret = cpuidle_register(&exynos_idle_driver, NULL);
+	}
+
+	if (!ret)
+		return 0;
+
+	dev_err(&pdev->dev, "failed to register cpuidle driver\n");
+	return ret;
+}
+
+/* Platform driver bound by name to "exynos_cpuidle"; built-in only */
+static struct platform_driver exynos_cpuidle_driver = {
+	.probe	= exynos_cpuidle_probe,
+	.driver = {
+		.name = "exynos_cpuidle",
+	},
+};
+builtin_platform_driver(exynos_cpuidle_driver);
diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c
new file mode 100644
index 0000000000..d8515d5c08
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-haltpoll.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * cpuidle driver for haltpoll governor.
+ *
+ * Copyright 2019 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Authors: Marcelo Tosatti <mtosatti@redhat.com>
+ */
+
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
+#include <linux/module.h>
+#include <linux/sched/idle.h>
+#include <linux/kvm_para.h>
+#include <linux/cpuidle_haltpoll.h>
+
+static bool force __read_mostly;	/* OR-ed with the KVM hint in haltpoll_want() */
+module_param(force, bool, 0444);
+MODULE_PARM_DESC(force, "Load unconditionally");
+
+static struct cpuidle_device __percpu *haltpoll_cpuidle_devices;	/* alloc_percpu() in init */
+static enum cpuhp_state haltpoll_hp_state;	/* value returned by cpuhp_setup_state() */
+/* Idle via arch_cpu_idle() unless current_clr_polling_and_test() is true. */
+static __cpuidle int default_enter_idle(struct cpuidle_device *dev,
+					struct cpuidle_driver *drv, int index)
+{
+	if (current_clr_polling_and_test())
+		return index;	/* test was true: return without idling */
+
+	arch_cpu_idle();
+	return index;	/* @index is returned unchanged on both paths */
+}
+/* Driver naming the "haltpoll" governor: entry 0 polls, entry 1 arch-idles. */
+static struct cpuidle_driver haltpoll_driver = {
+	.name = "haltpoll",
+	.governor = "haltpoll",
+	.states = {
+		{ /* entry 0 is for polling */ },
+		{
+			.enter			= default_enter_idle,
+			.exit_latency		= 1,
+			.target_residency	= 1,
+			.power_usage		= -1,
+			.name			= "haltpoll idle",
+			.desc			= "default architecture idle",
+		},
+	},
+	.safe_state_index = 0,
+	.state_count = 2,
+};
+
+/* Hotplug "online" callback: register @cpu's device and enable haltpoll. */
+static int haltpoll_cpu_online(unsigned int cpu)
+{
+	struct cpuidle_device *cdev = per_cpu_ptr(haltpoll_cpuidle_devices, cpu);
+
+	/* Nothing to do when the device is already registered. */
+	if (cdev->registered)
+		return 0;
+	cdev->cpu = cpu;
+	if (cpuidle_register_device(cdev) != 0) {
+		pr_notice("cpuidle_register_device %d failed!\n", cpu);
+		return -EIO;	/* the registration error code is not propagated */
+	}
+	arch_haltpoll_enable(cpu);
+	return 0;
+}
+
+/* Hotplug "offline" callback: disable haltpoll, then unregister the device. */
+static int haltpoll_cpu_offline(unsigned int cpu)
+{
+	struct cpuidle_device *cdev = per_cpu_ptr(haltpoll_cpuidle_devices, cpu);
+
+	if (!cdev->registered)
+		return 0;	/* never registered, nothing to undo */
+
+	arch_haltpoll_disable(cpu);
+	cpuidle_unregister_device(cdev);
+	return 0;
+}
+/* Undo haltpoll_init(): hotplug state, driver registration, per-CPU memory. */
+static void haltpoll_uninit(void)
+{
+	if (haltpoll_hp_state)	/* only non-zero once cpuhp_setup_state() succeeded */
+		cpuhp_remove_state(haltpoll_hp_state);
+	cpuidle_unregister_driver(&haltpoll_driver);
+
+	free_percpu(haltpoll_cpuidle_devices);
+	haltpoll_cpuidle_devices = NULL;
+}
+/* True if KVM reports KVM_HINTS_REALTIME or the "force" parameter is set. */
+static bool haltpoll_want(void)
+{
+	return kvm_para_has_hint(KVM_HINTS_REALTIME) || force;
+}
+/* Module init: register driver, allocate per-CPU devices, hook CPU hotplug. */
+static int __init haltpoll_init(void)
+{
+	int ret;
+	struct cpuidle_driver *drv = &haltpoll_driver;
+
+	/* Do not load haltpoll if idle= is passed */
+	if (boot_option_idle_override != IDLE_NO_OVERRIDE)
+		return -ENODEV;
+
+	if (!kvm_para_available() || !haltpoll_want())
+		return -ENODEV;
+
+	cpuidle_poll_state_init(drv);
+
+	ret = cpuidle_register_driver(drv);
+	if (ret < 0)
+		return ret;
+
+	haltpoll_cpuidle_devices = alloc_percpu(struct cpuidle_device);
+	if (haltpoll_cpuidle_devices == NULL) {
+		cpuidle_unregister_driver(drv);	/* roll back the registration */
+		return -ENOMEM;
+	}
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "cpuidle/haltpoll:online",
+				haltpoll_cpu_online, haltpoll_cpu_offline);
+	if (ret < 0) {
+		haltpoll_uninit();
+	} else {
+		haltpoll_hp_state = ret;	/* kept for cpuhp_remove_state() */
+		ret = 0;
+	}
+
+	return ret;
+}
+/* Module exit: tear everything down through haltpoll_uninit(). */
+static void __exit haltpoll_exit(void)
+{
+	haltpoll_uninit();
+}
+
+module_init(haltpoll_init);
+module_exit(haltpoll_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marcelo Tosatti <mtosatti@redhat.com>");
diff --git a/drivers/cpuidle/cpuidle-kirkwood.c b/drivers/cpuidle/cpuidle-kirkwood.c
new file mode 100644
index 0000000000..13bf743f88
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-kirkwood.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CPU idle Marvell Kirkwood SoCs
+ *
+ * The cpu idle uses wait-for-interrupt and DDR self refresh in order
+ * to implement two idle states -
+ * #1 wait-for-interrupt
+ * #2 wait-for-interrupt and DDR self refresh
+ *
+ * Maintainer: Jason Cooper <jason@lakedaemon.net>
+ * Maintainer: Andrew Lunn <andrew@lunn.ch>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/cpuidle.h>
+#include <linux/io.h>
+#include <linux/export.h>
+#include <asm/cpuidle.h>
+
+#define KIRKWOOD_MAX_STATES	2	/* used as .state_count of the driver */
+
+static void __iomem *ddr_operation_base;	/* mapped in kirkwood_cpuidle_probe() */
+
+/* Enter the "DDR SR" state: write 0x7 to ddr_operation_base, then idle. */
+static int kirkwood_enter_idle(struct cpuidle_device *dev,
+			       struct cpuidle_driver *drv,
+			       int index)
+{
+	writel(0x7, ddr_operation_base);	/* NOTE(review): presumably requests self refresh */
+	cpu_do_idle();
+
+	return index;	/* always reports the requested state */
+}
+/* Two states: plain WFI, and "DDR SR" entered via kirkwood_enter_idle(). */
+static struct cpuidle_driver kirkwood_idle_driver = {
+	.name			= "kirkwood_idle",
+	.owner			= THIS_MODULE,
+	.states[0]		= ARM_CPUIDLE_WFI_STATE,
+	.states[1]		= {
+		.enter			= kirkwood_enter_idle,
+		.exit_latency		= 10,
+		.target_residency	= 100000,
+		.name			= "DDR SR",
+		.desc			= "WFI and DDR Self Refresh",
+	},
+	.state_count = KIRKWOOD_MAX_STATES,
+};
+
+/* Probe: map memory resource 0 as the DDR base, then register the idle states */
+static int kirkwood_cpuidle_probe(struct platform_device *pdev)
+{
+	ddr_operation_base = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(ddr_operation_base))
+		return PTR_ERR(ddr_operation_base);	/* mapping failed */
+
+	return cpuidle_register(&kirkwood_idle_driver, NULL);
+}
+/* Remove: unregister the cpuidle driver; always returns 0. */
+static int kirkwood_cpuidle_remove(struct platform_device *pdev)
+{
+	cpuidle_unregister(&kirkwood_idle_driver);
+	return 0;
+}
+/* Platform driver "kirkwood_cpuidle" with probe and remove hooks. */
+static struct platform_driver kirkwood_cpuidle_driver = {
+	.probe = kirkwood_cpuidle_probe,
+	.remove = kirkwood_cpuidle_remove,
+	.driver = {
+		   .name = "kirkwood_cpuidle",
+		   },
+};
+
+module_platform_driver(kirkwood_cpuidle_driver);
+
+MODULE_AUTHOR("Andrew Lunn <andrew@lunn.ch>");
+MODULE_DESCRIPTION("Kirkwood cpu idle driver");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:kirkwood-cpuidle");
diff --git a/drivers/cpuidle/cpuidle-mvebu-v7.c b/drivers/cpuidle/cpuidle-mvebu-v7.c
new file mode 100644
index 0000000000..563dba609b
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-mvebu-v7.c
@@ -0,0 +1,144 @@
+/*
+ * Marvell Armada 370, 38x and XP SoC cpuidle driver
+ *
+ * Copyright (C) 2014 Marvell
+ *
+ * Nadav Haklai <nadavh@marvell.com>
+ * Gregory CLEMENT <gregory.clement@free-electrons.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2.  This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ *
+ * Maintainer: Gregory CLEMENT <gregory.clement@free-electrons.com>
+ */
+
+#include <linux/cpu_pm.h>
+#include <linux/cpuidle.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/suspend.h>
+#include <linux/platform_device.h>
+#include <asm/cpuidle.h>
+
+#define MVEBU_V7_FLAG_DEEP_IDLE	0x10000	/* tested in mvebu_v7_enter_idle() */
+
+static int (*mvebu_v7_cpu_suspend)(int);	/* set from platform_data in probe */
+/* Enter a state through mvebu_v7_cpu_suspend(); deep idle if so flagged. */
+static __cpuidle int mvebu_v7_enter_idle(struct cpuidle_device *dev,
+					 struct cpuidle_driver *drv,
+					 int index)
+{
+	int ret;
+	bool deepidle = false;
+	cpu_pm_enter();
+
+	if (drv->states[index].flags & MVEBU_V7_FLAG_DEEP_IDLE)
+		deepidle = true;
+
+	ct_cpuidle_enter();
+	ret = mvebu_v7_cpu_suspend(deepidle);
+	ct_cpuidle_exit();
+
+	cpu_pm_exit();
+
+	if (ret)
+		return ret;	/* non-zero suspend result is passed to the caller */
+
+	return index;
+}
+/* Armada XP: WFI, "MV CPU IDLE" and a deep state flagged DEEP_IDLE. */
+static struct cpuidle_driver armadaxp_idle_driver = {
+	.name			= "armada_xp_idle",
+	.states[0]		= ARM_CPUIDLE_WFI_STATE,
+	.states[1]		= {
+		.enter			= mvebu_v7_enter_idle,
+		.exit_latency		= 100,
+		.power_usage		= 50,
+		.target_residency	= 1000,
+		.flags			= CPUIDLE_FLAG_RCU_IDLE,
+		.name			= "MV CPU IDLE",
+		.desc			= "CPU power down",
+	},
+	.states[2]		= {
+		.enter			= mvebu_v7_enter_idle,
+		.exit_latency		= 1000,
+		.power_usage		= 5,
+		.target_residency	= 10000,
+		.flags			= MVEBU_V7_FLAG_DEEP_IDLE | CPUIDLE_FLAG_RCU_IDLE,
+		.name			= "MV CPU DEEP IDLE",
+		.desc			= "CPU and L2 Fabric power down",
+	},
+	.state_count = 3,
+};
+/* Armada 370: WFI plus a single "Deep Idle" state flagged DEEP_IDLE. */
+static struct cpuidle_driver armada370_idle_driver = {
+	.name			= "armada_370_idle",
+	.states[0]		= ARM_CPUIDLE_WFI_STATE,
+	.states[1]		= {
+		.enter			= mvebu_v7_enter_idle,
+		.exit_latency		= 100,
+		.power_usage		= 5,
+		.target_residency	= 1000,
+		.flags			= MVEBU_V7_FLAG_DEEP_IDLE | CPUIDLE_FLAG_RCU_IDLE,
+		.name			= "Deep Idle",
+		.desc			= "CPU and L2 Fabric power down",
+	},
+	.state_count = 2,
+};
+/* Armada 38x: WFI plus an "Idle" state that is not flagged DEEP_IDLE. */
+static struct cpuidle_driver armada38x_idle_driver = {
+	.name			= "armada_38x_idle",
+	.states[0]		= ARM_CPUIDLE_WFI_STATE,
+	.states[1]		= {
+		.enter			= mvebu_v7_enter_idle,
+		.exit_latency		= 10,
+		.power_usage		= 5,
+		.target_residency	= 100,
+		.flags			= CPUIDLE_FLAG_RCU_IDLE,
+		.name			= "Idle",
+		.desc			= "CPU and SCU power down",
+	},
+	.state_count = 2,
+};
+/* Probe: take the suspend hook from platform_data, register the matched driver. */
+static int mvebu_v7_cpuidle_probe(struct platform_device *pdev)
+{
+	const struct platform_device_id *id = pdev->id_entry;
+
+	if (!id)
+		return -EINVAL;	/* no id-table entry, so no driver to pick */
+
+	mvebu_v7_cpu_suspend = pdev->dev.platform_data;
+
+	return cpuidle_register((struct cpuidle_driver *)id->driver_data, NULL);
+}
+/* Device-name table; driver_data carries the matching cpuidle_driver address. */
+static const struct platform_device_id mvebu_cpuidle_ids[] = {
+	{
+		.name = "cpuidle-armada-xp",
+		.driver_data = (unsigned long)&armadaxp_idle_driver,
+	}, {
+		.name = "cpuidle-armada-370",
+		.driver_data = (unsigned long)&armada370_idle_driver,
+	}, {
+		.name = "cpuidle-armada-38x",
+		.driver_data = (unsigned long)&armada38x_idle_driver,
+	},
+	{}
+};
+/* Built-in platform driver; devices are listed in mvebu_cpuidle_ids. */
+static struct platform_driver mvebu_cpuidle_driver = {
+	.probe = mvebu_v7_cpuidle_probe,
+	.driver = {
+		.name = "cpuidle-mbevu",	/* NOTE(review): "mbevu" vs "mvebu" - confirm */
+		.suppress_bind_attrs = true,
+	},
+	.id_table = mvebu_cpuidle_ids,
+};
+
+builtin_platform_driver(mvebu_cpuidle_driver);
+
+MODULE_AUTHOR("Gregory CLEMENT <gregory.clement@free-electrons.com>");
+MODULE_DESCRIPTION("Marvell EBU v7 cpuidle driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
new file mode 100644
index 0000000000..9ebedd972d
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  cpuidle-powernv - idle state cpuidle driver.
+ *  Adapted from drivers/cpuidle/cpuidle-pseries
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/cpuidle.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/clockchips.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/opal.h>
+#include <asm/runlatch.h>
+#include <asm/cpuidle.h>
+
+/*
+ * Expose only those hardware idle states via the cpuidle framework
+ * whose latency value does not exceed POWERNV_THRESHOLD_LATENCY_NS.
+ */
+#define POWERNV_THRESHOLD_LATENCY_NS 200000
+
+static struct cpuidle_driver powernv_idle_driver = {
+	.name             = "powernv_idle",
+	.owner            = THIS_MODULE,
+};
+
+static int max_idle_state __read_mostly;	/* set in powernv_idle_probe() */
+static struct cpuidle_state *cpuidle_state_table __read_mostly;
+
+struct stop_psscr_table {
+	u64 val;	/* passed to arch300_idle_type() by stop_loop() */
+	u64 mask;	/* passed to arch300_idle_type() by stop_loop() */
+};
+
+static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly;
+
+static u64 default_snooze_timeout __read_mostly;
+static bool snooze_timeout_en __read_mostly;	/* true when more than one state exists */
+
+/*
+ * Snooze timeout: target residency of the first non-disabled state after
+ * @index scaled by tb_ticks_per_usec, else default_snooze_timeout.
+ */
+static u64 get_snooze_timeout(struct cpuidle_device *dev,
+			      struct cpuidle_driver *drv, int index)
+{
+	int next = index + 1;
+
+	if (likely(snooze_timeout_en)) {
+		for (; next < drv->state_count; next++) {
+			if (!dev->states_usage[next].disable)
+				return drv->states[next].target_residency *
+				       tb_ticks_per_usec;
+		}
+	}
+	return default_snooze_timeout;
+}
+/* Busy-wait until need_resched() or the snooze timeout; returns @index. */
+static int snooze_loop(struct cpuidle_device *dev,
+			struct cpuidle_driver *drv,
+			int index)
+{
+	u64 snooze_exit_time;
+
+	set_thread_flag(TIF_POLLING_NRFLAG);
+
+	local_irq_enable();
+
+	snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index);
+	dev->poll_time_limit = false;	/* set to true below only on timeout */
+	ppc64_runlatch_off();
+	HMT_very_low();
+	while (!need_resched()) {
+		if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
+			/*
+			 * Task has not woken up but we are exiting the polling
+			 * loop anyway. Require a barrier after polling is
+			 * cleared to order subsequent test of need_resched().
+			 */
+			clear_thread_flag(TIF_POLLING_NRFLAG);
+			dev->poll_time_limit = true;
+			smp_mb();
+			break;
+		}
+	}
+
+	HMT_medium();
+	ppc64_runlatch_on();
+	clear_thread_flag(TIF_POLLING_NRFLAG);
+
+	local_irq_disable();	/* pairs with local_irq_enable() above */
+
+	return index;
+}
+/* Enter callback of the "Nap" state: power7_idle_type(PNV_THREAD_NAP). */
+static int nap_loop(struct cpuidle_device *dev,
+			struct cpuidle_driver *drv,
+			int index)
+{
+	power7_idle_type(PNV_THREAD_NAP);
+
+	return index;
+}
+
+/* Register for fastsleep only in oneshot mode of broadcast */
+#ifdef CONFIG_TICK_ONESHOT
+static int fastsleep_loop(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv,
+				int index)
+{
+	unsigned long old_lpcr = mfspr(SPRN_LPCR);	/* restored after the sleep */
+	unsigned long new_lpcr;
+
+	if (unlikely(system_state < SYSTEM_RUNNING))
+		return index;	/* system not running yet: return without sleeping */
+
+	new_lpcr = old_lpcr;
+	/* Do not exit powersave upon decrementer as we've setup the timer
+	 * offload.
+	 */
+	new_lpcr &= ~LPCR_PECE1;
+
+	mtspr(SPRN_LPCR, new_lpcr);
+
+	power7_idle_type(PNV_THREAD_SLEEP);
+
+	mtspr(SPRN_LPCR, old_lpcr);
+
+	return index;
+}
+#endif
+/* Enter callback of stop states: uses the psscr val/mask recorded for @index. */
+static int stop_loop(struct cpuidle_device *dev,
+		     struct cpuidle_driver *drv,
+		     int index)
+{
+	arch300_idle_type(stop_psscr_table[index].val,
+			 stop_psscr_table[index].mask);
+	return index;
+}
+
+/*
+ * State table; only snooze is static, add_powernv_state() fills the rest.
+ */
+static struct cpuidle_state powernv_states[CPUIDLE_STATE_MAX] = {
+	{ /* Snooze */
+		.name = "snooze",
+		.desc = "snooze",
+		.exit_latency = 0,
+		.target_residency = 0,
+		.enter = snooze_loop,
+		.flags = CPUIDLE_FLAG_POLLING },
+};
+/* Hotplug "online" callback: enable the CPU's cpuidle device, if there is one. */
+static int powernv_cpuidle_cpu_online(unsigned int cpu)
+{
+	struct cpuidle_device *cdev = per_cpu(cpuidle_devices, cpu);
+
+	if (!cdev || !cpuidle_get_driver())
+		return 0;
+	cpuidle_pause_and_lock();
+	cpuidle_enable_device(cdev);
+	cpuidle_resume_and_unlock();
+	return 0;
+}
+/* Hotplug "dead" callback: disable the CPU's cpuidle device, if there is one. */
+static int powernv_cpuidle_cpu_dead(unsigned int cpu)
+{
+	struct cpuidle_device *cdev = per_cpu(cpuidle_devices, cpu);
+
+	if (!cdev || !cpuidle_get_driver())
+		return 0;
+	cpuidle_pause_and_lock();
+	cpuidle_disable_device(cdev);
+	cpuidle_resume_and_unlock();
+	return 0;
+}
+
+/*
+ * powernv_cpuidle_driver_init()
+ *
+ * Copy every enabled entry of cpuidle_state_table into powernv_idle_driver
+ * and point the driver at cpu_present_mask.  Always returns 0.
+ */
+static int powernv_cpuidle_driver_init(void)
+{
+	struct cpuidle_driver *drv = &powernv_idle_driver;
+	int src;
+
+	drv->state_count = 0;
+
+	for (src = 0; src < max_idle_state; src++) {
+		/* An entry without an ->enter callback is not enabled. */
+		if (cpuidle_state_table[src].enter != NULL) {
+			/* structure copy */
+			drv->states[drv->state_count++] = cpuidle_state_table[src];
+		}
+	}
+
+	/*
+	 * On the PowerNV platform cpu_present may be less than cpu_possible in
+	 * cases when firmware detects the CPU, but it is not available to the
+	 * OS.  If CONFIG_HOTPLUG_CPU=n, then such CPUs are not hotpluggable at
+	 * run time and hence cpu_devices are not created for those CPUs by the
+	 * generic topology_init().
+	 *
+	 * drv->cpumask defaults to cpu_possible_mask in
+	 * __cpuidle_driver_init().  This breaks cpuidle on PowerNV where
+	 * cpu_devices are not created for CPUs in cpu_possible_mask that
+	 * cannot be hot-added later at run time.
+	 *
+	 * Trying cpuidle_register_device() on a CPU without a cpu_device is
+	 * incorrect, so pass a correct CPU mask to the generic cpuidle driver.
+	 */
+
+	drv->cpumask = (struct cpumask *)cpu_present_mask;
+
+	return 0;
+}
+/* Fill slot @index of powernv_states[] and of stop_psscr_table[]. */
+static inline void add_powernv_state(int index, const char *name,
+				     unsigned int flags,
+				     int (*idle_fn)(struct cpuidle_device *,
+						    struct cpuidle_driver *,
+						    int),
+				     unsigned int target_residency,
+				     unsigned int exit_latency,
+				     u64 psscr_val, u64 psscr_mask)
+{
+	strscpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN);
+	strscpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN);	/* desc reuses @name */
+	powernv_states[index].flags = flags;
+	powernv_states[index].target_residency = target_residency;
+	powernv_states[index].exit_latency = exit_latency;
+	powernv_states[index].enter = idle_fn;
+	/* For power8 and below psscr_* will be 0 */
+	stop_psscr_table[index].val = psscr_val;
+	stop_psscr_table[index].mask = psscr_mask;
+}
+
+extern u32 pnv_get_supported_cpuidle_states(void);
+/* Fill powernv_states[] from pnv_idle_states[]; returns the state count (>= 1). */
+static int powernv_add_idle_states(void)
+{
+	int nr_idle_states = 1; /* Snooze */
+	int dt_idle_states;
+	u32 has_stop_states = 0;
+	int i;
+	u32 supported_flags = pnv_get_supported_cpuidle_states();
+
+	/* Currently we have snooze statically defined */
+	if (nr_pnv_idle_states <= 0) {
+		pr_warn("cpuidle-powernv : Only Snooze is available\n");
+		goto out;
+	}
+
+	/* TODO: Count only states which are eligible for cpuidle */
+	dt_idle_states = nr_pnv_idle_states;
+
+	/*
+	 * Since snooze is used as first idle state, max idle states allowed is
+	 * CPUIDLE_STATE_MAX -1
+	 */
+	if (nr_pnv_idle_states > CPUIDLE_STATE_MAX - 1) {
+		pr_warn("cpuidle-powernv: discovered idle states more than allowed\n");
+		dt_idle_states = CPUIDLE_STATE_MAX - 1;
+	}
+
+	/*
+	 * If the idle states use stop instruction, probe for psscr values
+	 * and psscr mask which are necessary to specify required stop level.
+	 */
+	has_stop_states = (pnv_idle_states[0].flags &
+			   (OPAL_PM_STOP_INST_FAST | OPAL_PM_STOP_INST_DEEP));
+
+	for (i = 0; i < dt_idle_states; i++) {
+		unsigned int exit_latency, target_residency;
+		bool stops_timebase = false;
+		struct pnv_idle_states_t *state = &pnv_idle_states[i];
+
+		/*
+		 * Skip the platform idle state whose flag isn't in
+		 * the supported_cpuidle_states flag mask.
+		 */
+		if ((state->flags & supported_flags) != state->flags)
+			continue;
+		/*
+		 * If an idle state has exit latency beyond
+		 * POWERNV_THRESHOLD_LATENCY_NS then don't use it
+		 * in cpu-idle.
+		 */
+		if (state->latency_ns > POWERNV_THRESHOLD_LATENCY_NS)
+			continue;
+		/*
+		 * Firmware passes residency and latency values in ns.
+		 * cpuidle expects it in us.
+		 */
+		exit_latency = DIV_ROUND_UP(state->latency_ns, 1000);
+		target_residency = DIV_ROUND_UP(state->residency_ns, 1000);
+
+		if (has_stop_states && !(state->valid))
+			continue;
+
+		if (state->flags & OPAL_PM_TIMEBASE_STOP)
+			stops_timebase = true;
+
+		if (state->flags & OPAL_PM_NAP_ENABLED) {
+			/* Add NAP state */
+			add_powernv_state(nr_idle_states, "Nap",
+					  CPUIDLE_FLAG_NONE, nap_loop,
+					  target_residency, exit_latency, 0, 0);
+		} else if (has_stop_states && !stops_timebase) {
+			add_powernv_state(nr_idle_states, state->name,
+					  CPUIDLE_FLAG_NONE, stop_loop,
+					  target_residency, exit_latency,
+					  state->psscr_val,
+					  state->psscr_mask);
+		}
+
+		/*
+		 * All cpuidle states with CPUIDLE_FLAG_TIMER_STOP set must come
+		 * within this config dependency check.
+		 */
+#ifdef CONFIG_TICK_ONESHOT
+		else if (state->flags & OPAL_PM_SLEEP_ENABLED ||
+			 state->flags & OPAL_PM_SLEEP_ENABLED_ER1) {
+			/* Add FASTSLEEP state */
+			add_powernv_state(nr_idle_states, "FastSleep",
+					  CPUIDLE_FLAG_TIMER_STOP,
+					  fastsleep_loop,
+					  target_residency, exit_latency, 0, 0);
+		} else if (has_stop_states && stops_timebase) {
+			add_powernv_state(nr_idle_states, state->name,
+					  CPUIDLE_FLAG_TIMER_STOP, stop_loop,
+					  target_residency, exit_latency,
+					  state->psscr_val,
+					  state->psscr_mask);
+		}
+#endif
+		else
+			continue;	/* no branch matched: the slot is not consumed */
+		nr_idle_states++;
+	}
+out:
+	return nr_idle_states;
+}
+
+/*
+ * powernv_idle_probe()
+ * On OPAL firmware, set up the state table and the snooze timeout.
+ */
+static int powernv_idle_probe(void)
+{
+	if (cpuidle_disable != IDLE_NO_OVERRIDE)
+		return -ENODEV;
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return -ENODEV;	/* no OPAL firmware */
+
+	cpuidle_state_table = powernv_states;
+	/* Device tree can indicate more idle states */
+	max_idle_state = powernv_add_idle_states();
+	default_snooze_timeout = TICK_USEC * tb_ticks_per_usec;
+	if (max_idle_state > 1)
+		snooze_timeout_en = true;	/* more than snooze alone */
+
+	return 0;
+}
+/* Initcall: probe, build the driver, register it and the CPU hotplug hooks. */
+static int __init powernv_processor_idle_init(void)
+{
+	int retval;
+
+	retval = powernv_idle_probe();
+	if (retval)
+		return retval;
+
+	powernv_cpuidle_driver_init();	/* return value unused; it is always 0 */
+	retval = cpuidle_register(&powernv_idle_driver, NULL);
+	if (retval) {
+		printk(KERN_DEBUG "Registration of powernv driver failed.\n");
+		return retval;
+	}
+
+	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+					   "cpuidle/powernv:online",
+					   powernv_cpuidle_cpu_online, NULL);
+	WARN_ON(retval < 0);	/* hotplug setup failure only warns */
+	retval = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_DEAD,
+					   "cpuidle/powernv:dead", NULL,
+					   powernv_cpuidle_cpu_dead);
+	WARN_ON(retval < 0);	/* hotplug setup failure only warns */
+	printk(KERN_DEBUG "powernv_idle_driver registered\n");
+	return 0;
+}
+
+device_initcall(powernv_processor_idle_init);
diff --git a/drivers/cpuidle/cpuidle-psci-domain.c b/drivers/cpuidle/cpuidle-psci-domain.c
new file mode 100644
index 0000000000..b88af1262f
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-psci-domain.c
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PM domains for CPUs via genpd - managed by cpuidle-psci.
+ *
+ * Copyright (C) 2019 Linaro Ltd.
+ * Author: Ulf Hansson <ulf.hansson@linaro.org>
+ *
+ */
+
+#define pr_fmt(fmt) "CPUidle PSCI: " fmt
+
+#include <linux/cpu.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/pm_domain.h>
+#include <linux/pm_runtime.h>
+#include <linux/psci.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "cpuidle-psci.h"
+
+struct psci_pd_provider {
+	struct list_head link;	/* entry on psci_pd_providers */
+	struct device_node *node;	/* reference taken with of_node_get() */
+};
+
+static LIST_HEAD(psci_pd_providers);
+static bool psci_pd_allow_domain_state;	/* set by the sync_state callback */
+
+/* genpd ->power_off hook: publish the selected state's PSCI parameter. */
+static int psci_pd_power_off(struct generic_pm_domain *pd)
+{
+	const u32 *param = pd->states[pd->state_idx].data;
+
+	/* A state without data has no domain parameter to set. */
+	if (param == NULL)
+		return 0;
+
+	/* Domain states stay refused until the sync_state callback has run. */
+	if (!psci_pd_allow_domain_state)
+		return -EBUSY;
+
+	/* OSI mode is enabled, set the corresponding domain state. */
+	psci_set_domain_state(*param);
+	return 0;
+}
+/* Create one CPU PM domain for @np and register it as an OF genpd provider. */
+static int psci_pd_init(struct device_node *np, bool use_osi)
+{
+	struct generic_pm_domain *pd;
+	struct psci_pd_provider *pd_provider;
+	struct dev_power_governor *pd_gov;
+	int ret = -ENOMEM;	/* reported for both allocation failures below */
+
+	pd = dt_idle_pd_alloc(np, psci_dt_parse_state_node);
+	if (!pd)
+		goto out;
+
+	pd_provider = kzalloc(sizeof(*pd_provider), GFP_KERNEL);
+	if (!pd_provider)
+		goto free_pd;
+
+	pd->flags |= GENPD_FLAG_IRQ_SAFE | GENPD_FLAG_CPU_DOMAIN;
+
+	/*
+	 * Allow power off when OSI has been successfully enabled.
+	 * PREEMPT_RT is not yet ready to enter domain idle states.
+	 */
+	if (use_osi && !IS_ENABLED(CONFIG_PREEMPT_RT))
+		pd->power_off = psci_pd_power_off;
+	else
+		pd->flags |= GENPD_FLAG_ALWAYS_ON;
+
+	/* Use governor for CPU PM domains if it has some states to manage. */
+	pd_gov = pd->states ? &pm_domain_cpu_gov : NULL;
+
+	ret = pm_genpd_init(pd, pd_gov, false);
+	if (ret)
+		goto free_pd_prov;
+
+	ret = of_genpd_add_provider_simple(np, pd);
+	if (ret)
+		goto remove_pd;
+
+	pd_provider->node = of_node_get(np);	/* dropped again in psci_pd_remove() */
+	list_add(&pd_provider->link, &psci_pd_providers);
+
+	pr_debug("init PM domain %s\n", pd->name);
+	return 0;
+
+remove_pd:
+	pm_genpd_remove(pd);
+free_pd_prov:
+	kfree(pd_provider);
+free_pd:
+	dt_idle_pd_free(pd);
+out:
+	pr_err("failed to init PM domain ret=%d %pOF\n", ret, np);
+	return ret;
+}
+/* Tear down every entry of psci_pd_providers, walking the list in reverse. */
+static void psci_pd_remove(void)
+{
+	struct psci_pd_provider *pd_provider, *it;
+	struct generic_pm_domain *genpd;
+
+	list_for_each_entry_safe_reverse(pd_provider, it,
+					 &psci_pd_providers, link) {
+		of_genpd_del_provider(pd_provider->node);
+		/* NOTE(review): psci_pd_init() frees with dt_idle_pd_free(); confirm kfree() */
+		genpd = of_genpd_remove_last(pd_provider->node);
+		if (!IS_ERR(genpd))
+			kfree(genpd);
+
+		of_node_put(pd_provider->node);	/* pairs with of_node_get() in psci_pd_init() */
+		list_del(&pd_provider->link);
+		kfree(pd_provider);
+	}
+}
+/* Driver ->sync_state callback: from now on psci_pd_power_off() may set states. */
+static void psci_cpuidle_domain_sync_state(struct device *dev)
+{
+	/*
+	 * All devices have now been attached/probed to the PM domain topology,
+	 * hence it's fine to allow domain states to be picked.
+	 */
+	psci_pd_allow_domain_state = true;
+}
+/* OF match table of the domain driver: only "arm,psci-1.0" is listed. */
+static const struct of_device_id psci_of_match[] = {
+	{ .compatible = "arm,psci-1.0" },
+	{}
+};
+/* Probe: create a PM domain per child node, link the topology, set OSI mode. */
+static int psci_cpuidle_domain_probe(struct platform_device *pdev)
+{
+	struct device_node *np = pdev->dev.of_node;
+	struct device_node *node;
+	bool use_osi = psci_has_osi_support();
+	int ret = 0, pd_count = 0;
+
+	if (!np)
+		return -ENODEV;
+
+	/*
+	 * Parse child nodes for the "#power-domain-cells" property and
+	 * initialize a genpd/genpd-of-provider pair when it's found.
+	 */
+	for_each_child_of_node(np, node) {
+		if (!of_property_present(node, "#power-domain-cells"))
+			continue;
+
+		ret = psci_pd_init(node, use_osi);
+		if (ret) {
+			of_node_put(node);	/* drop the iterator's reference on early exit */
+			goto exit;	/* NOTE(review): this path skips psci_pd_remove() */
+		}
+
+		pd_count++;
+	}
+
+	/* Bail out if not using the hierarchical CPU topology. */
+	if (!pd_count)
+		return 0;
+
+	/* Link genpd masters/subdomains to model the CPU topology. */
+	ret = dt_idle_pd_init_topology(np);
+	if (ret)
+		goto remove_pd;
+
+	/* let's try to enable OSI. */
+	ret = psci_set_osi_mode(use_osi);
+	if (ret)
+		goto remove_pd;
+
+	pr_info("Initialized CPU PM domain topology using %s mode\n",
+		use_osi ? "OSI" : "PC");
+	return 0;
+
+remove_pd:
+	dt_idle_pd_remove_topology(np);
+	psci_pd_remove();
+exit:
+	pr_err("failed to create CPU PM domains ret=%d\n", ret);
+	return ret;
+}
+/* Platform driver "psci-cpuidle-domain": probe plus a sync_state callback. */
+static struct platform_driver psci_cpuidle_domain_driver = {
+	.probe  = psci_cpuidle_domain_probe,
+	.driver = {
+		.name = "psci-cpuidle-domain",
+		.of_match_table = psci_of_match,
+		.sync_state = psci_cpuidle_domain_sync_state,
+	},
+};
+/* subsys initcall: register the domain platform driver. */
+static int __init psci_idle_init_domains(void)
+{
+	return platform_driver_register(&psci_cpuidle_domain_driver);
+}
+subsys_initcall(psci_idle_init_domains);
diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c
new file mode 100644
index 0000000000..bf68920d03
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-psci.c
@@ -0,0 +1,449 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PSCI CPU idle driver.
+ *
+ * Copyright (C) 2019 ARM Ltd.
+ * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+ */
+
+#define pr_fmt(fmt) "CPUidle PSCI: " fmt
+
+#include <linux/cpuhotplug.h>
+#include <linux/cpu_cooling.h>
+#include <linux/cpuidle.h>
+#include <linux/cpumask.h>
+#include <linux/cpu_pm.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/psci.h>
+#include <linux/pm_domain.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/cpuidle.h>
+
+#include "cpuidle-psci.h"
+#include "dt_idle_states.h"
+
+struct psci_cpuidle_data {
+	u32 *psci_states;	/* suspend parameters, indexed by idle state index */
+	struct device *dev;	/* result of psci_dt_attach_cpu(), may be unset */
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct psci_cpuidle_data, psci_cpuidle_data);
+static DEFINE_PER_CPU(u32, domain_state);	/* 0: fall back to the per-state value */
+static bool psci_cpuidle_use_cpuhp;	/* set in psci_dt_cpu_init_topology() */
+/* Store @state in the calling CPU's domain_state. */
+void psci_set_domain_state(u32 state)
+{
+	__this_cpu_write(domain_state, state);
+}
+/* Read the calling CPU's domain_state. */
+static inline u32 psci_get_domain_state(void)
+{
+	return __this_cpu_read(domain_state);
+}
+/* Enter state @idx, using the domain state when one was set; -1 on failure. */
+static __cpuidle int __psci_enter_domain_idle_state(struct cpuidle_device *dev,
+						    struct cpuidle_driver *drv, int idx,
+						    bool s2idle)
+{
+	struct psci_cpuidle_data *data = this_cpu_ptr(&psci_cpuidle_data);
+	u32 *states = data->psci_states;
+	struct device *pd_dev = data->dev;
+	u32 state;
+	int ret;
+
+	ret = cpu_pm_enter();
+	if (ret)
+		return -1;
+
+	/* Do runtime PM to manage a hierarchical CPU topology. */
+	if (s2idle)
+		dev_pm_genpd_suspend(pd_dev);
+	else
+		pm_runtime_put_sync_suspend(pd_dev);
+
+	state = psci_get_domain_state();
+	if (!state)
+		state = states[idx];	/* no domain state set: use the CPU's own */
+
+	ret = psci_cpu_suspend_enter(state) ? -1 : idx;
+
+	if (s2idle)
+		dev_pm_genpd_resume(pd_dev);
+	else
+		pm_runtime_get_sync(pd_dev);
+
+	cpu_pm_exit();
+
+	/* Clear the domain state to start fresh when back from idle. */
+	psci_set_domain_state(0);
+	return ret;
+}
+/* ->enter callback: domain idle entry with s2idle == false. */
+static int psci_enter_domain_idle_state(struct cpuidle_device *dev,
+					struct cpuidle_driver *drv, int idx)
+{
+	return __psci_enter_domain_idle_state(dev, drv, idx, false);
+}
+/* ->enter_s2idle callback: domain idle entry with s2idle == true. */
+static int psci_enter_s2idle_domain_idle_state(struct cpuidle_device *dev,
+					       struct cpuidle_driver *drv,
+					       int idx)
+{
+	return __psci_enter_domain_idle_state(dev, drv, idx, true);
+}
+/* Hotplug startup callback: take a runtime PM reference on this CPU's device. */
+static int psci_idle_cpuhp_up(unsigned int cpu)
+{
+	struct device *pd_dev = __this_cpu_read(psci_cpuidle_data.dev);
+
+	if (pd_dev)	/* the per-CPU device may be unset */
+		pm_runtime_get_sync(pd_dev);
+
+	return 0;
+}
+/* Hotplug teardown callback: drop the runtime PM reference, reset domain state. */
+static int psci_idle_cpuhp_down(unsigned int cpu)
+{
+	struct device *pd_dev = __this_cpu_read(psci_cpuidle_data.dev);
+
+	if (pd_dev) {
+		pm_runtime_put_sync(pd_dev);
+		/* Clear domain state to start fresh at next online. */
+		psci_set_domain_state(0);
+	}
+
+	return 0;
+}
+/* Suspend or resume the genpd device of every possible CPU that has one. */
+static void psci_idle_syscore_switch(bool suspend)
+{
+	bool state_reset_done = false;
+	struct device *pd_dev;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		pd_dev = per_cpu_ptr(&psci_cpuidle_data, cpu)->dev;
+		if (!pd_dev)
+			continue;
+		if (suspend) {
+			dev_pm_genpd_suspend(pd_dev);
+			continue;
+		}
+
+		dev_pm_genpd_resume(pd_dev);
+		/* Account for userspace having offlined a CPU. */
+		if (pm_runtime_status_suspended(pd_dev))
+			pm_runtime_set_active(pd_dev);
+		/* Clear domain state to re-start fresh. */
+		if (!state_reset_done) {
+			psci_set_domain_state(0);
+			state_reset_done = true;
+		}
+	}
+}
+/* syscore ->suspend: switch all per-CPU domain devices to suspended; returns 0. */
+static int psci_idle_syscore_suspend(void)
+{
+	psci_idle_syscore_switch(true);
+	return 0;
+}
+/* syscore ->resume: switch all per-CPU domain devices back to resumed. */
+static void psci_idle_syscore_resume(void)
+{
+	psci_idle_syscore_switch(false);
+}
+/* syscore callbacks, registered from psci_idle_init_cpuhp(). */
+static struct syscore_ops psci_idle_syscore_ops = {
+	.suspend = psci_idle_syscore_suspend,
+	.resume = psci_idle_syscore_resume,
+};
+/* Register syscore ops and hotplug callbacks when psci_cpuidle_use_cpuhp is set. */
+static void psci_idle_init_cpuhp(void)
+{
+	int err;
+
+	if (!psci_cpuidle_use_cpuhp)
+		return;
+
+	register_syscore_ops(&psci_idle_syscore_ops);
+
+	err = cpuhp_setup_state_nocalls(CPUHP_AP_CPU_PM_STARTING,
+					"cpuidle/psci:online",
+					psci_idle_cpuhp_up,
+					psci_idle_cpuhp_down);
+	if (err)
+		pr_warn("Failed %d while setup cpuhp state\n", err);	/* warn only */
+}
+/* ->enter callback: suspend with this CPU's parameter recorded for @idx. */
+static __cpuidle int psci_enter_idle_state(struct cpuidle_device *dev,
+					   struct cpuidle_driver *drv, int idx)
+{
+	u32 *state = __this_cpu_read(psci_cpuidle_data.psci_states);
+
+	return CPU_PM_CPU_IDLE_ENTER_PARAM_RCU(psci_cpu_suspend_enter, idx, state[idx]);
+}
+/* OF match for idle-state nodes; .data carries the enter callback. */
+static const struct of_device_id psci_idle_state_match[] = {
+	{ .compatible = "arm,idle-state",
+	  .data = psci_enter_idle_state },
+	{ },
+};
+/* Read and validate a state node's "arm,psci-suspend-param" into @state. */
+int psci_dt_parse_state_node(struct device_node *np, u32 *state)
+{
+	int ret;
+
+	ret = of_property_read_u32(np, "arm,psci-suspend-param", state);
+	if (ret != 0) {
+		pr_warn("%pOF missing arm,psci-suspend-param property\n", np);
+		return ret;	/* propagate the property-read error */
+	}
+
+	if (psci_power_state_is_valid(*state))
+		return 0;
+
+	pr_warn("Invalid PSCI power state %#x\n", *state);
+	return -EINVAL;
+}
+/* Attach @cpu to its PM domain and reroute the deepest state's enter hooks. */
+static int psci_dt_cpu_init_topology(struct cpuidle_driver *drv,
+				     struct psci_cpuidle_data *data,
+				     unsigned int state_count, int cpu)
+{
+	/* Currently limit the hierarchical topology to be used in OSI mode. */
+	if (!psci_has_osi_support())
+		return 0;
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		return 0;
+
+	data->dev = psci_dt_attach_cpu(cpu);
+	if (IS_ERR_OR_NULL(data->dev))
+		return PTR_ERR_OR_ZERO(data->dev);	/* NULL yields 0, an error its code */
+
+	/*
+	 * Using the deepest state for the CPU to trigger a potential selection
+	 * of a shared state for the domain, assumes the domain states are all
+	 * deeper states.
+	 */
+	drv->states[state_count - 1].flags |= CPUIDLE_FLAG_RCU_IDLE;
+	drv->states[state_count - 1].enter = psci_enter_domain_idle_state;
+	drv->states[state_count - 1].enter_s2idle = psci_enter_s2idle_domain_idle_state;
+	psci_cpuidle_use_cpuhp = true;
+
+	return 0;
+}
+/* Parse @cpu's idle-state nodes into a devm-allocated table (slot 0 is WFI). */
+static int psci_dt_cpu_init_idle(struct device *dev, struct cpuidle_driver *drv,
+				 struct device_node *cpu_node,
+				 unsigned int state_count, int cpu)
+{
+	int i, ret = 0;
+	u32 *psci_states;
+	struct device_node *state_node;
+	struct psci_cpuidle_data *data = per_cpu_ptr(&psci_cpuidle_data, cpu);
+
+	state_count++; /* Add WFI state too */
+	psci_states = devm_kcalloc(dev, state_count, sizeof(*psci_states),
+				   GFP_KERNEL);
+	if (!psci_states)
+		return -ENOMEM;
+
+	for (i = 1; i < state_count; i++) {	/* slot 0 stays zeroed */
+		state_node = of_get_cpu_state_node(cpu_node, i - 1);
+		if (!state_node)
+			break;
+
+		ret = psci_dt_parse_state_node(state_node, &psci_states[i]);
+		of_node_put(state_node);
+
+		if (ret)
+			return ret;
+
+		pr_debug("psci-power-state %#x index %d\n", psci_states[i], i);
+	}
+
+	if (i != state_count)
+		return -ENODEV;	/* the loop broke on a missing state node */
+
+	/* Initialize optional data, used for the hierarchical topology. */
+	ret = psci_dt_cpu_init_topology(drv, data, state_count, cpu);
+	if (ret < 0)
+		return ret;
+
+	/* Idle states parsed correctly, store them in the per-cpu struct. */
+	data->psci_states = psci_states;
+	return 0;
+}
+
+/*
+ * Initialize the PSCI idle states of @cpu from its device tree node.
+ *
+ * Returns -EOPNOTSUPP when no PSCI cpu_suspend hook is installed, -ENODEV
+ * when the CPU has no DT node, otherwise the result of
+ * psci_dt_cpu_init_idle().
+ */
+static int psci_cpu_init_idle(struct device *dev, struct cpuidle_driver *drv,
+			      unsigned int cpu, unsigned int state_count)
+{
+	struct device_node *np;
+	int err;
+
+	/*
+	 * Without a PSCI cpu_suspend hook no idle state may be enabled,
+	 * so bail out.
+	 */
+	if (!psci_ops.cpu_suspend)
+		return -EOPNOTSUPP;
+
+	np = of_cpu_device_node_get(cpu);
+	if (!np)
+		return -ENODEV;
+
+	err = psci_dt_cpu_init_idle(dev, drv, np, state_count, cpu);
+	of_node_put(np);
+
+	return err;
+}
+
+/*
+ * Undo the topology setup of @cpu: detach its per-CPU device and stop
+ * requesting CPU hotplug handling.
+ */
+static void psci_cpu_deinit_idle(int cpu)
+{
+	psci_dt_detach_cpu(per_cpu_ptr(&psci_cpuidle_data, cpu)->dev);
+	psci_cpuidle_use_cpuhp = false;
+}
+
+/*
+ * Allocate, populate and register a cpuidle driver for a single CPU.
+ *
+ * Returns 0 on success. -ENODEV is returned when the CPU has no DT node,
+ * its enable-method is not "psci", or no DT idle states were found; other
+ * negative values come from allocation, state parsing or registration.
+ */
+static int psci_idle_init_cpu(struct device *dev, int cpu)
+{
+	struct cpuidle_driver *drv;
+	struct device_node *cpu_node;
+	const char *enable_method;
+	bool uses_psci;
+	int ret;
+
+	cpu_node = of_cpu_device_node_get(cpu);
+	if (!cpu_node)
+		return -ENODEV;
+
+	/*
+	 * Only CPUs whose enable-method is PSCI are handled. The property
+	 * string is examined before the node reference is dropped.
+	 */
+	enable_method = of_get_property(cpu_node, "enable-method", NULL);
+	uses_psci = enable_method && !strcmp(enable_method, "psci");
+	of_node_put(cpu_node);
+	if (!uses_psci)
+		return -ENODEV;
+
+	drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL);
+	if (!drv)
+		return -ENOMEM;
+
+	drv->name = "psci_idle";
+	drv->owner = THIS_MODULE;
+	drv->cpumask = (struct cpumask *)cpumask_of(cpu);
+
+	/*
+	 * PSCI idle states rely on architectural WFI being represented as
+	 * state index 0.
+	 */
+	drv->states[0].enter = psci_enter_idle_state;
+	drv->states[0].exit_latency = 1;
+	drv->states[0].target_residency = 1;
+	drv->states[0].power_usage = UINT_MAX;
+	strcpy(drv->states[0].name, "WFI");
+	strcpy(drv->states[0].desc, "ARM WFI");
+
+	/*
+	 * If no DT idle states are detected (ret == 0), fail the driver
+	 * initialization: there is no reason to register an idle driver
+	 * that only supports wfi, since the default architectural back-end
+	 * already executes wfi on idle entry.
+	 */
+	ret = dt_init_idle_driver(drv, psci_idle_state_match, 1);
+	if (ret < 0)
+		return ret;
+	if (!ret)
+		return -ENODEV;
+
+	/* Initialize PSCI idle states; ret holds the DT state count here. */
+	ret = psci_cpu_init_idle(dev, drv, cpu, ret);
+	if (ret) {
+		pr_err("CPU %d failed to PSCI idle\n", cpu);
+		return ret;
+	}
+
+	ret = cpuidle_register(drv, NULL);
+	if (ret) {
+		psci_cpu_deinit_idle(cpu);
+		return ret;
+	}
+
+	cpuidle_cooling_register(drv);
+
+	return 0;
+}
+
+/*
+ * psci_cpuidle_probe - Initializes PSCI cpuidle driver
+ *
+ * Initializes PSCI cpuidle driver for all possible CPUs. If any CPU fails
+ * to register its cpuidle driver, the CPUs with a lower index are rolled
+ * back (driver unregistered, per-CPU idle data torn down) and the error is
+ * returned. On success the CPU hotplug callbacks are set up and 0 is
+ * returned.
+ */
+static int psci_cpuidle_probe(struct platform_device *pdev)
+{
+	int cpu, ret;
+	struct cpuidle_driver *drv;
+	struct cpuidle_device *dev;
+
+	for_each_possible_cpu(cpu) {
+		ret = psci_idle_init_cpu(&pdev->dev, cpu);
+		if (ret)
+			goto out_fail;
+	}
+
+	psci_idle_init_cpuhp();
+	return 0;
+
+out_fail:
+	/*
+	 * Walk back over every CPU index below the failing one.
+	 * NOTE(review): this visits all indexes, not only possible CPUs;
+	 * presumably the possible mask is contiguous from 0 - confirm.
+	 */
+	while (--cpu >= 0) {
+		dev = per_cpu(cpuidle_devices, cpu);
+		drv = cpuidle_get_cpu_driver(dev);
+		cpuidle_unregister(drv);
+		psci_cpu_deinit_idle(cpu);
+	}
+
+	return ret;
+}
+
+/*
+ * Platform driver bound by name to the "psci-cpuidle" platform device that
+ * psci_idle_init() registers.
+ */
+static struct platform_driver psci_cpuidle_driver = {
+	.probe = psci_cpuidle_probe,
+	.driver = {
+		.name = "psci-cpuidle",
+	},
+};
+
+/*
+ * Register the PSCI cpuidle platform driver together with a matching
+ * "psci-cpuidle" platform device. If the device cannot be created the
+ * driver is unregistered again and the error is returned.
+ */
+static int __init psci_idle_init(void)
+{
+	struct platform_device *pdev;
+	int err;
+
+	err = platform_driver_register(&psci_cpuidle_driver);
+	if (err)
+		return err;
+
+	pdev = platform_device_register_simple("psci-cpuidle", -1, NULL, 0);
+	if (!IS_ERR(pdev))
+		return 0;
+
+	platform_driver_unregister(&psci_cpuidle_driver);
+	return PTR_ERR(pdev);
+}
+device_initcall(psci_idle_init);
diff --git a/drivers/cpuidle/cpuidle-psci.h b/drivers/cpuidle/cpuidle-psci.h
new file mode 100644
index 0000000000..4e132640ed
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-psci.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __CPUIDLE_PSCI_H
+#define __CPUIDLE_PSCI_H
+
+struct device;
+struct device_node;
+
+/* Defined outside this header; NOTE(review): semantics not visible here. */
+void psci_set_domain_state(u32 state);
+/*
+ * Read and validate the "arm,psci-suspend-param" property of idle-state
+ * node @np into @state; returns 0 or a negative error code.
+ */
+int psci_dt_parse_state_node(struct device_node *np, u32 *state);
+
+/*
+ * CPU attach/detach helpers for the PM domain topology. With
+ * CONFIG_ARM_PSCI_CPUIDLE_DOMAIN they forward to the generic dt_idle
+ * helpers using the "psci" name; without it they are no-op stubs and
+ * attaching yields NULL.
+ */
+#ifdef CONFIG_ARM_PSCI_CPUIDLE_DOMAIN
+
+#include "dt_idle_genpd.h"
+
+static inline struct device *psci_dt_attach_cpu(int cpu)
+{
+	return dt_idle_attach_cpu(cpu, "psci");
+}
+
+static inline void psci_dt_detach_cpu(struct device *dev)
+{
+	dt_idle_detach_cpu(dev);
+}
+
+#else
+static inline struct device *psci_dt_attach_cpu(int cpu) { return NULL; }
+static inline void psci_dt_detach_cpu(struct device *dev) { }
+#endif
+
+#endif /* __CPUIDLE_PSCI_H */
diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
new file mode 100644
index 0000000000..14db9b7d98
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -0,0 +1,477 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  cpuidle-pseries - idle state cpuidle driver.
+ *  Adapted from drivers/idle/intel_idle.c and
+ *  drivers/acpi/processor_idle.c
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/cpuidle.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+
+#include <asm/paca.h>
+#include <asm/reg.h>
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/runlatch.h>
+#include <asm/idle.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/rtas.h>
+
+/* The single cpuidle driver; its state table is filled in at init time. */
+static struct cpuidle_driver pseries_idle_driver = {
+	.name             = "pseries_idle",
+	.owner            = THIS_MODULE,
+};
+
+/* Number of entries in, and pointer to, the state table chosen at probe. */
+static int max_idle_state __read_mostly;
+static struct cpuidle_state *cpuidle_state_table __read_mostly;
+/* Snooze polling limit in timebase ticks, and whether it is enforced. */
+static u64 snooze_timeout __read_mostly;
+static bool snooze_timeout_en __read_mostly;
+
+/*
+ * snooze_loop - polling idle state.
+ *
+ * Spins with interrupts enabled and TIF_POLLING_NRFLAG set until
+ * need_resched() becomes true or, when snooze_timeout_en is set, until the
+ * timebase passes the entry time plus snooze_timeout. dev->poll_time_limit
+ * records whether the loop ended because of the timeout. Interrupts are
+ * disabled again before returning @index.
+ */
+static __cpuidle
+int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv,
+		int index)
+{
+	u64 snooze_exit_time;
+
+	set_thread_flag(TIF_POLLING_NRFLAG);
+
+	pseries_idle_prolog();
+	raw_local_irq_enable();
+	snooze_exit_time = get_tb() + snooze_timeout;
+	dev->poll_time_limit = false;
+
+	while (!need_resched()) {
+		/* presumably lowers the hardware thread priority while polling */
+		HMT_low();
+		HMT_very_low();
+		if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
+			/*
+			 * Task has not woken up but we are exiting the polling
+			 * loop anyway. Require a barrier after polling is
+			 * cleared to order subsequent test of need_resched().
+			 */
+			dev->poll_time_limit = true;
+			clear_thread_flag(TIF_POLLING_NRFLAG);
+			smp_mb();
+			break;
+		}
+	}
+
+	HMT_medium();
+	/* Needed for the need_resched() exit; repeated on the timeout exit. */
+	clear_thread_flag(TIF_POLLING_NRFLAG);
+
+	raw_local_irq_disable();
+
+	pseries_idle_epilog();
+
+	return index;
+}
+
+/*
+ * Cede the processor unless prep_irq_for_idle() reports that idle must not
+ * be entered.
+ */
+static __cpuidle void check_and_cede_processor(void)
+{
+	/*
+	 * prep_irq_for_idle() makes sure our interrupt state is properly
+	 * tracked and checks that no interrupt occurred while we were
+	 * soft-disabled; if it refuses, skip the cede.
+	 */
+	if (!prep_irq_for_idle())
+		return;
+
+	cede_processor();
+#ifdef CONFIG_TRACE_IRQFLAGS
+	/* Ensure that H_CEDE returns with IRQs on */
+	if (WARN_ON(!(mfmsr() & MSR_EE)))
+		__hard_irq_enable();
+#endif
+}
+
+/*
+ * XCEDE: Extended CEDE states discovered through the
+ *        "ibm,get-system-parameter" RTAS call with the token
+ *        CEDE_LATENCY_TOKEN
+ */
+
+/*
+ * Section 7.3.16 System Parameters Option of PAPR version 2.8.1 has a
+ * table with all the parameters to ibm,get-system-parameter.
+ * CEDE_LATENCY_TOKEN corresponds to the token value for Cede Latency
+ * Settings Information.
+ */
+#define CEDE_LATENCY_TOKEN	45
+
+/*
+ * If the platform supports the cede latency settings information system
+ * parameter it must provide the following information in the NULL terminated
+ * parameter string:
+ *
+ * a. The first byte is the length “N” of each cede latency setting record minus
+ *    one (zero indicates a length of 1 byte).
+ *
+ * b. For each supported cede latency setting a cede latency setting record
+ *    consisting of the first “N” bytes as per the following table.
+ *
+ *    -----------------------------
+ *    | Field           | Field   |
+ *    | Name            | Length  |
+ *    -----------------------------
+ *    | Cede Latency    | 1 Byte  |
+ *    | Specifier Value |         |
+ *    -----------------------------
+ *    | Maximum wakeup  |         |
+ *    | latency in      | 8 Bytes |
+ *    | tb-ticks        |         |
+ *    -----------------------------
+ *    | Responsive to   |         |
+ *    | external        | 1 Byte  |
+ *    | interrupts      |         |
+ *    -----------------------------
+ *
+ * This version has cede latency record size = 10.
+ *
+ * The structure xcede_latency_payload represents a) and b) with
+ * xcede_latency_record representing the table in b).
+ *
+ * xcede_latency_parameter is what gets returned by
+ * ibm,get-system-parameter RTAS call when made with
+ * CEDE_LATENCY_TOKEN.
+ *
+ * These structures are only used to represent the data obtained by the RTAS
+ * call. The data is in big-endian.
+ */
+/* One cede latency setting record (10 bytes, packed, big-endian). */
+struct xcede_latency_record {
+	u8	hint;		/* cede latency specifier value */
+	__be64	latency_ticks;	/* maximum wakeup latency in tb-ticks */
+	u8	wake_on_irqs;	/* responsive to external interrupts */
+} __packed;
+
+/*
+ * Record-size byte followed by the records themselves. Space is reserved
+ * for 16 records, which "should be enough".
+ */
+struct xcede_latency_payload {
+	u8     record_size;	/* length of each record minus one */
+	struct xcede_latency_record records[16];
+} __packed;
+
+/* Full buffer filled in by the RTAS call: size, payload, terminator. */
+struct xcede_latency_parameter {
+	__be16  payload_size;
+	struct xcede_latency_payload payload;
+	u8 null_char;
+} __packed;
+
+/* Number of records found by parse_cede_parameters(). */
+static unsigned int nr_xcede_records;
+/* RTAS output buffer; __initdata, so only usable from __init code. */
+static struct xcede_latency_parameter xcede_latency_parameter __initdata;
+
+/*
+ * parse_cede_parameters - fetch and validate the extended CEDE latency table
+ *
+ * Issues the "ibm,get-system-parameter" RTAS call for CEDE_LATENCY_TOKEN
+ * into xcede_latency_parameter, checks the reported sizes and stores the
+ * number of usable records in nr_xcede_records.
+ *
+ * Returns 0 on success, the RTAS status if the call fails, or -EINVAL when
+ * the reported payload or record size is not what this driver understands.
+ */
+static int __init parse_cede_parameters(void)
+{
+	struct xcede_latency_payload *payload;
+	u32 total_xcede_records_size;
+	u8 xcede_record_size;
+	u16 payload_size;
+	int ret, i;
+
+	ret = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+			NULL, CEDE_LATENCY_TOKEN, __pa(&xcede_latency_parameter),
+			sizeof(xcede_latency_parameter));
+	if (ret) {
+		pr_err("xcede: Error parsing CEDE_LATENCY_TOKEN\n");
+		return ret;
+	}
+
+	payload_size = be16_to_cpu(xcede_latency_parameter.payload_size);
+	payload = &xcede_latency_parameter.payload;
+
+	/*
+	 * The payload must at least hold the record-size byte and the
+	 * terminating NULL byte; anything smaller would make the
+	 * subtraction below wrap around.
+	 */
+	if (payload_size < 2) {
+		pr_err("xcede: Invalid payload size %u\n", payload_size);
+		return -EINVAL;
+	}
+
+	xcede_record_size = payload->record_size + 1;
+
+	if (xcede_record_size != sizeof(struct xcede_latency_record)) {
+		pr_err("xcede: Expected record-size %zu. Observed size %u.\n",
+		       sizeof(struct xcede_latency_record), xcede_record_size);
+		return -EINVAL;
+	}
+
+	pr_info("xcede: xcede_record_size = %d\n", xcede_record_size);
+
+	/*
+	 * Since the payload_size includes the last NULL byte and the
+	 * xcede_record_size, the remaining bytes correspond to array of all
+	 * cede_latency settings.
+	 */
+	total_xcede_records_size = payload_size - 2;
+	nr_xcede_records = total_xcede_records_size / xcede_record_size;
+
+	/*
+	 * The buffer only has room for a fixed number of records; never
+	 * trust the reported size beyond that.
+	 */
+	if (nr_xcede_records > ARRAY_SIZE(payload->records)) {
+		pr_warn("xcede: %u records reported, only the first %zu are used\n",
+			nr_xcede_records, ARRAY_SIZE(payload->records));
+		nr_xcede_records = ARRAY_SIZE(payload->records);
+	}
+
+	for (i = 0; i < nr_xcede_records; i++) {
+		struct xcede_latency_record *record = &payload->records[i];
+		u64 latency_ticks = be64_to_cpu(record->latency_ticks);
+		u8 wake_on_irqs = record->wake_on_irqs;
+		u8 hint = record->hint;
+
+		pr_info("xcede: Record %d : hint = %u, latency = 0x%llx tb ticks, Wake-on-irq = %u\n",
+			i, hint, latency_ticks, wake_on_irqs);
+	}
+
+	return 0;
+}
+
+#define NR_DEDICATED_STATES	2 /* snooze, CEDE */
+/*
+ * Latency hint per dedicated state, indexed by state index.
+ * NOTE(review): nothing in the visible code writes this array, so the
+ * entries appear to stay zero - confirm.
+ */
+static u8 cede_latency_hint[NR_DEDICATED_STATES];
+
+/*
+ * dedicated_cede_loop - CEDE idle state for dedicated-processor partitions.
+ *
+ * Marks the CPU as donated in the lppaca and installs the latency hint for
+ * @index before ceding; both lppaca fields are restored, with interrupts
+ * disabled, once the cede returns. Returns @index.
+ */
+static __cpuidle
+int dedicated_cede_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv,
+			int index)
+{
+	u8 old_latency_hint;
+
+	pseries_idle_prolog();
+	get_lppaca()->donate_dedicated_cpu = 1;
+	/* Remember the previous hint so it can be put back afterwards. */
+	old_latency_hint = get_lppaca()->cede_latency_hint;
+	get_lppaca()->cede_latency_hint = cede_latency_hint[index];
+
+	HMT_medium();
+	check_and_cede_processor();
+
+	raw_local_irq_disable();
+	get_lppaca()->donate_dedicated_cpu = 0;
+	get_lppaca()->cede_latency_hint = old_latency_hint;
+
+	pseries_idle_epilog();
+
+	return index;
+}
+
+/*
+ * shared_cede_loop - CEDE idle state for shared-processor partitions.
+ *
+ * Cedes the processor to the hypervisor between the idle prolog and
+ * epilog, disables interrupts again after the cede and returns @index.
+ */
+static __cpuidle
+int shared_cede_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv,
+		     int index)
+{
+
+	pseries_idle_prolog();
+
+	/*
+	 * Yield the processor to the hypervisor.  We return if
+	 * an external interrupt occurs (which are driven prior
+	 * to returning here) or if a prod occurs from another
+	 * processor. When returning here, external interrupts
+	 * are enabled.
+	 */
+	check_and_cede_processor();
+
+	raw_local_irq_disable();
+	pseries_idle_epilog();
+
+	return index;
+}
+
+/*
+ * Idle state table used on dedicated-processor partitions. The CEDE entry
+ * (index 1) carries default latencies that fixup_cede0_latency() may
+ * overwrite at boot.
+ */
+static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = {
+	{
+		/* Snooze */
+		.name			= "snooze",
+		.desc			= "snooze",
+		.flags			= CPUIDLE_FLAG_POLLING,
+		.exit_latency		= 0,
+		.target_residency	= 0,
+		.enter			= snooze_loop,
+	},
+	{
+		/* CEDE */
+		.name			= "CEDE",
+		.desc			= "CEDE",
+		.exit_latency		= 10,
+		.target_residency	= 100,
+		.enter			= dedicated_cede_loop,
+	},
+};
+
+/*
+ * Idle state table used on shared-processor partitions.
+ */
+static struct cpuidle_state shared_states[] = {
+	{
+		/* Snooze */
+		.name			= "snooze",
+		.desc			= "snooze",
+		.flags			= CPUIDLE_FLAG_POLLING,
+		.exit_latency		= 0,
+		.target_residency	= 0,
+		.enter			= snooze_loop,
+	},
+	{
+		/* Shared Cede */
+		.name			= "Shared Cede",
+		.desc			= "Shared Cede",
+		.exit_latency		= 10,
+		.target_residency	= 100,
+		.enter			= shared_cede_loop,
+	},
+};
+
+/*
+ * CPU hotplug "online" callback: enable the cpuidle device of @cpu if it
+ * exists and a cpuidle driver is registered. Always returns 0.
+ */
+static int pseries_cpuidle_cpu_online(unsigned int cpu)
+{
+	struct cpuidle_device *idle_dev = per_cpu(cpuidle_devices, cpu);
+
+	if (!idle_dev || !cpuidle_get_driver())
+		return 0;
+
+	cpuidle_pause_and_lock();
+	cpuidle_enable_device(idle_dev);
+	cpuidle_resume_and_unlock();
+
+	return 0;
+}
+
+/*
+ * CPU hotplug "dead" callback: disable the cpuidle device of @cpu if it
+ * exists and a cpuidle driver is registered. Always returns 0.
+ */
+static int pseries_cpuidle_cpu_dead(unsigned int cpu)
+{
+	struct cpuidle_device *idle_dev = per_cpu(cpuidle_devices, cpu);
+
+	if (!idle_dev || !cpuidle_get_driver())
+		return 0;
+
+	cpuidle_pause_and_lock();
+	cpuidle_disable_device(idle_dev);
+	cpuidle_resume_and_unlock();
+
+	return 0;
+}
+
+/*
+ * pseries_cpuidle_driver_init()
+ *
+ * Copy every enabled entry (one with an ->enter callback) of the selected
+ * state table into the driver, packing them from index 0 and updating
+ * state_count accordingly. Always returns 0.
+ */
+static int pseries_cpuidle_driver_init(void)
+{
+	struct cpuidle_driver *drv = &pseries_idle_driver;
+	int i;
+
+	drv->state_count = 0;
+
+	for (i = 0; i < max_idle_state; ++i) {
+		const struct cpuidle_state *src = &cpuidle_state_table[i];
+
+		/* Only states with an entry method are enabled. */
+		if (src->enter != NULL) {
+			drv->states[drv->state_count] = *src;
+			drv->state_count += 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Replace the default CEDE exit latency and target residency of the
+ * dedicated state table with values derived from the firmware-provided
+ * extended CEDE records. Nothing is changed when the records cannot be
+ * parsed or none of them has a non-zero latency.
+ */
+static void __init fixup_cede0_latency(void)
+{
+	struct xcede_latency_record *records;
+	u64 lowest_us = UINT_MAX;
+	int i;
+
+	if (parse_cede_parameters())
+		return;
+
+	pr_info("cpuidle: Skipping the %d Extended CEDE idle states\n",
+		nr_xcede_records);
+
+	records = xcede_latency_parameter.payload.records;
+
+	/*
+	 * The CEDE idle state maps to CEDE(0). The hypervisor does not
+	 * advertise an exit latency for CEDE(0), but it does advertise the
+	 * latencies of the extended CEDE states, so the lowest of those is
+	 * used as a proxy.
+	 */
+	for (i = 0; i < nr_xcede_records; i++) {
+		u64 ticks = be64_to_cpu(records[i].latency_ticks);
+		u64 us = DIV_ROUND_UP_ULL(tb_to_ns(ticks), NSEC_PER_USEC);
+
+		/*
+		 * An extended CEDE state is expected to have a non-zero
+		 * exit latency, since it takes at least a few nanoseconds
+		 * to wake up the idle CPU and dispatch the virtual
+		 * processor into the Linux guest. Zero values are
+		 * therefore ignored for the fixup.
+		 */
+		if (!us) {
+			pr_warn("cpuidle: Skipping xcede record %d [hint=%d]. Exit latency = 0us\n",
+				i, records[i].hint);
+			continue;
+		}
+
+		if (us < lowest_us)
+			lowest_us = us;
+	}
+
+	/* No usable record was found: keep the built-in defaults. */
+	if (lowest_us == UINT_MAX)
+		return;
+
+	dedicated_states[1].exit_latency = lowest_us;
+	dedicated_states[1].target_residency = 10 * (lowest_us);
+	pr_info("cpuidle: Fixed up CEDE exit latency to %llu us\n",
+		lowest_us);
+}
+
+/*
+ * pseries_idle_probe()
+ * Choose the state table for a shared versus a dedicated partition and
+ * derive the snooze timeout from the second state's target residency.
+ *
+ * Returns -ENODEV when cpuidle is overridden or the firmware lacks the
+ * SPLPAR feature, 0 otherwise.
+ */
+static int __init pseries_idle_probe(void)
+{
+	if (cpuidle_disable != IDLE_NO_OVERRIDE)
+		return -ENODEV;
+
+	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+		return -ENODEV;
+
+	if (lppaca_shared_proc()) {
+		cpuidle_state_table = shared_states;
+		max_idle_state = ARRAY_SIZE(shared_states);
+	} else {
+		/*
+		 * Use firmware provided latency values starting with
+		 * POWER10 platforms. When running on a POWER10 platform
+		 * in an earlier compat mode, the firmware provided values
+		 * can still be used.
+		 *
+		 * On platforms prior to POWER10 the accuracy of the
+		 * firmware provided latency values cannot be relied upon,
+		 * so the conservative default estimate of 10us is kept.
+		 */
+		if (cpu_has_feature(CPU_FTR_ARCH_31) || pvr_version_is(PVR_POWER10))
+			fixup_cede0_latency();
+		cpuidle_state_table = dedicated_states;
+		max_idle_state = NR_DEDICATED_STATES;
+	}
+
+	if (max_idle_state > 1) {
+		snooze_timeout_en = true;
+		snooze_timeout = cpuidle_state_table[1].target_residency *
+				 tb_ticks_per_usec;
+	}
+
+	return 0;
+}
+
+/*
+ * Driver entry point: probe the platform, build the state table, register
+ * the cpuidle driver and install the CPU hotplug callbacks. A failure to
+ * install a hotplug callback only triggers a warning.
+ */
+static int __init pseries_processor_idle_init(void)
+{
+	int err;
+
+	err = pseries_idle_probe();
+	if (err)
+		return err;
+
+	pseries_cpuidle_driver_init();
+
+	err = cpuidle_register(&pseries_idle_driver, NULL);
+	if (err) {
+		printk(KERN_DEBUG "Registration of pseries driver failed.\n");
+		return err;
+	}
+
+	err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+					"cpuidle/pseries:online",
+					pseries_cpuidle_cpu_online, NULL);
+	WARN_ON(err < 0);
+
+	err = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_DEAD,
+					"cpuidle/pseries:DEAD", NULL,
+					pseries_cpuidle_cpu_dead);
+	WARN_ON(err < 0);
+
+	printk(KERN_DEBUG "pseries_idle_driver registered\n");
+	return 0;
+}
+
+device_initcall(pseries_processor_idle_init);
diff --git a/drivers/cpuidle/cpuidle-qcom-spm.c b/drivers/cpuidle/cpuidle-qcom-spm.c
new file mode 100644
index 0000000000..1fc9968eae
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-qcom-spm.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2011-2014, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2014,2015, Linaro Ltd.
+ *
+ * SAW power controller driver
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/err.h>
+#include <linux/platform_device.h>
+#include <linux/cpuidle.h>
+#include <linux/cpu_pm.h>
+#include <linux/firmware/qcom/qcom_scm.h>
+#include <soc/qcom/spm.h>
+
+#include <asm/proc-fns.h>
+#include <asm/suspend.h>
+
+#include "dt_idle_states.h"
+
+/*
+ * Per-CPU driver instance: a private copy of the cpuidle driver plus the
+ * SPM driver data it controls. spm_enter_idle_state() recovers this struct
+ * from the embedded cpuidle_driver with container_of().
+ */
+struct cpuidle_qcom_spm_data {
+	struct cpuidle_driver cpuidle_driver;
+	struct spm_driver_data *spm;
+};
+
+/*
+ * cpu_suspend() finisher: ask the secure firmware to power the CPU down
+ * with L2 left on. The argument is ignored.
+ */
+static int qcom_pm_collapse(unsigned long int unused)
+{
+	qcom_scm_cpu_power_down(QCOM_SCM_CPU_PWR_DOWN_L2_ON);
+
+	/*
+	 * Returns here only if there was a pending interrupt and we did not
+	 * power down as a result.
+	 */
+	return -1;
+}
+
+/*
+ * Enter the SPC low power mode: select it in the SPM, suspend the CPU via
+ * qcom_pm_collapse() and switch the SPM back to standby afterwards.
+ * Returns the cpu_suspend() result.
+ */
+static int qcom_cpu_spc(struct spm_driver_data *drv)
+{
+	int ret;
+
+	spm_set_low_power_mode(drv, PM_SLEEP_MODE_SPC);
+	ret = cpu_suspend(0, qcom_pm_collapse);
+	/*
+	 * ARM common code executes WFI without calling into our driver and
+	 * if the SPM mode is not reset, then we may accidently power down the
+	 * cpu when we intended only to gate the cpu clock.
+	 * Ensure the state is set to standby before returning.
+	 */
+	spm_set_low_power_mode(drv, PM_SLEEP_MODE_STBY);
+
+	return ret;
+}
+
+/*
+ * cpuidle ->enter callback: locate the per-CPU SPM instance that embeds
+ * @drv and enter state @idx through qcom_cpu_spc() via the
+ * CPU_PM_CPU_IDLE_ENTER_PARAM() helper.
+ */
+static __cpuidle int spm_enter_idle_state(struct cpuidle_device *dev,
+					  struct cpuidle_driver *drv, int idx)
+{
+	struct cpuidle_qcom_spm_data *spm_data;
+
+	spm_data = container_of(drv, struct cpuidle_qcom_spm_data,
+				cpuidle_driver);
+
+	return CPU_PM_CPU_IDLE_ENTER_PARAM(qcom_cpu_spc, idx, spm_data->spm);
+}
+
+/*
+ * Template driver copied into every per-CPU instance by
+ * spm_cpuidle_register(); state 0 is the architectural WFI state.
+ */
+static struct cpuidle_driver qcom_spm_idle_driver = {
+	.name = "qcom_spm",
+	.owner = THIS_MODULE,
+	.states[0] = {
+		.enter			= spm_enter_idle_state,
+		.exit_latency		= 1,
+		.target_residency	= 1,
+		.power_usage		= UINT_MAX,
+		.name			= "WFI",
+		.desc			= "ARM WFI",
+	}
+};
+
+/*
+ * DT match table handed to dt_init_idle_driver(): "qcom,idle-state-spc"
+ * idle-state nodes are associated with spm_enter_idle_state().
+ */
+static const struct of_device_id qcom_idle_state_match[] = {
+	{ .compatible = "qcom,idle-state-spc", .data = spm_enter_idle_state },
+	{ },
+};
+
+/*
+ * spm_cpuidle_register - register a cpuidle driver instance for one CPU
+ * @cpuidle_dev: device owning the devm allocation of the instance
+ * @cpu:         logical CPU number
+ *
+ * Looks up the SAW device referenced by the CPU's "qcom,saw" phandle, takes
+ * its SPM driver data, and registers a private copy of the template driver
+ * restricted to @cpu.
+ *
+ * Returns 0 on success, -ENODEV when the CPU node, SAW node, SAW device or
+ * DT idle states are missing, -ENOMEM on allocation failure, -EINVAL when
+ * the SAW device has no driver data, or the cpuidle_register() error.
+ */
+static int spm_cpuidle_register(struct device *cpuidle_dev, int cpu)
+{
+	struct platform_device *pdev = NULL;
+	struct device_node *cpu_node, *saw_node;
+	struct cpuidle_qcom_spm_data *data = NULL;
+	int ret;
+
+	cpu_node = of_cpu_device_node_get(cpu);
+	if (!cpu_node)
+		return -ENODEV;
+
+	/*
+	 * The CPU node is only needed to resolve the phandle; drop it right
+	 * away so that it is not leaked when there is no SAW node.
+	 */
+	saw_node = of_parse_phandle(cpu_node, "qcom,saw", 0);
+	of_node_put(cpu_node);
+	if (!saw_node)
+		return -ENODEV;
+
+	pdev = of_find_device_by_node(saw_node);
+	of_node_put(saw_node);
+	if (!pdev)
+		return -ENODEV;
+
+	data = devm_kzalloc(cpuidle_dev, sizeof(*data), GFP_KERNEL);
+	if (!data) {
+		put_device(&pdev->dev);
+		return -ENOMEM;
+	}
+
+	/*
+	 * of_find_device_by_node() took a reference on the SAW device;
+	 * release it once the driver data has been retrieved.
+	 */
+	data->spm = dev_get_drvdata(&pdev->dev);
+	put_device(&pdev->dev);
+	if (!data->spm)
+		return -EINVAL;
+
+	data->cpuidle_driver = qcom_spm_idle_driver;
+	data->cpuidle_driver.cpumask = (struct cpumask *)cpumask_of(cpu);
+
+	/* A result of 0 means no DT idle states: nothing worth registering. */
+	ret = dt_init_idle_driver(&data->cpuidle_driver,
+				  qcom_idle_state_match, 1);
+	if (ret <= 0)
+		return ret ? : -ENODEV;
+
+	return cpuidle_register(&data->cpuidle_driver, NULL);
+}
+
+/*
+ * Platform probe: wait for the SCM firmware interface, program the warm
+ * boot address and register a cpuidle driver for every possible CPU.
+ *
+ * Per-CPU registration is best effort: failures other than -ENODEV are
+ * logged, and the probe still returns 0. Returns -EPROBE_DEFER while SCM
+ * is unavailable, or the error from setting the warm boot address.
+ */
+static int spm_cpuidle_drv_probe(struct platform_device *pdev)
+{
+	int cpu, ret;
+
+	if (!qcom_scm_is_available())
+		return -EPROBE_DEFER;
+
+	ret = qcom_scm_set_warm_boot_addr(cpu_resume_arm);
+	if (ret)
+		return dev_err_probe(&pdev->dev, ret, "set warm boot addr failed\n");
+
+	for_each_possible_cpu(cpu) {
+		ret = spm_cpuidle_register(&pdev->dev, cpu);
+		/* -ENODEV just means this CPU is not managed by an SPM. */
+		if (ret && ret != -ENODEV) {
+			dev_err(&pdev->dev,
+				"Cannot register for CPU%d: %d\n", cpu, ret);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Platform driver bound by name to the "qcom-spm-cpuidle" device created
+ * in qcom_spm_cpuidle_init(); bind/unbind sysfs attributes are suppressed.
+ */
+static struct platform_driver spm_cpuidle_driver = {
+	.probe = spm_cpuidle_drv_probe,
+	.driver = {
+		.name = "qcom-spm-cpuidle",
+		.suppress_bind_attrs = true,
+	},
+};
+
+/*
+ * Report whether at least one CPU node references an available SAW node
+ * through its "qcom,saw" phandle.
+ */
+static bool __init qcom_spm_find_any_cpu(void)
+{
+	struct device_node *cpu_node, *saw_node;
+	bool available;
+
+	for_each_of_cpu_node(cpu_node) {
+		saw_node = of_parse_phandle(cpu_node, "qcom,saw", 0);
+		available = of_device_is_available(saw_node);
+		of_node_put(saw_node);
+
+		if (available) {
+			/* Leaving the iterator early: drop its reference. */
+			of_node_put(cpu_node);
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Register the platform driver and, when at least one CPU is managed by an
+ * SPM, the matching "qcom-spm-cpuidle" platform device. If the device
+ * cannot be created the driver is unregistered and the error returned.
+ */
+static int __init qcom_spm_cpuidle_init(void)
+{
+	struct platform_device *pdev;
+	int err;
+
+	err = platform_driver_register(&spm_cpuidle_driver);
+	if (err)
+		return err;
+
+	/* Make sure there is actually any CPU managed by the SPM */
+	if (!qcom_spm_find_any_cpu())
+		return 0;
+
+	pdev = platform_device_register_simple("qcom-spm-cpuidle",
+					       -1, NULL, 0);
+	if (!IS_ERR(pdev))
+		return 0;
+
+	platform_driver_unregister(&spm_cpuidle_driver);
+	return PTR_ERR(pdev);
+}
+device_initcall(qcom_spm_cpuidle_init);
diff --git a/drivers/cpuidle/cpuidle-riscv-sbi.c b/drivers/cpuidle/cpuidle-riscv-sbi.c
new file mode 100644
index 0000000000..e8094fc924
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-riscv-sbi.c
@@ -0,0 +1,634 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * RISC-V SBI CPU idle driver.
+ *
+ * Copyright (c) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "cpuidle-riscv-sbi: " fmt
+
+#include <linux/cpuhotplug.h>
+#include <linux/cpuidle.h>
+#include <linux/cpumask.h>
+#include <linux/cpu_pm.h>
+#include <linux/cpu_cooling.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/platform_device.h>
+#include <linux/pm_domain.h>
+#include <linux/pm_runtime.h>
+#include <asm/cpuidle.h>
+#include <asm/sbi.h>
+#include <asm/smp.h>
+#include <asm/suspend.h>
+
+#include "dt_idle_states.h"
+#include "dt_idle_genpd.h"
+
+/* Per-CPU idle data. */
+struct sbi_cpuidle_data {
+	u32 *states;		/* SBI suspend parameter per idle state index */
+	struct device *dev;	/* device returned by dt_idle_attach_cpu() */
+};
+
+/* Per-CPU domain state override used by the domain idle path. */
+struct sbi_domain_state {
+	bool available;		/* true while @state holds a valid value */
+	u32 state;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct sbi_cpuidle_data, sbi_cpuidle_data);
+static DEFINE_PER_CPU(struct sbi_domain_state, domain_state);
+/* Gates the hierarchical topology setup in sbi_dt_cpu_init_topology(). */
+static bool sbi_cpuidle_use_osi;
+/* Set once any CPU is attached to a domain; enables the cpuhp callbacks. */
+static bool sbi_cpuidle_use_cpuhp;
+/* Set by the sync_state callback once domain states may be picked. */
+static bool sbi_cpuidle_pd_allow_domain_state;
+
+/* Record @state as this CPU's pending domain state and mark it valid. */
+static inline void sbi_set_domain_state(u32 state)
+{
+	struct sbi_domain_state *ds = this_cpu_ptr(&domain_state);
+
+	ds->available = true;
+	ds->state = state;
+}
+
+/* Return this CPU's recorded domain state, whether or not it is valid. */
+static inline u32 sbi_get_domain_state(void)
+{
+	return this_cpu_ptr(&domain_state)->state;
+}
+
+/* Invalidate this CPU's recorded domain state. */
+static inline void sbi_clear_domain_state(void)
+{
+	this_cpu_ptr(&domain_state)->available = false;
+}
+
+/* Tell whether this CPU currently has a valid recorded domain state. */
+static inline bool sbi_is_domain_state_available(void)
+{
+	return this_cpu_ptr(&domain_state)->available;
+}
+
+/*
+ * Issue the SBI HSM hart-suspend call with the given arguments.
+ * Returns 0 on success or the SBI error mapped to a Linux errno.
+ */
+static int sbi_suspend_finisher(unsigned long suspend_type,
+				unsigned long resume_addr,
+				unsigned long opaque)
+{
+	struct sbiret res;
+
+	res = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_SUSPEND,
+			suspend_type, resume_addr, opaque, 0, 0, 0);
+	if (res.error)
+		return sbi_err_map_linux_errno(res.error);
+
+	return 0;
+}
+
+/*
+ * Suspend the hart in @state. States with the non-retentive bit set go
+ * through cpu_suspend(); all others call the finisher directly with no
+ * resume address.
+ */
+static int sbi_suspend(u32 state)
+{
+	if (!(state & SBI_HSM_SUSP_NON_RET_BIT))
+		return sbi_suspend_finisher(state, 0, 0);
+
+	return cpu_suspend(state, sbi_suspend_finisher);
+}
+
+/*
+ * cpuidle ->enter callback for the plain (non-domain) states: look up the
+ * SBI suspend parameter for @idx and enter it through the CPU PM helper
+ * that matches its retention type.
+ */
+static __cpuidle int sbi_cpuidle_enter_state(struct cpuidle_device *dev,
+					     struct cpuidle_driver *drv, int idx)
+{
+	u32 *table = __this_cpu_read(sbi_cpuidle_data.states);
+	u32 param = table[idx];
+
+	if (!(param & SBI_HSM_SUSP_NON_RET_BIT))
+		return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(sbi_suspend,
+							     idx, param);
+
+	return CPU_PM_CPU_IDLE_ENTER_PARAM(sbi_suspend, idx, param);
+}
+
+/*
+ * Common entry path for the domain-aware idle state.
+ *
+ * Drops the CPU's runtime PM (or genpd, for @s2idle) reference on its
+ * domain device, suspends in the recorded domain state if one is
+ * available and in the CPU's own state for @idx otherwise, then takes the
+ * reference back. Returns @idx on success and -1 if cpu_pm_enter() or the
+ * suspend fails.
+ */
+static __cpuidle int __sbi_enter_domain_idle_state(struct cpuidle_device *dev,
+						   struct cpuidle_driver *drv, int idx,
+						   bool s2idle)
+{
+	struct sbi_cpuidle_data *data = this_cpu_ptr(&sbi_cpuidle_data);
+	u32 *states = data->states;
+	struct device *pd_dev = data->dev;
+	u32 state;
+	int ret;
+
+	ret = cpu_pm_enter();
+	if (ret)
+		return -1;
+
+	/* Do runtime PM to manage a hierarchical CPU topology. */
+	if (s2idle)
+		dev_pm_genpd_suspend(pd_dev);
+	else
+		pm_runtime_put_sync_suspend(pd_dev);
+
+	ct_cpuidle_enter();
+
+	/* Prefer the recorded domain state when one is available. */
+	if (sbi_is_domain_state_available())
+		state = sbi_get_domain_state();
+	else
+		state = states[idx];
+
+	ret = sbi_suspend(state) ? -1 : idx;
+
+	ct_cpuidle_exit();
+
+	if (s2idle)
+		dev_pm_genpd_resume(pd_dev);
+	else
+		pm_runtime_get_sync(pd_dev);
+
+	cpu_pm_exit();
+
+	/* Clear the domain state to start fresh when back from idle. */
+	sbi_clear_domain_state();
+	return ret;
+}
+
+/* ->enter callback for the domain idle state (runtime PM variant). */
+static int sbi_enter_domain_idle_state(struct cpuidle_device *dev,
+				       struct cpuidle_driver *drv, int idx)
+{
+	return __sbi_enter_domain_idle_state(dev, drv, idx, false);
+}
+
+/* ->enter_s2idle callback for the domain idle state (genpd variant). */
+static int sbi_enter_s2idle_domain_idle_state(struct cpuidle_device *dev,
+					      struct cpuidle_driver *drv,
+					      int idx)
+{
+	return __sbi_enter_domain_idle_state(dev, drv, idx, true);
+}
+
+/*
+ * CPU hotplug startup callback: take a runtime PM reference on this CPU's
+ * domain device, if it has one. Always returns 0.
+ */
+static int sbi_cpuidle_cpuhp_up(unsigned int cpu)
+{
+	struct device *pd_dev = __this_cpu_read(sbi_cpuidle_data.dev);
+
+	if (!pd_dev)
+		return 0;
+
+	pm_runtime_get_sync(pd_dev);
+	return 0;
+}
+
+/*
+ * CPU hotplug teardown callback: drop the runtime PM reference on this
+ * CPU's domain device, if it has one, and forget any recorded domain
+ * state. Always returns 0.
+ */
+static int sbi_cpuidle_cpuhp_down(unsigned int cpu)
+{
+	struct device *pd_dev = __this_cpu_read(sbi_cpuidle_data.dev);
+
+	if (!pd_dev)
+		return 0;
+
+	pm_runtime_put_sync(pd_dev);
+	/* Clear domain state to start fresh at next online. */
+	sbi_clear_domain_state();
+
+	return 0;
+}
+
+/*
+ * Install the CPU hotplug callbacks when at least one CPU uses the domain
+ * topology. A failure is only reported with a warning.
+ */
+static void sbi_idle_init_cpuhp(void)
+{
+	int err;
+
+	if (sbi_cpuidle_use_cpuhp) {
+		err = cpuhp_setup_state_nocalls(CPUHP_AP_CPU_PM_STARTING,
+						"cpuidle/sbi:online",
+						sbi_cpuidle_cpuhp_up,
+						sbi_cpuidle_cpuhp_down);
+		if (err)
+			pr_warn("Failed %d while setup cpuhp state\n", err);
+	}
+}
+
+/*
+ * DT match table handed to dt_init_idle_driver(): "riscv,idle-state"
+ * idle-state nodes are associated with sbi_cpuidle_enter_state().
+ */
+static const struct of_device_id sbi_cpuidle_state_match[] = {
+	{ .compatible = "riscv,idle-state",
+	  .data = sbi_cpuidle_enter_state },
+	{ },
+};
+
+/*
+ * A suspend state is rejected when it lies strictly between the default
+ * and platform values of either the retentive or the non-retentive range;
+ * every other value is accepted.
+ */
+static bool sbi_suspend_state_is_valid(u32 state)
+{
+	bool in_ret_gap = state > SBI_HSM_SUSPEND_RET_DEFAULT &&
+			  state < SBI_HSM_SUSPEND_RET_PLATFORM;
+	bool in_non_ret_gap = state > SBI_HSM_SUSPEND_NON_RET_DEFAULT &&
+			      state < SBI_HSM_SUSPEND_NON_RET_PLATFORM;
+
+	return !in_ret_gap && !in_non_ret_gap;
+}
+
+/*
+ * Read the "riscv,sbi-suspend-param" property of idle-state node @np into
+ * @state. Returns 0 on success, the property read error, or -EINVAL when
+ * the value is not a valid SBI suspend state.
+ */
+static int sbi_dt_parse_state_node(struct device_node *np, u32 *state)
+{
+	int ret;
+
+	ret = of_property_read_u32(np, "riscv,sbi-suspend-param", state);
+	if (ret) {
+		pr_warn("%pOF missing riscv,sbi-suspend-param property\n", np);
+		return ret;
+	}
+
+	if (sbi_suspend_state_is_valid(*state))
+		return 0;
+
+	pr_warn("Invalid SBI suspend state %#x\n", *state);
+	return -EINVAL;
+}
+
+/*
+ * Optionally hook @cpu into the hierarchical (PM domain) topology.
+ *
+ * Does nothing and returns 0 unless OSI mode is in use. Otherwise the CPU
+ * is attached with dt_idle_attach_cpu(); a NULL result yields 0 and an
+ * error pointer yields its error code. On a successful attach the deepest
+ * state of @drv is redirected to the domain-aware entry callbacks and CPU
+ * hotplug handling is requested.
+ */
+static int sbi_dt_cpu_init_topology(struct cpuidle_driver *drv,
+				     struct sbi_cpuidle_data *data,
+				     unsigned int state_count, int cpu)
+{
+	struct cpuidle_state *deepest;
+
+	/* Currently limit the hierarchical topology to be used in OSI mode. */
+	if (!sbi_cpuidle_use_osi)
+		return 0;
+
+	data->dev = dt_idle_attach_cpu(cpu, "sbi");
+	if (IS_ERR_OR_NULL(data->dev))
+		return PTR_ERR_OR_ZERO(data->dev);
+
+	/*
+	 * The deepest CPU state is the one used to trigger a potential
+	 * selection of a shared domain state; this assumes all domain states
+	 * are deeper than the CPU states.
+	 */
+	deepest = &drv->states[state_count - 1];
+	deepest->flags |= CPUIDLE_FLAG_RCU_IDLE;
+	deepest->enter = sbi_enter_domain_idle_state;
+	deepest->enter_s2idle = sbi_enter_s2idle_domain_idle_state;
+	sbi_cpuidle_use_cpuhp = true;
+
+	return 0;
+}
+
+/*
+ * sbi_cpuidle_dt_init_states - build the per-CPU SBI suspend parameter table
+ * @dev:         device owning the devm allocation of the table
+ * @drv:         cpuidle driver being set up for @cpu
+ * @cpu:         logical CPU number
+ * @state_count: number of idle states including WFI at index 0
+ *
+ * Parses the suspend parameter of every DT idle state of @cpu, runs the
+ * optional topology initialization and, only if everything succeeded,
+ * publishes the table in the per-CPU data.
+ *
+ * Returns 0 on success, -ENODEV when the CPU node or one of its state nodes
+ * is missing, -ENOMEM on allocation failure, or the parse/topology error.
+ * The CPU node reference is dropped on every path.
+ */
+static int sbi_cpuidle_dt_init_states(struct device *dev,
+					struct cpuidle_driver *drv,
+					unsigned int cpu,
+					unsigned int state_count)
+{
+	struct sbi_cpuidle_data *data = per_cpu_ptr(&sbi_cpuidle_data, cpu);
+	struct device_node *state_node;
+	struct device_node *cpu_node;
+	u32 *states;
+	int i, ret;
+
+	cpu_node = of_cpu_device_node_get(cpu);
+	if (!cpu_node)
+		return -ENODEV;
+
+	states = devm_kcalloc(dev, state_count, sizeof(*states), GFP_KERNEL);
+	if (!states) {
+		ret = -ENOMEM;
+		goto out_put_node;
+	}
+
+	/* Parse SBI specific details from state DT nodes */
+	for (i = 1; i < state_count; i++) {
+		state_node = of_get_cpu_state_node(cpu_node, i - 1);
+		if (!state_node)
+			break;
+
+		ret = sbi_dt_parse_state_node(state_node, &states[i]);
+		of_node_put(state_node);
+
+		/* Leave through the common exit so cpu_node is released. */
+		if (ret)
+			goto out_put_node;
+
+		pr_debug("sbi-state %#x index %d\n", states[i], i);
+	}
+	/* The loop stopped early: fewer state nodes than were counted. */
+	if (i != state_count) {
+		ret = -ENODEV;
+		goto out_put_node;
+	}
+
+	/* Initialize optional data, used for the hierarchical topology. */
+	ret = sbi_dt_cpu_init_topology(drv, data, state_count, cpu);
+	if (ret < 0)
+		goto out_put_node;
+
+	/* Store states in the per-cpu struct. */
+	data->states = states;
+
+out_put_node:
+	of_node_put(cpu_node);
+
+	return ret;
+}
+
+/*
+ * Undo the topology setup of @cpu: detach its per-CPU device and stop
+ * requesting CPU hotplug handling.
+ */
+static void sbi_cpuidle_deinit_cpu(int cpu)
+{
+	dt_idle_detach_cpu(per_cpu_ptr(&sbi_cpuidle_data, cpu)->dev);
+	sbi_cpuidle_use_cpuhp = false;
+}
+
+/*
+ * Allocate, populate and register a cpuidle driver for a single CPU.
+ *
+ * Returns 0 on success or a negative errno.  -ENODEV is returned when the
+ * DT describes no idle state besides WFI.
+ */
+static int sbi_cpuidle_init_cpu(struct device *dev, int cpu)
+{
+	struct cpuidle_driver *drv;
+	struct cpuidle_state *wfi;
+	unsigned int state_count;
+	int ret;
+
+	drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL);
+	if (!drv)
+		return -ENOMEM;
+
+	drv->name = "sbi_cpuidle";
+	drv->owner = THIS_MODULE;
+	drv->cpumask = (struct cpumask *)cpumask_of(cpu);
+
+	/* State index 0 always represents the RISC-V architectural WFI. */
+	wfi = &drv->states[0];
+	wfi->enter = sbi_cpuidle_enter_state;
+	wfi->exit_latency = 1;
+	wfi->target_residency = 1;
+	wfi->power_usage = UINT_MAX;
+	strcpy(wfi->name, "WFI");
+	strcpy(wfi->desc, "RISC-V WFI");
+
+	/*
+	 * A return value of 0 means that no DT idle states were found.  In
+	 * that case fail the initialization: the default architectural
+	 * back-end already executes wfi on idle entry, so a WFI-only idle
+	 * driver would add nothing.
+	 */
+	ret = dt_init_idle_driver(drv, sbi_cpuidle_state_match, 1);
+	if (ret <= 0) {
+		pr_debug("HART%ld: failed to parse DT idle states\n",
+			 cpuid_to_hartid_map(cpu));
+		return ret < 0 ? ret : -ENODEV;
+	}
+
+	/* Account for the WFI state on top of the DT states. */
+	state_count = ret + 1;
+
+	ret = sbi_cpuidle_dt_init_states(dev, drv, cpu, state_count);
+	if (ret) {
+		pr_err("HART%ld: failed to init idle states\n",
+		       cpuid_to_hartid_map(cpu));
+		return ret;
+	}
+
+	ret = cpuidle_register(drv, NULL);
+	if (ret) {
+		sbi_cpuidle_deinit_cpu(cpu);
+		return ret;
+	}
+
+	cpuidle_cooling_register(drv);
+
+	return 0;
+}
+
+/*
+ * Driver sync_state callback.  Once it runs, every device has been
+ * attached/probed to the PM domain topology, so domain idle states may be
+ * selected from now on.
+ */
+static void sbi_cpuidle_domain_sync_state(struct device *dev)
+{
+	sbi_cpuidle_pd_allow_domain_state = true;
+}
+
+#ifdef CONFIG_DT_IDLE_GENPD
+
+/*
+ * genpd ->power_off() callback: record the SBI suspend parameter of the
+ * selected domain state.  Returns -EBUSY until domain states are allowed.
+ */
+static int sbi_cpuidle_pd_power_off(struct generic_pm_domain *pd)
+{
+	u32 *pd_state = pd->states[pd->state_idx].data;
+
+	/* Nothing to program for a state without SBI data. */
+	if (!pd_state)
+		return 0;
+
+	if (!sbi_cpuidle_pd_allow_domain_state)
+		return -EBUSY;
+
+	/* OSI mode is enabled, set the corresponding domain state. */
+	sbi_set_domain_state(*pd_state);
+
+	return 0;
+}
+
+/* Bookkeeping entry for one registered genpd OF provider. */
+struct sbi_pd_provider {
+	struct list_head link;		/* entry in sbi_pd_providers */
+	struct device_node *node;	/* provider DT node, reference held */
+};
+
+/* All providers registered by sbi_pd_init(), walked by sbi_pd_remove(). */
+static LIST_HEAD(sbi_pd_providers);
+
+/*
+ * Create a generic PM domain for DT node @np and register it as an OF
+ * provider.  Returns 0 on success or a negative errno; on failure every
+ * resource acquired so far is released again.
+ */
+static int sbi_pd_init(struct device_node *np)
+{
+	struct dev_power_governor *gov = NULL;
+	struct generic_pm_domain *genpd;
+	struct sbi_pd_provider *entry;
+	int ret = -ENOMEM;
+
+	genpd = dt_idle_pd_alloc(np, sbi_dt_parse_state_node);
+	if (!genpd)
+		goto out;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		goto free_pd;
+
+	genpd->flags |= GENPD_FLAG_IRQ_SAFE | GENPD_FLAG_CPU_DOMAIN;
+
+	/* Without OSI the domain must stay on; with OSI it may power off. */
+	if (!sbi_cpuidle_use_osi)
+		genpd->flags |= GENPD_FLAG_ALWAYS_ON;
+	else
+		genpd->power_off = sbi_cpuidle_pd_power_off;
+
+	/* A governor is only useful if the domain has states to manage. */
+	if (genpd->states)
+		gov = &pm_domain_cpu_gov;
+
+	ret = pm_genpd_init(genpd, gov, false);
+	if (ret)
+		goto free_pd_prov;
+
+	ret = of_genpd_add_provider_simple(np, genpd);
+	if (ret)
+		goto remove_pd;
+
+	entry->node = of_node_get(np);
+	list_add(&entry->link, &sbi_pd_providers);
+
+	pr_debug("init PM domain %s\n", genpd->name);
+	return 0;
+
+remove_pd:
+	pm_genpd_remove(genpd);
+free_pd_prov:
+	kfree(entry);
+free_pd:
+	dt_idle_pd_free(genpd);
+out:
+	pr_err("failed to init PM domain ret=%d %pOF\n", ret, np);
+	return ret;
+}
+
+/* Unregister and free every provider recorded in sbi_pd_providers. */
+static void sbi_pd_remove(void)
+{
+	struct sbi_pd_provider *entry, *next;
+
+	list_for_each_entry_safe(entry, next, &sbi_pd_providers, link) {
+		struct generic_pm_domain *pd;
+
+		of_genpd_del_provider(entry->node);
+
+		/* Only free the domain if its removal actually succeeded. */
+		pd = of_genpd_remove_last(entry->node);
+		if (!IS_ERR(pd))
+			kfree(pd);
+
+		of_node_put(entry->node);
+		list_del(&entry->link);
+		kfree(entry);
+	}
+}
+
+/*
+ * Create a genpd/genpd-of-provider pair for each child of @np that has a
+ * "#power-domain-cells" property, then link the domains into a topology.
+ *
+ * Returns 0 on success (also when no such child exists) or a negative
+ * errno, in which case all domains created here are removed again.
+ */
+static int sbi_genpd_probe(struct device_node *np)
+{
+	struct device_node *child;
+	int pd_count = 0;
+	int ret = 0;
+
+	if (!np)
+		return -ENODEV;
+
+	for_each_child_of_node(np, child) {
+		if (!of_property_present(child, "#power-domain-cells"))
+			continue;
+
+		ret = sbi_pd_init(child);
+		if (ret) {
+			/* Leaving the iterator early: drop its reference. */
+			of_node_put(child);
+			goto remove_pd;
+		}
+
+		pd_count++;
+	}
+
+	/* Nothing more to do if not using the hierarchical CPU topology. */
+	if (!pd_count)
+		return 0;
+
+	/* Link genpd masters/subdomains to model the CPU topology. */
+	ret = dt_idle_pd_init_topology(np);
+	if (!ret)
+		return 0;
+
+remove_pd:
+	sbi_pd_remove();
+	pr_err("failed to create CPU PM domains ret=%d\n", ret);
+	return ret;
+}
+
+#else
+
+/* Stub used when CONFIG_DT_IDLE_GENPD is not set: always succeeds. */
+static inline int sbi_genpd_probe(struct device_node *np)
+{
+	return 0;
+}
+
+#endif
+
+/*
+ * sbi_cpuidle_probe - set up SBI cpuidle for every possible CPU
+ * @pdev: the "sbi-cpuidle" platform device
+ *
+ * OSI mode is used only if every possible CPU DT node has both the
+ * "power-domains" and "power-domain-names" properties.  If registration
+ * fails for some CPU, the lower-numbered CPUs are unregistered again.
+ *
+ * Return: 0 on success, a negative errno otherwise.
+ */
+static int sbi_cpuidle_probe(struct platform_device *pdev)
+{
+	int cpu, ret;
+	struct cpuidle_driver *drv;
+	struct cpuidle_device *dev;
+	struct device_node *np, *pds_node;
+	bool has_pd;
+
+	/* Detect OSI support based on CPU DT nodes */
+	sbi_cpuidle_use_osi = true;
+	for_each_possible_cpu(cpu) {
+		np = of_cpu_device_node_get(cpu);
+		has_pd = np &&
+			 of_property_present(np, "power-domains") &&
+			 of_property_present(np, "power-domain-names");
+		/* Drop the reference taken above; NULL is accepted. */
+		of_node_put(np);
+
+		if (!has_pd) {
+			sbi_cpuidle_use_osi = false;
+			break;
+		}
+	}
+
+	/* Populate generic power domains from DT nodes */
+	pds_node = of_find_node_by_path("/cpus/power-domains");
+	if (pds_node) {
+		ret = sbi_genpd_probe(pds_node);
+		of_node_put(pds_node);
+		if (ret)
+			return ret;
+	}
+
+	/* Initialize CPU idle driver for each CPU */
+	for_each_possible_cpu(cpu) {
+		ret = sbi_cpuidle_init_cpu(&pdev->dev, cpu);
+		if (ret) {
+			pr_debug("HART%ld: idle driver init failed\n",
+				 cpuid_to_hartid_map(cpu));
+			goto out_fail;
+		}
+	}
+
+	/* Setup CPU hotplug notifiers */
+	sbi_idle_init_cpuhp();
+
+	pr_info("idle driver registered for all CPUs\n");
+
+	return 0;
+
+out_fail:
+	/* Unwind the CPUs below the one that failed. */
+	while (--cpu >= 0) {
+		dev = per_cpu(cpuidle_devices, cpu);
+		drv = cpuidle_get_cpu_driver(dev);
+		cpuidle_unregister(drv);
+		sbi_cpuidle_deinit_cpu(cpu);
+	}
+
+	return ret;
+}
+
+/* Platform driver bound to the "sbi-cpuidle" device created at init time. */
+static struct platform_driver sbi_cpuidle_driver = {
+	.probe = sbi_cpuidle_probe,
+	.driver = {
+		.name = "sbi-cpuidle",
+		/* Enables domain idle state selection, see the callback. */
+		.sync_state = sbi_cpuidle_domain_sync_state,
+	},
+};
+
+/*
+ * Register the SBI cpuidle platform driver and its device.  Does nothing
+ * (and reports success) when SBI HSM suspend is not available.
+ */
+static int __init sbi_cpuidle_init(void)
+{
+	struct platform_device *pdev;
+	int err;
+
+	/*
+	 * HSM suspend needs SBI v0.3 or newer together with the SBI HSM
+	 * extension.
+	 */
+	if (sbi_spec_version < sbi_mk_version(0, 3) ||
+	    !sbi_probe_extension(SBI_EXT_HSM)) {
+		pr_info("HSM suspend not available\n");
+		return 0;
+	}
+
+	err = platform_driver_register(&sbi_cpuidle_driver);
+	if (err)
+		return err;
+
+	pdev = platform_device_register_simple("sbi-cpuidle", -1, NULL, 0);
+	if (!IS_ERR(pdev))
+		return 0;
+
+	/* No device to bind to: take the driver down again. */
+	platform_driver_unregister(&sbi_cpuidle_driver);
+	return PTR_ERR(pdev);
+}
+device_initcall(sbi_cpuidle_init);
diff --git a/drivers/cpuidle/cpuidle-tegra.c b/drivers/cpuidle/cpuidle-tegra.c
new file mode 100644
index 0000000000..b203a93dea
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-tegra.c
@@ -0,0 +1,402 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CPU idle driver for Tegra CPUs
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ * Copyright (c) 2011 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *         Gary King <gking@nvidia.com>
+ *
+ * Rework for 3.3 by Peter De Schrijver <pdeschrijver@nvidia.com>
+ *
+ * Tegra20/124 driver unification by Dmitry Osipenko <digetx@gmail.com>
+ */
+
+#define pr_fmt(fmt)	"tegra-cpuidle: " fmt
+
+#include <linux/atomic.h>
+#include <linux/cpuidle.h>
+#include <linux/cpumask.h>
+#include <linux/cpu_pm.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/platform_device.h>
+#include <linux/types.h>
+
+#include <linux/clk/tegra.h>
+#include <linux/firmware/trusted_foundations.h>
+
+#include <soc/tegra/cpuidle.h>
+#include <soc/tegra/flowctrl.h>
+#include <soc/tegra/fuse.h>
+#include <soc/tegra/irq.h>
+#include <soc/tegra/pm.h>
+#include <soc/tegra/pmc.h>
+
+#include <asm/cpuidle.h>
+#include <asm/firmware.h>
+#include <asm/smp_plat.h>
+#include <asm/suspend.h>
+
+/* Indices into tegra_idle_driver.states[]. */
+enum tegra_state {
+	TEGRA_C1,		/* WFI */
+	TEGRA_C7,		/* CPU core powered off */
+	TEGRA_CC6,		/* CPU cluster powered off */
+	TEGRA_STATE_COUNT,
+};
+
+/* Barrier counter shared by the CPUs entering the coupled CC6 state. */
+static atomic_t tegra_idle_barrier;
+/* Set by any CPU that must abort the current CC6 entry attempt. */
+static atomic_t tegra_abort_flag;
+
+/* Log the online status and flow controller CSR of every possible CPU. */
+static void tegra_cpuidle_report_cpus_state(void)
+{
+	unsigned long lcpu;
+
+	for_each_cpu(lcpu, cpu_possible_mask) {
+		unsigned long cpu = cpu_logical_map(lcpu);
+		unsigned long csr = flowctrl_read_cpu_csr(cpu);
+
+		pr_err("cpu%lu: online=%d flowctrl_csr=0x%08lx\n",
+		       cpu, cpu_online(lcpu), csr);
+	}
+}
+
+/*
+ * Poll until the CPU rail is ready to be turned off.
+ *
+ * Up to three rounds of polling are made; after each failed round the
+ * state of all CPUs is logged.  Returns 0 once the rail is ready or
+ * -ETIMEDOUT when all rounds are exhausted.
+ */
+static int tegra_cpuidle_wait_for_secondary_cpus_parking(void)
+{
+	const unsigned int delay_us = 10;
+	const unsigned int max_polls = 500 * 1000 / delay_us;
+	unsigned int round, poll;
+
+	for (round = 0; round < 3; round++) {
+		/*
+		 * The primary CPU0 core shall wait for the secondaries
+		 * shutdown in order to power-off CPU's cluster safely.
+		 * The timeout value depends on the current CPU frequency,
+		 * it takes about 40-150us in average and over 1000us in
+		 * a worst case scenario.
+		 */
+		for (poll = 0; poll <= max_polls; poll++) {
+			if (tegra_cpu_rail_off_ready())
+				return 0;
+
+			udelay(delay_us);
+		}
+
+		pr_err("secondary CPU taking too long to park\n");
+
+		tegra_cpuidle_report_cpus_state();
+	}
+
+	pr_err("timed out waiting secondaries to park\n");
+
+	return -ETIMEDOUT;
+}
+
+/* Re-enable clock, release reset and clear halt for each online secondary. */
+static void tegra_cpuidle_unpark_secondary_cpus(void)
+{
+	unsigned int lcpu;
+
+	for_each_cpu(lcpu, cpu_online_mask) {
+		unsigned int cpu = cpu_logical_map(lcpu);
+
+		/* Physical CPU0 is the primary, nothing to unpark. */
+		if (cpu == 0)
+			continue;
+
+		tegra_enable_cpu_clock(cpu);
+		tegra_cpu_out_of_reset(cpu);
+		flowctrl_write_cpu_halt(cpu, 0);
+	}
+}
+
+/*
+ * Enter CC6 on physical CPU @cpu: secondaries park themselves, CPU0 waits
+ * for them, enters LP2 and unparks the secondaries afterwards.
+ */
+static int tegra_cpuidle_cc6_enter(unsigned int cpu)
+{
+	int err;
+
+	if (cpu > 0)
+		return cpu_suspend(cpu, tegra_pm_park_secondary_cpu);
+
+	err = tegra_cpuidle_wait_for_secondary_cpus_parking();
+	if (!err)
+		err = tegra_pm_enter_lp2();
+
+	/* Secondaries are unparked whether or not LP2 was entered. */
+	tegra_cpuidle_unpark_secondary_cpus();
+
+	return err;
+}
+
+/* Enter C7: let firmware prepare for idle (if it can), then suspend. */
+static int tegra_cpuidle_c7_enter(void)
+{
+	int ret = call_firmware_op(prepare_idle, TF_PM_MODE_LP2_NOFLUSH_L2);
+
+	/* -ENOSYS from the firmware op is tolerated, other errors are not. */
+	if (ret == 0 || ret == -ENOSYS)
+		return cpu_suspend(0, tegra30_pm_secondary_cpu_suspend);
+
+	return ret;
+}
+
+/*
+ * Synchronize the coupled CPUs before CC6 entry.  Returns 0 when entry may
+ * proceed or -EINTR when any CPU raised the abort flag.
+ */
+static int tegra_cpuidle_coupled_barrier(struct cpuidle_device *dev)
+{
+	/*
+	 * CPU got local interrupt that will be lost after GIC's shutdown
+	 * because GIC driver doesn't save/restore the pending SGI state
+	 * across CPU cluster PM.  Abort and retry next time.
+	 */
+	if (tegra_pending_sgi())
+		atomic_set(&tegra_abort_flag, 1);
+
+	cpuidle_coupled_parallel_barrier(dev, &tegra_idle_barrier);
+
+	if (!atomic_read(&tegra_abort_flag))
+		return 0;
+
+	/* Second barrier precedes the flag reset. */
+	cpuidle_coupled_parallel_barrier(dev, &tegra_idle_barrier);
+	atomic_set(&tegra_abort_flag, 0);
+
+	return -EINTR;
+}
+
+/*
+ * Enter the C7 or CC6 state on physical CPU @cpu.
+ *
+ * Returns @index on success, or a negative error from the coupled barrier
+ * or from the state-specific entry routine.  Any other @index yields
+ * -EINVAL.
+ */
+static __cpuidle int tegra_cpuidle_state_enter(struct cpuidle_device *dev,
+					       int index, unsigned int cpu)
+{
+	int err;
+
+	/*
+	 * CC6 state is the "CPU cluster power-off" state.  In order to
+	 * enter this state, at first the secondary CPU cores need to be
+	 * parked into offline mode, then the last CPU should clean out
+	 * remaining dirty cache lines into DRAM and trigger Flow Controller
+	 * logic that turns off the cluster's power domain (which includes
+	 * CPU cores, GIC and L2 cache).
+	 */
+	if (index == TEGRA_CC6) {
+		err = tegra_cpuidle_coupled_barrier(dev);
+		if (err)
+			return err;
+	}
+
+	/* Preparation steps; undone in reverse order after the switch. */
+	local_fiq_disable();
+	tegra_pm_set_cpu_in_lp2();
+	cpu_pm_enter();
+
+	ct_cpuidle_enter();
+
+	switch (index) {
+	case TEGRA_C7:
+		err = tegra_cpuidle_c7_enter();
+		break;
+
+	case TEGRA_CC6:
+		err = tegra_cpuidle_cc6_enter(cpu);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	ct_cpuidle_exit();
+
+	cpu_pm_exit();
+	tegra_pm_clear_cpu_in_lp2();
+	local_fiq_enable();
+
+	/* A zero err means the state was entered: report its index. */
+	return err ?: index;
+}
+
+/*
+ * Remap a C7 request made by CPU0 on Tegra30; every other request is
+ * returned unchanged.
+ */
+static int tegra_cpuidle_adjust_state_index(int index, unsigned int cpu)
+{
+	bool keep_awake;
+
+	/*
+	 * On Tegra30 CPU0 can't be power-gated separately from secondary
+	 * cores because it gates the whole CPU cluster.
+	 */
+	if (cpu > 0 || index != TEGRA_C7 || tegra_get_chip_id() != TEGRA30)
+		return index;
+
+	/* put CPU0 into C1 if C7 is requested and secondaries are online */
+	keep_awake = !IS_ENABLED(CONFIG_PM_SLEEP) || num_online_cpus() > 1;
+
+	return keep_awake ? TEGRA_C1 : TEGRA_CC6;
+}
+
+/*
+ * cpuidle ->enter() callback shared by the C7 and CC6 states.
+ *
+ * Returns the index of the state that was entered, or -1 if the adjusted
+ * state is disabled or entering it failed.
+ */
+static __cpuidle int tegra_cpuidle_enter(struct cpuidle_device *dev,
+					 struct cpuidle_driver *drv,
+					 int index)
+{
+	/* Sampled for the requested state, before the index is adjusted. */
+	bool do_rcu = drv->states[index].flags & CPUIDLE_FLAG_RCU_IDLE;
+	unsigned int cpu = cpu_logical_map(dev->cpu);
+	int ret;
+
+	index = tegra_cpuidle_adjust_state_index(index, cpu);
+	if (dev->states_usage[index].disable)
+		return -1;
+
+	if (index != TEGRA_C1) {
+		ret = tegra_cpuidle_state_enter(dev, index, cpu);
+	} else {
+		if (do_rcu)
+			ct_cpuidle_enter();
+		ret = arm_cpuidle_simple_enter(dev, drv, index);
+		if (do_rcu)
+			ct_cpuidle_exit();
+	}
+
+	if (ret >= 0)
+		return ret;
+
+	/* An aborted CC6 attempt (-EINTR) is not worth reporting. */
+	if (ret != -EINTR || index != TEGRA_CC6)
+		pr_err_once("failed to enter state %d err: %d\n",
+			    index, ret);
+
+	return -1;
+}
+
+/* ->enter_s2idle() callback: enter the state and always report success. */
+static int tegra114_enter_s2idle(struct cpuidle_device *dev,
+				 struct cpuidle_driver *drv,
+				 int index)
+{
+	/* The outcome of the regular entry path is deliberately dropped. */
+	(void)tegra_cpuidle_enter(dev, drv, index);
+
+	return 0;
+}
+
+/*
+ * The previous versions of Tegra CPUIDLE driver used a different "legacy"
+ * terminology for naming of the idling states, while this driver uses the
+ * new terminology.
+ *
+ * Mapping of the old terms into the new ones:
+ *
+ * Old | New
+ * ---------
+ * LP3 | C1	(CPU core clock gating)
+ * LP2 | C7	(CPU core power gating)
+ * LP2 | CC6	(CPU cluster power gating)
+ *
+ * Note that the older CPUIDLE driver versions didn't explicitly
+ * differentiate the LP2 states because these states either used the same
+ * code path or because CC6 wasn't supported.
+ */
+/* State table indexed by enum tegra_state; C1 is the safe fallback state. */
+static struct cpuidle_driver tegra_idle_driver = {
+	.name = "tegra_idle",
+	.states = {
+		[TEGRA_C1] = ARM_CPUIDLE_WFI_STATE_PWR(600),
+		[TEGRA_C7] = {
+			.enter			= tegra_cpuidle_enter,
+			.exit_latency		= 2000,
+			.target_residency	= 2200,
+			.power_usage		= 100,
+			.flags			= CPUIDLE_FLAG_TIMER_STOP |
+						  CPUIDLE_FLAG_RCU_IDLE,
+			.name			= "C7",
+			.desc			= "CPU core powered off",
+		},
+		[TEGRA_CC6] = {
+			.enter			= tegra_cpuidle_enter,
+			.exit_latency		= 5000,
+			.target_residency	= 10000,
+			.power_usage		= 0,
+			/* The only state entered through the coupled path. */
+			.flags			= CPUIDLE_FLAG_TIMER_STOP |
+						  CPUIDLE_FLAG_RCU_IDLE   |
+						  CPUIDLE_FLAG_COUPLED,
+			.name			= "CC6",
+			.desc			= "CPU cluster powered off",
+		},
+	},
+	.state_count = TEGRA_STATE_COUNT,
+	.safe_state_index = TEGRA_C1,
+};
+
+/* Mark @state of tegra_idle_driver as disabled at the driver level. */
+static inline void tegra_cpuidle_disable_state(enum tegra_state state)
+{
+	cpuidle_driver_state_disabled(&tegra_idle_driver, state, true);
+}
+
+/*
+ * Tegra20 HW appears to have a bug such that PCIe device interrupts, whether
+ * they are legacy IRQs or MSI, are lost when CC6 is enabled.  As a
+ * workaround, disable CC6 on Tegra20 unless it is already unusable.
+ */
+void tegra_cpuidle_pcie_irqs_in_use(void)
+{
+	/* Nothing to do if CC6 has already been made unusable. */
+	if (tegra_idle_driver.states[TEGRA_CC6].flags & CPUIDLE_FLAG_UNUSABLE)
+		return;
+
+	if (tegra_get_chip_id() != TEGRA20)
+		return;
+
+	pr_info("disabling CC6 state, since PCIe IRQs are in use\n");
+	tegra_cpuidle_disable_state(TEGRA_CC6);
+}
+
+/* Override the C7 timings and add s2idle support for Tegra114/124. */
+static void tegra_cpuidle_setup_tegra114_c7_state(void)
+{
+	struct cpuidle_state *c7 = &tegra_idle_driver.states[TEGRA_C7];
+
+	c7->exit_latency = 500;
+	c7->target_residency = 1000;
+	c7->enter_s2idle = tegra114_enter_s2idle;
+}
+
+/*
+ * Disable the idle states that the current configuration or SoC cannot
+ * use, then register the driver for all possible CPUs.
+ *
+ * Returns -EPROBE_DEFER while the PMC suspend mode is not ready and
+ * -EINVAL for an unrecognized chip id.
+ */
+static int tegra_cpuidle_probe(struct platform_device *pdev)
+{
+	if (tegra_pmc_get_suspend_mode() == TEGRA_SUSPEND_NOT_READY)
+		return -EPROBE_DEFER;
+
+	/* LP2 could be disabled in device-tree */
+	if (tegra_pmc_get_suspend_mode() < TEGRA_SUSPEND_LP2)
+		tegra_cpuidle_disable_state(TEGRA_CC6);
+
+	/*
+	 * Without the PM-sleep option the suspend-resume functionality
+	 * provided by the Tegra-arch core and PMC driver is unavailable,
+	 * so neither power-gating state can be used.
+	 */
+	if (!IS_ENABLED(CONFIG_PM_SLEEP)) {
+		tegra_cpuidle_disable_state(TEGRA_C7);
+		tegra_cpuidle_disable_state(TEGRA_CC6);
+	}
+
+	/*
+	 * Generic WFI state (also known as C1 or LP3) and the coupled CPU
+	 * cluster power-off (CC6 or LP2) states are common for all Tegra SoCs.
+	 */
+	switch (tegra_get_chip_id()) {
+	case TEGRA114:
+	case TEGRA124:
+		tegra_cpuidle_setup_tegra114_c7_state();
+
+		/* coupled CC6 (LP2) state isn't implemented yet */
+		tegra_cpuidle_disable_state(TEGRA_CC6);
+		break;
+
+	case TEGRA20:
+		/* Tegra20 isn't capable to power-off individual CPU cores */
+		tegra_cpuidle_disable_state(TEGRA_C7);
+		break;
+
+	case TEGRA30:
+		/* All three states are usable as declared. */
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return cpuidle_register(&tegra_idle_driver, cpu_possible_mask);
+}
+
+/* Built-in platform driver matched by the "tegra-cpuidle" device name. */
+static struct platform_driver tegra_cpuidle_driver = {
+	.probe = tegra_cpuidle_probe,
+	.driver = {
+		.name = "tegra-cpuidle",
+	},
+};
+builtin_platform_driver(tegra_cpuidle_driver);
diff --git a/drivers/cpuidle/cpuidle-ux500.c b/drivers/cpuidle/cpuidle-ux500.c
new file mode 100644
index 0000000000..f7d778580e
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-ux500.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2012 Linaro : Daniel Lezcano <daniel.lezcano@linaro.org> (IBM)
+ *
+ * Based on the work of Rickard Andersson <rickard.andersson@stericsson.com>
+ * and Jonas Aaberg <jonas.aberg@stericsson.com>.
+ */
+
+#include <linux/init.h>
+#include <linux/cpuidle.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/smp.h>
+#include <linux/mfd/dbx500-prcmu.h>
+#include <linux/platform_data/arm-ux500-pm.h>
+#include <linux/platform_device.h>
+
+#include <asm/cpuidle.h>
+
+/* Number of CPUs currently inside ux500_enter_idle(). */
+static atomic_t master = ATOMIC_INIT(0);
+/* Held by the CPU that is running the retention entry sequence. */
+static DEFINE_SPINLOCK(master_lock);
+
+/*
+ * cpuidle ->enter() callback for the "ApIdle" retention state.
+ *
+ * Each CPU increments 'master' on entry.  The CPU that brings the count up
+ * to num_online_cpus() tries to act as master: it decouples the GIC, checks
+ * that nothing is pending and asks the PRCMU for PRCMU_AP_IDLE.  Every
+ * other CPU, and a would-be master that cannot take master_lock, just
+ * executes WFI.  If any step of the master sequence fails, WFI is skipped
+ * and the GIC is recoupled by hand when needed.
+ *
+ * Always returns @index.
+ */
+static inline int ux500_enter_idle(struct cpuidle_device *dev,
+				   struct cpuidle_driver *drv, int index)
+{
+	int this_cpu = smp_processor_id();
+	bool recouple = false;
+
+	if (atomic_inc_return(&master) == num_online_cpus()) {
+
+		/* With this lock, we prevent the other cpu to exit and enter
+		 * this function again and become the master */
+		if (!spin_trylock(&master_lock))
+			goto wfi;
+
+		/* decouple the gic from the A9 cores */
+		if (prcmu_gic_decouple()) {
+			spin_unlock(&master_lock);
+			goto out;
+		}
+
+		/* If an error occur, we will have to recouple the gic
+		 * manually */
+		recouple = true;
+
+		/* At this state, as the gic is decoupled, if the other
+		 * cpu is in WFI, we have the guarantee it won't be wake
+		 * up, so we can safely go to retention */
+		/* The "other" CPU is 0 when running on a non-zero CPU, else 1. */
+		if (!prcmu_is_cpu_in_wfi(this_cpu ? 0 : 1))
+			goto out;
+
+		/* The prcmu will be in charge of watching the interrupts
+		 * and wake up the cpus */
+		if (prcmu_copy_gic_settings())
+			goto out;
+
+		/* Check in the meantime an interrupt did
+		 * not occur on the gic ... */
+		if (prcmu_gic_pending_irq())
+			goto out;
+
+		/* ... and the prcmu */
+		if (prcmu_pending_irq())
+			goto out;
+
+		/* Go to the retention state, the prcmu will wait for the
+		 * cpu to go WFI and this is what happens after exiting this
+		 * 'master' critical section */
+		if (prcmu_set_power_state(PRCMU_AP_IDLE, true, true))
+			goto out;
+
+		/* When we switch to retention, the prcmu is in charge
+		 * of recoupling the gic automatically */
+		recouple = false;
+
+		spin_unlock(&master_lock);
+	}
+wfi:
+	cpu_do_idle();
+out:
+	atomic_dec(&master);
+
+	/* Only reached with recouple set while master_lock is still held. */
+	if (recouple) {
+		prcmu_gic_recouple();
+		spin_unlock(&master_lock);
+	}
+
+	return index;
+}
+
+/* Two states: plain WFI (index 0, the safe state) and "ApIdle" retention. */
+static struct cpuidle_driver ux500_idle_driver = {
+	.name = "ux500_idle",
+	.owner = THIS_MODULE,
+	.states = {
+		ARM_CPUIDLE_WFI_STATE,
+		{
+			.enter		  = ux500_enter_idle,
+			.exit_latency	  = 70,
+			.target_residency = 260,
+			.flags		  = CPUIDLE_FLAG_TIMER_STOP,
+			.name		  = "ApIdle",
+			.desc		  = "ARM Retention",
+		},
+	},
+	.safe_state_index = 0,
+	.state_count = 2,
+};
+
+/* Enable the PRCMU wakeup sources, then register the ux500 idle driver. */
+static int dbx500_cpuidle_probe(struct platform_device *pdev)
+{
+	/* ARM, RTC and ABB events may wake the system up. */
+	prcmu_enable_wakeups(PRCMU_WAKEUP(ARM) |
+			     PRCMU_WAKEUP(RTC) |
+			     PRCMU_WAKEUP(ABB));
+
+	return cpuidle_register(&ux500_idle_driver, NULL);
+}
+
+/* Built-in platform driver matched by the "db8500-cpuidle" device name. */
+static struct platform_driver dbx500_cpuidle_plat_driver = {
+	.driver = {
+		.name = "db8500-cpuidle",
+	},
+	.probe = dbx500_cpuidle_probe,
+};
+builtin_platform_driver(dbx500_cpuidle_plat_driver);
diff --git a/drivers/cpuidle/cpuidle-zynq.c b/drivers/cpuidle/cpuidle-zynq.c
new file mode 100644
index 0000000000..a79610e723
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-zynq.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012-2013 Xilinx
+ *
+ * CPU idle support for Xilinx Zynq
+ *
+ * based on arch/arm/mach-at91/cpuidle.c
+ *
+ * The cpu idle uses wait-for-interrupt and RAM self refresh in order
+ * to implement two idle states -
+ * #1 wait-for-interrupt
+ * #2 wait-for-interrupt and RAM self refresh
+ *
+ * Maintainer: Michal Simek <michal.simek@xilinx.com>
+ */
+
+#include <linux/init.h>
+#include <linux/cpuidle.h>
+#include <linux/platform_device.h>
+#include <asm/cpuidle.h>
+
+/* Number of entries in zynq_idle_driver.states[]. */
+#define ZYNQ_MAX_STATES		2
+
+/*
+ * ->enter() callback of the "RAM_SR" state.  Currently this only executes
+ * cpu_do_idle(); DDR self refresh is not started yet.  Returns @index.
+ */
+static int zynq_enter_idle(struct cpuidle_device *dev,
+			   struct cpuidle_driver *drv, int index)
+{
+	/* TODO: start DDR self refresh before idling. */
+	cpu_do_idle();
+
+	return index;
+}
+
+/* Two states: plain WFI (index 0, the safe state) and "RAM_SR". */
+static struct cpuidle_driver zynq_idle_driver = {
+	.name = "zynq_idle",
+	.owner = THIS_MODULE,
+	.states = {
+		ARM_CPUIDLE_WFI_STATE,
+		{
+			.enter			= zynq_enter_idle,
+			.exit_latency		= 10,
+			.target_residency	= 10000,
+			.name			= "RAM_SR",
+			.desc			= "WFI and RAM Self Refresh",
+		},
+	},
+	.safe_state_index = 0,
+	.state_count = ZYNQ_MAX_STATES,
+};
+
+/* Probe callback: announce the driver and register its idle states. */
+static int zynq_cpuidle_probe(struct platform_device *pdev)
+{
+	int ret;
+
+	pr_info("Xilinx Zynq CpuIdle Driver started\n");
+
+	ret = cpuidle_register(&zynq_idle_driver, NULL);
+
+	return ret;
+}
+
+/* Built-in platform driver matched by the "cpuidle-zynq" device name. */
+static struct platform_driver zynq_cpuidle_driver = {
+	.driver = {
+		.name = "cpuidle-zynq",
+	},
+	.probe = zynq_cpuidle_probe,
+};
+builtin_platform_driver(zynq_cpuidle_driver);
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
new file mode 100644
index 0000000000..737a026ef5
--- /dev/null
+++ b/drivers/cpuidle/cpuidle.c
@@ -0,0 +1,816 @@
+/*
+ * cpuidle.c - core cpuidle infrastructure
+ *
+ * (C) 2006-2007 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *               Shaohua Li <shaohua.li@intel.com>
+ *               Adam Belay <abelay@novell.com>
+ *
+ * This code is licenced under the GPL.
+ */
+
+#include "linux/percpu-defs.h"
+#include <linux/clockchips.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/idle.h>
+#include <linux/notifier.h>
+#include <linux/pm_qos.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
+#include <linux/ktime.h>
+#include <linux/hrtimer.h>
+#include <linux/module.h>
+#include <linux/suspend.h>
+#include <linux/tick.h>
+#include <linux/mmu_context.h>
+#include <linux/context_tracking.h>
+#include <trace/events/power.h>
+
+#include "cpuidle.h"
+
+/* Per-CPU pointer to the cpuidle device in use on that CPU. */
+DEFINE_PER_CPU(struct cpuidle_device *, cpuidle_devices);
+/* Per-CPU cpuidle device storage. */
+DEFINE_PER_CPU(struct cpuidle_device, cpuidle_dev);
+
+DEFINE_MUTEX(cpuidle_lock);
+LIST_HEAD(cpuidle_detected_devices);
+
+static int enabled_devices;
+/* Non-zero once disable_cpuidle() has been called. */
+static int off __read_mostly;
+/* Checked by cpuidle_not_available(). */
+static int initialized __read_mostly;
+
+/* Return non-zero if cpuidle has been switched off via disable_cpuidle(). */
+int cpuidle_disabled(void)
+{
+	return off;
+}
+/* Switch cpuidle off; nothing in this chunk resets the flag. */
+void disable_cpuidle(void)
+{
+	off = 1;
+}
+
+/*
+ * Tell whether cpuidle cannot be used: it is switched off or not yet
+ * initialized, @drv or @dev is missing, or @dev is not enabled.
+ */
+bool cpuidle_not_available(struct cpuidle_driver *drv,
+			   struct cpuidle_device *dev)
+{
+	if (off || !initialized)
+		return true;
+
+	if (!drv || !dev)
+		return true;
+
+	return !dev->enabled;
+}
+
+/**
+ * cpuidle_play_dead - cpu off-lining
+ *
+ * Calls ->enter_dead() of the deepest state that provides it.
+ *
+ * Return: -ENODEV if there is no driver or no state with ->enter_dead(),
+ * otherwise whatever ->enter_dead() returns.
+ */
+int cpuidle_play_dead(void)
+{
+	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+	int idx;
+
+	if (!drv)
+		return -ENODEV;
+
+	/* Walk from the deepest state towards the shallowest one. */
+	idx = drv->state_count;
+	while (--idx >= 0) {
+		if (!drv->states[idx].enter_dead)
+			continue;
+
+		return drv->states[idx].enter_dead(dev, idx);
+	}
+
+	return -ENODEV;
+}
+
+/*
+ * Pick the usable state with the largest exit latency that does not exceed
+ * @max_latency_ns.  States that are disabled, carry any of
+ * @forbidden_flags, or (for @s2idle) lack ->enter_s2idle are skipped.
+ *
+ * State 0 is never examined and is returned when nothing else qualifies.
+ */
+static int find_deepest_state(struct cpuidle_driver *drv,
+			      struct cpuidle_device *dev,
+			      u64 max_latency_ns,
+			      unsigned int forbidden_flags,
+			      bool s2idle)
+{
+	u64 best_latency_ns = 0;
+	int best = 0;
+	int idx;
+
+	for (idx = 1; idx < drv->state_count; idx++) {
+		struct cpuidle_state *state = &drv->states[idx];
+		bool usable;
+
+		usable = !dev->states_usage[idx].disable &&
+			 state->exit_latency_ns > best_latency_ns &&
+			 state->exit_latency_ns <= max_latency_ns &&
+			 !(state->flags & forbidden_flags) &&
+			 (!s2idle || state->enter_s2idle);
+		if (!usable)
+			continue;
+
+		best_latency_ns = state->exit_latency_ns;
+		best = idx;
+	}
+
+	return best;
+}
+
+/**
+ * cpuidle_use_deepest_state - Set/unset governor override mode.
+ * @latency_limit_ns: Idle state exit latency limit (or no override if 0).
+ *
+ * A nonzero @latency_limit_ns makes the current CPU use the deepest idle
+ * state whose exit latency is within that limit, overriding governors
+ * going forward.  Zero removes the override.
+ */
+void cpuidle_use_deepest_state(u64 latency_limit_ns)
+{
+	struct cpuidle_device *cur;
+
+	/* Preemption is disabled while the current CPU's device is updated. */
+	preempt_disable();
+
+	cur = cpuidle_get_device();
+	if (cur)
+		cur->forced_idle_latency_limit_ns = latency_limit_ns;
+
+	preempt_enable();
+}
+
+/**
+ * cpuidle_find_deepest_state - Find the deepest available idle state.
+ * @drv: cpuidle driver for the given CPU.
+ * @dev: cpuidle device for the given CPU.
+ * @latency_limit_ns: Idle state exit latency limit
+ *
+ * No state flags are forbidden and ->enter_s2idle is not required.
+ *
+ * Return: the index of the deepest available idle state.
+ */
+int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
+			       struct cpuidle_device *dev,
+			       u64 latency_limit_ns)
+{
+	return find_deepest_state(drv, dev, latency_limit_ns, 0, false);
+}
+
+#ifdef CONFIG_SUSPEND
+/*
+ * Enter state @index through its ->enter_s2idle() callback with the tick
+ * frozen, then add the elapsed time to the s2idle statistics of that state.
+ */
+static noinstr void enter_s2idle_proper(struct cpuidle_driver *drv,
+					 struct cpuidle_device *dev, int index)
+{
+	struct cpuidle_state *target_state = &drv->states[index];
+	ktime_t time_start, time_end;
+
+	instrumentation_begin();
+
+	time_start = ns_to_ktime(local_clock_noinstr());
+
+	tick_freeze();
+	/*
+	 * The state used here cannot be a "coupled" one, because the "coupled"
+	 * cpuidle mechanism enables interrupts and doing that with timekeeping
+	 * suspended is generally unsafe.
+	 */
+	stop_critical_timings();
+	/* States without CPUIDLE_FLAG_RCU_IDLE get the RCU-idle calls here. */
+	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
+		ct_cpuidle_enter();
+		/* Annotate away the indirect call */
+		instrumentation_begin();
+	}
+	target_state->enter_s2idle(dev, drv, index);
+	/* The callback must return with interrupts disabled; warn and fix up. */
+	if (WARN_ON_ONCE(!irqs_disabled()))
+		raw_local_irq_disable();
+	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
+		instrumentation_end();
+		ct_cpuidle_exit();
+	}
+	tick_unfreeze();
+	start_critical_timings();
+
+	time_end = ns_to_ktime(local_clock_noinstr());
+
+	/* s2idle_time is accumulated in microseconds. */
+	dev->states_usage[index].s2idle_time += ktime_us_delta(time_end, time_start);
+	dev->states_usage[index].s2idle_usage++;
+	instrumentation_end();
+}
+
+/**
+ * cpuidle_enter_s2idle - Enter an idle state suitable for suspend-to-idle.
+ * @drv: cpuidle driver for the given CPU.
+ * @dev: cpuidle device for the given CPU.
+ *
+ * If there are states with the ->enter_s2idle callback, find the deepest of
+ * them and enter it with frozen tick.
+ *
+ * Return: index of the state that was entered, or 0 if none is suitable.
+ */
+int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+{
+	/*
+	 * Only states providing ->enter_s2idle qualify: the callback
+	 * guarantees that interrupts won't be enabled when it exits, which
+	 * allows the tick to be frozen safely.
+	 */
+	int state = find_deepest_state(drv, dev, U64_MAX, 0, true);
+
+	if (state <= 0)
+		return state;
+
+	enter_s2idle_proper(drv, dev, state);
+	local_irq_enable();
+
+	return state;
+}
+#endif /* CONFIG_SUSPEND */
+
+/**
+ * cpuidle_enter_state - enter the state and update stats
+ * @dev: cpuidle device for this cpu
+ * @drv: cpuidle driver for this cpu
+ * @index: index into the states table in @drv of the state to enter
+ *
+ * Return: the value returned by the state's ->enter() callback; a
+ * non-negative value is used as the index of the state actually entered,
+ * a negative one is counted as a rejection of @index.
+ */
+noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
+				 struct cpuidle_driver *drv,
+				 int index)
+{
+	int entered_state;
+
+	struct cpuidle_state *target_state = &drv->states[index];
+	bool broadcast = !!(target_state->flags & CPUIDLE_FLAG_TIMER_STOP);
+	ktime_t time_start, time_end;
+
+	instrumentation_begin();
+
+	/*
+	 * Tell the time framework to switch to a broadcast timer because our
+	 * local timer will be shut down.  If a local timer is used from another
+	 * CPU as a broadcast timer, this call may fail if it is not available.
+	 */
+	if (broadcast && tick_broadcast_enter()) {
+		/* Fall back to the deepest state that keeps the local timer. */
+		index = find_deepest_state(drv, dev, target_state->exit_latency_ns,
+					   CPUIDLE_FLAG_TIMER_STOP, false);
+		/*
+		 * NOTE(review): find_deepest_state() as defined in this file
+		 * never returns a negative value, so this branch looks
+		 * unreachable - confirm before relying on -EBUSY.
+		 */
+		if (index < 0) {
+			default_idle_call();
+			return -EBUSY;
+		}
+		target_state = &drv->states[index];
+		broadcast = false;
+	}
+
+	if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
+		leave_mm(dev->cpu);
+
+	/* Take note of the planned idle state. */
+	sched_idle_set_state(target_state);
+
+	trace_cpu_idle(index, dev->cpu);
+	time_start = ns_to_ktime(local_clock_noinstr());
+
+	stop_critical_timings();
+	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
+		ct_cpuidle_enter();
+		/* Annotate away the indirect call */
+		instrumentation_begin();
+	}
+
+	/*
+	 * NOTE!!
+	 *
+	 * For cpuidle_state::enter() methods that do *NOT* set
+	 * CPUIDLE_FLAG_RCU_IDLE RCU will be disabled here and these functions
+	 * must be marked either noinstr or __cpuidle.
+	 *
+	 * For cpuidle_state::enter() methods that *DO* set
+	 * CPUIDLE_FLAG_RCU_IDLE this isn't required, but they must mark the
+	 * function calling ct_cpuidle_enter() as noinstr/__cpuidle and all
+	 * functions called within the RCU-idle region.
+	 */
+	entered_state = target_state->enter(dev, drv, index);
+
+	/* ->enter() must return with interrupts disabled; warn and fix up. */
+	if (WARN_ONCE(!irqs_disabled(), "%ps leaked IRQ state", target_state->enter))
+		raw_local_irq_disable();
+
+	if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
+		instrumentation_end();
+		ct_cpuidle_exit();
+	}
+	start_critical_timings();
+
+	sched_clock_idle_wakeup_event();
+	time_end = ns_to_ktime(local_clock_noinstr());
+	trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu);
+
+	/* The cpu is no longer idle or about to enter idle. */
+	sched_idle_set_state(NULL);
+
+	if (broadcast)
+		tick_broadcast_exit();
+
+	/* For coupled states interrupts are not re-enabled here. */
+	if (!cpuidle_state_is_coupled(drv, index))
+		local_irq_enable();
+
+	if (entered_state >= 0) {
+		s64 diff, delay = drv->states[entered_state].exit_latency_ns;
+		int i;
+
+		/*
+		 * Update cpuidle counters
+		 * This can be moved to within driver enter routine,
+		 * but that results in multiple copies of same code.
+		 */
+		diff = ktime_sub(time_end, time_start);
+
+		dev->last_residency_ns = diff;
+		dev->states_usage[entered_state].time_ns += diff;
+		dev->states_usage[entered_state].usage++;
+
+		/*
+		 * "above": the state was too deep for the observed residency
+		 * and an enabled shallower state exists.
+		 */
+		if (diff < drv->states[entered_state].target_residency_ns) {
+			for (i = entered_state - 1; i >= 0; i--) {
+				if (dev->states_usage[i].disable)
+					continue;
+
+				/* Shallower states are enabled, so update. */
+				dev->states_usage[entered_state].above++;
+				trace_cpu_idle_miss(dev->cpu, entered_state, false);
+				break;
+			}
+		} else if (diff > delay) {
+			/* Only the first enabled deeper state is considered. */
+			for (i = entered_state + 1; i < drv->state_count; i++) {
+				if (dev->states_usage[i].disable)
+					continue;
+
+				/*
+				 * Update if a deeper state would have been a
+				 * better match for the observed idle duration.
+				 */
+				if (diff - delay >= drv->states[i].target_residency_ns) {
+					dev->states_usage[entered_state].below++;
+					trace_cpu_idle_miss(dev->cpu, entered_state, true);
+				}
+
+				break;
+			}
+		}
+	} else {
+		/* ->enter() failed: no residency, count a rejection. */
+		dev->last_residency_ns = 0;
+		dev->states_usage[index].rejected++;
+	}
+
+	instrumentation_end();
+
+	return entered_state;
+}
+
+/**
+ * cpuidle_select - let the current governor pick an idle state
+ *
+ * @drv: the cpuidle driver
+ * @dev: the cpuidle device
+ * @stop_tick: indication on whether or not to stop the tick
+ *
+ * Forwards the request to the ->select() callback of cpuidle_curr_governor and
+ * returns its result, which is the index of the chosen idle state and must not
+ * be negative.
+ *
+ * The memory location pointed to by @stop_tick is expected to be written the
+ * 'false' boolean value if the scheduler tick should not be stopped before
+ * entering the returned state.
+ */
+int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		   bool *stop_tick)
+{
+	struct cpuidle_governor *gov = cpuidle_curr_governor;
+
+	return gov->select(drv, dev, stop_tick);
+}
+
+/**
+ * cpuidle_enter - enter the idle state at the given index
+ *
+ * @drv:   the cpuidle driver tied with the cpu
+ * @dev:   the cpuidle device
+ * @index: the index in the idle state table
+ *
+ * Dispatches to cpuidle_enter_state_coupled() when the state at @index is a
+ * coupled state and to cpuidle_enter_state() otherwise.
+ *
+ * Returns the value of the selected enter routine: the index of the idle
+ * state, < 0 in case of error. The error code depends on the backend driver.
+ */
+int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		  int index)
+{
+	int entered;
+
+	/*
+	 * Store the next hrtimer, which is either the next tick or the next
+	 * timer event, whichever expires first. For this value to be useful
+	 * to consumers outside cpuidle, we rely on the governor's ->select()
+	 * callback having decided whether to stop the tick or not.
+	 */
+	WRITE_ONCE(dev->next_hrtimer, tick_nohz_get_next_hrtimer());
+
+	entered = cpuidle_state_is_coupled(drv, index) ?
+		  cpuidle_enter_state_coupled(dev, drv, index) :
+		  cpuidle_enter_state(dev, drv, index);
+
+	/* Back from the enter routine: clear the recorded timer. */
+	WRITE_ONCE(dev->next_hrtimer, 0);
+
+	return entered;
+}
+
+/**
+ * cpuidle_reflect - tell the underlying governor what was the state
+ * we were in
+ *
+ * @dev  : the cpuidle device
+ * @index: the index in the idle state table
+ *
+ * Nothing is reported when the current governor has no ->reflect() callback
+ * or when @index is negative.
+ */
+void cpuidle_reflect(struct cpuidle_device *dev, int index)
+{
+	if (!cpuidle_curr_governor->reflect)
+		return;
+
+	if (index < 0)
+		return;
+
+	cpuidle_curr_governor->reflect(dev, index);
+}
+
+/*
+ * Min polling interval of 10usec is a guess. It is assuming that
+ * for most users, the time for a single ping-pong workload like
+ * perf bench pipe would generally complete within 10usec but
+ * this is hardware dependent. Actual time can be estimated with
+ *
+ * perf bench sched pipe -l 10000
+ *
+ * Run multiple times to avoid cpufreq effects.
+ */
+#define CPUIDLE_POLL_MIN 10000
+#define CPUIDLE_POLL_MAX (TICK_NSEC / 16)
+
+/**
+ * cpuidle_poll_time - return amount of time to poll for,
+ * governors can override dev->poll_limit_ns if necessary
+ *
+ * @drv:   the cpuidle driver tied with the cpu
+ * @dev:   the cpuidle device
+ *
+ * A non-zero dev->poll_limit_ns is returned as is. Otherwise the limit is
+ * taken from the target residency of the first enabled state after state 0
+ * whose residency is at least CPUIDLE_POLL_MIN, capped at CPUIDLE_POLL_MAX,
+ * or is CPUIDLE_POLL_MAX when there is no such state. The computed value is
+ * stored in dev->poll_limit_ns before it is returned.
+ */
+__cpuidle u64 cpuidle_poll_time(struct cpuidle_driver *drv,
+		      struct cpuidle_device *dev)
+{
+	u64 poll_ns = CPUIDLE_POLL_MAX;
+	int idx;
+
+	BUILD_BUG_ON(CPUIDLE_POLL_MIN > CPUIDLE_POLL_MAX);
+
+	/* An already known limit wins. */
+	if (dev->poll_limit_ns)
+		return dev->poll_limit_ns;
+
+	for (idx = 1; idx < drv->state_count; idx++) {
+		u64 residency_ns = drv->states[idx].target_residency_ns;
+
+		if (dev->states_usage[idx].disable ||
+		    residency_ns < CPUIDLE_POLL_MIN)
+			continue;
+
+		poll_ns = min_t(u64, residency_ns, CPUIDLE_POLL_MAX);
+		break;
+	}
+
+	dev->poll_limit_ns = poll_ns;
+
+	return dev->poll_limit_ns;
+}
+
+/**
+ * cpuidle_install_idle_handler - installs the cpuidle idle loop handler
+ *
+ * Sets 'initialized' if at least one device is enabled and does nothing
+ * otherwise.
+ */
+void cpuidle_install_idle_handler(void)
+{
+	if (!enabled_devices)
+		return;
+
+	/* Make sure all changes finished before we switch to new idle */
+	smp_wmb();
+	initialized = 1;
+}
+
+/**
+ * cpuidle_uninstall_idle_handler - uninstalls the cpuidle idle loop handler
+ *
+ * If any device is enabled, clears 'initialized' and wakes up all idle CPUs.
+ * In every case, waits for an RCU grace period before returning.
+ */
+void cpuidle_uninstall_idle_handler(void)
+{
+	if (enabled_devices) {
+		initialized = 0;
+		/* NOTE(review): presumably so idle CPUs notice the cleared flag */
+		wake_up_all_idle_cpus();
+	}
+
+	/*
+	 * Make sure external observers (such as the scheduler)
+	 * are done looking at pointed idle states.
+	 */
+	synchronize_rcu();
+}
+
+/**
+ * cpuidle_pause_and_lock - temporarily disables CPUIDLE
+ *
+ * Acquires cpuidle_lock and then uninstalls the idle handler. The mutex is
+ * still held when this function returns.
+ */
+void cpuidle_pause_and_lock(void)
+{
+	mutex_lock(&cpuidle_lock);
+	cpuidle_uninstall_idle_handler();
+}
+
+EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock);
+
+/**
+ * cpuidle_resume_and_unlock - resumes CPUIDLE operation
+ *
+ * Reinstalls the idle handler and then releases cpuidle_lock, which the
+ * caller must be holding.
+ */
+void cpuidle_resume_and_unlock(void)
+{
+	cpuidle_install_idle_handler();
+	mutex_unlock(&cpuidle_lock);
+}
+
+EXPORT_SYMBOL_GPL(cpuidle_resume_and_unlock);
+
+/*
+ * Currently used in suspend/resume path to suspend cpuidle.
+ * Uninstalls the idle handler with cpuidle_lock held; the lock is released
+ * again before returning.
+ */
+void cpuidle_pause(void)
+{
+	mutex_lock(&cpuidle_lock);
+	cpuidle_uninstall_idle_handler();
+	mutex_unlock(&cpuidle_lock);
+}
+
+/*
+ * Currently used in suspend/resume path to resume cpuidle.
+ * Reinstalls the idle handler with cpuidle_lock held; the lock is released
+ * again before returning.
+ */
+void cpuidle_resume(void)
+{
+	mutex_lock(&cpuidle_lock);
+	cpuidle_install_idle_handler();
+	mutex_unlock(&cpuidle_lock);
+}
+
+/**
+ * cpuidle_enable_device - enables idle PM for a CPU
+ * @dev: the CPU
+ *
+ * This function must be called between cpuidle_pause_and_lock and
+ * cpuidle_resume_and_unlock when used externally.
+ *
+ * Returns 0 on success or if @dev is already enabled, -EINVAL if @dev is NULL
+ * or not registered, -EIO if there is no current governor or no driver for
+ * @dev, or the error reported by cpuidle_add_device_sysfs() or by the
+ * governor's ->enable() callback.
+ */
+int cpuidle_enable_device(struct cpuidle_device *dev)
+{
+	struct cpuidle_driver *drv;
+	int err;
+
+	if (!dev)
+		return -EINVAL;
+
+	if (dev->enabled)
+		return 0;
+
+	if (!cpuidle_curr_governor)
+		return -EIO;
+
+	drv = cpuidle_get_cpu_driver(dev);
+	if (!drv)
+		return -EIO;
+
+	if (!dev->registered)
+		return -EINVAL;
+
+	err = cpuidle_add_device_sysfs(dev);
+	if (err)
+		return err;
+
+	if (cpuidle_curr_governor->enable) {
+		err = cpuidle_curr_governor->enable(drv, dev);
+		if (err) {
+			/* Undo the sysfs setup done above. */
+			cpuidle_remove_device_sysfs(dev);
+			return err;
+		}
+	}
+
+	/* Write barrier between the setup above and the 'enabled' store. */
+	smp_wmb();
+	dev->enabled = 1;
+	enabled_devices++;
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(cpuidle_enable_device);
+
+/**
+ * cpuidle_disable_device - disables idle PM for a CPU
+ * @dev: the CPU
+ *
+ * This function must be called between cpuidle_pause_and_lock and
+ * cpuidle_resume_and_unlock when used externally.
+ *
+ * Does nothing if @dev is NULL or not enabled, or if there is no driver for
+ * @dev or no current governor.
+ */
+void cpuidle_disable_device(struct cpuidle_device *dev)
+{
+	struct cpuidle_driver *drv;
+
+	if (!dev || !dev->enabled)
+		return;
+
+	drv = cpuidle_get_cpu_driver(dev);
+	if (!drv || !cpuidle_curr_governor)
+		return;
+
+	dev->enabled = 0;
+
+	/* Let the governor clean up before the sysfs entries go away. */
+	if (cpuidle_curr_governor->disable)
+		cpuidle_curr_governor->disable(drv, dev);
+
+	cpuidle_remove_device_sysfs(dev);
+	enabled_devices--;
+}
+
+EXPORT_SYMBOL_GPL(cpuidle_disable_device);
+
+/*
+ * Take @dev off its device list, clear the per-CPU cpuidle_devices pointer for
+ * dev->cpu, drop the reference on the owner module of the device's driver and
+ * mark @dev as not registered.
+ *
+ * NOTE(review): the driver returned for @dev is dereferenced unchecked, so a
+ * driver is assumed to be registered for dev->cpu - confirm against callers.
+ */
+static void __cpuidle_unregister_device(struct cpuidle_device *dev)
+{
+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+
+	list_del(&dev->device_list);
+	per_cpu(cpuidle_devices, dev->cpu) = NULL;
+	module_put(drv->owner);
+
+	dev->registered = 0;
+}
+
+/* Reset the per-state usage data and the timing fields of @dev. */
+static void __cpuidle_device_init(struct cpuidle_device *dev)
+{
+	dev->next_hrtimer = 0;
+	dev->last_residency_ns = 0;
+	memset(dev->states_usage, 0, sizeof(dev->states_usage));
+}
+
+/**
+ * __cpuidle_register_device - internal register function called before register
+ * and enable routines
+ * @dev: the cpu
+ *
+ * cpuidle_lock mutex must be held before this is called
+ *
+ * Takes a reference on the driver's owner module, copies the driver-level
+ * UNUSABLE/OFF state flags into the per-device disable masks, publishes @dev
+ * in the per-CPU cpuidle_devices pointer and the detected devices list, and
+ * registers it with the coupled idle state code.
+ *
+ * Returns 0 on success, -EINVAL if no driver is registered for dev->cpu or
+ * its owner module cannot be pinned, or the error returned by
+ * cpuidle_coupled_register_device(), in which case the steps above are undone.
+ */
+static int __cpuidle_register_device(struct cpuidle_device *dev)
+{
+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+	int i, ret;
+
+	/* cpuidle_get_cpu_driver() returns NULL when no driver is registered. */
+	if (!drv)
+		return -EINVAL;
+
+	if (!try_module_get(drv->owner))
+		return -EINVAL;
+
+	for (i = 0; i < drv->state_count; i++) {
+		if (drv->states[i].flags & CPUIDLE_FLAG_UNUSABLE)
+			dev->states_usage[i].disable |= CPUIDLE_STATE_DISABLED_BY_DRIVER;
+
+		if (drv->states[i].flags & CPUIDLE_FLAG_OFF)
+			dev->states_usage[i].disable |= CPUIDLE_STATE_DISABLED_BY_USER;
+	}
+
+	per_cpu(cpuidle_devices, dev->cpu) = dev;
+	list_add(&dev->device_list, &cpuidle_detected_devices);
+
+	ret = cpuidle_coupled_register_device(dev);
+	if (ret)
+		__cpuidle_unregister_device(dev);
+	else
+		dev->registered = 1;
+
+	return ret;
+}
+
+/**
+ * cpuidle_register_device - registers a CPU's idle PM feature
+ * @dev: the cpu
+ *
+ * Runs under cpuidle_lock. Returns 0 on success, -EINVAL if @dev is NULL,
+ * -EBUSY if @dev is already registered, or the error of the step that failed;
+ * the steps completed before a failure are undone.
+ */
+int cpuidle_register_device(struct cpuidle_device *dev)
+{
+	int ret;
+
+	if (!dev)
+		return -EINVAL;
+
+	mutex_lock(&cpuidle_lock);
+
+	if (dev->registered) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	__cpuidle_device_init(dev);
+
+	ret = __cpuidle_register_device(dev);
+	if (ret)
+		goto out_unlock;
+
+	ret = cpuidle_add_sysfs(dev);
+	if (ret)
+		goto out_unregister;
+
+	ret = cpuidle_enable_device(dev);
+	if (ret)
+		goto out_sysfs;
+
+	cpuidle_install_idle_handler();
+	goto out_unlock;
+
+out_sysfs:
+	cpuidle_remove_sysfs(dev);
+out_unregister:
+	__cpuidle_unregister_device(dev);
+out_unlock:
+	mutex_unlock(&cpuidle_lock);
+
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(cpuidle_register_device);
+
+/**
+ * cpuidle_unregister_device - unregisters a CPU's idle PM feature
+ * @dev: the cpu
+ *
+ * Does nothing if @dev is NULL or not registered. Otherwise the device is
+ * disabled, removed from sysfs and unregistered while cpuidle is paused and
+ * cpuidle_lock is held.
+ */
+void cpuidle_unregister_device(struct cpuidle_device *dev)
+{
+	if (!dev || dev->registered == 0)
+		return;
+
+	/* Takes cpuidle_lock; released by cpuidle_resume_and_unlock() below. */
+	cpuidle_pause_and_lock();
+
+	cpuidle_disable_device(dev);
+
+	cpuidle_remove_sysfs(dev);
+
+	__cpuidle_unregister_device(dev);
+
+	cpuidle_coupled_unregister_device(dev);
+
+	cpuidle_resume_and_unlock();
+}
+
+EXPORT_SYMBOL_GPL(cpuidle_unregister_device);
+
+/**
+ * cpuidle_unregister: unregister a driver and the devices. This function
+ * can be used only if the driver has been previously registered through
+ * the cpuidle_register function.
+ *
+ * @drv: a valid pointer to a struct cpuidle_driver
+ *
+ * The per-CPU cpuidle_dev of every CPU in the driver's cpumask is
+ * unregistered first, then the driver itself.
+ */
+void cpuidle_unregister(struct cpuidle_driver *drv)
+{
+	int cpu;
+
+	for_each_cpu(cpu, drv->cpumask)
+		cpuidle_unregister_device(&per_cpu(cpuidle_dev, cpu));
+
+	cpuidle_unregister_driver(drv);
+}
+EXPORT_SYMBOL_GPL(cpuidle_unregister);
+
+/**
+ * cpuidle_register: registers the driver and the cpu devices with the
+ * coupled_cpus passed as parameter. This function covers the initialization
+ * pattern common to the arch specific drivers. The devices are globally
+ * defined in this file.
+ *
+ * @drv         : a valid pointer to a struct cpuidle_driver
+ * @coupled_cpus: a cpumask for the coupled states
+ *
+ * If a device fails to register, everything registered so far is torn down
+ * again through cpuidle_unregister().
+ *
+ * Returns 0 on success, < 0 otherwise
+ */
+int cpuidle_register(struct cpuidle_driver *drv,
+		     const struct cpumask *const coupled_cpus)
+{
+	struct cpuidle_device *device;
+	int cpu, ret;
+
+	ret = cpuidle_register_driver(drv);
+	if (ret) {
+		pr_err("failed to register cpuidle driver\n");
+		return ret;
+	}
+
+	for_each_cpu(cpu, drv->cpumask) {
+		device = &per_cpu(cpuidle_dev, cpu);
+		device->cpu = cpu;
+
+#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
+		/*
+		 * On multiplatform for ARM, the coupled idle states could be
+		 * enabled in the kernel even if the cpuidle driver does not
+		 * use it. Note, coupled_cpus is a struct copy.
+		 */
+		if (coupled_cpus)
+			device->coupled_cpus = *coupled_cpus;
+#endif
+		ret = cpuidle_register_device(device);
+		if (ret) {
+			pr_err("Failed to register cpuidle device for cpu%d\n", cpu);
+			cpuidle_unregister(drv);
+			break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cpuidle_register);
+
+/**
+ * cpuidle_init - core initializer
+ *
+ * Returns -ENODEV if cpuidle is disabled, otherwise the result of
+ * cpuidle_add_interface().
+ */
+static int __init cpuidle_init(void)
+{
+	return cpuidle_disabled() ? -ENODEV : cpuidle_add_interface();
+}
+
+module_param(off, int, 0444);
+module_param_string(governor, param_governor, CPUIDLE_NAME_LEN, 0444);
+core_initcall(cpuidle_init);
diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h
new file mode 100644
index 0000000000..52701d9588
--- /dev/null
+++ b/drivers/cpuidle/cpuidle.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * cpuidle.h - The internal header file
+ */
+
+#ifndef __DRIVER_CPUIDLE_H
+#define __DRIVER_CPUIDLE_H
+
+/* For internal use only */
+extern char param_governor[];
+extern struct cpuidle_governor *cpuidle_curr_governor;
+extern struct cpuidle_governor *cpuidle_prev_governor;
+extern struct list_head cpuidle_governors;
+extern struct list_head cpuidle_detected_devices;
+extern struct mutex cpuidle_lock;
+extern spinlock_t cpuidle_driver_lock;
+extern int cpuidle_disabled(void);
+extern int cpuidle_enter_state(struct cpuidle_device *dev,
+		struct cpuidle_driver *drv, int next_state);
+
+/* idle loop */
+extern void cpuidle_install_idle_handler(void);
+extern void cpuidle_uninstall_idle_handler(void);
+
+/* governors */
+extern struct cpuidle_governor *cpuidle_find_governor(const char *str);
+extern int cpuidle_switch_governor(struct cpuidle_governor *gov);
+
+/* sysfs */
+
+/* Forward declaration: only a pointer to struct device is used below. */
+struct device;
+
+extern int cpuidle_add_interface(void);
+extern void cpuidle_remove_interface(struct device *dev);
+extern int cpuidle_add_device_sysfs(struct cpuidle_device *device);
+extern void cpuidle_remove_device_sysfs(struct cpuidle_device *device);
+extern int cpuidle_add_sysfs(struct cpuidle_device *dev);
+extern void cpuidle_remove_sysfs(struct cpuidle_device *dev);
+
+/* coupled idle states */
+#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
+bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state);
+int cpuidle_coupled_state_verify(struct cpuidle_driver *drv);
+int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
+		struct cpuidle_driver *drv, int next_state);
+int cpuidle_coupled_register_device(struct cpuidle_device *dev);
+void cpuidle_coupled_unregister_device(struct cpuidle_device *dev);
+#else
+/*
+ * Stubs used when CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED is not set: no state is
+ * ever reported as coupled, state verification and device registration
+ * succeed, unregistration does nothing, and entering a coupled state fails
+ * with -1.
+ */
+static inline
+bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state)
+{
+	return false;
+}
+
+static inline int cpuidle_coupled_state_verify(struct cpuidle_driver *drv)
+{
+	return 0;
+}
+
+static inline int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
+		struct cpuidle_driver *drv, int next_state)
+{
+	return -1;
+}
+
+static inline int cpuidle_coupled_register_device(struct cpuidle_device *dev)
+{
+	return 0;
+}
+
+static inline void cpuidle_coupled_unregister_device(struct cpuidle_device *dev)
+{
+}
+#endif
+
+#endif /* __DRIVER_CPUIDLE_H */
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
new file mode 100644
index 0000000000..d9cda7f6cc
--- /dev/null
+++ b/drivers/cpuidle/driver.c
@@ -0,0 +1,391 @@
+/*
+ * driver.c - driver support
+ *
+ * (C) 2006-2007 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *               Shaohua Li <shaohua.li@intel.com>
+ *               Adam Belay <abelay@novell.com>
+ *
+ * This code is licenced under the GPL.
+ */
+
+#include <linux/mutex.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/idle.h>
+#include <linux/cpuidle.h>
+#include <linux/cpumask.h>
+#include <linux/tick.h>
+#include <linux/cpu.h>
+
+#include "cpuidle.h"
+
+/* Taken around driver (un)registration and state disabling in this file. */
+DEFINE_SPINLOCK(cpuidle_driver_lock);
+
+#ifdef CONFIG_CPU_IDLE_MULTIPLE_DRIVERS
+
+/* Driver currently assigned to each CPU, NULL if there is none. */
+static DEFINE_PER_CPU(struct cpuidle_driver *, cpuidle_drivers);
+
+/**
+ * __cpuidle_get_cpu_driver - return the cpuidle driver tied to a CPU.
+ * @cpu: the CPU handled by the driver
+ *
+ * Returns a pointer to struct cpuidle_driver or NULL if no driver has been
+ * registered for @cpu.
+ */
+static struct cpuidle_driver *__cpuidle_get_cpu_driver(int cpu)
+{
+	return per_cpu(cpuidle_drivers, cpu);
+}
+
+/**
+ * __cpuidle_unset_driver - unset per CPU driver variables.
+ * @drv: a valid pointer to a struct cpuidle_driver
+ *
+ * For each CPU in the driver's CPU mask, clear the per CPU driver variable,
+ * but only where it currently points to @drv. CPUs that are assigned to a
+ * different driver are left untouched.
+ */
+static inline void __cpuidle_unset_driver(struct cpuidle_driver *drv)
+{
+	int cpu;
+
+	for_each_cpu(cpu, drv->cpumask) {
+		if (__cpuidle_get_cpu_driver(cpu) == drv)
+			per_cpu(cpuidle_drivers, cpu) = NULL;
+	}
+}
+
+/**
+ * __cpuidle_set_driver - set per CPU driver variables for the given driver.
+ * @drv: a valid pointer to a struct cpuidle_driver
+ *
+ * Returns 0 on success, -EBUSY if any CPU in the cpumask already has a driver
+ * different from @drv. In the -EBUSY case no per CPU variable is modified.
+ */
+static inline int __cpuidle_set_driver(struct cpuidle_driver *drv)
+{
+	int cpu;
+
+	/* First pass: refuse if any CPU is already tied to another driver. */
+	for_each_cpu(cpu, drv->cpumask) {
+		struct cpuidle_driver *bound = __cpuidle_get_cpu_driver(cpu);
+
+		if (bound && bound != drv)
+			return -EBUSY;
+	}
+
+	/* Second pass: tie every CPU in the mask to @drv. */
+	for_each_cpu(cpu, drv->cpumask)
+		per_cpu(cpuidle_drivers, cpu) = drv;
+
+	return 0;
+}
+
+#else
+
+/* The single registered driver, NULL if there is none. */
+static struct cpuidle_driver *cpuidle_curr_driver;
+
+/**
+ * __cpuidle_get_cpu_driver - return the global cpuidle driver pointer.
+ * @cpu: ignored without the multiple driver support
+ *
+ * Return a pointer to a struct cpuidle_driver object or NULL if no driver was
+ * previously registered.
+ */
+static inline struct cpuidle_driver *__cpuidle_get_cpu_driver(int cpu)
+{
+	return cpuidle_curr_driver;
+}
+
+/**
+ * __cpuidle_set_driver - assign the global cpuidle driver variable.
+ * @drv: pointer to a struct cpuidle_driver object
+ *
+ * Returns 0 on success, -EBUSY if a driver is already registered (this
+ * includes @drv itself).
+ */
+static inline int __cpuidle_set_driver(struct cpuidle_driver *drv)
+{
+	if (!cpuidle_curr_driver) {
+		cpuidle_curr_driver = drv;
+		return 0;
+	}
+
+	return -EBUSY;
+}
+
+/**
+ * __cpuidle_unset_driver - unset the global cpuidle driver variable.
+ * @drv: a pointer to a struct cpuidle_driver
+ *
+ * Reset the global cpuidle variable to NULL, unless @drv is not the
+ * registered driver, in which case nothing is changed.
+ */
+static inline void __cpuidle_unset_driver(struct cpuidle_driver *drv)
+{
+	if (drv != cpuidle_curr_driver)
+		return;
+
+	cpuidle_curr_driver = NULL;
+}
+
+#endif
+
+/**
+ * cpuidle_setup_broadcast_timer - enable/disable the broadcast timer on a cpu
+ * @arg: a void pointer used to match the SMP cross call API
+ *
+ * A NULL @arg disables the broadcast, any other value enables it.
+ *
+ * This function is executed per CPU by an SMP cross call.  It's not
+ * supposed to be called directly.
+ */
+static void cpuidle_setup_broadcast_timer(void *arg)
+{
+	if (!arg) {
+		tick_broadcast_disable();
+		return;
+	}
+
+	tick_broadcast_enable();
+}
+
+/**
+ * __cpuidle_driver_init - initialize the driver's internal data
+ * @drv: a valid pointer to a struct cpuidle_driver
+ *
+ * Defaults the driver's cpumask to all possible CPUs, sets drv->bctimer if
+ * any state has CPUIDLE_FLAG_TIMER_STOP, and makes the microsecond and
+ * nanosecond variants of each state's target residency and exit latency
+ * agree with each other.
+ */
+static void __cpuidle_driver_init(struct cpuidle_driver *drv)
+{
+	int i;
+
+	/*
+	 * Use all possible CPUs as the default, because if the kernel boots
+	 * with some CPUs offline and then we online one of them, the CPU
+	 * notifier has to know which driver to assign.
+	 */
+	if (!drv->cpumask)
+		drv->cpumask = (struct cpumask *)cpu_possible_mask;
+
+	for (i = 0; i < drv->state_count; i++) {
+		struct cpuidle_state *s = &drv->states[i];
+
+		/*
+		 * Look for the timer stop flag in the different states and if
+		 * it is found, indicate that the broadcast timer has to be set
+		 * up.
+		 */
+		if (s->flags & CPUIDLE_FLAG_TIMER_STOP)
+			drv->bctimer = 1;
+
+		/*
+		 * The core will use the target residency and exit latency
+		 * values in nanoseconds, but allow drivers to provide them in
+		 * microseconds too.
+		 *
+		 * Widen the microsecond values to 64 bits before scaling them:
+		 * they are 32-bit and NSEC_PER_USEC is a long, so the product
+		 * could otherwise wrap where long is 32 bits wide.
+		 */
+		if (s->target_residency > 0)
+			s->target_residency_ns = (s64)s->target_residency * NSEC_PER_USEC;
+		else if (s->target_residency_ns < 0)
+			s->target_residency_ns = 0;
+		else
+			s->target_residency = div_u64(s->target_residency_ns, NSEC_PER_USEC);
+
+		if (s->exit_latency > 0)
+			s->exit_latency_ns = (s64)s->exit_latency * NSEC_PER_USEC;
+		else if (s->exit_latency_ns < 0)
+			s->exit_latency_ns =  0;
+		else
+			s->exit_latency = div_u64(s->exit_latency_ns, NSEC_PER_USEC);
+	}
+}
+
+/**
+ * __cpuidle_register_driver: register the driver
+ * @drv: a valid pointer to a struct cpuidle_driver
+ *
+ * Sanity check the driver, initialize it, assign it to the global cpuidle
+ * driver variable(s) and, if any of its states shuts down the local timer,
+ * enable the broadcast timer on the CPUs in the driver's cpumask.
+ *
+ * Returns 0 on success, a negative error code otherwise:
+ *  * -EINVAL if the driver pointer is NULL or no idle states are available
+ *  * -ENODEV if the cpuidle framework is disabled
+ *  * -EBUSY if the driver is already assigned to the global variable(s)
+ *  * the error returned by cpuidle_coupled_state_verify()
+ */
+static int __cpuidle_register_driver(struct cpuidle_driver *drv)
+{
+	int err;
+
+	if (!drv || !drv->state_count)
+		return -EINVAL;
+
+	err = cpuidle_coupled_state_verify(drv);
+	if (err)
+		return err;
+
+	if (cpuidle_disabled())
+		return -ENODEV;
+
+	__cpuidle_driver_init(drv);
+
+	err = __cpuidle_set_driver(drv);
+	if (err)
+		return err;
+
+	if (!drv->bctimer)
+		return 0;
+
+	on_each_cpu_mask(drv->cpumask, cpuidle_setup_broadcast_timer,
+			 (void *)1, 1);
+
+	return 0;
+}
+
+/**
+ * __cpuidle_unregister_driver - unregister the driver
+ * @drv: a valid pointer to a struct cpuidle_driver
+ *
+ * Disable the timer broadcast notification mechanism on the CPUs in the
+ * driver's cpumask if it was in use, then reset the global cpuidle driver
+ * variable(s).  No check is made here that the driver is no longer in use.
+ */
+static void __cpuidle_unregister_driver(struct cpuidle_driver *drv)
+{
+	if (drv->bctimer) {
+		drv->bctimer = 0;
+		/* A NULL argument makes the callback disable the broadcast. */
+		on_each_cpu_mask(drv->cpumask, cpuidle_setup_broadcast_timer,
+				 NULL, 1);
+	}
+
+	__cpuidle_unset_driver(drv);
+}
+
+/**
+ * cpuidle_register_driver - registers a driver
+ * @drv: a pointer to a valid struct cpuidle_driver
+ *
+ * Register the driver under a lock to prevent concurrent attempts to
+ * [un]register the driver from occurring at the same time.
+ *
+ * After a successful registration, if no governor was given through
+ * param_governor, the driver names one and the driver is the one returned by
+ * cpuidle_get_driver(), switch to the driver's governor and remember the
+ * previous one in cpuidle_prev_governor.
+ *
+ * Returns 0 on success, a negative error code (returned by
+ * __cpuidle_register_driver()) otherwise.
+ */
+int cpuidle_register_driver(struct cpuidle_driver *drv)
+{
+	struct cpuidle_governor *gov;
+	int ret;
+
+	spin_lock(&cpuidle_driver_lock);
+	ret = __cpuidle_register_driver(drv);
+	spin_unlock(&cpuidle_driver_lock);
+
+	if (ret || strlen(param_governor) || !drv->governor ||
+	    cpuidle_get_driver() != drv)
+		return ret;
+
+	mutex_lock(&cpuidle_lock);
+	gov = cpuidle_find_governor(drv->governor);
+	if (gov) {
+		cpuidle_prev_governor = cpuidle_curr_governor;
+		/* Nothing to switch back to later if the switch failed. */
+		if (cpuidle_switch_governor(gov) < 0)
+			cpuidle_prev_governor = NULL;
+	}
+	mutex_unlock(&cpuidle_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cpuidle_register_driver);
+
+/**
+ * cpuidle_unregister_driver - unregisters a driver
+ * @drv: a pointer to a valid struct cpuidle_driver
+ *
+ * Unregisters the cpuidle driver under a lock to prevent concurrent attempts
+ * to [un]register the driver from occurring at the same time.  @drv has to
+ * match the currently registered driver.
+ *
+ * If @drv was the driver returned by cpuidle_get_driver() and a previous
+ * governor was recorded, switch back to that governor.
+ */
+void cpuidle_unregister_driver(struct cpuidle_driver *drv)
+{
+	bool was_current = (cpuidle_get_driver() == drv);
+
+	spin_lock(&cpuidle_driver_lock);
+	__cpuidle_unregister_driver(drv);
+	spin_unlock(&cpuidle_driver_lock);
+
+	if (was_current) {
+		mutex_lock(&cpuidle_lock);
+		if (cpuidle_prev_governor &&
+		    !cpuidle_switch_governor(cpuidle_prev_governor))
+			cpuidle_prev_governor = NULL;
+		mutex_unlock(&cpuidle_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(cpuidle_unregister_driver);
+
+/**
+ * cpuidle_get_driver - return the driver tied to the current CPU.
+ *
+ * The lookup is done between get_cpu() and put_cpu().
+ *
+ * Returns a struct cpuidle_driver pointer, or NULL if no driver is registered.
+ */
+struct cpuidle_driver *cpuidle_get_driver(void)
+{
+	struct cpuidle_driver *drv = __cpuidle_get_cpu_driver(get_cpu());
+
+	put_cpu();
+
+	return drv;
+}
+EXPORT_SYMBOL_GPL(cpuidle_get_driver);
+
+/**
+ * cpuidle_get_cpu_driver - return the driver registered for a CPU.
+ * @dev: a pointer to a struct cpuidle_device, may be NULL
+ *
+ * Returns a struct cpuidle_driver pointer, or NULL if @dev is NULL or no
+ * driver is registered for the CPU associated with @dev.
+ */
+struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev)
+{
+	return dev ? __cpuidle_get_cpu_driver(dev->cpu) : NULL;
+}
+EXPORT_SYMBOL_GPL(cpuidle_get_cpu_driver);
+
+/**
+ * cpuidle_driver_state_disabled - Disable or enable an idle state
+ * @drv: cpuidle driver owning the state
+ * @idx: State index
+ * @disable: Whether or not to disable the state
+ *
+ * Sets or clears CPUIDLE_STATE_DISABLED_BY_DRIVER for state @idx on every
+ * registered device of the CPUs in the driver's cpumask.  If the driver has
+ * no cpumask, the state is flagged CPUIDLE_FLAG_UNUSABLE in the driver
+ * instead, whatever the value of @disable.
+ */
+void cpuidle_driver_state_disabled(struct cpuidle_driver *drv, int idx,
+				 bool disable)
+{
+	unsigned int cpu;
+
+	mutex_lock(&cpuidle_lock);
+	spin_lock(&cpuidle_driver_lock);
+
+	if (drv->cpumask) {
+		for_each_cpu(cpu, drv->cpumask) {
+			struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
+
+			if (!dev)
+				continue;
+
+			if (disable)
+				dev->states_usage[idx].disable |= CPUIDLE_STATE_DISABLED_BY_DRIVER;
+			else
+				dev->states_usage[idx].disable &= ~CPUIDLE_STATE_DISABLED_BY_DRIVER;
+		}
+	} else {
+		drv->states[idx].flags |= CPUIDLE_FLAG_UNUSABLE;
+	}
+
+	spin_unlock(&cpuidle_driver_lock);
+	mutex_unlock(&cpuidle_lock);
+}
diff --git a/drivers/cpuidle/dt_idle_genpd.c b/drivers/cpuidle/dt_idle_genpd.c
new file mode 100644
index 0000000000..1af63c1890
--- /dev/null
+++ b/drivers/cpuidle/dt_idle_genpd.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PM domains for CPUs via genpd.
+ *
+ * Copyright (C) 2019 Linaro Ltd.
+ * Author: Ulf Hansson <ulf.hansson@linaro.org>
+ *
+ * Copyright (c) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "dt-idle-genpd: " fmt
+
+#include <linux/cpu.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/pm_domain.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "dt_idle_genpd.h"
+
+/*
+ * Run @parse_state on the firmware node of each of the @state_count entries
+ * in @states and attach the parsed u32, in a freshly allocated buffer, to the
+ * entry's ->data.
+ *
+ * Returns 0 on success. On failure the buffers attached so far are freed and
+ * the error of @parse_state, or -ENOMEM, is returned.
+ */
+static int pd_parse_state_nodes(
+			int (*parse_state)(struct device_node *, u32 *),
+			struct genpd_power_state *states, int state_count)
+{
+	int idx, err;
+
+	for (idx = 0; idx < state_count; idx++) {
+		u32 param, *buf;
+
+		err = parse_state(to_of_node(states[idx].fwnode), &param);
+		if (err)
+			goto unwind;
+
+		buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+		if (!buf) {
+			err = -ENOMEM;
+			goto unwind;
+		}
+
+		*buf = param;
+		states[idx].data = buf;
+	}
+
+	return 0;
+
+unwind:
+	/* Only the entries before the failing one own a buffer. */
+	while (--idx >= 0)
+		kfree(states[idx].data);
+
+	return err;
+}
+
+/*
+ * Parse the domain idle states of @np into *@states / *@state_count and fill
+ * in the per-state data through @parse_state.
+ *
+ * Returns 0 on success or a negative error code. If filling in the per-state
+ * data fails, the states array is freed before returning.
+ */
+static int pd_parse_states(struct device_node *np,
+			   int (*parse_state)(struct device_node *, u32 *),
+			   struct genpd_power_state **states,
+			   int *state_count)
+{
+	int err;
+
+	/* Parse the domain idle states. */
+	err = of_genpd_parse_idle_states(np, states, state_count);
+	if (err)
+		return err;
+
+	/* Fill out the dt specifics for each found state. */
+	err = pd_parse_state_nodes(parse_state, *states, *state_count);
+	if (!err)
+		return 0;
+
+	kfree(*states);
+
+	return err;
+}
+
+/* Free the ->data buffer of each of the @state_count states, then @states. */
+static void pd_free_states(struct genpd_power_state *states,
+			    unsigned int state_count)
+{
+	unsigned int idx = 0;
+
+	while (idx < state_count)
+		kfree(states[idx++].data);
+
+	kfree(states);
+}
+
+/*
+ * Free @pd together with its states (including their ->data buffers) and its
+ * name.
+ *
+ * NOTE(review): pd->name is passed to kfree(), so it has to be the address
+ * that was returned by the allocator - confirm against whoever sets pd->name.
+ */
+void dt_idle_pd_free(struct generic_pm_domain *pd)
+{
+	pd_free_states(pd->states, pd->state_count);
+	kfree(pd->name);
+	kfree(pd);
+}
+
+/*
+ * Allocate a generic PM domain for the device tree node @np.
+ *
+ * The domain is named after the basename of the node's path and its idle
+ * states are parsed from @np, with @parse_state providing the per-state data.
+ *
+ * Returns the new domain, or NULL on failure (an error is logged). The result
+ * is meant to be released with dt_idle_pd_free().
+ */
+struct generic_pm_domain *dt_idle_pd_alloc(struct device_node *np,
+			int (*parse_state)(struct device_node *, u32 *))
+{
+	struct generic_pm_domain *pd;
+	struct genpd_power_state *states = NULL;
+	int ret, state_count = 0;
+	const char *base;
+	char *name;
+
+	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		goto out;
+
+	name = kasprintf(GFP_KERNEL, "%pOF", np);
+	if (!name)
+		goto free_pd;
+
+	/*
+	 * Only the basename of the node path is used as the domain name. Move
+	 * it to the start of the buffer rather than pointing pd->name into the
+	 * middle of it, so that every kfree(pd->name) is handed the address
+	 * that was actually allocated.
+	 */
+	base = kbasename(name);
+	memmove(name, base, strlen(base) + 1);
+	pd->name = name;
+
+	/*
+	 * Parse the domain idle states and let genpd manage the state selection
+	 * for those being compatible with "domain-idle-state".
+	 */
+	ret = pd_parse_states(np, parse_state, &states, &state_count);
+	if (ret)
+		goto free_name;
+
+	pd->free_states = pd_free_states;
+	pd->states = states;
+	pd->state_count = state_count;
+
+	pr_debug("alloc PM domain %s\n", pd->name);
+	return pd;
+
+free_name:
+	kfree(pd->name);
+free_pd:
+	kfree(pd);
+out:
+	pr_err("failed to alloc PM domain %pOF\n", np);
+	return NULL;
+}
+
+/*
+ * For every child node of @np that has a "power-domains" phandle, add the
+ * child as a subdomain of the domain the phandle refers to.
+ *
+ * Children without such a phandle are skipped. Returns 0 on success or the
+ * error of the first of_genpd_add_subdomain() call that fails.
+ */
+int dt_idle_pd_init_topology(struct device_node *np)
+{
+	struct of_phandle_args child, parent;
+	struct device_node *node;
+	int err;
+
+	for_each_child_of_node(np, node) {
+		if (of_parse_phandle_with_args(node, "power-domains",
+					"#power-domain-cells", 0, &parent))
+			continue;
+
+		child.np = node;
+		child.args_count = 0;
+		err = of_genpd_add_subdomain(&parent, &child);
+		of_node_put(parent.np);
+		if (!err)
+			continue;
+
+		/* Leaving the loop early: drop the iterator's reference. */
+		of_node_put(node);
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * For every child node of @np that has a "power-domains" phandle, remove the
+ * child as a subdomain of the domain the phandle refers to.
+ *
+ * Children without such a phandle are skipped. Returns 0 on success or the
+ * error of the first of_genpd_remove_subdomain() call that fails.
+ */
+int dt_idle_pd_remove_topology(struct device_node *np)
+{
+	struct of_phandle_args child, parent;
+	struct device_node *node;
+	int err;
+
+	for_each_child_of_node(np, node) {
+		if (of_parse_phandle_with_args(node, "power-domains",
+					"#power-domain-cells", 0, &parent))
+			continue;
+
+		child.np = node;
+		child.args_count = 0;
+		err = of_genpd_remove_subdomain(&parent, &child);
+		of_node_put(parent.np);
+		if (!err)
+			continue;
+
+		/* Leaving the loop early: drop the iterator's reference. */
+		of_node_put(node);
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Attach the device of @cpu to the PM domain called @name.
+ *
+ * If dev_pm_domain_attach_by_name() yields NULL or an error pointer, that
+ * value is returned unchanged. Otherwise pm_runtime_irq_safe() is applied to
+ * the returned device, pm_runtime_get_sync() is called on it if @cpu is
+ * online, it is flagged through dev_pm_syscore_device(), and it is returned.
+ */
+struct device *dt_idle_attach_cpu(int cpu, const char *name)
+{
+	struct device *dev;
+
+	dev = dev_pm_domain_attach_by_name(get_cpu_device(cpu), name);
+	if (IS_ERR_OR_NULL(dev))
+		return dev;
+
+	pm_runtime_irq_safe(dev);
+	if (cpu_online(cpu))
+		pm_runtime_get_sync(dev);
+
+	dev_pm_syscore_device(dev, true);
+
+	return dev;
+}
+
+/* Detach @dev from its PM domain; NULL and error pointers are ignored. */
+void dt_idle_detach_cpu(struct device *dev)
+{
+	if (!IS_ERR_OR_NULL(dev))
+		dev_pm_domain_detach(dev, false);
+}
diff --git a/drivers/cpuidle/dt_idle_genpd.h b/drivers/cpuidle/dt_idle_genpd.h
new file mode 100644
index 0000000000..3be1f70f55
--- /dev/null
+++ b/drivers/cpuidle/dt_idle_genpd.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __DT_IDLE_GENPD
+#define __DT_IDLE_GENPD
+
+struct device_node;
+struct generic_pm_domain;
+
+#ifdef CONFIG_DT_IDLE_GENPD
+
+void dt_idle_pd_free(struct generic_pm_domain *pd);
+
+struct generic_pm_domain *dt_idle_pd_alloc(struct device_node *np,
+			int (*parse_state)(struct device_node *, u32 *));
+
+int dt_idle_pd_init_topology(struct device_node *np);
+
+int dt_idle_pd_remove_topology(struct device_node *np);
+
+struct device *dt_idle_attach_cpu(int cpu, const char *name);
+
+void dt_idle_detach_cpu(struct device *dev);
+
+#else
+
+/*
+ * Stubs used when CONFIG_DT_IDLE_GENPD is not set: allocation and CPU attach
+ * return NULL, topology setup and removal report success, and free and detach
+ * do nothing.
+ */
+
+static inline void dt_idle_pd_free(struct generic_pm_domain *pd)
+{
+}
+
+static inline struct generic_pm_domain *dt_idle_pd_alloc(
+			struct device_node *np,
+			int (*parse_state)(struct device_node *, u32 *))
+{
+	return NULL;
+}
+
+static inline int dt_idle_pd_init_topology(struct device_node *np)
+{
+	return 0;
+}
+
+static inline int dt_idle_pd_remove_topology(struct device_node *np)
+{
+	return 0;
+}
+
+static inline struct device *dt_idle_attach_cpu(int cpu, const char *name)
+{
+	return NULL;
+}
+
+static inline void dt_idle_detach_cpu(struct device *dev)
+{
+}
+
+#endif
+
+#endif
diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c
new file mode 100644
index 0000000000..12fec92a85
--- /dev/null
+++ b/drivers/cpuidle/dt_idle_states.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * DT idle states parsing code.
+ *
+ * Copyright (C) 2014 ARM Ltd.
+ * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+ */
+
+#define pr_fmt(fmt) "DT idle-states: " fmt
+
+#include <linux/cpuidle.h>
+#include <linux/cpumask.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+
+#include "dt_idle_states.h"
+
+/*
+ * Fill in @idle_state (enter callbacks, latency, residency, flags, name,
+ * desc) from DT node @state_node.  Returns 0, or -EINVAL if a required
+ * latency or residency property is missing.
+ */
+static int init_state_node(struct cpuidle_state *idle_state,
+			   const struct of_device_id *match_id,
+			   struct device_node *state_node)
+{
+	int err;
+	const char *desc;
+
+	/* Drivers are expected to store the enter function in of_device_id::data. */
+	idle_state->enter = match_id->data;
+	/*
+	 * Not a "coupled" state: interrupts won't be enabled on exit, so the
+	 * tick can be frozen safely and enter() also serves as enter_s2idle().
+	 */
+	idle_state->enter_s2idle = match_id->data;
+
+	err = of_property_read_u32(state_node, "wakeup-latency-us",
+				   &idle_state->exit_latency);
+	if (err) {
+		u32 entry_latency, exit_latency;
+
+		err = of_property_read_u32(state_node, "entry-latency-us",
+					   &entry_latency);
+		if (err) {
+			pr_debug(" * %pOF missing entry-latency-us property\n",
+				 state_node);
+			return -EINVAL;
+		}
+
+		err = of_property_read_u32(state_node, "exit-latency-us",
+					   &exit_latency);
+		if (err) {
+			pr_debug(" * %pOF missing exit-latency-us property\n",
+				 state_node);
+			return -EINVAL;
+		}
+		/*
+		 * If wakeup-latency-us is missing, default to entry+exit
+		 * latencies as defined in idle states bindings
+		 */
+		idle_state->exit_latency = entry_latency + exit_latency;
+	}
+
+	err = of_property_read_u32(state_node, "min-residency-us",
+				   &idle_state->target_residency);
+	if (err) {
+		pr_debug(" * %pOF missing min-residency-us property\n",
+			     state_node);
+		return -EINVAL;
+	}
+
+	err = of_property_read_string(state_node, "idle-state-name", &desc);
+	if (err)
+		desc = state_node->name;
+
+	idle_state->flags = CPUIDLE_FLAG_RCU_IDLE;
+	if (of_property_read_bool(state_node, "local-timer-stop"))
+		idle_state->flags |= CPUIDLE_FLAG_TIMER_STOP;
+	/*
+	 * strscpy() always NUL-terminates, so the full buffer size is passed.
+	 * TODO: replace with kstrdup and pointer assignment when name
+	 *	 and desc become string pointers
+	 */
+	strscpy(idle_state->name, state_node->name, CPUIDLE_NAME_LEN);
+	strscpy(idle_state->desc, desc, CPUIDLE_DESC_LEN);
+	return 0;
+}
+
+/*
+ * Check that the idle state is uniform across all CPUs in the CPUidle driver
+ * cpumask
+ */
+static bool idle_state_valid(struct device_node *state_node, unsigned int idx,
+			     const cpumask_t *cpumask)
+{
+	struct device_node *np, *other;
+	bool same;
+	int cpu = cpumask_first(cpumask);
+
+	/*
+	 * state_node was retrieved from the first CPU of the mask, so
+	 * only the CPUs following it need to be checked. Each of them
+	 * must reference the very same node at index idx; the first
+	 * mismatch means we hit a firmware misconfiguration and ends
+	 * the walk straight away. The node references taken for the
+	 * comparison are dropped on every iteration, whether the
+	 * nodes matched or not.
+	 */
+	while ((cpu = cpumask_next(cpu, cpumask)) < nr_cpu_ids) {
+		np = of_cpu_device_node_get(cpu);
+		other = of_get_cpu_state_node(np, idx);
+		same = (other == state_node);
+
+		of_node_put(other);
+		of_node_put(np);
+		if (!same)
+			return false;
+	}
+
+	return true;
+}
+
+/**
+ * dt_init_idle_driver() - Parse the DT idle states and initialize the
+ *			   idle driver states array
+ * @drv:	  Pointer to CPU idle driver to be initialized
+ * @matches:	  Array of of_device_id match structures to search in for
+ *		  compatible idle state nodes. The data pointer for each valid
+ *		  struct of_device_id entry in the matches array must point to
+ *		  a function with the following signature, that corresponds to
+ *		  the CPUidle state enter function signature:
+ *
+ *		  int (*)(struct cpuidle_device *dev,
+ *			  struct cpuidle_driver *drv,
+ *			  int index);
+ *
+ * @start_idx:    First idle state index to be initialized
+ *
+ * If DT idle states are detected and are valid the state count and states
+ * array entries in the cpuidle driver are initialized accordingly starting
+ * from index start_idx.
+ *
+ * Return: number of valid DT idle states parsed, <0 on failure
+ */
+int dt_init_idle_driver(struct cpuidle_driver *drv,
+			const struct of_device_id *matches,
+			unsigned int start_idx)
+{
+	struct cpuidle_state *idle_state;
+	struct device_node *state_node, *cpu_node;
+	const struct of_device_id *match_id;
+	int i, err = 0;
+	const cpumask_t *cpumask;
+	unsigned int state_idx = start_idx;
+
+	if (state_idx >= CPUIDLE_STATE_MAX)
+		return -EINVAL;
+	/*
+	 * We get the idle states for the first logical cpu in the
+	 * driver mask (or cpu_possible_mask if the driver cpumask is not set)
+	 * and we check through idle_state_valid() if they are uniform
+	 * across CPUs, otherwise we hit a firmware misconfiguration.
+	 */
+	cpumask = drv->cpumask ? : cpu_possible_mask;
+	cpu_node = of_cpu_device_node_get(cpumask_first(cpumask));
+
+	for (i = 0; ; i++) {
+		state_node = of_get_cpu_state_node(cpu_node, i);
+		if (!state_node)
+			break;	/* no idle state at index i: end of the list */
+
+		match_id = of_match_node(matches, state_node);
+		if (!match_id) {
+			err = -ENODEV;
+			break;
+		}
+
+		if (!of_device_is_available(state_node)) {
+			of_node_put(state_node);
+			continue;	/* unavailable states are skipped, not errors */
+		}
+
+		if (!idle_state_valid(state_node, i, cpumask)) {
+			pr_warn("%pOF idle state not valid, bailing out\n",
+				state_node);
+			err = -EINVAL;
+			break;
+		}
+
+		if (state_idx == CPUIDLE_STATE_MAX) {
+			pr_warn("State index reached static CPU idle driver states array size\n");
+			break;	/* err stays 0: states parsed so far are kept */
+		}
+
+		idle_state = &drv->states[state_idx++];
+		err = init_state_node(idle_state, match_id, state_node);
+		if (err) {
+			pr_err("Parsing idle state node %pOF failed with err %d\n",
+			       state_node, err);
+			err = -EINVAL;
+			break;
+		}
+		of_node_put(state_node);
+	}
+
+	of_node_put(state_node);	/* node left over from a break, or NULL */
+	of_node_put(cpu_node);
+	if (err)
+		return err;
+
+	/* Set the number of total supported idle states. */
+	drv->state_count = state_idx;
+
+	/*
+	 * Return the number of present and valid DT idle states, which can
+	 * also be 0 on platforms with missing DT idle states or legacy DT
+	 * configuration predating the DT idle states bindings.
+	 */
+	return state_idx - start_idx;
+}
+EXPORT_SYMBOL_GPL(dt_init_idle_driver);
diff --git a/drivers/cpuidle/dt_idle_states.h b/drivers/cpuidle/dt_idle_states.h
new file mode 100644
index 0000000000..14ae88cef1
--- /dev/null
+++ b/drivers/cpuidle/dt_idle_states.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __DT_IDLE_STATES
+#define __DT_IDLE_STATES
+
+int dt_init_idle_driver(struct cpuidle_driver *drv,
+			const struct of_device_id *matches,
+			unsigned int start_idx);
+#endif /* __DT_IDLE_STATES */
diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
new file mode 100644
index 0000000000..0d0f9751ff
--- /dev/null
+++ b/drivers/cpuidle/governor.c
@@ -0,0 +1,119 @@
+/*
+ * governor.c - governor support
+ *
+ * (C) 2006-2007 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *               Shaohua Li <shaohua.li@intel.com>
+ *               Adam Belay <abelay@novell.com>
+ *
+ * This code is licenced under the GPL.
+ */
+
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
+#include <linux/mutex.h>
+#include <linux/module.h>
+#include <linux/pm_qos.h>
+
+#include "cpuidle.h"
+
+char param_governor[CPUIDLE_NAME_LEN];	/* name matched at registration */
+
+LIST_HEAD(cpuidle_governors);	/* governors added by cpuidle_register_governor() */
+struct cpuidle_governor *cpuidle_curr_governor;	/* set by cpuidle_switch_governor() */
+struct cpuidle_governor *cpuidle_prev_governor;	/* NOTE(review): no user visible in this file */
+
+/**
+ * cpuidle_find_governor - look up a registered governor by name
+ * @str: the name, compared case-insensitively
+ *
+ * Must be called with cpuidle_lock acquired.  Returns NULL if not found.
+ */
+struct cpuidle_governor *cpuidle_find_governor(const char *str)
+{
+	struct cpuidle_governor *pos;
+
+	list_for_each_entry(pos, &cpuidle_governors, governor_list) {
+		if (strncasecmp(str, pos->name, CPUIDLE_NAME_LEN) == 0)
+			return pos;
+	}
+	return NULL;
+}
+
+/**
+ * cpuidle_switch_governor - make @gov the active governor
+ * @gov: the new target governor
+ *
+ * Must be called with cpuidle_lock acquired.  Returns -EINVAL if @gov is
+ * NULL, 0 otherwise (including when @gov is already the current governor).
+ */
+int cpuidle_switch_governor(struct cpuidle_governor *gov)
+{
+	struct cpuidle_device *pos;
+
+	if (!gov)
+		return -EINVAL;
+	if (gov == cpuidle_curr_governor)
+		return 0;
+
+	/* The idle handler is uninstalled while the devices are switched. */
+	cpuidle_uninstall_idle_handler();
+	if (cpuidle_curr_governor) {
+		list_for_each_entry(pos, &cpuidle_detected_devices, device_list)
+			cpuidle_disable_device(pos);
+	}
+
+	cpuidle_curr_governor = gov;
+	list_for_each_entry(pos, &cpuidle_detected_devices, device_list)
+		cpuidle_enable_device(pos);
+
+	cpuidle_install_idle_handler();
+	pr_info("cpuidle: using governor %s\n", gov->name);
+
+	return 0;
+}
+
+/**
+ * cpuidle_register_governor - registers a governor, possibly activating it
+ * @gov: the governor; must provide a ->select callback
+ * Return: 0, -EINVAL (bad @gov), -ENODEV (cpuidle disabled), -EEXIST (name)
+ */
+int cpuidle_register_governor(struct cpuidle_governor *gov)
+{
+	struct cpuidle_governor *cur;
+	int ret = 0;
+
+	if (!gov || !gov->select)
+		return -EINVAL;
+	if (cpuidle_disabled())
+		return -ENODEV;
+	mutex_lock(&cpuidle_lock);
+	if (cpuidle_find_governor(gov->name)) {
+		ret = -EEXIST;
+		goto out_unlock;
+	}
+	list_add_tail(&gov->governor_list, &cpuidle_governors);
+	cur = cpuidle_curr_governor;
+	if (!cur || !strncasecmp(param_governor, gov->name, CPUIDLE_NAME_LEN) ||
+	    (cur->rating < gov->rating &&
+	     strncasecmp(param_governor, cur->name, CPUIDLE_NAME_LEN)))
+		cpuidle_switch_governor(gov);
+out_unlock:
+	mutex_unlock(&cpuidle_lock);
+	return ret;
+}
+
+/**
+ * cpuidle_governor_latency_req - Compute a latency constraint for CPU
+ * @cpu: Target CPU
+ * Return: the smaller of the CPU device's raw resume latency request and
+ * the global CPU latency QoS limit, multiplied by NSEC_PER_USEC.
+ */
+s64 cpuidle_governor_latency_req(unsigned int cpu)
+{
+	struct device *cpu_dev = get_cpu_device(cpu);
+	int dev_limit = dev_pm_qos_raw_resume_latency(cpu_dev);
+	int qos_limit = cpu_latency_qos_limit();
+	int limit = dev_limit > qos_limit ? qos_limit : dev_limit;
+
+	return (s64)limit * NSEC_PER_USEC;
+}
diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile
new file mode 100644
index 0000000000..63abb5393a
--- /dev/null
+++ b/drivers/cpuidle/governors/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for cpuidle governors.
+#
+
+obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o
+obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o
+obj-$(CONFIG_CPU_IDLE_GOV_TEO) += teo.o
+obj-$(CONFIG_CPU_IDLE_GOV_HALTPOLL) += haltpoll.o
diff --git a/drivers/cpuidle/governors/gov.h b/drivers/cpuidle/governors/gov.h
new file mode 100644
index 0000000000..99e067d966
--- /dev/null
+++ b/drivers/cpuidle/governors/gov.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/* Common definitions for cpuidle governors. */
+
+#ifndef __CPUIDLE_GOVERNOR_H
+#define __CPUIDLE_GOVERNOR_H
+
+/*
+ * Idle state target residency threshold used for deciding whether or not to
+ * check the time till the closest expected timer event.
+ */
+#define RESIDENCY_THRESHOLD_NS	(15 * NSEC_PER_USEC)
+
+#endif /* __CPUIDLE_GOVERNOR_H */
diff --git a/drivers/cpuidle/governors/haltpoll.c b/drivers/cpuidle/governors/haltpoll.c
new file mode 100644
index 0000000000..1dff3a5291
--- /dev/null
+++ b/drivers/cpuidle/governors/haltpoll.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * haltpoll.c - haltpoll idle governor
+ *
+ * Copyright 2019 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Authors: Marcelo Tosatti <mtosatti@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/cpuidle.h>
+#include <linux/time.h>
+#include <linux/ktime.h>
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/kvm_para.h>
+#include <trace/events/power.h>
+
+static unsigned int guest_halt_poll_ns __read_mostly = 200000;
+module_param(guest_halt_poll_ns, uint, 0644);
+
+/* division factor to shrink per-cpu poll_limit_ns */
+static unsigned int guest_halt_poll_shrink __read_mostly = 2;
+module_param(guest_halt_poll_shrink, uint, 0644);
+
+/* multiplication factor to grow per-cpu poll_limit_ns */
+static unsigned int guest_halt_poll_grow __read_mostly = 2;
+module_param(guest_halt_poll_grow, uint, 0644);
+
+/* value in ns to start growing per-cpu poll_limit_ns */
+static unsigned int guest_halt_poll_grow_start __read_mostly = 50000;
+module_param(guest_halt_poll_grow_start, uint, 0644);
+
+/* allow shrinking guest halt poll */
+static bool guest_halt_poll_allow_shrink __read_mostly = true;
+module_param(guest_halt_poll_allow_shrink, bool, 0644);
+
+/**
+ * haltpoll_select - selects the next idle state to enter
+ * @drv: cpuidle driver containing state data
+ * @dev: the CPU
+ * @stop_tick: set to false whenever state 0 is selected, untouched otherwise
+ *
+ * Return: 0 to poll, 1 to halt.
+ */
+static int haltpoll_select(struct cpuidle_driver *drv,
+			   struct cpuidle_device *dev,
+			   bool *stop_tick)
+{
+	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
+	bool halt;
+
+	if (drv->state_count && latency_req != 0) {
+		/*
+		 * Halt when the poll limit of this CPU is 0, or when the
+		 * last state was poll and no event occurred on the poll
+		 * window.
+		 */
+		halt = dev->poll_limit_ns == 0 ||
+		       (dev->last_state_idx == 0 &&
+			dev->poll_time_limit == true);
+		if (halt)
+			return 1;
+	}
+
+	/*
+	 * No states, a zero latency requirement, the last state was halt,
+	 * or the last poll saw an event: poll and do not stop the tick.
+	 */
+	*stop_tick = false;
+	return 0;
+}
+
+/* Adapt dev->poll_limit_ns to @block_ns, the last non-poll residency. */
+static void adjust_poll_limit(struct cpuidle_device *dev, u64 block_ns)
+{
+	unsigned int val;
+
+	/* Grow if poll_limit_ns < block_ns <= guest_halt_poll_ns. */
+	if (block_ns > dev->poll_limit_ns && block_ns <= guest_halt_poll_ns) {
+		/* 64-bit so that a large limit * grow factor cannot wrap. */
+		u64 grown = (u64)dev->poll_limit_ns * guest_halt_poll_grow;
+
+		if (grown < guest_halt_poll_grow_start)
+			grown = guest_halt_poll_grow_start;
+		if (grown > guest_halt_poll_ns)
+			grown = guest_halt_poll_ns;
+		val = grown;	/* fits: clamped to an unsigned int above */
+		trace_guest_halt_poll_ns_grow(val, dev->poll_limit_ns);
+		dev->poll_limit_ns = val;
+	} else if (block_ns > guest_halt_poll_ns &&
+		   guest_halt_poll_allow_shrink) {
+		unsigned int shrink = guest_halt_poll_shrink;
+
+		val = dev->poll_limit_ns;
+		if (shrink == 0)
+			val = 0;
+		else
+			val /= shrink;
+		trace_guest_halt_poll_ns_shrink(val, dev->poll_limit_ns);
+		dev->poll_limit_ns = val;
+	}
+}
+
+/**
+ * haltpoll_reflect - record the entered state and adapt the poll limit
+ * @dev: the CPU
+ * @index: the index of actual entered state
+ */
+static void haltpoll_reflect(struct cpuidle_device *dev, int index)
+{
+	dev->last_state_idx = index;
+	if (index == 0)
+		return;
+	adjust_poll_limit(dev, dev->last_residency_ns);
+}
+
+/**
+ * haltpoll_enable_device - resets the poll limit of a CPU
+ * @drv: cpuidle driver
+ * @dev: the CPU
+ */
+static int haltpoll_enable_device(struct cpuidle_driver *drv,
+				  struct cpuidle_device *dev)
+{
+	dev->poll_limit_ns = 0;	/* grown later by adjust_poll_limit() */
+
+	return 0;
+}
+
+static struct cpuidle_governor haltpoll_governor = {
+	.name =			"haltpoll",
+	.rating =		9,	/* compared with other ratings on registration */
+	.enable =		haltpoll_enable_device,
+	.select =		haltpoll_select,
+	.reflect =		haltpoll_reflect,
+};
+
+static int __init init_haltpoll(void)
+{
+	if (!kvm_para_available())
+		return 0;
+
+	return cpuidle_register_governor(&haltpoll_governor);
+}
+
+postcore_initcall(init_haltpoll);
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
new file mode 100644
index 0000000000..8e9058c4ea
--- /dev/null
+++ b/drivers/cpuidle/governors/ladder.c
@@ -0,0 +1,197 @@
+/*
+ * ladder.c - the residency ladder algorithm
+ *
+ *  Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
+ *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ *  Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de>
+ *
+ * (C) 2006-2007 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *               Shaohua Li <shaohua.li@intel.com>
+ *               Adam Belay <abelay@novell.com>
+ *
+ * This code is licenced under the GPL.
+ */
+
+#include <linux/kernel.h>
+#include <linux/cpuidle.h>
+#include <linux/jiffies.h>
+#include <linux/tick.h>
+
+#include <asm/io.h>
+#include <linux/uaccess.h>
+
+#define PROMOTION_COUNT 4
+#define DEMOTION_COUNT 1
+
+struct ladder_device_state {
+	struct {
+		u32 promotion_count;	/* promotion hits needed to go deeper */
+		u32 demotion_count;	/* demotion hits needed to go shallower */
+		u64 promotion_time_ns;	/* residency above this is a promotion hit */
+		u64 demotion_time_ns;	/* residency below this is a demotion hit */
+	} threshold;
+	struct {
+		int promotion_count;	/* promotion hits counted so far */
+		int demotion_count;	/* demotion hits counted so far */
+	} stats;
+};
+
+struct ladder_device {
+	struct ladder_device_state states[CPUIDLE_STATE_MAX];	/* by state index */
+};
+
+static DEFINE_PER_CPU(struct ladder_device, ladder_devices);
+
+/**
+ * ladder_do_selection - prepares private data for a state change
+ * @dev: the CPU whose last_state_idx is updated
+ * @ldev: the ladder device holding the per-state statistics
+ * @old_idx: the current state index, whose counters are reset
+ * @new_idx: the new target state index
+ */
+static inline void ladder_do_selection(struct cpuidle_device *dev,
+		struct ladder_device *ldev, int old_idx, int new_idx)
+{
+	dev->last_state_idx = new_idx;
+	ldev->states[old_idx].stats.demotion_count = 0;
+	ldev->states[old_idx].stats.promotion_count = 0;
+}
+
+/**
+ * ladder_select_state - selects the next state to enter
+ * @drv: cpuidle driver
+ * @dev: the CPU
+ * @dummy: not used
+ */
+static int ladder_select_state(struct cpuidle_driver *drv,
+			       struct cpuidle_device *dev, bool *dummy)
+{
+	struct ladder_device *ldev = this_cpu_ptr(&ladder_devices);
+	struct ladder_device_state *last_state;
+	int last_idx = dev->last_state_idx;
+	int first_idx = drv->states[0].flags & CPUIDLE_FLAG_POLLING ? 1 : 0;
+	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
+	s64 last_residency;
+
+	/* Special case when user has set very strict latency requirement */
+	if (unlikely(latency_req == 0)) {
+		ladder_do_selection(dev, ldev, last_idx, 0);
+		return 0;
+	}
+
+	last_state = &ldev->states[last_idx];
+	/* May go negative, so the threshold comparisons below must be signed. */
+	last_residency = dev->last_residency_ns - drv->states[last_idx].exit_latency_ns;
+
+	/* consider promotion */
+	if (last_idx < drv->state_count - 1 &&
+	    !dev->states_usage[last_idx + 1].disable &&
+	    last_residency > (s64)last_state->threshold.promotion_time_ns &&
+	    drv->states[last_idx + 1].exit_latency_ns <= latency_req) {
+		last_state->stats.promotion_count++;
+		last_state->stats.demotion_count = 0;
+		if (last_state->stats.promotion_count >= last_state->threshold.promotion_count) {
+			ladder_do_selection(dev, ldev, last_idx, last_idx + 1);
+			return last_idx + 1;
+		}
+	}
+
+	/* consider demotion */
+	if (last_idx > first_idx &&
+	    (dev->states_usage[last_idx].disable ||
+	    drv->states[last_idx].exit_latency_ns > latency_req)) {
+		int i;
+
+		for (i = last_idx - 1; i > first_idx; i--) {
+			if (drv->states[i].exit_latency_ns <= latency_req)
+				break;
+		}
+		ladder_do_selection(dev, ldev, last_idx, i);
+		return i;
+	}
+
+	if (last_idx > first_idx &&
+	    last_residency < (s64)last_state->threshold.demotion_time_ns) {
+		last_state->stats.demotion_count++;
+		last_state->stats.promotion_count = 0;
+		if (last_state->stats.demotion_count >= last_state->threshold.demotion_count) {
+			ladder_do_selection(dev, ldev, last_idx, last_idx - 1);
+			return last_idx - 1;
+		}
+	}
+
+	/* otherwise remain at the current state */
+	return last_idx;
+}
+
+/**
+ * ladder_enable_device - setup for the governor
+ * @drv: cpuidle driver
+ * @dev: the CPU
+ *
+ * Starts at state 0, or state 1 if state 0 is a polling state.  Returns 0.
+ */
+static int ladder_enable_device(struct cpuidle_driver *drv,
+				struct cpuidle_device *dev)
+{
+	struct ladder_device *ldev = &per_cpu(ladder_devices, dev->cpu);
+	int first_idx = drv->states[0].flags & CPUIDLE_FLAG_POLLING ? 1 : 0;
+	int idx;
+
+	dev->last_state_idx = first_idx;
+
+	for (idx = first_idx; idx < drv->state_count; idx++) {
+		struct ladder_device_state *lstate = &ldev->states[idx];
+		struct cpuidle_state *state = &drv->states[idx];
+
+		lstate->stats.promotion_count = 0;
+		lstate->stats.demotion_count = 0;
+		lstate->threshold.promotion_count = PROMOTION_COUNT;
+		lstate->threshold.demotion_count = DEMOTION_COUNT;
+
+		/* No promotion time for the last state, no demotion time for the first. */
+		if (idx < drv->state_count - 1)
+			lstate->threshold.promotion_time_ns = state->exit_latency_ns;
+		if (idx > first_idx)
+			lstate->threshold.demotion_time_ns = state->exit_latency_ns;
+	}
+
+	return 0;
+}
+
+/**
+ * ladder_reflect - update the correct last_state_idx
+ * @dev: the CPU
+ * @index: the index of actual state entered; values <= 0 are ignored
+ */
+static void ladder_reflect(struct cpuidle_device *dev, int index)
+{
+	if (index > 0)
+		dev->last_state_idx = index;
+}
+
+static struct cpuidle_governor ladder_governor = {
+	.name =		"ladder",
+	.rating =	10,	/* raised to 25 by init_ladder() if !tick_nohz_enabled */
+	.enable =	ladder_enable_device,
+	.select =	ladder_select_state,
+	.reflect =	ladder_reflect,
+};
+
+/**
+ * init_ladder - picks the rating and registers the ladder governor
+ */
+static int __init init_ladder(void)
+{
+	/*
+	 * When NO_HZ is disabled, or when booting with nohz=off, the ladder
+	 * governor is better so give it a higher rating than the menu
+	 * governor.
+	 */
+	if (!tick_nohz_enabled)
+		ladder_governor.rating = 25;
+
+	return cpuidle_register_governor(&ladder_governor);
+}
+
+postcore_initcall(init_ladder);
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
new file mode 100644
index 0000000000..b96e3da0fe
--- /dev/null
+++ b/drivers/cpuidle/governors/menu.c
@@ -0,0 +1,590 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * menu.c - the menu idle governor
+ *
+ * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
+ * Copyright (C) 2009 Intel Corporation
+ * Author:
+ *        Arjan van de Ven <arjan@linux.intel.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/cpuidle.h>
+#include <linux/time.h>
+#include <linux/ktime.h>
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
+#include <linux/sched.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/stat.h>
+#include <linux/math64.h>
+
+#include "gov.h"
+
+#define BUCKETS 12
+#define INTERVAL_SHIFT 3
+#define INTERVALS (1UL << INTERVAL_SHIFT)
+#define RESOLUTION 1024
+#define DECAY 8
+#define MAX_INTERESTING (50000 * NSEC_PER_USEC)
+
+/*
+ * Concepts and ideas behind the menu governor
+ *
+ * For the menu governor, there are 3 decision factors for picking a C
+ * state:
+ * 1) Energy break even point
+ * 2) Performance impact
+ * 3) Latency tolerance (from pmqos infrastructure)
+ * These three factors are treated independently.
+ *
+ * Energy break even point
+ * -----------------------
+ * C state entry and exit have an energy cost, and a certain amount of time in
+ * the  C state is required to actually break even on this cost. CPUIDLE
+ * provides us this duration in the "target_residency" field. So all that we
+ * need is a good prediction of how long we'll be idle. Like the traditional
+ * menu governor, we start with the actual known "next timer event" time.
+ *
+ * Since there are other sources of wakeups (interrupts for example) than
+ * the next timer event, this estimation is rather optimistic. To get a
+ * more realistic estimate, a correction factor is applied to the estimate,
+ * that is based on historic behavior. For example, if in the past the actual
+ * duration always was 50% of the next timer tick, the correction factor will
+ * be 0.5.
+ *
+ * menu uses a running average for this correction factor, however it uses a
+ * set of factors, not just a single factor. This stems from the realization
+ * that the ratio is dependent on the order of magnitude of the expected
+ * duration; if we expect 500 milliseconds of idle time the likelihood of
+ * getting an interrupt very early is much higher than if we expect 50 micro
+ * seconds of idle time. A second independent factor that has big impact on
+ * the actual factor is if there is (disk) IO outstanding or not.
+ * (as a special twist, we consider every sleep longer than 50 milliseconds
+ * as perfect; there are no power gains for sleeping longer than this)
+ *
+ * For these two reasons we keep an array of 12 independent factors, that gets
+ * indexed based on the magnitude of the expected duration as well as the
+ * "is IO outstanding" property.
+ *
+ * Repeatable-interval-detector
+ * ----------------------------
+ * There are some cases where "next timer" is a completely unusable predictor:
+ * Those cases where the interval is fixed, for example due to hardware
+ * interrupt mitigation, but also due to fixed transfer rate devices such as
+ * mice.
+ * For this, we use a different predictor: We track the duration of the last 8
+ * intervals and if the standard deviation of these 8 intervals is below a
+ * threshold value, we use the average of these intervals as prediction.
+ *
+ * Limiting Performance Impact
+ * ---------------------------
+ * C states, especially those with large exit latencies, can have a real
+ * noticeable impact on workloads, which is not acceptable for most sysadmins,
+ * and in addition, less performance has a power price of its own.
+ *
+ * As a general rule of thumb, menu assumes that the following heuristic
+ * holds:
+ *     The busier the system, the less impact of C states is acceptable
+ *
+ * This rule-of-thumb is implemented using a performance-multiplier:
+ * If the exit latency times the performance multiplier is longer than
+ * the predicted duration, the C state is not considered a candidate
+ * for selection due to a too high performance impact. So the higher
+ * this multiplier is, the longer we need to be idle to pick a deep C
+ * state, and thus the less likely a busy CPU will hit such a deep
+ * C state.
+ *
+ * Two factors are used in determining this multiplier:
+ * a value of 10 is added for each point of "per cpu load average" we have.
+ * a value of 5 points is added for each process that is waiting for
+ * IO on this CPU.
+ * (these values are experimentally determined)
+ *
+ * The load average factor gives a longer term (few seconds) input to the
+ * decision, while the iowait value gives a cpu local instantaneous input.
+ * The iowait factor may look low, but realize that this is also already
+ * represented in the system load average.
+ *
+ */
+
+struct menu_device {
+	int             needs_update;	/* if set, menu_select() runs menu_update() */
+	int             tick_wakeup;
+
+	u64		next_timer_ns;	/* time till the closest timer, or KTIME_MAX */
+	unsigned int	bucket;		/* correction_factor[] index, see which_bucket() */
+	unsigned int	correction_factor[BUCKETS];
+	unsigned int	intervals[INTERVALS];	/* read by get_typical_interval() */
+	int		interval_ptr;	/* presumably next intervals[] slot - TODO confirm */
+};
+
+static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
+{
+	u64 limit_ns = 10ULL * NSEC_PER_USEC;
+	int bucket, decade;
+
+	/*
+	 * Two groups of stats are kept: the lower half of the
+	 * buckets is used when no IO is pending, the upper half
+	 * when some is.
+	 * This allows us to calculate
+	 * E(duration)|iowait
+	 */
+	bucket = nr_iowaiters ? BUCKETS / 2 : 0;
+
+	/*
+	 * Within a group, the bucket is the decade the duration falls
+	 * into: below 10us, 100us, 1ms, 10ms, 100ms, or anything longer.
+	 */
+	for (decade = 0; decade < 5; decade++) {
+		if (duration_ns < limit_ns)
+			break;
+		limit_ns *= 10;
+	}
+	return bucket + decade;
+}
+
+/*
+ * Return a multiplier for the exit latency that is intended
+ * to take performance requirements into account.
+ * The more performance critical we estimate the system
+ * to be, the higher this multiplier, and thus the higher
+ * the barrier to go to an expensive C state.
+ */
+static inline int performance_multiplier(unsigned int nr_iowaiters)
+{
+	/* base of 1, plus 10 for each task waiting for IO (per cpu!) */
+	return 1 + 10 * nr_iowaiters;
+}
+
+static DEFINE_PER_CPU(struct menu_device, menu_devices);
+
+static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
+
+/*
+ * Try detecting repeating patterns by keeping track of the last 8
+ * intervals, and checking if the standard deviation of that set
+ * of points is below a threshold. If it is... then use the
+ * average of these 8 points as the estimated value.
+ */
+static unsigned int get_typical_interval(struct menu_device *data)
+{
+	int i, divisor;
+	unsigned int min, max, thresh, avg;
+	uint64_t sum, variance;
+
+	thresh = INT_MAX; /* Discard outliers above this value */
+
+again:
+
+	/* First calculate the average of past intervals */
+	min = UINT_MAX;
+	max = 0;
+	sum = 0;
+	divisor = 0;
+	for (i = 0; i < INTERVALS; i++) {
+		unsigned int value = data->intervals[i];
+		if (value <= thresh) {
+			sum += value;
+			divisor++;
+			if (value > max)
+				max = value;
+
+			if (value < min)
+				min = value;
+		}
+	}
+
+	if (!max)
+		return UINT_MAX;	/* no non-zero sample at or below thresh */
+
+	if (divisor == INTERVALS)
+		avg = sum >> INTERVAL_SHIFT;	/* all samples used: divide by shifting */
+	else
+		avg = div_u64(sum, divisor);
+
+	/* Then try to determine variance */
+	variance = 0;
+	for (i = 0; i < INTERVALS; i++) {
+		unsigned int value = data->intervals[i];
+		if (value <= thresh) {
+			int64_t diff = (int64_t)value - avg;
+			variance += diff * diff;
+		}
+	}
+	if (divisor == INTERVALS)
+		variance >>= INTERVAL_SHIFT;
+	else
+		do_div(variance, divisor);
+
+	/*
+	 * The typical interval is obtained when standard deviation is
+	 * small (stddev <= 20 us, variance <= 400 us^2) or standard
+	 * deviation is small compared to the average interval (avg >
+	 * 6*stddev, avg^2 > 36*variance). The average is smaller than
+	 * UINT_MAX aka U32_MAX, so computing its square does not
+	 * overflow a u64. We simply reject this candidate average if
+	 * the standard deviation is greater than 715 s (which is
+	 * rather unlikely).
+	 *
+	 * Use this result only if there is no timer to wake us up sooner.
+	 */
+	if (likely(variance <= U64_MAX/36)) {
+		if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3))
+							|| variance <= 400) {
+			return avg;
+		}
+	}
+
+	/*
+	 * If we have outliers to the upside in our distribution, discard
+	 * those by setting the threshold to exclude these outliers, then
+	 * calculate the average and standard deviation again. Once we get
+	 * down to the bottom 3/4 of our samples, stop excluding samples.
+	 *
+	 * This can deal with workloads that have long pauses interspersed
+	 * with sporadic activity with a bunch of short pauses.
+	 */
+	if ((divisor * 4) <= INTERVALS * 3)
+		return UINT_MAX;	/* too few samples left: no typical interval */
+
+	thresh = max - 1;	/* exclude the largest sample(s) and retry */
+	goto again;
+}
+
+/**
+ * menu_select - selects the next idle state to enter and returns its index
+ * @drv: cpuidle driver containing state data
+ * @dev: the CPU (supplies the latency limit and per-state disable flags)
+ * @stop_tick: out: false = keep the tick running; some paths leave it unchanged
+ */
+static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		       bool *stop_tick)
+{
+	struct menu_device *data = this_cpu_ptr(&menu_devices);
+	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
+	u64 predicted_ns;
+	u64 interactivity_req;
+	unsigned int nr_iowaiters;
+	ktime_t delta, delta_tick;
+	int i, idx;
+
+	if (data->needs_update) {
+		menu_update(drv, dev);
+		data->needs_update = 0;
+	}
+
+	nr_iowaiters = nr_iowait_cpu(dev->cpu);
+
+	/* Start from the typical idle interval, converted from us to ns. */
+	predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
+	if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
+		unsigned int timer_us;
+
+		/* Determine the time till the closest timer. */
+		delta = tick_nohz_get_sleep_length(&delta_tick);
+		if (unlikely(delta < 0)) {
+			delta = 0;
+			delta_tick = 0;
+		}
+
+		data->next_timer_ns = delta;
+		data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
+
+		/* Scale by the bucket's correction factor, rounding to the nearest us. */
+		timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
+					data->next_timer_ns *
+						data->correction_factor[data->bucket],
+				   RESOLUTION * DECAY * NSEC_PER_USEC);
+		/* Use the lowest expected idle interval to pick the idle state. */
+		predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns);
+	} else {
+		/*
+		 * Because the next timer event is not going to be determined
+		 * in this case, assume that without the tick the closest timer
+		 * will be in distant future and that the closest tick will occur
+		 * after 1/2 of the tick period.
+		 */
+		data->next_timer_ns = KTIME_MAX;
+		delta_tick = TICK_NSEC / 2;
+		data->bucket = which_bucket(KTIME_MAX, nr_iowaiters);
+	}
+
+	if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
+	    ((data->next_timer_ns < drv->states[1].target_residency_ns ||
+	      latency_req < drv->states[1].exit_latency_ns) &&
+	     !dev->states_usage[0].disable)) {
+		/*
+		 * In this case state[0] will be used no matter what, so return
+		 * it right away and keep the tick running if state[0] is a
+		 * polling one.
+		 */
+		*stop_tick = !(drv->states[0].flags & CPUIDLE_FLAG_POLLING);
+		return 0;
+	}
+
+	if (tick_nohz_tick_stopped()) {
+		/*
+		 * If the tick is already stopped, the cost of possible short
+		 * idle duration misprediction is much higher, because the CPU
+		 * may be stuck in a shallow idle state for a long time as a
+		 * result of it.  In that case say we might mispredict and use
+		 * the known time till the closest timer event for the idle
+		 * state selection.
+		 */
+		if (predicted_ns < TICK_NSEC)
+			predicted_ns = data->next_timer_ns;
+	} else {
+		/*
+		 * Use the performance multiplier and the user-configurable
+		 * latency_req to determine the maximum exit latency.
+		 */
+		interactivity_req = div64_u64(predicted_ns,
+					      performance_multiplier(nr_iowaiters));
+		if (latency_req > interactivity_req)
+			latency_req = interactivity_req;
+	}
+
+	/*
+	 * Find the idle state with the lowest power while satisfying
+	 * our constraints.
+	 */
+	idx = -1;
+	for (i = 0; i < drv->state_count; i++) {
+		struct cpuidle_state *s = &drv->states[i];
+
+		if (dev->states_usage[i].disable)
+			continue;
+
+		if (idx == -1)
+			idx = i; /* first enabled state */
+
+		if (s->target_residency_ns > predicted_ns) {
+			/*
+			 * Use a physical idle state, not busy polling, unless
+			 * a timer is going to trigger soon enough.
+			 */
+			if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
+			    s->exit_latency_ns <= latency_req &&
+			    s->target_residency_ns <= data->next_timer_ns) {
+				predicted_ns = s->target_residency_ns;
+				idx = i;
+				break;
+			}
+			if (predicted_ns < TICK_NSEC)
+				break;
+
+			if (!tick_nohz_tick_stopped()) {
+				/*
+				 * If the state selected so far is shallow,
+				 * waking up early won't hurt, so retain the
+				 * tick in that case and let the governor run
+				 * again in the next iteration of the loop.
+				 */
+				predicted_ns = drv->states[idx].target_residency_ns;
+				break;
+			}
+
+			/*
+			 * If the state selected so far is shallow and this
+			 * state's target residency matches the time till the
+			 * closest timer event, select this one to avoid getting
+			 * stuck in the shallow one for too long.
+			 */
+			if (drv->states[idx].target_residency_ns < TICK_NSEC &&
+			    s->target_residency_ns <= delta_tick)
+				idx = i;
+
+			return idx;
+		}
+		if (s->exit_latency_ns > latency_req)
+			break;
+
+		idx = i;
+	}
+
+	if (idx == -1)
+		idx = 0; /* No states enabled. Must use 0. */
+
+	/*
+	 * Don't stop the tick if the selected state is a polling one or if the
+	 * expected idle duration is shorter than the tick period length.
+	 */
+	if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
+	     predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
+		*stop_tick = false;
+
+		if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) {
+			/*
+			 * The tick is not going to be stopped and the target
+			 * residency of the state to be returned is not within
+			 * the time until the next timer event including the
+			 * tick, so fall back to a shallower enabled state.
+			 */
+			for (i = idx - 1; i >= 0; i--) {
+				if (dev->states_usage[i].disable)
+					continue;
+
+				idx = i;
+				if (drv->states[i].target_residency_ns <= delta_tick)
+					break;
+			}
+		}
+	}
+
+	return idx;
+}
+
+/**
+ * menu_reflect - note the entered state and defer the statistics update
+ * @dev: the CPU
+ * @index: index of the idle state that was actually entered
+ *
+ * NOTE: keep this minimal, because the time spent here adds to the overall
+ *       exit latency.  The real work is left to the next menu_select() call.
+ */
+static void menu_reflect(struct cpuidle_device *dev, int index)
+{
+	struct menu_device *md = this_cpu_ptr(&menu_devices);
+
+	md->tick_wakeup = tick_nohz_idle_got_tick();
+	md->needs_update = 1;
+	dev->last_state_idx = index;
+}
+
+/**
+ * menu_update - estimate the last idle duration and update prediction data
+ * @drv: cpuidle driver containing state data
+ * @dev: the CPU (last_state_idx, last_residency_ns and poll_time_limit are read)
+ */
+static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+{
+	struct menu_device *data = this_cpu_ptr(&menu_devices);
+	int last_idx = dev->last_state_idx;
+	struct cpuidle_state *target = &drv->states[last_idx];
+	u64 measured_ns;
+	unsigned int new_factor;
+
+	/*
+	 * Try to figure out how much time passed between entry to low
+	 * power state and occurrence of the wakeup event.
+	 *
+	 * If the entered idle state didn't support residency measurements,
+	 * we use them anyway if they are short, and if long,
+	 * truncate to the whole expected time.
+	 *
+	 * Any measured amount of time will include the exit latency.
+	 * Since we are interested in when the wakeup began, not when it
+	 * was completed, we must subtract the exit latency. However, if
+	 * the measured amount of time is less than the exit latency,
+	 * assume the state was never reached and the exit latency is 0.
+	 */
+
+	if (data->tick_wakeup && data->next_timer_ns > TICK_NSEC) {
+		/*
+		 * The nohz code said that there wouldn't be any events within
+		 * the tick boundary (if the tick was stopped), but the idle
+		 * duration predictor had a differing opinion.  Since the CPU
+		 * was woken up by a tick (that wasn't stopped after all), the
+		 * predictor was not quite right, so assume that the CPU could
+		 * have been idle long (but not forever) to help the idle
+		 * duration predictor do a better job next time.
+		 */
+		measured_ns = 9 * MAX_INTERESTING / 10;
+	} else if ((drv->states[last_idx].flags & CPUIDLE_FLAG_POLLING) &&
+		   dev->poll_time_limit) {
+		/*
+		 * The CPU exited the "polling" state due to a time limit, so
+		 * the idle duration prediction leading to the selection of that
+		 * state was inaccurate.  If a better prediction had been made,
+		 * the CPU might have been woken up from idle by the next timer.
+		 * Assume that to be the case.
+		 */
+		measured_ns = data->next_timer_ns;
+	} else {
+		/* Use the residency recorded for the last idle period. */
+		measured_ns = dev->last_residency_ns;
+
+		/* Deduct the exit latency, or halve very short measurements. */
+		if (measured_ns > 2 * target->exit_latency_ns)
+			measured_ns -= target->exit_latency_ns;
+		else
+			measured_ns /= 2;
+	}
+
+	/* Clamp to the timer distance so that the ratio cannot exceed unity. */
+	if (measured_ns > data->next_timer_ns)
+		measured_ns = data->next_timer_ns;
+
+	/* Decay the bucket's factor by 1/DECAY, then add the new sample. */
+	new_factor = data->correction_factor[data->bucket];
+	new_factor -= new_factor / DECAY;
+
+	if (data->next_timer_ns > 0 && measured_ns < MAX_INTERESTING)
+		new_factor += div64_u64(RESOLUTION * measured_ns,
+					data->next_timer_ns);
+	else
+		/*
+		 * we were idle so long that we count it as a perfect
+		 * prediction
+		 */
+		new_factor += RESOLUTION;
+
+	/*
+	 * We don't want 0 as factor; we always want at least
+	 * a tiny bit of estimated time. Fortunately, due to rounding,
+	 * new_factor will stay nonzero regardless of measured_ns values
+	 * and the compiler can eliminate this test as long as DECAY > 1.
+	 */
+	if (DECAY == 1 && unlikely(new_factor == 0))
+		new_factor = 1;
+
+	data->correction_factor[data->bucket] = new_factor;
+
+	/* Store the sample (in us) in the circular intervals[] history. */
+	data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
+	if (data->interval_ptr >= INTERVALS)
+		data->interval_ptr = 0;
+}
+
+/**
+ * menu_enable_device - reset a CPU's menu governor data; always returns 0
+ * @drv: cpuidle driver (not used)
+ * @dev: the CPU whose per-CPU governor data is reinitialized
+ */
+static int menu_enable_device(struct cpuidle_driver *drv,
+				struct cpuidle_device *dev)
+{
+	struct menu_device *md = &per_cpu(menu_devices, dev->cpu);
+	int bucket = 0;
+
+	memset(md, 0, sizeof(*md));
+
+	/*
+	 * A zeroed correction factor is not wanted (first-time init, CPU
+	 * hotplug and so on), so start every bucket out at unity instead.
+	 */
+	while (bucket < BUCKETS)
+		md->correction_factor[bucket++] = RESOLUTION * DECAY;
+
+	return 0;
+}
+
+static struct cpuidle_governor menu_governor = {
+	.select =	menu_select,		/* picks the idle state */
+	.reflect =	menu_reflect,		/* notes the entered state */
+	.enable =	menu_enable_device,	/* resets per-CPU data */
+	.rating =	20,			/* teo below uses 19 */
+	.name =		"menu",
+};
+
+/**
+ * init_menu - registers the menu governor; returns the registration result
+ */
+static int __init init_menu(void)
+{
+	return cpuidle_register_governor(&menu_governor);
+}
+
+postcore_initcall(init_menu);
diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
new file mode 100644
index 0000000000..7244f71c59
--- /dev/null
+++ b/drivers/cpuidle/governors/teo.c
@@ -0,0 +1,695 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Timer events oriented CPU idle governor
+ *
+ * TEO governor:
+ * Copyright (C) 2018 - 2021 Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * Util-awareness mechanism:
+ * Copyright (C) 2022 Arm Ltd.
+ * Author: Kajetan Puchalski <kajetan.puchalski@arm.com>
+ */
+
+/**
+ * DOC: teo-description
+ *
+ * The idea of this governor is based on the observation that on many systems
+ * timer events are two or more orders of magnitude more frequent than any
+ * other interrupts, so they are likely to be the most significant cause of CPU
+ * wakeups from idle states.  Moreover, information about what happened in the
+ * (relatively recent) past can be used to estimate whether or not the deepest
+ * idle state with target residency within the (known) time till the closest
+ * timer event, referred to as the sleep length, is likely to be suitable for
+ * the upcoming CPU idle period and, if not, then which of the shallower idle
+ * states to choose instead of it.
+ *
+ * Of course, non-timer wakeup sources are more important in some use cases
+ * which can be covered by taking a few most recent idle time intervals of the
+ * CPU into account.  However, even in that context it is not necessary to
+ * consider idle duration values greater than the sleep length, because the
+ * closest timer will ultimately wake up the CPU anyway unless it is woken up
+ * earlier.
+ *
+ * Thus this governor estimates whether or not the prospective idle duration of
+ * a CPU is likely to be significantly shorter than the sleep length and selects
+ * an idle state for it accordingly.
+ *
+ * The computations carried out by this governor are based on using bins whose
+ * boundaries are aligned with the target residency parameter values of the CPU
+ * idle states provided by the %CPUIdle driver in the ascending order.  That is,
+ * the first bin spans from 0 up to, but not including, the target residency of
+ * the second idle state (idle state 1), the second bin spans from the target
+ * residency of idle state 1 up to, but not including, the target residency of
+ * idle state 2, the third bin spans from the target residency of idle state 2
+ * up to, but not including, the target residency of idle state 3 and so on.
+ * The last bin spans from the target residency of the deepest idle state
+ * supplied by the driver to infinity.
+ *
+ * Two metrics called "hits" and "intercepts" are associated with each bin.
+ * They are updated every time before selecting an idle state for the given CPU
+ * in accordance with what happened last time.
+ *
+ * The "hits" metric reflects the relative frequency of situations in which the
+ * sleep length and the idle duration measured after CPU wakeup fall into the
+ * same bin (that is, the CPU appears to wake up "on time" relative to the sleep
+ * length).  In turn, the "intercepts" metric reflects the relative frequency of
+ * situations in which the measured idle duration is so much shorter than the
+ * sleep length that the bin it falls into corresponds to an idle state
+ * shallower than the one whose bin is fallen into by the sleep length (these
+ * situations are referred to as "intercepts" below).
+ *
+ * In addition to the metrics described above, the governor counts recent
+ * intercepts (that is, intercepts that have occurred during the last
+ * %NR_RECENT invocations of it for the given CPU) for each bin.
+ *
+ * In order to select an idle state for a CPU, the governor takes the following
+ * steps (modulo the possible latency constraint that must be taken into account
+ * too):
+ *
+ * 1. Find the deepest CPU idle state whose target residency does not exceed
+ *    the current sleep length (the candidate idle state) and compute 3 sums as
+ *    follows:
+ *
+ *    - The sum of the "hits" and "intercepts" metrics for the candidate state
+ *      and all of the deeper idle states (it represents the cases in which the
+ *      CPU was idle long enough to avoid being intercepted if the sleep length
+ *      had been equal to the current one).
+ *
+ *    - The sum of the "intercepts" metrics for all of the idle states shallower
+ *      than the candidate one (it represents the cases in which the CPU was not
+ *      idle long enough to avoid being intercepted if the sleep length had been
+ *      equal to the current one).
+ *
+ *    - The sum of the numbers of recent intercepts for all of the idle states
+ *      shallower than the candidate one.
+ *
+ * 2. If the second sum is greater than the first one or the third sum is
+ *    greater than %NR_RECENT / 2, the CPU is likely to wake up early, so look
+ *    for an alternative idle state to select.
+ *
+ *    - Traverse the idle states shallower than the candidate one in the
+ *      descending order.
+ *
+ *    - For each of them compute the sum of the "intercepts" metrics and the sum
+ *      of the numbers of recent intercepts over all of the idle states between
+ *      it and the candidate one (including the former and excluding the
+ *      latter).
+ *
+ *    - If each of these sums that needs to be taken into account (because the
+ *      check related to it has indicated that the CPU is likely to wake up
+ *      early) is greater than a half of the corresponding sum computed in step
+ *      1 (which means that the target residency of the state in question had
+ *      not exceeded the idle duration in over a half of the relevant cases),
+ *      select the given idle state instead of the candidate one.
+ *
+ * 3. By default, select the candidate state.
+ *
+ * Util-awareness mechanism:
+ *
+ * The idea behind the util-awareness extension is that there are two distinct
+ * scenarios for the CPU which should result in two different approaches to idle
+ * state selection - utilized and not utilized.
+ *
+ * In this case, 'utilized' means that the average runqueue util of the CPU is
+ * above a certain threshold.
+ *
+ * When the CPU is utilized while going into idle, more likely than not it will
+ * be woken up to do more work soon and so a shallower idle state should be
+ * selected to minimise latency and maximise performance. When the CPU is not
+ * being utilized, the usual metrics-based approach to selecting the deepest
+ * available idle state should be preferred to take advantage of the power
+ * saving.
+ *
+ * In order to achieve this, the governor uses a utilization threshold.
+ * The threshold is computed per-CPU as a percentage of the CPU's capacity
+ * by bit shifting the capacity value. Based on testing, the shift of 6 (~1.56%)
+ * seems to be getting the best results.
+ *
+ * Before selecting the next idle state, the governor compares the current CPU
+ * util to the precomputed util threshold. If it's below, it defaults to the
+ * TEO metrics mechanism. If it's above, the closest shallower idle state will
+ * be selected instead, as long as it is not a polling state.
+ */
+
+#include <linux/cpuidle.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/topology.h>
+#include <linux/tick.h>
+
+#include "gov.h"
+
+/*
+ * The number of bits to shift the CPU's capacity by in order to determine
+ * the utilized threshold (threshold = capacity >> UTIL_THRESHOLD_SHIFT).
+ *
+ * 6 was chosen based on testing as the number that achieved the best balance
+ * of power and performance on average.
+ *
+ * The resulting threshold is high enough to not be triggered by background
+ * noise and low enough to react quickly when activity starts to ramp up.
+ */
+#define UTIL_THRESHOLD_SHIFT 6
+
+/*
+ * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
+ * is used for decreasing metrics on every update (x -= x >> DECAY_SHIFT).
+ */
+#define PULSE		1024
+#define DECAY_SHIFT	3
+
+/*
+ * Number of the most recent idle duration values to take into consideration for
+ * the detection of recent early wakeup patterns (size of the recent_idx[] ring).
+ */
+#define NR_RECENT	9
+
+/**
+ * struct teo_bin - Metrics used by the TEO cpuidle governor.
+ * @intercepts: The "intercepts" metric (grows by PULSE, decays on update).
+ * @hits: The "hits" metric (grows by PULSE, decays on update).
+ * @recent: The number of recent "intercepts" still held in the recent_idx ring.
+ */
+struct teo_bin {
+	unsigned int intercepts;
+	unsigned int hits;
+	unsigned int recent;
+};
+
+/**
+ * struct teo_cpu - CPU data used by the TEO cpuidle governor.
+ * @time_span_ns: Time between idle state selection and post-wakeup update.
+ * @sleep_length_ns: Time till the closest timer event (at the selection time).
+ * @state_bins: Idle state data bins for this CPU.
+ * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
+ * @next_recent_idx: Index of the next @recent_idx entry to update.
+ * @recent_idx: Indices of bins with recent "intercepts" (-1 marks no intercept).
+ * @tick_hits: Number of "hits" after TICK_NSEC.
+ * @util_threshold: Threshold above which the CPU is considered utilized.
+ */
+struct teo_cpu {
+	s64 time_span_ns;
+	s64 sleep_length_ns;
+	struct teo_bin state_bins[CPUIDLE_STATE_MAX];
+	unsigned int total;
+	int next_recent_idx;
+	int recent_idx[NR_RECENT];
+	unsigned int tick_hits;
+	unsigned long util_threshold;
+};
+
+static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
+
+/**
+ * teo_cpu_is_utilized - Tell whether the CPU's util exceeds its threshold.
+ * @cpu: Target CPU.
+ * @cpu_data: Governor CPU data for the target CPU (holds the threshold).
+ *
+ * The check is only carried out with CONFIG_SMP set.  Without it, the CPU is
+ * never regarded as utilized.
+ */
+static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data)
+{
+#ifdef CONFIG_SMP
+	return cpu_data->util_threshold < sched_cpu_util(cpu);
+#else
+	return false;
+#endif
+}
+
+/**
+ * teo_update - Update the bins, recent_idx[] and total of a CPU after wakeup.
+ * @drv: cpuidle driver containing state data.
+ * @dev: Target CPU (its last_state_idx is used to look up the exit latency).
+ */
+static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+{
+	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
+	int i, idx_timer = 0, idx_duration = 0;
+	s64 target_residency_ns;
+	u64 measured_ns;
+
+	if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
+		/*
+		 * One of the safety nets has triggered or the wakeup was close
+		 * enough to the closest timer event expected at the idle state
+		 * selection time to be discarded.
+		 */
+		measured_ns = U64_MAX;
+	} else {
+		u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
+
+		/*
+		 * The computations below are to determine whether or not the
+		 * (saved) time till the next timer event and the measured idle
+		 * duration fall into the same "bin", so use last_residency_ns
+		 * for that instead of time_span_ns which includes the cpuidle
+		 * overhead.
+		 */
+		measured_ns = dev->last_residency_ns;
+		/*
+		 * The delay between the wakeup and the first instruction
+		 * executed by the CPU is not likely to be worst-case every
+		 * time, so take 1/2 of the exit latency as a very rough
+		 * approximation of the average of it.
+		 */
+		if (measured_ns >= lat_ns)
+			measured_ns -= lat_ns / 2;
+		else
+			measured_ns /= 2;
+	}
+
+	cpu_data->total = 0;
+
+	/*
+	 * Decay the "hits" and "intercepts" metrics for all of the bins and
+	 * find the bins that the sleep length and the measured idle duration
+	 * fall into.  The grand total is rebuilt from scratch along the way.
+	 */
+	for (i = 0; i < drv->state_count; i++) {
+		struct teo_bin *bin = &cpu_data->state_bins[i];
+
+		bin->hits -= bin->hits >> DECAY_SHIFT;
+		bin->intercepts -= bin->intercepts >> DECAY_SHIFT;
+
+		cpu_data->total += bin->hits + bin->intercepts;
+
+		target_residency_ns = drv->states[i].target_residency_ns;
+
+		if (target_residency_ns <= cpu_data->sleep_length_ns) {
+			idx_timer = i;
+			if (target_residency_ns <= measured_ns)
+				idx_duration = i;
+		}
+	}
+
+	i = cpu_data->next_recent_idx++;
+	if (cpu_data->next_recent_idx >= NR_RECENT)
+		cpu_data->next_recent_idx = 0;
+
+	if (cpu_data->recent_idx[i] >= 0)
+		cpu_data->state_bins[cpu_data->recent_idx[i]].recent--;
+
+	/*
+	 * If the deepest state's target residency is below the tick length,
+	 * make a record of it to help teo_select() decide whether or not
+	 * to stop the tick.  This effectively adds an extra hits-only bin
+	 * beyond the last state-related one.
+	 */
+	if (target_residency_ns < TICK_NSEC) {
+		cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT;
+
+		cpu_data->total += cpu_data->tick_hits;
+
+		if (TICK_NSEC <= cpu_data->sleep_length_ns) {
+			idx_timer = drv->state_count;
+			if (TICK_NSEC <= measured_ns) {
+				cpu_data->tick_hits += PULSE;
+				goto end;
+			}
+		}
+	}
+
+	/*
+	 * If the measured idle duration falls into the same bin as the sleep
+	 * length, this is a "hit": bump the "hits" metric for that bin.  If not,
+	 * bump "intercepts" and "recent" for the bin fallen into by the measured
+	 * idle duration and record that bin in the recent_idx[] slot.
+	 */
+	if (idx_timer == idx_duration) {
+		cpu_data->state_bins[idx_timer].hits += PULSE;
+		cpu_data->recent_idx[i] = -1;
+	} else {
+		cpu_data->state_bins[idx_duration].intercepts += PULSE;
+		cpu_data->state_bins[idx_duration].recent++;
+		cpu_data->recent_idx[i] = idx_duration;
+	}
+
+end:
+	cpu_data->total += PULSE;
+}
+
+static bool teo_state_ok(int i, struct cpuidle_driver *drv)
+{
+	return !(tick_nohz_tick_stopped() &&	/* unfit only with the tick stopped */
+		 drv->states[i].target_residency_ns < TICK_NSEC);	/* and sub-tick */
+}
+
+/**
+ * teo_find_shallower_state - Pick a usable state shallower than the given one.
+ * @drv: cpuidle driver containing state data.
+ * @dev: Target CPU (provides the per-state disable flags).
+ * @state_idx: Index of the capping state (returned if nothing below is usable).
+ * @duration_ns: Idle duration that the target residency should not exceed.
+ * @no_poll: Skip polling states when set.
+ */
+static int teo_find_shallower_state(struct cpuidle_driver *drv,
+				    struct cpuidle_device *dev, int state_idx,
+				    s64 duration_ns, bool no_poll)
+{
+	int found = state_idx;
+
+	while (--state_idx >= 0) {
+		bool polling = !!(drv->states[state_idx].flags & CPUIDLE_FLAG_POLLING);
+
+		if (!dev->states_usage[state_idx].disable && !(no_poll && polling)) {
+			found = state_idx;
+			if (drv->states[state_idx].target_residency_ns <= duration_ns)
+				break;
+		}
+	}
+	return found;
+}
+
+/**
+ * teo_select - Selects the next idle state to enter and returns its index.
+ * @drv: cpuidle driver containing state data.
+ * @dev: Target CPU.
+ * @stop_tick: Output; cleared when the tick should not be stopped, else untouched.
+ */
+static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		      bool *stop_tick)
+{
+	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
+	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
+	ktime_t delta_tick = TICK_NSEC / 2;
+	unsigned int tick_intercept_sum = 0;
+	unsigned int idx_intercept_sum = 0;
+	unsigned int intercept_sum = 0;
+	unsigned int idx_recent_sum = 0;
+	unsigned int recent_sum = 0;
+	unsigned int idx_hit_sum = 0;
+	unsigned int hit_sum = 0;
+	int constraint_idx = 0;
+	int idx0 = 0, idx = -1;
+	bool alt_intercepts, alt_recent;
+	bool cpu_utilized;
+	s64 duration_ns;
+	int i;
+
+	if (dev->last_state_idx >= 0) {
+		teo_update(drv, dev);
+		dev->last_state_idx = -1;
+	}
+
+	cpu_data->time_span_ns = local_clock();
+	/*
+	 * Set the expected sleep length to infinity in case of an early
+	 * return.
+	 */
+	cpu_data->sleep_length_ns = KTIME_MAX;
+
+	/* Check if there is any choice in the first place. */
+	if (drv->state_count < 2) {
+		idx = 0;
+		goto out_tick;
+	}
+
+	if (!dev->states_usage[0].disable)
+		idx = 0;
+
+	cpu_utilized = teo_cpu_is_utilized(dev->cpu, cpu_data);
+	/*
+	 * If the CPU is being utilized over the threshold and there are only 2
+	 * states to choose from, the metrics need not be considered, so choose
+	 * the shallowest non-polling state and exit.
+	 */
+	if (drv->state_count < 3 && cpu_utilized) {
+		/*
+		 * If state 0 is enabled and it is not a polling one, select it
+		 * right away unless the scheduler tick has been stopped, in
+		 * which case care needs to be taken to leave the CPU in a deep
+		 * enough state in case it is not woken up any time soon after
+		 * all.  If state 1 is disabled, though, state 0 must be used
+		 * anyway.
+		 */
+		if ((!idx && !(drv->states[0].flags & CPUIDLE_FLAG_POLLING) &&
+		    teo_state_ok(0, drv)) || dev->states_usage[1].disable) {
+			idx = 0;
+			goto out_tick;
+		}
+		/* Assume that state 1 is not a polling one and use it. */
+		idx = 1;
+		duration_ns = drv->states[1].target_residency_ns;
+		goto end;
+	}
+
+	/* Compute the sums of metrics for early wakeup pattern detection. */
+	for (i = 1; i < drv->state_count; i++) {
+		struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
+		struct cpuidle_state *s = &drv->states[i];
+
+		/*
+		 * Update the sums of idle state metrics for all of the states
+		 * shallower than the current one.
+		 */
+		intercept_sum += prev_bin->intercepts;
+		hit_sum += prev_bin->hits;
+		recent_sum += prev_bin->recent;
+
+		if (dev->states_usage[i].disable)
+			continue;
+
+		if (idx < 0)
+			idx0 = i; /* first enabled state */
+
+		idx = i;
+
+		if (s->exit_latency_ns <= latency_req)
+			constraint_idx = i;
+
+		/* Save the sums for the current state. */
+		idx_intercept_sum = intercept_sum;
+		idx_hit_sum = hit_sum;
+		idx_recent_sum = recent_sum;
+	}
+
+	/* Avoid unnecessary overhead. */
+	if (idx < 0) {
+		idx = 0; /* No states enabled, must use 0. */
+		goto out_tick;
+	}
+
+	if (idx == idx0) {
+		/*
+		 * Only one idle state is enabled, so use it, but do not
+		 * allow the tick to be stopped if it is shallow enough.
+		 */
+		duration_ns = drv->states[idx].target_residency_ns;
+		goto end;
+	}
+
+	tick_intercept_sum = intercept_sum +
+			cpu_data->state_bins[drv->state_count-1].intercepts;
+
+	/*
+	 * If the sum of the intercepts metric for all of the idle states
+	 * shallower than the current candidate one (idx) is greater than the
+	 * sum of the intercepts and hits metrics for the candidate state and
+	 * all of the deeper states, or the sum of the numbers of recent
+	 * intercepts over all of the states shallower than the candidate one
+	 * is greater than a half of the number of recent events taken into
+	 * account, a shallower idle state is likely to be a better choice.
+	 */
+	alt_intercepts = 2 * idx_intercept_sum > cpu_data->total - idx_hit_sum;
+	alt_recent = idx_recent_sum > NR_RECENT / 2;
+	if (alt_recent || alt_intercepts) {
+		int first_suitable_idx = idx;
+
+		/*
+		 * Look for the deepest idle state whose target residency had
+		 * not exceeded the idle duration in over a half of the relevant
+		 * cases (both with respect to intercepts overall and with
+		 * respect to the recent intercepts only) in the past.
+		 *
+		 * Take the possible duration limitation present if the tick
+		 * has been stopped already into account.
+		 */
+		intercept_sum = 0;
+		recent_sum = 0;
+
+		for (i = idx - 1; i >= 0; i--) {
+			struct teo_bin *bin = &cpu_data->state_bins[i];
+
+			intercept_sum += bin->intercepts;
+			recent_sum += bin->recent;
+
+			if ((!alt_recent || 2 * recent_sum > idx_recent_sum) &&
+			    (!alt_intercepts ||
+			     2 * intercept_sum > idx_intercept_sum)) {
+				/*
+				 * Use the current state unless it is too
+				 * shallow or disabled, in which case take the
+				 * first enabled state that is deep enough.
+				 */
+				if (teo_state_ok(i, drv) &&
+				    !dev->states_usage[i].disable)
+					idx = i;
+				else
+					idx = first_suitable_idx;
+
+				break;
+			}
+
+			if (dev->states_usage[i].disable)
+				continue;
+
+			if (!teo_state_ok(i, drv)) {
+				/*
+				 * The current state is too shallow, but if an
+				 * alternative candidate state has been found,
+				 * it may still turn out to be a better choice.
+				 */
+				if (first_suitable_idx != idx)
+					continue;
+
+				break;
+			}
+
+			first_suitable_idx = i;
+		}
+	}
+
+	/*
+	 * If there is a latency constraint, it may be necessary to select an
+	 * idle state shallower than the current candidate one.
+	 */
+	if (idx > constraint_idx)
+		idx = constraint_idx;
+
+	/*
+	 * If the CPU is being utilized over the threshold, choose a shallower
+	 * non-polling state to improve latency, unless the scheduler tick has
+	 * been stopped already and the shallower state's target residency is
+	 * not sufficiently large.
+	 */
+	if (cpu_utilized) {
+		i = teo_find_shallower_state(drv, dev, idx, KTIME_MAX, true);
+		if (teo_state_ok(i, drv))
+			idx = i;
+	}
+
+	/*
+	 * Skip the timers check if state 0 is the current candidate one,
+	 * because an immediate non-timer wakeup is expected in that case.
+	 */
+	if (!idx)
+		goto out_tick;
+
+	/*
+	 * If state 0 is a polling one, check if the target residency of
+	 * the current candidate state is low enough and skip the timers
+	 * check in that case too.
+	 */
+	if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) &&
+	    drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS)
+		goto out_tick;
+
+	duration_ns = tick_nohz_get_sleep_length(&delta_tick);
+	cpu_data->sleep_length_ns = duration_ns;
+
+	/*
+	 * If the closest expected timer is before the target residency of the
+	 * candidate state, a shallower one needs to be found.
+	 */
+	if (drv->states[idx].target_residency_ns > duration_ns) {
+		i = teo_find_shallower_state(drv, dev, idx, duration_ns, false);
+		if (teo_state_ok(i, drv))
+			idx = i;
+	}
+
+	/*
+	 * If the selected state's target residency is below the tick length
+	 * and intercepts occurring before the tick length are the majority of
+	 * total wakeup events, do not stop the tick.
+	 */
+	if (drv->states[idx].target_residency_ns < TICK_NSEC &&
+	    tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8)
+		duration_ns = TICK_NSEC / 2;
+
+end:
+	/*
+	 * Allow the tick to be stopped unless the selected state is a polling
+	 * one or the expected idle duration is shorter than the tick period
+	 * length.
+	 */
+	if ((!(drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
+	    duration_ns >= TICK_NSEC) || tick_nohz_tick_stopped())
+		return idx;
+
+	/*
+	 * The tick is not going to be stopped, so if the target residency of
+	 * the state to be returned is not within the time till the closest
+	 * timer including the tick, try to correct that.
+	 */
+	if (idx > idx0 &&
+	    drv->states[idx].target_residency_ns > delta_tick)
+		idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false);
+
+out_tick:
+	*stop_tick = false;
+	return idx;
+}
+
+/**
+ * teo_reflect - Record the entered state and the time span of the idle period.
+ * @dev: Target CPU.
+ * @state: Entered state.
+ */
+static void teo_reflect(struct cpuidle_device *dev, int state)
+{
+	struct teo_cpu *teo = per_cpu_ptr(&teo_cpus, dev->cpu);
+
+	dev->last_state_idx = state;
+	/*
+	 * A "natural" wakeup: turn the timestamp taken at the idle state
+	 * selection time into the time elapsed since then.
+	 */
+	if (!dev->poll_time_limit &&
+	    !(tick_nohz_idle_got_tick() && teo->sleep_length_ns > TICK_NSEC)) {
+		teo->time_span_ns = local_clock() - teo->time_span_ns;
+		return;
+	}
+	/* A safety net fired: assume idleness for the entire sleep length. */
+	dev->poll_time_limit = false;
+	teo->time_span_ns = teo->sleep_length_ns;
+}
+
+/**
+ * teo_enable_device - Reset the governor's data for the target CPU; returns 0.
+ * @drv: cpuidle driver (not used).
+ * @dev: Target CPU.
+ */
+static int teo_enable_device(struct cpuidle_driver *drv,
+			     struct cpuidle_device *dev)
+{
+	struct teo_cpu *teo = per_cpu_ptr(&teo_cpus, dev->cpu);
+	unsigned long capacity = arch_scale_cpu_capacity(dev->cpu);
+	int slot = NR_RECENT;
+
+	memset(teo, 0, sizeof(struct teo_cpu));
+	/* A zeroed recent_idx[] would name bin 0, so mark every slot unused. */
+	while (slot-- > 0)
+		teo->recent_idx[slot] = -1;
+
+	teo->util_threshold = capacity >> UTIL_THRESHOLD_SHIFT;
+	return 0;
+}
+
+static struct cpuidle_governor teo_governor = {
+	.select =	teo_select,		/* picks the idle state */
+	.reflect =	teo_reflect,		/* notes the entered state */
+	.enable =	teo_enable_device,	/* resets per-CPU data */
+	.rating =	19,			/* menu above uses 20 */
+	.name =		"teo",
+};
+
+static int __init teo_governor_init(void)
+{
+	return cpuidle_register_governor(&teo_governor);	/* result passed up */
+}
+
+postcore_initcall(teo_governor_init);
diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c
new file mode 100644
index 0000000000..9b6d90a726
--- /dev/null
+++ b/drivers/cpuidle/poll_state.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * poll_state.c - Polling idle state
+ */
+
+#include <linux/cpuidle.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/idle.h>
+
+#define POLL_IDLE_RELAX_COUNT	200	/* loop passes skipped between clock reads */
+/* Poll, IRQs enabled, until a resched is needed or the poll time limit passes. */
+static int __cpuidle poll_idle(struct cpuidle_device *dev,
+			       struct cpuidle_driver *drv, int index)
+{
+	u64 time_start;
+
+	time_start = local_clock_noinstr();
+	/* Set again below only when the loop is left because of the limit. */
+	dev->poll_time_limit = false;
+
+	raw_local_irq_enable();
+	if (!current_set_polling_and_test()) {
+		unsigned int loop_count = 0;
+		u64 limit;
+
+		limit = cpuidle_poll_time(drv, dev);
+
+		while (!need_resched()) {
+			cpu_relax();
+			if (loop_count++ < POLL_IDLE_RELAX_COUNT)
+				continue;
+			/* Time to compare the elapsed poll time with the limit. */
+			loop_count = 0;
+			if (local_clock_noinstr() - time_start > limit) {
+				dev->poll_time_limit = true;
+				break;
+			}
+		}
+	}
+	raw_local_irq_disable();
+
+	current_clr_polling();
+	/* The requested state index is handed back unchanged. */
+	return index;
+}
+
+void cpuidle_poll_state_init(struct cpuidle_driver *drv)
+{
+	struct cpuidle_state *poll = &drv->states[0];	/* slot 0 of the driver */
+
+	poll->enter = poll_idle;
+	poll->flags = CPUIDLE_FLAG_POLLING;
+	poll->power_usage = -1;
+	poll->exit_latency_ns = 0;
+	poll->target_residency_ns = 0;
+	poll->exit_latency = 0;
+	poll->target_residency = 0;
+	snprintf(poll->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE");
+	snprintf(poll->name, CPUIDLE_NAME_LEN, "POLL");
+}
+EXPORT_SYMBOL_GPL(cpuidle_poll_state_init);
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
new file mode 100644
index 0000000000..d6f5da61cb
--- /dev/null
+++ b/drivers/cpuidle/sysfs.c
@@ -0,0 +1,747 @@
+/*
+ * sysfs.c - sysfs support
+ *
+ * (C) 2006-2007 Shaohua Li <shaohua.li@intel.com>
+ *
+ * This code is licenced under the GPL.
+ */
+
+#include <linux/kernel.h>
+#include <linux/cpuidle.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/completion.h>
+#include <linux/capability.h>
+#include <linux/device.h>
+#include <linux/kobject.h>
+
+#include "cpuidle.h"
+
+/* sysfs show: space-separated names of all registered governors, then '\n'. */
+static ssize_t show_available_governors(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	ssize_t i = 0;
+	struct cpuidle_governor *tmp;
+
+	mutex_lock(&cpuidle_lock);
+	list_for_each_entry(tmp, &cpuidle_governors, governor_list) {
+		/* Stop while one more name, a space and '\n' still fit. */
+		if (i >= (ssize_t) (PAGE_SIZE - (CPUIDLE_NAME_LEN + 2)))
+			break;
+		i += sysfs_emit_at(buf, i, "%s ", tmp->name);
+	}
+
+	i += sysfs_emit_at(buf, i, "\n");
+	mutex_unlock(&cpuidle_lock);
+	return i;
+}
+
+/* sysfs show: name of the current cpuidle driver, or "none" without one. */
+static ssize_t show_current_driver(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	ssize_t ret;
+	struct cpuidle_driver *drv;
+
+	/* The driver pointer is only looked at under cpuidle_driver_lock. */
+	spin_lock(&cpuidle_driver_lock);
+	drv = cpuidle_get_driver();
+	if (drv)
+		ret = sysfs_emit(buf, "%s\n", drv->name);
+	else
+		ret = sysfs_emit(buf, "none\n");
+	spin_unlock(&cpuidle_driver_lock);
+	return ret;
+}
+
+/* sysfs show: name of the governor in use, or "none" if there is none. */
+static ssize_t show_current_governor(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	ssize_t ret;
+
+	mutex_lock(&cpuidle_lock);
+	if (cpuidle_curr_governor)
+		ret = sysfs_emit(buf, "%s\n", cpuidle_curr_governor->name);
+	else
+		ret = sysfs_emit(buf, "none\n");
+	mutex_unlock(&cpuidle_lock);
+
+	return ret;
+}
+
+/* sysfs store: switch to the registered governor named in @buf. */
+static ssize_t store_current_governor(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	char wanted[CPUIDLE_NAME_LEN + 1];
+	struct cpuidle_governor *pos;
+	int err = -EINVAL;	/* kept if no governor matches the name */
+
+	if (sscanf(buf, "%" __stringify(CPUIDLE_NAME_LEN) "s", wanted) != 1)
+		return -EINVAL;
+
+	mutex_lock(&cpuidle_lock);
+	list_for_each_entry(pos, &cpuidle_governors, governor_list) {
+		if (strncmp(pos->name, wanted, CPUIDLE_NAME_LEN))
+			continue;
+		err = cpuidle_switch_governor(pos);
+		break;
+	}
+	mutex_unlock(&cpuidle_lock);
+	if (err)
+		return err;
+	return count;
+}
+
+static DEVICE_ATTR(available_governors, 0444, show_available_governors, NULL);
+static DEVICE_ATTR(current_driver, 0444, show_current_driver, NULL);
+static DEVICE_ATTR(current_governor, 0644, show_current_governor,
+				   store_current_governor);
+static DEVICE_ATTR(current_governor_ro, 0444, show_current_governor, NULL);
+/* All four attributes are published together by cpuidle_attr_group. */
+static struct attribute *cpuidle_attrs[] = {
+	&dev_attr_available_governors.attr,
+	&dev_attr_current_driver.attr,
+	&dev_attr_current_governor.attr,
+	&dev_attr_current_governor_ro.attr,
+	NULL
+};
+
+static struct attribute_group cpuidle_attr_group = {
+	.attrs = cpuidle_attrs,
+	.name = "cpuidle",
+};
+
+/**
+ * cpuidle_add_interface - create the global "cpuidle" sysfs attribute group
+ */
+int cpuidle_add_interface(void)
+{
+	struct device *root = bus_get_dev_root(&cpu_subsys);
+	int err = -EINVAL;	/* returned when the cpu bus has no root device */
+
+	if (root) {
+		err = sysfs_create_group(&root->kobj, &cpuidle_attr_group);
+		/* Done with the root device obtained above. */
+		put_device(root);
+	}
+	return err;
+}
+
+/**
+ * cpuidle_remove_interface - remove the global "cpuidle" attribute group
+ * @dev: device whose kobject the group is removed from
+ */
+void cpuidle_remove_interface(struct device *dev)
+{
+	sysfs_remove_group(&dev->kobj, &cpuidle_attr_group);
+}
+
+struct cpuidle_attr {
+	struct attribute attr;	/* embedded; recovered by attr_to_cpuidleattr() */
+	ssize_t (*show)(struct cpuidle_device *, char *);
+	ssize_t (*store)(struct cpuidle_device *, const char *, size_t count);
+};
+
+#define attr_to_cpuidleattr(a) container_of(a, struct cpuidle_attr, attr)
+
+struct cpuidle_device_kobj {
+	struct cpuidle_device *dev;
+	struct completion kobj_unregister;	/* completed by the kobject release */
+	struct kobject kobj;
+};
+
+/*
+ * Map the kobject embedded in a cpuidle_device_kobj back to its device.
+ */
+static inline struct cpuidle_device *to_cpuidle_device(struct kobject *kobj)
+{
+	return container_of(kobj, struct cpuidle_device_kobj, kobj)->dev;
+}
+
+/* sysfs_ops show: run the attribute's show() under cpuidle_lock, else -EIO. */
+static ssize_t cpuidle_show(struct kobject *kobj, struct attribute *attr,
+			    char *buf)
+{
+	struct cpuidle_attr *entry = attr_to_cpuidleattr(attr);
+	int len;
+
+	if (!entry->show)
+		return -EIO;
+	mutex_lock(&cpuidle_lock);
+	len = entry->show(to_cpuidle_device(kobj), buf);
+	mutex_unlock(&cpuidle_lock);
+	return len;
+}
+
+/* sysfs_ops store: run the attribute's store() under cpuidle_lock, else -EIO. */
+static ssize_t cpuidle_store(struct kobject *kobj, struct attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct cpuidle_attr *entry = attr_to_cpuidleattr(attr);
+	int written;
+
+	if (!entry->store)
+		return -EIO;
+	mutex_lock(&cpuidle_lock);
+	written = entry->store(to_cpuidle_device(kobj), buf, count);
+	mutex_unlock(&cpuidle_lock);
+	return written;
+}
+
+static const struct sysfs_ops cpuidle_sysfs_ops = {
+	.show = cpuidle_show,
+	.store = cpuidle_store,
+};
+
+static void cpuidle_sysfs_release(struct kobject *kobj)
+{
+	struct cpuidle_device_kobj *kdev =
+		container_of(kobj, struct cpuidle_device_kobj, kobj);
+	/* Unblocks the wait_for_completion() in cpuidle_remove_sysfs(). */
+	complete(&kdev->kobj_unregister);
+}
+
+static const struct kobj_type ktype_cpuidle = {
+	.sysfs_ops = &cpuidle_sysfs_ops,
+	.release = cpuidle_sysfs_release,
+};
+
+struct cpuidle_state_attr {
+	struct attribute attr;	/* embedded; recovered by attr_to_stateattr() */
+	ssize_t (*show)(struct cpuidle_state *, \
+					struct cpuidle_state_usage *, char *);
+	ssize_t (*store)(struct cpuidle_state *, \
+			struct cpuidle_state_usage *, const char *, size_t);
+};
+/* Declare attr_<_name>: 0444 read-only, or 0644 read-write with a store. */
+#define define_one_state_ro(_name, show) \
+static struct cpuidle_state_attr attr_##_name = __ATTR(_name, 0444, show, NULL)
+
+#define define_one_state_rw(_name, show, store) \
+static struct cpuidle_state_attr attr_##_name = __ATTR(_name, 0644, show, store)
+/* show_state_<name>(): print state-><name> with the "%u" format. */
+#define define_show_state_function(_name) \
+static ssize_t show_state_##_name(struct cpuidle_state *state, \
+			 struct cpuidle_state_usage *state_usage, char *buf) \
+{ \
+	return sysfs_emit(buf, "%u\n", state->_name);\
+}
+/* show_state_<name>(): print state_usage-><name> with the "%llu" format. */
+#define define_show_state_ull_function(_name) \
+static ssize_t show_state_##_name(struct cpuidle_state *state, \
+				  struct cpuidle_state_usage *state_usage, \
+				  char *buf)				\
+{ \
+	return sysfs_emit(buf, "%llu\n", state_usage->_name);\
+}
+/* show_state_<name>(): print the string state-><name>, "<null>" if empty. */
+#define define_show_state_str_function(_name) \
+static ssize_t show_state_##_name(struct cpuidle_state *state, \
+				  struct cpuidle_state_usage *state_usage, \
+				  char *buf)				\
+{ \
+	if (state->_name[0] == '\0')\
+		return sysfs_emit(buf, "<null>\n");\
+	return sysfs_emit(buf, "%s\n", state->_name);\
+}
+/* show_state_<name>(): print state-><name>_ns converted to microseconds. */
+#define define_show_state_time_function(_name) \
+static ssize_t show_state_##_name(struct cpuidle_state *state, \
+				  struct cpuidle_state_usage *state_usage, \
+				  char *buf) \
+{ \
+	return sysfs_emit(buf, "%llu\n", ktime_to_us(state->_name##_ns)); \
+}
+
+define_show_state_time_function(exit_latency)
+define_show_state_time_function(target_residency)
+define_show_state_function(power_usage)
+define_show_state_ull_function(usage)
+define_show_state_ull_function(rejected)
+define_show_state_str_function(name)
+define_show_state_str_function(desc)
+define_show_state_ull_function(above)
+define_show_state_ull_function(below)
+
+/* sysfs show: state_usage->time_ns converted to microseconds. */
+static ssize_t show_state_time(struct cpuidle_state *state,
+			       struct cpuidle_state_usage *state_usage, char *buf)
+{
+	return sysfs_emit(buf, "%llu\n", ktime_to_us(state_usage->time_ns));
+}
+
+/* sysfs show: non-zero only if the user-disable bit is set for this state. */
+static ssize_t show_state_disable(struct cpuidle_state *state,
+				  struct cpuidle_state_usage *state_usage, char *buf)
+{
+	return sysfs_emit(buf, "%llu\n",
+			  state_usage->disable & CPUIDLE_STATE_DISABLED_BY_USER);
+}
+
+/* sysfs store: set (non-zero input) or clear (zero) the user-disable bit. */
+static ssize_t store_state_disable(struct cpuidle_state *state,
+				   struct cpuidle_state_usage *state_usage,
+				   const char *buf, size_t size)
+{
+	unsigned int request;
+	int rc;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	rc = kstrtouint(buf, 0, &request);
+	if (rc)
+		return rc;
+
+	if (!request)
+		state_usage->disable &= ~CPUIDLE_STATE_DISABLED_BY_USER;
+	else
+		state_usage->disable |= CPUIDLE_STATE_DISABLED_BY_USER;
+	return size;
+}
+
+/* sysfs show: "disabled" if CPUIDLE_FLAG_OFF is set, otherwise "enabled". */
+static ssize_t show_state_default_status(struct cpuidle_state *state,
+					  struct cpuidle_state_usage *state_usage, char *buf)
+{
+	return sysfs_emit(buf, "%s\n",
+			  state->flags & CPUIDLE_FLAG_OFF ? "disabled" : "enabled");
+}
+
+define_one_state_ro(name, show_state_name);
+define_one_state_ro(desc, show_state_desc);
+define_one_state_ro(latency, show_state_exit_latency);	/* exit_latency_ns in us */
+define_one_state_ro(residency, show_state_target_residency); /* likewise, in us */
+define_one_state_ro(power, show_state_power_usage);
+define_one_state_ro(usage, show_state_usage);
+define_one_state_ro(rejected, show_state_rejected);
+define_one_state_ro(time, show_state_time);
+define_one_state_rw(disable, show_state_disable, store_state_disable);
+define_one_state_ro(above, show_state_above);
+define_one_state_ro(below, show_state_below);
+define_one_state_ro(default_status, show_state_default_status);
+/* Every state kobject gets these through ktype_state_cpuidle.default_groups. */
+static struct attribute *cpuidle_state_default_attrs[] = {
+	&attr_name.attr,
+	&attr_desc.attr,
+	&attr_latency.attr,
+	&attr_residency.attr,
+	&attr_power.attr,
+	&attr_usage.attr,
+	&attr_rejected.attr,
+	&attr_time.attr,
+	&attr_disable.attr,
+	&attr_above.attr,
+	&attr_below.attr,
+	&attr_default_status.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(cpuidle_state_default);
+
+struct cpuidle_state_kobj {
+	struct cpuidle_state *state;	/* &drv->states[i] */
+	struct cpuidle_state_usage *state_usage;	/* &device->states_usage[i] */
+	struct completion kobj_unregister;	/* completed by the kobject release */
+	struct kobject kobj;
+	struct cpuidle_device *device;	/* owner; store resets its poll_limit_ns */
+};
+
+#ifdef CONFIG_SUSPEND
+#define define_show_state_s2idle_ull_function(_name) \
+static ssize_t show_state_s2idle_##_name(struct cpuidle_state *state, \
+					 struct cpuidle_state_usage *state_usage, \
+					 char *buf)				\
+{ \
+	return sysfs_emit(buf, "%llu\n", state_usage->s2idle_##_name);\
+}
+/* Generates show_state_s2idle_usage() and show_state_s2idle_time(). */
+define_show_state_s2idle_ull_function(usage)
+define_show_state_s2idle_ull_function(time)
+
+#define define_one_state_s2idle_ro(_name, show) \
+static struct cpuidle_state_attr attr_s2idle_##_name = \
+	__ATTR(_name, 0444, show, NULL)
+
+define_one_state_s2idle_ro(usage, show_state_s2idle_usage);
+define_one_state_s2idle_ro(time, show_state_s2idle_time);
+
+static struct attribute *cpuidle_state_s2idle_attrs[] = {
+	&attr_s2idle_usage.attr,
+	&attr_s2idle_time.attr,
+	NULL
+};
+
+static const struct attribute_group cpuidle_state_s2idle_group = {
+	.name	= "s2idle",
+	.attrs	= cpuidle_state_s2idle_attrs,
+};
+
+/* Add the "s2idle" group for states with enter_s2idle; failure is only logged. */
+static void cpuidle_add_s2idle_attr_group(struct cpuidle_state_kobj *kobj)
+{
+	const struct attribute_group *grp = &cpuidle_state_s2idle_group;
+
+	if (!kobj->state->enter_s2idle)
+		return;
+
+	if (sysfs_create_group(&kobj->kobj, grp))
+		pr_debug("%s: sysfs attribute group not created\n", __func__);
+}
+
+static void cpuidle_remove_s2idle_attr_group(struct cpuidle_state_kobj *kobj)
+{
+	if (kobj->state->enter_s2idle)	/* same condition as on the add side */
+		sysfs_remove_group(&kobj->kobj, &cpuidle_state_s2idle_group);
+}
+#else	/* !CONFIG_SUSPEND: no "s2idle" group, both helpers are no-ops */
+static inline void cpuidle_add_s2idle_attr_group(struct cpuidle_state_kobj *kobj) { }
+static inline void cpuidle_remove_s2idle_attr_group(struct cpuidle_state_kobj *kobj) { }
+#endif /* CONFIG_SUSPEND */
+/* Accessors from a state kobject, and from an attribute to its wrapper. */
+#define kobj_to_state_obj(k) container_of(k, struct cpuidle_state_kobj, kobj)
+#define kobj_to_state(k) (kobj_to_state_obj(k)->state)
+#define kobj_to_state_usage(k) (kobj_to_state_obj(k)->state_usage)
+#define kobj_to_device(k) (kobj_to_state_obj(k)->device)
+#define attr_to_stateattr(a) container_of(a, struct cpuidle_state_attr, attr)
+
+/* sysfs_ops show for a state kobject: call the attribute's show(), else -EIO. */
+static ssize_t cpuidle_state_show(struct kobject *kobj, struct attribute *attr,
+				  char *buf)
+{
+	struct cpuidle_state_kobj *sobj = kobj_to_state_obj(kobj);
+	struct cpuidle_state_attr *entry = attr_to_stateattr(attr);
+	int len;
+
+	if (!entry->show)
+		return -EIO;
+	len = entry->show(sobj->state, sobj->state_usage, buf);
+	return len;
+}
+
+/* sysfs_ops store for a state kobject; always drops the cached poll limit. */
+static ssize_t cpuidle_state_store(struct kobject *kobj, struct attribute *attr,
+				   const char *buf, size_t size)
+{
+	struct cpuidle_state_kobj *sobj = kobj_to_state_obj(kobj);
+	struct cpuidle_state_attr *entry = attr_to_stateattr(attr);
+	int written = -EIO;
+
+	if (entry->store)
+		written = entry->store(sobj->state, sobj->state_usage,
+				       buf, size);
+
+	/* reset the poll time cache, whatever the store outcome */
+	sobj->device->poll_limit_ns = 0;
+
+	return written;
+}
+
+static const struct sysfs_ops cpuidle_state_sysfs_ops = {
+	.show = cpuidle_state_show,
+	.store = cpuidle_state_store,
+};
+
+static void cpuidle_state_sysfs_release(struct kobject *kobj)
+{
+	struct cpuidle_state_kobj *state_obj = kobj_to_state_obj(kobj);
+	/* Unblocks the wait_for_completion() in cpuidle_free_state_kobj(). */
+	complete(&state_obj->kobj_unregister);
+}
+
+static const struct kobj_type ktype_state_cpuidle = {
+	.sysfs_ops = &cpuidle_state_sysfs_ops,
+	.default_groups = cpuidle_state_default_groups,
+	.release = cpuidle_state_sysfs_release,
+};
+
+static inline void cpuidle_free_state_kobj(struct cpuidle_device *device, int i)
+{
+	cpuidle_remove_s2idle_attr_group(device->kobjs[i]);
+	kobject_put(&device->kobjs[i]->kobj);
+	wait_for_completion(&device->kobjs[i]->kobj_unregister); /* set by release */
+	kfree(device->kobjs[i]);
+	device->kobjs[i] = NULL;	/* slot no longer points at freed memory */
+}
+
+/**
+ * cpuidle_add_state_sysfs - adds cpuidle states sysfs attributes
+ * @device: the target device
+ */
+static int cpuidle_add_state_sysfs(struct cpuidle_device *device)
+{
+	int i, ret = -ENOMEM;
+	struct cpuidle_state_kobj *kobj;
+	struct cpuidle_device_kobj *kdev = device->kobj_dev;
+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(device);
+
+	/* one "state<i>" kobject per driver state */
+	for (i = 0; i < drv->state_count; i++) {
+		kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
+		if (!kobj) {
+			ret = -ENOMEM;
+			goto error_state;
+		}
+		kobj->state = &drv->states[i];
+		kobj->state_usage = &device->states_usage[i];
+		kobj->device = device;
+		init_completion(&kobj->kobj_unregister);
+		ret = kobject_init_and_add(&kobj->kobj, &ktype_state_cpuidle,
+					   &kdev->kobj, "state%d", i);
+		if (ret) {
+			/* free only after the release callback has run */
+			kobject_put(&kobj->kobj);
+			wait_for_completion(&kobj->kobj_unregister);
+			kfree(kobj);
+			goto error_state;
+		}
+		cpuidle_add_s2idle_attr_group(kobj);
+		kobject_uevent(&kobj->kobj, KOBJ_ADD);
+		device->kobjs[i] = kobj;
+	}
+	return 0;
+
+error_state:
+	for (i = i - 1; i >= 0; i--)
+		cpuidle_free_state_kobj(device, i);
+	return ret;
+}
+
+/**
+ * cpuidle_remove_state_sysfs - tear down every per-state sysfs kobject
+ * @device: the target device
+ */
+static void cpuidle_remove_state_sysfs(struct cpuidle_device *device)
+{
+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(device);
+	int idx = 0;
+
+	while (idx < drv->state_count)
+		cpuidle_free_state_kobj(device, idx++);
+}
+
+#ifdef CONFIG_CPU_IDLE_MULTIPLE_DRIVERS
+#define kobj_to_driver_kobj(k) container_of(k, struct cpuidle_driver_kobj, kobj)
+#define attr_to_driver_attr(a) container_of(a, struct cpuidle_driver_attr, attr)
+/* Declare attr_driver_<_name> as a read-only (0444) driver attribute. */
+#define define_one_driver_ro(_name, show)                       \
+	static struct cpuidle_driver_attr attr_driver_##_name = \
+		__ATTR(_name, 0444, show, NULL)
+
+struct cpuidle_driver_kobj {
+	struct cpuidle_driver *drv;
+	struct completion kobj_unregister;	/* completed by the kobject release */
+	struct kobject kobj;
+};
+
+struct cpuidle_driver_attr {
+	struct attribute attr;	/* embedded; recovered by attr_to_driver_attr() */
+	ssize_t (*show)(struct cpuidle_driver *, char *);
+	ssize_t (*store)(struct cpuidle_driver *, const char *, size_t);
+};
+
+/* sysfs show: name of the driver passed in, or "none" when @drv is NULL. */
+static ssize_t show_driver_name(struct cpuidle_driver *drv, char *buf)
+{
+	ssize_t ret;
+
+	spin_lock(&cpuidle_driver_lock);
+	ret = sysfs_emit(buf, "%s\n", drv ? drv->name : "none");
+	spin_unlock(&cpuidle_driver_lock);
+	return ret;
+}
+
+/* kobject release: let cpuidle_remove_driver_sysfs() go on to kfree(). */
+static void cpuidle_driver_sysfs_release(struct kobject *kobj)
+{
+	complete(&kobj_to_driver_kobj(kobj)->kobj_unregister);
+}
+
+/* sysfs_ops show for the driver kobject; -EIO without a show(). */
+static ssize_t cpuidle_driver_show(struct kobject *kobj, struct attribute *attr,
+				   char *buf)
+{
+	struct cpuidle_driver_attr *entry = attr_to_driver_attr(attr);
+	int len;
+
+	if (!entry->show)
+		return -EIO;
+	len = entry->show(kobj_to_driver_kobj(kobj)->drv, buf);
+	return len;
+}
+
+/* sysfs_ops store for the driver kobject; -EIO without a store(). */
+static ssize_t cpuidle_driver_store(struct kobject *kobj, struct attribute *attr,
+				    const char *buf, size_t size)
+{
+	struct cpuidle_driver_attr *entry = attr_to_driver_attr(attr);
+	int written;
+
+	if (!entry->store)
+		return -EIO;
+	written = entry->store(kobj_to_driver_kobj(kobj)->drv, buf, size);
+	return written;
+}
+
+define_one_driver_ro(name, show_driver_name);
+
+static const struct sysfs_ops cpuidle_driver_sysfs_ops = {
+	.show = cpuidle_driver_show,
+	.store = cpuidle_driver_store,
+};
+/* The single default attribute of the "driver" kobject: "name". */
+static struct attribute *cpuidle_driver_default_attrs[] = {
+	&attr_driver_name.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(cpuidle_driver_default);
+
+static const struct kobj_type ktype_driver_cpuidle = {
+	.sysfs_ops = &cpuidle_driver_sysfs_ops,
+	.default_groups = cpuidle_driver_default_groups,
+	.release = cpuidle_driver_sysfs_release,
+};
+
+/**
+ * cpuidle_add_driver_sysfs - adds the driver name sysfs attribute
+ * @dev: the target device
+ */
+static int cpuidle_add_driver_sysfs(struct cpuidle_device *dev)
+{
+	struct cpuidle_driver_kobj *kdrv;
+	struct cpuidle_device_kobj *kdev = dev->kobj_dev;
+	int ret;
+
+	kdrv = kzalloc(sizeof(*kdrv), GFP_KERNEL);
+	if (!kdrv)
+		return -ENOMEM;
+
+	kdrv->drv = cpuidle_get_cpu_driver(dev);
+	init_completion(&kdrv->kobj_unregister);
+
+	ret = kobject_init_and_add(&kdrv->kobj, &ktype_driver_cpuidle,
+				   &kdev->kobj, "driver");
+	if (ret) {
+		/* free only after the release callback has run */
+		kobject_put(&kdrv->kobj);
+		wait_for_completion(&kdrv->kobj_unregister);
+		kfree(kdrv);
+		return ret;
+	}
+
+	kobject_uevent(&kdrv->kobj, KOBJ_ADD);
+	dev->kobj_driver = kdrv;
+	return 0;
+}
+
+/**
+ * cpuidle_remove_driver_sysfs - removes the driver name sysfs attribute
+ * @dev: the target device; its kobj_driver is put, waited for and freed
+ */
+static void cpuidle_remove_driver_sysfs(struct cpuidle_device *dev)
+{
+	struct cpuidle_driver_kobj *kdrv = dev->kobj_driver;
+	kobject_put(&kdrv->kobj);
+	wait_for_completion(&kdrv->kobj_unregister);	/* set by release */
+	kfree(kdrv);
+}
+#else	/* !CONFIG_CPU_IDLE_MULTIPLE_DRIVERS: nothing to add or remove */
+static inline int cpuidle_add_driver_sysfs(struct cpuidle_device *dev)
+{
+	return 0;
+}
+
+static inline void cpuidle_remove_driver_sysfs(struct cpuidle_device *dev)
+{
+	;
+}
+#endif /* CONFIG_CPU_IDLE_MULTIPLE_DRIVERS */
+
+/**
+ * cpuidle_add_device_sysfs - add the per-state and driver sysfs entries
+ * @device: the target device
+ */
+int cpuidle_add_device_sysfs(struct cpuidle_device *device)
+{
+	int err = cpuidle_add_state_sysfs(device);
+
+	if (err)
+		return err;
+	err = cpuidle_add_driver_sysfs(device);
+	if (!err)
+		return 0;
+	/* Roll back the state entries when the driver entry fails. */
+	cpuidle_remove_state_sysfs(device);
+	return err;
+}
+
+/**
+ * cpuidle_remove_device_sysfs - removes device specific sysfs attributes
+ * @device: the target device
+ */
+void cpuidle_remove_device_sysfs(struct cpuidle_device *device)
+{
+	cpuidle_remove_driver_sysfs(device);
+	cpuidle_remove_state_sysfs(device);
+}
+
+/**
+ * cpuidle_add_sysfs - creates a sysfs instance for the target device
+ * @dev: the target device
+ */
+int cpuidle_add_sysfs(struct cpuidle_device *dev)
+{
+	struct cpuidle_device_kobj *kdev;
+	struct device *cpu_dev = get_cpu_device((unsigned long)dev->cpu);
+	int error;
+
+	/*
+	 * Return if cpu_device is not setup for this CPU.
+	 *
+	 * This could happen if the arch did not set up cpu_device
+	 * since this CPU is not in cpu_present mask and the
+	 * driver did not send a correct CPU mask during registration.
+	 * Without this check we would end up passing bogus
+	 * value for &cpu_dev->kobj in kobject_init_and_add()
+	 */
+	if (!cpu_dev)
+		return -ENODEV;
+
+	kdev = kzalloc(sizeof(*kdev), GFP_KERNEL);
+	if (!kdev)
+		return -ENOMEM;
+	kdev->dev = dev;
+	init_completion(&kdev->kobj_unregister);
+
+	error = kobject_init_and_add(&kdev->kobj, &ktype_cpuidle, &cpu_dev->kobj,
+				   "cpuidle");
+	if (error) {
+		/* free only after the release callback has run */
+		kobject_put(&kdev->kobj);
+		wait_for_completion(&kdev->kobj_unregister);
+		kfree(kdev);
+		return error;
+	}
+
+	dev->kobj_dev = kdev;
+	kobject_uevent(&kdev->kobj, KOBJ_ADD);
+	return 0;
+}
+
+/**
+ * cpuidle_remove_sysfs - deletes a sysfs instance on the target device
+ * @dev: the target device; its kobj_dev is put, waited for and freed
+ */
+void cpuidle_remove_sysfs(struct cpuidle_device *dev)
+{
+	struct cpuidle_device_kobj *kdev = dev->kobj_dev;
+
+	kobject_put(&kdev->kobj);
+	wait_for_completion(&kdev->kobj_unregister);	/* set by release */
+	kfree(kdev);
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-11 08:27:49 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-11 08:27:49 +0000
commit	ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
tree	b2d64bc10158fdd5497876388cd68142ca374ed3 /drivers/cpuidle
parent	Initial commit. (diff)
download	linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip