diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
commit | ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch) | |
tree | b2d64bc10158fdd5497876388cd68142ca374ed3 /drivers/cpuidle | |
parent | Initial commit. (diff) | |
download | linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip |
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/cpuidle')
43 files changed, 9728 insertions, 0 deletions
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig new file mode 100644 index 0000000000..cac5997dca --- /dev/null +++ b/drivers/cpuidle/Kconfig @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: GPL-2.0-only +menu "CPU Idle" + +config CPU_IDLE + bool "CPU idle PM support" + default y if ACPI || PPC_PSERIES + select CPU_IDLE_GOV_LADDER if (!NO_HZ && !NO_HZ_IDLE) + select CPU_IDLE_GOV_MENU if (NO_HZ || NO_HZ_IDLE) && !CPU_IDLE_GOV_TEO + help + CPU idle is a generic framework for supporting software-controlled + idle processor power management. It includes modular cross-platform + governors that can be swapped during runtime. + + If you're using an ACPI-enabled platform, you should say Y here. + +if CPU_IDLE + +config CPU_IDLE_MULTIPLE_DRIVERS + bool + +config CPU_IDLE_GOV_LADDER + bool "Ladder governor (for periodic timer tick)" + +config CPU_IDLE_GOV_MENU + bool "Menu governor (for tickless system)" + +config CPU_IDLE_GOV_TEO + bool "Timer events oriented (TEO) governor (for tickless systems)" + help + This governor implements a simplified idle state selection method + focused on timer events and does not do any interactivity boosting. + + Some workloads benefit from using it and it generally should be safe + to use. Say Y here if you are not happy with the alternatives. + +config CPU_IDLE_GOV_HALTPOLL + bool "Haltpoll governor (for virtualized systems)" + depends on KVM_GUEST + help + This governor implements haltpoll idle state selection, to be + used in conjunction with the haltpoll cpuidle driver, allowing + for polling for a certain amount of time before entering idle + state. + + Some virtualized workloads benefit from using it. + +config DT_IDLE_STATES + bool + +config DT_IDLE_GENPD + depends on PM_GENERIC_DOMAINS_OF + bool + +menu "ARM CPU Idle Drivers" +depends on ARM || ARM64 +source "drivers/cpuidle/Kconfig.arm" +endmenu + +menu "MIPS CPU Idle Drivers" +depends on MIPS +source "drivers/cpuidle/Kconfig.mips" +endmenu + +menu "POWERPC CPU Idle Drivers" +depends on PPC +source "drivers/cpuidle/Kconfig.powerpc" +endmenu + +menu "RISC-V CPU Idle Drivers" +depends on RISCV +source "drivers/cpuidle/Kconfig.riscv" +endmenu + +config HALTPOLL_CPUIDLE + tristate "Halt poll cpuidle driver" + depends on X86 && KVM_GUEST + select CPU_IDLE_GOV_HALTPOLL + default y + help + This option enables halt poll cpuidle driver, which allows to poll + before halting in the guest (more efficient than polling in the + host via halt_poll_ns for some scenarios). + +endif + +config ARCH_NEEDS_CPU_IDLE_COUPLED + def_bool n +endmenu diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm new file mode 100644 index 0000000000..a1ee475d18 --- /dev/null +++ b/drivers/cpuidle/Kconfig.arm @@ -0,0 +1,132 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# ARM CPU Idle drivers +# +config ARM_CPUIDLE + bool "Generic ARM CPU idle Driver" + depends on ARM + select DT_IDLE_STATES + select CPU_IDLE_MULTIPLE_DRIVERS + help + Select this to enable generic cpuidle driver for ARM. + It provides a generic idle driver whose idle states are configured + at run-time through DT nodes. The CPUidle suspend backend is + initialized by calling the CPU operations init idle hook + provided by architecture code. + +config ARM_PSCI_CPUIDLE + bool "PSCI CPU idle Driver" + depends on ARM_PSCI_FW + select DT_IDLE_STATES + select CPU_IDLE_MULTIPLE_DRIVERS + help + Select this to enable PSCI firmware based CPUidle driver for ARM. + It provides an idle driver that is capable of detecting and + managing idle states through the PSCI firmware interface. + + The driver has limitations when used with PREEMPT_RT: + - If the idle states are described with the non-hierarchical layout, + all idle states are still available. + + - If the idle states are described with the hierarchical layout, + only the idle states defined per CPU are available, but not the ones + being shared among a group of CPUs (aka cluster idle states). + +config ARM_PSCI_CPUIDLE_DOMAIN + bool "PSCI CPU idle Domain" + depends on ARM_PSCI_CPUIDLE + depends on PM_GENERIC_DOMAINS_OF + select DT_IDLE_GENPD + default y + help + Select this to enable the PSCI based CPUidle driver to use PM domains, + which is needed to support the hierarchical DT based layout of the + idle states. + +config ARM_BIG_LITTLE_CPUIDLE + bool "Support for ARM big.LITTLE processors" + depends on ARCH_VEXPRESS_TC2_PM || ARCH_EXYNOS || COMPILE_TEST + depends on MCPM && !ARM64 + select ARM_CPU_SUSPEND + select CPU_IDLE_MULTIPLE_DRIVERS + select DT_IDLE_STATES + help + Select this option to enable CPU idle driver for big.LITTLE based + ARM systems. Driver manages CPUs coordination through MCPM and + define different C-states for little and big cores through the + multiple CPU idle drivers infrastructure. + +config ARM_CLPS711X_CPUIDLE + bool "CPU Idle Driver for CLPS711X processors" + depends on ARCH_CLPS711X && !ARM64 || COMPILE_TEST + help + Select this to enable cpuidle on Cirrus Logic CLPS711X SOCs. + +config ARM_HIGHBANK_CPUIDLE + bool "CPU Idle Driver for Calxeda processors" + depends on ARM_PSCI && !ARM64 + select ARM_CPU_SUSPEND + help + Select this to enable cpuidle on Calxeda processors. + +config ARM_KIRKWOOD_CPUIDLE + bool "CPU Idle Driver for Marvell Kirkwood SoCs" + depends on (MACH_KIRKWOOD || COMPILE_TEST) && !ARM64 + help + This adds the CPU Idle driver for Marvell Kirkwood SoCs. + +config ARM_ZYNQ_CPUIDLE + bool "CPU Idle Driver for Xilinx Zynq processors" + depends on (ARCH_ZYNQ || COMPILE_TEST) && !ARM64 + help + Select this to enable cpuidle on Xilinx Zynq processors. + +config ARM_U8500_CPUIDLE + bool "Cpu Idle Driver for the ST-E u8500 processors" + depends on ARCH_U8500 && !ARM64 + help + Select this to enable cpuidle for ST-E u8500 processors. + +config ARM_AT91_CPUIDLE + bool "Cpu Idle Driver for the AT91 processors" + default y + depends on (ARCH_AT91 || COMPILE_TEST) && !ARM64 + help + Select this to enable cpuidle for AT91 processors. + +config ARM_EXYNOS_CPUIDLE + bool "Cpu Idle Driver for the Exynos processors" + depends on (ARCH_EXYNOS || COMPILE_TEST) && !ARM64 + select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP + help + Select this to enable cpuidle for Exynos processors. + +config ARM_MVEBU_V7_CPUIDLE + bool "CPU Idle Driver for mvebu v7 family processors" + depends on (ARCH_MVEBU || COMPILE_TEST) && !ARM64 + help + Select this to enable cpuidle on Armada 370, 38x and XP processors. + +config ARM_TEGRA_CPUIDLE + bool "CPU Idle Driver for NVIDIA Tegra SoCs" + depends on (ARCH_TEGRA || COMPILE_TEST) && !ARM64 && MMU + depends on ARCH_SUSPEND_POSSIBLE + select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP + select ARM_CPU_SUSPEND + help + Select this to enable cpuidle for NVIDIA Tegra20/30/114/124 SoCs. + +config ARM_QCOM_SPM_CPUIDLE + bool "CPU Idle Driver for Qualcomm Subsystem Power Manager (SPM)" + depends on (ARCH_QCOM || COMPILE_TEST) && !ARM64 && MMU + depends on ARCH_SUSPEND_POSSIBLE + select ARM_CPU_SUSPEND + select CPU_IDLE_MULTIPLE_DRIVERS + select DT_IDLE_STATES + select QCOM_SCM + select QCOM_SPM + help + Select this to enable cpuidle for Qualcomm processors. + The Subsystem Power Manager (SPM) controls low power modes for the + CPU and L2 cores. It interface with various system drivers to put + the cores in low power modes. diff --git a/drivers/cpuidle/Kconfig.mips b/drivers/cpuidle/Kconfig.mips new file mode 100644 index 0000000000..c3c011af4a --- /dev/null +++ b/drivers/cpuidle/Kconfig.mips @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# MIPS CPU Idle Drivers +# +config MIPS_CPS_CPUIDLE + bool "CPU Idle driver for MIPS CPS platforms" + depends on CPU_IDLE && MIPS_CPS + depends on SYS_SUPPORTS_MIPS_CPS + select ARCH_NEEDS_CPU_IDLE_COUPLED if MIPS_MT || CPU_MIPSR6 + select GENERIC_CLOCKEVENTS_BROADCAST if SMP + select MIPS_CPS_PM + default y + help + Select this option to enable processor idle state management + through cpuidle for systems built around the MIPS Coherent + Processing System (CPS) architecture. In order to make use of + the deepest idle states you will need to ensure that you are + also using the CONFIG_MIPS_CPS SMP implementation. diff --git a/drivers/cpuidle/Kconfig.powerpc b/drivers/cpuidle/Kconfig.powerpc new file mode 100644 index 0000000000..a797a02b7b --- /dev/null +++ b/drivers/cpuidle/Kconfig.powerpc @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# POWERPC CPU Idle Drivers +# +config PSERIES_CPUIDLE + bool "Cpuidle driver for pSeries platforms" + depends on CPU_IDLE + depends on PPC_PSERIES + default y + help + Select this option to enable processor idle state management + through cpuidle subsystem. + +config POWERNV_CPUIDLE + bool "Cpuidle driver for powernv platforms" + depends on CPU_IDLE + depends on PPC_POWERNV + default y + help + Select this option to enable processor idle state management + through cpuidle subsystem. diff --git a/drivers/cpuidle/Kconfig.riscv b/drivers/cpuidle/Kconfig.riscv new file mode 100644 index 0000000000..78518c26af --- /dev/null +++ b/drivers/cpuidle/Kconfig.riscv @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# RISC-V CPU Idle drivers +# + +config RISCV_SBI_CPUIDLE + bool "RISC-V SBI CPU idle Driver" + depends on RISCV_SBI + select DT_IDLE_STATES + select CPU_IDLE_MULTIPLE_DRIVERS + select DT_IDLE_GENPD if PM_GENERIC_DOMAINS_OF + help + Select this option to enable RISC-V SBI firmware based CPU idle + driver for RISC-V systems. This drivers also supports hierarchical + DT based layout of the idle state. diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile new file mode 100644 index 0000000000..d103342b7c --- /dev/null +++ b/drivers/cpuidle/Makefile @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for cpuidle. +# + +obj-y += cpuidle.o driver.o governor.o sysfs.o governors/ +obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o +obj-$(CONFIG_DT_IDLE_STATES) += dt_idle_states.o +obj-$(CONFIG_DT_IDLE_GENPD) += dt_idle_genpd.o +obj-$(CONFIG_ARCH_HAS_CPU_RELAX) += poll_state.o +obj-$(CONFIG_HALTPOLL_CPUIDLE) += cpuidle-haltpoll.o + +################################################################################## +# ARM SoC drivers +obj-$(CONFIG_ARM_MVEBU_V7_CPUIDLE) += cpuidle-mvebu-v7.o +obj-$(CONFIG_ARM_BIG_LITTLE_CPUIDLE) += cpuidle-big_little.o +obj-$(CONFIG_ARM_CLPS711X_CPUIDLE) += cpuidle-clps711x.o +obj-$(CONFIG_ARM_HIGHBANK_CPUIDLE) += cpuidle-calxeda.o +obj-$(CONFIG_ARM_KIRKWOOD_CPUIDLE) += cpuidle-kirkwood.o +obj-$(CONFIG_ARM_ZYNQ_CPUIDLE) += cpuidle-zynq.o +obj-$(CONFIG_ARM_U8500_CPUIDLE) += cpuidle-ux500.o +obj-$(CONFIG_ARM_AT91_CPUIDLE) += cpuidle-at91.o +obj-$(CONFIG_ARM_EXYNOS_CPUIDLE) += cpuidle-exynos.o +obj-$(CONFIG_ARM_CPUIDLE) += cpuidle-arm.o +obj-$(CONFIG_ARM_PSCI_CPUIDLE) += cpuidle-psci.o +obj-$(CONFIG_ARM_PSCI_CPUIDLE_DOMAIN) += cpuidle-psci-domain.o +obj-$(CONFIG_ARM_TEGRA_CPUIDLE) += cpuidle-tegra.o +obj-$(CONFIG_ARM_QCOM_SPM_CPUIDLE) += cpuidle-qcom-spm.o + +############################################################################### +# MIPS drivers +obj-$(CONFIG_MIPS_CPS_CPUIDLE) += cpuidle-cps.o + +############################################################################### +# POWERPC drivers +obj-$(CONFIG_PSERIES_CPUIDLE) += cpuidle-pseries.o +obj-$(CONFIG_POWERNV_CPUIDLE) += cpuidle-powernv.o + +############################################################################### +# RISC-V drivers +obj-$(CONFIG_RISCV_SBI_CPUIDLE) += cpuidle-riscv-sbi.o diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c new file mode 100644 index 0000000000..9acde71558 --- /dev/null +++ b/drivers/cpuidle/coupled.c @@ -0,0 +1,791 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * coupled.c - helper functions to enter the same idle state on multiple cpus + * + * Copyright (c) 2011 Google, Inc. + * + * Author: Colin Cross <ccross@android.com> + */ + +#include <linux/kernel.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> + +#include "cpuidle.h" + +/** + * DOC: Coupled cpuidle states + * + * On some ARM SMP SoCs (OMAP4460, Tegra 2, and probably more), the + * cpus cannot be independently powered down, either due to + * sequencing restrictions (on Tegra 2, cpu 0 must be the last to + * power down), or due to HW bugs (on OMAP4460, a cpu powering up + * will corrupt the gic state unless the other cpu runs a work + * around). Each cpu has a power state that it can enter without + * coordinating with the other cpu (usually Wait For Interrupt, or + * WFI), and one or more "coupled" power states that affect blocks + * shared between the cpus (L2 cache, interrupt controller, and + * sometimes the whole SoC). Entering a coupled power state must + * be tightly controlled on both cpus. + * + * This file implements a solution, where each cpu will wait in the + * WFI state until all cpus are ready to enter a coupled state, at + * which point the coupled state function will be called on all + * cpus at approximately the same time. + * + * Once all cpus are ready to enter idle, they are woken by an smp + * cross call. At this point, there is a chance that one of the + * cpus will find work to do, and choose not to enter idle. A + * final pass is needed to guarantee that all cpus will call the + * power state enter function at the same time. During this pass, + * each cpu will increment the ready counter, and continue once the + * ready counter matches the number of online coupled cpus. If any + * cpu exits idle, the other cpus will decrement their counter and + * retry. + * + * requested_state stores the deepest coupled idle state each cpu + * is ready for. It is assumed that the states are indexed from + * shallowest (highest power, lowest exit latency) to deepest + * (lowest power, highest exit latency). The requested_state + * variable is not locked. It is only written from the cpu that + * it stores (or by the on/offlining cpu if that cpu is offline), + * and only read after all the cpus are ready for the coupled idle + * state are no longer updating it. + * + * Three atomic counters are used. alive_count tracks the number + * of cpus in the coupled set that are currently or soon will be + * online. waiting_count tracks the number of cpus that are in + * the waiting loop, in the ready loop, or in the coupled idle state. + * ready_count tracks the number of cpus that are in the ready loop + * or in the coupled idle state. + * + * To use coupled cpuidle states, a cpuidle driver must: + * + * Set struct cpuidle_device.coupled_cpus to the mask of all + * coupled cpus, usually the same as cpu_possible_mask if all cpus + * are part of the same cluster. The coupled_cpus mask must be + * set in the struct cpuidle_device for each cpu. + * + * Set struct cpuidle_device.safe_state to a state that is not a + * coupled state. This is usually WFI. + * + * Set CPUIDLE_FLAG_COUPLED in struct cpuidle_state.flags for each + * state that affects multiple cpus. + * + * Provide a struct cpuidle_state.enter function for each state + * that affects multiple cpus. This function is guaranteed to be + * called on all cpus at approximately the same time. The driver + * should ensure that the cpus all abort together if any cpu tries + * to abort once the function is called. The function should return + * with interrupts still disabled. + */ + +/** + * struct cpuidle_coupled - data for set of cpus that share a coupled idle state + * @coupled_cpus: mask of cpus that are part of the coupled set + * @requested_state: array of requested states for cpus in the coupled set + * @ready_waiting_counts: combined count of cpus in ready or waiting loops + * @abort_barrier: synchronisation point for abort cases + * @online_count: count of cpus that are online + * @refcnt: reference count of cpuidle devices that are using this struct + * @prevent: flag to prevent coupled idle while a cpu is hotplugging + */ +struct cpuidle_coupled { + cpumask_t coupled_cpus; + int requested_state[NR_CPUS]; + atomic_t ready_waiting_counts; + atomic_t abort_barrier; + int online_count; + int refcnt; + int prevent; +}; + +#define WAITING_BITS 16 +#define MAX_WAITING_CPUS (1 << WAITING_BITS) +#define WAITING_MASK (MAX_WAITING_CPUS - 1) +#define READY_MASK (~WAITING_MASK) + +#define CPUIDLE_COUPLED_NOT_IDLE (-1) + +static DEFINE_PER_CPU(call_single_data_t, cpuidle_coupled_poke_cb); + +/* + * The cpuidle_coupled_poke_pending mask is used to avoid calling + * __smp_call_function_single with the per cpu call_single_data_t struct already + * in use. This prevents a deadlock where two cpus are waiting for each others + * call_single_data_t struct to be available + */ +static cpumask_t cpuidle_coupled_poke_pending; + +/* + * The cpuidle_coupled_poked mask is used to ensure that each cpu has been poked + * once to minimize entering the ready loop with a poke pending, which would + * require aborting and retrying. + */ +static cpumask_t cpuidle_coupled_poked; + +/** + * cpuidle_coupled_parallel_barrier - synchronize all online coupled cpus + * @dev: cpuidle_device of the calling cpu + * @a: atomic variable to hold the barrier + * + * No caller to this function will return from this function until all online + * cpus in the same coupled group have called this function. Once any caller + * has returned from this function, the barrier is immediately available for + * reuse. + * + * The atomic variable must be initialized to 0 before any cpu calls + * this function, will be reset to 0 before any cpu returns from this function. + * + * Must only be called from within a coupled idle state handler + * (state.enter when state.flags has CPUIDLE_FLAG_COUPLED set). + * + * Provides full smp barrier semantics before and after calling. + */ +void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a) +{ + int n = dev->coupled->online_count; + + smp_mb__before_atomic(); + atomic_inc(a); + + while (atomic_read(a) < n) + cpu_relax(); + + if (atomic_inc_return(a) == n * 2) { + atomic_set(a, 0); + return; + } + + while (atomic_read(a) > n) + cpu_relax(); +} + +/** + * cpuidle_state_is_coupled - check if a state is part of a coupled set + * @drv: struct cpuidle_driver for the platform + * @state: index of the target state in drv->states + * + * Returns true if the target state is coupled with cpus besides this one + */ +bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state) +{ + return drv->states[state].flags & CPUIDLE_FLAG_COUPLED; +} + +/** + * cpuidle_coupled_state_verify - check if the coupled states are correctly set. + * @drv: struct cpuidle_driver for the platform + * + * Returns 0 for valid state values, a negative error code otherwise: + * * -EINVAL if any coupled state(safe_state_index) is wrongly set. + */ +int cpuidle_coupled_state_verify(struct cpuidle_driver *drv) +{ + int i; + + for (i = drv->state_count - 1; i >= 0; i--) { + if (cpuidle_state_is_coupled(drv, i) && + (drv->safe_state_index == i || + drv->safe_state_index < 0 || + drv->safe_state_index >= drv->state_count)) + return -EINVAL; + } + + return 0; +} + +/** + * cpuidle_coupled_set_ready - mark a cpu as ready + * @coupled: the struct coupled that contains the current cpu + */ +static inline void cpuidle_coupled_set_ready(struct cpuidle_coupled *coupled) +{ + atomic_add(MAX_WAITING_CPUS, &coupled->ready_waiting_counts); +} + +/** + * cpuidle_coupled_set_not_ready - mark a cpu as not ready + * @coupled: the struct coupled that contains the current cpu + * + * Decrements the ready counter, unless the ready (and thus the waiting) counter + * is equal to the number of online cpus. Prevents a race where one cpu + * decrements the waiting counter and then re-increments it just before another + * cpu has decremented its ready counter, leading to the ready counter going + * down from the number of online cpus without going through the coupled idle + * state. + * + * Returns 0 if the counter was decremented successfully, -EINVAL if the ready + * counter was equal to the number of online cpus. + */ +static +inline int cpuidle_coupled_set_not_ready(struct cpuidle_coupled *coupled) +{ + int all; + int ret; + + all = coupled->online_count | (coupled->online_count << WAITING_BITS); + ret = atomic_add_unless(&coupled->ready_waiting_counts, + -MAX_WAITING_CPUS, all); + + return ret ? 0 : -EINVAL; +} + +/** + * cpuidle_coupled_no_cpus_ready - check if no cpus in a coupled set are ready + * @coupled: the struct coupled that contains the current cpu + * + * Returns true if all of the cpus in a coupled set are out of the ready loop. + */ +static inline int cpuidle_coupled_no_cpus_ready(struct cpuidle_coupled *coupled) +{ + int r = atomic_read(&coupled->ready_waiting_counts) >> WAITING_BITS; + return r == 0; +} + +/** + * cpuidle_coupled_cpus_ready - check if all cpus in a coupled set are ready + * @coupled: the struct coupled that contains the current cpu + * + * Returns true if all cpus coupled to this target state are in the ready loop + */ +static inline bool cpuidle_coupled_cpus_ready(struct cpuidle_coupled *coupled) +{ + int r = atomic_read(&coupled->ready_waiting_counts) >> WAITING_BITS; + return r == coupled->online_count; +} + +/** + * cpuidle_coupled_cpus_waiting - check if all cpus in a coupled set are waiting + * @coupled: the struct coupled that contains the current cpu + * + * Returns true if all cpus coupled to this target state are in the wait loop + */ +static inline bool cpuidle_coupled_cpus_waiting(struct cpuidle_coupled *coupled) +{ + int w = atomic_read(&coupled->ready_waiting_counts) & WAITING_MASK; + return w == coupled->online_count; +} + +/** + * cpuidle_coupled_no_cpus_waiting - check if no cpus in coupled set are waiting + * @coupled: the struct coupled that contains the current cpu + * + * Returns true if all of the cpus in a coupled set are out of the waiting loop. + */ +static inline int cpuidle_coupled_no_cpus_waiting(struct cpuidle_coupled *coupled) +{ + int w = atomic_read(&coupled->ready_waiting_counts) & WAITING_MASK; + return w == 0; +} + +/** + * cpuidle_coupled_get_state - determine the deepest idle state + * @dev: struct cpuidle_device for this cpu + * @coupled: the struct coupled that contains the current cpu + * + * Returns the deepest idle state that all coupled cpus can enter + */ +static inline int cpuidle_coupled_get_state(struct cpuidle_device *dev, + struct cpuidle_coupled *coupled) +{ + int i; + int state = INT_MAX; + + /* + * Read barrier ensures that read of requested_state is ordered after + * reads of ready_count. Matches the write barriers + * cpuidle_set_state_waiting. + */ + smp_rmb(); + + for_each_cpu(i, &coupled->coupled_cpus) + if (cpu_online(i) && coupled->requested_state[i] < state) + state = coupled->requested_state[i]; + + return state; +} + +static void cpuidle_coupled_handle_poke(void *info) +{ + int cpu = (unsigned long)info; + cpumask_set_cpu(cpu, &cpuidle_coupled_poked); + cpumask_clear_cpu(cpu, &cpuidle_coupled_poke_pending); +} + +/** + * cpuidle_coupled_poke - wake up a cpu that may be waiting + * @cpu: target cpu + * + * Ensures that the target cpu exits it's waiting idle state (if it is in it) + * and will see updates to waiting_count before it re-enters it's waiting idle + * state. + * + * If cpuidle_coupled_poked_mask is already set for the target cpu, that cpu + * either has or will soon have a pending IPI that will wake it out of idle, + * or it is currently processing the IPI and is not in idle. + */ +static void cpuidle_coupled_poke(int cpu) +{ + call_single_data_t *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu); + + if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poke_pending)) + smp_call_function_single_async(cpu, csd); +} + +/** + * cpuidle_coupled_poke_others - wake up all other cpus that may be waiting + * @this_cpu: target cpu + * @coupled: the struct coupled that contains the current cpu + * + * Calls cpuidle_coupled_poke on all other online cpus. + */ +static void cpuidle_coupled_poke_others(int this_cpu, + struct cpuidle_coupled *coupled) +{ + int cpu; + + for_each_cpu(cpu, &coupled->coupled_cpus) + if (cpu != this_cpu && cpu_online(cpu)) + cpuidle_coupled_poke(cpu); +} + +/** + * cpuidle_coupled_set_waiting - mark this cpu as in the wait loop + * @cpu: target cpu + * @coupled: the struct coupled that contains the current cpu + * @next_state: the index in drv->states of the requested state for this cpu + * + * Updates the requested idle state for the specified cpuidle device. + * Returns the number of waiting cpus. + */ +static int cpuidle_coupled_set_waiting(int cpu, + struct cpuidle_coupled *coupled, int next_state) +{ + coupled->requested_state[cpu] = next_state; + + /* + * The atomic_inc_return provides a write barrier to order the write + * to requested_state with the later write that increments ready_count. + */ + return atomic_inc_return(&coupled->ready_waiting_counts) & WAITING_MASK; +} + +/** + * cpuidle_coupled_set_not_waiting - mark this cpu as leaving the wait loop + * @cpu: target cpu + * @coupled: the struct coupled that contains the current cpu + * + * Removes the requested idle state for the specified cpuidle device. + */ +static void cpuidle_coupled_set_not_waiting(int cpu, + struct cpuidle_coupled *coupled) +{ + /* + * Decrementing waiting count can race with incrementing it in + * cpuidle_coupled_set_waiting, but that's OK. Worst case, some + * cpus will increment ready_count and then spin until they + * notice that this cpu has cleared it's requested_state. + */ + atomic_dec(&coupled->ready_waiting_counts); + + coupled->requested_state[cpu] = CPUIDLE_COUPLED_NOT_IDLE; +} + +/** + * cpuidle_coupled_set_done - mark this cpu as leaving the ready loop + * @cpu: the current cpu + * @coupled: the struct coupled that contains the current cpu + * + * Marks this cpu as no longer in the ready and waiting loops. Decrements + * the waiting count first to prevent another cpu looping back in and seeing + * this cpu as waiting just before it exits idle. + */ +static void cpuidle_coupled_set_done(int cpu, struct cpuidle_coupled *coupled) +{ + cpuidle_coupled_set_not_waiting(cpu, coupled); + atomic_sub(MAX_WAITING_CPUS, &coupled->ready_waiting_counts); +} + +/** + * cpuidle_coupled_clear_pokes - spin until the poke interrupt is processed + * @cpu: this cpu + * + * Turns on interrupts and spins until any outstanding poke interrupts have + * been processed and the poke bit has been cleared. + * + * Other interrupts may also be processed while interrupts are enabled, so + * need_resched() must be tested after this function returns to make sure + * the interrupt didn't schedule work that should take the cpu out of idle. + * + * Returns 0 if no poke was pending, 1 if a poke was cleared. + */ +static int cpuidle_coupled_clear_pokes(int cpu) +{ + if (!cpumask_test_cpu(cpu, &cpuidle_coupled_poke_pending)) + return 0; + + local_irq_enable(); + while (cpumask_test_cpu(cpu, &cpuidle_coupled_poke_pending)) + cpu_relax(); + local_irq_disable(); + + return 1; +} + +static bool cpuidle_coupled_any_pokes_pending(struct cpuidle_coupled *coupled) +{ + cpumask_t cpus; + int ret; + + cpumask_and(&cpus, cpu_online_mask, &coupled->coupled_cpus); + ret = cpumask_and(&cpus, &cpuidle_coupled_poke_pending, &cpus); + + return ret; +} + +/** + * cpuidle_enter_state_coupled - attempt to enter a state with coupled cpus + * @dev: struct cpuidle_device for the current cpu + * @drv: struct cpuidle_driver for the platform + * @next_state: index of the requested state in drv->states + * + * Coordinate with coupled cpus to enter the target state. This is a two + * stage process. In the first stage, the cpus are operating independently, + * and may call into cpuidle_enter_state_coupled at completely different times. + * To save as much power as possible, the first cpus to call this function will + * go to an intermediate state (the cpuidle_device's safe state), and wait for + * all the other cpus to call this function. Once all coupled cpus are idle, + * the second stage will start. Each coupled cpu will spin until all cpus have + * guaranteed that they will call the target_state. + * + * This function must be called with interrupts disabled. It may enable + * interrupts while preparing for idle, and it will always return with + * interrupts enabled. + */ +int cpuidle_enter_state_coupled(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int next_state) +{ + int entered_state = -1; + struct cpuidle_coupled *coupled = dev->coupled; + int w; + + if (!coupled) + return -EINVAL; + + while (coupled->prevent) { + cpuidle_coupled_clear_pokes(dev->cpu); + if (need_resched()) { + local_irq_enable(); + return entered_state; + } + entered_state = cpuidle_enter_state(dev, drv, + drv->safe_state_index); + local_irq_disable(); + } + + /* Read barrier ensures online_count is read after prevent is cleared */ + smp_rmb(); + +reset: + cpumask_clear_cpu(dev->cpu, &cpuidle_coupled_poked); + + w = cpuidle_coupled_set_waiting(dev->cpu, coupled, next_state); + /* + * If this is the last cpu to enter the waiting state, poke + * all the other cpus out of their waiting state so they can + * enter a deeper state. This can race with one of the cpus + * exiting the waiting state due to an interrupt and + * decrementing waiting_count, see comment below. + */ + if (w == coupled->online_count) { + cpumask_set_cpu(dev->cpu, &cpuidle_coupled_poked); + cpuidle_coupled_poke_others(dev->cpu, coupled); + } + +retry: + /* + * Wait for all coupled cpus to be idle, using the deepest state + * allowed for a single cpu. If this was not the poking cpu, wait + * for at least one poke before leaving to avoid a race where + * two cpus could arrive at the waiting loop at the same time, + * but the first of the two to arrive could skip the loop without + * processing the pokes from the last to arrive. + */ + while (!cpuidle_coupled_cpus_waiting(coupled) || + !cpumask_test_cpu(dev->cpu, &cpuidle_coupled_poked)) { + if (cpuidle_coupled_clear_pokes(dev->cpu)) + continue; + + if (need_resched()) { + cpuidle_coupled_set_not_waiting(dev->cpu, coupled); + goto out; + } + + if (coupled->prevent) { + cpuidle_coupled_set_not_waiting(dev->cpu, coupled); + goto out; + } + + entered_state = cpuidle_enter_state(dev, drv, + drv->safe_state_index); + local_irq_disable(); + } + + cpuidle_coupled_clear_pokes(dev->cpu); + if (need_resched()) { + cpuidle_coupled_set_not_waiting(dev->cpu, coupled); + goto out; + } + + /* + * Make sure final poke status for this cpu is visible before setting + * cpu as ready. + */ + smp_wmb(); + + /* + * All coupled cpus are probably idle. There is a small chance that + * one of the other cpus just became active. Increment the ready count, + * and spin until all coupled cpus have incremented the counter. Once a + * cpu has incremented the ready counter, it cannot abort idle and must + * spin until either all cpus have incremented the ready counter, or + * another cpu leaves idle and decrements the waiting counter. + */ + + cpuidle_coupled_set_ready(coupled); + while (!cpuidle_coupled_cpus_ready(coupled)) { + /* Check if any other cpus bailed out of idle. */ + if (!cpuidle_coupled_cpus_waiting(coupled)) + if (!cpuidle_coupled_set_not_ready(coupled)) + goto retry; + + cpu_relax(); + } + + /* + * Make sure read of all cpus ready is done before reading pending pokes + */ + smp_rmb(); + + /* + * There is a small chance that a cpu left and reentered idle after this + * cpu saw that all cpus were waiting. The cpu that reentered idle will + * have sent this cpu a poke, which will still be pending after the + * ready loop. The pending interrupt may be lost by the interrupt + * controller when entering the deep idle state. It's not possible to + * clear a pending interrupt without turning interrupts on and handling + * it, and it's too late to turn on interrupts here, so reset the + * coupled idle state of all cpus and retry. + */ + if (cpuidle_coupled_any_pokes_pending(coupled)) { + cpuidle_coupled_set_done(dev->cpu, coupled); + /* Wait for all cpus to see the pending pokes */ + cpuidle_coupled_parallel_barrier(dev, &coupled->abort_barrier); + goto reset; + } + + /* all cpus have acked the coupled state */ + next_state = cpuidle_coupled_get_state(dev, coupled); + + entered_state = cpuidle_enter_state(dev, drv, next_state); + + cpuidle_coupled_set_done(dev->cpu, coupled); + +out: + /* + * Normal cpuidle states are expected to return with irqs enabled. + * That leads to an inefficiency where a cpu receiving an interrupt + * that brings it out of idle will process that interrupt before + * exiting the idle enter function and decrementing ready_count. All + * other cpus will need to spin waiting for the cpu that is processing + * the interrupt. If the driver returns with interrupts disabled, + * all other cpus will loop back into the safe idle state instead of + * spinning, saving power. + * + * Calling local_irq_enable here allows coupled states to return with + * interrupts disabled, but won't cause problems for drivers that + * exit with interrupts enabled. + */ + local_irq_enable(); + + /* + * Wait until all coupled cpus have exited idle. There is no risk that + * a cpu exits and re-enters the ready state because this cpu has + * already decremented its waiting_count. + */ + while (!cpuidle_coupled_no_cpus_ready(coupled)) + cpu_relax(); + + return entered_state; +} + +static void cpuidle_coupled_update_online_cpus(struct cpuidle_coupled *coupled) +{ + cpumask_t cpus; + cpumask_and(&cpus, cpu_online_mask, &coupled->coupled_cpus); + coupled->online_count = cpumask_weight(&cpus); +} + +/** + * cpuidle_coupled_register_device - register a coupled cpuidle device + * @dev: struct cpuidle_device for the current cpu + * + * Called from cpuidle_register_device to handle coupled idle init. Finds the + * cpuidle_coupled struct for this set of coupled cpus, or creates one if none + * exists yet. + */ +int cpuidle_coupled_register_device(struct cpuidle_device *dev) +{ + int cpu; + struct cpuidle_device *other_dev; + call_single_data_t *csd; + struct cpuidle_coupled *coupled; + + if (cpumask_empty(&dev->coupled_cpus)) + return 0; + + for_each_cpu(cpu, &dev->coupled_cpus) { + other_dev = per_cpu(cpuidle_devices, cpu); + if (other_dev && other_dev->coupled) { + coupled = other_dev->coupled; + goto have_coupled; + } + } + + /* No existing coupled info found, create a new one */ + coupled = kzalloc(sizeof(struct cpuidle_coupled), GFP_KERNEL); + if (!coupled) + return -ENOMEM; + + coupled->coupled_cpus = dev->coupled_cpus; + +have_coupled: + dev->coupled = coupled; + if (WARN_ON(!cpumask_equal(&dev->coupled_cpus, &coupled->coupled_cpus))) + coupled->prevent++; + + cpuidle_coupled_update_online_cpus(coupled); + + coupled->refcnt++; + + csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu); + INIT_CSD(csd, cpuidle_coupled_handle_poke, (void *)(unsigned long)dev->cpu); + + return 0; +} + +/** + * cpuidle_coupled_unregister_device - unregister a coupled cpuidle device + * @dev: struct cpuidle_device for the current cpu + * + * Called from cpuidle_unregister_device to tear down coupled idle. Removes the + * cpu from the coupled idle set, and frees the cpuidle_coupled_info struct if + * this was the last cpu in the set. + */ +void cpuidle_coupled_unregister_device(struct cpuidle_device *dev) +{ + struct cpuidle_coupled *coupled = dev->coupled; + + if (cpumask_empty(&dev->coupled_cpus)) + return; + + if (--coupled->refcnt) + kfree(coupled); + dev->coupled = NULL; +} + +/** + * cpuidle_coupled_prevent_idle - prevent cpus from entering a coupled state + * @coupled: the struct coupled that contains the cpu that is changing state + * + * Disables coupled cpuidle on a coupled set of cpus. Used to ensure that + * cpu_online_mask doesn't change while cpus are coordinating coupled idle. + */ +static void cpuidle_coupled_prevent_idle(struct cpuidle_coupled *coupled) +{ + int cpu = get_cpu(); + + /* Force all cpus out of the waiting loop. */ + coupled->prevent++; + cpuidle_coupled_poke_others(cpu, coupled); + put_cpu(); + while (!cpuidle_coupled_no_cpus_waiting(coupled)) + cpu_relax(); +} + +/** + * cpuidle_coupled_allow_idle - allows cpus to enter a coupled state + * @coupled: the struct coupled that contains the cpu that is changing state + * + * Enables coupled cpuidle on a coupled set of cpus. Used to ensure that + * cpu_online_mask doesn't change while cpus are coordinating coupled idle. + */ +static void cpuidle_coupled_allow_idle(struct cpuidle_coupled *coupled) +{ + int cpu = get_cpu(); + + /* + * Write barrier ensures readers see the new online_count when they + * see prevent == 0. + */ + smp_wmb(); + coupled->prevent--; + /* Force cpus out of the prevent loop. */ + cpuidle_coupled_poke_others(cpu, coupled); + put_cpu(); +} + +static int coupled_cpu_online(unsigned int cpu) +{ + struct cpuidle_device *dev; + + mutex_lock(&cpuidle_lock); + + dev = per_cpu(cpuidle_devices, cpu); + if (dev && dev->coupled) { + cpuidle_coupled_update_online_cpus(dev->coupled); + cpuidle_coupled_allow_idle(dev->coupled); + } + + mutex_unlock(&cpuidle_lock); + return 0; +} + +static int coupled_cpu_up_prepare(unsigned int cpu) +{ + struct cpuidle_device *dev; + + mutex_lock(&cpuidle_lock); + + dev = per_cpu(cpuidle_devices, cpu); + if (dev && dev->coupled) + cpuidle_coupled_prevent_idle(dev->coupled); + + mutex_unlock(&cpuidle_lock); + return 0; +} + +static int __init cpuidle_coupled_init(void) +{ + int ret; + + ret = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_COUPLED_PREPARE, + "cpuidle/coupled:prepare", + coupled_cpu_up_prepare, + coupled_cpu_online); + if (ret) + return ret; + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "cpuidle/coupled:online", + coupled_cpu_online, + coupled_cpu_up_prepare); + if (ret < 0) + cpuhp_remove_state_nocalls(CPUHP_CPUIDLE_COUPLED_PREPARE); + return ret; +} +core_initcall(cpuidle_coupled_init); diff --git a/drivers/cpuidle/cpuidle-arm.c b/drivers/cpuidle/cpuidle-arm.c new file mode 100644 index 0000000000..7cfb980a35 --- /dev/null +++ b/drivers/cpuidle/cpuidle-arm.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * ARM/ARM64 generic CPU idle driver. + * + * Copyright (C) 2014 ARM Ltd. + * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> + */ + +#define pr_fmt(fmt) "CPUidle arm: " fmt + +#include <linux/cpu_cooling.h> +#include <linux/cpuidle.h> +#include <linux/cpumask.h> +#include <linux/cpu_pm.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/slab.h> + +#include <asm/cpuidle.h> + +#include "dt_idle_states.h" + +/* + * arm_enter_idle_state - Programs CPU to enter the specified state + * + * dev: cpuidle device + * drv: cpuidle driver + * idx: state index + * + * Called from the CPUidle framework to program the device to the + * specified target state selected by the governor. + */ +static __cpuidle int arm_enter_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx) +{ + /* + * Pass idle state index to arm_cpuidle_suspend which in turn + * will call the CPU ops suspend protocol with idle index as a + * parameter. + */ + return CPU_PM_CPU_IDLE_ENTER(arm_cpuidle_suspend, idx); +} + +static struct cpuidle_driver arm_idle_driver __initdata = { + .name = "arm_idle", + .owner = THIS_MODULE, + /* + * State at index 0 is standby wfi and considered standard + * on all ARM platforms. If in some platforms simple wfi + * can't be used as "state 0", DT bindings must be implemented + * to work around this issue and allow installing a special + * handler for idle state index 0. + */ + .states[0] = { + .enter = arm_enter_idle_state, + .exit_latency = 1, + .target_residency = 1, + .power_usage = UINT_MAX, + .name = "WFI", + .desc = "ARM WFI", + } +}; + +static const struct of_device_id arm_idle_state_match[] __initconst = { + { .compatible = "arm,idle-state", + .data = arm_enter_idle_state }, + { }, +}; + +/* + * arm_idle_init_cpu + * + * Registers the arm specific cpuidle driver with the cpuidle + * framework. It relies on core code to parse the idle states + * and initialize them using driver data structures accordingly. + */ +static int __init arm_idle_init_cpu(int cpu) +{ + int ret; + struct cpuidle_driver *drv; + + drv = kmemdup(&arm_idle_driver, sizeof(*drv), GFP_KERNEL); + if (!drv) + return -ENOMEM; + + drv->cpumask = (struct cpumask *)cpumask_of(cpu); + + /* + * Initialize idle states data, starting at index 1. This + * driver is DT only, if no DT idle states are detected (ret + * == 0) let the driver initialization fail accordingly since + * there is no reason to initialize the idle driver if only + * wfi is supported. + */ + ret = dt_init_idle_driver(drv, arm_idle_state_match, 1); + if (ret <= 0) { + ret = ret ? : -ENODEV; + goto out_kfree_drv; + } + + /* + * Call arch CPU operations in order to initialize + * idle states suspend back-end specific data + */ + ret = arm_cpuidle_init(cpu); + + /* + * Allow the initialization to continue for other CPUs, if the + * reported failure is a HW misconfiguration/breakage (-ENXIO). + * + * Some platforms do not support idle operations + * (arm_cpuidle_init() returning -EOPNOTSUPP), we should + * not flag this case as an error, it is a valid + * configuration. + */ + if (ret) { + if (ret != -EOPNOTSUPP) + pr_err("CPU %d failed to init idle CPU ops\n", cpu); + ret = ret == -ENXIO ? 0 : ret; + goto out_kfree_drv; + } + + ret = cpuidle_register(drv, NULL); + if (ret) + goto out_kfree_drv; + + cpuidle_cooling_register(drv); + + return 0; + +out_kfree_drv: + kfree(drv); + return ret; +} + +/* + * arm_idle_init - Initializes arm cpuidle driver + * + * Initializes arm cpuidle driver for all CPUs, if any CPU fails + * to register cpuidle driver then rollback to cancel all CPUs + * registeration. + */ +static int __init arm_idle_init(void) +{ + int cpu, ret; + struct cpuidle_driver *drv; + struct cpuidle_device *dev; + + for_each_possible_cpu(cpu) { + ret = arm_idle_init_cpu(cpu); + if (ret) + goto out_fail; + } + + return 0; + +out_fail: + while (--cpu >= 0) { + dev = per_cpu(cpuidle_devices, cpu); + drv = cpuidle_get_cpu_driver(dev); + cpuidle_unregister(drv); + kfree(drv); + } + + return ret; +} +device_initcall(arm_idle_init); diff --git a/drivers/cpuidle/cpuidle-at91.c b/drivers/cpuidle/cpuidle-at91.c new file mode 100644 index 0000000000..45ee8e1e71 --- /dev/null +++ b/drivers/cpuidle/cpuidle-at91.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * based on arch/arm/mach-kirkwood/cpuidle.c + * + * CPU idle support for AT91 SoC + * + * The cpu idle uses wait-for-interrupt and RAM self refresh in order + * to implement two idle states - + * #1 wait-for-interrupt + * #2 wait-for-interrupt and RAM self refresh + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/cpuidle.h> +#include <linux/io.h> +#include <linux/export.h> +#include <asm/cpuidle.h> + +#define AT91_MAX_STATES 2 + +static void (*at91_standby)(void); + +/* Actual code that puts the SoC in different idle states */ +static int at91_enter_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + at91_standby(); + return index; +} + +static struct cpuidle_driver at91_idle_driver = { + .name = "at91_idle", + .owner = THIS_MODULE, + .states[0] = ARM_CPUIDLE_WFI_STATE, + .states[1] = { + .enter = at91_enter_idle, + .exit_latency = 10, + .target_residency = 10000, + .name = "RAM_SR", + .desc = "WFI and DDR Self Refresh", + }, + .state_count = AT91_MAX_STATES, +}; + +/* Initialize CPU idle by registering the idle states */ +static int at91_cpuidle_probe(struct platform_device *dev) +{ + at91_standby = (void *)(dev->dev.platform_data); + + return cpuidle_register(&at91_idle_driver, NULL); +} + +static struct platform_driver at91_cpuidle_driver = { + .driver = { + .name = "cpuidle-at91", + }, + .probe = at91_cpuidle_probe, +}; +builtin_platform_driver(at91_cpuidle_driver); diff --git a/drivers/cpuidle/cpuidle-big_little.c b/drivers/cpuidle/cpuidle-big_little.c new file mode 100644 index 0000000000..74972deda0 --- /dev/null +++ b/drivers/cpuidle/cpuidle-big_little.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2013 ARM/Linaro + * + * Authors: Daniel Lezcano <daniel.lezcano@linaro.org> + * Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> + * Nicolas Pitre <nicolas.pitre@linaro.org> + * + * Maintainer: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> + * Maintainer: Daniel Lezcano <daniel.lezcano@linaro.org> + */ +#include <linux/cpuidle.h> +#include <linux/cpu_pm.h> +#include <linux/slab.h> +#include <linux/of.h> + +#include <asm/cpu.h> +#include <asm/cputype.h> +#include <asm/cpuidle.h> +#include <asm/mcpm.h> +#include <asm/smp_plat.h> +#include <asm/suspend.h> + +#include "dt_idle_states.h" + +static int bl_enter_powerdown(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx); + +/* + * NB: Owing to current menu governor behaviour big and LITTLE + * index 1 states have to define exit_latency and target_residency for + * cluster state since, when all CPUs in a cluster hit it, the cluster + * can be shutdown. This means that when a single CPU enters this state + * the exit_latency and target_residency values are somewhat overkill. + * There is no notion of cluster states in the menu governor, so CPUs + * have to define CPU states where possibly the cluster will be shutdown + * depending on the state of other CPUs. idle states entry and exit happen + * at random times; however the cluster state provides target_residency + * values as if all CPUs in a cluster enter the state at once; this is + * somewhat optimistic and behaviour should be fixed either in the governor + * or in the MCPM back-ends. + * To make this driver 100% generic the number of states and the exit_latency + * target_residency values must be obtained from device tree bindings. + * + * exit_latency: refers to the TC2 vexpress test chip and depends on the + * current cluster operating point. It is the time it takes to get the CPU + * up and running when the CPU is powered up on cluster wake-up from shutdown. + * Current values for big and LITTLE clusters are provided for clusters + * running at default operating points. + * + * target_residency: it is the minimum amount of time the cluster has + * to be down to break even in terms of power consumption. cluster + * shutdown has inherent dynamic power costs (L2 writebacks to DRAM + * being the main factor) that depend on the current operating points. + * The current values for both clusters are provided for a CPU whose half + * of L2 lines are dirty and require cleaning to DRAM, and takes into + * account leakage static power values related to the vexpress TC2 testchip. + */ +static struct cpuidle_driver bl_idle_little_driver = { + .name = "little_idle", + .owner = THIS_MODULE, + .states[0] = ARM_CPUIDLE_WFI_STATE, + .states[1] = { + .enter = bl_enter_powerdown, + .exit_latency = 700, + .target_residency = 2500, + .flags = CPUIDLE_FLAG_TIMER_STOP | + CPUIDLE_FLAG_RCU_IDLE, + .name = "C1", + .desc = "ARM little-cluster power down", + }, + .state_count = 2, +}; + +static const struct of_device_id bl_idle_state_match[] __initconst = { + { .compatible = "arm,idle-state", + .data = bl_enter_powerdown }, + { }, +}; + +static struct cpuidle_driver bl_idle_big_driver = { + .name = "big_idle", + .owner = THIS_MODULE, + .states[0] = ARM_CPUIDLE_WFI_STATE, + .states[1] = { + .enter = bl_enter_powerdown, + .exit_latency = 500, + .target_residency = 2000, + .flags = CPUIDLE_FLAG_TIMER_STOP | + CPUIDLE_FLAG_RCU_IDLE, + .name = "C1", + .desc = "ARM big-cluster power down", + }, + .state_count = 2, +}; + +/* + * notrace prevents trace shims from getting inserted where they + * should not. Global jumps and ldrex/strex must not be inserted + * in power down sequences where caches and MMU may be turned off. + */ +static int notrace bl_powerdown_finisher(unsigned long arg) +{ + /* MCPM works with HW CPU identifiers */ + unsigned int mpidr = read_cpuid_mpidr(); + unsigned int cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1); + unsigned int cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0); + + mcpm_set_entry_vector(cpu, cluster, cpu_resume); + mcpm_cpu_suspend(); + + /* return value != 0 means failure */ + return 1; +} + +/** + * bl_enter_powerdown - Programs CPU to enter the specified state + * @dev: cpuidle device + * @drv: The target state to be programmed + * @idx: state index + * + * Called from the CPUidle framework to program the device to the + * specified target state selected by the governor. + */ +static __cpuidle int bl_enter_powerdown(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx) +{ + cpu_pm_enter(); + ct_cpuidle_enter(); + + cpu_suspend(0, bl_powerdown_finisher); + + /* signals the MCPM core that CPU is out of low power state */ + mcpm_cpu_powered_up(); + ct_cpuidle_exit(); + + cpu_pm_exit(); + + return idx; +} + +static int __init bl_idle_driver_init(struct cpuidle_driver *drv, int part_id) +{ + struct cpumask *cpumask; + int cpu; + + cpumask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!cpumask) + return -ENOMEM; + + for_each_possible_cpu(cpu) + if (smp_cpuid_part(cpu) == part_id) + cpumask_set_cpu(cpu, cpumask); + + drv->cpumask = cpumask; + + return 0; +} + +static const struct of_device_id compatible_machine_match[] = { + { .compatible = "arm,vexpress,v2p-ca15_a7" }, + { .compatible = "google,peach" }, + {}, +}; + +static int __init bl_idle_init(void) +{ + int ret; + struct device_node *root = of_find_node_by_path("/"); + const struct of_device_id *match_id; + + if (!root) + return -ENODEV; + + /* + * Initialize the driver just for a compliant set of machines + */ + match_id = of_match_node(compatible_machine_match, root); + + of_node_put(root); + + if (!match_id) + return -ENODEV; + + if (!mcpm_is_available()) + return -EUNATCH; + + /* + * For now the differentiation between little and big cores + * is based on the part number. A7 cores are considered little + * cores, A15 are considered big cores. This distinction may + * evolve in the future with a more generic matching approach. + */ + ret = bl_idle_driver_init(&bl_idle_little_driver, + ARM_CPU_PART_CORTEX_A7); + if (ret) + return ret; + + ret = bl_idle_driver_init(&bl_idle_big_driver, ARM_CPU_PART_CORTEX_A15); + if (ret) + goto out_uninit_little; + + /* Start at index 1, index 0 standard WFI */ + ret = dt_init_idle_driver(&bl_idle_big_driver, bl_idle_state_match, 1); + if (ret < 0) + goto out_uninit_big; + + /* Start at index 1, index 0 standard WFI */ + ret = dt_init_idle_driver(&bl_idle_little_driver, + bl_idle_state_match, 1); + if (ret < 0) + goto out_uninit_big; + + ret = cpuidle_register(&bl_idle_little_driver, NULL); + if (ret) + goto out_uninit_big; + + ret = cpuidle_register(&bl_idle_big_driver, NULL); + if (ret) + goto out_unregister_little; + + return 0; + +out_unregister_little: + cpuidle_unregister(&bl_idle_little_driver); +out_uninit_big: + kfree(bl_idle_big_driver.cpumask); +out_uninit_little: + kfree(bl_idle_little_driver.cpumask); + + return ret; +} +device_initcall(bl_idle_init); diff --git a/drivers/cpuidle/cpuidle-calxeda.c b/drivers/cpuidle/cpuidle-calxeda.c new file mode 100644 index 0000000000..b17d9a8418 --- /dev/null +++ b/drivers/cpuidle/cpuidle-calxeda.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2012 Calxeda, Inc. + * + * Based on arch/arm/plat-mxc/cpuidle.c: #v3.7 + * Copyright 2012 Freescale Semiconductor, Inc. + * Copyright 2012 Linaro Ltd. + * + * Maintainer: Rob Herring <rob.herring@calxeda.com> + */ + +#include <linux/cpuidle.h> +#include <linux/cpu_pm.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/platform_device.h> +#include <linux/psci.h> + +#include <asm/cpuidle.h> +#include <asm/suspend.h> + +#include <uapi/linux/psci.h> + +#define CALXEDA_IDLE_PARAM \ + ((0 << PSCI_0_2_POWER_STATE_ID_SHIFT) | \ + (0 << PSCI_0_2_POWER_STATE_AFFL_SHIFT) | \ + (PSCI_POWER_STATE_TYPE_POWER_DOWN << PSCI_0_2_POWER_STATE_TYPE_SHIFT)) + +static int calxeda_idle_finish(unsigned long val) +{ + return psci_ops.cpu_suspend(CALXEDA_IDLE_PARAM, __pa(cpu_resume)); +} + +static int calxeda_pwrdown_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + cpu_pm_enter(); + cpu_suspend(0, calxeda_idle_finish); + cpu_pm_exit(); + + return index; +} + +static struct cpuidle_driver calxeda_idle_driver = { + .name = "calxeda_idle", + .states = { + ARM_CPUIDLE_WFI_STATE, + { + .name = "PG", + .desc = "Power Gate", + .exit_latency = 30, + .power_usage = 50, + .target_residency = 200, + .enter = calxeda_pwrdown_idle, + }, + }, + .state_count = 2, +}; + +static int calxeda_cpuidle_probe(struct platform_device *pdev) +{ + return cpuidle_register(&calxeda_idle_driver, NULL); +} + +static struct platform_driver calxeda_cpuidle_plat_driver = { + .driver = { + .name = "cpuidle-calxeda", + }, + .probe = calxeda_cpuidle_probe, +}; +builtin_platform_driver(calxeda_cpuidle_plat_driver); diff --git a/drivers/cpuidle/cpuidle-clps711x.c b/drivers/cpuidle/cpuidle-clps711x.c new file mode 100644 index 0000000000..fc22c59b6c --- /dev/null +++ b/drivers/cpuidle/cpuidle-clps711x.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * CLPS711X CPU idle driver + * + * Copyright (C) 2014 Alexander Shiyan <shc_work@mail.ru> + */ + +#include <linux/cpuidle.h> +#include <linux/err.h> +#include <linux/io.h> +#include <linux/init.h> +#include <linux/platform_device.h> + +#define CLPS711X_CPUIDLE_NAME "clps711x-cpuidle" + +static void __iomem *clps711x_halt; + +static int clps711x_cpuidle_halt(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + writel(0xaa, clps711x_halt); + + return index; +} + +static struct cpuidle_driver clps711x_idle_driver = { + .name = CLPS711X_CPUIDLE_NAME, + .owner = THIS_MODULE, + .states[0] = { + .name = "HALT", + .desc = "CLPS711X HALT", + .enter = clps711x_cpuidle_halt, + .exit_latency = 1, + }, + .state_count = 1, +}; + +static int __init clps711x_cpuidle_probe(struct platform_device *pdev) +{ + clps711x_halt = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(clps711x_halt)) + return PTR_ERR(clps711x_halt); + + return cpuidle_register(&clps711x_idle_driver, NULL); +} + +static struct platform_driver clps711x_cpuidle_driver = { + .driver = { + .name = CLPS711X_CPUIDLE_NAME, + }, +}; +builtin_platform_driver_probe(clps711x_cpuidle_driver, clps711x_cpuidle_probe); diff --git a/drivers/cpuidle/cpuidle-cps.c b/drivers/cpuidle/cpuidle-cps.c new file mode 100644 index 0000000000..dff0ff4cc2 --- /dev/null +++ b/drivers/cpuidle/cpuidle-cps.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2014 Imagination Technologies + * Author: Paul Burton <paul.burton@mips.com> + */ + +#include <linux/cpu_pm.h> +#include <linux/cpuidle.h> +#include <linux/init.h> + +#include <asm/idle.h> +#include <asm/pm-cps.h> + +/* Enumeration of the various idle states this driver may enter */ +enum cps_idle_state { + STATE_WAIT = 0, /* MIPS wait instruction, coherent */ + STATE_NC_WAIT, /* MIPS wait instruction, non-coherent */ + STATE_CLOCK_GATED, /* Core clock gated */ + STATE_POWER_GATED, /* Core power gated */ + STATE_COUNT +}; + +static int cps_nc_enter(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + enum cps_pm_state pm_state; + int err; + + /* + * At least one core must remain powered up & clocked in order for the + * system to have any hope of functioning. + * + * TODO: don't treat core 0 specially, just prevent the final core + * TODO: remap interrupt affinity temporarily + */ + if (cpus_are_siblings(0, dev->cpu) && (index > STATE_NC_WAIT)) + index = STATE_NC_WAIT; + + /* Select the appropriate cps_pm_state */ + switch (index) { + case STATE_NC_WAIT: + pm_state = CPS_PM_NC_WAIT; + break; + case STATE_CLOCK_GATED: + pm_state = CPS_PM_CLOCK_GATED; + break; + case STATE_POWER_GATED: + pm_state = CPS_PM_POWER_GATED; + break; + default: + BUG(); + return -EINVAL; + } + + /* Notify listeners the CPU is about to power down */ + if ((pm_state == CPS_PM_POWER_GATED) && cpu_pm_enter()) + return -EINTR; + + /* Enter that state */ + err = cps_pm_enter_state(pm_state); + + /* Notify listeners the CPU is back up */ + if (pm_state == CPS_PM_POWER_GATED) + cpu_pm_exit(); + + return err ?: index; +} + +static struct cpuidle_driver cps_driver = { + .name = "cpc_cpuidle", + .owner = THIS_MODULE, + .states = { + [STATE_WAIT] = MIPS_CPUIDLE_WAIT_STATE, + [STATE_NC_WAIT] = { + .enter = cps_nc_enter, + .exit_latency = 200, + .target_residency = 450, + .name = "nc-wait", + .desc = "non-coherent MIPS wait", + }, + [STATE_CLOCK_GATED] = { + .enter = cps_nc_enter, + .exit_latency = 300, + .target_residency = 700, + .flags = CPUIDLE_FLAG_TIMER_STOP, + .name = "clock-gated", + .desc = "core clock gated", + }, + [STATE_POWER_GATED] = { + .enter = cps_nc_enter, + .exit_latency = 600, + .target_residency = 1000, + .flags = CPUIDLE_FLAG_TIMER_STOP, + .name = "power-gated", + .desc = "core power gated", + }, + }, + .state_count = STATE_COUNT, + .safe_state_index = 0, +}; + +static void __init cps_cpuidle_unregister(void) +{ + int cpu; + struct cpuidle_device *device; + + for_each_possible_cpu(cpu) { + device = &per_cpu(cpuidle_dev, cpu); + cpuidle_unregister_device(device); + } + + cpuidle_unregister_driver(&cps_driver); +} + +static int __init cps_cpuidle_init(void) +{ + int err, cpu, i; + struct cpuidle_device *device; + + /* Detect supported states */ + if (!cps_pm_support_state(CPS_PM_POWER_GATED)) + cps_driver.state_count = STATE_CLOCK_GATED + 1; + if (!cps_pm_support_state(CPS_PM_CLOCK_GATED)) + cps_driver.state_count = STATE_NC_WAIT + 1; + if (!cps_pm_support_state(CPS_PM_NC_WAIT)) + cps_driver.state_count = STATE_WAIT + 1; + + /* Inform the user if some states are unavailable */ + if (cps_driver.state_count < STATE_COUNT) { + pr_info("cpuidle-cps: limited to "); + switch (cps_driver.state_count - 1) { + case STATE_WAIT: + pr_cont("coherent wait\n"); + break; + case STATE_NC_WAIT: + pr_cont("non-coherent wait\n"); + break; + case STATE_CLOCK_GATED: + pr_cont("clock gating\n"); + break; + } + } + + /* + * Set the coupled flag on the appropriate states if this system + * requires it. + */ + if (coupled_coherence) + for (i = STATE_NC_WAIT; i < cps_driver.state_count; i++) + cps_driver.states[i].flags |= CPUIDLE_FLAG_COUPLED; + + err = cpuidle_register_driver(&cps_driver); + if (err) { + pr_err("Failed to register CPS cpuidle driver\n"); + return err; + } + + for_each_possible_cpu(cpu) { + device = &per_cpu(cpuidle_dev, cpu); + device->cpu = cpu; +#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED + cpumask_copy(&device->coupled_cpus, &cpu_sibling_map[cpu]); +#endif + + err = cpuidle_register_device(device); + if (err) { + pr_err("Failed to register CPU%d cpuidle device\n", + cpu); + goto err_out; + } + } + + return 0; +err_out: + cps_cpuidle_unregister(); + return err; +} +device_initcall(cps_cpuidle_init); diff --git a/drivers/cpuidle/cpuidle-exynos.c b/drivers/cpuidle/cpuidle-exynos.c new file mode 100644 index 0000000000..b2b5666e05 --- /dev/null +++ b/drivers/cpuidle/cpuidle-exynos.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2011-2014 Samsung Electronics Co., Ltd. + * http://www.samsung.com + * + * Coupled cpuidle support based on the work of: + * Colin Cross <ccross@android.com> + * Daniel Lezcano <daniel.lezcano@linaro.org> +*/ + +#include <linux/cpuidle.h> +#include <linux/cpu_pm.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/of.h> +#include <linux/platform_data/cpuidle-exynos.h> + +#include <asm/suspend.h> +#include <asm/cpuidle.h> + +static atomic_t exynos_idle_barrier; + +static struct cpuidle_exynos_data *exynos_cpuidle_pdata; +static void (*exynos_enter_aftr)(void); + +static int exynos_enter_coupled_lowpower(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + int ret; + + exynos_cpuidle_pdata->pre_enter_aftr(); + + /* + * Waiting all cpus to reach this point at the same moment + */ + cpuidle_coupled_parallel_barrier(dev, &exynos_idle_barrier); + + /* + * Both cpus will reach this point at the same time + */ + ret = dev->cpu ? exynos_cpuidle_pdata->cpu1_powerdown() + : exynos_cpuidle_pdata->cpu0_enter_aftr(); + if (ret) + index = ret; + + /* + * Waiting all cpus to finish the power sequence before going further + */ + cpuidle_coupled_parallel_barrier(dev, &exynos_idle_barrier); + + exynos_cpuidle_pdata->post_enter_aftr(); + + return index; +} + +static int exynos_enter_lowpower(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + int new_index = index; + + /* AFTR can only be entered when cores other than CPU0 are offline */ + if (num_online_cpus() > 1 || dev->cpu != 0) + new_index = drv->safe_state_index; + + if (new_index == 0) + return arm_cpuidle_simple_enter(dev, drv, new_index); + + exynos_enter_aftr(); + + return new_index; +} + +static struct cpuidle_driver exynos_idle_driver = { + .name = "exynos_idle", + .owner = THIS_MODULE, + .states = { + [0] = ARM_CPUIDLE_WFI_STATE, + [1] = { + .enter = exynos_enter_lowpower, + .exit_latency = 300, + .target_residency = 10000, + .name = "C1", + .desc = "ARM power down", + }, + }, + .state_count = 2, + .safe_state_index = 0, +}; + +static struct cpuidle_driver exynos_coupled_idle_driver = { + .name = "exynos_coupled_idle", + .owner = THIS_MODULE, + .states = { + [0] = ARM_CPUIDLE_WFI_STATE, + [1] = { + .enter = exynos_enter_coupled_lowpower, + .exit_latency = 5000, + .target_residency = 10000, + .flags = CPUIDLE_FLAG_COUPLED | + CPUIDLE_FLAG_TIMER_STOP, + .name = "C1", + .desc = "ARM power down", + }, + }, + .state_count = 2, + .safe_state_index = 0, +}; + +static int exynos_cpuidle_probe(struct platform_device *pdev) +{ + int ret; + + if (IS_ENABLED(CONFIG_SMP) && + (of_machine_is_compatible("samsung,exynos4210") || + of_machine_is_compatible("samsung,exynos3250"))) { + exynos_cpuidle_pdata = pdev->dev.platform_data; + + ret = cpuidle_register(&exynos_coupled_idle_driver, + cpu_possible_mask); + } else { + exynos_enter_aftr = (void *)(pdev->dev.platform_data); + + ret = cpuidle_register(&exynos_idle_driver, NULL); + } + + if (ret) { + dev_err(&pdev->dev, "failed to register cpuidle driver\n"); + return ret; + } + + return 0; +} + +static struct platform_driver exynos_cpuidle_driver = { + .probe = exynos_cpuidle_probe, + .driver = { + .name = "exynos_cpuidle", + }, +}; +builtin_platform_driver(exynos_cpuidle_driver); diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c new file mode 100644 index 0000000000..d8515d5c08 --- /dev/null +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * cpuidle driver for haltpoll governor. + * + * Copyright 2019 Red Hat, Inc. and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Authors: Marcelo Tosatti <mtosatti@redhat.com> + */ + +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/module.h> +#include <linux/sched/idle.h> +#include <linux/kvm_para.h> +#include <linux/cpuidle_haltpoll.h> + +static bool force __read_mostly; +module_param(force, bool, 0444); +MODULE_PARM_DESC(force, "Load unconditionally"); + +static struct cpuidle_device __percpu *haltpoll_cpuidle_devices; +static enum cpuhp_state haltpoll_hp_state; + +static __cpuidle int default_enter_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + if (current_clr_polling_and_test()) + return index; + + arch_cpu_idle(); + return index; +} + +static struct cpuidle_driver haltpoll_driver = { + .name = "haltpoll", + .governor = "haltpoll", + .states = { + { /* entry 0 is for polling */ }, + { + .enter = default_enter_idle, + .exit_latency = 1, + .target_residency = 1, + .power_usage = -1, + .name = "haltpoll idle", + .desc = "default architecture idle", + }, + }, + .safe_state_index = 0, + .state_count = 2, +}; + +static int haltpoll_cpu_online(unsigned int cpu) +{ + struct cpuidle_device *dev; + + dev = per_cpu_ptr(haltpoll_cpuidle_devices, cpu); + if (!dev->registered) { + dev->cpu = cpu; + if (cpuidle_register_device(dev)) { + pr_notice("cpuidle_register_device %d failed!\n", cpu); + return -EIO; + } + arch_haltpoll_enable(cpu); + } + + return 0; +} + +static int haltpoll_cpu_offline(unsigned int cpu) +{ + struct cpuidle_device *dev; + + dev = per_cpu_ptr(haltpoll_cpuidle_devices, cpu); + if (dev->registered) { + arch_haltpoll_disable(cpu); + cpuidle_unregister_device(dev); + } + + return 0; +} + +static void haltpoll_uninit(void) +{ + if (haltpoll_hp_state) + cpuhp_remove_state(haltpoll_hp_state); + cpuidle_unregister_driver(&haltpoll_driver); + + free_percpu(haltpoll_cpuidle_devices); + haltpoll_cpuidle_devices = NULL; +} + +static bool haltpoll_want(void) +{ + return kvm_para_has_hint(KVM_HINTS_REALTIME) || force; +} + +static int __init haltpoll_init(void) +{ + int ret; + struct cpuidle_driver *drv = &haltpoll_driver; + + /* Do not load haltpoll if idle= is passed */ + if (boot_option_idle_override != IDLE_NO_OVERRIDE) + return -ENODEV; + + if (!kvm_para_available() || !haltpoll_want()) + return -ENODEV; + + cpuidle_poll_state_init(drv); + + ret = cpuidle_register_driver(drv); + if (ret < 0) + return ret; + + haltpoll_cpuidle_devices = alloc_percpu(struct cpuidle_device); + if (haltpoll_cpuidle_devices == NULL) { + cpuidle_unregister_driver(drv); + return -ENOMEM; + } + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "cpuidle/haltpoll:online", + haltpoll_cpu_online, haltpoll_cpu_offline); + if (ret < 0) { + haltpoll_uninit(); + } else { + haltpoll_hp_state = ret; + ret = 0; + } + + return ret; +} + +static void __exit haltpoll_exit(void) +{ + haltpoll_uninit(); +} + +module_init(haltpoll_init); +module_exit(haltpoll_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marcelo Tosatti <mtosatti@redhat.com>"); diff --git a/drivers/cpuidle/cpuidle-kirkwood.c b/drivers/cpuidle/cpuidle-kirkwood.c new file mode 100644 index 0000000000..13bf743f88 --- /dev/null +++ b/drivers/cpuidle/cpuidle-kirkwood.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * CPU idle Marvell Kirkwood SoCs + * + * The cpu idle uses wait-for-interrupt and DDR self refresh in order + * to implement two idle states - + * #1 wait-for-interrupt + * #2 wait-for-interrupt and DDR self refresh + * + * Maintainer: Jason Cooper <jason@lakedaemon.net> + * Maintainer: Andrew Lunn <andrew@lunn.ch> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/cpuidle.h> +#include <linux/io.h> +#include <linux/export.h> +#include <asm/cpuidle.h> + +#define KIRKWOOD_MAX_STATES 2 + +static void __iomem *ddr_operation_base; + +/* Actual code that puts the SoC in different idle states */ +static int kirkwood_enter_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + writel(0x7, ddr_operation_base); + cpu_do_idle(); + + return index; +} + +static struct cpuidle_driver kirkwood_idle_driver = { + .name = "kirkwood_idle", + .owner = THIS_MODULE, + .states[0] = ARM_CPUIDLE_WFI_STATE, + .states[1] = { + .enter = kirkwood_enter_idle, + .exit_latency = 10, + .target_residency = 100000, + .name = "DDR SR", + .desc = "WFI and DDR Self Refresh", + }, + .state_count = KIRKWOOD_MAX_STATES, +}; + +/* Initialize CPU idle by registering the idle states */ +static int kirkwood_cpuidle_probe(struct platform_device *pdev) +{ + ddr_operation_base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(ddr_operation_base)) + return PTR_ERR(ddr_operation_base); + + return cpuidle_register(&kirkwood_idle_driver, NULL); +} + +static int kirkwood_cpuidle_remove(struct platform_device *pdev) +{ + cpuidle_unregister(&kirkwood_idle_driver); + return 0; +} + +static struct platform_driver kirkwood_cpuidle_driver = { + .probe = kirkwood_cpuidle_probe, + .remove = kirkwood_cpuidle_remove, + .driver = { + .name = "kirkwood_cpuidle", + }, +}; + +module_platform_driver(kirkwood_cpuidle_driver); + +MODULE_AUTHOR("Andrew Lunn <andrew@lunn.ch>"); +MODULE_DESCRIPTION("Kirkwood cpu idle driver"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:kirkwood-cpuidle"); diff --git a/drivers/cpuidle/cpuidle-mvebu-v7.c b/drivers/cpuidle/cpuidle-mvebu-v7.c new file mode 100644 index 0000000000..563dba609b --- /dev/null +++ b/drivers/cpuidle/cpuidle-mvebu-v7.c @@ -0,0 +1,144 @@ +/* + * Marvell Armada 370, 38x and XP SoC cpuidle driver + * + * Copyright (C) 2014 Marvell + * + * Nadav Haklai <nadavh@marvell.com> + * Gregory CLEMENT <gregory.clement@free-electrons.com> + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + * + * Maintainer: Gregory CLEMENT <gregory.clement@free-electrons.com> + */ + +#include <linux/cpu_pm.h> +#include <linux/cpuidle.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/suspend.h> +#include <linux/platform_device.h> +#include <asm/cpuidle.h> + +#define MVEBU_V7_FLAG_DEEP_IDLE 0x10000 + +static int (*mvebu_v7_cpu_suspend)(int); + +static __cpuidle int mvebu_v7_enter_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + int ret; + bool deepidle = false; + cpu_pm_enter(); + + if (drv->states[index].flags & MVEBU_V7_FLAG_DEEP_IDLE) + deepidle = true; + + ct_cpuidle_enter(); + ret = mvebu_v7_cpu_suspend(deepidle); + ct_cpuidle_exit(); + + cpu_pm_exit(); + + if (ret) + return ret; + + return index; +} + +static struct cpuidle_driver armadaxp_idle_driver = { + .name = "armada_xp_idle", + .states[0] = ARM_CPUIDLE_WFI_STATE, + .states[1] = { + .enter = mvebu_v7_enter_idle, + .exit_latency = 100, + .power_usage = 50, + .target_residency = 1000, + .flags = CPUIDLE_FLAG_RCU_IDLE, + .name = "MV CPU IDLE", + .desc = "CPU power down", + }, + .states[2] = { + .enter = mvebu_v7_enter_idle, + .exit_latency = 1000, + .power_usage = 5, + .target_residency = 10000, + .flags = MVEBU_V7_FLAG_DEEP_IDLE | CPUIDLE_FLAG_RCU_IDLE, + .name = "MV CPU DEEP IDLE", + .desc = "CPU and L2 Fabric power down", + }, + .state_count = 3, +}; + +static struct cpuidle_driver armada370_idle_driver = { + .name = "armada_370_idle", + .states[0] = ARM_CPUIDLE_WFI_STATE, + .states[1] = { + .enter = mvebu_v7_enter_idle, + .exit_latency = 100, + .power_usage = 5, + .target_residency = 1000, + .flags = MVEBU_V7_FLAG_DEEP_IDLE | CPUIDLE_FLAG_RCU_IDLE, + .name = "Deep Idle", + .desc = "CPU and L2 Fabric power down", + }, + .state_count = 2, +}; + +static struct cpuidle_driver armada38x_idle_driver = { + .name = "armada_38x_idle", + .states[0] = ARM_CPUIDLE_WFI_STATE, + .states[1] = { + .enter = mvebu_v7_enter_idle, + .exit_latency = 10, + .power_usage = 5, + .target_residency = 100, + .flags = CPUIDLE_FLAG_RCU_IDLE, + .name = "Idle", + .desc = "CPU and SCU power down", + }, + .state_count = 2, +}; + +static int mvebu_v7_cpuidle_probe(struct platform_device *pdev) +{ + const struct platform_device_id *id = pdev->id_entry; + + if (!id) + return -EINVAL; + + mvebu_v7_cpu_suspend = pdev->dev.platform_data; + + return cpuidle_register((struct cpuidle_driver *)id->driver_data, NULL); +} + +static const struct platform_device_id mvebu_cpuidle_ids[] = { + { + .name = "cpuidle-armada-xp", + .driver_data = (unsigned long)&armadaxp_idle_driver, + }, { + .name = "cpuidle-armada-370", + .driver_data = (unsigned long)&armada370_idle_driver, + }, { + .name = "cpuidle-armada-38x", + .driver_data = (unsigned long)&armada38x_idle_driver, + }, + {} +}; + +static struct platform_driver mvebu_cpuidle_driver = { + .probe = mvebu_v7_cpuidle_probe, + .driver = { + .name = "cpuidle-mbevu", + .suppress_bind_attrs = true, + }, + .id_table = mvebu_cpuidle_ids, +}; + +builtin_platform_driver(mvebu_cpuidle_driver); + +MODULE_AUTHOR("Gregory CLEMENT <gregory.clement@free-electrons.com>"); +MODULE_DESCRIPTION("Marvell EBU v7 cpuidle driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c new file mode 100644 index 0000000000..9ebedd972d --- /dev/null +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * cpuidle-powernv - idle state cpuidle driver. + * Adapted from drivers/cpuidle/cpuidle-pseries + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/moduleparam.h> +#include <linux/cpuidle.h> +#include <linux/cpu.h> +#include <linux/notifier.h> +#include <linux/clockchips.h> +#include <linux/of.h> +#include <linux/slab.h> + +#include <asm/machdep.h> +#include <asm/firmware.h> +#include <asm/opal.h> +#include <asm/runlatch.h> +#include <asm/cpuidle.h> + +/* + * Expose only those Hardware idle states via the cpuidle framework + * that have latency value below POWERNV_THRESHOLD_LATENCY_NS. + */ +#define POWERNV_THRESHOLD_LATENCY_NS 200000 + +static struct cpuidle_driver powernv_idle_driver = { + .name = "powernv_idle", + .owner = THIS_MODULE, +}; + +static int max_idle_state __read_mostly; +static struct cpuidle_state *cpuidle_state_table __read_mostly; + +struct stop_psscr_table { + u64 val; + u64 mask; +}; + +static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly; + +static u64 default_snooze_timeout __read_mostly; +static bool snooze_timeout_en __read_mostly; + +static u64 get_snooze_timeout(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + int i; + + if (unlikely(!snooze_timeout_en)) + return default_snooze_timeout; + + for (i = index + 1; i < drv->state_count; i++) { + if (dev->states_usage[i].disable) + continue; + + return drv->states[i].target_residency * tb_ticks_per_usec; + } + + return default_snooze_timeout; +} + +static int snooze_loop(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + u64 snooze_exit_time; + + set_thread_flag(TIF_POLLING_NRFLAG); + + local_irq_enable(); + + snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index); + dev->poll_time_limit = false; + ppc64_runlatch_off(); + HMT_very_low(); + while (!need_resched()) { + if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) { + /* + * Task has not woken up but we are exiting the polling + * loop anyway. Require a barrier after polling is + * cleared to order subsequent test of need_resched(). + */ + clear_thread_flag(TIF_POLLING_NRFLAG); + dev->poll_time_limit = true; + smp_mb(); + break; + } + } + + HMT_medium(); + ppc64_runlatch_on(); + clear_thread_flag(TIF_POLLING_NRFLAG); + + local_irq_disable(); + + return index; +} + +static int nap_loop(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + power7_idle_type(PNV_THREAD_NAP); + + return index; +} + +/* Register for fastsleep only in oneshot mode of broadcast */ +#ifdef CONFIG_TICK_ONESHOT +static int fastsleep_loop(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + unsigned long old_lpcr = mfspr(SPRN_LPCR); + unsigned long new_lpcr; + + if (unlikely(system_state < SYSTEM_RUNNING)) + return index; + + new_lpcr = old_lpcr; + /* Do not exit powersave upon decrementer as we've setup the timer + * offload. + */ + new_lpcr &= ~LPCR_PECE1; + + mtspr(SPRN_LPCR, new_lpcr); + + power7_idle_type(PNV_THREAD_SLEEP); + + mtspr(SPRN_LPCR, old_lpcr); + + return index; +} +#endif + +static int stop_loop(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + arch300_idle_type(stop_psscr_table[index].val, + stop_psscr_table[index].mask); + return index; +} + +/* + * States for dedicated partition case. + */ +static struct cpuidle_state powernv_states[CPUIDLE_STATE_MAX] = { + { /* Snooze */ + .name = "snooze", + .desc = "snooze", + .exit_latency = 0, + .target_residency = 0, + .enter = snooze_loop, + .flags = CPUIDLE_FLAG_POLLING }, +}; + +static int powernv_cpuidle_cpu_online(unsigned int cpu) +{ + struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu); + + if (dev && cpuidle_get_driver()) { + cpuidle_pause_and_lock(); + cpuidle_enable_device(dev); + cpuidle_resume_and_unlock(); + } + return 0; +} + +static int powernv_cpuidle_cpu_dead(unsigned int cpu) +{ + struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu); + + if (dev && cpuidle_get_driver()) { + cpuidle_pause_and_lock(); + cpuidle_disable_device(dev); + cpuidle_resume_and_unlock(); + } + return 0; +} + +/* + * powernv_cpuidle_driver_init() + */ +static int powernv_cpuidle_driver_init(void) +{ + int idle_state; + struct cpuidle_driver *drv = &powernv_idle_driver; + + drv->state_count = 0; + + for (idle_state = 0; idle_state < max_idle_state; ++idle_state) { + /* Is the state not enabled? */ + if (cpuidle_state_table[idle_state].enter == NULL) + continue; + + drv->states[drv->state_count] = /* structure copy */ + cpuidle_state_table[idle_state]; + + drv->state_count += 1; + } + + /* + * On the PowerNV platform cpu_present may be less than cpu_possible in + * cases when firmware detects the CPU, but it is not available to the + * OS. If CONFIG_HOTPLUG_CPU=n, then such CPUs are not hotplugable at + * run time and hence cpu_devices are not created for those CPUs by the + * generic topology_init(). + * + * drv->cpumask defaults to cpu_possible_mask in + * __cpuidle_driver_init(). This breaks cpuidle on PowerNV where + * cpu_devices are not created for CPUs in cpu_possible_mask that + * cannot be hot-added later at run time. + * + * Trying cpuidle_register_device() on a CPU without a cpu_device is + * incorrect, so pass a correct CPU mask to the generic cpuidle driver. + */ + + drv->cpumask = (struct cpumask *)cpu_present_mask; + + return 0; +} + +static inline void add_powernv_state(int index, const char *name, + unsigned int flags, + int (*idle_fn)(struct cpuidle_device *, + struct cpuidle_driver *, + int), + unsigned int target_residency, + unsigned int exit_latency, + u64 psscr_val, u64 psscr_mask) +{ + strscpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN); + strscpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN); + powernv_states[index].flags = flags; + powernv_states[index].target_residency = target_residency; + powernv_states[index].exit_latency = exit_latency; + powernv_states[index].enter = idle_fn; + /* For power8 and below psscr_* will be 0 */ + stop_psscr_table[index].val = psscr_val; + stop_psscr_table[index].mask = psscr_mask; +} + +extern u32 pnv_get_supported_cpuidle_states(void); +static int powernv_add_idle_states(void) +{ + int nr_idle_states = 1; /* Snooze */ + int dt_idle_states; + u32 has_stop_states = 0; + int i; + u32 supported_flags = pnv_get_supported_cpuidle_states(); + + + /* Currently we have snooze statically defined */ + if (nr_pnv_idle_states <= 0) { + pr_warn("cpuidle-powernv : Only Snooze is available\n"); + goto out; + } + + /* TODO: Count only states which are eligible for cpuidle */ + dt_idle_states = nr_pnv_idle_states; + + /* + * Since snooze is used as first idle state, max idle states allowed is + * CPUIDLE_STATE_MAX -1 + */ + if (nr_pnv_idle_states > CPUIDLE_STATE_MAX - 1) { + pr_warn("cpuidle-powernv: discovered idle states more than allowed"); + dt_idle_states = CPUIDLE_STATE_MAX - 1; + } + + /* + * If the idle states use stop instruction, probe for psscr values + * and psscr mask which are necessary to specify required stop level. + */ + has_stop_states = (pnv_idle_states[0].flags & + (OPAL_PM_STOP_INST_FAST | OPAL_PM_STOP_INST_DEEP)); + + for (i = 0; i < dt_idle_states; i++) { + unsigned int exit_latency, target_residency; + bool stops_timebase = false; + struct pnv_idle_states_t *state = &pnv_idle_states[i]; + + /* + * Skip the platform idle state whose flag isn't in + * the supported_cpuidle_states flag mask. + */ + if ((state->flags & supported_flags) != state->flags) + continue; + /* + * If an idle state has exit latency beyond + * POWERNV_THRESHOLD_LATENCY_NS then don't use it + * in cpu-idle. + */ + if (state->latency_ns > POWERNV_THRESHOLD_LATENCY_NS) + continue; + /* + * Firmware passes residency and latency values in ns. + * cpuidle expects it in us. + */ + exit_latency = DIV_ROUND_UP(state->latency_ns, 1000); + target_residency = DIV_ROUND_UP(state->residency_ns, 1000); + + if (has_stop_states && !(state->valid)) + continue; + + if (state->flags & OPAL_PM_TIMEBASE_STOP) + stops_timebase = true; + + if (state->flags & OPAL_PM_NAP_ENABLED) { + /* Add NAP state */ + add_powernv_state(nr_idle_states, "Nap", + CPUIDLE_FLAG_NONE, nap_loop, + target_residency, exit_latency, 0, 0); + } else if (has_stop_states && !stops_timebase) { + add_powernv_state(nr_idle_states, state->name, + CPUIDLE_FLAG_NONE, stop_loop, + target_residency, exit_latency, + state->psscr_val, + state->psscr_mask); + } + + /* + * All cpuidle states with CPUIDLE_FLAG_TIMER_STOP set must come + * within this config dependency check. + */ +#ifdef CONFIG_TICK_ONESHOT + else if (state->flags & OPAL_PM_SLEEP_ENABLED || + state->flags & OPAL_PM_SLEEP_ENABLED_ER1) { + /* Add FASTSLEEP state */ + add_powernv_state(nr_idle_states, "FastSleep", + CPUIDLE_FLAG_TIMER_STOP, + fastsleep_loop, + target_residency, exit_latency, 0, 0); + } else if (has_stop_states && stops_timebase) { + add_powernv_state(nr_idle_states, state->name, + CPUIDLE_FLAG_TIMER_STOP, stop_loop, + target_residency, exit_latency, + state->psscr_val, + state->psscr_mask); + } +#endif + else + continue; + nr_idle_states++; + } +out: + return nr_idle_states; +} + +/* + * powernv_idle_probe() + * Choose state table for shared versus dedicated partition + */ +static int powernv_idle_probe(void) +{ + if (cpuidle_disable != IDLE_NO_OVERRIDE) + return -ENODEV; + + if (firmware_has_feature(FW_FEATURE_OPAL)) { + cpuidle_state_table = powernv_states; + /* Device tree can indicate more idle states */ + max_idle_state = powernv_add_idle_states(); + default_snooze_timeout = TICK_USEC * tb_ticks_per_usec; + if (max_idle_state > 1) + snooze_timeout_en = true; + } else + return -ENODEV; + + return 0; +} + +static int __init powernv_processor_idle_init(void) +{ + int retval; + + retval = powernv_idle_probe(); + if (retval) + return retval; + + powernv_cpuidle_driver_init(); + retval = cpuidle_register(&powernv_idle_driver, NULL); + if (retval) { + printk(KERN_DEBUG "Registration of powernv driver failed.\n"); + return retval; + } + + retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "cpuidle/powernv:online", + powernv_cpuidle_cpu_online, NULL); + WARN_ON(retval < 0); + retval = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_DEAD, + "cpuidle/powernv:dead", NULL, + powernv_cpuidle_cpu_dead); + WARN_ON(retval < 0); + printk(KERN_DEBUG "powernv_idle_driver registered\n"); + return 0; +} + +device_initcall(powernv_processor_idle_init); diff --git a/drivers/cpuidle/cpuidle-psci-domain.c b/drivers/cpuidle/cpuidle-psci-domain.c new file mode 100644 index 0000000000..b88af1262f --- /dev/null +++ b/drivers/cpuidle/cpuidle-psci-domain.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PM domains for CPUs via genpd - managed by cpuidle-psci. + * + * Copyright (C) 2019 Linaro Ltd. + * Author: Ulf Hansson <ulf.hansson@linaro.org> + * + */ + +#define pr_fmt(fmt) "CPUidle PSCI: " fmt + +#include <linux/cpu.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/platform_device.h> +#include <linux/pm_domain.h> +#include <linux/pm_runtime.h> +#include <linux/psci.h> +#include <linux/slab.h> +#include <linux/string.h> + +#include "cpuidle-psci.h" + +struct psci_pd_provider { + struct list_head link; + struct device_node *node; +}; + +static LIST_HEAD(psci_pd_providers); +static bool psci_pd_allow_domain_state; + +static int psci_pd_power_off(struct generic_pm_domain *pd) +{ + struct genpd_power_state *state = &pd->states[pd->state_idx]; + u32 *pd_state; + + if (!state->data) + return 0; + + if (!psci_pd_allow_domain_state) + return -EBUSY; + + /* OSI mode is enabled, set the corresponding domain state. */ + pd_state = state->data; + psci_set_domain_state(*pd_state); + + return 0; +} + +static int psci_pd_init(struct device_node *np, bool use_osi) +{ + struct generic_pm_domain *pd; + struct psci_pd_provider *pd_provider; + struct dev_power_governor *pd_gov; + int ret = -ENOMEM; + + pd = dt_idle_pd_alloc(np, psci_dt_parse_state_node); + if (!pd) + goto out; + + pd_provider = kzalloc(sizeof(*pd_provider), GFP_KERNEL); + if (!pd_provider) + goto free_pd; + + pd->flags |= GENPD_FLAG_IRQ_SAFE | GENPD_FLAG_CPU_DOMAIN; + + /* + * Allow power off when OSI has been successfully enabled. + * PREEMPT_RT is not yet ready to enter domain idle states. + */ + if (use_osi && !IS_ENABLED(CONFIG_PREEMPT_RT)) + pd->power_off = psci_pd_power_off; + else + pd->flags |= GENPD_FLAG_ALWAYS_ON; + + /* Use governor for CPU PM domains if it has some states to manage. */ + pd_gov = pd->states ? &pm_domain_cpu_gov : NULL; + + ret = pm_genpd_init(pd, pd_gov, false); + if (ret) + goto free_pd_prov; + + ret = of_genpd_add_provider_simple(np, pd); + if (ret) + goto remove_pd; + + pd_provider->node = of_node_get(np); + list_add(&pd_provider->link, &psci_pd_providers); + + pr_debug("init PM domain %s\n", pd->name); + return 0; + +remove_pd: + pm_genpd_remove(pd); +free_pd_prov: + kfree(pd_provider); +free_pd: + dt_idle_pd_free(pd); +out: + pr_err("failed to init PM domain ret=%d %pOF\n", ret, np); + return ret; +} + +static void psci_pd_remove(void) +{ + struct psci_pd_provider *pd_provider, *it; + struct generic_pm_domain *genpd; + + list_for_each_entry_safe_reverse(pd_provider, it, + &psci_pd_providers, link) { + of_genpd_del_provider(pd_provider->node); + + genpd = of_genpd_remove_last(pd_provider->node); + if (!IS_ERR(genpd)) + kfree(genpd); + + of_node_put(pd_provider->node); + list_del(&pd_provider->link); + kfree(pd_provider); + } +} + +static void psci_cpuidle_domain_sync_state(struct device *dev) +{ + /* + * All devices have now been attached/probed to the PM domain topology, + * hence it's fine to allow domain states to be picked. + */ + psci_pd_allow_domain_state = true; +} + +static const struct of_device_id psci_of_match[] = { + { .compatible = "arm,psci-1.0" }, + {} +}; + +static int psci_cpuidle_domain_probe(struct platform_device *pdev) +{ + struct device_node *np = pdev->dev.of_node; + struct device_node *node; + bool use_osi = psci_has_osi_support(); + int ret = 0, pd_count = 0; + + if (!np) + return -ENODEV; + + /* + * Parse child nodes for the "#power-domain-cells" property and + * initialize a genpd/genpd-of-provider pair when it's found. + */ + for_each_child_of_node(np, node) { + if (!of_property_present(node, "#power-domain-cells")) + continue; + + ret = psci_pd_init(node, use_osi); + if (ret) { + of_node_put(node); + goto exit; + } + + pd_count++; + } + + /* Bail out if not using the hierarchical CPU topology. */ + if (!pd_count) + return 0; + + /* Link genpd masters/subdomains to model the CPU topology. */ + ret = dt_idle_pd_init_topology(np); + if (ret) + goto remove_pd; + + /* let's try to enable OSI. */ + ret = psci_set_osi_mode(use_osi); + if (ret) + goto remove_pd; + + pr_info("Initialized CPU PM domain topology using %s mode\n", + use_osi ? "OSI" : "PC"); + return 0; + +remove_pd: + dt_idle_pd_remove_topology(np); + psci_pd_remove(); +exit: + pr_err("failed to create CPU PM domains ret=%d\n", ret); + return ret; +} + +static struct platform_driver psci_cpuidle_domain_driver = { + .probe = psci_cpuidle_domain_probe, + .driver = { + .name = "psci-cpuidle-domain", + .of_match_table = psci_of_match, + .sync_state = psci_cpuidle_domain_sync_state, + }, +}; + +static int __init psci_idle_init_domains(void) +{ + return platform_driver_register(&psci_cpuidle_domain_driver); +} +subsys_initcall(psci_idle_init_domains); diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c new file mode 100644 index 0000000000..bf68920d03 --- /dev/null +++ b/drivers/cpuidle/cpuidle-psci.c @@ -0,0 +1,449 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * PSCI CPU idle driver. + * + * Copyright (C) 2019 ARM Ltd. + * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> + */ + +#define pr_fmt(fmt) "CPUidle PSCI: " fmt + +#include <linux/cpuhotplug.h> +#include <linux/cpu_cooling.h> +#include <linux/cpuidle.h> +#include <linux/cpumask.h> +#include <linux/cpu_pm.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/platform_device.h> +#include <linux/psci.h> +#include <linux/pm_domain.h> +#include <linux/pm_runtime.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/syscore_ops.h> + +#include <asm/cpuidle.h> + +#include "cpuidle-psci.h" +#include "dt_idle_states.h" + +struct psci_cpuidle_data { + u32 *psci_states; + struct device *dev; +}; + +static DEFINE_PER_CPU_READ_MOSTLY(struct psci_cpuidle_data, psci_cpuidle_data); +static DEFINE_PER_CPU(u32, domain_state); +static bool psci_cpuidle_use_cpuhp; + +void psci_set_domain_state(u32 state) +{ + __this_cpu_write(domain_state, state); +} + +static inline u32 psci_get_domain_state(void) +{ + return __this_cpu_read(domain_state); +} + +static __cpuidle int __psci_enter_domain_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx, + bool s2idle) +{ + struct psci_cpuidle_data *data = this_cpu_ptr(&psci_cpuidle_data); + u32 *states = data->psci_states; + struct device *pd_dev = data->dev; + u32 state; + int ret; + + ret = cpu_pm_enter(); + if (ret) + return -1; + + /* Do runtime PM to manage a hierarchical CPU toplogy. */ + if (s2idle) + dev_pm_genpd_suspend(pd_dev); + else + pm_runtime_put_sync_suspend(pd_dev); + + state = psci_get_domain_state(); + if (!state) + state = states[idx]; + + ret = psci_cpu_suspend_enter(state) ? -1 : idx; + + if (s2idle) + dev_pm_genpd_resume(pd_dev); + else + pm_runtime_get_sync(pd_dev); + + cpu_pm_exit(); + + /* Clear the domain state to start fresh when back from idle. */ + psci_set_domain_state(0); + return ret; +} + +static int psci_enter_domain_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx) +{ + return __psci_enter_domain_idle_state(dev, drv, idx, false); +} + +static int psci_enter_s2idle_domain_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int idx) +{ + return __psci_enter_domain_idle_state(dev, drv, idx, true); +} + +static int psci_idle_cpuhp_up(unsigned int cpu) +{ + struct device *pd_dev = __this_cpu_read(psci_cpuidle_data.dev); + + if (pd_dev) + pm_runtime_get_sync(pd_dev); + + return 0; +} + +static int psci_idle_cpuhp_down(unsigned int cpu) +{ + struct device *pd_dev = __this_cpu_read(psci_cpuidle_data.dev); + + if (pd_dev) { + pm_runtime_put_sync(pd_dev); + /* Clear domain state to start fresh at next online. */ + psci_set_domain_state(0); + } + + return 0; +} + +static void psci_idle_syscore_switch(bool suspend) +{ + bool cleared = false; + struct device *dev; + int cpu; + + for_each_possible_cpu(cpu) { + dev = per_cpu_ptr(&psci_cpuidle_data, cpu)->dev; + + if (dev && suspend) { + dev_pm_genpd_suspend(dev); + } else if (dev) { + dev_pm_genpd_resume(dev); + + /* Account for userspace having offlined a CPU. */ + if (pm_runtime_status_suspended(dev)) + pm_runtime_set_active(dev); + + /* Clear domain state to re-start fresh. */ + if (!cleared) { + psci_set_domain_state(0); + cleared = true; + } + } + } +} + +static int psci_idle_syscore_suspend(void) +{ + psci_idle_syscore_switch(true); + return 0; +} + +static void psci_idle_syscore_resume(void) +{ + psci_idle_syscore_switch(false); +} + +static struct syscore_ops psci_idle_syscore_ops = { + .suspend = psci_idle_syscore_suspend, + .resume = psci_idle_syscore_resume, +}; + +static void psci_idle_init_cpuhp(void) +{ + int err; + + if (!psci_cpuidle_use_cpuhp) + return; + + register_syscore_ops(&psci_idle_syscore_ops); + + err = cpuhp_setup_state_nocalls(CPUHP_AP_CPU_PM_STARTING, + "cpuidle/psci:online", + psci_idle_cpuhp_up, + psci_idle_cpuhp_down); + if (err) + pr_warn("Failed %d while setup cpuhp state\n", err); +} + +static __cpuidle int psci_enter_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx) +{ + u32 *state = __this_cpu_read(psci_cpuidle_data.psci_states); + + return CPU_PM_CPU_IDLE_ENTER_PARAM_RCU(psci_cpu_suspend_enter, idx, state[idx]); +} + +static const struct of_device_id psci_idle_state_match[] = { + { .compatible = "arm,idle-state", + .data = psci_enter_idle_state }, + { }, +}; + +int psci_dt_parse_state_node(struct device_node *np, u32 *state) +{ + int err = of_property_read_u32(np, "arm,psci-suspend-param", state); + + if (err) { + pr_warn("%pOF missing arm,psci-suspend-param property\n", np); + return err; + } + + if (!psci_power_state_is_valid(*state)) { + pr_warn("Invalid PSCI power state %#x\n", *state); + return -EINVAL; + } + + return 0; +} + +static int psci_dt_cpu_init_topology(struct cpuidle_driver *drv, + struct psci_cpuidle_data *data, + unsigned int state_count, int cpu) +{ + /* Currently limit the hierarchical topology to be used in OSI mode. */ + if (!psci_has_osi_support()) + return 0; + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return 0; + + data->dev = psci_dt_attach_cpu(cpu); + if (IS_ERR_OR_NULL(data->dev)) + return PTR_ERR_OR_ZERO(data->dev); + + /* + * Using the deepest state for the CPU to trigger a potential selection + * of a shared state for the domain, assumes the domain states are all + * deeper states. + */ + drv->states[state_count - 1].flags |= CPUIDLE_FLAG_RCU_IDLE; + drv->states[state_count - 1].enter = psci_enter_domain_idle_state; + drv->states[state_count - 1].enter_s2idle = psci_enter_s2idle_domain_idle_state; + psci_cpuidle_use_cpuhp = true; + + return 0; +} + +static int psci_dt_cpu_init_idle(struct device *dev, struct cpuidle_driver *drv, + struct device_node *cpu_node, + unsigned int state_count, int cpu) +{ + int i, ret = 0; + u32 *psci_states; + struct device_node *state_node; + struct psci_cpuidle_data *data = per_cpu_ptr(&psci_cpuidle_data, cpu); + + state_count++; /* Add WFI state too */ + psci_states = devm_kcalloc(dev, state_count, sizeof(*psci_states), + GFP_KERNEL); + if (!psci_states) + return -ENOMEM; + + for (i = 1; i < state_count; i++) { + state_node = of_get_cpu_state_node(cpu_node, i - 1); + if (!state_node) + break; + + ret = psci_dt_parse_state_node(state_node, &psci_states[i]); + of_node_put(state_node); + + if (ret) + return ret; + + pr_debug("psci-power-state %#x index %d\n", psci_states[i], i); + } + + if (i != state_count) + return -ENODEV; + + /* Initialize optional data, used for the hierarchical topology. */ + ret = psci_dt_cpu_init_topology(drv, data, state_count, cpu); + if (ret < 0) + return ret; + + /* Idle states parsed correctly, store them in the per-cpu struct. */ + data->psci_states = psci_states; + return 0; +} + +static int psci_cpu_init_idle(struct device *dev, struct cpuidle_driver *drv, + unsigned int cpu, unsigned int state_count) +{ + struct device_node *cpu_node; + int ret; + + /* + * If the PSCI cpu_suspend function hook has not been initialized + * idle states must not be enabled, so bail out + */ + if (!psci_ops.cpu_suspend) + return -EOPNOTSUPP; + + cpu_node = of_cpu_device_node_get(cpu); + if (!cpu_node) + return -ENODEV; + + ret = psci_dt_cpu_init_idle(dev, drv, cpu_node, state_count, cpu); + + of_node_put(cpu_node); + + return ret; +} + +static void psci_cpu_deinit_idle(int cpu) +{ + struct psci_cpuidle_data *data = per_cpu_ptr(&psci_cpuidle_data, cpu); + + psci_dt_detach_cpu(data->dev); + psci_cpuidle_use_cpuhp = false; +} + +static int psci_idle_init_cpu(struct device *dev, int cpu) +{ + struct cpuidle_driver *drv; + struct device_node *cpu_node; + const char *enable_method; + int ret = 0; + + cpu_node = of_cpu_device_node_get(cpu); + if (!cpu_node) + return -ENODEV; + + /* + * Check whether the enable-method for the cpu is PSCI, fail + * if it is not. + */ + enable_method = of_get_property(cpu_node, "enable-method", NULL); + if (!enable_method || (strcmp(enable_method, "psci"))) + ret = -ENODEV; + + of_node_put(cpu_node); + if (ret) + return ret; + + drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL); + if (!drv) + return -ENOMEM; + + drv->name = "psci_idle"; + drv->owner = THIS_MODULE; + drv->cpumask = (struct cpumask *)cpumask_of(cpu); + + /* + * PSCI idle states relies on architectural WFI to be represented as + * state index 0. + */ + drv->states[0].enter = psci_enter_idle_state; + drv->states[0].exit_latency = 1; + drv->states[0].target_residency = 1; + drv->states[0].power_usage = UINT_MAX; + strcpy(drv->states[0].name, "WFI"); + strcpy(drv->states[0].desc, "ARM WFI"); + + /* + * If no DT idle states are detected (ret == 0) let the driver + * initialization fail accordingly since there is no reason to + * initialize the idle driver if only wfi is supported, the + * default archictectural back-end already executes wfi + * on idle entry. + */ + ret = dt_init_idle_driver(drv, psci_idle_state_match, 1); + if (ret <= 0) + return ret ? : -ENODEV; + + /* + * Initialize PSCI idle states. + */ + ret = psci_cpu_init_idle(dev, drv, cpu, ret); + if (ret) { + pr_err("CPU %d failed to PSCI idle\n", cpu); + return ret; + } + + ret = cpuidle_register(drv, NULL); + if (ret) + goto deinit; + + cpuidle_cooling_register(drv); + + return 0; +deinit: + psci_cpu_deinit_idle(cpu); + return ret; +} + +/* + * psci_idle_probe - Initializes PSCI cpuidle driver + * + * Initializes PSCI cpuidle driver for all CPUs, if any CPU fails + * to register cpuidle driver then rollback to cancel all CPUs + * registration. + */ +static int psci_cpuidle_probe(struct platform_device *pdev) +{ + int cpu, ret; + struct cpuidle_driver *drv; + struct cpuidle_device *dev; + + for_each_possible_cpu(cpu) { + ret = psci_idle_init_cpu(&pdev->dev, cpu); + if (ret) + goto out_fail; + } + + psci_idle_init_cpuhp(); + return 0; + +out_fail: + while (--cpu >= 0) { + dev = per_cpu(cpuidle_devices, cpu); + drv = cpuidle_get_cpu_driver(dev); + cpuidle_unregister(drv); + psci_cpu_deinit_idle(cpu); + } + + return ret; +} + +static struct platform_driver psci_cpuidle_driver = { + .probe = psci_cpuidle_probe, + .driver = { + .name = "psci-cpuidle", + }, +}; + +static int __init psci_idle_init(void) +{ + struct platform_device *pdev; + int ret; + + ret = platform_driver_register(&psci_cpuidle_driver); + if (ret) + return ret; + + pdev = platform_device_register_simple("psci-cpuidle", -1, NULL, 0); + if (IS_ERR(pdev)) { + platform_driver_unregister(&psci_cpuidle_driver); + return PTR_ERR(pdev); + } + + return 0; +} +device_initcall(psci_idle_init); diff --git a/drivers/cpuidle/cpuidle-psci.h b/drivers/cpuidle/cpuidle-psci.h new file mode 100644 index 0000000000..4e132640ed --- /dev/null +++ b/drivers/cpuidle/cpuidle-psci.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __CPUIDLE_PSCI_H +#define __CPUIDLE_PSCI_H + +struct device; +struct device_node; + +void psci_set_domain_state(u32 state); +int psci_dt_parse_state_node(struct device_node *np, u32 *state); + +#ifdef CONFIG_ARM_PSCI_CPUIDLE_DOMAIN + +#include "dt_idle_genpd.h" + +static inline struct device *psci_dt_attach_cpu(int cpu) +{ + return dt_idle_attach_cpu(cpu, "psci"); +} + +static inline void psci_dt_detach_cpu(struct device *dev) +{ + dt_idle_detach_cpu(dev); +} + +#else +static inline struct device *psci_dt_attach_cpu(int cpu) { return NULL; } +static inline void psci_dt_detach_cpu(struct device *dev) { } +#endif + +#endif /* __CPUIDLE_PSCI_H */ diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c new file mode 100644 index 0000000000..14db9b7d98 --- /dev/null +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -0,0 +1,477 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * cpuidle-pseries - idle state cpuidle driver. + * Adapted from drivers/idle/intel_idle.c and + * drivers/acpi/processor_idle.c + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/moduleparam.h> +#include <linux/cpuidle.h> +#include <linux/cpu.h> +#include <linux/notifier.h> + +#include <asm/paca.h> +#include <asm/reg.h> +#include <asm/machdep.h> +#include <asm/firmware.h> +#include <asm/runlatch.h> +#include <asm/idle.h> +#include <asm/plpar_wrappers.h> +#include <asm/rtas.h> + +static struct cpuidle_driver pseries_idle_driver = { + .name = "pseries_idle", + .owner = THIS_MODULE, +}; + +static int max_idle_state __read_mostly; +static struct cpuidle_state *cpuidle_state_table __read_mostly; +static u64 snooze_timeout __read_mostly; +static bool snooze_timeout_en __read_mostly; + +static __cpuidle +int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, + int index) +{ + u64 snooze_exit_time; + + set_thread_flag(TIF_POLLING_NRFLAG); + + pseries_idle_prolog(); + raw_local_irq_enable(); + snooze_exit_time = get_tb() + snooze_timeout; + dev->poll_time_limit = false; + + while (!need_resched()) { + HMT_low(); + HMT_very_low(); + if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) { + /* + * Task has not woken up but we are exiting the polling + * loop anyway. Require a barrier after polling is + * cleared to order subsequent test of need_resched(). + */ + dev->poll_time_limit = true; + clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb(); + break; + } + } + + HMT_medium(); + clear_thread_flag(TIF_POLLING_NRFLAG); + + raw_local_irq_disable(); + + pseries_idle_epilog(); + + return index; +} + +static __cpuidle void check_and_cede_processor(void) +{ + /* + * Ensure our interrupt state is properly tracked, + * also checks if no interrupt has occurred while we + * were soft-disabled + */ + if (prep_irq_for_idle()) { + cede_processor(); +#ifdef CONFIG_TRACE_IRQFLAGS + /* Ensure that H_CEDE returns with IRQs on */ + if (WARN_ON(!(mfmsr() & MSR_EE))) + __hard_irq_enable(); +#endif + } +} + +/* + * XCEDE: Extended CEDE states discovered through the + * "ibm,get-systems-parameter" RTAS call with the token + * CEDE_LATENCY_TOKEN + */ + +/* + * Section 7.3.16 System Parameters Option of PAPR version 2.8.1 has a + * table with all the parameters to ibm,get-system-parameters. + * CEDE_LATENCY_TOKEN corresponds to the token value for Cede Latency + * Settings Information. + */ +#define CEDE_LATENCY_TOKEN 45 + +/* + * If the platform supports the cede latency settings information system + * parameter it must provide the following information in the NULL terminated + * parameter string: + * + * a. The first byte is the length “N” of each cede latency setting record minus + * one (zero indicates a length of 1 byte). + * + * b. For each supported cede latency setting a cede latency setting record + * consisting of the first “N” bytes as per the following table. + * + * ----------------------------- + * | Field | Field | + * | Name | Length | + * ----------------------------- + * | Cede Latency | 1 Byte | + * | Specifier Value | | + * ----------------------------- + * | Maximum wakeup | | + * | latency in | 8 Bytes | + * | tb-ticks | | + * ----------------------------- + * | Responsive to | | + * | external | 1 Byte | + * | interrupts | | + * ----------------------------- + * + * This version has cede latency record size = 10. + * + * The structure xcede_latency_payload represents a) and b) with + * xcede_latency_record representing the table in b). + * + * xcede_latency_parameter is what gets returned by + * ibm,get-systems-parameter RTAS call when made with + * CEDE_LATENCY_TOKEN. + * + * These structures are only used to represent the data obtained by the RTAS + * call. The data is in big-endian. + */ +struct xcede_latency_record { + u8 hint; + __be64 latency_ticks; + u8 wake_on_irqs; +} __packed; + +// Make space for 16 records, which "should be enough". +struct xcede_latency_payload { + u8 record_size; + struct xcede_latency_record records[16]; +} __packed; + +struct xcede_latency_parameter { + __be16 payload_size; + struct xcede_latency_payload payload; + u8 null_char; +} __packed; + +static unsigned int nr_xcede_records; +static struct xcede_latency_parameter xcede_latency_parameter __initdata; + +static int __init parse_cede_parameters(void) +{ + struct xcede_latency_payload *payload; + u32 total_xcede_records_size; + u8 xcede_record_size; + u16 payload_size; + int ret, i; + + ret = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, CEDE_LATENCY_TOKEN, __pa(&xcede_latency_parameter), + sizeof(xcede_latency_parameter)); + if (ret) { + pr_err("xcede: Error parsing CEDE_LATENCY_TOKEN\n"); + return ret; + } + + payload_size = be16_to_cpu(xcede_latency_parameter.payload_size); + payload = &xcede_latency_parameter.payload; + + xcede_record_size = payload->record_size + 1; + + if (xcede_record_size != sizeof(struct xcede_latency_record)) { + pr_err("xcede: Expected record-size %lu. Observed size %u.\n", + sizeof(struct xcede_latency_record), xcede_record_size); + return -EINVAL; + } + + pr_info("xcede: xcede_record_size = %d\n", xcede_record_size); + + /* + * Since the payload_size includes the last NULL byte and the + * xcede_record_size, the remaining bytes correspond to array of all + * cede_latency settings. + */ + total_xcede_records_size = payload_size - 2; + nr_xcede_records = total_xcede_records_size / xcede_record_size; + + for (i = 0; i < nr_xcede_records; i++) { + struct xcede_latency_record *record = &payload->records[i]; + u64 latency_ticks = be64_to_cpu(record->latency_ticks); + u8 wake_on_irqs = record->wake_on_irqs; + u8 hint = record->hint; + + pr_info("xcede: Record %d : hint = %u, latency = 0x%llx tb ticks, Wake-on-irq = %u\n", + i, hint, latency_ticks, wake_on_irqs); + } + + return 0; +} + +#define NR_DEDICATED_STATES 2 /* snooze, CEDE */ +static u8 cede_latency_hint[NR_DEDICATED_STATES]; + +static __cpuidle +int dedicated_cede_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, + int index) +{ + u8 old_latency_hint; + + pseries_idle_prolog(); + get_lppaca()->donate_dedicated_cpu = 1; + old_latency_hint = get_lppaca()->cede_latency_hint; + get_lppaca()->cede_latency_hint = cede_latency_hint[index]; + + HMT_medium(); + check_and_cede_processor(); + + raw_local_irq_disable(); + get_lppaca()->donate_dedicated_cpu = 0; + get_lppaca()->cede_latency_hint = old_latency_hint; + + pseries_idle_epilog(); + + return index; +} + +static __cpuidle +int shared_cede_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, + int index) +{ + + pseries_idle_prolog(); + + /* + * Yield the processor to the hypervisor. We return if + * an external interrupt occurs (which are driven prior + * to returning here) or if a prod occurs from another + * processor. When returning here, external interrupts + * are enabled. + */ + check_and_cede_processor(); + + raw_local_irq_disable(); + pseries_idle_epilog(); + + return index; +} + +/* + * States for dedicated partition case. + */ +static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = { + { /* Snooze */ + .name = "snooze", + .desc = "snooze", + .exit_latency = 0, + .target_residency = 0, + .enter = &snooze_loop, + .flags = CPUIDLE_FLAG_POLLING }, + { /* CEDE */ + .name = "CEDE", + .desc = "CEDE", + .exit_latency = 10, + .target_residency = 100, + .enter = &dedicated_cede_loop }, +}; + +/* + * States for shared partition case. + */ +static struct cpuidle_state shared_states[] = { + { /* Snooze */ + .name = "snooze", + .desc = "snooze", + .exit_latency = 0, + .target_residency = 0, + .enter = &snooze_loop, + .flags = CPUIDLE_FLAG_POLLING }, + { /* Shared Cede */ + .name = "Shared Cede", + .desc = "Shared Cede", + .exit_latency = 10, + .target_residency = 100, + .enter = &shared_cede_loop }, +}; + +static int pseries_cpuidle_cpu_online(unsigned int cpu) +{ + struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu); + + if (dev && cpuidle_get_driver()) { + cpuidle_pause_and_lock(); + cpuidle_enable_device(dev); + cpuidle_resume_and_unlock(); + } + return 0; +} + +static int pseries_cpuidle_cpu_dead(unsigned int cpu) +{ + struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu); + + if (dev && cpuidle_get_driver()) { + cpuidle_pause_and_lock(); + cpuidle_disable_device(dev); + cpuidle_resume_and_unlock(); + } + return 0; +} + +/* + * pseries_cpuidle_driver_init() + */ +static int pseries_cpuidle_driver_init(void) +{ + int idle_state; + struct cpuidle_driver *drv = &pseries_idle_driver; + + drv->state_count = 0; + + for (idle_state = 0; idle_state < max_idle_state; ++idle_state) { + /* Is the state not enabled? */ + if (cpuidle_state_table[idle_state].enter == NULL) + continue; + + drv->states[drv->state_count] = /* structure copy */ + cpuidle_state_table[idle_state]; + + drv->state_count += 1; + } + + return 0; +} + +static void __init fixup_cede0_latency(void) +{ + struct xcede_latency_payload *payload; + u64 min_xcede_latency_us = UINT_MAX; + int i; + + if (parse_cede_parameters()) + return; + + pr_info("cpuidle: Skipping the %d Extended CEDE idle states\n", + nr_xcede_records); + + payload = &xcede_latency_parameter.payload; + + /* + * The CEDE idle state maps to CEDE(0). While the hypervisor + * does not advertise CEDE(0) exit latency values, it does + * advertise the latency values of the extended CEDE states. + * We use the lowest advertised exit latency value as a proxy + * for the exit latency of CEDE(0). + */ + for (i = 0; i < nr_xcede_records; i++) { + struct xcede_latency_record *record = &payload->records[i]; + u8 hint = record->hint; + u64 latency_tb = be64_to_cpu(record->latency_ticks); + u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC); + + /* + * We expect the exit latency of an extended CEDE + * state to be non-zero, it to since it takes at least + * a few nanoseconds to wakeup the idle CPU and + * dispatch the virtual processor into the Linux + * Guest. + * + * So we consider only non-zero value for performing + * the fixup of CEDE(0) latency. + */ + if (latency_us == 0) { + pr_warn("cpuidle: Skipping xcede record %d [hint=%d]. Exit latency = 0us\n", + i, hint); + continue; + } + + if (latency_us < min_xcede_latency_us) + min_xcede_latency_us = latency_us; + } + + if (min_xcede_latency_us != UINT_MAX) { + dedicated_states[1].exit_latency = min_xcede_latency_us; + dedicated_states[1].target_residency = 10 * (min_xcede_latency_us); + pr_info("cpuidle: Fixed up CEDE exit latency to %llu us\n", + min_xcede_latency_us); + } + +} + +/* + * pseries_idle_probe() + * Choose state table for shared versus dedicated partition + */ +static int __init pseries_idle_probe(void) +{ + + if (cpuidle_disable != IDLE_NO_OVERRIDE) + return -ENODEV; + + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + if (lppaca_shared_proc()) { + cpuidle_state_table = shared_states; + max_idle_state = ARRAY_SIZE(shared_states); + } else { + /* + * Use firmware provided latency values + * starting with POWER10 platforms. In the + * case that we are running on a POWER10 + * platform but in an earlier compat mode, we + * can still use the firmware provided values. + * + * However, on platforms prior to POWER10, we + * cannot rely on the accuracy of the firmware + * provided latency values. On such platforms, + * go with the conservative default estimate + * of 10us. + */ + if (cpu_has_feature(CPU_FTR_ARCH_31) || pvr_version_is(PVR_POWER10)) + fixup_cede0_latency(); + cpuidle_state_table = dedicated_states; + max_idle_state = NR_DEDICATED_STATES; + } + } else + return -ENODEV; + + if (max_idle_state > 1) { + snooze_timeout_en = true; + snooze_timeout = cpuidle_state_table[1].target_residency * + tb_ticks_per_usec; + } + return 0; +} + +static int __init pseries_processor_idle_init(void) +{ + int retval; + + retval = pseries_idle_probe(); + if (retval) + return retval; + + pseries_cpuidle_driver_init(); + retval = cpuidle_register(&pseries_idle_driver, NULL); + if (retval) { + printk(KERN_DEBUG "Registration of pseries driver failed.\n"); + return retval; + } + + retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "cpuidle/pseries:online", + pseries_cpuidle_cpu_online, NULL); + WARN_ON(retval < 0); + retval = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_DEAD, + "cpuidle/pseries:DEAD", NULL, + pseries_cpuidle_cpu_dead); + WARN_ON(retval < 0); + printk(KERN_DEBUG "pseries_idle_driver registered\n"); + return 0; +} + +device_initcall(pseries_processor_idle_init); diff --git a/drivers/cpuidle/cpuidle-qcom-spm.c b/drivers/cpuidle/cpuidle-qcom-spm.c new file mode 100644 index 0000000000..1fc9968eae --- /dev/null +++ b/drivers/cpuidle/cpuidle-qcom-spm.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2011-2014, The Linux Foundation. All rights reserved. + * Copyright (c) 2014,2015, Linaro Ltd. + * + * SAW power controller driver + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/of.h> +#include <linux/of_platform.h> +#include <linux/err.h> +#include <linux/platform_device.h> +#include <linux/cpuidle.h> +#include <linux/cpu_pm.h> +#include <linux/firmware/qcom/qcom_scm.h> +#include <soc/qcom/spm.h> + +#include <asm/proc-fns.h> +#include <asm/suspend.h> + +#include "dt_idle_states.h" + +struct cpuidle_qcom_spm_data { + struct cpuidle_driver cpuidle_driver; + struct spm_driver_data *spm; +}; + +static int qcom_pm_collapse(unsigned long int unused) +{ + qcom_scm_cpu_power_down(QCOM_SCM_CPU_PWR_DOWN_L2_ON); + + /* + * Returns here only if there was a pending interrupt and we did not + * power down as a result. + */ + return -1; +} + +static int qcom_cpu_spc(struct spm_driver_data *drv) +{ + int ret; + + spm_set_low_power_mode(drv, PM_SLEEP_MODE_SPC); + ret = cpu_suspend(0, qcom_pm_collapse); + /* + * ARM common code executes WFI without calling into our driver and + * if the SPM mode is not reset, then we may accidently power down the + * cpu when we intended only to gate the cpu clock. + * Ensure the state is set to standby before returning. + */ + spm_set_low_power_mode(drv, PM_SLEEP_MODE_STBY); + + return ret; +} + +static __cpuidle int spm_enter_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx) +{ + struct cpuidle_qcom_spm_data *data = container_of(drv, struct cpuidle_qcom_spm_data, + cpuidle_driver); + + return CPU_PM_CPU_IDLE_ENTER_PARAM(qcom_cpu_spc, idx, data->spm); +} + +static struct cpuidle_driver qcom_spm_idle_driver = { + .name = "qcom_spm", + .owner = THIS_MODULE, + .states[0] = { + .enter = spm_enter_idle_state, + .exit_latency = 1, + .target_residency = 1, + .power_usage = UINT_MAX, + .name = "WFI", + .desc = "ARM WFI", + } +}; + +static const struct of_device_id qcom_idle_state_match[] = { + { .compatible = "qcom,idle-state-spc", .data = spm_enter_idle_state }, + { }, +}; + +static int spm_cpuidle_register(struct device *cpuidle_dev, int cpu) +{ + struct platform_device *pdev = NULL; + struct device_node *cpu_node, *saw_node; + struct cpuidle_qcom_spm_data *data = NULL; + int ret; + + cpu_node = of_cpu_device_node_get(cpu); + if (!cpu_node) + return -ENODEV; + + saw_node = of_parse_phandle(cpu_node, "qcom,saw", 0); + if (!saw_node) + return -ENODEV; + + pdev = of_find_device_by_node(saw_node); + of_node_put(saw_node); + of_node_put(cpu_node); + if (!pdev) + return -ENODEV; + + data = devm_kzalloc(cpuidle_dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->spm = dev_get_drvdata(&pdev->dev); + if (!data->spm) + return -EINVAL; + + data->cpuidle_driver = qcom_spm_idle_driver; + data->cpuidle_driver.cpumask = (struct cpumask *)cpumask_of(cpu); + + ret = dt_init_idle_driver(&data->cpuidle_driver, + qcom_idle_state_match, 1); + if (ret <= 0) + return ret ? : -ENODEV; + + return cpuidle_register(&data->cpuidle_driver, NULL); +} + +static int spm_cpuidle_drv_probe(struct platform_device *pdev) +{ + int cpu, ret; + + if (!qcom_scm_is_available()) + return -EPROBE_DEFER; + + ret = qcom_scm_set_warm_boot_addr(cpu_resume_arm); + if (ret) + return dev_err_probe(&pdev->dev, ret, "set warm boot addr failed"); + + for_each_possible_cpu(cpu) { + ret = spm_cpuidle_register(&pdev->dev, cpu); + if (ret && ret != -ENODEV) { + dev_err(&pdev->dev, + "Cannot register for CPU%d: %d\n", cpu, ret); + } + } + + return 0; +} + +static struct platform_driver spm_cpuidle_driver = { + .probe = spm_cpuidle_drv_probe, + .driver = { + .name = "qcom-spm-cpuidle", + .suppress_bind_attrs = true, + }, +}; + +static bool __init qcom_spm_find_any_cpu(void) +{ + struct device_node *cpu_node, *saw_node; + + for_each_of_cpu_node(cpu_node) { + saw_node = of_parse_phandle(cpu_node, "qcom,saw", 0); + if (of_device_is_available(saw_node)) { + of_node_put(saw_node); + of_node_put(cpu_node); + return true; + } + of_node_put(saw_node); + } + return false; +} + +static int __init qcom_spm_cpuidle_init(void) +{ + struct platform_device *pdev; + int ret; + + ret = platform_driver_register(&spm_cpuidle_driver); + if (ret) + return ret; + + /* Make sure there is actually any CPU managed by the SPM */ + if (!qcom_spm_find_any_cpu()) + return 0; + + pdev = platform_device_register_simple("qcom-spm-cpuidle", + -1, NULL, 0); + if (IS_ERR(pdev)) { + platform_driver_unregister(&spm_cpuidle_driver); + return PTR_ERR(pdev); + } + + return 0; +} +device_initcall(qcom_spm_cpuidle_init); diff --git a/drivers/cpuidle/cpuidle-riscv-sbi.c b/drivers/cpuidle/cpuidle-riscv-sbi.c new file mode 100644 index 0000000000..e8094fc924 --- /dev/null +++ b/drivers/cpuidle/cpuidle-riscv-sbi.c @@ -0,0 +1,634 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * RISC-V SBI CPU idle driver. + * + * Copyright (c) 2021 Western Digital Corporation or its affiliates. + * Copyright (c) 2022 Ventana Micro Systems Inc. + */ + +#define pr_fmt(fmt) "cpuidle-riscv-sbi: " fmt + +#include <linux/cpuhotplug.h> +#include <linux/cpuidle.h> +#include <linux/cpumask.h> +#include <linux/cpu_pm.h> +#include <linux/cpu_cooling.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/slab.h> +#include <linux/platform_device.h> +#include <linux/pm_domain.h> +#include <linux/pm_runtime.h> +#include <asm/cpuidle.h> +#include <asm/sbi.h> +#include <asm/smp.h> +#include <asm/suspend.h> + +#include "dt_idle_states.h" +#include "dt_idle_genpd.h" + +struct sbi_cpuidle_data { + u32 *states; + struct device *dev; +}; + +struct sbi_domain_state { + bool available; + u32 state; +}; + +static DEFINE_PER_CPU_READ_MOSTLY(struct sbi_cpuidle_data, sbi_cpuidle_data); +static DEFINE_PER_CPU(struct sbi_domain_state, domain_state); +static bool sbi_cpuidle_use_osi; +static bool sbi_cpuidle_use_cpuhp; +static bool sbi_cpuidle_pd_allow_domain_state; + +static inline void sbi_set_domain_state(u32 state) +{ + struct sbi_domain_state *data = this_cpu_ptr(&domain_state); + + data->available = true; + data->state = state; +} + +static inline u32 sbi_get_domain_state(void) +{ + struct sbi_domain_state *data = this_cpu_ptr(&domain_state); + + return data->state; +} + +static inline void sbi_clear_domain_state(void) +{ + struct sbi_domain_state *data = this_cpu_ptr(&domain_state); + + data->available = false; +} + +static inline bool sbi_is_domain_state_available(void) +{ + struct sbi_domain_state *data = this_cpu_ptr(&domain_state); + + return data->available; +} + +static int sbi_suspend_finisher(unsigned long suspend_type, + unsigned long resume_addr, + unsigned long opaque) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_SUSPEND, + suspend_type, resume_addr, opaque, 0, 0, 0); + + return (ret.error) ? sbi_err_map_linux_errno(ret.error) : 0; +} + +static int sbi_suspend(u32 state) +{ + if (state & SBI_HSM_SUSP_NON_RET_BIT) + return cpu_suspend(state, sbi_suspend_finisher); + else + return sbi_suspend_finisher(state, 0, 0); +} + +static __cpuidle int sbi_cpuidle_enter_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx) +{ + u32 *states = __this_cpu_read(sbi_cpuidle_data.states); + u32 state = states[idx]; + + if (state & SBI_HSM_SUSP_NON_RET_BIT) + return CPU_PM_CPU_IDLE_ENTER_PARAM(sbi_suspend, idx, state); + else + return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(sbi_suspend, + idx, state); +} + +static __cpuidle int __sbi_enter_domain_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx, + bool s2idle) +{ + struct sbi_cpuidle_data *data = this_cpu_ptr(&sbi_cpuidle_data); + u32 *states = data->states; + struct device *pd_dev = data->dev; + u32 state; + int ret; + + ret = cpu_pm_enter(); + if (ret) + return -1; + + /* Do runtime PM to manage a hierarchical CPU toplogy. */ + if (s2idle) + dev_pm_genpd_suspend(pd_dev); + else + pm_runtime_put_sync_suspend(pd_dev); + + ct_cpuidle_enter(); + + if (sbi_is_domain_state_available()) + state = sbi_get_domain_state(); + else + state = states[idx]; + + ret = sbi_suspend(state) ? -1 : idx; + + ct_cpuidle_exit(); + + if (s2idle) + dev_pm_genpd_resume(pd_dev); + else + pm_runtime_get_sync(pd_dev); + + cpu_pm_exit(); + + /* Clear the domain state to start fresh when back from idle. */ + sbi_clear_domain_state(); + return ret; +} + +static int sbi_enter_domain_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int idx) +{ + return __sbi_enter_domain_idle_state(dev, drv, idx, false); +} + +static int sbi_enter_s2idle_domain_idle_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int idx) +{ + return __sbi_enter_domain_idle_state(dev, drv, idx, true); +} + +static int sbi_cpuidle_cpuhp_up(unsigned int cpu) +{ + struct device *pd_dev = __this_cpu_read(sbi_cpuidle_data.dev); + + if (pd_dev) + pm_runtime_get_sync(pd_dev); + + return 0; +} + +static int sbi_cpuidle_cpuhp_down(unsigned int cpu) +{ + struct device *pd_dev = __this_cpu_read(sbi_cpuidle_data.dev); + + if (pd_dev) { + pm_runtime_put_sync(pd_dev); + /* Clear domain state to start fresh at next online. */ + sbi_clear_domain_state(); + } + + return 0; +} + +static void sbi_idle_init_cpuhp(void) +{ + int err; + + if (!sbi_cpuidle_use_cpuhp) + return; + + err = cpuhp_setup_state_nocalls(CPUHP_AP_CPU_PM_STARTING, + "cpuidle/sbi:online", + sbi_cpuidle_cpuhp_up, + sbi_cpuidle_cpuhp_down); + if (err) + pr_warn("Failed %d while setup cpuhp state\n", err); +} + +static const struct of_device_id sbi_cpuidle_state_match[] = { + { .compatible = "riscv,idle-state", + .data = sbi_cpuidle_enter_state }, + { }, +}; + +static bool sbi_suspend_state_is_valid(u32 state) +{ + if (state > SBI_HSM_SUSPEND_RET_DEFAULT && + state < SBI_HSM_SUSPEND_RET_PLATFORM) + return false; + if (state > SBI_HSM_SUSPEND_NON_RET_DEFAULT && + state < SBI_HSM_SUSPEND_NON_RET_PLATFORM) + return false; + return true; +} + +static int sbi_dt_parse_state_node(struct device_node *np, u32 *state) +{ + int err = of_property_read_u32(np, "riscv,sbi-suspend-param", state); + + if (err) { + pr_warn("%pOF missing riscv,sbi-suspend-param property\n", np); + return err; + } + + if (!sbi_suspend_state_is_valid(*state)) { + pr_warn("Invalid SBI suspend state %#x\n", *state); + return -EINVAL; + } + + return 0; +} + +static int sbi_dt_cpu_init_topology(struct cpuidle_driver *drv, + struct sbi_cpuidle_data *data, + unsigned int state_count, int cpu) +{ + /* Currently limit the hierarchical topology to be used in OSI mode. */ + if (!sbi_cpuidle_use_osi) + return 0; + + data->dev = dt_idle_attach_cpu(cpu, "sbi"); + if (IS_ERR_OR_NULL(data->dev)) + return PTR_ERR_OR_ZERO(data->dev); + + /* + * Using the deepest state for the CPU to trigger a potential selection + * of a shared state for the domain, assumes the domain states are all + * deeper states. + */ + drv->states[state_count - 1].flags |= CPUIDLE_FLAG_RCU_IDLE; + drv->states[state_count - 1].enter = sbi_enter_domain_idle_state; + drv->states[state_count - 1].enter_s2idle = + sbi_enter_s2idle_domain_idle_state; + sbi_cpuidle_use_cpuhp = true; + + return 0; +} + +static int sbi_cpuidle_dt_init_states(struct device *dev, + struct cpuidle_driver *drv, + unsigned int cpu, + unsigned int state_count) +{ + struct sbi_cpuidle_data *data = per_cpu_ptr(&sbi_cpuidle_data, cpu); + struct device_node *state_node; + struct device_node *cpu_node; + u32 *states; + int i, ret; + + cpu_node = of_cpu_device_node_get(cpu); + if (!cpu_node) + return -ENODEV; + + states = devm_kcalloc(dev, state_count, sizeof(*states), GFP_KERNEL); + if (!states) { + ret = -ENOMEM; + goto fail; + } + + /* Parse SBI specific details from state DT nodes */ + for (i = 1; i < state_count; i++) { + state_node = of_get_cpu_state_node(cpu_node, i - 1); + if (!state_node) + break; + + ret = sbi_dt_parse_state_node(state_node, &states[i]); + of_node_put(state_node); + + if (ret) + return ret; + + pr_debug("sbi-state %#x index %d\n", states[i], i); + } + if (i != state_count) { + ret = -ENODEV; + goto fail; + } + + /* Initialize optional data, used for the hierarchical topology. */ + ret = sbi_dt_cpu_init_topology(drv, data, state_count, cpu); + if (ret < 0) + return ret; + + /* Store states in the per-cpu struct. */ + data->states = states; + +fail: + of_node_put(cpu_node); + + return ret; +} + +static void sbi_cpuidle_deinit_cpu(int cpu) +{ + struct sbi_cpuidle_data *data = per_cpu_ptr(&sbi_cpuidle_data, cpu); + + dt_idle_detach_cpu(data->dev); + sbi_cpuidle_use_cpuhp = false; +} + +static int sbi_cpuidle_init_cpu(struct device *dev, int cpu) +{ + struct cpuidle_driver *drv; + unsigned int state_count = 0; + int ret = 0; + + drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL); + if (!drv) + return -ENOMEM; + + drv->name = "sbi_cpuidle"; + drv->owner = THIS_MODULE; + drv->cpumask = (struct cpumask *)cpumask_of(cpu); + + /* RISC-V architectural WFI to be represented as state index 0. */ + drv->states[0].enter = sbi_cpuidle_enter_state; + drv->states[0].exit_latency = 1; + drv->states[0].target_residency = 1; + drv->states[0].power_usage = UINT_MAX; + strcpy(drv->states[0].name, "WFI"); + strcpy(drv->states[0].desc, "RISC-V WFI"); + + /* + * If no DT idle states are detected (ret == 0) let the driver + * initialization fail accordingly since there is no reason to + * initialize the idle driver if only wfi is supported, the + * default archictectural back-end already executes wfi + * on idle entry. + */ + ret = dt_init_idle_driver(drv, sbi_cpuidle_state_match, 1); + if (ret <= 0) { + pr_debug("HART%ld: failed to parse DT idle states\n", + cpuid_to_hartid_map(cpu)); + return ret ? : -ENODEV; + } + state_count = ret + 1; /* Include WFI state as well */ + + /* Initialize idle states from DT. */ + ret = sbi_cpuidle_dt_init_states(dev, drv, cpu, state_count); + if (ret) { + pr_err("HART%ld: failed to init idle states\n", + cpuid_to_hartid_map(cpu)); + return ret; + } + + ret = cpuidle_register(drv, NULL); + if (ret) + goto deinit; + + cpuidle_cooling_register(drv); + + return 0; +deinit: + sbi_cpuidle_deinit_cpu(cpu); + return ret; +} + +static void sbi_cpuidle_domain_sync_state(struct device *dev) +{ + /* + * All devices have now been attached/probed to the PM domain + * topology, hence it's fine to allow domain states to be picked. + */ + sbi_cpuidle_pd_allow_domain_state = true; +} + +#ifdef CONFIG_DT_IDLE_GENPD + +static int sbi_cpuidle_pd_power_off(struct generic_pm_domain *pd) +{ + struct genpd_power_state *state = &pd->states[pd->state_idx]; + u32 *pd_state; + + if (!state->data) + return 0; + + if (!sbi_cpuidle_pd_allow_domain_state) + return -EBUSY; + + /* OSI mode is enabled, set the corresponding domain state. */ + pd_state = state->data; + sbi_set_domain_state(*pd_state); + + return 0; +} + +struct sbi_pd_provider { + struct list_head link; + struct device_node *node; +}; + +static LIST_HEAD(sbi_pd_providers); + +static int sbi_pd_init(struct device_node *np) +{ + struct generic_pm_domain *pd; + struct sbi_pd_provider *pd_provider; + struct dev_power_governor *pd_gov; + int ret = -ENOMEM; + + pd = dt_idle_pd_alloc(np, sbi_dt_parse_state_node); + if (!pd) + goto out; + + pd_provider = kzalloc(sizeof(*pd_provider), GFP_KERNEL); + if (!pd_provider) + goto free_pd; + + pd->flags |= GENPD_FLAG_IRQ_SAFE | GENPD_FLAG_CPU_DOMAIN; + + /* Allow power off when OSI is available. */ + if (sbi_cpuidle_use_osi) + pd->power_off = sbi_cpuidle_pd_power_off; + else + pd->flags |= GENPD_FLAG_ALWAYS_ON; + + /* Use governor for CPU PM domains if it has some states to manage. */ + pd_gov = pd->states ? &pm_domain_cpu_gov : NULL; + + ret = pm_genpd_init(pd, pd_gov, false); + if (ret) + goto free_pd_prov; + + ret = of_genpd_add_provider_simple(np, pd); + if (ret) + goto remove_pd; + + pd_provider->node = of_node_get(np); + list_add(&pd_provider->link, &sbi_pd_providers); + + pr_debug("init PM domain %s\n", pd->name); + return 0; + +remove_pd: + pm_genpd_remove(pd); +free_pd_prov: + kfree(pd_provider); +free_pd: + dt_idle_pd_free(pd); +out: + pr_err("failed to init PM domain ret=%d %pOF\n", ret, np); + return ret; +} + +static void sbi_pd_remove(void) +{ + struct sbi_pd_provider *pd_provider, *it; + struct generic_pm_domain *genpd; + + list_for_each_entry_safe(pd_provider, it, &sbi_pd_providers, link) { + of_genpd_del_provider(pd_provider->node); + + genpd = of_genpd_remove_last(pd_provider->node); + if (!IS_ERR(genpd)) + kfree(genpd); + + of_node_put(pd_provider->node); + list_del(&pd_provider->link); + kfree(pd_provider); + } +} + +static int sbi_genpd_probe(struct device_node *np) +{ + struct device_node *node; + int ret = 0, pd_count = 0; + + if (!np) + return -ENODEV; + + /* + * Parse child nodes for the "#power-domain-cells" property and + * initialize a genpd/genpd-of-provider pair when it's found. + */ + for_each_child_of_node(np, node) { + if (!of_property_present(node, "#power-domain-cells")) + continue; + + ret = sbi_pd_init(node); + if (ret) + goto put_node; + + pd_count++; + } + + /* Bail out if not using the hierarchical CPU topology. */ + if (!pd_count) + goto no_pd; + + /* Link genpd masters/subdomains to model the CPU topology. */ + ret = dt_idle_pd_init_topology(np); + if (ret) + goto remove_pd; + + return 0; + +put_node: + of_node_put(node); +remove_pd: + sbi_pd_remove(); + pr_err("failed to create CPU PM domains ret=%d\n", ret); +no_pd: + return ret; +} + +#else + +static inline int sbi_genpd_probe(struct device_node *np) +{ + return 0; +} + +#endif + +static int sbi_cpuidle_probe(struct platform_device *pdev) +{ + int cpu, ret; + struct cpuidle_driver *drv; + struct cpuidle_device *dev; + struct device_node *np, *pds_node; + + /* Detect OSI support based on CPU DT nodes */ + sbi_cpuidle_use_osi = true; + for_each_possible_cpu(cpu) { + np = of_cpu_device_node_get(cpu); + if (np && + of_property_present(np, "power-domains") && + of_property_present(np, "power-domain-names")) { + continue; + } else { + sbi_cpuidle_use_osi = false; + break; + } + } + + /* Populate generic power domains from DT nodes */ + pds_node = of_find_node_by_path("/cpus/power-domains"); + if (pds_node) { + ret = sbi_genpd_probe(pds_node); + of_node_put(pds_node); + if (ret) + return ret; + } + + /* Initialize CPU idle driver for each CPU */ + for_each_possible_cpu(cpu) { + ret = sbi_cpuidle_init_cpu(&pdev->dev, cpu); + if (ret) { + pr_debug("HART%ld: idle driver init failed\n", + cpuid_to_hartid_map(cpu)); + goto out_fail; + } + } + + /* Setup CPU hotplut notifiers */ + sbi_idle_init_cpuhp(); + + pr_info("idle driver registered for all CPUs\n"); + + return 0; + +out_fail: + while (--cpu >= 0) { + dev = per_cpu(cpuidle_devices, cpu); + drv = cpuidle_get_cpu_driver(dev); + cpuidle_unregister(drv); + sbi_cpuidle_deinit_cpu(cpu); + } + + return ret; +} + +static struct platform_driver sbi_cpuidle_driver = { + .probe = sbi_cpuidle_probe, + .driver = { + .name = "sbi-cpuidle", + .sync_state = sbi_cpuidle_domain_sync_state, + }, +}; + +static int __init sbi_cpuidle_init(void) +{ + int ret; + struct platform_device *pdev; + + /* + * The SBI HSM suspend function is only available when: + * 1) SBI version is 0.3 or higher + * 2) SBI HSM extension is available + */ + if ((sbi_spec_version < sbi_mk_version(0, 3)) || + !sbi_probe_extension(SBI_EXT_HSM)) { + pr_info("HSM suspend not available\n"); + return 0; + } + + ret = platform_driver_register(&sbi_cpuidle_driver); + if (ret) + return ret; + + pdev = platform_device_register_simple("sbi-cpuidle", + -1, NULL, 0); + if (IS_ERR(pdev)) { + platform_driver_unregister(&sbi_cpuidle_driver); + return PTR_ERR(pdev); + } + + return 0; +} +device_initcall(sbi_cpuidle_init); diff --git a/drivers/cpuidle/cpuidle-tegra.c b/drivers/cpuidle/cpuidle-tegra.c new file mode 100644 index 0000000000..b203a93dea --- /dev/null +++ b/drivers/cpuidle/cpuidle-tegra.c @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * CPU idle driver for Tegra CPUs + * + * Copyright (c) 2010-2013, NVIDIA Corporation. + * Copyright (c) 2011 Google, Inc. + * Author: Colin Cross <ccross@android.com> + * Gary King <gking@nvidia.com> + * + * Rework for 3.3 by Peter De Schrijver <pdeschrijver@nvidia.com> + * + * Tegra20/124 driver unification by Dmitry Osipenko <digetx@gmail.com> + */ + +#define pr_fmt(fmt) "tegra-cpuidle: " fmt + +#include <linux/atomic.h> +#include <linux/cpuidle.h> +#include <linux/cpumask.h> +#include <linux/cpu_pm.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/platform_device.h> +#include <linux/types.h> + +#include <linux/clk/tegra.h> +#include <linux/firmware/trusted_foundations.h> + +#include <soc/tegra/cpuidle.h> +#include <soc/tegra/flowctrl.h> +#include <soc/tegra/fuse.h> +#include <soc/tegra/irq.h> +#include <soc/tegra/pm.h> +#include <soc/tegra/pmc.h> + +#include <asm/cpuidle.h> +#include <asm/firmware.h> +#include <asm/smp_plat.h> +#include <asm/suspend.h> + +enum tegra_state { + TEGRA_C1, + TEGRA_C7, + TEGRA_CC6, + TEGRA_STATE_COUNT, +}; + +static atomic_t tegra_idle_barrier; +static atomic_t tegra_abort_flag; + +static void tegra_cpuidle_report_cpus_state(void) +{ + unsigned long cpu, lcpu, csr; + + for_each_cpu(lcpu, cpu_possible_mask) { + cpu = cpu_logical_map(lcpu); + csr = flowctrl_read_cpu_csr(cpu); + + pr_err("cpu%lu: online=%d flowctrl_csr=0x%08lx\n", + cpu, cpu_online(lcpu), csr); + } +} + +static int tegra_cpuidle_wait_for_secondary_cpus_parking(void) +{ + unsigned int retries = 3; + + while (retries--) { + unsigned int delay_us = 10; + unsigned int timeout_us = 500 * 1000 / delay_us; + + /* + * The primary CPU0 core shall wait for the secondaries + * shutdown in order to power-off CPU's cluster safely. + * The timeout value depends on the current CPU frequency, + * it takes about 40-150us in average and over 1000us in + * a worst case scenario. + */ + do { + if (tegra_cpu_rail_off_ready()) + return 0; + + udelay(delay_us); + + } while (timeout_us--); + + pr_err("secondary CPU taking too long to park\n"); + + tegra_cpuidle_report_cpus_state(); + } + + pr_err("timed out waiting secondaries to park\n"); + + return -ETIMEDOUT; +} + +static void tegra_cpuidle_unpark_secondary_cpus(void) +{ + unsigned int cpu, lcpu; + + for_each_cpu(lcpu, cpu_online_mask) { + cpu = cpu_logical_map(lcpu); + + if (cpu > 0) { + tegra_enable_cpu_clock(cpu); + tegra_cpu_out_of_reset(cpu); + flowctrl_write_cpu_halt(cpu, 0); + } + } +} + +static int tegra_cpuidle_cc6_enter(unsigned int cpu) +{ + int ret; + + if (cpu > 0) { + ret = cpu_suspend(cpu, tegra_pm_park_secondary_cpu); + } else { + ret = tegra_cpuidle_wait_for_secondary_cpus_parking(); + if (!ret) + ret = tegra_pm_enter_lp2(); + + tegra_cpuidle_unpark_secondary_cpus(); + } + + return ret; +} + +static int tegra_cpuidle_c7_enter(void) +{ + int err; + + err = call_firmware_op(prepare_idle, TF_PM_MODE_LP2_NOFLUSH_L2); + if (err && err != -ENOSYS) + return err; + + return cpu_suspend(0, tegra30_pm_secondary_cpu_suspend); +} + +static int tegra_cpuidle_coupled_barrier(struct cpuidle_device *dev) +{ + if (tegra_pending_sgi()) { + /* + * CPU got local interrupt that will be lost after GIC's + * shutdown because GIC driver doesn't save/restore the + * pending SGI state across CPU cluster PM. Abort and retry + * next time. + */ + atomic_set(&tegra_abort_flag, 1); + } + + cpuidle_coupled_parallel_barrier(dev, &tegra_idle_barrier); + + if (atomic_read(&tegra_abort_flag)) { + cpuidle_coupled_parallel_barrier(dev, &tegra_idle_barrier); + atomic_set(&tegra_abort_flag, 0); + return -EINTR; + } + + return 0; +} + +static __cpuidle int tegra_cpuidle_state_enter(struct cpuidle_device *dev, + int index, unsigned int cpu) +{ + int err; + + /* + * CC6 state is the "CPU cluster power-off" state. In order to + * enter this state, at first the secondary CPU cores need to be + * parked into offline mode, then the last CPU should clean out + * remaining dirty cache lines into DRAM and trigger Flow Controller + * logic that turns off the cluster's power domain (which includes + * CPU cores, GIC and L2 cache). + */ + if (index == TEGRA_CC6) { + err = tegra_cpuidle_coupled_barrier(dev); + if (err) + return err; + } + + local_fiq_disable(); + tegra_pm_set_cpu_in_lp2(); + cpu_pm_enter(); + + ct_cpuidle_enter(); + + switch (index) { + case TEGRA_C7: + err = tegra_cpuidle_c7_enter(); + break; + + case TEGRA_CC6: + err = tegra_cpuidle_cc6_enter(cpu); + break; + + default: + err = -EINVAL; + break; + } + + ct_cpuidle_exit(); + + cpu_pm_exit(); + tegra_pm_clear_cpu_in_lp2(); + local_fiq_enable(); + + return err ?: index; +} + +static int tegra_cpuidle_adjust_state_index(int index, unsigned int cpu) +{ + /* + * On Tegra30 CPU0 can't be power-gated separately from secondary + * cores because it gates the whole CPU cluster. + */ + if (cpu > 0 || index != TEGRA_C7 || tegra_get_chip_id() != TEGRA30) + return index; + + /* put CPU0 into C1 if C7 is requested and secondaries are online */ + if (!IS_ENABLED(CONFIG_PM_SLEEP) || num_online_cpus() > 1) + index = TEGRA_C1; + else + index = TEGRA_CC6; + + return index; +} + +static __cpuidle int tegra_cpuidle_enter(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + bool do_rcu = drv->states[index].flags & CPUIDLE_FLAG_RCU_IDLE; + unsigned int cpu = cpu_logical_map(dev->cpu); + int ret; + + index = tegra_cpuidle_adjust_state_index(index, cpu); + if (dev->states_usage[index].disable) + return -1; + + if (index == TEGRA_C1) { + if (do_rcu) + ct_cpuidle_enter(); + ret = arm_cpuidle_simple_enter(dev, drv, index); + if (do_rcu) + ct_cpuidle_exit(); + } else + ret = tegra_cpuidle_state_enter(dev, index, cpu); + + if (ret < 0) { + if (ret != -EINTR || index != TEGRA_CC6) + pr_err_once("failed to enter state %d err: %d\n", + index, ret); + index = -1; + } else { + index = ret; + } + + return index; +} + +static int tegra114_enter_s2idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + tegra_cpuidle_enter(dev, drv, index); + + return 0; +} + +/* + * The previous versions of Tegra CPUIDLE driver used a different "legacy" + * terminology for naming of the idling states, while this driver uses the + * new terminology. + * + * Mapping of the old terms into the new ones: + * + * Old | New + * --------- + * LP3 | C1 (CPU core clock gating) + * LP2 | C7 (CPU core power gating) + * LP2 | CC6 (CPU cluster power gating) + * + * Note that that the older CPUIDLE driver versions didn't explicitly + * differentiate the LP2 states because these states either used the same + * code path or because CC6 wasn't supported. + */ +static struct cpuidle_driver tegra_idle_driver = { + .name = "tegra_idle", + .states = { + [TEGRA_C1] = ARM_CPUIDLE_WFI_STATE_PWR(600), + [TEGRA_C7] = { + .enter = tegra_cpuidle_enter, + .exit_latency = 2000, + .target_residency = 2200, + .power_usage = 100, + .flags = CPUIDLE_FLAG_TIMER_STOP | + CPUIDLE_FLAG_RCU_IDLE, + .name = "C7", + .desc = "CPU core powered off", + }, + [TEGRA_CC6] = { + .enter = tegra_cpuidle_enter, + .exit_latency = 5000, + .target_residency = 10000, + .power_usage = 0, + .flags = CPUIDLE_FLAG_TIMER_STOP | + CPUIDLE_FLAG_RCU_IDLE | + CPUIDLE_FLAG_COUPLED, + .name = "CC6", + .desc = "CPU cluster powered off", + }, + }, + .state_count = TEGRA_STATE_COUNT, + .safe_state_index = TEGRA_C1, +}; + +static inline void tegra_cpuidle_disable_state(enum tegra_state state) +{ + cpuidle_driver_state_disabled(&tegra_idle_driver, state, true); +} + +/* + * Tegra20 HW appears to have a bug such that PCIe device interrupts, whether + * they are legacy IRQs or MSI, are lost when CC6 is enabled. To work around + * this, simply disable CC6 if the PCI driver and DT node are both enabled. + */ +void tegra_cpuidle_pcie_irqs_in_use(void) +{ + struct cpuidle_state *state_cc6 = &tegra_idle_driver.states[TEGRA_CC6]; + + if ((state_cc6->flags & CPUIDLE_FLAG_UNUSABLE) || + tegra_get_chip_id() != TEGRA20) + return; + + pr_info("disabling CC6 state, since PCIe IRQs are in use\n"); + tegra_cpuidle_disable_state(TEGRA_CC6); +} + +static void tegra_cpuidle_setup_tegra114_c7_state(void) +{ + struct cpuidle_state *s = &tegra_idle_driver.states[TEGRA_C7]; + + s->enter_s2idle = tegra114_enter_s2idle; + s->target_residency = 1000; + s->exit_latency = 500; +} + +static int tegra_cpuidle_probe(struct platform_device *pdev) +{ + if (tegra_pmc_get_suspend_mode() == TEGRA_SUSPEND_NOT_READY) + return -EPROBE_DEFER; + + /* LP2 could be disabled in device-tree */ + if (tegra_pmc_get_suspend_mode() < TEGRA_SUSPEND_LP2) + tegra_cpuidle_disable_state(TEGRA_CC6); + + /* + * Required suspend-resume functionality, which is provided by the + * Tegra-arch core and PMC driver, is unavailable if PM-sleep option + * is disabled. + */ + if (!IS_ENABLED(CONFIG_PM_SLEEP)) { + tegra_cpuidle_disable_state(TEGRA_C7); + tegra_cpuidle_disable_state(TEGRA_CC6); + } + + /* + * Generic WFI state (also known as C1 or LP3) and the coupled CPU + * cluster power-off (CC6 or LP2) states are common for all Tegra SoCs. + */ + switch (tegra_get_chip_id()) { + case TEGRA20: + /* Tegra20 isn't capable to power-off individual CPU cores */ + tegra_cpuidle_disable_state(TEGRA_C7); + break; + + case TEGRA30: + break; + + case TEGRA114: + case TEGRA124: + tegra_cpuidle_setup_tegra114_c7_state(); + + /* coupled CC6 (LP2) state isn't implemented yet */ + tegra_cpuidle_disable_state(TEGRA_CC6); + break; + + default: + return -EINVAL; + } + + return cpuidle_register(&tegra_idle_driver, cpu_possible_mask); +} + +static struct platform_driver tegra_cpuidle_driver = { + .probe = tegra_cpuidle_probe, + .driver = { + .name = "tegra-cpuidle", + }, +}; +builtin_platform_driver(tegra_cpuidle_driver); diff --git a/drivers/cpuidle/cpuidle-ux500.c b/drivers/cpuidle/cpuidle-ux500.c new file mode 100644 index 0000000000..f7d778580e --- /dev/null +++ b/drivers/cpuidle/cpuidle-ux500.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2012 Linaro : Daniel Lezcano <daniel.lezcano@linaro.org> (IBM) + * + * Based on the work of Rickard Andersson <rickard.andersson@stericsson.com> + * and Jonas Aaberg <jonas.aberg@stericsson.com>. + */ + +#include <linux/init.h> +#include <linux/cpuidle.h> +#include <linux/spinlock.h> +#include <linux/atomic.h> +#include <linux/smp.h> +#include <linux/mfd/dbx500-prcmu.h> +#include <linux/platform_data/arm-ux500-pm.h> +#include <linux/platform_device.h> + +#include <asm/cpuidle.h> + +static atomic_t master = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(master_lock); + +static inline int ux500_enter_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + int this_cpu = smp_processor_id(); + bool recouple = false; + + if (atomic_inc_return(&master) == num_online_cpus()) { + + /* With this lock, we prevent the other cpu to exit and enter + * this function again and become the master */ + if (!spin_trylock(&master_lock)) + goto wfi; + + /* decouple the gic from the A9 cores */ + if (prcmu_gic_decouple()) { + spin_unlock(&master_lock); + goto out; + } + + /* If an error occur, we will have to recouple the gic + * manually */ + recouple = true; + + /* At this state, as the gic is decoupled, if the other + * cpu is in WFI, we have the guarantee it won't be wake + * up, so we can safely go to retention */ + if (!prcmu_is_cpu_in_wfi(this_cpu ? 0 : 1)) + goto out; + + /* The prcmu will be in charge of watching the interrupts + * and wake up the cpus */ + if (prcmu_copy_gic_settings()) + goto out; + + /* Check in the meantime an interrupt did + * not occur on the gic ... */ + if (prcmu_gic_pending_irq()) + goto out; + + /* ... and the prcmu */ + if (prcmu_pending_irq()) + goto out; + + /* Go to the retention state, the prcmu will wait for the + * cpu to go WFI and this is what happens after exiting this + * 'master' critical section */ + if (prcmu_set_power_state(PRCMU_AP_IDLE, true, true)) + goto out; + + /* When we switch to retention, the prcmu is in charge + * of recoupling the gic automatically */ + recouple = false; + + spin_unlock(&master_lock); + } +wfi: + cpu_do_idle(); +out: + atomic_dec(&master); + + if (recouple) { + prcmu_gic_recouple(); + spin_unlock(&master_lock); + } + + return index; +} + +static struct cpuidle_driver ux500_idle_driver = { + .name = "ux500_idle", + .owner = THIS_MODULE, + .states = { + ARM_CPUIDLE_WFI_STATE, + { + .enter = ux500_enter_idle, + .exit_latency = 70, + .target_residency = 260, + .flags = CPUIDLE_FLAG_TIMER_STOP, + .name = "ApIdle", + .desc = "ARM Retention", + }, + }, + .safe_state_index = 0, + .state_count = 2, +}; + +static int dbx500_cpuidle_probe(struct platform_device *pdev) +{ + /* Configure wake up reasons */ + prcmu_enable_wakeups(PRCMU_WAKEUP(ARM) | PRCMU_WAKEUP(RTC) | + PRCMU_WAKEUP(ABB)); + + return cpuidle_register(&ux500_idle_driver, NULL); +} + +static struct platform_driver dbx500_cpuidle_plat_driver = { + .driver = { + .name = "db8500-cpuidle", + }, + .probe = dbx500_cpuidle_probe, +}; +builtin_platform_driver(dbx500_cpuidle_plat_driver); diff --git a/drivers/cpuidle/cpuidle-zynq.c b/drivers/cpuidle/cpuidle-zynq.c new file mode 100644 index 0000000000..a79610e723 --- /dev/null +++ b/drivers/cpuidle/cpuidle-zynq.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012-2013 Xilinx + * + * CPU idle support for Xilinx Zynq + * + * based on arch/arm/mach-at91/cpuidle.c + * + * The cpu idle uses wait-for-interrupt and RAM self refresh in order + * to implement two idle states - + * #1 wait-for-interrupt + * #2 wait-for-interrupt and RAM self refresh + * + * Maintainer: Michal Simek <michal.simek@xilinx.com> + */ + +#include <linux/init.h> +#include <linux/cpuidle.h> +#include <linux/platform_device.h> +#include <asm/cpuidle.h> + +#define ZYNQ_MAX_STATES 2 + +/* Actual code that puts the SoC in different idle states */ +static int zynq_enter_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + /* Add code for DDR self refresh start */ + cpu_do_idle(); + + return index; +} + +static struct cpuidle_driver zynq_idle_driver = { + .name = "zynq_idle", + .owner = THIS_MODULE, + .states = { + ARM_CPUIDLE_WFI_STATE, + { + .enter = zynq_enter_idle, + .exit_latency = 10, + .target_residency = 10000, + .name = "RAM_SR", + .desc = "WFI and RAM Self Refresh", + }, + }, + .safe_state_index = 0, + .state_count = ZYNQ_MAX_STATES, +}; + +/* Initialize CPU idle by registering the idle states */ +static int zynq_cpuidle_probe(struct platform_device *pdev) +{ + pr_info("Xilinx Zynq CpuIdle Driver started\n"); + + return cpuidle_register(&zynq_idle_driver, NULL); +} + +static struct platform_driver zynq_cpuidle_driver = { + .driver = { + .name = "cpuidle-zynq", + }, + .probe = zynq_cpuidle_probe, +}; +builtin_platform_driver(zynq_cpuidle_driver); diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c new file mode 100644 index 0000000000..737a026ef5 --- /dev/null +++ b/drivers/cpuidle/cpuidle.c @@ -0,0 +1,816 @@ +/* + * cpuidle.c - core cpuidle infrastructure + * + * (C) 2006-2007 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Shaohua Li <shaohua.li@intel.com> + * Adam Belay <abelay@novell.com> + * + * This code is licenced under the GPL. + */ + +#include "linux/percpu-defs.h" +#include <linux/clockchips.h> +#include <linux/kernel.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/sched/idle.h> +#include <linux/notifier.h> +#include <linux/pm_qos.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/ktime.h> +#include <linux/hrtimer.h> +#include <linux/module.h> +#include <linux/suspend.h> +#include <linux/tick.h> +#include <linux/mmu_context.h> +#include <linux/context_tracking.h> +#include <trace/events/power.h> + +#include "cpuidle.h" + +DEFINE_PER_CPU(struct cpuidle_device *, cpuidle_devices); +DEFINE_PER_CPU(struct cpuidle_device, cpuidle_dev); + +DEFINE_MUTEX(cpuidle_lock); +LIST_HEAD(cpuidle_detected_devices); + +static int enabled_devices; +static int off __read_mostly; +static int initialized __read_mostly; + +int cpuidle_disabled(void) +{ + return off; +} +void disable_cpuidle(void) +{ + off = 1; +} + +bool cpuidle_not_available(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + return off || !initialized || !drv || !dev || !dev->enabled; +} + +/** + * cpuidle_play_dead - cpu off-lining + * + * Returns in case of an error or no driver + */ +int cpuidle_play_dead(void) +{ + struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + int i; + + if (!drv) + return -ENODEV; + + /* Find lowest-power state that supports long-term idle */ + for (i = drv->state_count - 1; i >= 0; i--) + if (drv->states[i].enter_dead) + return drv->states[i].enter_dead(dev, i); + + return -ENODEV; +} + +static int find_deepest_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev, + u64 max_latency_ns, + unsigned int forbidden_flags, + bool s2idle) +{ + u64 latency_req = 0; + int i, ret = 0; + + for (i = 1; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + + if (dev->states_usage[i].disable || + s->exit_latency_ns <= latency_req || + s->exit_latency_ns > max_latency_ns || + (s->flags & forbidden_flags) || + (s2idle && !s->enter_s2idle)) + continue; + + latency_req = s->exit_latency_ns; + ret = i; + } + return ret; +} + +/** + * cpuidle_use_deepest_state - Set/unset governor override mode. + * @latency_limit_ns: Idle state exit latency limit (or no override if 0). + * + * If @latency_limit_ns is nonzero, set the current CPU to use the deepest idle + * state with exit latency within @latency_limit_ns (override governors going + * forward), or do not override governors if it is zero. + */ +void cpuidle_use_deepest_state(u64 latency_limit_ns) +{ + struct cpuidle_device *dev; + + preempt_disable(); + dev = cpuidle_get_device(); + if (dev) + dev->forced_idle_latency_limit_ns = latency_limit_ns; + preempt_enable(); +} + +/** + * cpuidle_find_deepest_state - Find the deepest available idle state. + * @drv: cpuidle driver for the given CPU. + * @dev: cpuidle device for the given CPU. + * @latency_limit_ns: Idle state exit latency limit + * + * Return: the index of the deepest available idle state. + */ +int cpuidle_find_deepest_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev, + u64 latency_limit_ns) +{ + return find_deepest_state(drv, dev, latency_limit_ns, 0, false); +} + +#ifdef CONFIG_SUSPEND +static noinstr void enter_s2idle_proper(struct cpuidle_driver *drv, + struct cpuidle_device *dev, int index) +{ + struct cpuidle_state *target_state = &drv->states[index]; + ktime_t time_start, time_end; + + instrumentation_begin(); + + time_start = ns_to_ktime(local_clock_noinstr()); + + tick_freeze(); + /* + * The state used here cannot be a "coupled" one, because the "coupled" + * cpuidle mechanism enables interrupts and doing that with timekeeping + * suspended is generally unsafe. + */ + stop_critical_timings(); + if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) { + ct_cpuidle_enter(); + /* Annotate away the indirect call */ + instrumentation_begin(); + } + target_state->enter_s2idle(dev, drv, index); + if (WARN_ON_ONCE(!irqs_disabled())) + raw_local_irq_disable(); + if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) { + instrumentation_end(); + ct_cpuidle_exit(); + } + tick_unfreeze(); + start_critical_timings(); + + time_end = ns_to_ktime(local_clock_noinstr()); + + dev->states_usage[index].s2idle_time += ktime_us_delta(time_end, time_start); + dev->states_usage[index].s2idle_usage++; + instrumentation_end(); +} + +/** + * cpuidle_enter_s2idle - Enter an idle state suitable for suspend-to-idle. + * @drv: cpuidle driver for the given CPU. + * @dev: cpuidle device for the given CPU. + * + * If there are states with the ->enter_s2idle callback, find the deepest of + * them and enter it with frozen tick. + */ +int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) +{ + int index; + + /* + * Find the deepest state with ->enter_s2idle present, which guarantees + * that interrupts won't be enabled when it exits and allows the tick to + * be frozen safely. + */ + index = find_deepest_state(drv, dev, U64_MAX, 0, true); + if (index > 0) { + enter_s2idle_proper(drv, dev, index); + local_irq_enable(); + } + return index; +} +#endif /* CONFIG_SUSPEND */ + +/** + * cpuidle_enter_state - enter the state and update stats + * @dev: cpuidle device for this cpu + * @drv: cpuidle driver for this cpu + * @index: index into the states table in @drv of the state to enter + */ +noinstr int cpuidle_enter_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + int entered_state; + + struct cpuidle_state *target_state = &drv->states[index]; + bool broadcast = !!(target_state->flags & CPUIDLE_FLAG_TIMER_STOP); + ktime_t time_start, time_end; + + instrumentation_begin(); + + /* + * Tell the time framework to switch to a broadcast timer because our + * local timer will be shut down. If a local timer is used from another + * CPU as a broadcast timer, this call may fail if it is not available. + */ + if (broadcast && tick_broadcast_enter()) { + index = find_deepest_state(drv, dev, target_state->exit_latency_ns, + CPUIDLE_FLAG_TIMER_STOP, false); + if (index < 0) { + default_idle_call(); + return -EBUSY; + } + target_state = &drv->states[index]; + broadcast = false; + } + + if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED) + leave_mm(dev->cpu); + + /* Take note of the planned idle state. */ + sched_idle_set_state(target_state); + + trace_cpu_idle(index, dev->cpu); + time_start = ns_to_ktime(local_clock_noinstr()); + + stop_critical_timings(); + if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) { + ct_cpuidle_enter(); + /* Annotate away the indirect call */ + instrumentation_begin(); + } + + /* + * NOTE!! + * + * For cpuidle_state::enter() methods that do *NOT* set + * CPUIDLE_FLAG_RCU_IDLE RCU will be disabled here and these functions + * must be marked either noinstr or __cpuidle. + * + * For cpuidle_state::enter() methods that *DO* set + * CPUIDLE_FLAG_RCU_IDLE this isn't required, but they must mark the + * function calling ct_cpuidle_enter() as noinstr/__cpuidle and all + * functions called within the RCU-idle region. + */ + entered_state = target_state->enter(dev, drv, index); + + if (WARN_ONCE(!irqs_disabled(), "%ps leaked IRQ state", target_state->enter)) + raw_local_irq_disable(); + + if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) { + instrumentation_end(); + ct_cpuidle_exit(); + } + start_critical_timings(); + + sched_clock_idle_wakeup_event(); + time_end = ns_to_ktime(local_clock_noinstr()); + trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu); + + /* The cpu is no longer idle or about to enter idle. */ + sched_idle_set_state(NULL); + + if (broadcast) + tick_broadcast_exit(); + + if (!cpuidle_state_is_coupled(drv, index)) + local_irq_enable(); + + if (entered_state >= 0) { + s64 diff, delay = drv->states[entered_state].exit_latency_ns; + int i; + + /* + * Update cpuidle counters + * This can be moved to within driver enter routine, + * but that results in multiple copies of same code. + */ + diff = ktime_sub(time_end, time_start); + + dev->last_residency_ns = diff; + dev->states_usage[entered_state].time_ns += diff; + dev->states_usage[entered_state].usage++; + + if (diff < drv->states[entered_state].target_residency_ns) { + for (i = entered_state - 1; i >= 0; i--) { + if (dev->states_usage[i].disable) + continue; + + /* Shallower states are enabled, so update. */ + dev->states_usage[entered_state].above++; + trace_cpu_idle_miss(dev->cpu, entered_state, false); + break; + } + } else if (diff > delay) { + for (i = entered_state + 1; i < drv->state_count; i++) { + if (dev->states_usage[i].disable) + continue; + + /* + * Update if a deeper state would have been a + * better match for the observed idle duration. + */ + if (diff - delay >= drv->states[i].target_residency_ns) { + dev->states_usage[entered_state].below++; + trace_cpu_idle_miss(dev->cpu, entered_state, true); + } + + break; + } + } + } else { + dev->last_residency_ns = 0; + dev->states_usage[index].rejected++; + } + + instrumentation_end(); + + return entered_state; +} + +/** + * cpuidle_select - ask the cpuidle framework to choose an idle state + * + * @drv: the cpuidle driver + * @dev: the cpuidle device + * @stop_tick: indication on whether or not to stop the tick + * + * Returns the index of the idle state. The return value must not be negative. + * + * The memory location pointed to by @stop_tick is expected to be written the + * 'false' boolean value if the scheduler tick should not be stopped before + * entering the returned state. + */ +int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, + bool *stop_tick) +{ + return cpuidle_curr_governor->select(drv, dev, stop_tick); +} + +/** + * cpuidle_enter - enter into the specified idle state + * + * @drv: the cpuidle driver tied with the cpu + * @dev: the cpuidle device + * @index: the index in the idle state table + * + * Returns the index in the idle state, < 0 in case of error. + * The error code depends on the backend driver + */ +int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, + int index) +{ + int ret = 0; + + /* + * Store the next hrtimer, which becomes either next tick or the next + * timer event, whatever expires first. Additionally, to make this data + * useful for consumers outside cpuidle, we rely on that the governor's + * ->select() callback have decided, whether to stop the tick or not. + */ + WRITE_ONCE(dev->next_hrtimer, tick_nohz_get_next_hrtimer()); + + if (cpuidle_state_is_coupled(drv, index)) + ret = cpuidle_enter_state_coupled(dev, drv, index); + else + ret = cpuidle_enter_state(dev, drv, index); + + WRITE_ONCE(dev->next_hrtimer, 0); + return ret; +} + +/** + * cpuidle_reflect - tell the underlying governor what was the state + * we were in + * + * @dev : the cpuidle device + * @index: the index in the idle state table + * + */ +void cpuidle_reflect(struct cpuidle_device *dev, int index) +{ + if (cpuidle_curr_governor->reflect && index >= 0) + cpuidle_curr_governor->reflect(dev, index); +} + +/* + * Min polling interval of 10usec is a guess. It is assuming that + * for most users, the time for a single ping-pong workload like + * perf bench pipe would generally complete within 10usec but + * this is hardware dependant. Actual time can be estimated with + * + * perf bench sched pipe -l 10000 + * + * Run multiple times to avoid cpufreq effects. + */ +#define CPUIDLE_POLL_MIN 10000 +#define CPUIDLE_POLL_MAX (TICK_NSEC / 16) + +/** + * cpuidle_poll_time - return amount of time to poll for, + * governors can override dev->poll_limit_ns if necessary + * + * @drv: the cpuidle driver tied with the cpu + * @dev: the cpuidle device + * + */ +__cpuidle u64 cpuidle_poll_time(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + int i; + u64 limit_ns; + + BUILD_BUG_ON(CPUIDLE_POLL_MIN > CPUIDLE_POLL_MAX); + + if (dev->poll_limit_ns) + return dev->poll_limit_ns; + + limit_ns = CPUIDLE_POLL_MAX; + for (i = 1; i < drv->state_count; i++) { + u64 state_limit; + + if (dev->states_usage[i].disable) + continue; + + state_limit = drv->states[i].target_residency_ns; + if (state_limit < CPUIDLE_POLL_MIN) + continue; + + limit_ns = min_t(u64, state_limit, CPUIDLE_POLL_MAX); + break; + } + + dev->poll_limit_ns = limit_ns; + + return dev->poll_limit_ns; +} + +/** + * cpuidle_install_idle_handler - installs the cpuidle idle loop handler + */ +void cpuidle_install_idle_handler(void) +{ + if (enabled_devices) { + /* Make sure all changes finished before we switch to new idle */ + smp_wmb(); + initialized = 1; + } +} + +/** + * cpuidle_uninstall_idle_handler - uninstalls the cpuidle idle loop handler + */ +void cpuidle_uninstall_idle_handler(void) +{ + if (enabled_devices) { + initialized = 0; + wake_up_all_idle_cpus(); + } + + /* + * Make sure external observers (such as the scheduler) + * are done looking at pointed idle states. + */ + synchronize_rcu(); +} + +/** + * cpuidle_pause_and_lock - temporarily disables CPUIDLE + */ +void cpuidle_pause_and_lock(void) +{ + mutex_lock(&cpuidle_lock); + cpuidle_uninstall_idle_handler(); +} + +EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock); + +/** + * cpuidle_resume_and_unlock - resumes CPUIDLE operation + */ +void cpuidle_resume_and_unlock(void) +{ + cpuidle_install_idle_handler(); + mutex_unlock(&cpuidle_lock); +} + +EXPORT_SYMBOL_GPL(cpuidle_resume_and_unlock); + +/* Currently used in suspend/resume path to suspend cpuidle */ +void cpuidle_pause(void) +{ + mutex_lock(&cpuidle_lock); + cpuidle_uninstall_idle_handler(); + mutex_unlock(&cpuidle_lock); +} + +/* Currently used in suspend/resume path to resume cpuidle */ +void cpuidle_resume(void) +{ + mutex_lock(&cpuidle_lock); + cpuidle_install_idle_handler(); + mutex_unlock(&cpuidle_lock); +} + +/** + * cpuidle_enable_device - enables idle PM for a CPU + * @dev: the CPU + * + * This function must be called between cpuidle_pause_and_lock and + * cpuidle_resume_and_unlock when used externally. + */ +int cpuidle_enable_device(struct cpuidle_device *dev) +{ + int ret; + struct cpuidle_driver *drv; + + if (!dev) + return -EINVAL; + + if (dev->enabled) + return 0; + + if (!cpuidle_curr_governor) + return -EIO; + + drv = cpuidle_get_cpu_driver(dev); + + if (!drv) + return -EIO; + + if (!dev->registered) + return -EINVAL; + + ret = cpuidle_add_device_sysfs(dev); + if (ret) + return ret; + + if (cpuidle_curr_governor->enable) { + ret = cpuidle_curr_governor->enable(drv, dev); + if (ret) + goto fail_sysfs; + } + + smp_wmb(); + + dev->enabled = 1; + + enabled_devices++; + return 0; + +fail_sysfs: + cpuidle_remove_device_sysfs(dev); + + return ret; +} + +EXPORT_SYMBOL_GPL(cpuidle_enable_device); + +/** + * cpuidle_disable_device - disables idle PM for a CPU + * @dev: the CPU + * + * This function must be called between cpuidle_pause_and_lock and + * cpuidle_resume_and_unlock when used externally. + */ +void cpuidle_disable_device(struct cpuidle_device *dev) +{ + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + + if (!dev || !dev->enabled) + return; + + if (!drv || !cpuidle_curr_governor) + return; + + dev->enabled = 0; + + if (cpuidle_curr_governor->disable) + cpuidle_curr_governor->disable(drv, dev); + + cpuidle_remove_device_sysfs(dev); + enabled_devices--; +} + +EXPORT_SYMBOL_GPL(cpuidle_disable_device); + +static void __cpuidle_unregister_device(struct cpuidle_device *dev) +{ + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + + list_del(&dev->device_list); + per_cpu(cpuidle_devices, dev->cpu) = NULL; + module_put(drv->owner); + + dev->registered = 0; +} + +static void __cpuidle_device_init(struct cpuidle_device *dev) +{ + memset(dev->states_usage, 0, sizeof(dev->states_usage)); + dev->last_residency_ns = 0; + dev->next_hrtimer = 0; +} + +/** + * __cpuidle_register_device - internal register function called before register + * and enable routines + * @dev: the cpu + * + * cpuidle_lock mutex must be held before this is called + */ +static int __cpuidle_register_device(struct cpuidle_device *dev) +{ + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + int i, ret; + + if (!try_module_get(drv->owner)) + return -EINVAL; + + for (i = 0; i < drv->state_count; i++) { + if (drv->states[i].flags & CPUIDLE_FLAG_UNUSABLE) + dev->states_usage[i].disable |= CPUIDLE_STATE_DISABLED_BY_DRIVER; + + if (drv->states[i].flags & CPUIDLE_FLAG_OFF) + dev->states_usage[i].disable |= CPUIDLE_STATE_DISABLED_BY_USER; + } + + per_cpu(cpuidle_devices, dev->cpu) = dev; + list_add(&dev->device_list, &cpuidle_detected_devices); + + ret = cpuidle_coupled_register_device(dev); + if (ret) + __cpuidle_unregister_device(dev); + else + dev->registered = 1; + + return ret; +} + +/** + * cpuidle_register_device - registers a CPU's idle PM feature + * @dev: the cpu + */ +int cpuidle_register_device(struct cpuidle_device *dev) +{ + int ret = -EBUSY; + + if (!dev) + return -EINVAL; + + mutex_lock(&cpuidle_lock); + + if (dev->registered) + goto out_unlock; + + __cpuidle_device_init(dev); + + ret = __cpuidle_register_device(dev); + if (ret) + goto out_unlock; + + ret = cpuidle_add_sysfs(dev); + if (ret) + goto out_unregister; + + ret = cpuidle_enable_device(dev); + if (ret) + goto out_sysfs; + + cpuidle_install_idle_handler(); + +out_unlock: + mutex_unlock(&cpuidle_lock); + + return ret; + +out_sysfs: + cpuidle_remove_sysfs(dev); +out_unregister: + __cpuidle_unregister_device(dev); + goto out_unlock; +} + +EXPORT_SYMBOL_GPL(cpuidle_register_device); + +/** + * cpuidle_unregister_device - unregisters a CPU's idle PM feature + * @dev: the cpu + */ +void cpuidle_unregister_device(struct cpuidle_device *dev) +{ + if (!dev || dev->registered == 0) + return; + + cpuidle_pause_and_lock(); + + cpuidle_disable_device(dev); + + cpuidle_remove_sysfs(dev); + + __cpuidle_unregister_device(dev); + + cpuidle_coupled_unregister_device(dev); + + cpuidle_resume_and_unlock(); +} + +EXPORT_SYMBOL_GPL(cpuidle_unregister_device); + +/** + * cpuidle_unregister: unregister a driver and the devices. This function + * can be used only if the driver has been previously registered through + * the cpuidle_register function. + * + * @drv: a valid pointer to a struct cpuidle_driver + */ +void cpuidle_unregister(struct cpuidle_driver *drv) +{ + int cpu; + struct cpuidle_device *device; + + for_each_cpu(cpu, drv->cpumask) { + device = &per_cpu(cpuidle_dev, cpu); + cpuidle_unregister_device(device); + } + + cpuidle_unregister_driver(drv); +} +EXPORT_SYMBOL_GPL(cpuidle_unregister); + +/** + * cpuidle_register: registers the driver and the cpu devices with the + * coupled_cpus passed as parameter. This function is used for all common + * initialization pattern there are in the arch specific drivers. The + * devices is globally defined in this file. + * + * @drv : a valid pointer to a struct cpuidle_driver + * @coupled_cpus: a cpumask for the coupled states + * + * Returns 0 on success, < 0 otherwise + */ +int cpuidle_register(struct cpuidle_driver *drv, + const struct cpumask *const coupled_cpus) +{ + int ret, cpu; + struct cpuidle_device *device; + + ret = cpuidle_register_driver(drv); + if (ret) { + pr_err("failed to register cpuidle driver\n"); + return ret; + } + + for_each_cpu(cpu, drv->cpumask) { + device = &per_cpu(cpuidle_dev, cpu); + device->cpu = cpu; + +#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED + /* + * On multiplatform for ARM, the coupled idle states could be + * enabled in the kernel even if the cpuidle driver does not + * use it. Note, coupled_cpus is a struct copy. + */ + if (coupled_cpus) + device->coupled_cpus = *coupled_cpus; +#endif + ret = cpuidle_register_device(device); + if (!ret) + continue; + + pr_err("Failed to register cpuidle device for cpu%d\n", cpu); + + cpuidle_unregister(drv); + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(cpuidle_register); + +/** + * cpuidle_init - core initializer + */ +static int __init cpuidle_init(void) +{ + if (cpuidle_disabled()) + return -ENODEV; + + return cpuidle_add_interface(); +} + +module_param(off, int, 0444); +module_param_string(governor, param_governor, CPUIDLE_NAME_LEN, 0444); +core_initcall(cpuidle_init); diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h new file mode 100644 index 0000000000..52701d9588 --- /dev/null +++ b/drivers/cpuidle/cpuidle.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * cpuidle.h - The internal header file + */ + +#ifndef __DRIVER_CPUIDLE_H +#define __DRIVER_CPUIDLE_H + +/* For internal use only */ +extern char param_governor[]; +extern struct cpuidle_governor *cpuidle_curr_governor; +extern struct cpuidle_governor *cpuidle_prev_governor; +extern struct list_head cpuidle_governors; +extern struct list_head cpuidle_detected_devices; +extern struct mutex cpuidle_lock; +extern spinlock_t cpuidle_driver_lock; +extern int cpuidle_disabled(void); +extern int cpuidle_enter_state(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int next_state); + +/* idle loop */ +extern void cpuidle_install_idle_handler(void); +extern void cpuidle_uninstall_idle_handler(void); + +/* governors */ +extern struct cpuidle_governor *cpuidle_find_governor(const char *str); +extern int cpuidle_switch_governor(struct cpuidle_governor *gov); + +/* sysfs */ + +struct device; + +extern int cpuidle_add_interface(void); +extern void cpuidle_remove_interface(struct device *dev); +extern int cpuidle_add_device_sysfs(struct cpuidle_device *device); +extern void cpuidle_remove_device_sysfs(struct cpuidle_device *device); +extern int cpuidle_add_sysfs(struct cpuidle_device *dev); +extern void cpuidle_remove_sysfs(struct cpuidle_device *dev); + +#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED +bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state); +int cpuidle_coupled_state_verify(struct cpuidle_driver *drv); +int cpuidle_enter_state_coupled(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int next_state); +int cpuidle_coupled_register_device(struct cpuidle_device *dev); +void cpuidle_coupled_unregister_device(struct cpuidle_device *dev); +#else +static inline +bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state) +{ + return false; +} + +static inline int cpuidle_coupled_state_verify(struct cpuidle_driver *drv) +{ + return 0; +} + +static inline int cpuidle_enter_state_coupled(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int next_state) +{ + return -1; +} + +static inline int cpuidle_coupled_register_device(struct cpuidle_device *dev) +{ + return 0; +} + +static inline void cpuidle_coupled_unregister_device(struct cpuidle_device *dev) +{ +} +#endif + +#endif /* __DRIVER_CPUIDLE_H */ diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c new file mode 100644 index 0000000000..d9cda7f6cc --- /dev/null +++ b/drivers/cpuidle/driver.c @@ -0,0 +1,391 @@ +/* + * driver.c - driver support + * + * (C) 2006-2007 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Shaohua Li <shaohua.li@intel.com> + * Adam Belay <abelay@novell.com> + * + * This code is licenced under the GPL. + */ + +#include <linux/mutex.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/sched/idle.h> +#include <linux/cpuidle.h> +#include <linux/cpumask.h> +#include <linux/tick.h> +#include <linux/cpu.h> + +#include "cpuidle.h" + +DEFINE_SPINLOCK(cpuidle_driver_lock); + +#ifdef CONFIG_CPU_IDLE_MULTIPLE_DRIVERS + +static DEFINE_PER_CPU(struct cpuidle_driver *, cpuidle_drivers); + +/** + * __cpuidle_get_cpu_driver - return the cpuidle driver tied to a CPU. + * @cpu: the CPU handled by the driver + * + * Returns a pointer to struct cpuidle_driver or NULL if no driver has been + * registered for @cpu. + */ +static struct cpuidle_driver *__cpuidle_get_cpu_driver(int cpu) +{ + return per_cpu(cpuidle_drivers, cpu); +} + +/** + * __cpuidle_unset_driver - unset per CPU driver variables. + * @drv: a valid pointer to a struct cpuidle_driver + * + * For each CPU in the driver's CPU mask, unset the registered driver per CPU + * variable. If @drv is different from the registered driver, the corresponding + * variable is not cleared. + */ +static inline void __cpuidle_unset_driver(struct cpuidle_driver *drv) +{ + int cpu; + + for_each_cpu(cpu, drv->cpumask) { + + if (drv != __cpuidle_get_cpu_driver(cpu)) + continue; + + per_cpu(cpuidle_drivers, cpu) = NULL; + } +} + +/** + * __cpuidle_set_driver - set per CPU driver variables for the given driver. + * @drv: a valid pointer to a struct cpuidle_driver + * + * Returns 0 on success, -EBUSY if any CPU in the cpumask have a driver + * different from drv already. + */ +static inline int __cpuidle_set_driver(struct cpuidle_driver *drv) +{ + int cpu; + + for_each_cpu(cpu, drv->cpumask) { + struct cpuidle_driver *old_drv; + + old_drv = __cpuidle_get_cpu_driver(cpu); + if (old_drv && old_drv != drv) + return -EBUSY; + } + + for_each_cpu(cpu, drv->cpumask) + per_cpu(cpuidle_drivers, cpu) = drv; + + return 0; +} + +#else + +static struct cpuidle_driver *cpuidle_curr_driver; + +/** + * __cpuidle_get_cpu_driver - return the global cpuidle driver pointer. + * @cpu: ignored without the multiple driver support + * + * Return a pointer to a struct cpuidle_driver object or NULL if no driver was + * previously registered. + */ +static inline struct cpuidle_driver *__cpuidle_get_cpu_driver(int cpu) +{ + return cpuidle_curr_driver; +} + +/** + * __cpuidle_set_driver - assign the global cpuidle driver variable. + * @drv: pointer to a struct cpuidle_driver object + * + * Returns 0 on success, -EBUSY if the driver is already registered. + */ +static inline int __cpuidle_set_driver(struct cpuidle_driver *drv) +{ + if (cpuidle_curr_driver) + return -EBUSY; + + cpuidle_curr_driver = drv; + + return 0; +} + +/** + * __cpuidle_unset_driver - unset the global cpuidle driver variable. + * @drv: a pointer to a struct cpuidle_driver + * + * Reset the global cpuidle variable to NULL. If @drv does not match the + * registered driver, do nothing. + */ +static inline void __cpuidle_unset_driver(struct cpuidle_driver *drv) +{ + if (drv == cpuidle_curr_driver) + cpuidle_curr_driver = NULL; +} + +#endif + +/** + * cpuidle_setup_broadcast_timer - enable/disable the broadcast timer on a cpu + * @arg: a void pointer used to match the SMP cross call API + * + * If @arg is NULL broadcast is disabled otherwise enabled + * + * This function is executed per CPU by an SMP cross call. It's not + * supposed to be called directly. + */ +static void cpuidle_setup_broadcast_timer(void *arg) +{ + if (arg) + tick_broadcast_enable(); + else + tick_broadcast_disable(); +} + +/** + * __cpuidle_driver_init - initialize the driver's internal data + * @drv: a valid pointer to a struct cpuidle_driver + */ +static void __cpuidle_driver_init(struct cpuidle_driver *drv) +{ + int i; + + /* + * Use all possible CPUs as the default, because if the kernel boots + * with some CPUs offline and then we online one of them, the CPU + * notifier has to know which driver to assign. + */ + if (!drv->cpumask) + drv->cpumask = (struct cpumask *)cpu_possible_mask; + + for (i = 0; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + + /* + * Look for the timer stop flag in the different states and if + * it is found, indicate that the broadcast timer has to be set + * up. + */ + if (s->flags & CPUIDLE_FLAG_TIMER_STOP) + drv->bctimer = 1; + + /* + * The core will use the target residency and exit latency + * values in nanoseconds, but allow drivers to provide them in + * microseconds too. + */ + if (s->target_residency > 0) + s->target_residency_ns = s->target_residency * NSEC_PER_USEC; + else if (s->target_residency_ns < 0) + s->target_residency_ns = 0; + else + s->target_residency = div_u64(s->target_residency_ns, NSEC_PER_USEC); + + if (s->exit_latency > 0) + s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC; + else if (s->exit_latency_ns < 0) + s->exit_latency_ns = 0; + else + s->exit_latency = div_u64(s->exit_latency_ns, NSEC_PER_USEC); + } +} + +/** + * __cpuidle_register_driver: register the driver + * @drv: a valid pointer to a struct cpuidle_driver + * + * Do some sanity checks, initialize the driver, assign the driver to the + * global cpuidle driver variable(s) and set up the broadcast timer if the + * cpuidle driver has some states that shut down the local timer. + * + * Returns 0 on success, a negative error code otherwise: + * * -EINVAL if the driver pointer is NULL or no idle states are available + * * -ENODEV if the cpuidle framework is disabled + * * -EBUSY if the driver is already assigned to the global variable(s) + */ +static int __cpuidle_register_driver(struct cpuidle_driver *drv) +{ + int ret; + + if (!drv || !drv->state_count) + return -EINVAL; + + ret = cpuidle_coupled_state_verify(drv); + if (ret) + return ret; + + if (cpuidle_disabled()) + return -ENODEV; + + __cpuidle_driver_init(drv); + + ret = __cpuidle_set_driver(drv); + if (ret) + return ret; + + if (drv->bctimer) + on_each_cpu_mask(drv->cpumask, cpuidle_setup_broadcast_timer, + (void *)1, 1); + + return 0; +} + +/** + * __cpuidle_unregister_driver - unregister the driver + * @drv: a valid pointer to a struct cpuidle_driver + * + * Check if the driver is no longer in use, reset the global cpuidle driver + * variable(s) and disable the timer broadcast notification mechanism if it was + * in use. + * + */ +static void __cpuidle_unregister_driver(struct cpuidle_driver *drv) +{ + if (drv->bctimer) { + drv->bctimer = 0; + on_each_cpu_mask(drv->cpumask, cpuidle_setup_broadcast_timer, + NULL, 1); + } + + __cpuidle_unset_driver(drv); +} + +/** + * cpuidle_register_driver - registers a driver + * @drv: a pointer to a valid struct cpuidle_driver + * + * Register the driver under a lock to prevent concurrent attempts to + * [un]register the driver from occuring at the same time. + * + * Returns 0 on success, a negative error code (returned by + * __cpuidle_register_driver()) otherwise. + */ +int cpuidle_register_driver(struct cpuidle_driver *drv) +{ + struct cpuidle_governor *gov; + int ret; + + spin_lock(&cpuidle_driver_lock); + ret = __cpuidle_register_driver(drv); + spin_unlock(&cpuidle_driver_lock); + + if (!ret && !strlen(param_governor) && drv->governor && + (cpuidle_get_driver() == drv)) { + mutex_lock(&cpuidle_lock); + gov = cpuidle_find_governor(drv->governor); + if (gov) { + cpuidle_prev_governor = cpuidle_curr_governor; + if (cpuidle_switch_governor(gov) < 0) + cpuidle_prev_governor = NULL; + } + mutex_unlock(&cpuidle_lock); + } + + return ret; +} +EXPORT_SYMBOL_GPL(cpuidle_register_driver); + +/** + * cpuidle_unregister_driver - unregisters a driver + * @drv: a pointer to a valid struct cpuidle_driver + * + * Unregisters the cpuidle driver under a lock to prevent concurrent attempts + * to [un]register the driver from occuring at the same time. @drv has to + * match the currently registered driver. + */ +void cpuidle_unregister_driver(struct cpuidle_driver *drv) +{ + bool enabled = (cpuidle_get_driver() == drv); + + spin_lock(&cpuidle_driver_lock); + __cpuidle_unregister_driver(drv); + spin_unlock(&cpuidle_driver_lock); + + if (!enabled) + return; + + mutex_lock(&cpuidle_lock); + if (cpuidle_prev_governor) { + if (!cpuidle_switch_governor(cpuidle_prev_governor)) + cpuidle_prev_governor = NULL; + } + mutex_unlock(&cpuidle_lock); +} +EXPORT_SYMBOL_GPL(cpuidle_unregister_driver); + +/** + * cpuidle_get_driver - return the driver tied to the current CPU. + * + * Returns a struct cpuidle_driver pointer, or NULL if no driver is registered. + */ +struct cpuidle_driver *cpuidle_get_driver(void) +{ + struct cpuidle_driver *drv; + int cpu; + + cpu = get_cpu(); + drv = __cpuidle_get_cpu_driver(cpu); + put_cpu(); + + return drv; +} +EXPORT_SYMBOL_GPL(cpuidle_get_driver); + +/** + * cpuidle_get_cpu_driver - return the driver registered for a CPU. + * @dev: a valid pointer to a struct cpuidle_device + * + * Returns a struct cpuidle_driver pointer, or NULL if no driver is registered + * for the CPU associated with @dev. + */ +struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev) +{ + if (!dev) + return NULL; + + return __cpuidle_get_cpu_driver(dev->cpu); +} +EXPORT_SYMBOL_GPL(cpuidle_get_cpu_driver); + +/** + * cpuidle_driver_state_disabled - Disable or enable an idle state + * @drv: cpuidle driver owning the state + * @idx: State index + * @disable: Whether or not to disable the state + */ +void cpuidle_driver_state_disabled(struct cpuidle_driver *drv, int idx, + bool disable) +{ + unsigned int cpu; + + mutex_lock(&cpuidle_lock); + + spin_lock(&cpuidle_driver_lock); + + if (!drv->cpumask) { + drv->states[idx].flags |= CPUIDLE_FLAG_UNUSABLE; + goto unlock; + } + + for_each_cpu(cpu, drv->cpumask) { + struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu); + + if (!dev) + continue; + + if (disable) + dev->states_usage[idx].disable |= CPUIDLE_STATE_DISABLED_BY_DRIVER; + else + dev->states_usage[idx].disable &= ~CPUIDLE_STATE_DISABLED_BY_DRIVER; + } + +unlock: + spin_unlock(&cpuidle_driver_lock); + + mutex_unlock(&cpuidle_lock); +} diff --git a/drivers/cpuidle/dt_idle_genpd.c b/drivers/cpuidle/dt_idle_genpd.c new file mode 100644 index 0000000000..1af63c1890 --- /dev/null +++ b/drivers/cpuidle/dt_idle_genpd.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * PM domains for CPUs via genpd. + * + * Copyright (C) 2019 Linaro Ltd. + * Author: Ulf Hansson <ulf.hansson@linaro.org> + * + * Copyright (c) 2021 Western Digital Corporation or its affiliates. + * Copyright (c) 2022 Ventana Micro Systems Inc. + */ + +#define pr_fmt(fmt) "dt-idle-genpd: " fmt + +#include <linux/cpu.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/pm_domain.h> +#include <linux/pm_runtime.h> +#include <linux/slab.h> +#include <linux/string.h> + +#include "dt_idle_genpd.h" + +static int pd_parse_state_nodes( + int (*parse_state)(struct device_node *, u32 *), + struct genpd_power_state *states, int state_count) +{ + int i, ret; + u32 state, *state_buf; + + for (i = 0; i < state_count; i++) { + ret = parse_state(to_of_node(states[i].fwnode), &state); + if (ret) + goto free_state; + + state_buf = kmalloc(sizeof(u32), GFP_KERNEL); + if (!state_buf) { + ret = -ENOMEM; + goto free_state; + } + *state_buf = state; + states[i].data = state_buf; + } + + return 0; + +free_state: + i--; + for (; i >= 0; i--) + kfree(states[i].data); + return ret; +} + +static int pd_parse_states(struct device_node *np, + int (*parse_state)(struct device_node *, u32 *), + struct genpd_power_state **states, + int *state_count) +{ + int ret; + + /* Parse the domain idle states. */ + ret = of_genpd_parse_idle_states(np, states, state_count); + if (ret) + return ret; + + /* Fill out the dt specifics for each found state. */ + ret = pd_parse_state_nodes(parse_state, *states, *state_count); + if (ret) + kfree(*states); + + return ret; +} + +static void pd_free_states(struct genpd_power_state *states, + unsigned int state_count) +{ + int i; + + for (i = 0; i < state_count; i++) + kfree(states[i].data); + kfree(states); +} + +void dt_idle_pd_free(struct generic_pm_domain *pd) +{ + pd_free_states(pd->states, pd->state_count); + kfree(pd->name); + kfree(pd); +} + +struct generic_pm_domain *dt_idle_pd_alloc(struct device_node *np, + int (*parse_state)(struct device_node *, u32 *)) +{ + struct generic_pm_domain *pd; + struct genpd_power_state *states = NULL; + int ret, state_count = 0; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + goto out; + + pd->name = kasprintf(GFP_KERNEL, "%pOF", np); + if (!pd->name) + goto free_pd; + + /* + * Parse the domain idle states and let genpd manage the state selection + * for those being compatible with "domain-idle-state". + */ + ret = pd_parse_states(np, parse_state, &states, &state_count); + if (ret) + goto free_name; + + pd->free_states = pd_free_states; + pd->name = kbasename(pd->name); + pd->states = states; + pd->state_count = state_count; + + pr_debug("alloc PM domain %s\n", pd->name); + return pd; + +free_name: + kfree(pd->name); +free_pd: + kfree(pd); +out: + pr_err("failed to alloc PM domain %pOF\n", np); + return NULL; +} + +int dt_idle_pd_init_topology(struct device_node *np) +{ + struct device_node *node; + struct of_phandle_args child, parent; + int ret; + + for_each_child_of_node(np, node) { + if (of_parse_phandle_with_args(node, "power-domains", + "#power-domain-cells", 0, &parent)) + continue; + + child.np = node; + child.args_count = 0; + ret = of_genpd_add_subdomain(&parent, &child); + of_node_put(parent.np); + if (ret) { + of_node_put(node); + return ret; + } + } + + return 0; +} + +int dt_idle_pd_remove_topology(struct device_node *np) +{ + struct device_node *node; + struct of_phandle_args child, parent; + int ret; + + for_each_child_of_node(np, node) { + if (of_parse_phandle_with_args(node, "power-domains", + "#power-domain-cells", 0, &parent)) + continue; + + child.np = node; + child.args_count = 0; + ret = of_genpd_remove_subdomain(&parent, &child); + of_node_put(parent.np); + if (ret) { + of_node_put(node); + return ret; + } + } + + return 0; +} + +struct device *dt_idle_attach_cpu(int cpu, const char *name) +{ + struct device *dev; + + dev = dev_pm_domain_attach_by_name(get_cpu_device(cpu), name); + if (IS_ERR_OR_NULL(dev)) + return dev; + + pm_runtime_irq_safe(dev); + if (cpu_online(cpu)) + pm_runtime_get_sync(dev); + + dev_pm_syscore_device(dev, true); + + return dev; +} + +void dt_idle_detach_cpu(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev)) + return; + + dev_pm_domain_detach(dev, false); +} diff --git a/drivers/cpuidle/dt_idle_genpd.h b/drivers/cpuidle/dt_idle_genpd.h new file mode 100644 index 0000000000..3be1f70f55 --- /dev/null +++ b/drivers/cpuidle/dt_idle_genpd.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __DT_IDLE_GENPD +#define __DT_IDLE_GENPD + +struct device_node; +struct generic_pm_domain; + +#ifdef CONFIG_DT_IDLE_GENPD + +void dt_idle_pd_free(struct generic_pm_domain *pd); + +struct generic_pm_domain *dt_idle_pd_alloc(struct device_node *np, + int (*parse_state)(struct device_node *, u32 *)); + +int dt_idle_pd_init_topology(struct device_node *np); + +int dt_idle_pd_remove_topology(struct device_node *np); + +struct device *dt_idle_attach_cpu(int cpu, const char *name); + +void dt_idle_detach_cpu(struct device *dev); + +#else + +static inline void dt_idle_pd_free(struct generic_pm_domain *pd) +{ +} + +static inline struct generic_pm_domain *dt_idle_pd_alloc( + struct device_node *np, + int (*parse_state)(struct device_node *, u32 *)) +{ + return NULL; +} + +static inline int dt_idle_pd_init_topology(struct device_node *np) +{ + return 0; +} + +static inline int dt_idle_pd_remove_topology(struct device_node *np) +{ + return 0; +} + +static inline struct device *dt_idle_attach_cpu(int cpu, const char *name) +{ + return NULL; +} + +static inline void dt_idle_detach_cpu(struct device *dev) +{ +} + +#endif + +#endif diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c new file mode 100644 index 0000000000..12fec92a85 --- /dev/null +++ b/drivers/cpuidle/dt_idle_states.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * DT idle states parsing code. + * + * Copyright (C) 2014 ARM Ltd. + * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> + */ + +#define pr_fmt(fmt) "DT idle-states: " fmt + +#include <linux/cpuidle.h> +#include <linux/cpumask.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/of.h> + +#include "dt_idle_states.h" + +static int init_state_node(struct cpuidle_state *idle_state, + const struct of_device_id *match_id, + struct device_node *state_node) +{ + int err; + const char *desc; + + /* + * CPUidle drivers are expected to initialize the const void *data + * pointer of the passed in struct of_device_id array to the idle + * state enter function. + */ + idle_state->enter = match_id->data; + /* + * Since this is not a "coupled" state, it's safe to assume interrupts + * won't be enabled when it exits allowing the tick to be frozen + * safely. So enter() can be also enter_s2idle() callback. + */ + idle_state->enter_s2idle = match_id->data; + + err = of_property_read_u32(state_node, "wakeup-latency-us", + &idle_state->exit_latency); + if (err) { + u32 entry_latency, exit_latency; + + err = of_property_read_u32(state_node, "entry-latency-us", + &entry_latency); + if (err) { + pr_debug(" * %pOF missing entry-latency-us property\n", + state_node); + return -EINVAL; + } + + err = of_property_read_u32(state_node, "exit-latency-us", + &exit_latency); + if (err) { + pr_debug(" * %pOF missing exit-latency-us property\n", + state_node); + return -EINVAL; + } + /* + * If wakeup-latency-us is missing, default to entry+exit + * latencies as defined in idle states bindings + */ + idle_state->exit_latency = entry_latency + exit_latency; + } + + err = of_property_read_u32(state_node, "min-residency-us", + &idle_state->target_residency); + if (err) { + pr_debug(" * %pOF missing min-residency-us property\n", + state_node); + return -EINVAL; + } + + err = of_property_read_string(state_node, "idle-state-name", &desc); + if (err) + desc = state_node->name; + + idle_state->flags = CPUIDLE_FLAG_RCU_IDLE; + if (of_property_read_bool(state_node, "local-timer-stop")) + idle_state->flags |= CPUIDLE_FLAG_TIMER_STOP; + /* + * TODO: + * replace with kstrdup and pointer assignment when name + * and desc become string pointers + */ + strncpy(idle_state->name, state_node->name, CPUIDLE_NAME_LEN - 1); + strncpy(idle_state->desc, desc, CPUIDLE_DESC_LEN - 1); + return 0; +} + +/* + * Check that the idle state is uniform across all CPUs in the CPUidle driver + * cpumask + */ +static bool idle_state_valid(struct device_node *state_node, unsigned int idx, + const cpumask_t *cpumask) +{ + int cpu; + struct device_node *cpu_node, *curr_state_node; + bool valid = true; + + /* + * Compare idle state phandles for index idx on all CPUs in the + * CPUidle driver cpumask. Start from next logical cpu following + * cpumask_first(cpumask) since that's the CPU state_node was + * retrieved from. If a mismatch is found bail out straight + * away since we certainly hit a firmware misconfiguration. + */ + for (cpu = cpumask_next(cpumask_first(cpumask), cpumask); + cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpumask)) { + cpu_node = of_cpu_device_node_get(cpu); + curr_state_node = of_get_cpu_state_node(cpu_node, idx); + if (state_node != curr_state_node) + valid = false; + + of_node_put(curr_state_node); + of_node_put(cpu_node); + if (!valid) + break; + } + + return valid; +} + +/** + * dt_init_idle_driver() - Parse the DT idle states and initialize the + * idle driver states array + * @drv: Pointer to CPU idle driver to be initialized + * @matches: Array of of_device_id match structures to search in for + * compatible idle state nodes. The data pointer for each valid + * struct of_device_id entry in the matches array must point to + * a function with the following signature, that corresponds to + * the CPUidle state enter function signature: + * + * int (*)(struct cpuidle_device *dev, + * struct cpuidle_driver *drv, + * int index); + * + * @start_idx: First idle state index to be initialized + * + * If DT idle states are detected and are valid the state count and states + * array entries in the cpuidle driver are initialized accordingly starting + * from index start_idx. + * + * Return: number of valid DT idle states parsed, <0 on failure + */ +int dt_init_idle_driver(struct cpuidle_driver *drv, + const struct of_device_id *matches, + unsigned int start_idx) +{ + struct cpuidle_state *idle_state; + struct device_node *state_node, *cpu_node; + const struct of_device_id *match_id; + int i, err = 0; + const cpumask_t *cpumask; + unsigned int state_idx = start_idx; + + if (state_idx >= CPUIDLE_STATE_MAX) + return -EINVAL; + /* + * We get the idle states for the first logical cpu in the + * driver mask (or cpu_possible_mask if the driver cpumask is not set) + * and we check through idle_state_valid() if they are uniform + * across CPUs, otherwise we hit a firmware misconfiguration. + */ + cpumask = drv->cpumask ? : cpu_possible_mask; + cpu_node = of_cpu_device_node_get(cpumask_first(cpumask)); + + for (i = 0; ; i++) { + state_node = of_get_cpu_state_node(cpu_node, i); + if (!state_node) + break; + + match_id = of_match_node(matches, state_node); + if (!match_id) { + err = -ENODEV; + break; + } + + if (!of_device_is_available(state_node)) { + of_node_put(state_node); + continue; + } + + if (!idle_state_valid(state_node, i, cpumask)) { + pr_warn("%pOF idle state not valid, bailing out\n", + state_node); + err = -EINVAL; + break; + } + + if (state_idx == CPUIDLE_STATE_MAX) { + pr_warn("State index reached static CPU idle driver states array size\n"); + break; + } + + idle_state = &drv->states[state_idx++]; + err = init_state_node(idle_state, match_id, state_node); + if (err) { + pr_err("Parsing idle state node %pOF failed with err %d\n", + state_node, err); + err = -EINVAL; + break; + } + of_node_put(state_node); + } + + of_node_put(state_node); + of_node_put(cpu_node); + if (err) + return err; + + /* Set the number of total supported idle states. */ + drv->state_count = state_idx; + + /* + * Return the number of present and valid DT idle states, which can + * also be 0 on platforms with missing DT idle states or legacy DT + * configuration predating the DT idle states bindings. + */ + return state_idx - start_idx; +} +EXPORT_SYMBOL_GPL(dt_init_idle_driver); diff --git a/drivers/cpuidle/dt_idle_states.h b/drivers/cpuidle/dt_idle_states.h new file mode 100644 index 0000000000..14ae88cef1 --- /dev/null +++ b/drivers/cpuidle/dt_idle_states.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __DT_IDLE_STATES +#define __DT_IDLE_STATES + +int dt_init_idle_driver(struct cpuidle_driver *drv, + const struct of_device_id *matches, + unsigned int start_idx); +#endif diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c new file mode 100644 index 0000000000..0d0f9751ff --- /dev/null +++ b/drivers/cpuidle/governor.c @@ -0,0 +1,119 @@ +/* + * governor.c - governor support + * + * (C) 2006-2007 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Shaohua Li <shaohua.li@intel.com> + * Adam Belay <abelay@novell.com> + * + * This code is licenced under the GPL. + */ + +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/mutex.h> +#include <linux/module.h> +#include <linux/pm_qos.h> + +#include "cpuidle.h" + +char param_governor[CPUIDLE_NAME_LEN]; + +LIST_HEAD(cpuidle_governors); +struct cpuidle_governor *cpuidle_curr_governor; +struct cpuidle_governor *cpuidle_prev_governor; + +/** + * cpuidle_find_governor - finds a governor of the specified name + * @str: the name + * + * Must be called with cpuidle_lock acquired. + */ +struct cpuidle_governor *cpuidle_find_governor(const char *str) +{ + struct cpuidle_governor *gov; + + list_for_each_entry(gov, &cpuidle_governors, governor_list) + if (!strncasecmp(str, gov->name, CPUIDLE_NAME_LEN)) + return gov; + + return NULL; +} + +/** + * cpuidle_switch_governor - changes the governor + * @gov: the new target governor + * Must be called with cpuidle_lock acquired. + */ +int cpuidle_switch_governor(struct cpuidle_governor *gov) +{ + struct cpuidle_device *dev; + + if (!gov) + return -EINVAL; + + if (gov == cpuidle_curr_governor) + return 0; + + cpuidle_uninstall_idle_handler(); + + if (cpuidle_curr_governor) { + list_for_each_entry(dev, &cpuidle_detected_devices, device_list) + cpuidle_disable_device(dev); + } + + cpuidle_curr_governor = gov; + + list_for_each_entry(dev, &cpuidle_detected_devices, device_list) + cpuidle_enable_device(dev); + + cpuidle_install_idle_handler(); + pr_info("cpuidle: using governor %s\n", gov->name); + + return 0; +} + +/** + * cpuidle_register_governor - registers a governor + * @gov: the governor + */ +int cpuidle_register_governor(struct cpuidle_governor *gov) +{ + int ret = -EEXIST; + + if (!gov || !gov->select) + return -EINVAL; + + if (cpuidle_disabled()) + return -ENODEV; + + mutex_lock(&cpuidle_lock); + if (cpuidle_find_governor(gov->name) == NULL) { + ret = 0; + list_add_tail(&gov->governor_list, &cpuidle_governors); + if (!cpuidle_curr_governor || + !strncasecmp(param_governor, gov->name, CPUIDLE_NAME_LEN) || + (cpuidle_curr_governor->rating < gov->rating && + strncasecmp(param_governor, cpuidle_curr_governor->name, + CPUIDLE_NAME_LEN))) + cpuidle_switch_governor(gov); + } + mutex_unlock(&cpuidle_lock); + + return ret; +} + +/** + * cpuidle_governor_latency_req - Compute a latency constraint for CPU + * @cpu: Target CPU + */ +s64 cpuidle_governor_latency_req(unsigned int cpu) +{ + struct device *device = get_cpu_device(cpu); + int device_req = dev_pm_qos_raw_resume_latency(device); + int global_req = cpu_latency_qos_limit(); + + if (device_req > global_req) + device_req = global_req; + + return (s64)device_req * NSEC_PER_USEC; +} diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile new file mode 100644 index 0000000000..63abb5393a --- /dev/null +++ b/drivers/cpuidle/governors/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for cpuidle governors. +# + +obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o +obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o +obj-$(CONFIG_CPU_IDLE_GOV_TEO) += teo.o +obj-$(CONFIG_CPU_IDLE_GOV_HALTPOLL) += haltpoll.o diff --git a/drivers/cpuidle/governors/gov.h b/drivers/cpuidle/governors/gov.h new file mode 100644 index 0000000000..99e067d966 --- /dev/null +++ b/drivers/cpuidle/governors/gov.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* Common definitions for cpuidle governors. */ + +#ifndef __CPUIDLE_GOVERNOR_H +#define __CPUIDLE_GOVERNOR_H + +/* + * Idle state target residency threshold used for deciding whether or not to + * check the time till the closest expected timer event. + */ +#define RESIDENCY_THRESHOLD_NS (15 * NSEC_PER_USEC) + +#endif /* __CPUIDLE_GOVERNOR_H */ diff --git a/drivers/cpuidle/governors/haltpoll.c b/drivers/cpuidle/governors/haltpoll.c new file mode 100644 index 0000000000..1dff3a5291 --- /dev/null +++ b/drivers/cpuidle/governors/haltpoll.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * haltpoll.c - haltpoll idle governor + * + * Copyright 2019 Red Hat, Inc. and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Authors: Marcelo Tosatti <mtosatti@redhat.com> + */ + +#include <linux/kernel.h> +#include <linux/cpuidle.h> +#include <linux/time.h> +#include <linux/ktime.h> +#include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/kvm_para.h> +#include <trace/events/power.h> + +static unsigned int guest_halt_poll_ns __read_mostly = 200000; +module_param(guest_halt_poll_ns, uint, 0644); + +/* division factor to shrink halt_poll_ns */ +static unsigned int guest_halt_poll_shrink __read_mostly = 2; +module_param(guest_halt_poll_shrink, uint, 0644); + +/* multiplication factor to grow per-cpu poll_limit_ns */ +static unsigned int guest_halt_poll_grow __read_mostly = 2; +module_param(guest_halt_poll_grow, uint, 0644); + +/* value in us to start growing per-cpu halt_poll_ns */ +static unsigned int guest_halt_poll_grow_start __read_mostly = 50000; +module_param(guest_halt_poll_grow_start, uint, 0644); + +/* allow shrinking guest halt poll */ +static bool guest_halt_poll_allow_shrink __read_mostly = true; +module_param(guest_halt_poll_allow_shrink, bool, 0644); + +/** + * haltpoll_select - selects the next idle state to enter + * @drv: cpuidle driver containing state data + * @dev: the CPU + * @stop_tick: indication on whether or not to stop the tick + */ +static int haltpoll_select(struct cpuidle_driver *drv, + struct cpuidle_device *dev, + bool *stop_tick) +{ + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); + + if (!drv->state_count || latency_req == 0) { + *stop_tick = false; + return 0; + } + + if (dev->poll_limit_ns == 0) + return 1; + + /* Last state was poll? */ + if (dev->last_state_idx == 0) { + /* Halt if no event occurred on poll window */ + if (dev->poll_time_limit == true) + return 1; + + *stop_tick = false; + /* Otherwise, poll again */ + return 0; + } + + *stop_tick = false; + /* Last state was halt: poll */ + return 0; +} + +static void adjust_poll_limit(struct cpuidle_device *dev, u64 block_ns) +{ + unsigned int val; + + /* Grow cpu_halt_poll_us if + * cpu_halt_poll_us < block_ns < guest_halt_poll_us + */ + if (block_ns > dev->poll_limit_ns && block_ns <= guest_halt_poll_ns) { + val = dev->poll_limit_ns * guest_halt_poll_grow; + + if (val < guest_halt_poll_grow_start) + val = guest_halt_poll_grow_start; + if (val > guest_halt_poll_ns) + val = guest_halt_poll_ns; + + trace_guest_halt_poll_ns_grow(val, dev->poll_limit_ns); + dev->poll_limit_ns = val; + } else if (block_ns > guest_halt_poll_ns && + guest_halt_poll_allow_shrink) { + unsigned int shrink = guest_halt_poll_shrink; + + val = dev->poll_limit_ns; + if (shrink == 0) + val = 0; + else + val /= shrink; + trace_guest_halt_poll_ns_shrink(val, dev->poll_limit_ns); + dev->poll_limit_ns = val; + } +} + +/** + * haltpoll_reflect - update variables and update poll time + * @dev: the CPU + * @index: the index of actual entered state + */ +static void haltpoll_reflect(struct cpuidle_device *dev, int index) +{ + dev->last_state_idx = index; + + if (index != 0) + adjust_poll_limit(dev, dev->last_residency_ns); +} + +/** + * haltpoll_enable_device - scans a CPU's states and does setup + * @drv: cpuidle driver + * @dev: the CPU + */ +static int haltpoll_enable_device(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + dev->poll_limit_ns = 0; + + return 0; +} + +static struct cpuidle_governor haltpoll_governor = { + .name = "haltpoll", + .rating = 9, + .enable = haltpoll_enable_device, + .select = haltpoll_select, + .reflect = haltpoll_reflect, +}; + +static int __init init_haltpoll(void) +{ + if (kvm_para_available()) + return cpuidle_register_governor(&haltpoll_governor); + + return 0; +} + +postcore_initcall(init_haltpoll); diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c new file mode 100644 index 0000000000..8e9058c4ea --- /dev/null +++ b/drivers/cpuidle/governors/ladder.c @@ -0,0 +1,197 @@ +/* + * ladder.c - the residency ladder algorithm + * + * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> + * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> + * Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de> + * + * (C) 2006-2007 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Shaohua Li <shaohua.li@intel.com> + * Adam Belay <abelay@novell.com> + * + * This code is licenced under the GPL. + */ + +#include <linux/kernel.h> +#include <linux/cpuidle.h> +#include <linux/jiffies.h> +#include <linux/tick.h> + +#include <asm/io.h> +#include <linux/uaccess.h> + +#define PROMOTION_COUNT 4 +#define DEMOTION_COUNT 1 + +struct ladder_device_state { + struct { + u32 promotion_count; + u32 demotion_count; + u64 promotion_time_ns; + u64 demotion_time_ns; + } threshold; + struct { + int promotion_count; + int demotion_count; + } stats; +}; + +struct ladder_device { + struct ladder_device_state states[CPUIDLE_STATE_MAX]; +}; + +static DEFINE_PER_CPU(struct ladder_device, ladder_devices); + +/** + * ladder_do_selection - prepares private data for a state change + * @ldev: the ladder device + * @old_idx: the current state index + * @new_idx: the new target state index + */ +static inline void ladder_do_selection(struct cpuidle_device *dev, + struct ladder_device *ldev, + int old_idx, int new_idx) +{ + ldev->states[old_idx].stats.promotion_count = 0; + ldev->states[old_idx].stats.demotion_count = 0; + dev->last_state_idx = new_idx; +} + +/** + * ladder_select_state - selects the next state to enter + * @drv: cpuidle driver + * @dev: the CPU + * @dummy: not used + */ +static int ladder_select_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev, bool *dummy) +{ + struct ladder_device *ldev = this_cpu_ptr(&ladder_devices); + struct ladder_device_state *last_state; + int last_idx = dev->last_state_idx; + int first_idx = drv->states[0].flags & CPUIDLE_FLAG_POLLING ? 1 : 0; + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); + s64 last_residency; + + /* Special case when user has set very strict latency requirement */ + if (unlikely(latency_req == 0)) { + ladder_do_selection(dev, ldev, last_idx, 0); + return 0; + } + + last_state = &ldev->states[last_idx]; + + last_residency = dev->last_residency_ns - drv->states[last_idx].exit_latency_ns; + + /* consider promotion */ + if (last_idx < drv->state_count - 1 && + !dev->states_usage[last_idx + 1].disable && + last_residency > last_state->threshold.promotion_time_ns && + drv->states[last_idx + 1].exit_latency_ns <= latency_req) { + last_state->stats.promotion_count++; + last_state->stats.demotion_count = 0; + if (last_state->stats.promotion_count >= last_state->threshold.promotion_count) { + ladder_do_selection(dev, ldev, last_idx, last_idx + 1); + return last_idx + 1; + } + } + + /* consider demotion */ + if (last_idx > first_idx && + (dev->states_usage[last_idx].disable || + drv->states[last_idx].exit_latency_ns > latency_req)) { + int i; + + for (i = last_idx - 1; i > first_idx; i--) { + if (drv->states[i].exit_latency_ns <= latency_req) + break; + } + ladder_do_selection(dev, ldev, last_idx, i); + return i; + } + + if (last_idx > first_idx && + last_residency < last_state->threshold.demotion_time_ns) { + last_state->stats.demotion_count++; + last_state->stats.promotion_count = 0; + if (last_state->stats.demotion_count >= last_state->threshold.demotion_count) { + ladder_do_selection(dev, ldev, last_idx, last_idx - 1); + return last_idx - 1; + } + } + + /* otherwise remain at the current state */ + return last_idx; +} + +/** + * ladder_enable_device - setup for the governor + * @drv: cpuidle driver + * @dev: the CPU + */ +static int ladder_enable_device(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + int i; + int first_idx = drv->states[0].flags & CPUIDLE_FLAG_POLLING ? 1 : 0; + struct ladder_device *ldev = &per_cpu(ladder_devices, dev->cpu); + struct ladder_device_state *lstate; + struct cpuidle_state *state; + + dev->last_state_idx = first_idx; + + for (i = first_idx; i < drv->state_count; i++) { + state = &drv->states[i]; + lstate = &ldev->states[i]; + + lstate->stats.promotion_count = 0; + lstate->stats.demotion_count = 0; + + lstate->threshold.promotion_count = PROMOTION_COUNT; + lstate->threshold.demotion_count = DEMOTION_COUNT; + + if (i < drv->state_count - 1) + lstate->threshold.promotion_time_ns = state->exit_latency_ns; + if (i > first_idx) + lstate->threshold.demotion_time_ns = state->exit_latency_ns; + } + + return 0; +} + +/** + * ladder_reflect - update the correct last_state_idx + * @dev: the CPU + * @index: the index of actual state entered + */ +static void ladder_reflect(struct cpuidle_device *dev, int index) +{ + if (index > 0) + dev->last_state_idx = index; +} + +static struct cpuidle_governor ladder_governor = { + .name = "ladder", + .rating = 10, + .enable = ladder_enable_device, + .select = ladder_select_state, + .reflect = ladder_reflect, +}; + +/** + * init_ladder - initializes the governor + */ +static int __init init_ladder(void) +{ + /* + * When NO_HZ is disabled, or when booting with nohz=off, the ladder + * governor is better so give it a higher rating than the menu + * governor. + */ + if (!tick_nohz_enabled) + ladder_governor.rating = 25; + + return cpuidle_register_governor(&ladder_governor); +} + +postcore_initcall(init_ladder); diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c new file mode 100644 index 0000000000..b96e3da0fe --- /dev/null +++ b/drivers/cpuidle/governors/menu.c @@ -0,0 +1,590 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * menu.c - the menu idle governor + * + * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com> + * Copyright (C) 2009 Intel Corporation + * Author: + * Arjan van de Ven <arjan@linux.intel.com> + */ + +#include <linux/kernel.h> +#include <linux/cpuidle.h> +#include <linux/time.h> +#include <linux/ktime.h> +#include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/sched.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/stat.h> +#include <linux/math64.h> + +#include "gov.h" + +#define BUCKETS 12 +#define INTERVAL_SHIFT 3 +#define INTERVALS (1UL << INTERVAL_SHIFT) +#define RESOLUTION 1024 +#define DECAY 8 +#define MAX_INTERESTING (50000 * NSEC_PER_USEC) + +/* + * Concepts and ideas behind the menu governor + * + * For the menu governor, there are 3 decision factors for picking a C + * state: + * 1) Energy break even point + * 2) Performance impact + * 3) Latency tolerance (from pmqos infrastructure) + * These three factors are treated independently. + * + * Energy break even point + * ----------------------- + * C state entry and exit have an energy cost, and a certain amount of time in + * the C state is required to actually break even on this cost. CPUIDLE + * provides us this duration in the "target_residency" field. So all that we + * need is a good prediction of how long we'll be idle. Like the traditional + * menu governor, we start with the actual known "next timer event" time. + * + * Since there are other source of wakeups (interrupts for example) than + * the next timer event, this estimation is rather optimistic. To get a + * more realistic estimate, a correction factor is applied to the estimate, + * that is based on historic behavior. For example, if in the past the actual + * duration always was 50% of the next timer tick, the correction factor will + * be 0.5. + * + * menu uses a running average for this correction factor, however it uses a + * set of factors, not just a single factor. This stems from the realization + * that the ratio is dependent on the order of magnitude of the expected + * duration; if we expect 500 milliseconds of idle time the likelihood of + * getting an interrupt very early is much higher than if we expect 50 micro + * seconds of idle time. A second independent factor that has big impact on + * the actual factor is if there is (disk) IO outstanding or not. + * (as a special twist, we consider every sleep longer than 50 milliseconds + * as perfect; there are no power gains for sleeping longer than this) + * + * For these two reasons we keep an array of 12 independent factors, that gets + * indexed based on the magnitude of the expected duration as well as the + * "is IO outstanding" property. + * + * Repeatable-interval-detector + * ---------------------------- + * There are some cases where "next timer" is a completely unusable predictor: + * Those cases where the interval is fixed, for example due to hardware + * interrupt mitigation, but also due to fixed transfer rate devices such as + * mice. + * For this, we use a different predictor: We track the duration of the last 8 + * intervals and if the stand deviation of these 8 intervals is below a + * threshold value, we use the average of these intervals as prediction. + * + * Limiting Performance Impact + * --------------------------- + * C states, especially those with large exit latencies, can have a real + * noticeable impact on workloads, which is not acceptable for most sysadmins, + * and in addition, less performance has a power price of its own. + * + * As a general rule of thumb, menu assumes that the following heuristic + * holds: + * The busier the system, the less impact of C states is acceptable + * + * This rule-of-thumb is implemented using a performance-multiplier: + * If the exit latency times the performance multiplier is longer than + * the predicted duration, the C state is not considered a candidate + * for selection due to a too high performance impact. So the higher + * this multiplier is, the longer we need to be idle to pick a deep C + * state, and thus the less likely a busy CPU will hit such a deep + * C state. + * + * Two factors are used in determing this multiplier: + * a value of 10 is added for each point of "per cpu load average" we have. + * a value of 5 points is added for each process that is waiting for + * IO on this CPU. + * (these values are experimentally determined) + * + * The load average factor gives a longer term (few seconds) input to the + * decision, while the iowait value gives a cpu local instantanious input. + * The iowait factor may look low, but realize that this is also already + * represented in the system load average. + * + */ + +struct menu_device { + int needs_update; + int tick_wakeup; + + u64 next_timer_ns; + unsigned int bucket; + unsigned int correction_factor[BUCKETS]; + unsigned int intervals[INTERVALS]; + int interval_ptr; +}; + +static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters) +{ + int bucket = 0; + + /* + * We keep two groups of stats; one with no + * IO pending, one without. + * This allows us to calculate + * E(duration)|iowait + */ + if (nr_iowaiters) + bucket = BUCKETS/2; + + if (duration_ns < 10ULL * NSEC_PER_USEC) + return bucket; + if (duration_ns < 100ULL * NSEC_PER_USEC) + return bucket + 1; + if (duration_ns < 1000ULL * NSEC_PER_USEC) + return bucket + 2; + if (duration_ns < 10000ULL * NSEC_PER_USEC) + return bucket + 3; + if (duration_ns < 100000ULL * NSEC_PER_USEC) + return bucket + 4; + return bucket + 5; +} + +/* + * Return a multiplier for the exit latency that is intended + * to take performance requirements into account. + * The more performance critical we estimate the system + * to be, the higher this multiplier, and thus the higher + * the barrier to go to an expensive C state. + */ +static inline int performance_multiplier(unsigned int nr_iowaiters) +{ + /* for IO wait tasks (per cpu!) we add 10x each */ + return 1 + 10 * nr_iowaiters; +} + +static DEFINE_PER_CPU(struct menu_device, menu_devices); + +static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); + +/* + * Try detecting repeating patterns by keeping track of the last 8 + * intervals, and checking if the standard deviation of that set + * of points is below a threshold. If it is... then use the + * average of these 8 points as the estimated value. + */ +static unsigned int get_typical_interval(struct menu_device *data) +{ + int i, divisor; + unsigned int min, max, thresh, avg; + uint64_t sum, variance; + + thresh = INT_MAX; /* Discard outliers above this value */ + +again: + + /* First calculate the average of past intervals */ + min = UINT_MAX; + max = 0; + sum = 0; + divisor = 0; + for (i = 0; i < INTERVALS; i++) { + unsigned int value = data->intervals[i]; + if (value <= thresh) { + sum += value; + divisor++; + if (value > max) + max = value; + + if (value < min) + min = value; + } + } + + if (!max) + return UINT_MAX; + + if (divisor == INTERVALS) + avg = sum >> INTERVAL_SHIFT; + else + avg = div_u64(sum, divisor); + + /* Then try to determine variance */ + variance = 0; + for (i = 0; i < INTERVALS; i++) { + unsigned int value = data->intervals[i]; + if (value <= thresh) { + int64_t diff = (int64_t)value - avg; + variance += diff * diff; + } + } + if (divisor == INTERVALS) + variance >>= INTERVAL_SHIFT; + else + do_div(variance, divisor); + + /* + * The typical interval is obtained when standard deviation is + * small (stddev <= 20 us, variance <= 400 us^2) or standard + * deviation is small compared to the average interval (avg > + * 6*stddev, avg^2 > 36*variance). The average is smaller than + * UINT_MAX aka U32_MAX, so computing its square does not + * overflow a u64. We simply reject this candidate average if + * the standard deviation is greater than 715 s (which is + * rather unlikely). + * + * Use this result only if there is no timer to wake us up sooner. + */ + if (likely(variance <= U64_MAX/36)) { + if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3)) + || variance <= 400) { + return avg; + } + } + + /* + * If we have outliers to the upside in our distribution, discard + * those by setting the threshold to exclude these outliers, then + * calculate the average and standard deviation again. Once we get + * down to the bottom 3/4 of our samples, stop excluding samples. + * + * This can deal with workloads that have long pauses interspersed + * with sporadic activity with a bunch of short pauses. + */ + if ((divisor * 4) <= INTERVALS * 3) + return UINT_MAX; + + thresh = max - 1; + goto again; +} + +/** + * menu_select - selects the next idle state to enter + * @drv: cpuidle driver containing state data + * @dev: the CPU + * @stop_tick: indication on whether or not to stop the tick + */ +static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, + bool *stop_tick) +{ + struct menu_device *data = this_cpu_ptr(&menu_devices); + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); + u64 predicted_ns; + u64 interactivity_req; + unsigned int nr_iowaiters; + ktime_t delta, delta_tick; + int i, idx; + + if (data->needs_update) { + menu_update(drv, dev); + data->needs_update = 0; + } + + nr_iowaiters = nr_iowait_cpu(dev->cpu); + + /* Find the shortest expected idle interval. */ + predicted_ns = get_typical_interval(data) * NSEC_PER_USEC; + if (predicted_ns > RESIDENCY_THRESHOLD_NS) { + unsigned int timer_us; + + /* Determine the time till the closest timer. */ + delta = tick_nohz_get_sleep_length(&delta_tick); + if (unlikely(delta < 0)) { + delta = 0; + delta_tick = 0; + } + + data->next_timer_ns = delta; + data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); + + /* Round up the result for half microseconds. */ + timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 + + data->next_timer_ns * + data->correction_factor[data->bucket], + RESOLUTION * DECAY * NSEC_PER_USEC); + /* Use the lowest expected idle interval to pick the idle state. */ + predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns); + } else { + /* + * Because the next timer event is not going to be determined + * in this case, assume that without the tick the closest timer + * will be in distant future and that the closest tick will occur + * after 1/2 of the tick period. + */ + data->next_timer_ns = KTIME_MAX; + delta_tick = TICK_NSEC / 2; + data->bucket = which_bucket(KTIME_MAX, nr_iowaiters); + } + + if (unlikely(drv->state_count <= 1 || latency_req == 0) || + ((data->next_timer_ns < drv->states[1].target_residency_ns || + latency_req < drv->states[1].exit_latency_ns) && + !dev->states_usage[0].disable)) { + /* + * In this case state[0] will be used no matter what, so return + * it right away and keep the tick running if state[0] is a + * polling one. + */ + *stop_tick = !(drv->states[0].flags & CPUIDLE_FLAG_POLLING); + return 0; + } + + if (tick_nohz_tick_stopped()) { + /* + * If the tick is already stopped, the cost of possible short + * idle duration misprediction is much higher, because the CPU + * may be stuck in a shallow idle state for a long time as a + * result of it. In that case say we might mispredict and use + * the known time till the closest timer event for the idle + * state selection. + */ + if (predicted_ns < TICK_NSEC) + predicted_ns = data->next_timer_ns; + } else { + /* + * Use the performance multiplier and the user-configurable + * latency_req to determine the maximum exit latency. + */ + interactivity_req = div64_u64(predicted_ns, + performance_multiplier(nr_iowaiters)); + if (latency_req > interactivity_req) + latency_req = interactivity_req; + } + + /* + * Find the idle state with the lowest power while satisfying + * our constraints. + */ + idx = -1; + for (i = 0; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + + if (dev->states_usage[i].disable) + continue; + + if (idx == -1) + idx = i; /* first enabled state */ + + if (s->target_residency_ns > predicted_ns) { + /* + * Use a physical idle state, not busy polling, unless + * a timer is going to trigger soon enough. + */ + if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) && + s->exit_latency_ns <= latency_req && + s->target_residency_ns <= data->next_timer_ns) { + predicted_ns = s->target_residency_ns; + idx = i; + break; + } + if (predicted_ns < TICK_NSEC) + break; + + if (!tick_nohz_tick_stopped()) { + /* + * If the state selected so far is shallow, + * waking up early won't hurt, so retain the + * tick in that case and let the governor run + * again in the next iteration of the loop. + */ + predicted_ns = drv->states[idx].target_residency_ns; + break; + } + + /* + * If the state selected so far is shallow and this + * state's target residency matches the time till the + * closest timer event, select this one to avoid getting + * stuck in the shallow one for too long. + */ + if (drv->states[idx].target_residency_ns < TICK_NSEC && + s->target_residency_ns <= delta_tick) + idx = i; + + return idx; + } + if (s->exit_latency_ns > latency_req) + break; + + idx = i; + } + + if (idx == -1) + idx = 0; /* No states enabled. Must use 0. */ + + /* + * Don't stop the tick if the selected state is a polling one or if the + * expected idle duration is shorter than the tick period length. + */ + if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || + predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) { + *stop_tick = false; + + if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) { + /* + * The tick is not going to be stopped and the target + * residency of the state to be returned is not within + * the time until the next timer event including the + * tick, so try to correct that. + */ + for (i = idx - 1; i >= 0; i--) { + if (dev->states_usage[i].disable) + continue; + + idx = i; + if (drv->states[i].target_residency_ns <= delta_tick) + break; + } + } + } + + return idx; +} + +/** + * menu_reflect - records that data structures need update + * @dev: the CPU + * @index: the index of actual entered state + * + * NOTE: it's important to be fast here because this operation will add to + * the overall exit latency. + */ +static void menu_reflect(struct cpuidle_device *dev, int index) +{ + struct menu_device *data = this_cpu_ptr(&menu_devices); + + dev->last_state_idx = index; + data->needs_update = 1; + data->tick_wakeup = tick_nohz_idle_got_tick(); +} + +/** + * menu_update - attempts to guess what happened after entry + * @drv: cpuidle driver containing state data + * @dev: the CPU + */ +static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) +{ + struct menu_device *data = this_cpu_ptr(&menu_devices); + int last_idx = dev->last_state_idx; + struct cpuidle_state *target = &drv->states[last_idx]; + u64 measured_ns; + unsigned int new_factor; + + /* + * Try to figure out how much time passed between entry to low + * power state and occurrence of the wakeup event. + * + * If the entered idle state didn't support residency measurements, + * we use them anyway if they are short, and if long, + * truncate to the whole expected time. + * + * Any measured amount of time will include the exit latency. + * Since we are interested in when the wakeup begun, not when it + * was completed, we must subtract the exit latency. However, if + * the measured amount of time is less than the exit latency, + * assume the state was never reached and the exit latency is 0. + */ + + if (data->tick_wakeup && data->next_timer_ns > TICK_NSEC) { + /* + * The nohz code said that there wouldn't be any events within + * the tick boundary (if the tick was stopped), but the idle + * duration predictor had a differing opinion. Since the CPU + * was woken up by a tick (that wasn't stopped after all), the + * predictor was not quite right, so assume that the CPU could + * have been idle long (but not forever) to help the idle + * duration predictor do a better job next time. + */ + measured_ns = 9 * MAX_INTERESTING / 10; + } else if ((drv->states[last_idx].flags & CPUIDLE_FLAG_POLLING) && + dev->poll_time_limit) { + /* + * The CPU exited the "polling" state due to a time limit, so + * the idle duration prediction leading to the selection of that + * state was inaccurate. If a better prediction had been made, + * the CPU might have been woken up from idle by the next timer. + * Assume that to be the case. + */ + measured_ns = data->next_timer_ns; + } else { + /* measured value */ + measured_ns = dev->last_residency_ns; + + /* Deduct exit latency */ + if (measured_ns > 2 * target->exit_latency_ns) + measured_ns -= target->exit_latency_ns; + else + measured_ns /= 2; + } + + /* Make sure our coefficients do not exceed unity */ + if (measured_ns > data->next_timer_ns) + measured_ns = data->next_timer_ns; + + /* Update our correction ratio */ + new_factor = data->correction_factor[data->bucket]; + new_factor -= new_factor / DECAY; + + if (data->next_timer_ns > 0 && measured_ns < MAX_INTERESTING) + new_factor += div64_u64(RESOLUTION * measured_ns, + data->next_timer_ns); + else + /* + * we were idle so long that we count it as a perfect + * prediction + */ + new_factor += RESOLUTION; + + /* + * We don't want 0 as factor; we always want at least + * a tiny bit of estimated time. Fortunately, due to rounding, + * new_factor will stay nonzero regardless of measured_us values + * and the compiler can eliminate this test as long as DECAY > 1. + */ + if (DECAY == 1 && unlikely(new_factor == 0)) + new_factor = 1; + + data->correction_factor[data->bucket] = new_factor; + + /* update the repeating-pattern data */ + data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns); + if (data->interval_ptr >= INTERVALS) + data->interval_ptr = 0; +} + +/** + * menu_enable_device - scans a CPU's states and does setup + * @drv: cpuidle driver + * @dev: the CPU + */ +static int menu_enable_device(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + struct menu_device *data = &per_cpu(menu_devices, dev->cpu); + int i; + + memset(data, 0, sizeof(struct menu_device)); + + /* + * if the correction factor is 0 (eg first time init or cpu hotplug + * etc), we actually want to start out with a unity factor. + */ + for(i = 0; i < BUCKETS; i++) + data->correction_factor[i] = RESOLUTION * DECAY; + + return 0; +} + +static struct cpuidle_governor menu_governor = { + .name = "menu", + .rating = 20, + .enable = menu_enable_device, + .select = menu_select, + .reflect = menu_reflect, +}; + +/** + * init_menu - initializes the governor + */ +static int __init init_menu(void) +{ + return cpuidle_register_governor(&menu_governor); +} + +postcore_initcall(init_menu); diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c new file mode 100644 index 0000000000..7244f71c59 --- /dev/null +++ b/drivers/cpuidle/governors/teo.c @@ -0,0 +1,695 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Timer events oriented CPU idle governor + * + * TEO governor: + * Copyright (C) 2018 - 2021 Intel Corporation + * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + * + * Util-awareness mechanism: + * Copyright (C) 2022 Arm Ltd. + * Author: Kajetan Puchalski <kajetan.puchalski@arm.com> + */ + +/** + * DOC: teo-description + * + * The idea of this governor is based on the observation that on many systems + * timer events are two or more orders of magnitude more frequent than any + * other interrupts, so they are likely to be the most significant cause of CPU + * wakeups from idle states. Moreover, information about what happened in the + * (relatively recent) past can be used to estimate whether or not the deepest + * idle state with target residency within the (known) time till the closest + * timer event, referred to as the sleep length, is likely to be suitable for + * the upcoming CPU idle period and, if not, then which of the shallower idle + * states to choose instead of it. + * + * Of course, non-timer wakeup sources are more important in some use cases + * which can be covered by taking a few most recent idle time intervals of the + * CPU into account. However, even in that context it is not necessary to + * consider idle duration values greater than the sleep length, because the + * closest timer will ultimately wake up the CPU anyway unless it is woken up + * earlier. + * + * Thus this governor estimates whether or not the prospective idle duration of + * a CPU is likely to be significantly shorter than the sleep length and selects + * an idle state for it accordingly. + * + * The computations carried out by this governor are based on using bins whose + * boundaries are aligned with the target residency parameter values of the CPU + * idle states provided by the %CPUIdle driver in the ascending order. That is, + * the first bin spans from 0 up to, but not including, the target residency of + * the second idle state (idle state 1), the second bin spans from the target + * residency of idle state 1 up to, but not including, the target residency of + * idle state 2, the third bin spans from the target residency of idle state 2 + * up to, but not including, the target residency of idle state 3 and so on. + * The last bin spans from the target residency of the deepest idle state + * supplied by the driver to infinity. + * + * Two metrics called "hits" and "intercepts" are associated with each bin. + * They are updated every time before selecting an idle state for the given CPU + * in accordance with what happened last time. + * + * The "hits" metric reflects the relative frequency of situations in which the + * sleep length and the idle duration measured after CPU wakeup fall into the + * same bin (that is, the CPU appears to wake up "on time" relative to the sleep + * length). In turn, the "intercepts" metric reflects the relative frequency of + * situations in which the measured idle duration is so much shorter than the + * sleep length that the bin it falls into corresponds to an idle state + * shallower than the one whose bin is fallen into by the sleep length (these + * situations are referred to as "intercepts" below). + * + * In addition to the metrics described above, the governor counts recent + * intercepts (that is, intercepts that have occurred during the last + * %NR_RECENT invocations of it for the given CPU) for each bin. + * + * In order to select an idle state for a CPU, the governor takes the following + * steps (modulo the possible latency constraint that must be taken into account + * too): + * + * 1. Find the deepest CPU idle state whose target residency does not exceed + * the current sleep length (the candidate idle state) and compute 3 sums as + * follows: + * + * - The sum of the "hits" and "intercepts" metrics for the candidate state + * and all of the deeper idle states (it represents the cases in which the + * CPU was idle long enough to avoid being intercepted if the sleep length + * had been equal to the current one). + * + * - The sum of the "intercepts" metrics for all of the idle states shallower + * than the candidate one (it represents the cases in which the CPU was not + * idle long enough to avoid being intercepted if the sleep length had been + * equal to the current one). + * + * - The sum of the numbers of recent intercepts for all of the idle states + * shallower than the candidate one. + * + * 2. If the second sum is greater than the first one or the third sum is + * greater than %NR_RECENT / 2, the CPU is likely to wake up early, so look + * for an alternative idle state to select. + * + * - Traverse the idle states shallower than the candidate one in the + * descending order. + * + * - For each of them compute the sum of the "intercepts" metrics and the sum + * of the numbers of recent intercepts over all of the idle states between + * it and the candidate one (including the former and excluding the + * latter). + * + * - If each of these sums that needs to be taken into account (because the + * check related to it has indicated that the CPU is likely to wake up + * early) is greater than a half of the corresponding sum computed in step + * 1 (which means that the target residency of the state in question had + * not exceeded the idle duration in over a half of the relevant cases), + * select the given idle state instead of the candidate one. + * + * 3. By default, select the candidate state. + * + * Util-awareness mechanism: + * + * The idea behind the util-awareness extension is that there are two distinct + * scenarios for the CPU which should result in two different approaches to idle + * state selection - utilized and not utilized. + * + * In this case, 'utilized' means that the average runqueue util of the CPU is + * above a certain threshold. + * + * When the CPU is utilized while going into idle, more likely than not it will + * be woken up to do more work soon and so a shallower idle state should be + * selected to minimise latency and maximise performance. When the CPU is not + * being utilized, the usual metrics-based approach to selecting the deepest + * available idle state should be preferred to take advantage of the power + * saving. + * + * In order to achieve this, the governor uses a utilization threshold. + * The threshold is computed per-CPU as a percentage of the CPU's capacity + * by bit shifting the capacity value. Based on testing, the shift of 6 (~1.56%) + * seems to be getting the best results. + * + * Before selecting the next idle state, the governor compares the current CPU + * util to the precomputed util threshold. If it's below, it defaults to the + * TEO metrics mechanism. If it's above, the closest shallower idle state will + * be selected instead, as long as is not a polling state. + */ + +#include <linux/cpuidle.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/sched/topology.h> +#include <linux/tick.h> + +#include "gov.h" + +/* + * The number of bits to shift the CPU's capacity by in order to determine + * the utilized threshold. + * + * 6 was chosen based on testing as the number that achieved the best balance + * of power and performance on average. + * + * The resulting threshold is high enough to not be triggered by background + * noise and low enough to react quickly when activity starts to ramp up. + */ +#define UTIL_THRESHOLD_SHIFT 6 + +/* + * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value + * is used for decreasing metrics on a regular basis. + */ +#define PULSE 1024 +#define DECAY_SHIFT 3 + +/* + * Number of the most recent idle duration values to take into consideration for + * the detection of recent early wakeup patterns. + */ +#define NR_RECENT 9 + +/** + * struct teo_bin - Metrics used by the TEO cpuidle governor. + * @intercepts: The "intercepts" metric. + * @hits: The "hits" metric. + * @recent: The number of recent "intercepts". + */ +struct teo_bin { + unsigned int intercepts; + unsigned int hits; + unsigned int recent; +}; + +/** + * struct teo_cpu - CPU data used by the TEO cpuidle governor. + * @time_span_ns: Time between idle state selection and post-wakeup update. + * @sleep_length_ns: Time till the closest timer event (at the selection time). + * @state_bins: Idle state data bins for this CPU. + * @total: Grand total of the "intercepts" and "hits" metrics for all bins. + * @next_recent_idx: Index of the next @recent_idx entry to update. + * @recent_idx: Indices of bins corresponding to recent "intercepts". + * @tick_hits: Number of "hits" after TICK_NSEC. + * @util_threshold: Threshold above which the CPU is considered utilized + */ +struct teo_cpu { + s64 time_span_ns; + s64 sleep_length_ns; + struct teo_bin state_bins[CPUIDLE_STATE_MAX]; + unsigned int total; + int next_recent_idx; + int recent_idx[NR_RECENT]; + unsigned int tick_hits; + unsigned long util_threshold; +}; + +static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); + +/** + * teo_cpu_is_utilized - Check if the CPU's util is above the threshold + * @cpu: Target CPU + * @cpu_data: Governor CPU data for the target CPU + */ +#ifdef CONFIG_SMP +static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data) +{ + return sched_cpu_util(cpu) > cpu_data->util_threshold; +} +#else +static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data) +{ + return false; +} +#endif + +/** + * teo_update - Update CPU metrics after wakeup. + * @drv: cpuidle driver containing state data. + * @dev: Target CPU. + */ +static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) +{ + struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); + int i, idx_timer = 0, idx_duration = 0; + s64 target_residency_ns; + u64 measured_ns; + + if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { + /* + * One of the safety nets has triggered or the wakeup was close + * enough to the closest timer event expected at the idle state + * selection time to be discarded. + */ + measured_ns = U64_MAX; + } else { + u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; + + /* + * The computations below are to determine whether or not the + * (saved) time till the next timer event and the measured idle + * duration fall into the same "bin", so use last_residency_ns + * for that instead of time_span_ns which includes the cpuidle + * overhead. + */ + measured_ns = dev->last_residency_ns; + /* + * The delay between the wakeup and the first instruction + * executed by the CPU is not likely to be worst-case every + * time, so take 1/2 of the exit latency as a very rough + * approximation of the average of it. + */ + if (measured_ns >= lat_ns) + measured_ns -= lat_ns / 2; + else + measured_ns /= 2; + } + + cpu_data->total = 0; + + /* + * Decay the "hits" and "intercepts" metrics for all of the bins and + * find the bins that the sleep length and the measured idle duration + * fall into. + */ + for (i = 0; i < drv->state_count; i++) { + struct teo_bin *bin = &cpu_data->state_bins[i]; + + bin->hits -= bin->hits >> DECAY_SHIFT; + bin->intercepts -= bin->intercepts >> DECAY_SHIFT; + + cpu_data->total += bin->hits + bin->intercepts; + + target_residency_ns = drv->states[i].target_residency_ns; + + if (target_residency_ns <= cpu_data->sleep_length_ns) { + idx_timer = i; + if (target_residency_ns <= measured_ns) + idx_duration = i; + } + } + + i = cpu_data->next_recent_idx++; + if (cpu_data->next_recent_idx >= NR_RECENT) + cpu_data->next_recent_idx = 0; + + if (cpu_data->recent_idx[i] >= 0) + cpu_data->state_bins[cpu_data->recent_idx[i]].recent--; + + /* + * If the deepest state's target residency is below the tick length, + * make a record of it to help teo_select() decide whether or not + * to stop the tick. This effectively adds an extra hits-only bin + * beyond the last state-related one. + */ + if (target_residency_ns < TICK_NSEC) { + cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT; + + cpu_data->total += cpu_data->tick_hits; + + if (TICK_NSEC <= cpu_data->sleep_length_ns) { + idx_timer = drv->state_count; + if (TICK_NSEC <= measured_ns) { + cpu_data->tick_hits += PULSE; + goto end; + } + } + } + + /* + * If the measured idle duration falls into the same bin as the sleep + * length, this is a "hit", so update the "hits" metric for that bin. + * Otherwise, update the "intercepts" metric for the bin fallen into by + * the measured idle duration. + */ + if (idx_timer == idx_duration) { + cpu_data->state_bins[idx_timer].hits += PULSE; + cpu_data->recent_idx[i] = -1; + } else { + cpu_data->state_bins[idx_duration].intercepts += PULSE; + cpu_data->state_bins[idx_duration].recent++; + cpu_data->recent_idx[i] = idx_duration; + } + +end: + cpu_data->total += PULSE; +} + +static bool teo_state_ok(int i, struct cpuidle_driver *drv) +{ + return !tick_nohz_tick_stopped() || + drv->states[i].target_residency_ns >= TICK_NSEC; +} + +/** + * teo_find_shallower_state - Find shallower idle state matching given duration. + * @drv: cpuidle driver containing state data. + * @dev: Target CPU. + * @state_idx: Index of the capping idle state. + * @duration_ns: Idle duration value to match. + * @no_poll: Don't consider polling states. + */ +static int teo_find_shallower_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev, int state_idx, + s64 duration_ns, bool no_poll) +{ + int i; + + for (i = state_idx - 1; i >= 0; i--) { + if (dev->states_usage[i].disable || + (no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING)) + continue; + + state_idx = i; + if (drv->states[i].target_residency_ns <= duration_ns) + break; + } + return state_idx; +} + +/** + * teo_select - Selects the next idle state to enter. + * @drv: cpuidle driver containing state data. + * @dev: Target CPU. + * @stop_tick: Indication on whether or not to stop the scheduler tick. + */ +static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, + bool *stop_tick) +{ + struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); + ktime_t delta_tick = TICK_NSEC / 2; + unsigned int tick_intercept_sum = 0; + unsigned int idx_intercept_sum = 0; + unsigned int intercept_sum = 0; + unsigned int idx_recent_sum = 0; + unsigned int recent_sum = 0; + unsigned int idx_hit_sum = 0; + unsigned int hit_sum = 0; + int constraint_idx = 0; + int idx0 = 0, idx = -1; + bool alt_intercepts, alt_recent; + bool cpu_utilized; + s64 duration_ns; + int i; + + if (dev->last_state_idx >= 0) { + teo_update(drv, dev); + dev->last_state_idx = -1; + } + + cpu_data->time_span_ns = local_clock(); + /* + * Set the expected sleep length to infinity in case of an early + * return. + */ + cpu_data->sleep_length_ns = KTIME_MAX; + + /* Check if there is any choice in the first place. */ + if (drv->state_count < 2) { + idx = 0; + goto out_tick; + } + + if (!dev->states_usage[0].disable) + idx = 0; + + cpu_utilized = teo_cpu_is_utilized(dev->cpu, cpu_data); + /* + * If the CPU is being utilized over the threshold and there are only 2 + * states to choose from, the metrics need not be considered, so choose + * the shallowest non-polling state and exit. + */ + if (drv->state_count < 3 && cpu_utilized) { + /* + * If state 0 is enabled and it is not a polling one, select it + * right away unless the scheduler tick has been stopped, in + * which case care needs to be taken to leave the CPU in a deep + * enough state in case it is not woken up any time soon after + * all. If state 1 is disabled, though, state 0 must be used + * anyway. + */ + if ((!idx && !(drv->states[0].flags & CPUIDLE_FLAG_POLLING) && + teo_state_ok(0, drv)) || dev->states_usage[1].disable) { + idx = 0; + goto out_tick; + } + /* Assume that state 1 is not a polling one and use it. */ + idx = 1; + duration_ns = drv->states[1].target_residency_ns; + goto end; + } + + /* Compute the sums of metrics for early wakeup pattern detection. */ + for (i = 1; i < drv->state_count; i++) { + struct teo_bin *prev_bin = &cpu_data->state_bins[i-1]; + struct cpuidle_state *s = &drv->states[i]; + + /* + * Update the sums of idle state mertics for all of the states + * shallower than the current one. + */ + intercept_sum += prev_bin->intercepts; + hit_sum += prev_bin->hits; + recent_sum += prev_bin->recent; + + if (dev->states_usage[i].disable) + continue; + + if (idx < 0) + idx0 = i; /* first enabled state */ + + idx = i; + + if (s->exit_latency_ns <= latency_req) + constraint_idx = i; + + /* Save the sums for the current state. */ + idx_intercept_sum = intercept_sum; + idx_hit_sum = hit_sum; + idx_recent_sum = recent_sum; + } + + /* Avoid unnecessary overhead. */ + if (idx < 0) { + idx = 0; /* No states enabled, must use 0. */ + goto out_tick; + } + + if (idx == idx0) { + /* + * Only one idle state is enabled, so use it, but do not + * allow the tick to be stopped it is shallow enough. + */ + duration_ns = drv->states[idx].target_residency_ns; + goto end; + } + + tick_intercept_sum = intercept_sum + + cpu_data->state_bins[drv->state_count-1].intercepts; + + /* + * If the sum of the intercepts metric for all of the idle states + * shallower than the current candidate one (idx) is greater than the + * sum of the intercepts and hits metrics for the candidate state and + * all of the deeper states, or the sum of the numbers of recent + * intercepts over all of the states shallower than the candidate one + * is greater than a half of the number of recent events taken into + * account, a shallower idle state is likely to be a better choice. + */ + alt_intercepts = 2 * idx_intercept_sum > cpu_data->total - idx_hit_sum; + alt_recent = idx_recent_sum > NR_RECENT / 2; + if (alt_recent || alt_intercepts) { + int first_suitable_idx = idx; + + /* + * Look for the deepest idle state whose target residency had + * not exceeded the idle duration in over a half of the relevant + * cases (both with respect to intercepts overall and with + * respect to the recent intercepts only) in the past. + * + * Take the possible duration limitation present if the tick + * has been stopped already into account. + */ + intercept_sum = 0; + recent_sum = 0; + + for (i = idx - 1; i >= 0; i--) { + struct teo_bin *bin = &cpu_data->state_bins[i]; + + intercept_sum += bin->intercepts; + recent_sum += bin->recent; + + if ((!alt_recent || 2 * recent_sum > idx_recent_sum) && + (!alt_intercepts || + 2 * intercept_sum > idx_intercept_sum)) { + /* + * Use the current state unless it is too + * shallow or disabled, in which case take the + * first enabled state that is deep enough. + */ + if (teo_state_ok(i, drv) && + !dev->states_usage[i].disable) + idx = i; + else + idx = first_suitable_idx; + + break; + } + + if (dev->states_usage[i].disable) + continue; + + if (!teo_state_ok(i, drv)) { + /* + * The current state is too shallow, but if an + * alternative candidate state has been found, + * it may still turn out to be a better choice. + */ + if (first_suitable_idx != idx) + continue; + + break; + } + + first_suitable_idx = i; + } + } + + /* + * If there is a latency constraint, it may be necessary to select an + * idle state shallower than the current candidate one. + */ + if (idx > constraint_idx) + idx = constraint_idx; + + /* + * If the CPU is being utilized over the threshold, choose a shallower + * non-polling state to improve latency, unless the scheduler tick has + * been stopped already and the shallower state's target residency is + * not sufficiently large. + */ + if (cpu_utilized) { + i = teo_find_shallower_state(drv, dev, idx, KTIME_MAX, true); + if (teo_state_ok(i, drv)) + idx = i; + } + + /* + * Skip the timers check if state 0 is the current candidate one, + * because an immediate non-timer wakeup is expected in that case. + */ + if (!idx) + goto out_tick; + + /* + * If state 0 is a polling one, check if the target residency of + * the current candidate state is low enough and skip the timers + * check in that case too. + */ + if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) && + drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) + goto out_tick; + + duration_ns = tick_nohz_get_sleep_length(&delta_tick); + cpu_data->sleep_length_ns = duration_ns; + + /* + * If the closest expected timer is before the terget residency of the + * candidate state, a shallower one needs to be found. + */ + if (drv->states[idx].target_residency_ns > duration_ns) { + i = teo_find_shallower_state(drv, dev, idx, duration_ns, false); + if (teo_state_ok(i, drv)) + idx = i; + } + + /* + * If the selected state's target residency is below the tick length + * and intercepts occurring before the tick length are the majority of + * total wakeup events, do not stop the tick. + */ + if (drv->states[idx].target_residency_ns < TICK_NSEC && + tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8) + duration_ns = TICK_NSEC / 2; + +end: + /* + * Allow the tick to be stopped unless the selected state is a polling + * one or the expected idle duration is shorter than the tick period + * length. + */ + if ((!(drv->states[idx].flags & CPUIDLE_FLAG_POLLING) && + duration_ns >= TICK_NSEC) || tick_nohz_tick_stopped()) + return idx; + + /* + * The tick is not going to be stopped, so if the target residency of + * the state to be returned is not within the time till the closest + * timer including the tick, try to correct that. + */ + if (idx > idx0 && + drv->states[idx].target_residency_ns > delta_tick) + idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false); + +out_tick: + *stop_tick = false; + return idx; +} + +/** + * teo_reflect - Note that governor data for the CPU need to be updated. + * @dev: Target CPU. + * @state: Entered state. + */ +static void teo_reflect(struct cpuidle_device *dev, int state) +{ + struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); + + dev->last_state_idx = state; + /* + * If the wakeup was not "natural", but triggered by one of the safety + * nets, assume that the CPU might have been idle for the entire sleep + * length time. + */ + if (dev->poll_time_limit || + (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { + dev->poll_time_limit = false; + cpu_data->time_span_ns = cpu_data->sleep_length_ns; + } else { + cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns; + } +} + +/** + * teo_enable_device - Initialize the governor's data for the target CPU. + * @drv: cpuidle driver (not used). + * @dev: Target CPU. + */ +static int teo_enable_device(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); + unsigned long max_capacity = arch_scale_cpu_capacity(dev->cpu); + int i; + + memset(cpu_data, 0, sizeof(*cpu_data)); + cpu_data->util_threshold = max_capacity >> UTIL_THRESHOLD_SHIFT; + + for (i = 0; i < NR_RECENT; i++) + cpu_data->recent_idx[i] = -1; + + return 0; +} + +static struct cpuidle_governor teo_governor = { + .name = "teo", + .rating = 19, + .enable = teo_enable_device, + .select = teo_select, + .reflect = teo_reflect, +}; + +static int __init teo_governor_init(void) +{ + return cpuidle_register_governor(&teo_governor); +} + +postcore_initcall(teo_governor_init); diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c new file mode 100644 index 0000000000..9b6d90a726 --- /dev/null +++ b/drivers/cpuidle/poll_state.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * poll_state.c - Polling idle state + */ + +#include <linux/cpuidle.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/sched/idle.h> + +#define POLL_IDLE_RELAX_COUNT 200 + +static int __cpuidle poll_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + u64 time_start; + + time_start = local_clock_noinstr(); + + dev->poll_time_limit = false; + + raw_local_irq_enable(); + if (!current_set_polling_and_test()) { + unsigned int loop_count = 0; + u64 limit; + + limit = cpuidle_poll_time(drv, dev); + + while (!need_resched()) { + cpu_relax(); + if (loop_count++ < POLL_IDLE_RELAX_COUNT) + continue; + + loop_count = 0; + if (local_clock_noinstr() - time_start > limit) { + dev->poll_time_limit = true; + break; + } + } + } + raw_local_irq_disable(); + + current_clr_polling(); + + return index; +} + +void cpuidle_poll_state_init(struct cpuidle_driver *drv) +{ + struct cpuidle_state *state = &drv->states[0]; + + snprintf(state->name, CPUIDLE_NAME_LEN, "POLL"); + snprintf(state->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE"); + state->exit_latency = 0; + state->target_residency = 0; + state->exit_latency_ns = 0; + state->target_residency_ns = 0; + state->power_usage = -1; + state->enter = poll_idle; + state->flags = CPUIDLE_FLAG_POLLING; +} +EXPORT_SYMBOL_GPL(cpuidle_poll_state_init); diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c new file mode 100644 index 0000000000..d6f5da61cb --- /dev/null +++ b/drivers/cpuidle/sysfs.c @@ -0,0 +1,747 @@ +/* + * sysfs.c - sysfs support + * + * (C) 2006-2007 Shaohua Li <shaohua.li@intel.com> + * + * This code is licenced under the GPL. + */ + +#include <linux/kernel.h> +#include <linux/cpuidle.h> +#include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/completion.h> +#include <linux/capability.h> +#include <linux/device.h> +#include <linux/kobject.h> + +#include "cpuidle.h" + +static ssize_t show_available_governors(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t i = 0; + struct cpuidle_governor *tmp; + + mutex_lock(&cpuidle_lock); + list_for_each_entry(tmp, &cpuidle_governors, governor_list) { + if (i >= (ssize_t) (PAGE_SIZE - (CPUIDLE_NAME_LEN + 2))) + goto out; + + i += scnprintf(&buf[i], CPUIDLE_NAME_LEN + 1, "%s ", tmp->name); + } + +out: + i+= sprintf(&buf[i], "\n"); + mutex_unlock(&cpuidle_lock); + return i; +} + +static ssize_t show_current_driver(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret; + struct cpuidle_driver *drv; + + spin_lock(&cpuidle_driver_lock); + drv = cpuidle_get_driver(); + if (drv) + ret = sprintf(buf, "%s\n", drv->name); + else + ret = sprintf(buf, "none\n"); + spin_unlock(&cpuidle_driver_lock); + + return ret; +} + +static ssize_t show_current_governor(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t ret; + + mutex_lock(&cpuidle_lock); + if (cpuidle_curr_governor) + ret = sprintf(buf, "%s\n", cpuidle_curr_governor->name); + else + ret = sprintf(buf, "none\n"); + mutex_unlock(&cpuidle_lock); + + return ret; +} + +static ssize_t store_current_governor(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + char gov_name[CPUIDLE_NAME_LEN + 1]; + int ret; + struct cpuidle_governor *gov; + + ret = sscanf(buf, "%" __stringify(CPUIDLE_NAME_LEN) "s", gov_name); + if (ret != 1) + return -EINVAL; + + mutex_lock(&cpuidle_lock); + ret = -EINVAL; + list_for_each_entry(gov, &cpuidle_governors, governor_list) { + if (!strncmp(gov->name, gov_name, CPUIDLE_NAME_LEN)) { + ret = cpuidle_switch_governor(gov); + break; + } + } + mutex_unlock(&cpuidle_lock); + + return ret ? ret : count; +} + +static DEVICE_ATTR(available_governors, 0444, show_available_governors, NULL); +static DEVICE_ATTR(current_driver, 0444, show_current_driver, NULL); +static DEVICE_ATTR(current_governor, 0644, show_current_governor, + store_current_governor); +static DEVICE_ATTR(current_governor_ro, 0444, show_current_governor, NULL); + +static struct attribute *cpuidle_attrs[] = { + &dev_attr_available_governors.attr, + &dev_attr_current_driver.attr, + &dev_attr_current_governor.attr, + &dev_attr_current_governor_ro.attr, + NULL +}; + +static struct attribute_group cpuidle_attr_group = { + .attrs = cpuidle_attrs, + .name = "cpuidle", +}; + +/** + * cpuidle_add_interface - add CPU global sysfs attributes + */ +int cpuidle_add_interface(void) +{ + struct device *dev_root = bus_get_dev_root(&cpu_subsys); + int retval; + + if (!dev_root) + return -EINVAL; + + retval = sysfs_create_group(&dev_root->kobj, &cpuidle_attr_group); + put_device(dev_root); + return retval; +} + +/** + * cpuidle_remove_interface - remove CPU global sysfs attributes + * @dev: the target device + */ +void cpuidle_remove_interface(struct device *dev) +{ + sysfs_remove_group(&dev->kobj, &cpuidle_attr_group); +} + +struct cpuidle_attr { + struct attribute attr; + ssize_t (*show)(struct cpuidle_device *, char *); + ssize_t (*store)(struct cpuidle_device *, const char *, size_t count); +}; + +#define attr_to_cpuidleattr(a) container_of(a, struct cpuidle_attr, attr) + +struct cpuidle_device_kobj { + struct cpuidle_device *dev; + struct completion kobj_unregister; + struct kobject kobj; +}; + +static inline struct cpuidle_device *to_cpuidle_device(struct kobject *kobj) +{ + struct cpuidle_device_kobj *kdev = + container_of(kobj, struct cpuidle_device_kobj, kobj); + + return kdev->dev; +} + +static ssize_t cpuidle_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int ret = -EIO; + struct cpuidle_device *dev = to_cpuidle_device(kobj); + struct cpuidle_attr *cattr = attr_to_cpuidleattr(attr); + + if (cattr->show) { + mutex_lock(&cpuidle_lock); + ret = cattr->show(dev, buf); + mutex_unlock(&cpuidle_lock); + } + return ret; +} + +static ssize_t cpuidle_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + int ret = -EIO; + struct cpuidle_device *dev = to_cpuidle_device(kobj); + struct cpuidle_attr *cattr = attr_to_cpuidleattr(attr); + + if (cattr->store) { + mutex_lock(&cpuidle_lock); + ret = cattr->store(dev, buf, count); + mutex_unlock(&cpuidle_lock); + } + return ret; +} + +static const struct sysfs_ops cpuidle_sysfs_ops = { + .show = cpuidle_show, + .store = cpuidle_store, +}; + +static void cpuidle_sysfs_release(struct kobject *kobj) +{ + struct cpuidle_device_kobj *kdev = + container_of(kobj, struct cpuidle_device_kobj, kobj); + + complete(&kdev->kobj_unregister); +} + +static const struct kobj_type ktype_cpuidle = { + .sysfs_ops = &cpuidle_sysfs_ops, + .release = cpuidle_sysfs_release, +}; + +struct cpuidle_state_attr { + struct attribute attr; + ssize_t (*show)(struct cpuidle_state *, \ + struct cpuidle_state_usage *, char *); + ssize_t (*store)(struct cpuidle_state *, \ + struct cpuidle_state_usage *, const char *, size_t); +}; + +#define define_one_state_ro(_name, show) \ +static struct cpuidle_state_attr attr_##_name = __ATTR(_name, 0444, show, NULL) + +#define define_one_state_rw(_name, show, store) \ +static struct cpuidle_state_attr attr_##_name = __ATTR(_name, 0644, show, store) + +#define define_show_state_function(_name) \ +static ssize_t show_state_##_name(struct cpuidle_state *state, \ + struct cpuidle_state_usage *state_usage, char *buf) \ +{ \ + return sprintf(buf, "%u\n", state->_name);\ +} + +#define define_show_state_ull_function(_name) \ +static ssize_t show_state_##_name(struct cpuidle_state *state, \ + struct cpuidle_state_usage *state_usage, \ + char *buf) \ +{ \ + return sprintf(buf, "%llu\n", state_usage->_name);\ +} + +#define define_show_state_str_function(_name) \ +static ssize_t show_state_##_name(struct cpuidle_state *state, \ + struct cpuidle_state_usage *state_usage, \ + char *buf) \ +{ \ + if (state->_name[0] == '\0')\ + return sprintf(buf, "<null>\n");\ + return sprintf(buf, "%s\n", state->_name);\ +} + +#define define_show_state_time_function(_name) \ +static ssize_t show_state_##_name(struct cpuidle_state *state, \ + struct cpuidle_state_usage *state_usage, \ + char *buf) \ +{ \ + return sprintf(buf, "%llu\n", ktime_to_us(state->_name##_ns)); \ +} + +define_show_state_time_function(exit_latency) +define_show_state_time_function(target_residency) +define_show_state_function(power_usage) +define_show_state_ull_function(usage) +define_show_state_ull_function(rejected) +define_show_state_str_function(name) +define_show_state_str_function(desc) +define_show_state_ull_function(above) +define_show_state_ull_function(below) + +static ssize_t show_state_time(struct cpuidle_state *state, + struct cpuidle_state_usage *state_usage, + char *buf) +{ + return sprintf(buf, "%llu\n", ktime_to_us(state_usage->time_ns)); +} + +static ssize_t show_state_disable(struct cpuidle_state *state, + struct cpuidle_state_usage *state_usage, + char *buf) +{ + return sprintf(buf, "%llu\n", + state_usage->disable & CPUIDLE_STATE_DISABLED_BY_USER); +} + +static ssize_t store_state_disable(struct cpuidle_state *state, + struct cpuidle_state_usage *state_usage, + const char *buf, size_t size) +{ + unsigned int value; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + err = kstrtouint(buf, 0, &value); + if (err) + return err; + + if (value) + state_usage->disable |= CPUIDLE_STATE_DISABLED_BY_USER; + else + state_usage->disable &= ~CPUIDLE_STATE_DISABLED_BY_USER; + + return size; +} + +static ssize_t show_state_default_status(struct cpuidle_state *state, + struct cpuidle_state_usage *state_usage, + char *buf) +{ + return sprintf(buf, "%s\n", + state->flags & CPUIDLE_FLAG_OFF ? "disabled" : "enabled"); +} + +define_one_state_ro(name, show_state_name); +define_one_state_ro(desc, show_state_desc); +define_one_state_ro(latency, show_state_exit_latency); +define_one_state_ro(residency, show_state_target_residency); +define_one_state_ro(power, show_state_power_usage); +define_one_state_ro(usage, show_state_usage); +define_one_state_ro(rejected, show_state_rejected); +define_one_state_ro(time, show_state_time); +define_one_state_rw(disable, show_state_disable, store_state_disable); +define_one_state_ro(above, show_state_above); +define_one_state_ro(below, show_state_below); +define_one_state_ro(default_status, show_state_default_status); + +static struct attribute *cpuidle_state_default_attrs[] = { + &attr_name.attr, + &attr_desc.attr, + &attr_latency.attr, + &attr_residency.attr, + &attr_power.attr, + &attr_usage.attr, + &attr_rejected.attr, + &attr_time.attr, + &attr_disable.attr, + &attr_above.attr, + &attr_below.attr, + &attr_default_status.attr, + NULL +}; +ATTRIBUTE_GROUPS(cpuidle_state_default); + +struct cpuidle_state_kobj { + struct cpuidle_state *state; + struct cpuidle_state_usage *state_usage; + struct completion kobj_unregister; + struct kobject kobj; + struct cpuidle_device *device; +}; + +#ifdef CONFIG_SUSPEND +#define define_show_state_s2idle_ull_function(_name) \ +static ssize_t show_state_s2idle_##_name(struct cpuidle_state *state, \ + struct cpuidle_state_usage *state_usage, \ + char *buf) \ +{ \ + return sprintf(buf, "%llu\n", state_usage->s2idle_##_name);\ +} + +define_show_state_s2idle_ull_function(usage); +define_show_state_s2idle_ull_function(time); + +#define define_one_state_s2idle_ro(_name, show) \ +static struct cpuidle_state_attr attr_s2idle_##_name = \ + __ATTR(_name, 0444, show, NULL) + +define_one_state_s2idle_ro(usage, show_state_s2idle_usage); +define_one_state_s2idle_ro(time, show_state_s2idle_time); + +static struct attribute *cpuidle_state_s2idle_attrs[] = { + &attr_s2idle_usage.attr, + &attr_s2idle_time.attr, + NULL +}; + +static const struct attribute_group cpuidle_state_s2idle_group = { + .name = "s2idle", + .attrs = cpuidle_state_s2idle_attrs, +}; + +static void cpuidle_add_s2idle_attr_group(struct cpuidle_state_kobj *kobj) +{ + int ret; + + if (!kobj->state->enter_s2idle) + return; + + ret = sysfs_create_group(&kobj->kobj, &cpuidle_state_s2idle_group); + if (ret) + pr_debug("%s: sysfs attribute group not created\n", __func__); +} + +static void cpuidle_remove_s2idle_attr_group(struct cpuidle_state_kobj *kobj) +{ + if (kobj->state->enter_s2idle) + sysfs_remove_group(&kobj->kobj, &cpuidle_state_s2idle_group); +} +#else +static inline void cpuidle_add_s2idle_attr_group(struct cpuidle_state_kobj *kobj) { } +static inline void cpuidle_remove_s2idle_attr_group(struct cpuidle_state_kobj *kobj) { } +#endif /* CONFIG_SUSPEND */ + +#define kobj_to_state_obj(k) container_of(k, struct cpuidle_state_kobj, kobj) +#define kobj_to_state(k) (kobj_to_state_obj(k)->state) +#define kobj_to_state_usage(k) (kobj_to_state_obj(k)->state_usage) +#define kobj_to_device(k) (kobj_to_state_obj(k)->device) +#define attr_to_stateattr(a) container_of(a, struct cpuidle_state_attr, attr) + +static ssize_t cpuidle_state_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int ret = -EIO; + struct cpuidle_state *state = kobj_to_state(kobj); + struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj); + struct cpuidle_state_attr *cattr = attr_to_stateattr(attr); + + if (cattr->show) + ret = cattr->show(state, state_usage, buf); + + return ret; +} + +static ssize_t cpuidle_state_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t size) +{ + int ret = -EIO; + struct cpuidle_state *state = kobj_to_state(kobj); + struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj); + struct cpuidle_state_attr *cattr = attr_to_stateattr(attr); + struct cpuidle_device *dev = kobj_to_device(kobj); + + if (cattr->store) + ret = cattr->store(state, state_usage, buf, size); + + /* reset poll time cache */ + dev->poll_limit_ns = 0; + + return ret; +} + +static const struct sysfs_ops cpuidle_state_sysfs_ops = { + .show = cpuidle_state_show, + .store = cpuidle_state_store, +}; + +static void cpuidle_state_sysfs_release(struct kobject *kobj) +{ + struct cpuidle_state_kobj *state_obj = kobj_to_state_obj(kobj); + + complete(&state_obj->kobj_unregister); +} + +static const struct kobj_type ktype_state_cpuidle = { + .sysfs_ops = &cpuidle_state_sysfs_ops, + .default_groups = cpuidle_state_default_groups, + .release = cpuidle_state_sysfs_release, +}; + +static inline void cpuidle_free_state_kobj(struct cpuidle_device *device, int i) +{ + cpuidle_remove_s2idle_attr_group(device->kobjs[i]); + kobject_put(&device->kobjs[i]->kobj); + wait_for_completion(&device->kobjs[i]->kobj_unregister); + kfree(device->kobjs[i]); + device->kobjs[i] = NULL; +} + +/** + * cpuidle_add_state_sysfs - adds cpuidle states sysfs attributes + * @device: the target device + */ +static int cpuidle_add_state_sysfs(struct cpuidle_device *device) +{ + int i, ret = -ENOMEM; + struct cpuidle_state_kobj *kobj; + struct cpuidle_device_kobj *kdev = device->kobj_dev; + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(device); + + /* state statistics */ + for (i = 0; i < drv->state_count; i++) { + kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL); + if (!kobj) { + ret = -ENOMEM; + goto error_state; + } + kobj->state = &drv->states[i]; + kobj->state_usage = &device->states_usage[i]; + kobj->device = device; + init_completion(&kobj->kobj_unregister); + + ret = kobject_init_and_add(&kobj->kobj, &ktype_state_cpuidle, + &kdev->kobj, "state%d", i); + if (ret) { + kobject_put(&kobj->kobj); + kfree(kobj); + goto error_state; + } + cpuidle_add_s2idle_attr_group(kobj); + kobject_uevent(&kobj->kobj, KOBJ_ADD); + device->kobjs[i] = kobj; + } + + return 0; + +error_state: + for (i = i - 1; i >= 0; i--) + cpuidle_free_state_kobj(device, i); + return ret; +} + +/** + * cpuidle_remove_state_sysfs - removes the cpuidle states sysfs attributes + * @device: the target device + */ +static void cpuidle_remove_state_sysfs(struct cpuidle_device *device) +{ + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(device); + int i; + + for (i = 0; i < drv->state_count; i++) + cpuidle_free_state_kobj(device, i); +} + +#ifdef CONFIG_CPU_IDLE_MULTIPLE_DRIVERS +#define kobj_to_driver_kobj(k) container_of(k, struct cpuidle_driver_kobj, kobj) +#define attr_to_driver_attr(a) container_of(a, struct cpuidle_driver_attr, attr) + +#define define_one_driver_ro(_name, show) \ + static struct cpuidle_driver_attr attr_driver_##_name = \ + __ATTR(_name, 0444, show, NULL) + +struct cpuidle_driver_kobj { + struct cpuidle_driver *drv; + struct completion kobj_unregister; + struct kobject kobj; +}; + +struct cpuidle_driver_attr { + struct attribute attr; + ssize_t (*show)(struct cpuidle_driver *, char *); + ssize_t (*store)(struct cpuidle_driver *, const char *, size_t); +}; + +static ssize_t show_driver_name(struct cpuidle_driver *drv, char *buf) +{ + ssize_t ret; + + spin_lock(&cpuidle_driver_lock); + ret = sprintf(buf, "%s\n", drv ? drv->name : "none"); + spin_unlock(&cpuidle_driver_lock); + + return ret; +} + +static void cpuidle_driver_sysfs_release(struct kobject *kobj) +{ + struct cpuidle_driver_kobj *driver_kobj = kobj_to_driver_kobj(kobj); + complete(&driver_kobj->kobj_unregister); +} + +static ssize_t cpuidle_driver_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + int ret = -EIO; + struct cpuidle_driver_kobj *driver_kobj = kobj_to_driver_kobj(kobj); + struct cpuidle_driver_attr *dattr = attr_to_driver_attr(attr); + + if (dattr->show) + ret = dattr->show(driver_kobj->drv, buf); + + return ret; +} + +static ssize_t cpuidle_driver_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t size) +{ + int ret = -EIO; + struct cpuidle_driver_kobj *driver_kobj = kobj_to_driver_kobj(kobj); + struct cpuidle_driver_attr *dattr = attr_to_driver_attr(attr); + + if (dattr->store) + ret = dattr->store(driver_kobj->drv, buf, size); + + return ret; +} + +define_one_driver_ro(name, show_driver_name); + +static const struct sysfs_ops cpuidle_driver_sysfs_ops = { + .show = cpuidle_driver_show, + .store = cpuidle_driver_store, +}; + +static struct attribute *cpuidle_driver_default_attrs[] = { + &attr_driver_name.attr, + NULL +}; +ATTRIBUTE_GROUPS(cpuidle_driver_default); + +static const struct kobj_type ktype_driver_cpuidle = { + .sysfs_ops = &cpuidle_driver_sysfs_ops, + .default_groups = cpuidle_driver_default_groups, + .release = cpuidle_driver_sysfs_release, +}; + +/** + * cpuidle_add_driver_sysfs - adds the driver name sysfs attribute + * @dev: the target device + */ +static int cpuidle_add_driver_sysfs(struct cpuidle_device *dev) +{ + struct cpuidle_driver_kobj *kdrv; + struct cpuidle_device_kobj *kdev = dev->kobj_dev; + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + int ret; + + kdrv = kzalloc(sizeof(*kdrv), GFP_KERNEL); + if (!kdrv) + return -ENOMEM; + + kdrv->drv = drv; + init_completion(&kdrv->kobj_unregister); + + ret = kobject_init_and_add(&kdrv->kobj, &ktype_driver_cpuidle, + &kdev->kobj, "driver"); + if (ret) { + kobject_put(&kdrv->kobj); + kfree(kdrv); + return ret; + } + + kobject_uevent(&kdrv->kobj, KOBJ_ADD); + dev->kobj_driver = kdrv; + + return ret; +} + +/** + * cpuidle_remove_driver_sysfs - removes the driver name sysfs attribute + * @dev: the target device + */ +static void cpuidle_remove_driver_sysfs(struct cpuidle_device *dev) +{ + struct cpuidle_driver_kobj *kdrv = dev->kobj_driver; + kobject_put(&kdrv->kobj); + wait_for_completion(&kdrv->kobj_unregister); + kfree(kdrv); +} +#else +static inline int cpuidle_add_driver_sysfs(struct cpuidle_device *dev) +{ + return 0; +} + +static inline void cpuidle_remove_driver_sysfs(struct cpuidle_device *dev) +{ + ; +} +#endif + +/** + * cpuidle_add_device_sysfs - adds device specific sysfs attributes + * @device: the target device + */ +int cpuidle_add_device_sysfs(struct cpuidle_device *device) +{ + int ret; + + ret = cpuidle_add_state_sysfs(device); + if (ret) + return ret; + + ret = cpuidle_add_driver_sysfs(device); + if (ret) + cpuidle_remove_state_sysfs(device); + return ret; +} + +/** + * cpuidle_remove_device_sysfs : removes device specific sysfs attributes + * @device : the target device + */ +void cpuidle_remove_device_sysfs(struct cpuidle_device *device) +{ + cpuidle_remove_driver_sysfs(device); + cpuidle_remove_state_sysfs(device); +} + +/** + * cpuidle_add_sysfs - creates a sysfs instance for the target device + * @dev: the target device + */ +int cpuidle_add_sysfs(struct cpuidle_device *dev) +{ + struct cpuidle_device_kobj *kdev; + struct device *cpu_dev = get_cpu_device((unsigned long)dev->cpu); + int error; + + /* + * Return if cpu_device is not setup for this CPU. + * + * This could happen if the arch did not set up cpu_device + * since this CPU is not in cpu_present mask and the + * driver did not send a correct CPU mask during registration. + * Without this check we would end up passing bogus + * value for &cpu_dev->kobj in kobject_init_and_add() + */ + if (!cpu_dev) + return -ENODEV; + + kdev = kzalloc(sizeof(*kdev), GFP_KERNEL); + if (!kdev) + return -ENOMEM; + kdev->dev = dev; + + init_completion(&kdev->kobj_unregister); + + error = kobject_init_and_add(&kdev->kobj, &ktype_cpuidle, &cpu_dev->kobj, + "cpuidle"); + if (error) { + kobject_put(&kdev->kobj); + kfree(kdev); + return error; + } + + dev->kobj_dev = kdev; + kobject_uevent(&kdev->kobj, KOBJ_ADD); + + return 0; +} + +/** + * cpuidle_remove_sysfs - deletes a sysfs instance on the target device + * @dev: the target device + */ +void cpuidle_remove_sysfs(struct cpuidle_device *dev) +{ + struct cpuidle_device_kobj *kdev = dev->kobj_dev; + + kobject_put(&kdev->kobj); + wait_for_completion(&kdev->kobj_unregister); + kfree(kdev); +} |