Diffstat
-rw-r--r-- | kernel/power/Kconfig        |  352
-rw-r--r-- | kernel/power/Makefile       |   24
-rw-r--r-- | kernel/power/autosleep.c    |  129
-rw-r--r-- | kernel/power/console.c      |  152
-rw-r--r-- | kernel/power/energy_model.c |  438
-rw-r--r-- | kernel/power/hibernate.c    | 1373
-rw-r--r-- | kernel/power/main.c         | 1008
-rw-r--r-- | kernel/power/power.h        |  327
-rw-r--r-- | kernel/power/poweroff.c     |   45
-rw-r--r-- | kernel/power/process.c      |  235
-rw-r--r-- | kernel/power/qos.c          |  683
-rw-r--r-- | kernel/power/snapshot.c     | 2923
-rw-r--r-- | kernel/power/suspend.c      |  629
-rw-r--r-- | kernel/power/suspend_test.c |  219
-rw-r--r-- | kernel/power/swap.c         | 1619
-rw-r--r-- | kernel/power/user.c         |  466
-rw-r--r-- | kernel/power/wakelock.c     |  285
17 files changed, 10907 insertions(+), 0 deletions(-)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig new file mode 100644 index 0000000000..4b31629c5b --- /dev/null +++ b/kernel/power/Kconfig @@ -0,0 +1,352 @@ +# SPDX-License-Identifier: GPL-2.0-only +config SUSPEND + bool "Suspend to RAM and standby" + depends on ARCH_SUSPEND_POSSIBLE + default y + help + Allow the system to enter sleep states in which main memory is + powered and thus its contents are preserved, such as the + suspend-to-RAM state (e.g. the ACPI S3 state). + +config SUSPEND_FREEZER + bool "Enable freezer for suspend to RAM/standby" \ + if ARCH_WANTS_FREEZER_CONTROL || BROKEN + depends on SUSPEND + default y + help + This allows you to turn off the freezer for suspend. If this is + done, no tasks are frozen for suspend to RAM/standby. + + Turning OFF this setting is NOT recommended! If in doubt, say Y. + +config SUSPEND_SKIP_SYNC + bool "Skip kernel's sys_sync() on suspend to RAM/standby" + depends on SUSPEND + depends on EXPERT + help + Skip the kernel sys_sync() before freezing user processes. + Some systems prefer not to pay this cost on every invocation + of suspend, or they are content with invoking sync() from + user-space before invoking suspend. There's a run-time switch + at '/sys/power/sync_on_suspend' to configure this behaviour. + This setting changes the default for the run-tim switch. Say Y + to change the default to disable the kernel sys_sync(). + +config HIBERNATE_CALLBACKS + bool + +config HIBERNATION + bool "Hibernation (aka 'suspend to disk')" + depends on SWAP && ARCH_HIBERNATION_POSSIBLE + select HIBERNATE_CALLBACKS + select LZO_COMPRESS + select LZO_DECOMPRESS + select CRC32 + help + Enable the suspend to disk (STD) functionality, which is usually + called "hibernation" in user interfaces. STD checkpoints the + system and powers it off; and restores that checkpoint on reboot. + + You can suspend your machine with 'echo disk > /sys/power/state' + after placing resume=/dev/swappartition on the kernel command line + in your bootloader's configuration file. + + Alternatively, you can use the additional userland tools available + from <http://suspend.sf.net>. + + In principle it does not require ACPI or APM, although for example + ACPI will be used for the final steps when it is available. One + of the reasons to use software suspend is that the firmware hooks + for suspend states like suspend-to-RAM (STR) often don't work very + well with Linux. + + It creates an image which is saved in your active swap. Upon the next + boot, pass the 'resume=/dev/swappartition' argument to the kernel to + have it detect the saved image, restore memory state from it, and + continue to run as before. If you do not want the previous state to + be reloaded, then use the 'noresume' kernel command line argument. + Note, however, that fsck will be run on your filesystems and you will + need to run mkswap against the swap partition used for the suspend. + + It also works with swap files to a limited extent (for details see + <file:Documentation/power/swsusp-and-swap-files.rst>). + + Right now you may boot without resuming and resume later but in the + meantime you cannot use the swap partition(s)/file(s) involved in + suspending. Also in this case you must not use the filesystems + that were mounted before the suspend. In particular, you MUST NOT + MOUNT any journaled filesystems mounted before the suspend or they + will get corrupted in a nasty way. + + For more information take a look at <file:Documentation/power/swsusp.rst>. 
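The CONFIG_HIBERNATION help text above names the user-space interface: writing "disk" to /sys/power/state starts a hibernation cycle, and the resume=/noresume kernel parameters control how the saved image is found at boot. As a minimal illustration (not part of this diff; it assumes CONFIG_HIBERNATION is enabled, a resume device is configured, and the program runs as root), the following C sketch performs the same transition as the 'echo disk > /sys/power/state' example from the help text:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Sysfs file documented in the CONFIG_HIBERNATION help text above. */
	int fd = open("/sys/power/state", O_WRONLY);

	if (fd < 0) {
		perror("open /sys/power/state");
		return 1;
	}
	/*
	 * Writing "disk" requests suspend-to-disk; on success the write
	 * returns only after the image has been saved and the system has
	 * resumed (or the attempt failed).
	 */
	if (write(fd, "disk", strlen("disk")) < 0)
		perror("write /sys/power/state");
	close(fd);
	return 0;
}

With resume=/dev/swappartition set on the kernel command line, the next boot detects the saved image and restores it, exactly as described in the help text.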
+ +config HIBERNATION_SNAPSHOT_DEV + bool "Userspace snapshot device" + depends on HIBERNATION + default y + help + Device used by the uswsusp tools. + + Say N if no snapshotting from userspace is needed, this also + reduces the attack surface of the kernel. + + If in doubt, say Y. + +config PM_STD_PARTITION + string "Default resume partition" + depends on HIBERNATION + default "" + help + The default resume partition is the partition that the suspend- + to-disk implementation will look for a suspended disk image. + + The partition specified here will be different for almost every user. + It should be a valid swap partition (at least for now) that is turned + on before suspending. + + The partition specified can be overridden by specifying: + + resume=/dev/<other device> + + which will set the resume partition to the device specified. + + Note there is currently not a way to specify which device to save the + suspended image to. It will simply pick the first available swap + device. + +config PM_SLEEP + def_bool y + depends on SUSPEND || HIBERNATE_CALLBACKS + select PM + +config PM_SLEEP_SMP + def_bool y + depends on SMP + depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE + depends on PM_SLEEP + select HOTPLUG_CPU + +config PM_SLEEP_SMP_NONZERO_CPU + def_bool y + depends on PM_SLEEP_SMP + depends on ARCH_SUSPEND_NONZERO_CPU + help + If an arch can suspend (for suspend, hibernate, kexec, etc) on a + non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This + will allow nohz_full mask to include CPU0. + +config PM_AUTOSLEEP + bool "Opportunistic sleep" + depends on PM_SLEEP + help + Allow the kernel to trigger a system transition into a global sleep + state automatically whenever there are no active wakeup sources. + +config PM_USERSPACE_AUTOSLEEP + bool "Userspace opportunistic sleep" + depends on PM_SLEEP + help + Notify kernel of aggressive userspace autosleep power management policy. + + This option changes the behavior of various sleep-sensitive code to deal + with frequent userspace-initiated transitions into a global sleep state. + + Saying Y here, disables code paths that most users really should keep + enabled. In particular, only enable this if it is very common to be + asleep/awake for very short periods of time (<= 2 seconds). + + Only platforms, such as Android, that implement opportunistic sleep from + a userspace power manager service should enable this option; and not + other machines. Therefore, you should say N here, unless you are + extremely certain that this is what you want. The option otherwise has + bad, undesirable effects, and should not be enabled just for fun. + + +config PM_WAKELOCKS + bool "User space wakeup sources interface" + depends on PM_SLEEP + help + Allow user space to create, activate and deactivate wakeup source + objects with the help of a sysfs-based interface. + +config PM_WAKELOCKS_LIMIT + int "Maximum number of user space wakeup sources (0 = no limit)" + range 0 100000 + default 100 + depends on PM_WAKELOCKS + +config PM_WAKELOCKS_GC + bool "Garbage collector for user space wakeup sources" + depends on PM_WAKELOCKS + default y + +config PM + bool "Device power management core functionality" + help + Enable functionality allowing I/O devices to be put into energy-saving + (low power) states, for example after a specified period of inactivity + (autosuspended), and woken up in response to a hardware-generated + wake-up event or a driver's request. 
+ + Hardware support is generally required for this functionality to work + and the bus type drivers of the buses the devices are on are + responsible for the actual handling of device suspend requests and + wake-up events. + +config PM_DEBUG + bool "Power Management Debug Support" + depends on PM + help + This option enables various debugging support in the Power Management + code. This is helpful when debugging and reporting PM bugs, like + suspend support. + +config PM_ADVANCED_DEBUG + bool "Extra PM attributes in sysfs for low-level debugging/testing" + depends on PM_DEBUG + help + Add extra sysfs attributes allowing one to access some Power Management + fields of device objects from user space. If you are not a kernel + developer interested in debugging/testing Power Management, say "no". + +config PM_TEST_SUSPEND + bool "Test suspend/resume and wakealarm during bootup" + depends on SUSPEND && PM_DEBUG && RTC_CLASS=y + help + This option will let you suspend your machine during bootup, and + make it wake up a few seconds later using an RTC wakeup alarm. + Enable this with a kernel parameter like "test_suspend=mem". + + You probably want to have your system's RTC driver statically + linked, ensuring that it's available when this test runs. + +config PM_SLEEP_DEBUG + def_bool y + depends on PM_DEBUG && PM_SLEEP + +config DPM_WATCHDOG + bool "Device suspend/resume watchdog" + depends on PM_DEBUG && PSTORE && EXPERT + help + Sets up a watchdog timer to capture drivers that are + locked up attempting to suspend/resume a device. + A detected lockup causes system panic with message + captured in pstore device for inspection in subsequent + boot session. + +config DPM_WATCHDOG_TIMEOUT + int "Watchdog timeout in seconds" + range 1 120 + default 120 + depends on DPM_WATCHDOG + +config PM_TRACE + bool + help + This enables code to save the last PM event point across + reboot. The architecture needs to support this, x86 for + example does by saving things in the RTC, see below. + + The architecture specific code must provide the extern + functions from <linux/resume-trace.h> as well as the + <asm/resume-trace.h> header with a TRACE_RESUME() macro. + + The way the information is presented is architecture- + dependent, x86 will print the information during a + late_initcall. + +config PM_TRACE_RTC + bool "Suspend/resume event tracing" + depends on PM_SLEEP_DEBUG + depends on X86 + select PM_TRACE + help + This enables some cheesy code to save the last PM event point in the + RTC across reboots, so that you can debug a machine that just hangs + during suspend (or more commonly, during resume). + + To use this debugging feature you should attempt to suspend the + machine, reboot it and then run + + dmesg -s 1000000 | grep 'hash matches' + + CAUTION: this option will cause your machine's real-time clock to be + set to an invalid time after a resume. + +config APM_EMULATION + tristate "Advanced Power Management Emulation" + depends on SYS_SUPPORTS_APM_EMULATION + help + APM is a BIOS specification for saving power using several different + techniques. This is mostly useful for battery powered laptops with + APM compliant BIOSes. If you say Y here, the system time will be + reset after a RESUME operation, the /proc/apm device will provide + battery status information, and user-space programs will receive + notification of APM "events" (e.g. battery status change). + + In order to use APM, you will need supporting software. 
For location + and more information, read <file:Documentation/power/apm-acpi.rst> + and the Battery Powered Linux mini-HOWTO, available from + <http://www.tldp.org/docs.html#howto>. + + This driver does not spin down disk drives (see the hdparm(8) + manpage ("man 8 hdparm") for that), and it doesn't turn off + VESA-compliant "green" monitors. + + Generally, if you don't have a battery in your machine, there isn't + much point in using this driver and you should say N. If you get + random kernel OOPSes or reboots that don't seem to be related to + anything, try disabling/enabling this option (or disabling/enabling + APM in your BIOS). + +config PM_CLK + def_bool y + depends on PM && HAVE_CLK + +config PM_GENERIC_DOMAINS + bool + depends on PM + +config WQ_POWER_EFFICIENT_DEFAULT + bool "Enable workqueue power-efficient mode by default" + depends on PM + help + Per-cpu workqueues are generally preferred because they show + better performance thanks to cache locality; unfortunately, + per-cpu workqueues tend to be more power hungry than unbound + workqueues. + + Enabling workqueue.power_efficient kernel parameter makes the + per-cpu workqueues which were observed to contribute + significantly to power consumption unbound, leading to measurably + lower power usage at the cost of small performance overhead. + + This config option determines whether workqueue.power_efficient + is enabled by default. + + If in doubt, say N. + +config PM_GENERIC_DOMAINS_SLEEP + def_bool y + depends on PM_SLEEP && PM_GENERIC_DOMAINS + +config PM_GENERIC_DOMAINS_OF + def_bool y + depends on PM_GENERIC_DOMAINS && OF + +config CPU_PM + bool + +config ENERGY_MODEL + bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)" + depends on SMP + depends on CPU_FREQ + help + Several subsystems (thermal and/or the task scheduler for example) + can leverage information about the energy consumed by devices to + make smarter decisions. This config option enables the framework + from which subsystems can access the energy models. + + The exact usage of the energy model is subsystem-dependent. + + If in doubt, say N. diff --git a/kernel/power/Makefile b/kernel/power/Makefile new file mode 100644 index 0000000000..874ad834dc --- /dev/null +++ b/kernel/power/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0 + +ifeq ($(CONFIG_DYNAMIC_DEBUG), y) +CFLAGS_swap.o := -DDEBUG +CFLAGS_snapshot.o := -DDEBUG +CFLAGS_energy_model.o := -DDEBUG +endif + +KASAN_SANITIZE_snapshot.o := n + +obj-y += qos.o +obj-$(CONFIG_PM) += main.o +obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o +obj-$(CONFIG_FREEZER) += process.o +obj-$(CONFIG_SUSPEND) += suspend.o +obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o +obj-$(CONFIG_HIBERNATION_SNAPSHOT_DEV) += user.o +obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o +obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o + +obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o + +obj-$(CONFIG_ENERGY_MODEL) += energy_model.o diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c new file mode 100644 index 0000000000..b29c8aca74 --- /dev/null +++ b/kernel/power/autosleep.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * kernel/power/autosleep.c + * + * Opportunistic sleep support. + * + * Copyright (C) 2012 Rafael J. 
Wysocki <rjw@sisk.pl> + */ + +#include <linux/device.h> +#include <linux/mutex.h> +#include <linux/pm_wakeup.h> + +#include "power.h" + +static suspend_state_t autosleep_state; +static struct workqueue_struct *autosleep_wq; +/* + * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source + * is active, otherwise a deadlock with try_to_suspend() is possible. + * Alternatively mutex_lock_interruptible() can be used. This will then fail + * if an auto_sleep cycle tries to freeze processes. + */ +static DEFINE_MUTEX(autosleep_lock); +static struct wakeup_source *autosleep_ws; + +static void try_to_suspend(struct work_struct *work) +{ + unsigned int initial_count, final_count; + + if (!pm_get_wakeup_count(&initial_count, true)) + goto out; + + mutex_lock(&autosleep_lock); + + if (!pm_save_wakeup_count(initial_count) || + system_state != SYSTEM_RUNNING) { + mutex_unlock(&autosleep_lock); + goto out; + } + + if (autosleep_state == PM_SUSPEND_ON) { + mutex_unlock(&autosleep_lock); + return; + } + if (autosleep_state >= PM_SUSPEND_MAX) + hibernate(); + else + pm_suspend(autosleep_state); + + mutex_unlock(&autosleep_lock); + + if (!pm_get_wakeup_count(&final_count, false)) + goto out; + + /* + * If the wakeup occurred for an unknown reason, wait to prevent the + * system from trying to suspend and waking up in a tight loop. + */ + if (final_count == initial_count) + schedule_timeout_uninterruptible(HZ / 2); + + out: + queue_up_suspend_work(); +} + +static DECLARE_WORK(suspend_work, try_to_suspend); + +void queue_up_suspend_work(void) +{ + if (autosleep_state > PM_SUSPEND_ON) + queue_work(autosleep_wq, &suspend_work); +} + +suspend_state_t pm_autosleep_state(void) +{ + return autosleep_state; +} + +int pm_autosleep_lock(void) +{ + return mutex_lock_interruptible(&autosleep_lock); +} + +void pm_autosleep_unlock(void) +{ + mutex_unlock(&autosleep_lock); +} + +int pm_autosleep_set_state(suspend_state_t state) +{ + +#ifndef CONFIG_HIBERNATION + if (state >= PM_SUSPEND_MAX) + return -EINVAL; +#endif + + __pm_stay_awake(autosleep_ws); + + mutex_lock(&autosleep_lock); + + autosleep_state = state; + + __pm_relax(autosleep_ws); + + if (state > PM_SUSPEND_ON) { + pm_wakep_autosleep_enabled(true); + queue_up_suspend_work(); + } else { + pm_wakep_autosleep_enabled(false); + } + + mutex_unlock(&autosleep_lock); + return 0; +} + +int __init pm_autosleep_init(void) +{ + autosleep_ws = wakeup_source_register(NULL, "autosleep"); + if (!autosleep_ws) + return -ENOMEM; + + autosleep_wq = alloc_ordered_workqueue("autosleep", 0); + if (autosleep_wq) + return 0; + + wakeup_source_unregister(autosleep_ws); + return -ENOMEM; +} diff --git a/kernel/power/console.c b/kernel/power/console.c new file mode 100644 index 0000000000..fcdf0e14a4 --- /dev/null +++ b/kernel/power/console.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Functions for saving/restoring console. + * + * Originally from swsusp. 
+ */ + +#include <linux/console.h> +#include <linux/vt_kern.h> +#include <linux/kbd_kern.h> +#include <linux/vt.h> +#include <linux/module.h> +#include <linux/slab.h> +#include "power.h" + +#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) + +static int orig_fgconsole, orig_kmsg; + +static DEFINE_MUTEX(vt_switch_mutex); + +struct pm_vt_switch { + struct list_head head; + struct device *dev; + bool required; +}; + +static LIST_HEAD(pm_vt_switch_list); + + +/** + * pm_vt_switch_required - indicate VT switch at suspend requirements + * @dev: device + * @required: if true, caller needs VT switch at suspend/resume time + * + * The different console drivers may or may not require VT switches across + * suspend/resume, depending on how they handle restoring video state and + * what may be running. + * + * Drivers can indicate support for switchless suspend/resume, which can + * save time and flicker, by using this routine and passing 'false' as + * the argument. If any loaded driver needs VT switching, or the + * no_console_suspend argument has been passed on the command line, VT + * switches will occur. + */ +void pm_vt_switch_required(struct device *dev, bool required) +{ + struct pm_vt_switch *entry, *tmp; + + mutex_lock(&vt_switch_mutex); + list_for_each_entry(tmp, &pm_vt_switch_list, head) { + if (tmp->dev == dev) { + /* already registered, update requirement */ + tmp->required = required; + goto out; + } + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out; + + entry->required = required; + entry->dev = dev; + + list_add(&entry->head, &pm_vt_switch_list); +out: + mutex_unlock(&vt_switch_mutex); +} +EXPORT_SYMBOL(pm_vt_switch_required); + +/** + * pm_vt_switch_unregister - stop tracking a device's VT switching needs + * @dev: device + * + * Remove @dev from the vt switch list. + */ +void pm_vt_switch_unregister(struct device *dev) +{ + struct pm_vt_switch *tmp; + + mutex_lock(&vt_switch_mutex); + list_for_each_entry(tmp, &pm_vt_switch_list, head) { + if (tmp->dev == dev) { + list_del(&tmp->head); + kfree(tmp); + break; + } + } + mutex_unlock(&vt_switch_mutex); +} +EXPORT_SYMBOL(pm_vt_switch_unregister); + +/* + * There are three cases when a VT switch on suspend/resume are required: + * 1) no driver has indicated a requirement one way or another, so preserve + * the old behavior + * 2) console suspend is disabled, we want to see debug messages across + * suspend/resume + * 3) any registered driver indicates it needs a VT switch + * + * If none of these conditions is present, meaning we have at least one driver + * that doesn't need the switch, and none that do, we can avoid it to make + * resume look a little prettier (and suspend too, but that's usually hidden, + * e.g. when closing the lid on a laptop). 
+ */ +static bool pm_vt_switch(void) +{ + struct pm_vt_switch *entry; + bool ret = true; + + mutex_lock(&vt_switch_mutex); + if (list_empty(&pm_vt_switch_list)) + goto out; + + if (!console_suspend_enabled) + goto out; + + list_for_each_entry(entry, &pm_vt_switch_list, head) { + if (entry->required) + goto out; + } + + ret = false; +out: + mutex_unlock(&vt_switch_mutex); + return ret; +} + +void pm_prepare_console(void) +{ + if (!pm_vt_switch()) + return; + + orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); + if (orig_fgconsole < 0) + return; + + orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); + return; +} + +void pm_restore_console(void) +{ + if (!pm_vt_switch()) + return; + + if (orig_fgconsole >= 0) { + vt_move_to_console(orig_fgconsole, 0); + vt_kmsg_redirect(orig_kmsg); + } +} diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c new file mode 100644 index 0000000000..7b44f5b89f --- /dev/null +++ b/kernel/power/energy_model.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Energy Model of devices + * + * Copyright (c) 2018-2021, Arm ltd. + * Written by: Quentin Perret, Arm ltd. + * Improvements provided by: Lukasz Luba, Arm ltd. + */ + +#define pr_fmt(fmt) "energy_model: " fmt + +#include <linux/cpu.h> +#include <linux/cpufreq.h> +#include <linux/cpumask.h> +#include <linux/debugfs.h> +#include <linux/energy_model.h> +#include <linux/sched/topology.h> +#include <linux/slab.h> + +/* + * Mutex serializing the registrations of performance domains and letting + * callbacks defined by drivers sleep. + */ +static DEFINE_MUTEX(em_pd_mutex); + +static bool _is_cpu_device(struct device *dev) +{ + return (dev->bus == &cpu_subsys); +} + +#ifdef CONFIG_DEBUG_FS +static struct dentry *rootdir; + +static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd) +{ + struct dentry *d; + char name[24]; + + snprintf(name, sizeof(name), "ps:%lu", ps->frequency); + + /* Create per-ps directory */ + d = debugfs_create_dir(name, pd); + debugfs_create_ulong("frequency", 0444, d, &ps->frequency); + debugfs_create_ulong("power", 0444, d, &ps->power); + debugfs_create_ulong("cost", 0444, d, &ps->cost); + debugfs_create_ulong("inefficient", 0444, d, &ps->flags); +} + +static int em_debug_cpus_show(struct seq_file *s, void *unused) +{ + seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private))); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); + +static int em_debug_flags_show(struct seq_file *s, void *unused) +{ + struct em_perf_domain *pd = s->private; + + seq_printf(s, "%#lx\n", pd->flags); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_flags); + +static void em_debug_create_pd(struct device *dev) +{ + struct dentry *d; + int i; + + /* Create the directory of the performance domain */ + d = debugfs_create_dir(dev_name(dev), rootdir); + + if (_is_cpu_device(dev)) + debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, + &em_debug_cpus_fops); + + debugfs_create_file("flags", 0444, d, dev->em_pd, + &em_debug_flags_fops); + + /* Create a sub-directory for each performance state */ + for (i = 0; i < dev->em_pd->nr_perf_states; i++) + em_debug_create_ps(&dev->em_pd->table[i], d); + +} + +static void em_debug_remove_pd(struct device *dev) +{ + debugfs_lookup_and_remove(dev_name(dev), rootdir); +} + +static int __init em_debug_init(void) +{ + /* Create /sys/kernel/debug/energy_model directory */ + rootdir = debugfs_create_dir("energy_model", NULL); + + return 0; +} +fs_initcall(em_debug_init); +#else /* CONFIG_DEBUG_FS */ +static void 
em_debug_create_pd(struct device *dev) {} +static void em_debug_remove_pd(struct device *dev) {} +#endif + +static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, + int nr_states, struct em_data_callback *cb, + unsigned long flags) +{ + unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX; + struct em_perf_state *table; + int i, ret; + u64 fmax; + + table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL); + if (!table) + return -ENOMEM; + + /* Build the list of performance states for this performance domain */ + for (i = 0, freq = 0; i < nr_states; i++, freq++) { + /* + * active_power() is a driver callback which ceils 'freq' to + * lowest performance state of 'dev' above 'freq' and updates + * 'power' and 'freq' accordingly. + */ + ret = cb->active_power(dev, &power, &freq); + if (ret) { + dev_err(dev, "EM: invalid perf. state: %d\n", + ret); + goto free_ps_table; + } + + /* + * We expect the driver callback to increase the frequency for + * higher performance states. + */ + if (freq <= prev_freq) { + dev_err(dev, "EM: non-increasing freq: %lu\n", + freq); + goto free_ps_table; + } + + /* + * The power returned by active_state() is expected to be + * positive and be in range. + */ + if (!power || power > EM_MAX_POWER) { + dev_err(dev, "EM: invalid power: %lu\n", + power); + goto free_ps_table; + } + + table[i].power = power; + table[i].frequency = prev_freq = freq; + } + + /* Compute the cost of each performance state. */ + fmax = (u64) table[nr_states - 1].frequency; + for (i = nr_states - 1; i >= 0; i--) { + unsigned long power_res, cost; + + if (flags & EM_PERF_DOMAIN_ARTIFICIAL) { + ret = cb->get_cost(dev, table[i].frequency, &cost); + if (ret || !cost || cost > EM_MAX_POWER) { + dev_err(dev, "EM: invalid cost %lu %d\n", + cost, ret); + goto free_ps_table; + } + } else { + power_res = table[i].power; + cost = div64_u64(fmax * power_res, table[i].frequency); + } + + table[i].cost = cost; + + if (table[i].cost >= prev_cost) { + table[i].flags = EM_PERF_STATE_INEFFICIENT; + dev_dbg(dev, "EM: OPP:%lu is inefficient\n", + table[i].frequency); + } else { + prev_cost = table[i].cost; + } + } + + pd->table = table; + pd->nr_perf_states = nr_states; + + return 0; + +free_ps_table: + kfree(table); + return -EINVAL; +} + +static int em_create_pd(struct device *dev, int nr_states, + struct em_data_callback *cb, cpumask_t *cpus, + unsigned long flags) +{ + struct em_perf_domain *pd; + struct device *cpu_dev; + int cpu, ret, num_cpus; + + if (_is_cpu_device(dev)) { + num_cpus = cpumask_weight(cpus); + + /* Prevent max possible energy calculation to not overflow */ + if (num_cpus > EM_MAX_NUM_CPUS) { + dev_err(dev, "EM: too many CPUs, overflow possible\n"); + return -EINVAL; + } + + pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); + if (!pd) + return -ENOMEM; + + cpumask_copy(em_span_cpus(pd), cpus); + } else { + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return -ENOMEM; + } + + ret = em_create_perf_table(dev, pd, nr_states, cb, flags); + if (ret) { + kfree(pd); + return ret; + } + + if (_is_cpu_device(dev)) + for_each_cpu(cpu, cpus) { + cpu_dev = get_cpu_device(cpu); + cpu_dev->em_pd = pd; + } + + dev->em_pd = pd; + + return 0; +} + +static void em_cpufreq_update_efficiencies(struct device *dev) +{ + struct em_perf_domain *pd = dev->em_pd; + struct em_perf_state *table; + struct cpufreq_policy *policy; + int found = 0; + int i; + + if (!_is_cpu_device(dev) || !pd) + return; + + policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd))); + if 
(!policy) { + dev_warn(dev, "EM: Access to CPUFreq policy failed"); + return; + } + + table = pd->table; + + for (i = 0; i < pd->nr_perf_states; i++) { + if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT)) + continue; + + if (!cpufreq_table_set_inefficient(policy, table[i].frequency)) + found++; + } + + cpufreq_cpu_put(policy); + + if (!found) + return; + + /* + * Efficiencies have been installed in CPUFreq, inefficient frequencies + * will be skipped. The EM can do the same. + */ + pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES; +} + +/** + * em_pd_get() - Return the performance domain for a device + * @dev : Device to find the performance domain for + * + * Returns the performance domain to which @dev belongs, or NULL if it doesn't + * exist. + */ +struct em_perf_domain *em_pd_get(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev)) + return NULL; + + return dev->em_pd; +} +EXPORT_SYMBOL_GPL(em_pd_get); + +/** + * em_cpu_get() - Return the performance domain for a CPU + * @cpu : CPU to find the performance domain for + * + * Returns the performance domain to which @cpu belongs, or NULL if it doesn't + * exist. + */ +struct em_perf_domain *em_cpu_get(int cpu) +{ + struct device *cpu_dev; + + cpu_dev = get_cpu_device(cpu); + if (!cpu_dev) + return NULL; + + return em_pd_get(cpu_dev); +} +EXPORT_SYMBOL_GPL(em_cpu_get); + +/** + * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device + * @dev : Device for which the EM is to register + * @nr_states : Number of performance states to register + * @cb : Callback functions providing the data of the Energy Model + * @cpus : Pointer to cpumask_t, which in case of a CPU device is + * obligatory. It can be taken from i.e. 'policy->cpus'. For other + * type of devices this should be set to NULL. + * @microwatts : Flag indicating that the power values are in micro-Watts or + * in some other scale. It must be set properly. + * + * Create Energy Model tables for a performance domain using the callbacks + * defined in cb. + * + * The @microwatts is important to set with correct value. Some kernel + * sub-systems might rely on this flag and check if all devices in the EM are + * using the same scale. + * + * If multiple clients register the same performance domain, all but the first + * registration will be ignored. + * + * Return 0 on success + */ +int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, + struct em_data_callback *cb, cpumask_t *cpus, + bool microwatts) +{ + unsigned long cap, prev_cap = 0; + unsigned long flags = 0; + int cpu, ret; + + if (!dev || !nr_states || !cb) + return -EINVAL; + + /* + * Use a mutex to serialize the registration of performance domains and + * let the driver-defined callback functions sleep. + */ + mutex_lock(&em_pd_mutex); + + if (dev->em_pd) { + ret = -EEXIST; + goto unlock; + } + + if (_is_cpu_device(dev)) { + if (!cpus) { + dev_err(dev, "EM: invalid CPU mask\n"); + ret = -EINVAL; + goto unlock; + } + + for_each_cpu(cpu, cpus) { + if (em_cpu_get(cpu)) { + dev_err(dev, "EM: exists for CPU%d\n", cpu); + ret = -EEXIST; + goto unlock; + } + /* + * All CPUs of a domain must have the same + * micro-architecture since they all share the same + * table. 
+ */ + cap = arch_scale_cpu_capacity(cpu); + if (prev_cap && prev_cap != cap) { + dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n", + cpumask_pr_args(cpus)); + + ret = -EINVAL; + goto unlock; + } + prev_cap = cap; + } + } + + if (microwatts) + flags |= EM_PERF_DOMAIN_MICROWATTS; + else if (cb->get_cost) + flags |= EM_PERF_DOMAIN_ARTIFICIAL; + + ret = em_create_pd(dev, nr_states, cb, cpus, flags); + if (ret) + goto unlock; + + dev->em_pd->flags |= flags; + + em_cpufreq_update_efficiencies(dev); + + em_debug_create_pd(dev); + dev_info(dev, "EM: created perf domain\n"); + +unlock: + mutex_unlock(&em_pd_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); + +/** + * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device + * @dev : Device for which the EM is registered + * + * Unregister the EM for the specified @dev (but not a CPU device). + */ +void em_dev_unregister_perf_domain(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev) || !dev->em_pd) + return; + + if (_is_cpu_device(dev)) + return; + + /* + * The mutex separates all register/unregister requests and protects + * from potential clean-up/setup issues in the debugfs directories. + * The debugfs directory name is the same as device's name. + */ + mutex_lock(&em_pd_mutex); + em_debug_remove_pd(dev); + + kfree(dev->em_pd->table); + kfree(dev->em_pd); + dev->em_pd = NULL; + mutex_unlock(&em_pd_mutex); +} +EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c new file mode 100644 index 0000000000..8d35b9f9aa --- /dev/null +++ b/kernel/power/hibernate.c @@ -0,0 +1,1373 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> + * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 
+ * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com> + */ + +#define pr_fmt(fmt) "PM: hibernation: " fmt + +#include <linux/blkdev.h> +#include <linux/export.h> +#include <linux/suspend.h> +#include <linux/reboot.h> +#include <linux/string.h> +#include <linux/device.h> +#include <linux/async.h> +#include <linux/delay.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/pm.h> +#include <linux/nmi.h> +#include <linux/console.h> +#include <linux/cpu.h> +#include <linux/freezer.h> +#include <linux/gfp.h> +#include <linux/syscore_ops.h> +#include <linux/ctype.h> +#include <linux/ktime.h> +#include <linux/security.h> +#include <linux/secretmem.h> +#include <trace/events/power.h> + +#include "power.h" + + +static int nocompress; +static int noresume; +static int nohibernate; +static int resume_wait; +static unsigned int resume_delay; +static char resume_file[256] = CONFIG_PM_STD_PARTITION; +dev_t swsusp_resume_device; +sector_t swsusp_resume_block; +__visible int in_suspend __nosavedata; + +enum { + HIBERNATION_INVALID, + HIBERNATION_PLATFORM, + HIBERNATION_SHUTDOWN, + HIBERNATION_REBOOT, +#ifdef CONFIG_SUSPEND + HIBERNATION_SUSPEND, +#endif + HIBERNATION_TEST_RESUME, + /* keep last */ + __HIBERNATION_AFTER_LAST +}; +#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1) +#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1) + +static int hibernation_mode = HIBERNATION_SHUTDOWN; + +bool freezer_test_done; + +static const struct platform_hibernation_ops *hibernation_ops; + +static atomic_t hibernate_atomic = ATOMIC_INIT(1); + +bool hibernate_acquire(void) +{ + return atomic_add_unless(&hibernate_atomic, -1, 0); +} + +void hibernate_release(void) +{ + atomic_inc(&hibernate_atomic); +} + +bool hibernation_available(void) +{ + return nohibernate == 0 && + !security_locked_down(LOCKDOWN_HIBERNATION) && + !secretmem_active() && !cxl_mem_active(); +} + +/** + * hibernation_set_ops - Set the global hibernate operations. + * @ops: Hibernation operations to use in subsequent hibernation transitions. + */ +void hibernation_set_ops(const struct platform_hibernation_ops *ops) +{ + unsigned int sleep_flags; + + if (ops && !(ops->begin && ops->end && ops->pre_snapshot + && ops->prepare && ops->finish && ops->enter && ops->pre_restore + && ops->restore_cleanup && ops->leave)) { + WARN_ON(1); + return; + } + + sleep_flags = lock_system_sleep(); + + hibernation_ops = ops; + if (ops) + hibernation_mode = HIBERNATION_PLATFORM; + else if (hibernation_mode == HIBERNATION_PLATFORM) + hibernation_mode = HIBERNATION_SHUTDOWN; + + unlock_system_sleep(sleep_flags); +} +EXPORT_SYMBOL_GPL(hibernation_set_ops); + +static bool entering_platform_hibernation; + +bool system_entering_hibernation(void) +{ + return entering_platform_hibernation; +} +EXPORT_SYMBOL(system_entering_hibernation); + +#ifdef CONFIG_PM_DEBUG +static void hibernation_debug_sleep(void) +{ + pr_info("debug: Waiting for 5 seconds.\n"); + mdelay(5000); +} + +static int hibernation_test(int level) +{ + if (pm_test_level == level) { + hibernation_debug_sleep(); + return 1; + } + return 0; +} +#else /* !CONFIG_PM_DEBUG */ +static int hibernation_test(int level) { return 0; } +#endif /* !CONFIG_PM_DEBUG */ + +/** + * platform_begin - Call platform to start hibernation. + * @platform_mode: Whether or not to use the platform driver. + */ +static int platform_begin(int platform_mode) +{ + return (platform_mode && hibernation_ops) ? 
+ hibernation_ops->begin(PMSG_FREEZE) : 0; +} + +/** + * platform_end - Call platform to finish transition to the working state. + * @platform_mode: Whether or not to use the platform driver. + */ +static void platform_end(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->end(); +} + +/** + * platform_pre_snapshot - Call platform to prepare the machine for hibernation. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to prepare the system for creating a hibernate image, + * if so configured, and return an error code if that fails. + */ + +static int platform_pre_snapshot(int platform_mode) +{ + return (platform_mode && hibernation_ops) ? + hibernation_ops->pre_snapshot() : 0; +} + +/** + * platform_leave - Call platform to prepare a transition to the working state. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver prepare to prepare the machine for switching to the + * normal mode of operation. + * + * This routine is called on one CPU with interrupts disabled. + */ +static void platform_leave(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->leave(); +} + +/** + * platform_finish - Call platform to switch the system to the working state. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to switch the machine to the normal mode of + * operation. + * + * This routine must be called after platform_prepare(). + */ +static void platform_finish(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->finish(); +} + +/** + * platform_pre_restore - Prepare for hibernate image restoration. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to prepare the system for resume from a hibernation + * image. + * + * If the restore fails after this function has been called, + * platform_restore_cleanup() must be called. + */ +static int platform_pre_restore(int platform_mode) +{ + return (platform_mode && hibernation_ops) ? + hibernation_ops->pre_restore() : 0; +} + +/** + * platform_restore_cleanup - Switch to the working state after failing restore. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to switch the system to the normal mode of operation + * after a failing restore. + * + * If platform_pre_restore() has been called before the failing restore, this + * function must be called too, regardless of the result of + * platform_pre_restore(). + */ +static void platform_restore_cleanup(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->restore_cleanup(); +} + +/** + * platform_recover - Recover from a failure to suspend devices. + * @platform_mode: Whether or not to use the platform driver. + */ +static void platform_recover(int platform_mode) +{ + if (platform_mode && hibernation_ops && hibernation_ops->recover) + hibernation_ops->recover(); +} + +/** + * swsusp_show_speed - Print time elapsed between two events during hibernation. + * @start: Starting event. + * @stop: Final event. + * @nr_pages: Number of memory pages processed between @start and @stop. + * @msg: Additional diagnostic message to print. 
+ */ +void swsusp_show_speed(ktime_t start, ktime_t stop, + unsigned nr_pages, char *msg) +{ + ktime_t diff; + u64 elapsed_centisecs64; + unsigned int centisecs; + unsigned int k; + unsigned int kps; + + diff = ktime_sub(stop, start); + elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC); + centisecs = elapsed_centisecs64; + if (centisecs == 0) + centisecs = 1; /* avoid div-by-zero */ + k = nr_pages * (PAGE_SIZE / 1024); + kps = (k * 100) / centisecs; + pr_info("%s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", + msg, k, centisecs / 100, centisecs % 100, kps / 1000, + (kps % 1000) / 10); +} + +__weak int arch_resume_nosmt(void) +{ + return 0; +} + +/** + * create_image - Create a hibernation image. + * @platform_mode: Whether or not to use the platform driver. + * + * Execute device drivers' "late" and "noirq" freeze callbacks, create a + * hibernation image and run the drivers' "noirq" and "early" thaw callbacks. + * + * Control reappears in this routine after the subsequent restore. + */ +static int create_image(int platform_mode) +{ + int error; + + error = dpm_suspend_end(PMSG_FREEZE); + if (error) { + pr_err("Some devices failed to power down, aborting\n"); + return error; + } + + error = platform_pre_snapshot(platform_mode); + if (error || hibernation_test(TEST_PLATFORM)) + goto Platform_finish; + + error = pm_sleep_disable_secondary_cpus(); + if (error || hibernation_test(TEST_CPUS)) + goto Enable_cpus; + + local_irq_disable(); + + system_state = SYSTEM_SUSPEND; + + error = syscore_suspend(); + if (error) { + pr_err("Some system devices failed to power down, aborting\n"); + goto Enable_irqs; + } + + if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) + goto Power_up; + + in_suspend = 1; + save_processor_state(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); + error = swsusp_arch_suspend(); + /* Restore control flow magically appears here */ + restore_processor_state(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); + if (error) + pr_err("Error %d creating image\n", error); + + if (!in_suspend) { + events_check_enabled = false; + clear_or_poison_free_pages(); + } + + platform_leave(platform_mode); + + Power_up: + syscore_resume(); + + Enable_irqs: + system_state = SYSTEM_RUNNING; + local_irq_enable(); + + Enable_cpus: + pm_sleep_enable_secondary_cpus(); + + /* Allow architectures to do nosmt-specific post-resume dances */ + if (!in_suspend) + error = arch_resume_nosmt(); + + Platform_finish: + platform_finish(platform_mode); + + dpm_resume_start(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + + return error; +} + +/** + * hibernation_snapshot - Quiesce devices and create a hibernation image. + * @platform_mode: If set, use platform driver to prepare for the transition. + * + * This routine must be called with system_transition_mutex held. + */ +int hibernation_snapshot(int platform_mode) +{ + pm_message_t msg; + int error; + + pm_suspend_clear_flags(); + error = platform_begin(platform_mode); + if (error) + goto Close; + + /* Preallocate image memory before shutting down devices. */ + error = hibernate_preallocate_memory(); + if (error) + goto Close; + + error = freeze_kernel_threads(); + if (error) + goto Cleanup; + + if (hibernation_test(TEST_FREEZER)) { + + /* + * Indicate to the caller that we are returning due to a + * successful freezer test. 
+ */ + freezer_test_done = true; + goto Thaw; + } + + error = dpm_prepare(PMSG_FREEZE); + if (error) { + dpm_complete(PMSG_RECOVER); + goto Thaw; + } + + suspend_console(); + pm_restrict_gfp_mask(); + + error = dpm_suspend(PMSG_FREEZE); + + if (error || hibernation_test(TEST_DEVICES)) + platform_recover(platform_mode); + else + error = create_image(platform_mode); + + /* + * In the case that we call create_image() above, the control + * returns here (1) after the image has been created or the + * image creation has failed and (2) after a successful restore. + */ + + /* We may need to release the preallocated image pages here. */ + if (error || !in_suspend) + swsusp_free(); + + msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE; + dpm_resume(msg); + + if (error || !in_suspend) + pm_restore_gfp_mask(); + + resume_console(); + dpm_complete(msg); + + Close: + platform_end(platform_mode); + return error; + + Thaw: + thaw_kernel_threads(); + Cleanup: + swsusp_free(); + goto Close; +} + +int __weak hibernate_resume_nonboot_cpu_disable(void) +{ + return suspend_disable_secondary_cpus(); +} + +/** + * resume_target_kernel - Restore system state from a hibernation image. + * @platform_mode: Whether or not to use the platform driver. + * + * Execute device drivers' "noirq" and "late" freeze callbacks, restore the + * contents of highmem that have not been restored yet from the image and run + * the low-level code that will restore the remaining contents of memory and + * switch to the just restored target kernel. + */ +static int resume_target_kernel(bool platform_mode) +{ + int error; + + error = dpm_suspend_end(PMSG_QUIESCE); + if (error) { + pr_err("Some devices failed to power down, aborting resume\n"); + return error; + } + + error = platform_pre_restore(platform_mode); + if (error) + goto Cleanup; + + cpuidle_pause(); + + error = hibernate_resume_nonboot_cpu_disable(); + if (error) + goto Enable_cpus; + + local_irq_disable(); + system_state = SYSTEM_SUSPEND; + + error = syscore_suspend(); + if (error) + goto Enable_irqs; + + save_processor_state(); + error = restore_highmem(); + if (!error) { + error = swsusp_arch_resume(); + /* + * The code below is only ever reached in case of a failure. + * Otherwise, execution continues at the place where + * swsusp_arch_suspend() was called. + */ + BUG_ON(!error); + /* + * This call to restore_highmem() reverts the changes made by + * the previous one. + */ + restore_highmem(); + } + /* + * The only reason why swsusp_arch_resume() can fail is memory being + * very tight, so we have to free it as soon as we can to avoid + * subsequent failures. + */ + swsusp_free(); + restore_processor_state(); + touch_softlockup_watchdog(); + + syscore_resume(); + + Enable_irqs: + system_state = SYSTEM_RUNNING; + local_irq_enable(); + + Enable_cpus: + pm_sleep_enable_secondary_cpus(); + + Cleanup: + platform_restore_cleanup(platform_mode); + + dpm_resume_start(PMSG_RECOVER); + + return error; +} + +/** + * hibernation_restore - Quiesce devices and restore from a hibernation image. + * @platform_mode: If set, use platform driver to prepare for the transition. + * + * This routine must be called with system_transition_mutex held. If it is + * successful, control reappears in the restored target kernel in + * hibernation_snapshot(). 
+ */ +int hibernation_restore(int platform_mode) +{ + int error; + + pm_prepare_console(); + suspend_console(); + pm_restrict_gfp_mask(); + error = dpm_suspend_start(PMSG_QUIESCE); + if (!error) { + error = resume_target_kernel(platform_mode); + /* + * The above should either succeed and jump to the new kernel, + * or return with an error. Otherwise things are just + * undefined, so let's be paranoid. + */ + BUG_ON(!error); + } + dpm_resume_end(PMSG_RECOVER); + pm_restore_gfp_mask(); + resume_console(); + pm_restore_console(); + return error; +} + +/** + * hibernation_platform_enter - Power off the system using the platform driver. + */ +int hibernation_platform_enter(void) +{ + int error; + + if (!hibernation_ops) + return -ENOSYS; + + /* + * We have cancelled the power transition by running + * hibernation_ops->finish() before saving the image, so we should let + * the firmware know that we're going to enter the sleep state after all + */ + error = hibernation_ops->begin(PMSG_HIBERNATE); + if (error) + goto Close; + + entering_platform_hibernation = true; + suspend_console(); + error = dpm_suspend_start(PMSG_HIBERNATE); + if (error) { + if (hibernation_ops->recover) + hibernation_ops->recover(); + goto Resume_devices; + } + + error = dpm_suspend_end(PMSG_HIBERNATE); + if (error) + goto Resume_devices; + + error = hibernation_ops->prepare(); + if (error) + goto Platform_finish; + + error = pm_sleep_disable_secondary_cpus(); + if (error) + goto Enable_cpus; + + local_irq_disable(); + system_state = SYSTEM_SUSPEND; + syscore_suspend(); + if (pm_wakeup_pending()) { + error = -EAGAIN; + goto Power_up; + } + + hibernation_ops->enter(); + /* We should never get here */ + while (1); + + Power_up: + syscore_resume(); + system_state = SYSTEM_RUNNING; + local_irq_enable(); + + Enable_cpus: + pm_sleep_enable_secondary_cpus(); + + Platform_finish: + hibernation_ops->finish(); + + dpm_resume_start(PMSG_RESTORE); + + Resume_devices: + entering_platform_hibernation = false; + dpm_resume_end(PMSG_RESTORE); + resume_console(); + + Close: + hibernation_ops->end(); + + return error; +} + +/** + * power_down - Shut the machine down for hibernation. + * + * Use the platform driver, if configured, to put the system into the sleep + * state corresponding to hibernation, or try to power it off or reboot, + * depending on the value of hibernation_mode. + */ +static void power_down(void) +{ +#ifdef CONFIG_SUSPEND + int error; + + if (hibernation_mode == HIBERNATION_SUSPEND) { + error = suspend_devices_and_enter(mem_sleep_current); + if (error) { + hibernation_mode = hibernation_ops ? + HIBERNATION_PLATFORM : + HIBERNATION_SHUTDOWN; + } else { + /* Restore swap signature. */ + error = swsusp_unmark(); + if (error) + pr_err("Swap will be unusable! Try swapon -a.\n"); + + return; + } + } +#endif + + switch (hibernation_mode) { + case HIBERNATION_REBOOT: + kernel_restart(NULL); + break; + case HIBERNATION_PLATFORM: + hibernation_platform_enter(); + fallthrough; + case HIBERNATION_SHUTDOWN: + if (kernel_can_power_off()) + kernel_power_off(); + break; + } + kernel_halt(); + /* + * Valid image is on the disk, if we continue we risk serious data + * corruption after resume. 
+ */ + pr_crit("Power down manually\n"); + while (1) + cpu_relax(); +} + +static int load_image_and_restore(bool snapshot_test) +{ + int error; + unsigned int flags; + + pm_pr_dbg("Loading hibernation image.\n"); + + lock_device_hotplug(); + error = create_basic_memory_bitmaps(); + if (error) { + swsusp_close(snapshot_test); + goto Unlock; + } + + error = swsusp_read(&flags); + swsusp_close(snapshot_test); + if (!error) + error = hibernation_restore(flags & SF_PLATFORM_MODE); + + pr_err("Failed to load image, recovering.\n"); + swsusp_free(); + free_basic_memory_bitmaps(); + Unlock: + unlock_device_hotplug(); + + return error; +} + +/** + * hibernate - Carry out system hibernation, including saving the image. + */ +int hibernate(void) +{ + bool snapshot_test = false; + unsigned int sleep_flags; + int error; + + if (!hibernation_available()) { + pm_pr_dbg("Hibernation not available.\n"); + return -EPERM; + } + + sleep_flags = lock_system_sleep(); + /* The snapshot device should not be opened while we're running */ + if (!hibernate_acquire()) { + error = -EBUSY; + goto Unlock; + } + + pr_info("hibernation entry\n"); + pm_prepare_console(); + error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); + if (error) + goto Restore; + + ksys_sync_helper(); + + error = freeze_processes(); + if (error) + goto Exit; + + lock_device_hotplug(); + /* Allocate memory management structures */ + error = create_basic_memory_bitmaps(); + if (error) + goto Thaw; + + error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); + if (error || freezer_test_done) + goto Free_bitmaps; + + if (in_suspend) { + unsigned int flags = 0; + + if (hibernation_mode == HIBERNATION_PLATFORM) + flags |= SF_PLATFORM_MODE; + if (nocompress) + flags |= SF_NOCOMPRESS_MODE; + else + flags |= SF_CRC32_MODE; + + pm_pr_dbg("Writing hibernation image.\n"); + error = swsusp_write(flags); + swsusp_free(); + if (!error) { + if (hibernation_mode == HIBERNATION_TEST_RESUME) + snapshot_test = true; + else + power_down(); + } + in_suspend = 0; + pm_restore_gfp_mask(); + } else { + pm_pr_dbg("Hibernation image restored successfully.\n"); + } + + Free_bitmaps: + free_basic_memory_bitmaps(); + Thaw: + unlock_device_hotplug(); + if (snapshot_test) { + pm_pr_dbg("Checking hibernation image\n"); + error = swsusp_check(false); + if (!error) + error = load_image_and_restore(false); + } + thaw_processes(); + + /* Don't bother checking whether freezer_test_done is true */ + freezer_test_done = false; + Exit: + pm_notifier_call_chain(PM_POST_HIBERNATION); + Restore: + pm_restore_console(); + hibernate_release(); + Unlock: + unlock_system_sleep(sleep_flags); + pr_info("hibernation exit\n"); + + return error; +} + +/** + * hibernate_quiet_exec - Execute a function with all devices frozen. + * @func: Function to execute. + * @data: Data pointer to pass to @func. + * + * Return the @func return value or an error code if it cannot be executed. 
+ */ +int hibernate_quiet_exec(int (*func)(void *data), void *data) +{ + unsigned int sleep_flags; + int error; + + sleep_flags = lock_system_sleep(); + + if (!hibernate_acquire()) { + error = -EBUSY; + goto unlock; + } + + pm_prepare_console(); + + error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); + if (error) + goto restore; + + error = freeze_processes(); + if (error) + goto exit; + + lock_device_hotplug(); + + pm_suspend_clear_flags(); + + error = platform_begin(true); + if (error) + goto thaw; + + error = freeze_kernel_threads(); + if (error) + goto thaw; + + error = dpm_prepare(PMSG_FREEZE); + if (error) + goto dpm_complete; + + suspend_console(); + + error = dpm_suspend(PMSG_FREEZE); + if (error) + goto dpm_resume; + + error = dpm_suspend_end(PMSG_FREEZE); + if (error) + goto dpm_resume; + + error = platform_pre_snapshot(true); + if (error) + goto skip; + + error = func(data); + +skip: + platform_finish(true); + + dpm_resume_start(PMSG_THAW); + +dpm_resume: + dpm_resume(PMSG_THAW); + + resume_console(); + +dpm_complete: + dpm_complete(PMSG_THAW); + + thaw_kernel_threads(); + +thaw: + platform_end(true); + + unlock_device_hotplug(); + + thaw_processes(); + +exit: + pm_notifier_call_chain(PM_POST_HIBERNATION); + +restore: + pm_restore_console(); + + hibernate_release(); + +unlock: + unlock_system_sleep(sleep_flags); + + return error; +} +EXPORT_SYMBOL_GPL(hibernate_quiet_exec); + +static int __init find_resume_device(void) +{ + if (!strlen(resume_file)) + return -ENOENT; + + pm_pr_dbg("Checking hibernation image partition %s\n", resume_file); + + if (resume_delay) { + pr_info("Waiting %dsec before reading resume device ...\n", + resume_delay); + ssleep(resume_delay); + } + + /* Check if the device is there */ + if (!early_lookup_bdev(resume_file, &swsusp_resume_device)) + return 0; + + /* + * Some device discovery might still be in progress; we need to wait for + * this to finish. 
+ */ + wait_for_device_probe(); + if (resume_wait) { + while (early_lookup_bdev(resume_file, &swsusp_resume_device)) + msleep(10); + async_synchronize_full(); + } + + return early_lookup_bdev(resume_file, &swsusp_resume_device); +} + +static int software_resume(void) +{ + int error; + + pm_pr_dbg("Hibernation image partition %d:%d present\n", + MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); + + pm_pr_dbg("Looking for hibernation image.\n"); + + mutex_lock(&system_transition_mutex); + error = swsusp_check(true); + if (error) + goto Unlock; + + /* The snapshot device should not be opened while we're running */ + if (!hibernate_acquire()) { + error = -EBUSY; + swsusp_close(true); + goto Unlock; + } + + pr_info("resume from hibernation\n"); + pm_prepare_console(); + error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE); + if (error) + goto Restore; + + pm_pr_dbg("Preparing processes for hibernation restore.\n"); + error = freeze_processes(); + if (error) + goto Close_Finish; + + error = freeze_kernel_threads(); + if (error) { + thaw_processes(); + goto Close_Finish; + } + + error = load_image_and_restore(true); + thaw_processes(); + Finish: + pm_notifier_call_chain(PM_POST_RESTORE); + Restore: + pm_restore_console(); + pr_info("resume failed (%d)\n", error); + hibernate_release(); + /* For success case, the suspend path will release the lock */ + Unlock: + mutex_unlock(&system_transition_mutex); + pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); + return error; + Close_Finish: + swsusp_close(true); + goto Finish; +} + +/** + * software_resume_initcall - Resume from a saved hibernation image. + * + * This routine is called as a late initcall, when all devices have been + * discovered and initialized already. + * + * The image reading code is called to see if there is a hibernation image + * available for reading. If that is the case, devices are quiesced and the + * contents of memory is restored from the saved image. + * + * If this is successful, control reappears in the restored target kernel in + * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine + * attempts to recover gracefully and make the kernel return to the normal mode + * of operation. + */ +static int __init software_resume_initcall(void) +{ + /* + * If the user said "noresume".. bail out early. + */ + if (noresume || !hibernation_available()) + return 0; + + if (!swsusp_resume_device) { + int error = find_resume_device(); + + if (error) + return error; + } + + return software_resume(); +} +late_initcall_sync(software_resume_initcall); + + +static const char * const hibernation_modes[] = { + [HIBERNATION_PLATFORM] = "platform", + [HIBERNATION_SHUTDOWN] = "shutdown", + [HIBERNATION_REBOOT] = "reboot", +#ifdef CONFIG_SUSPEND + [HIBERNATION_SUSPEND] = "suspend", +#endif + [HIBERNATION_TEST_RESUME] = "test_resume", +}; + +/* + * /sys/power/disk - Control hibernation mode. + * + * Hibernation can be handled in several ways. There are a few different ways + * to put the system into the sleep state: using the platform driver (e.g. ACPI + * or other hibernation_ops), powering it off or rebooting it (for testing + * mostly). + * + * The sysfs file /sys/power/disk provides an interface for selecting the + * hibernation mode to use. Reading from this file causes the available modes + * to be printed. 
There are 3 modes that can be supported: + * + * 'platform' + * 'shutdown' + * 'reboot' + * + * If a platform hibernation driver is in use, 'platform' will be supported + * and will be used by default. Otherwise, 'shutdown' will be used by default. + * The selected option (i.e. the one corresponding to the current value of + * hibernation_mode) is enclosed by a square bracket. + * + * To select a given hibernation mode it is necessary to write the mode's + * string representation (as returned by reading from /sys/power/disk) back + * into /sys/power/disk. + */ + +static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int i; + char *start = buf; + + if (!hibernation_available()) + return sprintf(buf, "[disabled]\n"); + + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (!hibernation_modes[i]) + continue; + switch (i) { + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: +#endif + case HIBERNATION_TEST_RESUME: + break; + case HIBERNATION_PLATFORM: + if (hibernation_ops) + break; + /* not a valid mode, continue with loop */ + continue; + } + if (i == hibernation_mode) + buf += sprintf(buf, "[%s] ", hibernation_modes[i]); + else + buf += sprintf(buf, "%s ", hibernation_modes[i]); + } + buf += sprintf(buf, "\n"); + return buf-start; +} + +static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int mode = HIBERNATION_INVALID; + unsigned int sleep_flags; + int error = 0; + int len; + char *p; + int i; + + if (!hibernation_available()) + return -EPERM; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + sleep_flags = lock_system_sleep(); + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (len == strlen(hibernation_modes[i]) + && !strncmp(buf, hibernation_modes[i], len)) { + mode = i; + break; + } + } + if (mode != HIBERNATION_INVALID) { + switch (mode) { + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: +#endif + case HIBERNATION_TEST_RESUME: + hibernation_mode = mode; + break; + case HIBERNATION_PLATFORM: + if (hibernation_ops) + hibernation_mode = mode; + else + error = -EINVAL; + } + } else + error = -EINVAL; + + if (!error) + pm_pr_dbg("Hibernation mode set to '%s'\n", + hibernation_modes[mode]); + unlock_system_sleep(sleep_flags); + return error ? 
error : n; +} + +power_attr(disk); + +static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d:%d\n", MAJOR(swsusp_resume_device), + MINOR(swsusp_resume_device)); +} + +static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned int sleep_flags; + int len = n; + char *name; + dev_t dev; + int error; + + if (!hibernation_available()) + return n; + + if (len && buf[len-1] == '\n') + len--; + name = kstrndup(buf, len, GFP_KERNEL); + if (!name) + return -ENOMEM; + + error = lookup_bdev(name, &dev); + if (error) { + unsigned maj, min, offset; + char *p, dummy; + + error = 0; + if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || + sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, + &dummy) == 3) { + dev = MKDEV(maj, min); + if (maj != MAJOR(dev) || min != MINOR(dev)) + error = -EINVAL; + } else { + dev = new_decode_dev(simple_strtoul(name, &p, 16)); + if (*p) + error = -EINVAL; + } + } + kfree(name); + if (error) + return error; + + sleep_flags = lock_system_sleep(); + swsusp_resume_device = dev; + unlock_system_sleep(sleep_flags); + + pm_pr_dbg("Configured hibernation resume from disk to %u\n", + swsusp_resume_device); + noresume = 0; + software_resume(); + return n; +} + +power_attr(resume); + +static ssize_t resume_offset_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long)swsusp_resume_block); +} + +static ssize_t resume_offset_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t n) +{ + unsigned long long offset; + int rc; + + rc = kstrtoull(buf, 0, &offset); + if (rc) + return rc; + swsusp_resume_block = offset; + + return n; +} + +power_attr(resume_offset); + +static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", image_size); +} + +static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long size; + + if (sscanf(buf, "%lu", &size) == 1) { + image_size = size; + return n; + } + + return -EINVAL; +} + +power_attr(image_size); + +static ssize_t reserved_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", reserved_size); +} + +static ssize_t reserved_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long size; + + if (sscanf(buf, "%lu", &size) == 1) { + reserved_size = size; + return n; + } + + return -EINVAL; +} + +power_attr(reserved_size); + +static struct attribute *g[] = { + &disk_attr.attr, + &resume_offset_attr.attr, + &resume_attr.attr, + &image_size_attr.attr, + &reserved_size_attr.attr, + NULL, +}; + + +static const struct attribute_group attr_group = { + .attrs = g, +}; + + +static int __init pm_disk_init(void) +{ + return sysfs_create_group(power_kobj, &attr_group); +} + +core_initcall(pm_disk_init); + + +static int __init resume_setup(char *str) +{ + if (noresume) + return 1; + + strncpy(resume_file, str, 255); + return 1; +} + +static int __init resume_offset_setup(char *str) +{ + unsigned long long offset; + + if (noresume) + return 1; + + if (sscanf(str, "%llu", &offset) == 1) + swsusp_resume_block = offset; + + return 1; +} + +static int __init hibernate_setup(char *str) +{ + if (!strncmp(str, "noresume", 8)) { + noresume = 1; + } else if (!strncmp(str, "nocompress", 10)) { + nocompress = 1; + } 
else if (!strncmp(str, "no", 2)) { + noresume = 1; + nohibernate = 1; + } else if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) + && !strncmp(str, "protect_image", 13)) { + enable_restore_image_protection(); + } + return 1; +} + +static int __init noresume_setup(char *str) +{ + noresume = 1; + return 1; +} + +static int __init resumewait_setup(char *str) +{ + resume_wait = 1; + return 1; +} + +static int __init resumedelay_setup(char *str) +{ + int rc = kstrtouint(str, 0, &resume_delay); + + if (rc) + pr_warn("resumedelay: bad option string '%s'\n", str); + return 1; +} + +static int __init nohibernate_setup(char *str) +{ + noresume = 1; + nohibernate = 1; + return 1; +} + +__setup("noresume", noresume_setup); +__setup("resume_offset=", resume_offset_setup); +__setup("resume=", resume_setup); +__setup("hibernate=", hibernate_setup); +__setup("resumewait", resumewait_setup); +__setup("resumedelay=", resumedelay_setup); +__setup("nohibernate", nohibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c new file mode 100644 index 0000000000..f6425ae3e8 --- /dev/null +++ b/kernel/power/main.c @@ -0,0 +1,1008 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kernel/power/main.c - PM subsystem core functionality. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + */ + +#include <linux/acpi.h> +#include <linux/export.h> +#include <linux/kobject.h> +#include <linux/string.h> +#include <linux/pm-trace.h> +#include <linux/workqueue.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/suspend.h> +#include <linux/syscalls.h> +#include <linux/pm_runtime.h> + +#include "power.h" + +#ifdef CONFIG_PM_SLEEP +/* + * The following functions are used by the suspend/hibernate code to temporarily + * change gfp_allowed_mask in order to avoid using I/O during memory allocations + * while devices are suspended. To avoid races with the suspend/hibernate code, + * they should always be called with system_transition_mutex held + * (gfp_allowed_mask also should only be modified with system_transition_mutex + * held, unless the suspend/hibernate code is guaranteed not to run in parallel + * with that modification). + */ +static gfp_t saved_gfp_mask; + +void pm_restore_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + if (saved_gfp_mask) { + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + } +} + +void pm_restrict_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + WARN_ON(saved_gfp_mask); + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); +} + +unsigned int lock_system_sleep(void) +{ + unsigned int flags = current->flags; + current->flags |= PF_NOFREEZE; + mutex_lock(&system_transition_mutex); + return flags; +} +EXPORT_SYMBOL_GPL(lock_system_sleep); + +void unlock_system_sleep(unsigned int flags) +{ + /* + * Don't use freezer_count() because we don't want the call to + * try_to_freeze() here. + * + * Reason: + * Fundamentally, we just don't need it, because freezing condition + * doesn't come into effect until we release the + * system_transition_mutex lock, since the freezer always works with + * system_transition_mutex held. + * + * More importantly, in the case of hibernation, + * unlock_system_sleep() gets called in snapshot_read() and + * snapshot_write() when the freezing condition is still in effect. + * Which means, if we use try_to_freeze() here, it would make them + * enter the refrigerator, thus causing hibernation to lockup. 
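+ * Restore the PF_NOFREEZE bit saved by lock_system_sleep(): it is
+ * cleared below only if the caller did not have it set already.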
+ */ + if (!(flags & PF_NOFREEZE)) + current->flags &= ~PF_NOFREEZE; + mutex_unlock(&system_transition_mutex); +} +EXPORT_SYMBOL_GPL(unlock_system_sleep); + +void ksys_sync_helper(void) +{ + ktime_t start; + long elapsed_msecs; + + start = ktime_get(); + ksys_sync(); + elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start)); + pr_info("Filesystems sync: %ld.%03ld seconds\n", + elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC); +} +EXPORT_SYMBOL_GPL(ksys_sync_helper); + +/* Routines for PM-transition notifications */ + +static BLOCKING_NOTIFIER_HEAD(pm_chain_head); + +int register_pm_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&pm_chain_head, nb); +} +EXPORT_SYMBOL_GPL(register_pm_notifier); + +int unregister_pm_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&pm_chain_head, nb); +} +EXPORT_SYMBOL_GPL(unregister_pm_notifier); + +void pm_report_hw_sleep_time(u64 t) +{ + suspend_stats.last_hw_sleep = t; + suspend_stats.total_hw_sleep += t; +} +EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time); + +void pm_report_max_hw_sleep(u64 t) +{ + suspend_stats.max_hw_sleep = t; +} +EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep); + +int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down) +{ + int ret; + + ret = blocking_notifier_call_chain_robust(&pm_chain_head, val_up, val_down, NULL); + + return notifier_to_errno(ret); +} + +int pm_notifier_call_chain(unsigned long val) +{ + return blocking_notifier_call_chain(&pm_chain_head, val, NULL); +} + +/* If set, devices may be suspended and resumed asynchronously. */ +int pm_async_enabled = 1; + +static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", pm_async_enabled); +} + +static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_async_enabled = val; + return n; +} + +power_attr(pm_async); + +#ifdef CONFIG_SUSPEND +static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + char *s = buf; + suspend_state_t i; + + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) { + if (i >= PM_SUSPEND_MEM && cxl_mem_active()) + continue; + if (mem_sleep_states[i]) { + const char *label = mem_sleep_states[i]; + + if (mem_sleep_current == i) + s += sprintf(s, "[%s] ", label); + else + s += sprintf(s, "%s ", label); + } + } + + /* Convert the last space to a newline if needed. */ + if (s != buf) + *(s-1) = '\n'; + + return (s - buf); +} + +static suspend_state_t decode_suspend_state(const char *buf, size_t n) +{ + suspend_state_t state; + char *p; + int len; + + p = memchr(buf, '\n', n); + len = p ? 
p - buf : n; + + for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { + const char *label = mem_sleep_states[state]; + + if (label && len == strlen(label) && !strncmp(buf, label, len)) + return state; + } + + return PM_SUSPEND_ON; +} + +static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + suspend_state_t state; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + + state = decode_suspend_state(buf, n); + if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON) + mem_sleep_current = state; + else + error = -EINVAL; + + out: + pm_autosleep_unlock(); + return error ? error : n; +} + +power_attr(mem_sleep); + +/* + * sync_on_suspend: invoke ksys_sync_helper() before suspend. + * + * show() returns whether ksys_sync_helper() is invoked before suspend. + * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it. + */ +bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); + +static ssize_t sync_on_suspend_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", sync_on_suspend_enabled); +} + +static ssize_t sync_on_suspend_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + sync_on_suspend_enabled = !!val; + return n; +} + +power_attr(sync_on_suspend); +#endif /* CONFIG_SUSPEND */ + +#ifdef CONFIG_PM_SLEEP_DEBUG +int pm_test_level = TEST_NONE; + +static const char * const pm_tests[__TEST_AFTER_LAST] = { + [TEST_NONE] = "none", + [TEST_CORE] = "core", + [TEST_CPUS] = "processors", + [TEST_PLATFORM] = "platform", + [TEST_DEVICES] = "devices", + [TEST_FREEZER] = "freezer", +}; + +static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + char *s = buf; + int level; + + for (level = TEST_FIRST; level <= TEST_MAX; level++) + if (pm_tests[level]) { + if (level == pm_test_level) + s += sprintf(s, "[%s] ", pm_tests[level]); + else + s += sprintf(s, "%s ", pm_tests[level]); + } + + if (s != buf) + /* convert the last space to a newline */ + *(s-1) = '\n'; + + return (s - buf); +} + +static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned int sleep_flags; + const char * const *s; + int error = -EINVAL; + int level; + char *p; + int len; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + sleep_flags = lock_system_sleep(); + + level = TEST_FIRST; + for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) + if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { + pm_test_level = level; + error = 0; + break; + } + + unlock_system_sleep(sleep_flags); + + return error ? 
error : n; +} + +power_attr(pm_test); +#endif /* CONFIG_PM_SLEEP_DEBUG */ + +static char *suspend_step_name(enum suspend_stat_step step) +{ + switch (step) { + case SUSPEND_FREEZE: + return "freeze"; + case SUSPEND_PREPARE: + return "prepare"; + case SUSPEND_SUSPEND: + return "suspend"; + case SUSPEND_SUSPEND_NOIRQ: + return "suspend_noirq"; + case SUSPEND_RESUME_NOIRQ: + return "resume_noirq"; + case SUSPEND_RESUME: + return "resume"; + default: + return ""; + } +} + +#define suspend_attr(_name, format_str) \ +static ssize_t _name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + return sprintf(buf, format_str, suspend_stats._name); \ +} \ +static struct kobj_attribute _name = __ATTR_RO(_name) + +suspend_attr(success, "%d\n"); +suspend_attr(fail, "%d\n"); +suspend_attr(failed_freeze, "%d\n"); +suspend_attr(failed_prepare, "%d\n"); +suspend_attr(failed_suspend, "%d\n"); +suspend_attr(failed_suspend_late, "%d\n"); +suspend_attr(failed_suspend_noirq, "%d\n"); +suspend_attr(failed_resume, "%d\n"); +suspend_attr(failed_resume_early, "%d\n"); +suspend_attr(failed_resume_noirq, "%d\n"); +suspend_attr(last_hw_sleep, "%llu\n"); +suspend_attr(total_hw_sleep, "%llu\n"); +suspend_attr(max_hw_sleep, "%llu\n"); + +static ssize_t last_failed_dev_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int index; + char *last_failed_dev = NULL; + + index = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + index %= REC_FAILED_NUM; + last_failed_dev = suspend_stats.failed_devs[index]; + + return sprintf(buf, "%s\n", last_failed_dev); +} +static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev); + +static ssize_t last_failed_errno_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int index; + int last_failed_errno; + + index = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; + index %= REC_FAILED_NUM; + last_failed_errno = suspend_stats.errno[index]; + + return sprintf(buf, "%d\n", last_failed_errno); +} +static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno); + +static ssize_t last_failed_step_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int index; + enum suspend_stat_step step; + char *last_failed_step = NULL; + + index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; + index %= REC_FAILED_NUM; + step = suspend_stats.failed_steps[index]; + last_failed_step = suspend_step_name(step); + + return sprintf(buf, "%s\n", last_failed_step); +} +static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step); + +static struct attribute *suspend_attrs[] = { + &success.attr, + &fail.attr, + &failed_freeze.attr, + &failed_prepare.attr, + &failed_suspend.attr, + &failed_suspend_late.attr, + &failed_suspend_noirq.attr, + &failed_resume.attr, + &failed_resume_early.attr, + &failed_resume_noirq.attr, + &last_failed_dev.attr, + &last_failed_errno.attr, + &last_failed_step.attr, + &last_hw_sleep.attr, + &total_hw_sleep.attr, + &max_hw_sleep.attr, + NULL, +}; + +static umode_t suspend_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx) +{ + if (attr != &last_hw_sleep.attr && + attr != &total_hw_sleep.attr && + attr != &max_hw_sleep.attr) + return 0444; + +#ifdef CONFIG_ACPI + if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) + return 0444; +#endif + return 0; +} + +static const struct attribute_group suspend_attr_group = { + .name = "suspend_stats", + .attrs = suspend_attrs, + .is_visible = suspend_attr_is_visible, +}; + +#ifdef 
CONFIG_DEBUG_FS +static int suspend_stats_show(struct seq_file *s, void *unused) +{ + int i, index, last_dev, last_errno, last_step; + + last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + last_dev %= REC_FAILED_NUM; + last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; + last_errno %= REC_FAILED_NUM; + last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; + last_step %= REC_FAILED_NUM; + seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" + "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", + "success", suspend_stats.success, + "fail", suspend_stats.fail, + "failed_freeze", suspend_stats.failed_freeze, + "failed_prepare", suspend_stats.failed_prepare, + "failed_suspend", suspend_stats.failed_suspend, + "failed_suspend_late", + suspend_stats.failed_suspend_late, + "failed_suspend_noirq", + suspend_stats.failed_suspend_noirq, + "failed_resume", suspend_stats.failed_resume, + "failed_resume_early", + suspend_stats.failed_resume_early, + "failed_resume_noirq", + suspend_stats.failed_resume_noirq); + seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", + suspend_stats.failed_devs[last_dev]); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_dev + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-s\n", + suspend_stats.failed_devs[index]); + } + seq_printf(s, " last_failed_errno:\t%-d\n", + suspend_stats.errno[last_errno]); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_errno + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-d\n", + suspend_stats.errno[index]); + } + seq_printf(s, " last_failed_step:\t%-s\n", + suspend_step_name( + suspend_stats.failed_steps[last_step])); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_step + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-s\n", + suspend_step_name( + suspend_stats.failed_steps[index])); + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(suspend_stats); + +static int __init pm_debugfs_init(void) +{ + debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, + NULL, NULL, &suspend_stats_fops); + return 0; +} + +late_initcall(pm_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ + +#endif /* CONFIG_PM_SLEEP */ + +#ifdef CONFIG_PM_SLEEP_DEBUG +/* + * pm_print_times: print time taken by devices to suspend and resume. + * + * show() returns whether printing of suspend and resume times is enabled. + * store() accepts 0 or 1. 0 disables printing and 1 enables it. 
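+ * For example, writing 1 to /sys/power/pm_print_times enables the
+ * messages and writing 0 disables them again.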
+ */ +bool pm_print_times_enabled; + +static ssize_t pm_print_times_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", pm_print_times_enabled); +} + +static ssize_t pm_print_times_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_print_times_enabled = !!val; + return n; +} + +power_attr(pm_print_times); + +static inline void pm_print_times_init(void) +{ + pm_print_times_enabled = !!initcall_debug; +} + +static ssize_t pm_wakeup_irq_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + if (!pm_wakeup_irq()) + return -ENODATA; + + return sprintf(buf, "%u\n", pm_wakeup_irq()); +} + +power_attr_ro(pm_wakeup_irq); + +bool pm_debug_messages_on __read_mostly; + +bool pm_debug_messages_should_print(void) +{ + return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON; +} +EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); + +static ssize_t pm_debug_messages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", pm_debug_messages_on); +} + +static ssize_t pm_debug_messages_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_debug_messages_on = !!val; + return n; +} + +power_attr(pm_debug_messages); + +static int __init pm_debug_messages_setup(char *str) +{ + pm_debug_messages_on = true; + return 1; +} +__setup("pm_debug_messages", pm_debug_messages_setup); + +#else /* !CONFIG_PM_SLEEP_DEBUG */ +static inline void pm_print_times_init(void) {} +#endif /* CONFIG_PM_SLEEP_DEBUG */ + +struct kobject *power_kobj; + +/* + * state - control system sleep states. + * + * show() returns available sleep state labels, which may be "mem", "standby", + * "freeze" and "disk" (hibernation). + * See Documentation/admin-guide/pm/sleep-states.rst for a description of + * what they mean. + * + * store() accepts one of those strings, translates it into the proper + * enumerated value, and initiates a suspend transition. + */ +static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + char *s = buf; +#ifdef CONFIG_SUSPEND + suspend_state_t i; + + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (pm_states[i]) + s += sprintf(s,"%s ", pm_states[i]); + +#endif + if (hibernation_available()) + s += sprintf(s, "disk "); + if (s != buf) + /* convert the last space to a newline */ + *(s-1) = '\n'; + return (s - buf); +} + +static suspend_state_t decode_state(const char *buf, size_t n) +{ +#ifdef CONFIG_SUSPEND + suspend_state_t state; +#endif + char *p; + int len; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + /* Check hibernation first. 
*/ + if (len == 4 && str_has_prefix(buf, "disk")) + return PM_SUSPEND_MAX; + +#ifdef CONFIG_SUSPEND + for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { + const char *label = pm_states[state]; + + if (label && len == strlen(label) && !strncmp(buf, label, len)) + return state; + } +#endif + + return PM_SUSPEND_ON; +} + +static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + suspend_state_t state; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + + state = decode_state(buf, n); + if (state < PM_SUSPEND_MAX) { + if (state == PM_SUSPEND_MEM) + state = mem_sleep_current; + + error = pm_suspend(state); + } else if (state == PM_SUSPEND_MAX) { + error = hibernate(); + } else { + error = -EINVAL; + } + + out: + pm_autosleep_unlock(); + return error ? error : n; +} + +power_attr(state); + +#ifdef CONFIG_PM_SLEEP +/* + * The 'wakeup_count' attribute, along with the functions defined in + * drivers/base/power/wakeup.c, provides a means by which wakeup events can be + * handled in a non-racy way. + * + * If a wakeup event occurs when the system is in a sleep state, it simply is + * woken up. In turn, if an event that would wake the system up from a sleep + * state occurs when it is undergoing a transition to that sleep state, the + * transition should be aborted. Moreover, if such an event occurs when the + * system is in the working state, an attempt to start a transition to the + * given sleep state should fail during certain period after the detection of + * the event. Using the 'state' attribute alone is not sufficient to satisfy + * these requirements, because a wakeup event may occur exactly when 'state' + * is being written to and may be delivered to user space right before it is + * frozen, so the event will remain only partially processed until the system is + * woken up by another event. In particular, it won't cause the transition to + * a sleep state to be aborted. + * + * This difficulty may be overcome if user space uses 'wakeup_count' before + * writing to 'state'. It first should read from 'wakeup_count' and store + * the read value. Then, after carrying out its own preparations for the system + * transition to a sleep state, it should write the stored value to + * 'wakeup_count'. If that fails, at least one wakeup event has occurred since + * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it + * is allowed to write to 'state', but the transition will be aborted if there + * are any wakeup events detected after 'wakeup_count' was written to. + */ + +static ssize_t wakeup_count_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + unsigned int val; + + return pm_get_wakeup_count(&val, true) ? 
+ sprintf(buf, "%u\n", val) : -EINTR; +} + +static ssize_t wakeup_count_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned int val; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + + error = -EINVAL; + if (sscanf(buf, "%u", &val) == 1) { + if (pm_save_wakeup_count(val)) + error = n; + else + pm_print_active_wakeup_sources(); + } + + out: + pm_autosleep_unlock(); + return error; +} + +power_attr(wakeup_count); + +#ifdef CONFIG_PM_AUTOSLEEP +static ssize_t autosleep_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + suspend_state_t state = pm_autosleep_state(); + + if (state == PM_SUSPEND_ON) + return sprintf(buf, "off\n"); + +#ifdef CONFIG_SUSPEND + if (state < PM_SUSPEND_MAX) + return sprintf(buf, "%s\n", pm_states[state] ? + pm_states[state] : "error"); +#endif +#ifdef CONFIG_HIBERNATION + return sprintf(buf, "disk\n"); +#else + return sprintf(buf, "error"); +#endif +} + +static ssize_t autosleep_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + suspend_state_t state = decode_state(buf, n); + int error; + + if (state == PM_SUSPEND_ON + && strcmp(buf, "off") && strcmp(buf, "off\n")) + return -EINVAL; + + if (state == PM_SUSPEND_MEM) + state = mem_sleep_current; + + error = pm_autosleep_set_state(state); + return error ? error : n; +} + +power_attr(autosleep); +#endif /* CONFIG_PM_AUTOSLEEP */ + +#ifdef CONFIG_PM_WAKELOCKS +static ssize_t wake_lock_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return pm_show_wakelocks(buf, true); +} + +static ssize_t wake_lock_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = pm_wake_lock(buf); + return error ? error : n; +} + +power_attr(wake_lock); + +static ssize_t wake_unlock_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return pm_show_wakelocks(buf, false); +} + +static ssize_t wake_unlock_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = pm_wake_unlock(buf); + return error ? 
error : n; +} + +power_attr(wake_unlock); + +#endif /* CONFIG_PM_WAKELOCKS */ +#endif /* CONFIG_PM_SLEEP */ + +#ifdef CONFIG_PM_TRACE +int pm_trace_enabled; + +static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", pm_trace_enabled); +} + +static ssize_t +pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int val; + + if (sscanf(buf, "%d", &val) == 1) { + pm_trace_enabled = !!val; + if (pm_trace_enabled) { + pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n" + "PM: Correct system time has to be restored manually after resume.\n"); + } + return n; + } + return -EINVAL; +} + +power_attr(pm_trace); + +static ssize_t pm_trace_dev_match_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return show_trace_dev_match(buf, PAGE_SIZE); +} + +power_attr_ro(pm_trace_dev_match); + +#endif /* CONFIG_PM_TRACE */ + +#ifdef CONFIG_FREEZER +static ssize_t pm_freeze_timeout_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", freeze_timeout_msecs); +} + +static ssize_t pm_freeze_timeout_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + freeze_timeout_msecs = val; + return n; +} + +power_attr(pm_freeze_timeout); + +#endif /* CONFIG_FREEZER*/ + +static struct attribute * g[] = { + &state_attr.attr, +#ifdef CONFIG_PM_TRACE + &pm_trace_attr.attr, + &pm_trace_dev_match_attr.attr, +#endif +#ifdef CONFIG_PM_SLEEP + &pm_async_attr.attr, + &wakeup_count_attr.attr, +#ifdef CONFIG_SUSPEND + &mem_sleep_attr.attr, + &sync_on_suspend_attr.attr, +#endif +#ifdef CONFIG_PM_AUTOSLEEP + &autosleep_attr.attr, +#endif +#ifdef CONFIG_PM_WAKELOCKS + &wake_lock_attr.attr, + &wake_unlock_attr.attr, +#endif +#ifdef CONFIG_PM_SLEEP_DEBUG + &pm_test_attr.attr, + &pm_print_times_attr.attr, + &pm_wakeup_irq_attr.attr, + &pm_debug_messages_attr.attr, +#endif +#endif +#ifdef CONFIG_FREEZER + &pm_freeze_timeout_attr.attr, +#endif + NULL, +}; + +static const struct attribute_group attr_group = { + .attrs = g, +}; + +static const struct attribute_group *attr_groups[] = { + &attr_group, +#ifdef CONFIG_PM_SLEEP + &suspend_attr_group, +#endif + NULL, +}; + +struct workqueue_struct *pm_wq; +EXPORT_SYMBOL_GPL(pm_wq); + +static int __init pm_start_workqueue(void) +{ + pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); + + return pm_wq ? 
0 : -ENOMEM; +} + +static int __init pm_init(void) +{ + int error = pm_start_workqueue(); + if (error) + return error; + hibernate_image_size_init(); + hibernate_reserved_size_init(); + pm_states_init(); + power_kobj = kobject_create_and_add("power", NULL); + if (!power_kobj) + return -ENOMEM; + error = sysfs_create_groups(power_kobj, attr_groups); + if (error) + return error; + pm_print_times_init(); + return pm_autosleep_init(); +} + +core_initcall(pm_init); diff --git a/kernel/power/power.h b/kernel/power/power.h new file mode 100644 index 0000000000..a98f95e309 --- /dev/null +++ b/kernel/power/power.h @@ -0,0 +1,327 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/suspend.h> +#include <linux/suspend_ioctls.h> +#include <linux/utsname.h> +#include <linux/freezer.h> +#include <linux/compiler.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> + +struct swsusp_info { + struct new_utsname uts; + u32 version_code; + unsigned long num_physpages; + int cpus; + unsigned long image_pages; + unsigned long pages; + unsigned long size; +} __aligned(PAGE_SIZE); + +#ifdef CONFIG_HIBERNATION +/* kernel/power/snapshot.c */ +extern void __init hibernate_reserved_size_init(void); +extern void __init hibernate_image_size_init(void); + +#ifdef CONFIG_ARCH_HIBERNATION_HEADER +/* Maximum size of architecture specific data in a hibernation header */ +#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) + +static inline int init_header_complete(struct swsusp_info *info) +{ + return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE); +} + +static inline const char *check_image_kernel(struct swsusp_info *info) +{ + return arch_hibernation_header_restore(info) ? + "architecture specific data" : NULL; +} +#endif /* CONFIG_ARCH_HIBERNATION_HEADER */ + +/* + * Keep some memory free so that I/O operations can succeed without paging + * [Might this be more than 4 MB?] + */ +#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT) + +/* + * Keep 1 MB of memory free so that device drivers can allocate some pages in + * their .suspend() routines without breaking the suspend to disk. 
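+ * (With 4 KiB pages this evaluates to 256 spare pages.)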
+ */ +#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) + +asmlinkage int swsusp_save(void); + +/* kernel/power/hibernate.c */ +extern bool freezer_test_done; + +extern int hibernation_snapshot(int platform_mode); +extern int hibernation_restore(int platform_mode); +extern int hibernation_platform_enter(void); + +#ifdef CONFIG_STRICT_KERNEL_RWX +/* kernel/power/snapshot.c */ +extern void enable_restore_image_protection(void); +#else +static inline void enable_restore_image_protection(void) {} +#endif /* CONFIG_STRICT_KERNEL_RWX */ + +#else /* !CONFIG_HIBERNATION */ + +static inline void hibernate_reserved_size_init(void) {} +static inline void hibernate_image_size_init(void) {} +#endif /* !CONFIG_HIBERNATION */ + +#define power_attr(_name) \ +static struct kobj_attribute _name##_attr = { \ + .attr = { \ + .name = __stringify(_name), \ + .mode = 0644, \ + }, \ + .show = _name##_show, \ + .store = _name##_store, \ +} + +#define power_attr_ro(_name) \ +static struct kobj_attribute _name##_attr = { \ + .attr = { \ + .name = __stringify(_name), \ + .mode = S_IRUGO, \ + }, \ + .show = _name##_show, \ +} + +/* Preferred image size in bytes (default 500 MB) */ +extern unsigned long image_size; +/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ +extern unsigned long reserved_size; +extern int in_suspend; +extern dev_t swsusp_resume_device; +extern sector_t swsusp_resume_block; + +extern int create_basic_memory_bitmaps(void); +extern void free_basic_memory_bitmaps(void); +extern int hibernate_preallocate_memory(void); + +extern void clear_or_poison_free_pages(void); + +/** + * Auxiliary structure used for reading the snapshot image data and + * metadata from and writing them to the list of page backup entries + * (PBEs) which is the main data structure of swsusp. + * + * Using struct snapshot_handle we can transfer the image, including its + * metadata, as a continuous sequence of bytes with the help of + * snapshot_read_next() and snapshot_write_next(). + * + * The code that writes the image to a storage or transfers it to + * the user land is required to use snapshot_read_next() for this + * purpose and it should not make any assumptions regarding the internal + * structure of the image. Similarly, the code that reads the image from + * a storage or transfers it from the user land is required to use + * snapshot_write_next(). + * + * This may allow us to change the internal structure of the image + * in the future with considerably less effort. + */ + +struct snapshot_handle { + unsigned int cur; /* number of the block of PAGE_SIZE bytes the + * next operation will refer to (ie. 
current) + */ + void *buffer; /* address of the block to read from + * or write to + */ + int sync_read; /* Set to one to notify the caller of + * snapshot_write_next() that it may + * need to call wait_on_bio_chain() + */ +}; + +/* This macro returns the address from/to which the caller of + * snapshot_read_next()/snapshot_write_next() is allowed to + * read/write data after the function returns + */ +#define data_of(handle) ((handle).buffer) + +extern unsigned int snapshot_additional_pages(struct zone *zone); +extern unsigned long snapshot_get_image_size(void); +extern int snapshot_read_next(struct snapshot_handle *handle); +extern int snapshot_write_next(struct snapshot_handle *handle); +extern void snapshot_write_finalize(struct snapshot_handle *handle); +extern int snapshot_image_loaded(struct snapshot_handle *handle); + +extern bool hibernate_acquire(void); +extern void hibernate_release(void); + +extern sector_t alloc_swapdev_block(int swap); +extern void free_all_swap_pages(int swap); +extern int swsusp_swap_in_use(void); + +/* + * Flags that can be passed from the hibernatig hernel to the "boot" kernel in + * the image header. + */ +#define SF_PLATFORM_MODE 1 +#define SF_NOCOMPRESS_MODE 2 +#define SF_CRC32_MODE 4 +#define SF_HW_SIG 8 + +/* kernel/power/hibernate.c */ +int swsusp_check(bool exclusive); +extern void swsusp_free(void); +extern int swsusp_read(unsigned int *flags_p); +extern int swsusp_write(unsigned int flags); +void swsusp_close(bool exclusive); +#ifdef CONFIG_SUSPEND +extern int swsusp_unmark(void); +#endif + +struct __kernel_old_timeval; +/* kernel/power/swsusp.c */ +extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); + +#ifdef CONFIG_SUSPEND +/* kernel/power/suspend.c */ +extern const char * const pm_labels[]; +extern const char *pm_states[]; +extern const char *mem_sleep_states[]; + +extern int suspend_devices_and_enter(suspend_state_t state); +#else /* !CONFIG_SUSPEND */ +#define mem_sleep_current PM_SUSPEND_ON + +static inline int suspend_devices_and_enter(suspend_state_t state) +{ + return -ENOSYS; +} +#endif /* !CONFIG_SUSPEND */ + +#ifdef CONFIG_PM_TEST_SUSPEND +/* kernel/power/suspend_test.c */ +extern void suspend_test_start(void); +extern void suspend_test_finish(const char *label); +#else /* !CONFIG_PM_TEST_SUSPEND */ +static inline void suspend_test_start(void) {} +static inline void suspend_test_finish(const char *label) {} +#endif /* !CONFIG_PM_TEST_SUSPEND */ + +#ifdef CONFIG_PM_SLEEP +/* kernel/power/main.c */ +extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down); +extern int pm_notifier_call_chain(unsigned long val); +void pm_restrict_gfp_mask(void); +void pm_restore_gfp_mask(void); +#else +static inline void pm_restrict_gfp_mask(void) {} +static inline void pm_restore_gfp_mask(void) {} +#endif + +#ifdef CONFIG_HIGHMEM +int restore_highmem(void); +#else +static inline unsigned int count_highmem_pages(void) { return 0; } +static inline int restore_highmem(void) { return 0; } +#endif + +/* + * Suspend test levels + */ +enum { + /* keep first */ + TEST_NONE, + TEST_CORE, + TEST_CPUS, + TEST_PLATFORM, + TEST_DEVICES, + TEST_FREEZER, + /* keep last */ + __TEST_AFTER_LAST +}; + +#define TEST_FIRST TEST_NONE +#define TEST_MAX (__TEST_AFTER_LAST - 1) + +#ifdef CONFIG_PM_SLEEP_DEBUG +extern int pm_test_level; +#else +#define pm_test_level (TEST_NONE) +#endif + +#ifdef CONFIG_SUSPEND_FREEZER +static inline int suspend_freeze_processes(void) +{ + int error; + + error = freeze_processes(); + /* + * 
freeze_processes() automatically thaws every task if freezing + * fails. So we need not do anything extra upon error. + */ + if (error) + return error; + + error = freeze_kernel_threads(); + /* + * freeze_kernel_threads() thaws only kernel threads upon freezing + * failure. So we have to thaw the userspace tasks ourselves. + */ + if (error) + thaw_processes(); + + return error; +} + +static inline void suspend_thaw_processes(void) +{ + thaw_processes(); +} +#else +static inline int suspend_freeze_processes(void) +{ + return 0; +} + +static inline void suspend_thaw_processes(void) +{ +} +#endif + +#ifdef CONFIG_PM_AUTOSLEEP + +/* kernel/power/autosleep.c */ +extern int pm_autosleep_init(void); +extern int pm_autosleep_lock(void); +extern void pm_autosleep_unlock(void); +extern suspend_state_t pm_autosleep_state(void); +extern int pm_autosleep_set_state(suspend_state_t state); + +#else /* !CONFIG_PM_AUTOSLEEP */ + +static inline int pm_autosleep_init(void) { return 0; } +static inline int pm_autosleep_lock(void) { return 0; } +static inline void pm_autosleep_unlock(void) {} +static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; } + +#endif /* !CONFIG_PM_AUTOSLEEP */ + +#ifdef CONFIG_PM_WAKELOCKS + +/* kernel/power/wakelock.c */ +extern ssize_t pm_show_wakelocks(char *buf, bool show_active); +extern int pm_wake_lock(const char *buf); +extern int pm_wake_unlock(const char *buf); + +#endif /* !CONFIG_PM_WAKELOCKS */ + +static inline int pm_sleep_disable_secondary_cpus(void) +{ + cpuidle_pause(); + return suspend_disable_secondary_cpus(); +} + +static inline void pm_sleep_enable_secondary_cpus(void) +{ + suspend_enable_secondary_cpus(); + cpuidle_resume(); +} diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c new file mode 100644 index 0000000000..1f306f1586 --- /dev/null +++ b/kernel/power/poweroff.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * poweroff.c - sysrq handler to gracefully power down machine. + */ + +#include <linux/kernel.h> +#include <linux/sysrq.h> +#include <linux/init.h> +#include <linux/pm.h> +#include <linux/workqueue.h> +#include <linux/reboot.h> +#include <linux/cpumask.h> + +/* + * When the user hits Sys-Rq o to power down the machine this is the + * callback we use. + */ + +static void do_poweroff(struct work_struct *dummy) +{ + kernel_power_off(); +} + +static DECLARE_WORK(poweroff_work, do_poweroff); + +static void handle_poweroff(u8 key) +{ + /* run sysrq poweroff on boot cpu */ + schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); +} + +static const struct sysrq_key_op sysrq_poweroff_op = { + .handler = handle_poweroff, + .help_msg = "poweroff(o)", + .action_msg = "Power Off", + .enable_mask = SYSRQ_ENABLE_BOOT, +}; + +static int __init pm_sysrq_init(void) +{ + register_sysrq_key('o', &sysrq_poweroff_op); + return 0; +} + +subsys_initcall(pm_sysrq_init); diff --git a/kernel/power/process.c b/kernel/power/process.c new file mode 100644 index 0000000000..cae81a87cc --- /dev/null +++ b/kernel/power/process.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/power/process.c - Functions for starting/stopping processes on + * suspend transitions. + * + * Originally from swsusp. 
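+ *
+ * Provides freeze_processes(), freeze_kernel_threads() and the matching
+ * thaw helpers used by both the suspend and the hibernation code paths.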
+ */ + +#include <linux/interrupt.h> +#include <linux/oom.h> +#include <linux/suspend.h> +#include <linux/module.h> +#include <linux/sched/debug.h> +#include <linux/sched/task.h> +#include <linux/syscalls.h> +#include <linux/freezer.h> +#include <linux/delay.h> +#include <linux/workqueue.h> +#include <linux/kmod.h> +#include <trace/events/power.h> +#include <linux/cpuset.h> + +/* + * Timeout for stopping processes + */ +unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; + +static int try_to_freeze_tasks(bool user_only) +{ + const char *what = user_only ? "user space processes" : + "remaining freezable tasks"; + struct task_struct *g, *p; + unsigned long end_time; + unsigned int todo; + bool wq_busy = false; + ktime_t start, end, elapsed; + unsigned int elapsed_msecs; + bool wakeup = false; + int sleep_usecs = USEC_PER_MSEC; + + pr_info("Freezing %s\n", what); + + start = ktime_get_boottime(); + + end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); + + if (!user_only) + freeze_workqueues_begin(); + + while (true) { + todo = 0; + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + if (p == current || !freeze_task(p)) + continue; + + todo++; + } + read_unlock(&tasklist_lock); + + if (!user_only) { + wq_busy = freeze_workqueues_busy(); + todo += wq_busy; + } + + if (!todo || time_after(jiffies, end_time)) + break; + + if (pm_wakeup_pending()) { + wakeup = true; + break; + } + + /* + * We need to retry, but first give the freezing tasks some + * time to enter the refrigerator. Start with an initial + * 1 ms sleep followed by exponential backoff until 8 ms. + */ + usleep_range(sleep_usecs / 2, sleep_usecs); + if (sleep_usecs < 8 * USEC_PER_MSEC) + sleep_usecs *= 2; + } + + end = ktime_get_boottime(); + elapsed = ktime_sub(end, start); + elapsed_msecs = ktime_to_ms(elapsed); + + if (todo) { + pr_err("Freezing %s %s after %d.%03d seconds " + "(%d tasks refusing to freeze, wq_busy=%d):\n", what, + wakeup ? "aborted" : "failed", + elapsed_msecs / 1000, elapsed_msecs % 1000, + todo - wq_busy, wq_busy); + + if (wq_busy) + show_freezable_workqueues(); + + if (!wakeup || pm_debug_messages_on) { + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + if (p != current && freezing(p) && !frozen(p)) + sched_show_task(p); + } + read_unlock(&tasklist_lock); + } + } else { + pr_info("Freezing %s completed (elapsed %d.%03d seconds)\n", + what, elapsed_msecs / 1000, elapsed_msecs % 1000); + } + + return todo ? -EBUSY : 0; +} + +/** + * freeze_processes - Signal user space processes to enter the refrigerator. + * The current thread will not be frozen. The same process that calls + * freeze_processes must later call thaw_processes. + * + * On success, returns 0. On failure, -errno and system is fully thawed. + */ +int freeze_processes(void) +{ + int error; + + error = __usermodehelper_disable(UMH_FREEZING); + if (error) + return error; + + /* Make sure this task doesn't get frozen */ + current->flags |= PF_SUSPEND_TASK; + + if (!pm_freezing) + static_branch_inc(&freezer_active); + + pm_wakeup_clear(0); + pm_freezing = true; + error = try_to_freeze_tasks(true); + if (!error) + __usermodehelper_set_disable_depth(UMH_DISABLED); + + BUG_ON(in_atomic()); + + /* + * Now that the whole userspace is frozen we need to disable + * the OOM killer to disallow any further interference with + * killable tasks. There is no guarantee oom victims will + * ever reach a point they go away we have to wait with a timeout. 
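+ * The wait below is bounded by freeze_timeout_msecs (20 seconds by
+ * default), the same timeout used when freezing tasks.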
+ */ + if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs))) + error = -EBUSY; + + if (error) + thaw_processes(); + return error; +} + +/** + * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. + * + * On success, returns 0. On failure, -errno and only the kernel threads are + * thawed, so as to give a chance to the caller to do additional cleanups + * (if any) before thawing the userspace tasks. So, it is the responsibility + * of the caller to thaw the userspace tasks, when the time is right. + */ +int freeze_kernel_threads(void) +{ + int error; + + pm_nosig_freezing = true; + error = try_to_freeze_tasks(false); + + BUG_ON(in_atomic()); + + if (error) + thaw_kernel_threads(); + return error; +} + +void thaw_processes(void) +{ + struct task_struct *g, *p; + struct task_struct *curr = current; + + trace_suspend_resume(TPS("thaw_processes"), 0, true); + if (pm_freezing) + static_branch_dec(&freezer_active); + pm_freezing = false; + pm_nosig_freezing = false; + + oom_killer_enable(); + + pr_info("Restarting tasks ... "); + + __usermodehelper_set_disable_depth(UMH_FREEZING); + thaw_workqueues(); + + cpuset_wait_for_hotplug(); + + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + /* No other threads should have PF_SUSPEND_TASK set */ + WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); + __thaw_task(p); + } + read_unlock(&tasklist_lock); + + WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); + curr->flags &= ~PF_SUSPEND_TASK; + + usermodehelper_enable(); + + schedule(); + pr_cont("done.\n"); + trace_suspend_resume(TPS("thaw_processes"), 0, false); +} + +void thaw_kernel_threads(void) +{ + struct task_struct *g, *p; + + pm_nosig_freezing = false; + pr_info("Restarting kernel threads ... "); + + thaw_workqueues(); + + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + if (p->flags & PF_KTHREAD) + __thaw_task(p); + } + read_unlock(&tasklist_lock); + + schedule(); + pr_cont("done.\n"); +} diff --git a/kernel/power/qos.c b/kernel/power/qos.c new file mode 100644 index 0000000000..4244b06944 --- /dev/null +++ b/kernel/power/qos.c @@ -0,0 +1,683 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Power Management Quality of Service (PM QoS) support base. + * + * Copyright (C) 2020 Intel Corporation + * + * Authors: + * Mark Gross <mgross@linux.intel.com> + * Rafael J. Wysocki <rafael.j.wysocki@intel.com> + * + * Provided here is an interface for specifying PM QoS dependencies. It allows + * entities depending on QoS constraints to register their requests which are + * aggregated as appropriate to produce effective constraints (target values) + * that can be monitored by entities needing to respect them, either by polling + * or through a built-in notification mechanism. + * + * In addition to the basic functionality, more specific interfaces for managing + * global CPU latency QoS requests and frequency QoS requests are provided. 
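+ *
+ * (In-kernel users of the CPU latency QoS go through the cpu_latency_qos_*()
+ * helpers defined below, while user space can keep the /dev/cpu_dma_latency
+ * misc device open; both paths are implemented in this file.)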
+ */ + +/*#define DEBUG*/ + +#include <linux/pm_qos.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/miscdevice.h> +#include <linux/string.h> +#include <linux/platform_device.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include <linux/uaccess.h> +#include <linux/export.h> +#include <trace/events/power.h> + +/* + * locking rule: all changes to constraints or notifiers lists + * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock + * held, taken with _irqsave. One lock to rule them all + */ +static DEFINE_SPINLOCK(pm_qos_lock); + +/** + * pm_qos_read_value - Return the current effective constraint value. + * @c: List of PM QoS constraint requests. + */ +s32 pm_qos_read_value(struct pm_qos_constraints *c) +{ + return READ_ONCE(c->target_value); +} + +static int pm_qos_get_value(struct pm_qos_constraints *c) +{ + if (plist_head_empty(&c->list)) + return c->no_constraint_value; + + switch (c->type) { + case PM_QOS_MIN: + return plist_first(&c->list)->prio; + + case PM_QOS_MAX: + return plist_last(&c->list)->prio; + + default: + WARN(1, "Unknown PM QoS type in %s\n", __func__); + return PM_QOS_DEFAULT_VALUE; + } +} + +static void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) +{ + WRITE_ONCE(c->target_value, value); +} + +/** + * pm_qos_update_target - Update a list of PM QoS constraint requests. + * @c: List of PM QoS requests. + * @node: Target list entry. + * @action: Action to carry out (add, update or remove). + * @value: New request value for the target list entry. + * + * Update the given list of PM QoS constraint requests, @c, by carrying an + * @action involving the @node list entry and @value on it. + * + * The recognized values of @action are PM_QOS_ADD_REQ (store @value in @node + * and add it to the list), PM_QOS_UPDATE_REQ (remove @node from the list, store + * @value in it and add it to the list again), and PM_QOS_REMOVE_REQ (remove + * @node from the list, ignore @value). + * + * Return: 1 if the aggregate constraint value has changed, 0 otherwise. + */ +int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, + enum pm_qos_req_action action, int value) +{ + int prev_value, curr_value, new_value; + unsigned long flags; + + spin_lock_irqsave(&pm_qos_lock, flags); + + prev_value = pm_qos_get_value(c); + if (value == PM_QOS_DEFAULT_VALUE) + new_value = c->default_value; + else + new_value = value; + + switch (action) { + case PM_QOS_REMOVE_REQ: + plist_del(node, &c->list); + break; + case PM_QOS_UPDATE_REQ: + /* + * To change the list, atomically remove, reinit with new value + * and add, then see if the aggregate has changed. + */ + plist_del(node, &c->list); + fallthrough; + case PM_QOS_ADD_REQ: + plist_node_init(node, new_value); + plist_add(node, &c->list); + break; + default: + /* no action */ + ; + } + + curr_value = pm_qos_get_value(c); + pm_qos_set_value(c, curr_value); + + spin_unlock_irqrestore(&pm_qos_lock, flags); + + trace_pm_qos_update_target(action, prev_value, curr_value); + + if (prev_value == curr_value) + return 0; + + if (c->notifiers) + blocking_notifier_call_chain(c->notifiers, curr_value, NULL); + + return 1; +} + +/** + * pm_qos_flags_remove_req - Remove device PM QoS flags request. + * @pqf: Device PM QoS flags set to remove the request from. + * @req: Request to remove from the set. 
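+ * The set's effective_flags value is recomputed as the bitwise OR of the
+ * flags of all requests remaining on the list.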
+ */ +static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf, + struct pm_qos_flags_request *req) +{ + s32 val = 0; + + list_del(&req->node); + list_for_each_entry(req, &pqf->list, node) + val |= req->flags; + + pqf->effective_flags = val; +} + +/** + * pm_qos_update_flags - Update a set of PM QoS flags. + * @pqf: Set of PM QoS flags to update. + * @req: Request to add to the set, to modify, or to remove from the set. + * @action: Action to take on the set. + * @val: Value of the request to add or modify. + * + * Return: 1 if the aggregate constraint value has changed, 0 otherwise. + */ +bool pm_qos_update_flags(struct pm_qos_flags *pqf, + struct pm_qos_flags_request *req, + enum pm_qos_req_action action, s32 val) +{ + unsigned long irqflags; + s32 prev_value, curr_value; + + spin_lock_irqsave(&pm_qos_lock, irqflags); + + prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; + + switch (action) { + case PM_QOS_REMOVE_REQ: + pm_qos_flags_remove_req(pqf, req); + break; + case PM_QOS_UPDATE_REQ: + pm_qos_flags_remove_req(pqf, req); + fallthrough; + case PM_QOS_ADD_REQ: + req->flags = val; + INIT_LIST_HEAD(&req->node); + list_add_tail(&req->node, &pqf->list); + pqf->effective_flags |= val; + break; + default: + /* no action */ + ; + } + + curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; + + spin_unlock_irqrestore(&pm_qos_lock, irqflags); + + trace_pm_qos_update_flags(action, prev_value, curr_value); + + return prev_value != curr_value; +} + +#ifdef CONFIG_CPU_IDLE +/* Definitions related to the CPU latency QoS. */ + +static struct pm_qos_constraints cpu_latency_constraints = { + .list = PLIST_HEAD_INIT(cpu_latency_constraints.list), + .target_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, + .default_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, + .type = PM_QOS_MIN, +}; + +static inline bool cpu_latency_qos_value_invalid(s32 value) +{ + return value < 0 && value != PM_QOS_DEFAULT_VALUE; +} + +/** + * cpu_latency_qos_limit - Return current system-wide CPU latency QoS limit. + */ +s32 cpu_latency_qos_limit(void) +{ + return pm_qos_read_value(&cpu_latency_constraints); +} + +/** + * cpu_latency_qos_request_active - Check the given PM QoS request. + * @req: PM QoS request to check. + * + * Return: 'true' if @req has been added to the CPU latency QoS list, 'false' + * otherwise. + */ +bool cpu_latency_qos_request_active(struct pm_qos_request *req) +{ + return req->qos == &cpu_latency_constraints; +} +EXPORT_SYMBOL_GPL(cpu_latency_qos_request_active); + +static void cpu_latency_qos_apply(struct pm_qos_request *req, + enum pm_qos_req_action action, s32 value) +{ + int ret = pm_qos_update_target(req->qos, &req->node, action, value); + if (ret > 0) + wake_up_all_idle_cpus(); +} + +/** + * cpu_latency_qos_add_request - Add new CPU latency QoS request. + * @req: Pointer to a preallocated handle. + * @value: Requested constraint value. + * + * Use @value to initialize the request handle pointed to by @req, insert it as + * a new entry to the CPU latency QoS list and recompute the effective QoS + * constraint for that list. + * + * Callers need to save the handle for later use in updates and removal of the + * QoS request represented by it. 
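+ *
+ * An illustrative usage pattern (the request name and the value are made
+ * up for this example):
+ *
+ *	static struct pm_qos_request my_qos_req;
+ *
+ *	cpu_latency_qos_add_request(&my_qos_req, 20);
+ *	...
+ *	cpu_latency_qos_remove_request(&my_qos_req);
+ *
+ * where 20 is the largest tolerable CPU exit latency, in microseconds for
+ * this interface.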
+ */ +void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value) +{ + if (!req || cpu_latency_qos_value_invalid(value)) + return; + + if (cpu_latency_qos_request_active(req)) { + WARN(1, KERN_ERR "%s called for already added request\n", __func__); + return; + } + + trace_pm_qos_add_request(value); + + req->qos = &cpu_latency_constraints; + cpu_latency_qos_apply(req, PM_QOS_ADD_REQ, value); +} +EXPORT_SYMBOL_GPL(cpu_latency_qos_add_request); + +/** + * cpu_latency_qos_update_request - Modify existing CPU latency QoS request. + * @req : QoS request to update. + * @new_value: New requested constraint value. + * + * Use @new_value to update the QoS request represented by @req in the CPU + * latency QoS list along with updating the effective constraint value for that + * list. + */ +void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value) +{ + if (!req || cpu_latency_qos_value_invalid(new_value)) + return; + + if (!cpu_latency_qos_request_active(req)) { + WARN(1, KERN_ERR "%s called for unknown object\n", __func__); + return; + } + + trace_pm_qos_update_request(new_value); + + if (new_value == req->node.prio) + return; + + cpu_latency_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); +} +EXPORT_SYMBOL_GPL(cpu_latency_qos_update_request); + +/** + * cpu_latency_qos_remove_request - Remove existing CPU latency QoS request. + * @req: QoS request to remove. + * + * Remove the CPU latency QoS request represented by @req from the CPU latency + * QoS list along with updating the effective constraint value for that list. + */ +void cpu_latency_qos_remove_request(struct pm_qos_request *req) +{ + if (!req) + return; + + if (!cpu_latency_qos_request_active(req)) { + WARN(1, KERN_ERR "%s called for unknown object\n", __func__); + return; + } + + trace_pm_qos_remove_request(PM_QOS_DEFAULT_VALUE); + + cpu_latency_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); + memset(req, 0, sizeof(*req)); +} +EXPORT_SYMBOL_GPL(cpu_latency_qos_remove_request); + +/* User space interface to the CPU latency QoS via misc device. 
*/ + +static int cpu_latency_qos_open(struct inode *inode, struct file *filp) +{ + struct pm_qos_request *req; + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + cpu_latency_qos_add_request(req, PM_QOS_DEFAULT_VALUE); + filp->private_data = req; + + return 0; +} + +static int cpu_latency_qos_release(struct inode *inode, struct file *filp) +{ + struct pm_qos_request *req = filp->private_data; + + filp->private_data = NULL; + + cpu_latency_qos_remove_request(req); + kfree(req); + + return 0; +} + +static ssize_t cpu_latency_qos_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + struct pm_qos_request *req = filp->private_data; + unsigned long flags; + s32 value; + + if (!req || !cpu_latency_qos_request_active(req)) + return -EINVAL; + + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(&cpu_latency_constraints); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); +} + +static ssize_t cpu_latency_qos_write(struct file *filp, const char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value; + + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else { + int ret; + + ret = kstrtos32_from_user(buf, count, 16, &value); + if (ret) + return ret; + } + + cpu_latency_qos_update_request(filp->private_data, value); + + return count; +} + +static const struct file_operations cpu_latency_qos_fops = { + .write = cpu_latency_qos_write, + .read = cpu_latency_qos_read, + .open = cpu_latency_qos_open, + .release = cpu_latency_qos_release, + .llseek = noop_llseek, +}; + +static struct miscdevice cpu_latency_qos_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cpu_dma_latency", + .fops = &cpu_latency_qos_fops, +}; + +static int __init cpu_latency_qos_init(void) +{ + int ret; + + ret = misc_register(&cpu_latency_qos_miscdev); + if (ret < 0) + pr_err("%s: %s setup failed\n", __func__, + cpu_latency_qos_miscdev.name); + + return ret; +} +late_initcall(cpu_latency_qos_init); +#endif /* CONFIG_CPU_IDLE */ + +/* Definitions related to the frequency QoS below. */ + +static inline bool freq_qos_value_invalid(s32 value) +{ + return value < 0 && value != PM_QOS_DEFAULT_VALUE; +} + +/** + * freq_constraints_init - Initialize frequency QoS constraints. + * @qos: Frequency QoS constraints to initialize. + */ +void freq_constraints_init(struct freq_constraints *qos) +{ + struct pm_qos_constraints *c; + + c = &qos->min_freq; + plist_head_init(&c->list); + c->target_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->default_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->no_constraint_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->type = PM_QOS_MAX; + c->notifiers = &qos->min_freq_notifiers; + BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); + + c = &qos->max_freq; + plist_head_init(&c->list); + c->target_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->default_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->no_constraint_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->type = PM_QOS_MIN; + c->notifiers = &qos->max_freq_notifiers; + BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); +} + +/** + * freq_qos_read_value - Get frequency QoS constraint for a given list. + * @qos: Constraints to evaluate. + * @type: QoS request type. + */ +s32 freq_qos_read_value(struct freq_constraints *qos, + enum freq_qos_req_type type) +{ + s32 ret; + + switch (type) { + case FREQ_QOS_MIN: + ret = IS_ERR_OR_NULL(qos) ? 
+ FREQ_QOS_MIN_DEFAULT_VALUE : + pm_qos_read_value(&qos->min_freq); + break; + case FREQ_QOS_MAX: + ret = IS_ERR_OR_NULL(qos) ? + FREQ_QOS_MAX_DEFAULT_VALUE : + pm_qos_read_value(&qos->max_freq); + break; + default: + WARN_ON(1); + ret = 0; + } + + return ret; +} + +/** + * freq_qos_apply - Add/modify/remove frequency QoS request. + * @req: Constraint request to apply. + * @action: Action to perform (add/update/remove). + * @value: Value to assign to the QoS request. + * + * This is only meant to be called from inside pm_qos, not drivers. + */ +int freq_qos_apply(struct freq_qos_request *req, + enum pm_qos_req_action action, s32 value) +{ + int ret; + + switch(req->type) { + case FREQ_QOS_MIN: + ret = pm_qos_update_target(&req->qos->min_freq, &req->pnode, + action, value); + break; + case FREQ_QOS_MAX: + ret = pm_qos_update_target(&req->qos->max_freq, &req->pnode, + action, value); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/** + * freq_qos_add_request - Insert new frequency QoS request into a given list. + * @qos: Constraints to update. + * @req: Preallocated request object. + * @type: Request type. + * @value: Request value. + * + * Insert a new entry into the @qos list of requests, recompute the effective + * QoS constraint value for that list and initialize the @req object. The + * caller needs to save that object for later use in updates and removal. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. + */ +int freq_qos_add_request(struct freq_constraints *qos, + struct freq_qos_request *req, + enum freq_qos_req_type type, s32 value) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !req || freq_qos_value_invalid(value)) + return -EINVAL; + + if (WARN(freq_qos_request_active(req), + "%s() called for active request\n", __func__)) + return -EINVAL; + + req->qos = qos; + req->type = type; + ret = freq_qos_apply(req, PM_QOS_ADD_REQ, value); + if (ret < 0) { + req->qos = NULL; + req->type = 0; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_add_request); + +/** + * freq_qos_update_request - Modify existing frequency QoS request. + * @req: Request to modify. + * @new_value: New request value. + * + * Update an existing frequency QoS request along with the effective constraint + * value for the list of requests it belongs to. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. + */ +int freq_qos_update_request(struct freq_qos_request *req, s32 new_value) +{ + if (!req || freq_qos_value_invalid(new_value)) + return -EINVAL; + + if (WARN(!freq_qos_request_active(req), + "%s() called for unknown object\n", __func__)) + return -EINVAL; + + if (req->pnode.prio == new_value) + return 0; + + return freq_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); +} +EXPORT_SYMBOL_GPL(freq_qos_update_request); + +/** + * freq_qos_remove_request - Remove frequency QoS request from its list. + * @req: Request to remove. + * + * Remove the given frequency QoS request from the list of constraints it + * belongs to and recompute the effective constraint value for that list. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. 
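+ *
+ * Put together with freq_qos_add_request() and freq_qos_update_request()
+ * above, a typical sequence might look like the following (hypothetical
+ * caller, error handling omitted; the unit of the values is defined by the
+ * consumer of the constraints, e.g. kHz for cpufreq):
+ *
+ *	struct freq_constraints qos;
+ *	struct freq_qos_request req;
+ *
+ *	freq_constraints_init(&qos);
+ *	freq_qos_add_request(&qos, &req, FREQ_QOS_MIN, 800000);
+ *	freq_qos_update_request(&req, 1200000);
+ *	freq_qos_remove_request(&req);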
+ */ +int freq_qos_remove_request(struct freq_qos_request *req) +{ + int ret; + + if (!req) + return -EINVAL; + + if (WARN(!freq_qos_request_active(req), + "%s() called for unknown object\n", __func__)) + return -EINVAL; + + ret = freq_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); + req->qos = NULL; + req->type = 0; + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_remove_request); + +/** + * freq_qos_add_notifier - Add frequency QoS change notifier. + * @qos: List of requests to add the notifier to. + * @type: Request type. + * @notifier: Notifier block to add. + */ +int freq_qos_add_notifier(struct freq_constraints *qos, + enum freq_qos_req_type type, + struct notifier_block *notifier) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !notifier) + return -EINVAL; + + switch (type) { + case FREQ_QOS_MIN: + ret = blocking_notifier_chain_register(qos->min_freq.notifiers, + notifier); + break; + case FREQ_QOS_MAX: + ret = blocking_notifier_chain_register(qos->max_freq.notifiers, + notifier); + break; + default: + WARN_ON(1); + ret = -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_add_notifier); + +/** + * freq_qos_remove_notifier - Remove frequency QoS change notifier. + * @qos: List of requests to remove the notifier from. + * @type: Request type. + * @notifier: Notifier block to remove. + */ +int freq_qos_remove_notifier(struct freq_constraints *qos, + enum freq_qos_req_type type, + struct notifier_block *notifier) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !notifier) + return -EINVAL; + + switch (type) { + case FREQ_QOS_MIN: + ret = blocking_notifier_chain_unregister(qos->min_freq.notifiers, + notifier); + break; + case FREQ_QOS_MAX: + ret = blocking_notifier_chain_unregister(qos->max_freq.notifiers, + notifier); + break; + default: + WARN_ON(1); + ret = -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_remove_notifier); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c new file mode 100644 index 0000000000..50a15408c3 --- /dev/null +++ b/kernel/power/snapshot.c @@ -0,0 +1,2923 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/kernel/power/snapshot.c + * + * This file provides system snapshot/restore functionality for swsusp. + * + * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> + * Copyright (C) 2006 Rafael J. 
Wysocki <rjw@sisk.pl> + */ + +#define pr_fmt(fmt) "PM: hibernation: " fmt + +#include <linux/version.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/suspend.h> +#include <linux/delay.h> +#include <linux/bitops.h> +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/pm.h> +#include <linux/device.h> +#include <linux/init.h> +#include <linux/memblock.h> +#include <linux/nmi.h> +#include <linux/syscalls.h> +#include <linux/console.h> +#include <linux/highmem.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/compiler.h> +#include <linux/ktime.h> +#include <linux/set_memory.h> + +#include <linux/uaccess.h> +#include <asm/mmu_context.h> +#include <asm/tlbflush.h> +#include <asm/io.h> + +#include "power.h" + +#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_ARCH_HAS_SET_MEMORY) +static bool hibernate_restore_protection; +static bool hibernate_restore_protection_active; + +void enable_restore_image_protection(void) +{ + hibernate_restore_protection = true; +} + +static inline void hibernate_restore_protection_begin(void) +{ + hibernate_restore_protection_active = hibernate_restore_protection; +} + +static inline void hibernate_restore_protection_end(void) +{ + hibernate_restore_protection_active = false; +} + +static inline void hibernate_restore_protect_page(void *page_address) +{ + if (hibernate_restore_protection_active) + set_memory_ro((unsigned long)page_address, 1); +} + +static inline void hibernate_restore_unprotect_page(void *page_address) +{ + if (hibernate_restore_protection_active) + set_memory_rw((unsigned long)page_address, 1); +} +#else +static inline void hibernate_restore_protection_begin(void) {} +static inline void hibernate_restore_protection_end(void) {} +static inline void hibernate_restore_protect_page(void *page_address) {} +static inline void hibernate_restore_unprotect_page(void *page_address) {} +#endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */ + + +/* + * The calls to set_direct_map_*() should not fail because remapping a page + * here means that we only update protection bits in an existing PTE. + * It is still worth to have a warning here if something changes and this + * will no longer be the case. + */ +static inline void hibernate_map_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + int ret = set_direct_map_default_noflush(page); + + if (ret) + pr_warn_once("Failed to remap page\n"); + } else { + debug_pagealloc_map_pages(page, 1); + } +} + +static inline void hibernate_unmap_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + unsigned long addr = (unsigned long)page_address(page); + int ret = set_direct_map_invalid_noflush(page); + + if (ret) + pr_warn_once("Failed to remap page\n"); + + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + } else { + debug_pagealloc_unmap_pages(page, 1); + } +} + +static int swsusp_page_is_free(struct page *); +static void swsusp_set_page_forbidden(struct page *); +static void swsusp_unset_page_forbidden(struct page *); + +/* + * Number of bytes to reserve for memory allocations made by device drivers + * from their ->freeze() and ->freeze_noirq() callbacks so that they don't + * cause image creation to fail (tunable via /sys/power/reserved_size). + */ +unsigned long reserved_size; + +void __init hibernate_reserved_size_init(void) +{ + reserved_size = SPARE_PAGES * PAGE_SIZE; +} + +/* + * Preferred image size in bytes (tunable via /sys/power/image_size). 
+ * When it is set to N, swsusp will do its best to ensure the image + * size will not exceed N bytes, but if that is impossible, it will + * try to create the smallest image possible. + */ +unsigned long image_size; + +void __init hibernate_image_size_init(void) +{ + image_size = ((totalram_pages() * 2) / 5) * PAGE_SIZE; +} + +/* + * List of PBEs needed for restoring the pages that were allocated before + * the suspend and included in the suspend image, but have also been + * allocated by the "resume" kernel, so their contents cannot be written + * directly to their "original" page frames. + */ +struct pbe *restore_pblist; + +/* struct linked_page is used to build chains of pages */ + +#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) + +struct linked_page { + struct linked_page *next; + char data[LINKED_PAGE_DATA_SIZE]; +} __packed; + +/* + * List of "safe" pages (ie. pages that were not used by the image kernel + * before hibernation) that may be used as temporary storage for image kernel + * memory contents. + */ +static struct linked_page *safe_pages_list; + +/* Pointer to an auxiliary buffer (1 page) */ +static void *buffer; + +#define PG_ANY 0 +#define PG_SAFE 1 +#define PG_UNSAFE_CLEAR 1 +#define PG_UNSAFE_KEEP 0 + +static unsigned int allocated_unsafe_pages; + +/** + * get_image_page - Allocate a page for a hibernation image. + * @gfp_mask: GFP mask for the allocation. + * @safe_needed: Get pages that were not used before hibernation (restore only) + * + * During image restoration, for storing the PBE list and the image data, we can + * only use memory pages that do not conflict with the pages used before + * hibernation. The "unsafe" pages have PageNosaveFree set and we count them + * using allocated_unsafe_pages. + * + * Each allocated image page is marked as PageNosave and PageNosaveFree so that + * swsusp_free() can release it. + */ +static void *get_image_page(gfp_t gfp_mask, int safe_needed) +{ + void *res; + + res = (void *)get_zeroed_page(gfp_mask); + if (safe_needed) + while (res && swsusp_page_is_free(virt_to_page(res))) { + /* The page is unsafe, mark it for swsusp_free() */ + swsusp_set_page_forbidden(virt_to_page(res)); + allocated_unsafe_pages++; + res = (void *)get_zeroed_page(gfp_mask); + } + if (res) { + swsusp_set_page_forbidden(virt_to_page(res)); + swsusp_set_page_free(virt_to_page(res)); + } + return res; +} + +static void *__get_safe_page(gfp_t gfp_mask) +{ + if (safe_pages_list) { + void *ret = safe_pages_list; + + safe_pages_list = safe_pages_list->next; + memset(ret, 0, PAGE_SIZE); + return ret; + } + return get_image_page(gfp_mask, PG_SAFE); +} + +unsigned long get_safe_page(gfp_t gfp_mask) +{ + return (unsigned long)__get_safe_page(gfp_mask); +} + +static struct page *alloc_image_page(gfp_t gfp_mask) +{ + struct page *page; + + page = alloc_page(gfp_mask); + if (page) { + swsusp_set_page_forbidden(page); + swsusp_set_page_free(page); + } + return page; +} + +static void recycle_safe_page(void *page_address) +{ + struct linked_page *lp = page_address; + + lp->next = safe_pages_list; + safe_pages_list = lp; +} + +/** + * free_image_page - Free a page allocated for hibernation image. + * @addr: Address of the page to free. + * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page. + * + * The page to free should have been allocated by get_image_page() (page flags + * set by it are affected). 
+ */ +static inline void free_image_page(void *addr, int clear_nosave_free) +{ + struct page *page; + + BUG_ON(!virt_addr_valid(addr)); + + page = virt_to_page(addr); + + swsusp_unset_page_forbidden(page); + if (clear_nosave_free) + swsusp_unset_page_free(page); + + __free_page(page); +} + +static inline void free_list_of_pages(struct linked_page *list, + int clear_page_nosave) +{ + while (list) { + struct linked_page *lp = list->next; + + free_image_page(list, clear_page_nosave); + list = lp; + } +} + +/* + * struct chain_allocator is used for allocating small objects out of + * a linked list of pages called 'the chain'. + * + * The chain grows each time when there is no room for a new object in + * the current page. The allocated objects cannot be freed individually. + * It is only possible to free them all at once, by freeing the entire + * chain. + * + * NOTE: The chain allocator may be inefficient if the allocated objects + * are not much smaller than PAGE_SIZE. + */ +struct chain_allocator { + struct linked_page *chain; /* the chain */ + unsigned int used_space; /* total size of objects allocated out + of the current page */ + gfp_t gfp_mask; /* mask for allocating pages */ + int safe_needed; /* if set, only "safe" pages are allocated */ +}; + +static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask, + int safe_needed) +{ + ca->chain = NULL; + ca->used_space = LINKED_PAGE_DATA_SIZE; + ca->gfp_mask = gfp_mask; + ca->safe_needed = safe_needed; +} + +static void *chain_alloc(struct chain_allocator *ca, unsigned int size) +{ + void *ret; + + if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { + struct linked_page *lp; + + lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) : + get_image_page(ca->gfp_mask, PG_ANY); + if (!lp) + return NULL; + + lp->next = ca->chain; + ca->chain = lp; + ca->used_space = 0; + } + ret = ca->chain->data + ca->used_space; + ca->used_space += size; + return ret; +} + +/* + * Data types related to memory bitmaps. + * + * Memory bitmap is a structure consisting of many linked lists of + * objects. The main list's elements are of type struct zone_bitmap + * and each of them corresponds to one zone. For each zone bitmap + * object there is a list of objects of type struct bm_block that + * represent each blocks of bitmap in which information is stored. + * + * struct memory_bitmap contains a pointer to the main list of zone + * bitmap objects, a struct bm_position used for browsing the bitmap, + * and a pointer to the list of pages used for allocating all of the + * zone bitmap objects and bitmap block objects. + * + * NOTE: It has to be possible to lay out the bitmap in memory + * using only allocations of order 0. Additionally, the bitmap is + * designed to work with arbitrary number of zones (this is over the + * top for now, but let's avoid making unnecessary assumptions ;-). + * + * struct zone_bitmap contains a pointer to a list of bitmap block + * objects and a pointer to the bitmap block object that has been + * most recently used for setting bits. Additionally, it contains the + * PFNs that correspond to the start and end of the represented zone. + * + * struct bm_block contains a pointer to the memory page in which + * information is stored (in the form of a block of bitmap) + * It also contains the pfns that correspond to the start and end of + * the represented memory area. + * + * The memory bitmap is organized as a radix tree to guarantee fast random + * access to the bits. 
There is one radix tree for each zone (as returned + * from create_mem_extents). + * + * One radix tree is represented by one struct mem_zone_bm_rtree. There are + * two linked lists for the nodes of the tree, one for the inner nodes and + * one for the leave nodes. The linked leave nodes are used for fast linear + * access of the memory bitmap. + * + * The struct rtree_node represents one node of the radix tree. + */ + +#define BM_END_OF_MAP (~0UL) + +#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) +#define BM_BLOCK_SHIFT (PAGE_SHIFT + 3) +#define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1) + +/* + * struct rtree_node is a wrapper struct to link the nodes + * of the rtree together for easy linear iteration over + * bits and easy freeing + */ +struct rtree_node { + struct list_head list; + unsigned long *data; +}; + +/* + * struct mem_zone_bm_rtree represents a bitmap used for one + * populated memory zone. + */ +struct mem_zone_bm_rtree { + struct list_head list; /* Link Zones together */ + struct list_head nodes; /* Radix Tree inner nodes */ + struct list_head leaves; /* Radix Tree leaves */ + unsigned long start_pfn; /* Zone start page frame */ + unsigned long end_pfn; /* Zone end page frame + 1 */ + struct rtree_node *rtree; /* Radix Tree Root */ + int levels; /* Number of Radix Tree Levels */ + unsigned int blocks; /* Number of Bitmap Blocks */ +}; + +/* struct bm_position is used for browsing memory bitmaps */ + +struct bm_position { + struct mem_zone_bm_rtree *zone; + struct rtree_node *node; + unsigned long node_pfn; + unsigned long cur_pfn; + int node_bit; +}; + +struct memory_bitmap { + struct list_head zones; + struct linked_page *p_list; /* list of pages used to store zone + bitmap objects and bitmap block + objects */ + struct bm_position cur; /* most recently used bit position */ +}; + +/* Functions that operate on memory bitmaps */ + +#define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long)) +#if BITS_PER_LONG == 32 +#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2) +#else +#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3) +#endif +#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) + +/** + * alloc_rtree_node - Allocate a new node and add it to the radix tree. + * @gfp_mask: GFP mask for the allocation. + * @safe_needed: Get pages not used before hibernation (restore only) + * @ca: Pointer to a linked list of pages ("a chain") to allocate from + * @list: Radix Tree node to add. + * + * This function is used to allocate inner nodes as well as the + * leave nodes of the radix tree. It also adds the node to the + * corresponding linked list passed in by the *list parameter. + */ +static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, + struct chain_allocator *ca, + struct list_head *list) +{ + struct rtree_node *node; + + node = chain_alloc(ca, sizeof(struct rtree_node)); + if (!node) + return NULL; + + node->data = get_image_page(gfp_mask, safe_needed); + if (!node->data) + return NULL; + + list_add_tail(&node->list, list); + + return node; +} + +/** + * add_rtree_block - Add a new leave node to the radix tree. + * + * The leave nodes need to be allocated in order to keep the leaves + * linked list in order. This is guaranteed by the zone->blocks + * counter. 
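+ *
+ * Rough sizing example (assuming 4 KiB pages, i.e. PAGE_SHIFT == 12, and
+ * BITS_PER_LONG == 64): each leaf covers BM_BITS_PER_BLOCK == 32768 page
+ * frames and each inner node has 2^BM_RTREE_LEVEL_SHIFT == 512 slots, so a
+ * 1 GiB zone (262144 page frames) needs 8 leaf blocks served by a single
+ * tree level, and a second level only becomes necessary once a zone spans
+ * more than 512 blocks, i.e. more than 64 GiB worth of page frames.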
+ */ +static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, + int safe_needed, struct chain_allocator *ca) +{ + struct rtree_node *node, *block, **dst; + unsigned int levels_needed, block_nr; + int i; + + block_nr = zone->blocks; + levels_needed = 0; + + /* How many levels do we need for this block nr? */ + while (block_nr) { + levels_needed += 1; + block_nr >>= BM_RTREE_LEVEL_SHIFT; + } + + /* Make sure the rtree has enough levels */ + for (i = zone->levels; i < levels_needed; i++) { + node = alloc_rtree_node(gfp_mask, safe_needed, ca, + &zone->nodes); + if (!node) + return -ENOMEM; + + node->data[0] = (unsigned long)zone->rtree; + zone->rtree = node; + zone->levels += 1; + } + + /* Allocate new block */ + block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves); + if (!block) + return -ENOMEM; + + /* Now walk the rtree to insert the block */ + node = zone->rtree; + dst = &zone->rtree; + block_nr = zone->blocks; + for (i = zone->levels; i > 0; i--) { + int index; + + if (!node) { + node = alloc_rtree_node(gfp_mask, safe_needed, ca, + &zone->nodes); + if (!node) + return -ENOMEM; + *dst = node; + } + + index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); + index &= BM_RTREE_LEVEL_MASK; + dst = (struct rtree_node **)&((*dst)->data[index]); + node = *dst; + } + + zone->blocks += 1; + *dst = block; + + return 0; +} + +static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, + int clear_nosave_free); + +/** + * create_zone_bm_rtree - Create a radix tree for one zone. + * + * Allocated the mem_zone_bm_rtree structure and initializes it. + * This function also allocated and builds the radix tree for the + * zone. + */ +static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask, + int safe_needed, + struct chain_allocator *ca, + unsigned long start, + unsigned long end) +{ + struct mem_zone_bm_rtree *zone; + unsigned int i, nr_blocks; + unsigned long pages; + + pages = end - start; + zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree)); + if (!zone) + return NULL; + + INIT_LIST_HEAD(&zone->nodes); + INIT_LIST_HEAD(&zone->leaves); + zone->start_pfn = start; + zone->end_pfn = end; + nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); + + for (i = 0; i < nr_blocks; i++) { + if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) { + free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR); + return NULL; + } + } + + return zone; +} + +/** + * free_zone_bm_rtree - Free the memory of the radix tree. + * + * Free all node pages of the radix tree. The mem_zone_bm_rtree + * structure itself is not freed here nor are the rtree_node + * structs. + */ +static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, + int clear_nosave_free) +{ + struct rtree_node *node; + + list_for_each_entry(node, &zone->nodes, list) + free_image_page(node->data, clear_nosave_free); + + list_for_each_entry(node, &zone->leaves, list) + free_image_page(node->data, clear_nosave_free); +} + +static void memory_bm_position_reset(struct memory_bitmap *bm) +{ + bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, + list); + bm->cur.node = list_entry(bm->cur.zone->leaves.next, + struct rtree_node, list); + bm->cur.node_pfn = 0; + bm->cur.cur_pfn = BM_END_OF_MAP; + bm->cur.node_bit = 0; +} + +static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); + +struct mem_extent { + struct list_head hook; + unsigned long start; + unsigned long end; +}; + +/** + * free_mem_extents - Free a list of memory extents. + * @list: List of extents to free. 
+ */ +static void free_mem_extents(struct list_head *list) +{ + struct mem_extent *ext, *aux; + + list_for_each_entry_safe(ext, aux, list, hook) { + list_del(&ext->hook); + kfree(ext); + } +} + +/** + * create_mem_extents - Create a list of memory extents. + * @list: List to put the extents into. + * @gfp_mask: Mask to use for memory allocations. + * + * The extents represent contiguous ranges of PFNs. + */ +static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) +{ + struct zone *zone; + + INIT_LIST_HEAD(list); + + for_each_populated_zone(zone) { + unsigned long zone_start, zone_end; + struct mem_extent *ext, *cur, *aux; + + zone_start = zone->zone_start_pfn; + zone_end = zone_end_pfn(zone); + + list_for_each_entry(ext, list, hook) + if (zone_start <= ext->end) + break; + + if (&ext->hook == list || zone_end < ext->start) { + /* New extent is necessary */ + struct mem_extent *new_ext; + + new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask); + if (!new_ext) { + free_mem_extents(list); + return -ENOMEM; + } + new_ext->start = zone_start; + new_ext->end = zone_end; + list_add_tail(&new_ext->hook, &ext->hook); + continue; + } + + /* Merge this zone's range of PFNs with the existing one */ + if (zone_start < ext->start) + ext->start = zone_start; + if (zone_end > ext->end) + ext->end = zone_end; + + /* More merging may be possible */ + cur = ext; + list_for_each_entry_safe_continue(cur, aux, list, hook) { + if (zone_end < cur->start) + break; + if (zone_end < cur->end) + ext->end = cur->end; + list_del(&cur->hook); + kfree(cur); + } + } + + return 0; +} + +/** + * memory_bm_create - Allocate memory for a memory bitmap. + */ +static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, + int safe_needed) +{ + struct chain_allocator ca; + struct list_head mem_extents; + struct mem_extent *ext; + int error; + + chain_init(&ca, gfp_mask, safe_needed); + INIT_LIST_HEAD(&bm->zones); + + error = create_mem_extents(&mem_extents, gfp_mask); + if (error) + return error; + + list_for_each_entry(ext, &mem_extents, hook) { + struct mem_zone_bm_rtree *zone; + + zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca, + ext->start, ext->end); + if (!zone) { + error = -ENOMEM; + goto Error; + } + list_add_tail(&zone->list, &bm->zones); + } + + bm->p_list = ca.chain; + memory_bm_position_reset(bm); + Exit: + free_mem_extents(&mem_extents); + return error; + + Error: + bm->p_list = ca.chain; + memory_bm_free(bm, PG_UNSAFE_CLEAR); + goto Exit; +} + +/** + * memory_bm_free - Free memory occupied by the memory bitmap. + * @bm: Memory bitmap. + */ +static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) +{ + struct mem_zone_bm_rtree *zone; + + list_for_each_entry(zone, &bm->zones, list) + free_zone_bm_rtree(zone, clear_nosave_free); + + free_list_of_pages(bm->p_list, clear_nosave_free); + + INIT_LIST_HEAD(&bm->zones); +} + +/** + * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap. + * + * Find the bit in memory bitmap @bm that corresponds to the given PFN. + * The cur.zone, cur.block and cur.node_pfn members of @bm are updated. + * + * Walk the radix tree to find the page containing the bit that represents @pfn + * and return the position of the bit in @addr and @bit_nr. 
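+ *
+ * For illustration, assuming 4 KiB pages (BM_BLOCK_SHIFT == 15, so one leaf
+ * covers 32768 page frames): a page with pfn - zone->start_pfn == 100000 is
+ * found in leaf block 100000 >> BM_BLOCK_SHIFT == 3, at bit
+ * 100000 & BM_BLOCK_MASK == 1696 of that leaf's page of bitmap data.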
+ */ +static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, + void **addr, unsigned int *bit_nr) +{ + struct mem_zone_bm_rtree *curr, *zone; + struct rtree_node *node; + int i, block_nr; + + zone = bm->cur.zone; + + if (pfn >= zone->start_pfn && pfn < zone->end_pfn) + goto zone_found; + + zone = NULL; + + /* Find the right zone */ + list_for_each_entry(curr, &bm->zones, list) { + if (pfn >= curr->start_pfn && pfn < curr->end_pfn) { + zone = curr; + break; + } + } + + if (!zone) + return -EFAULT; + +zone_found: + /* + * We have found the zone. Now walk the radix tree to find the leaf node + * for our PFN. + */ + + /* + * If the zone we wish to scan is the current zone and the + * pfn falls into the current node then we do not need to walk + * the tree. + */ + node = bm->cur.node; + if (zone == bm->cur.zone && + ((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) + goto node_found; + + node = zone->rtree; + block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT; + + for (i = zone->levels; i > 0; i--) { + int index; + + index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); + index &= BM_RTREE_LEVEL_MASK; + BUG_ON(node->data[index] == 0); + node = (struct rtree_node *)node->data[index]; + } + +node_found: + /* Update last position */ + bm->cur.zone = zone; + bm->cur.node = node; + bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; + bm->cur.cur_pfn = pfn; + + /* Set return values */ + *addr = node->data; + *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK; + + return 0; +} + +static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + int error; + + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); + set_bit(bit, addr); +} + +static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + int error; + + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + if (!error) + set_bit(bit, addr); + + return error; +} + +static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + int error; + + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); + clear_bit(bit, addr); +} + +static void memory_bm_clear_current(struct memory_bitmap *bm) +{ + int bit; + + bit = max(bm->cur.node_bit - 1, 0); + clear_bit(bit, bm->cur.node->data); +} + +static unsigned long memory_bm_get_current(struct memory_bitmap *bm) +{ + return bm->cur.cur_pfn; +} + +static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + int error; + + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); + return test_bit(bit, addr); +} + +static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + return !memory_bm_find_bit(bm, pfn, &addr, &bit); +} + +/* + * rtree_next_node - Jump to the next leaf node. + * + * Set the position to the beginning of the next node in the + * memory bitmap. This is either the next node in the current + * zone's radix tree or the first node in the radix tree of the + * next zone. + * + * Return true if there is a next node, false otherwise. 
+ */ +static bool rtree_next_node(struct memory_bitmap *bm) +{ + if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) { + bm->cur.node = list_entry(bm->cur.node->list.next, + struct rtree_node, list); + bm->cur.node_pfn += BM_BITS_PER_BLOCK; + bm->cur.node_bit = 0; + touch_softlockup_watchdog(); + return true; + } + + /* No more nodes, goto next zone */ + if (!list_is_last(&bm->cur.zone->list, &bm->zones)) { + bm->cur.zone = list_entry(bm->cur.zone->list.next, + struct mem_zone_bm_rtree, list); + bm->cur.node = list_entry(bm->cur.zone->leaves.next, + struct rtree_node, list); + bm->cur.node_pfn = 0; + bm->cur.node_bit = 0; + return true; + } + + /* No more zones */ + return false; +} + +/** + * memory_bm_next_pfn - Find the next set bit in a memory bitmap. + * @bm: Memory bitmap. + * + * Starting from the last returned position this function searches for the next + * set bit in @bm and returns the PFN represented by it. If no more bits are + * set, BM_END_OF_MAP is returned. + * + * It is required to run memory_bm_position_reset() before the first call to + * this function for the given memory bitmap. + */ +static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) +{ + unsigned long bits, pfn, pages; + int bit; + + do { + pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn; + bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK); + bit = find_next_bit(bm->cur.node->data, bits, + bm->cur.node_bit); + if (bit < bits) { + pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; + bm->cur.node_bit = bit + 1; + bm->cur.cur_pfn = pfn; + return pfn; + } + } while (rtree_next_node(bm)); + + bm->cur.cur_pfn = BM_END_OF_MAP; + return BM_END_OF_MAP; +} + +/* + * This structure represents a range of page frames the contents of which + * should not be saved during hibernation. + */ +struct nosave_region { + struct list_head list; + unsigned long start_pfn; + unsigned long end_pfn; +}; + +static LIST_HEAD(nosave_regions); + +static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone) +{ + struct rtree_node *node; + + list_for_each_entry(node, &zone->nodes, list) + recycle_safe_page(node->data); + + list_for_each_entry(node, &zone->leaves, list) + recycle_safe_page(node->data); +} + +static void memory_bm_recycle(struct memory_bitmap *bm) +{ + struct mem_zone_bm_rtree *zone; + struct linked_page *p_list; + + list_for_each_entry(zone, &bm->zones, list) + recycle_zone_bm_rtree(zone); + + p_list = bm->p_list; + while (p_list) { + struct linked_page *lp = p_list; + + p_list = lp->next; + recycle_safe_page(lp); + } +} + +/** + * register_nosave_region - Register a region of unsaveable memory. + * + * Register a range of page frames the contents of which should not be saved + * during hibernation (to be used in the early initialization code). 
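+ *
+ * A hypothetical early-init caller excluding a firmware-owned physical range
+ * [start, end) from the image might do:
+ *
+ *	register_nosave_region(PFN_DOWN(start), PFN_UP(end));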
+ */ +void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pfn) +{ + struct nosave_region *region; + + if (start_pfn >= end_pfn) + return; + + if (!list_empty(&nosave_regions)) { + /* Try to extend the previous region (they should be sorted) */ + region = list_entry(nosave_regions.prev, + struct nosave_region, list); + if (region->end_pfn == start_pfn) { + region->end_pfn = end_pfn; + goto Report; + } + } + /* This allocation cannot fail */ + region = memblock_alloc(sizeof(struct nosave_region), + SMP_CACHE_BYTES); + if (!region) + panic("%s: Failed to allocate %zu bytes\n", __func__, + sizeof(struct nosave_region)); + region->start_pfn = start_pfn; + region->end_pfn = end_pfn; + list_add_tail(®ion->list, &nosave_regions); + Report: + pr_info("Registered nosave memory: [mem %#010llx-%#010llx]\n", + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); +} + +/* + * Set bits in this map correspond to the page frames the contents of which + * should not be saved during the suspend. + */ +static struct memory_bitmap *forbidden_pages_map; + +/* Set bits in this map correspond to free page frames. */ +static struct memory_bitmap *free_pages_map; + +/* + * Each page frame allocated for creating the image is marked by setting the + * corresponding bits in forbidden_pages_map and free_pages_map simultaneously + */ + +void swsusp_set_page_free(struct page *page) +{ + if (free_pages_map) + memory_bm_set_bit(free_pages_map, page_to_pfn(page)); +} + +static int swsusp_page_is_free(struct page *page) +{ + return free_pages_map ? + memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0; +} + +void swsusp_unset_page_free(struct page *page) +{ + if (free_pages_map) + memory_bm_clear_bit(free_pages_map, page_to_pfn(page)); +} + +static void swsusp_set_page_forbidden(struct page *page) +{ + if (forbidden_pages_map) + memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page)); +} + +int swsusp_page_is_forbidden(struct page *page) +{ + return forbidden_pages_map ? + memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0; +} + +static void swsusp_unset_page_forbidden(struct page *page) +{ + if (forbidden_pages_map) + memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page)); +} + +/** + * mark_nosave_pages - Mark pages that should not be saved. + * @bm: Memory bitmap. + * + * Set the bits in @bm that correspond to the page frames the contents of which + * should not be saved. + */ +static void mark_nosave_pages(struct memory_bitmap *bm) +{ + struct nosave_region *region; + + if (list_empty(&nosave_regions)) + return; + + list_for_each_entry(region, &nosave_regions, list) { + unsigned long pfn; + + pr_debug("Marking nosave pages: [mem %#010llx-%#010llx]\n", + (unsigned long long) region->start_pfn << PAGE_SHIFT, + ((unsigned long long) region->end_pfn << PAGE_SHIFT) + - 1); + + for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) + if (pfn_valid(pfn)) { + /* + * It is safe to ignore the result of + * mem_bm_set_bit_check() here, since we won't + * touch the PFNs for which the error is + * returned anyway. + */ + mem_bm_set_bit_check(bm, pfn); + } + } +} + +/** + * create_basic_memory_bitmaps - Create bitmaps to hold basic page information. + * + * Create bitmaps needed for marking page frames that should not be saved and + * free page frames. The forbidden_pages_map and free_pages_map pointers are + * only modified if everything goes well, because we don't want the bits to be + * touched before both bitmaps are set up. 
+ */ +int create_basic_memory_bitmaps(void) +{ + struct memory_bitmap *bm1, *bm2; + int error = 0; + + if (forbidden_pages_map && free_pages_map) + return 0; + else + BUG_ON(forbidden_pages_map || free_pages_map); + + bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); + if (!bm1) + return -ENOMEM; + + error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY); + if (error) + goto Free_first_object; + + bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); + if (!bm2) + goto Free_first_bitmap; + + error = memory_bm_create(bm2, GFP_KERNEL, PG_ANY); + if (error) + goto Free_second_object; + + forbidden_pages_map = bm1; + free_pages_map = bm2; + mark_nosave_pages(forbidden_pages_map); + + pr_debug("Basic memory bitmaps created\n"); + + return 0; + + Free_second_object: + kfree(bm2); + Free_first_bitmap: + memory_bm_free(bm1, PG_UNSAFE_CLEAR); + Free_first_object: + kfree(bm1); + return -ENOMEM; +} + +/** + * free_basic_memory_bitmaps - Free memory bitmaps holding basic information. + * + * Free memory bitmaps allocated by create_basic_memory_bitmaps(). The + * auxiliary pointers are necessary so that the bitmaps themselves are not + * referred to while they are being freed. + */ +void free_basic_memory_bitmaps(void) +{ + struct memory_bitmap *bm1, *bm2; + + if (WARN_ON(!(forbidden_pages_map && free_pages_map))) + return; + + bm1 = forbidden_pages_map; + bm2 = free_pages_map; + forbidden_pages_map = NULL; + free_pages_map = NULL; + memory_bm_free(bm1, PG_UNSAFE_CLEAR); + kfree(bm1); + memory_bm_free(bm2, PG_UNSAFE_CLEAR); + kfree(bm2); + + pr_debug("Basic memory bitmaps freed\n"); +} + +static void clear_or_poison_free_page(struct page *page) +{ + if (page_poisoning_enabled_static()) + __kernel_poison_pages(page, 1); + else if (want_init_on_free()) + clear_highpage(page); +} + +void clear_or_poison_free_pages(void) +{ + struct memory_bitmap *bm = free_pages_map; + unsigned long pfn; + + if (WARN_ON(!(free_pages_map))) + return; + + if (page_poisoning_enabled() || want_init_on_free()) { + memory_bm_position_reset(bm); + pfn = memory_bm_next_pfn(bm); + while (pfn != BM_END_OF_MAP) { + if (pfn_valid(pfn)) + clear_or_poison_free_page(pfn_to_page(pfn)); + + pfn = memory_bm_next_pfn(bm); + } + memory_bm_position_reset(bm); + pr_info("free pages cleared after restore\n"); + } +} + +/** + * snapshot_additional_pages - Estimate the number of extra pages needed. + * @zone: Memory zone to carry out the computation for. + * + * Estimate the number of additional pages needed for setting up a hibernation + * image data structures for @zone (usually, the returned value is greater than + * the exact number). + */ +unsigned int snapshot_additional_pages(struct zone *zone) +{ + unsigned int rtree, nodes; + + rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); + rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node), + LINKED_PAGE_DATA_SIZE); + while (nodes > 1) { + nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL); + rtree += nodes; + } + + return 2 * rtree; +} + +/* + * Touch the watchdog for every WD_PAGE_COUNT pages. 
+ */ +#define WD_PAGE_COUNT (128*1024) + +static void mark_free_pages(struct zone *zone) +{ + unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; + unsigned long flags; + unsigned int order, t; + struct page *page; + + if (zone_is_empty(zone)) + return; + + spin_lock_irqsave(&zone->lock, flags); + + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + + if (page_zone(page) != zone) + continue; + + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); + } + + for_each_migratetype_order(order, t) { + list_for_each_entry(page, + &zone->free_area[order].free_list[t], buddy_list) { + unsigned long i; + + pfn = page_to_pfn(page); + for (i = 0; i < (1UL << order); i++) { + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + swsusp_set_page_free(pfn_to_page(pfn + i)); + } + } + } + spin_unlock_irqrestore(&zone->lock, flags); +} + +#ifdef CONFIG_HIGHMEM +/** + * count_free_highmem_pages - Compute the total number of free highmem pages. + * + * The returned number is system-wide. + */ +static unsigned int count_free_highmem_pages(void) +{ + struct zone *zone; + unsigned int cnt = 0; + + for_each_populated_zone(zone) + if (is_highmem(zone)) + cnt += zone_page_state(zone, NR_FREE_PAGES); + + return cnt; +} + +/** + * saveable_highmem_page - Check if a highmem page is saveable. + * + * Determine whether a highmem page should be included in a hibernation image. + * + * We should save the page if it isn't Nosave or NosaveFree, or Reserved, + * and it isn't part of a free chunk of pages. + */ +static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) +{ + struct page *page; + + if (!pfn_valid(pfn)) + return NULL; + + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) + return NULL; + + BUG_ON(!PageHighMem(page)); + + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) + return NULL; + + if (PageReserved(page) || PageOffline(page)) + return NULL; + + if (page_is_guard(page)) + return NULL; + + return page; +} + +/** + * count_highmem_pages - Compute the total number of saveable highmem pages. + */ +static unsigned int count_highmem_pages(void) +{ + struct zone *zone; + unsigned int n = 0; + + for_each_populated_zone(zone) { + unsigned long pfn, max_zone_pfn; + + if (!is_highmem(zone)) + continue; + + mark_free_pages(zone); + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (saveable_highmem_page(zone, pfn)) + n++; + } + return n; +} +#else +static inline void *saveable_highmem_page(struct zone *z, unsigned long p) +{ + return NULL; +} +#endif /* CONFIG_HIGHMEM */ + +/** + * saveable_page - Check if the given page is saveable. + * + * Determine whether a non-highmem page should be included in a hibernation + * image. + * + * We should save the page if it isn't Nosave, and is not in the range + * of pages statically defined as 'unsaveable', and it isn't part of + * a free chunk of pages. 
+ */ +static struct page *saveable_page(struct zone *zone, unsigned long pfn) +{ + struct page *page; + + if (!pfn_valid(pfn)) + return NULL; + + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) + return NULL; + + BUG_ON(PageHighMem(page)); + + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) + return NULL; + + if (PageOffline(page)) + return NULL; + + if (PageReserved(page) + && (!kernel_page_present(page) || pfn_is_nosave(pfn))) + return NULL; + + if (page_is_guard(page)) + return NULL; + + return page; +} + +/** + * count_data_pages - Compute the total number of saveable non-highmem pages. + */ +static unsigned int count_data_pages(void) +{ + struct zone *zone; + unsigned long pfn, max_zone_pfn; + unsigned int n = 0; + + for_each_populated_zone(zone) { + if (is_highmem(zone)) + continue; + + mark_free_pages(zone); + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (saveable_page(zone, pfn)) + n++; + } + return n; +} + +/* + * This is needed, because copy_page and memcpy are not usable for copying + * task structs. Returns true if the page was filled with only zeros, + * otherwise false. + */ +static inline bool do_copy_page(long *dst, long *src) +{ + long z = 0; + int n; + + for (n = PAGE_SIZE / sizeof(long); n; n--) { + z |= *src; + *dst++ = *src++; + } + return !z; +} + +/** + * safe_copy_page - Copy a page in a safe way. + * + * Check if the page we are going to copy is marked as present in the kernel + * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or + * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present() + * always returns 'true'. Returns true if the page was entirely composed of + * zeros, otherwise it will return false. + */ +static bool safe_copy_page(void *dst, struct page *s_page) +{ + bool zeros_only; + + if (kernel_page_present(s_page)) { + zeros_only = do_copy_page(dst, page_address(s_page)); + } else { + hibernate_map_page(s_page); + zeros_only = do_copy_page(dst, page_address(s_page)); + hibernate_unmap_page(s_page); + } + return zeros_only; +} + +#ifdef CONFIG_HIGHMEM +static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn) +{ + return is_highmem(zone) ? + saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); +} + +static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +{ + struct page *s_page, *d_page; + void *src, *dst; + bool zeros_only; + + s_page = pfn_to_page(src_pfn); + d_page = pfn_to_page(dst_pfn); + if (PageHighMem(s_page)) { + src = kmap_atomic(s_page); + dst = kmap_atomic(d_page); + zeros_only = do_copy_page(dst, src); + kunmap_atomic(dst); + kunmap_atomic(src); + } else { + if (PageHighMem(d_page)) { + /* + * The page pointed to by src may contain some kernel + * data modified by kmap_atomic() + */ + zeros_only = safe_copy_page(buffer, s_page); + dst = kmap_atomic(d_page); + copy_page(dst, buffer); + kunmap_atomic(dst); + } else { + zeros_only = safe_copy_page(page_address(d_page), s_page); + } + } + return zeros_only; +} +#else +#define page_is_saveable(zone, pfn) saveable_page(zone, pfn) + +static inline int copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +{ + return safe_copy_page(page_address(pfn_to_page(dst_pfn)), + pfn_to_page(src_pfn)); +} +#endif /* CONFIG_HIGHMEM */ + +/* + * Copy data pages will copy all pages into pages pulled from the copy_bm. + * If a page was entirely filled with zeros it will be marked in the zero_bm. 
+ * + * Returns the number of pages copied. + */ +static unsigned long copy_data_pages(struct memory_bitmap *copy_bm, + struct memory_bitmap *orig_bm, + struct memory_bitmap *zero_bm) +{ + unsigned long copied_pages = 0; + struct zone *zone; + unsigned long pfn, copy_pfn; + + for_each_populated_zone(zone) { + unsigned long max_zone_pfn; + + mark_free_pages(zone); + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (page_is_saveable(zone, pfn)) + memory_bm_set_bit(orig_bm, pfn); + } + memory_bm_position_reset(orig_bm); + memory_bm_position_reset(copy_bm); + copy_pfn = memory_bm_next_pfn(copy_bm); + for(;;) { + pfn = memory_bm_next_pfn(orig_bm); + if (unlikely(pfn == BM_END_OF_MAP)) + break; + if (copy_data_page(copy_pfn, pfn)) { + memory_bm_set_bit(zero_bm, pfn); + /* Use this copy_pfn for a page that is not full of zeros */ + continue; + } + copied_pages++; + copy_pfn = memory_bm_next_pfn(copy_bm); + } + return copied_pages; +} + +/* Total number of image pages */ +static unsigned int nr_copy_pages; +/* Number of pages needed for saving the original pfns of the image pages */ +static unsigned int nr_meta_pages; +/* Number of zero pages */ +static unsigned int nr_zero_pages; + +/* + * Numbers of normal and highmem page frames allocated for hibernation image + * before suspending devices. + */ +static unsigned int alloc_normal, alloc_highmem; +/* + * Memory bitmap used for marking saveable pages (during hibernation) or + * hibernation image pages (during restore) + */ +static struct memory_bitmap orig_bm; +/* + * Memory bitmap used during hibernation for marking allocated page frames that + * will contain copies of saveable pages. During restore it is initially used + * for marking hibernation image pages, but then the set bits from it are + * duplicated in @orig_bm and it is released. On highmem systems it is next + * used for marking "safe" highmem pages, but it has to be reinitialized for + * this purpose. + */ +static struct memory_bitmap copy_bm; + +/* Memory bitmap which tracks which saveable pages were zero filled. */ +static struct memory_bitmap zero_bm; + +/** + * swsusp_free - Free pages allocated for hibernation image. + * + * Image pages are allocated before snapshot creation, so they need to be + * released after resume. + */ +void swsusp_free(void) +{ + unsigned long fb_pfn, fr_pfn; + + if (!forbidden_pages_map || !free_pages_map) + goto out; + + memory_bm_position_reset(forbidden_pages_map); + memory_bm_position_reset(free_pages_map); + +loop: + fr_pfn = memory_bm_next_pfn(free_pages_map); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + + /* + * Find the next bit set in both bitmaps. This is guaranteed to + * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP. + */ + do { + if (fb_pfn < fr_pfn) + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + if (fr_pfn < fb_pfn) + fr_pfn = memory_bm_next_pfn(free_pages_map); + } while (fb_pfn != fr_pfn); + + if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { + struct page *page = pfn_to_page(fr_pfn); + + memory_bm_clear_current(forbidden_pages_map); + memory_bm_clear_current(free_pages_map); + hibernate_restore_unprotect_page(page_address(page)); + __free_page(page); + goto loop; + } + +out: + nr_copy_pages = 0; + nr_meta_pages = 0; + nr_zero_pages = 0; + restore_pblist = NULL; + buffer = NULL; + alloc_normal = 0; + alloc_highmem = 0; + hibernate_restore_protection_end(); +} + +/* Helper functions used for the shrinking of memory. 
*/ + +#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) + +/** + * preallocate_image_pages - Allocate a number of pages for hibernation image. + * @nr_pages: Number of page frames to allocate. + * @mask: GFP flags to use for the allocation. + * + * Return value: Number of page frames actually allocated + */ +static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) +{ + unsigned long nr_alloc = 0; + + while (nr_pages > 0) { + struct page *page; + + page = alloc_image_page(mask); + if (!page) + break; + memory_bm_set_bit(©_bm, page_to_pfn(page)); + if (PageHighMem(page)) + alloc_highmem++; + else + alloc_normal++; + nr_pages--; + nr_alloc++; + } + + return nr_alloc; +} + +static unsigned long preallocate_image_memory(unsigned long nr_pages, + unsigned long avail_normal) +{ + unsigned long alloc; + + if (avail_normal <= alloc_normal) + return 0; + + alloc = avail_normal - alloc_normal; + if (nr_pages < alloc) + alloc = nr_pages; + + return preallocate_image_pages(alloc, GFP_IMAGE); +} + +#ifdef CONFIG_HIGHMEM +static unsigned long preallocate_image_highmem(unsigned long nr_pages) +{ + return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM); +} + +/** + * __fraction - Compute (an approximation of) x * (multiplier / base). + */ +static unsigned long __fraction(u64 x, u64 multiplier, u64 base) +{ + return div64_u64(x * multiplier, base); +} + +static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, + unsigned long highmem, + unsigned long total) +{ + unsigned long alloc = __fraction(nr_pages, highmem, total); + + return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM); +} +#else /* CONFIG_HIGHMEM */ +static inline unsigned long preallocate_image_highmem(unsigned long nr_pages) +{ + return 0; +} + +static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, + unsigned long highmem, + unsigned long total) +{ + return 0; +} +#endif /* CONFIG_HIGHMEM */ + +/** + * free_unnecessary_pages - Release preallocated pages not needed for the image. + */ +static unsigned long free_unnecessary_pages(void) +{ + unsigned long save, to_free_normal, to_free_highmem, free; + + save = count_data_pages(); + if (alloc_normal >= save) { + to_free_normal = alloc_normal - save; + save = 0; + } else { + to_free_normal = 0; + save -= alloc_normal; + } + save += count_highmem_pages(); + if (alloc_highmem >= save) { + to_free_highmem = alloc_highmem - save; + } else { + to_free_highmem = 0; + save -= alloc_highmem; + if (to_free_normal > save) + to_free_normal -= save; + else + to_free_normal = 0; + } + free = to_free_normal + to_free_highmem; + + memory_bm_position_reset(©_bm); + + while (to_free_normal > 0 || to_free_highmem > 0) { + unsigned long pfn = memory_bm_next_pfn(©_bm); + struct page *page = pfn_to_page(pfn); + + if (PageHighMem(page)) { + if (!to_free_highmem) + continue; + to_free_highmem--; + alloc_highmem--; + } else { + if (!to_free_normal) + continue; + to_free_normal--; + alloc_normal--; + } + memory_bm_clear_bit(©_bm, pfn); + swsusp_unset_page_forbidden(page); + swsusp_unset_page_free(page); + __free_page(page); + } + + return free; +} + +/** + * minimum_image_size - Estimate the minimum acceptable size of an image. + * @saveable: Number of saveable pages in the system. + * + * We want to avoid attempting to free too much memory too hard, so estimate the + * minimum acceptable size of a hibernation image to use as the lower limit for + * preallocating memory. 
+ * + * We assume that the minimum image size should be proportional to + * + * [number of saveable pages] - [number of pages that can be freed in theory] + * + * where the second term is the sum of (1) reclaimable slab pages, (2) active + * and (3) inactive anonymous pages, (4) active and (5) inactive file pages. + */ +static unsigned long minimum_image_size(unsigned long saveable) +{ + unsigned long size; + + size = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + + global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_ANON) + + global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE); + + return saveable <= size ? 0 : saveable - size; +} + +/** + * hibernate_preallocate_memory - Preallocate memory for hibernation image. + * + * To create a hibernation image it is necessary to make a copy of every page + * frame in use. We also need a number of page frames to be free during + * hibernation for allocations made while saving the image and for device + * drivers, in case they need to allocate memory from their hibernation + * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough + * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through + * /sys/power/reserved_size, respectively). To make this happen, we compute the + * total number of available page frames and allocate at least + * + * ([page frames total] - PAGES_FOR_IO - [metadata pages]) / 2 + * - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) + * + * of them, which corresponds to the maximum size of a hibernation image. + * + * If image_size is set below the number following from the above formula, + * the preallocation of memory is continued until the total number of saveable + * pages in the system is below the requested image size or the minimum + * acceptable image size returned by minimum_image_size(), whichever is greater. + */ +int hibernate_preallocate_memory(void) +{ + struct zone *zone; + unsigned long saveable, size, max_size, count, highmem, pages = 0; + unsigned long alloc, save_highmem, pages_highmem, avail_normal; + ktime_t start, stop; + int error; + + pr_info("Preallocating image memory\n"); + start = ktime_get(); + + error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); + if (error) { + pr_err("Cannot allocate original bitmap\n"); + goto err_out; + } + + error = memory_bm_create(©_bm, GFP_IMAGE, PG_ANY); + if (error) { + pr_err("Cannot allocate copy bitmap\n"); + goto err_out; + } + + error = memory_bm_create(&zero_bm, GFP_IMAGE, PG_ANY); + if (error) { + pr_err("Cannot allocate zero bitmap\n"); + goto err_out; + } + + alloc_normal = 0; + alloc_highmem = 0; + nr_zero_pages = 0; + + /* Count the number of saveable data pages. */ + save_highmem = count_highmem_pages(); + saveable = count_data_pages(); + + /* + * Compute the total number of page frames we can use (count) and the + * number of pages needed for image metadata (size). + */ + count = saveable; + saveable += save_highmem; + highmem = save_highmem; + size = 0; + for_each_populated_zone(zone) { + size += snapshot_additional_pages(zone); + if (is_highmem(zone)) + highmem += zone_page_state(zone, NR_FREE_PAGES); + else + count += zone_page_state(zone, NR_FREE_PAGES); + } + avail_normal = count; + count += highmem; + count -= totalreserve_pages; + + /* Compute the maximum number of saveable pages to leave in memory. 
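+	 * Ballpark example of the formula below, assuming 4 KiB pages: on an
+	 * 8 GiB machine (roughly two million usable page frames), with the
+	 * metadata, PAGES_FOR_IO and reserved_size terms comparatively small,
+	 * max_size works out to a bit under a million pages, i.e. the image
+	 * is capped at just under half of RAM.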
*/ + max_size = (count - (size + PAGES_FOR_IO)) / 2 + - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); + /* Compute the desired number of image pages specified by image_size. */ + size = DIV_ROUND_UP(image_size, PAGE_SIZE); + if (size > max_size) + size = max_size; + /* + * If the desired number of image pages is at least as large as the + * current number of saveable pages in memory, allocate page frames for + * the image and we're done. + */ + if (size >= saveable) { + pages = preallocate_image_highmem(save_highmem); + pages += preallocate_image_memory(saveable - pages, avail_normal); + goto out; + } + + /* Estimate the minimum size of the image. */ + pages = minimum_image_size(saveable); + /* + * To avoid excessive pressure on the normal zone, leave room in it to + * accommodate an image of the minimum size (unless it's already too + * small, in which case don't preallocate pages from it at all). + */ + if (avail_normal > pages) + avail_normal -= pages; + else + avail_normal = 0; + if (size < pages) + size = min_t(unsigned long, pages, max_size); + + /* + * Let the memory management subsystem know that we're going to need a + * large number of page frames to allocate and make it free some memory. + * NOTE: If this is not done, performance will be hurt badly in some + * test cases. + */ + shrink_all_memory(saveable - size); + + /* + * The number of saveable pages in memory was too high, so apply some + * pressure to decrease it. First, make room for the largest possible + * image and fail if that doesn't work. Next, try to decrease the size + * of the image as much as indicated by 'size' using allocations from + * highmem and non-highmem zones separately. + */ + pages_highmem = preallocate_image_highmem(highmem / 2); + alloc = count - max_size; + if (alloc > pages_highmem) + alloc -= pages_highmem; + else + alloc = 0; + pages = preallocate_image_memory(alloc, avail_normal); + if (pages < alloc) { + /* We have exhausted non-highmem pages, try highmem. */ + alloc -= pages; + pages += pages_highmem; + pages_highmem = preallocate_image_highmem(alloc); + if (pages_highmem < alloc) { + pr_err("Image allocation is %lu pages short\n", + alloc - pages_highmem); + goto err_out; + } + pages += pages_highmem; + /* + * size is the desired number of saveable pages to leave in + * memory, so try to preallocate (all memory - size) pages. + */ + alloc = (count - pages) - size; + pages += preallocate_image_highmem(alloc); + } else { + /* + * There are approximately max_size saveable pages at this point + * and we want to reduce this number down to size. + */ + alloc = max_size - size; + size = preallocate_highmem_fraction(alloc, highmem, count); + pages_highmem += size; + alloc -= size; + size = preallocate_image_memory(alloc, avail_normal); + pages_highmem += preallocate_image_highmem(alloc - size); + pages += pages_highmem + size; + } + + /* + * We only need as many page frames for the image as there are saveable + * pages in memory, but we have allocated more. Release the excessive + * ones now. + */ + pages -= free_unnecessary_pages(); + + out: + stop = ktime_get(); + pr_info("Allocated %lu pages for snapshot\n", pages); + swsusp_show_speed(start, stop, pages, "Allocated"); + + return 0; + + err_out: + swsusp_free(); + return -ENOMEM; +} + +#ifdef CONFIG_HIGHMEM +/** + * count_pages_for_highmem - Count non-highmem pages needed for copying highmem. + * + * Compute the number of non-highmem pages that will be necessary for creating + * copies of highmem pages. 
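+ * For instance (illustrative numbers): with nr_highmem = 100 highmem pages
+ * to copy and 60 free or already preallocated highmem page frames, 40
+ * normal page frames are reported as needed for the copies.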
+ */ +static unsigned int count_pages_for_highmem(unsigned int nr_highmem) +{ + unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; + + if (free_highmem >= nr_highmem) + nr_highmem = 0; + else + nr_highmem -= free_highmem; + + return nr_highmem; +} +#else +static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; } +#endif /* CONFIG_HIGHMEM */ + +/** + * enough_free_mem - Check if there is enough free memory for the image. + */ +static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) +{ + struct zone *zone; + unsigned int free = alloc_normal; + + for_each_populated_zone(zone) + if (!is_highmem(zone)) + free += zone_page_state(zone, NR_FREE_PAGES); + + nr_pages += count_pages_for_highmem(nr_highmem); + pr_debug("Normal pages needed: %u + %u, available pages: %u\n", + nr_pages, PAGES_FOR_IO, free); + + return free > nr_pages + PAGES_FOR_IO; +} + +#ifdef CONFIG_HIGHMEM +/** + * get_highmem_buffer - Allocate a buffer for highmem pages. + * + * If there are some highmem pages in the hibernation image, we may need a + * buffer to copy them and/or load their data. + */ +static inline int get_highmem_buffer(int safe_needed) +{ + buffer = get_image_page(GFP_ATOMIC, safe_needed); + return buffer ? 0 : -ENOMEM; +} + +/** + * alloc_highmem_pages - Allocate some highmem pages for the image. + * + * Try to allocate as many pages as needed, but if the number of free highmem + * pages is less than that, allocate them all. + */ +static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, + unsigned int nr_highmem) +{ + unsigned int to_alloc = count_free_highmem_pages(); + + if (to_alloc > nr_highmem) + to_alloc = nr_highmem; + + nr_highmem -= to_alloc; + while (to_alloc-- > 0) { + struct page *page; + + page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM); + memory_bm_set_bit(bm, page_to_pfn(page)); + } + return nr_highmem; +} +#else +static inline int get_highmem_buffer(int safe_needed) { return 0; } + +static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, + unsigned int n) { return 0; } +#endif /* CONFIG_HIGHMEM */ + +/** + * swsusp_alloc - Allocate memory for hibernation image. + * + * We first try to allocate as many highmem pages as there are + * saveable highmem pages in the system. If that fails, we allocate + * non-highmem pages for the copies of the remaining highmem ones. + * + * In this approach it is likely that the copies of highmem pages will + * also be located in the high memory, because of the way in which + * copy_data_pages() works. 
+ */ +static int swsusp_alloc(struct memory_bitmap *copy_bm, + unsigned int nr_pages, unsigned int nr_highmem) +{ + if (nr_highmem > 0) { + if (get_highmem_buffer(PG_ANY)) + goto err_out; + if (nr_highmem > alloc_highmem) { + nr_highmem -= alloc_highmem; + nr_pages += alloc_highmem_pages(copy_bm, nr_highmem); + } + } + if (nr_pages > alloc_normal) { + nr_pages -= alloc_normal; + while (nr_pages-- > 0) { + struct page *page; + + page = alloc_image_page(GFP_ATOMIC); + if (!page) + goto err_out; + memory_bm_set_bit(copy_bm, page_to_pfn(page)); + } + } + + return 0; + + err_out: + swsusp_free(); + return -ENOMEM; +} + +asmlinkage __visible int swsusp_save(void) +{ + unsigned int nr_pages, nr_highmem; + + pr_info("Creating image:\n"); + + drain_local_pages(NULL); + nr_pages = count_data_pages(); + nr_highmem = count_highmem_pages(); + pr_info("Need to copy %u pages\n", nr_pages + nr_highmem); + + if (!enough_free_mem(nr_pages, nr_highmem)) { + pr_err("Not enough free memory\n"); + return -ENOMEM; + } + + if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) { + pr_err("Memory allocation failed\n"); + return -ENOMEM; + } + + /* + * During allocating of suspend pagedir, new cold pages may appear. + * Kill them. + */ + drain_local_pages(NULL); + nr_copy_pages = copy_data_pages(©_bm, &orig_bm, &zero_bm); + + /* + * End of critical section. From now on, we can write to memory, + * but we should not touch disk. This specially means we must _not_ + * touch swap space! Except we must write out our image of course. + */ + nr_pages += nr_highmem; + /* We don't actually copy the zero pages */ + nr_zero_pages = nr_pages - nr_copy_pages; + nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); + + pr_info("Image created (%d pages copied, %d zero pages)\n", nr_copy_pages, nr_zero_pages); + + return 0; +} + +#ifndef CONFIG_ARCH_HIBERNATION_HEADER +static int init_header_complete(struct swsusp_info *info) +{ + memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname)); + info->version_code = LINUX_VERSION_CODE; + return 0; +} + +static const char *check_image_kernel(struct swsusp_info *info) +{ + if (info->version_code != LINUX_VERSION_CODE) + return "kernel version"; + if (strcmp(info->uts.sysname,init_utsname()->sysname)) + return "system type"; + if (strcmp(info->uts.release,init_utsname()->release)) + return "kernel release"; + if (strcmp(info->uts.version,init_utsname()->version)) + return "version"; + if (strcmp(info->uts.machine,init_utsname()->machine)) + return "machine"; + return NULL; +} +#endif /* CONFIG_ARCH_HIBERNATION_HEADER */ + +unsigned long snapshot_get_image_size(void) +{ + return nr_copy_pages + nr_meta_pages + 1; +} + +static int init_header(struct swsusp_info *info) +{ + memset(info, 0, sizeof(struct swsusp_info)); + info->num_physpages = get_num_physpages(); + info->image_pages = nr_copy_pages; + info->pages = snapshot_get_image_size(); + info->size = info->pages; + info->size <<= PAGE_SHIFT; + return init_header_complete(info); +} + +#define ENCODED_PFN_ZERO_FLAG ((unsigned long)1 << (BITS_PER_LONG - 1)) +#define ENCODED_PFN_MASK (~ENCODED_PFN_ZERO_FLAG) + +/** + * pack_pfns - Prepare PFNs for saving. + * @bm: Memory bitmap. + * @buf: Memory buffer to store the PFNs in. + * @zero_bm: Memory bitmap containing PFNs of zero pages. + * + * PFNs corresponding to set bits in @bm are stored in the area of memory + * pointed to by @buf (1 page at a time). 
Pages which were filled with only + * zeros will have the highest bit set in the packed format to distinguish + * them from PFNs which will be contained in the image file. + */ +static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm, + struct memory_bitmap *zero_bm) +{ + int j; + + for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { + buf[j] = memory_bm_next_pfn(bm); + if (unlikely(buf[j] == BM_END_OF_MAP)) + break; + if (memory_bm_test_bit(zero_bm, buf[j])) + buf[j] |= ENCODED_PFN_ZERO_FLAG; + } +} + +/** + * snapshot_read_next - Get the address to read the next image page from. + * @handle: Snapshot handle to be used for the reading. + * + * On the first call, @handle should point to a zeroed snapshot_handle + * structure. The structure gets populated then and a pointer to it should be + * passed to this function every next time. + * + * On success, the function returns a positive number. Then, the caller + * is allowed to read up to the returned number of bytes from the memory + * location computed by the data_of() macro. + * + * The function returns 0 to indicate the end of the data stream condition, + * and negative numbers are returned on errors. If that happens, the structure + * pointed to by @handle is not updated and should not be used any more. + */ +int snapshot_read_next(struct snapshot_handle *handle) +{ + if (handle->cur > nr_meta_pages + nr_copy_pages) + return 0; + + if (!buffer) { + /* This makes the buffer be freed by swsusp_free() */ + buffer = get_image_page(GFP_ATOMIC, PG_ANY); + if (!buffer) + return -ENOMEM; + } + if (!handle->cur) { + int error; + + error = init_header((struct swsusp_info *)buffer); + if (error) + return error; + handle->buffer = buffer; + memory_bm_position_reset(&orig_bm); + memory_bm_position_reset(©_bm); + } else if (handle->cur <= nr_meta_pages) { + clear_page(buffer); + pack_pfns(buffer, &orig_bm, &zero_bm); + } else { + struct page *page; + + page = pfn_to_page(memory_bm_next_pfn(©_bm)); + if (PageHighMem(page)) { + /* + * Highmem pages are copied to the buffer, + * because we can't return with a kmapped + * highmem page (we may not be called again). + */ + void *kaddr; + + kaddr = kmap_atomic(page); + copy_page(buffer, kaddr); + kunmap_atomic(kaddr); + handle->buffer = buffer; + } else { + handle->buffer = page_address(page); + } + } + handle->cur++; + return PAGE_SIZE; +} + +static void duplicate_memory_bitmap(struct memory_bitmap *dst, + struct memory_bitmap *src) +{ + unsigned long pfn; + + memory_bm_position_reset(src); + pfn = memory_bm_next_pfn(src); + while (pfn != BM_END_OF_MAP) { + memory_bm_set_bit(dst, pfn); + pfn = memory_bm_next_pfn(src); + } +} + +/** + * mark_unsafe_pages - Mark pages that were used before hibernation. + * + * Mark the pages that cannot be used for storing the image during restoration, + * because they conflict with the pages that had been used before hibernation. 
+ */ +static void mark_unsafe_pages(struct memory_bitmap *bm) +{ + unsigned long pfn; + + /* Clear the "free"/"unsafe" bit for all PFNs */ + memory_bm_position_reset(free_pages_map); + pfn = memory_bm_next_pfn(free_pages_map); + while (pfn != BM_END_OF_MAP) { + memory_bm_clear_current(free_pages_map); + pfn = memory_bm_next_pfn(free_pages_map); + } + + /* Mark pages that correspond to the "original" PFNs as "unsafe" */ + duplicate_memory_bitmap(free_pages_map, bm); + + allocated_unsafe_pages = 0; +} + +static int check_header(struct swsusp_info *info) +{ + const char *reason; + + reason = check_image_kernel(info); + if (!reason && info->num_physpages != get_num_physpages()) + reason = "memory size"; + if (reason) { + pr_err("Image mismatch: %s\n", reason); + return -EPERM; + } + return 0; +} + +/** + * load_header - Check the image header and copy the data from it. + */ +static int load_header(struct swsusp_info *info) +{ + int error; + + restore_pblist = NULL; + error = check_header(info); + if (!error) { + nr_copy_pages = info->image_pages; + nr_meta_pages = info->pages - info->image_pages - 1; + } + return error; +} + +/** + * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap. + * @bm: Memory bitmap. + * @buf: Area of memory containing the PFNs. + * @zero_bm: Memory bitmap with the zero PFNs marked. + * + * For each element of the array pointed to by @buf (1 page at a time), set the + * corresponding bit in @bm. If the page was originally populated with only + * zeros then a corresponding bit will also be set in @zero_bm. + */ +static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm, + struct memory_bitmap *zero_bm) +{ + unsigned long decoded_pfn; + bool zero; + int j; + + for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { + if (unlikely(buf[j] == BM_END_OF_MAP)) + break; + + zero = !!(buf[j] & ENCODED_PFN_ZERO_FLAG); + decoded_pfn = buf[j] & ENCODED_PFN_MASK; + if (pfn_valid(decoded_pfn) && memory_bm_pfn_present(bm, decoded_pfn)) { + memory_bm_set_bit(bm, decoded_pfn); + if (zero) { + memory_bm_set_bit(zero_bm, decoded_pfn); + nr_zero_pages++; + } + } else { + if (!pfn_valid(decoded_pfn)) + pr_err(FW_BUG "Memory map mismatch at 0x%llx after hibernation\n", + (unsigned long long)PFN_PHYS(decoded_pfn)); + return -EFAULT; + } + } + + return 0; +} + +#ifdef CONFIG_HIGHMEM +/* + * struct highmem_pbe is used for creating the list of highmem pages that + * should be restored atomically during the resume from disk, because the page + * frames they have occupied before the suspend are in use. + */ +struct highmem_pbe { + struct page *copy_page; /* data is here now */ + struct page *orig_page; /* data was here before the suspend */ + struct highmem_pbe *next; +}; + +/* + * List of highmem PBEs needed for restoring the highmem pages that were + * allocated before the suspend and included in the suspend image, but have + * also been allocated by the "resume" kernel, so their contents cannot be + * written directly to their "original" page frames. + */ +static struct highmem_pbe *highmem_pblist; + +/** + * count_highmem_image_pages - Compute the number of highmem pages in the image. + * @bm: Memory bitmap. + * + * The bits in @bm that correspond to image pages are assumed to be set. 
+ */ +static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) +{ + unsigned long pfn; + unsigned int cnt = 0; + + memory_bm_position_reset(bm); + pfn = memory_bm_next_pfn(bm); + while (pfn != BM_END_OF_MAP) { + if (PageHighMem(pfn_to_page(pfn))) + cnt++; + + pfn = memory_bm_next_pfn(bm); + } + return cnt; +} + +static unsigned int safe_highmem_pages; + +static struct memory_bitmap *safe_highmem_bm; + +/** + * prepare_highmem_image - Allocate memory for loading highmem data from image. + * @bm: Pointer to an uninitialized memory bitmap structure. + * @nr_highmem_p: Pointer to the number of highmem image pages. + * + * Try to allocate as many highmem pages as there are highmem image pages + * (@nr_highmem_p points to the variable containing the number of highmem image + * pages). The pages that are "safe" (ie. will not be overwritten when the + * hibernation image is restored entirely) have the corresponding bits set in + * @bm (it must be uninitialized). + * + * NOTE: This function should not be called if there are no highmem image pages. + */ +static int prepare_highmem_image(struct memory_bitmap *bm, + unsigned int *nr_highmem_p) +{ + unsigned int to_alloc; + + if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE)) + return -ENOMEM; + + if (get_highmem_buffer(PG_SAFE)) + return -ENOMEM; + + to_alloc = count_free_highmem_pages(); + if (to_alloc > *nr_highmem_p) + to_alloc = *nr_highmem_p; + else + *nr_highmem_p = to_alloc; + + safe_highmem_pages = 0; + while (to_alloc-- > 0) { + struct page *page; + + page = alloc_page(__GFP_HIGHMEM); + if (!swsusp_page_is_free(page)) { + /* The page is "safe", set its bit the bitmap */ + memory_bm_set_bit(bm, page_to_pfn(page)); + safe_highmem_pages++; + } + /* Mark the page as allocated */ + swsusp_set_page_forbidden(page); + swsusp_set_page_free(page); + } + memory_bm_position_reset(bm); + safe_highmem_bm = bm; + return 0; +} + +static struct page *last_highmem_page; + +/** + * get_highmem_page_buffer - Prepare a buffer to store a highmem image page. + * + * For a given highmem image page get a buffer that suspend_write_next() should + * return to its caller to write to. + * + * If the page is to be saved to its "original" page frame or a copy of + * the page is to be made in the highmem, @buffer is returned. Otherwise, + * the copy of the page is to be made in normal memory, so the address of + * the copy is returned. + * + * If @buffer is returned, the caller of suspend_write_next() will write + * the page's contents to @buffer, so they will have to be copied to the + * right location on the next call to suspend_write_next() and it is done + * with the help of copy_last_highmem_page(). For this purpose, if + * @buffer is returned, @last_highmem_page is set to the page to which + * the data will have to be copied from @buffer. + */ +static void *get_highmem_page_buffer(struct page *page, + struct chain_allocator *ca) +{ + struct highmem_pbe *pbe; + void *kaddr; + + if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { + /* + * We have allocated the "original" page frame and we can + * use it directly to store the loaded page. + */ + last_highmem_page = page; + return buffer; + } + /* + * The "original" page frame has not been allocated and we have to + * use a "safe" page frame to store the loaded page. 
+ */ + pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); + if (!pbe) { + swsusp_free(); + return ERR_PTR(-ENOMEM); + } + pbe->orig_page = page; + if (safe_highmem_pages > 0) { + struct page *tmp; + + /* Copy of the page will be stored in high memory */ + kaddr = buffer; + tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm)); + safe_highmem_pages--; + last_highmem_page = tmp; + pbe->copy_page = tmp; + } else { + /* Copy of the page will be stored in normal memory */ + kaddr = __get_safe_page(ca->gfp_mask); + if (!kaddr) + return ERR_PTR(-ENOMEM); + pbe->copy_page = virt_to_page(kaddr); + } + pbe->next = highmem_pblist; + highmem_pblist = pbe; + return kaddr; +} + +/** + * copy_last_highmem_page - Copy most the most recent highmem image page. + * + * Copy the contents of a highmem image from @buffer, where the caller of + * snapshot_write_next() has stored them, to the right location represented by + * @last_highmem_page . + */ +static void copy_last_highmem_page(void) +{ + if (last_highmem_page) { + void *dst; + + dst = kmap_atomic(last_highmem_page); + copy_page(dst, buffer); + kunmap_atomic(dst); + last_highmem_page = NULL; + } +} + +static inline int last_highmem_page_copied(void) +{ + return !last_highmem_page; +} + +static inline void free_highmem_data(void) +{ + if (safe_highmem_bm) + memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR); + + if (buffer) + free_image_page(buffer, PG_UNSAFE_CLEAR); +} +#else +static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } + +static inline int prepare_highmem_image(struct memory_bitmap *bm, + unsigned int *nr_highmem_p) { return 0; } + +static inline void *get_highmem_page_buffer(struct page *page, + struct chain_allocator *ca) +{ + return ERR_PTR(-EINVAL); +} + +static inline void copy_last_highmem_page(void) {} +static inline int last_highmem_page_copied(void) { return 1; } +static inline void free_highmem_data(void) {} +#endif /* CONFIG_HIGHMEM */ + +#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) + +/** + * prepare_image - Make room for loading hibernation image. + * @new_bm: Uninitialized memory bitmap structure. + * @bm: Memory bitmap with unsafe pages marked. + * @zero_bm: Memory bitmap containing the zero pages. + * + * Use @bm to mark the pages that will be overwritten in the process of + * restoring the system memory state from the suspend image ("unsafe" pages) + * and allocate memory for the image. + * + * The idea is to allocate a new memory bitmap first and then allocate + * as many pages as needed for image data, but without specifying what those + * pages will be used for just yet. Instead, we mark them all as allocated and + * create a lists of "safe" pages to be used later. On systems with high + * memory a list of "safe" highmem pages is created too. + * + * Because it was not known which pages were unsafe when @zero_bm was created, + * make a copy of it and recreate it within safe pages. 
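+ * Rough illustration (made-up numbers, assuming 4 KiB pages and a 24-byte
+ * struct pbe, so PBES_PER_LINKED_PAGE is about 170): for an image with
+ * 100000 copied pages, 20000 zero pages, no highmem pages and 5000 unsafe
+ * pages already allocated, the first loop below reserves
+ * DIV_ROUND_UP(115000, 170) = 677 linked pages for later chain_alloc() use.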
+ */ +static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm, + struct memory_bitmap *zero_bm) +{ + unsigned int nr_pages, nr_highmem; + struct memory_bitmap tmp; + struct linked_page *lp; + int error; + + /* If there is no highmem, the buffer will not be necessary */ + free_image_page(buffer, PG_UNSAFE_CLEAR); + buffer = NULL; + + nr_highmem = count_highmem_image_pages(bm); + mark_unsafe_pages(bm); + + error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE); + if (error) + goto Free; + + duplicate_memory_bitmap(new_bm, bm); + memory_bm_free(bm, PG_UNSAFE_KEEP); + + /* Make a copy of zero_bm so it can be created in safe pages */ + error = memory_bm_create(&tmp, GFP_ATOMIC, PG_SAFE); + if (error) + goto Free; + + duplicate_memory_bitmap(&tmp, zero_bm); + memory_bm_free(zero_bm, PG_UNSAFE_KEEP); + + /* Recreate zero_bm in safe pages */ + error = memory_bm_create(zero_bm, GFP_ATOMIC, PG_SAFE); + if (error) + goto Free; + + duplicate_memory_bitmap(zero_bm, &tmp); + memory_bm_free(&tmp, PG_UNSAFE_CLEAR); + /* At this point zero_bm is in safe pages and it can be used for restoring. */ + + if (nr_highmem > 0) { + error = prepare_highmem_image(bm, &nr_highmem); + if (error) + goto Free; + } + /* + * Reserve some safe pages for potential later use. + * + * NOTE: This way we make sure there will be enough safe pages for the + * chain_alloc() in get_buffer(). It is a bit wasteful, but + * nr_copy_pages cannot be greater than 50% of the memory anyway. + * + * nr_copy_pages cannot be less than allocated_unsafe_pages too. + */ + nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages; + nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); + while (nr_pages > 0) { + lp = get_image_page(GFP_ATOMIC, PG_SAFE); + if (!lp) { + error = -ENOMEM; + goto Free; + } + lp->next = safe_pages_list; + safe_pages_list = lp; + nr_pages--; + } + /* Preallocate memory for the image */ + nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages; + while (nr_pages > 0) { + lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); + if (!lp) { + error = -ENOMEM; + goto Free; + } + if (!swsusp_page_is_free(virt_to_page(lp))) { + /* The page is "safe", add it to the list */ + lp->next = safe_pages_list; + safe_pages_list = lp; + } + /* Mark the page as allocated */ + swsusp_set_page_forbidden(virt_to_page(lp)); + swsusp_set_page_free(virt_to_page(lp)); + nr_pages--; + } + return 0; + + Free: + swsusp_free(); + return error; +} + +/** + * get_buffer - Get the address to store the next image data page. + * + * Get the address that snapshot_write_next() should return to its caller to + * write to. + */ +static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) +{ + struct pbe *pbe; + struct page *page; + unsigned long pfn = memory_bm_next_pfn(bm); + + if (pfn == BM_END_OF_MAP) + return ERR_PTR(-EFAULT); + + page = pfn_to_page(pfn); + if (PageHighMem(page)) + return get_highmem_page_buffer(page, ca); + + if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) + /* + * We have allocated the "original" page frame and we can + * use it directly to store the loaded page. + */ + return page_address(page); + + /* + * The "original" page frame has not been allocated and we have to + * use a "safe" page frame to store the loaded page. 
+ */ + pbe = chain_alloc(ca, sizeof(struct pbe)); + if (!pbe) { + swsusp_free(); + return ERR_PTR(-ENOMEM); + } + pbe->orig_address = page_address(page); + pbe->address = __get_safe_page(ca->gfp_mask); + if (!pbe->address) + return ERR_PTR(-ENOMEM); + pbe->next = restore_pblist; + restore_pblist = pbe; + return pbe->address; +} + +/** + * snapshot_write_next - Get the address to store the next image page. + * @handle: Snapshot handle structure to guide the writing. + * + * On the first call, @handle should point to a zeroed snapshot_handle + * structure. The structure gets populated then and a pointer to it should be + * passed to this function every next time. + * + * On success, the function returns a positive number. Then, the caller + * is allowed to write up to the returned number of bytes to the memory + * location computed by the data_of() macro. + * + * The function returns 0 to indicate the "end of file" condition. Negative + * numbers are returned on errors, in which cases the structure pointed to by + * @handle is not updated and should not be used any more. + */ +int snapshot_write_next(struct snapshot_handle *handle) +{ + static struct chain_allocator ca; + int error = 0; + +next: + /* Check if we have already loaded the entire image */ + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) + return 0; + + if (!handle->cur) { + if (!buffer) + /* This makes the buffer be freed by swsusp_free() */ + buffer = get_image_page(GFP_ATOMIC, PG_ANY); + + if (!buffer) + return -ENOMEM; + + handle->buffer = buffer; + } else if (handle->cur == 1) { + error = load_header(buffer); + if (error) + return error; + + safe_pages_list = NULL; + + error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); + if (error) + return error; + + error = memory_bm_create(&zero_bm, GFP_ATOMIC, PG_ANY); + if (error) + return error; + + nr_zero_pages = 0; + + hibernate_restore_protection_begin(); + } else if (handle->cur <= nr_meta_pages + 1) { + error = unpack_orig_pfns(buffer, ©_bm, &zero_bm); + if (error) + return error; + + if (handle->cur == nr_meta_pages + 1) { + error = prepare_image(&orig_bm, ©_bm, &zero_bm); + if (error) + return error; + + chain_init(&ca, GFP_ATOMIC, PG_SAFE); + memory_bm_position_reset(&orig_bm); + memory_bm_position_reset(&zero_bm); + restore_pblist = NULL; + handle->buffer = get_buffer(&orig_bm, &ca); + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); + } + } else { + copy_last_highmem_page(); + hibernate_restore_protect_page(handle->buffer); + handle->buffer = get_buffer(&orig_bm, &ca); + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); + } + handle->sync_read = (handle->buffer == buffer); + handle->cur++; + + /* Zero pages were not included in the image, memset it and move on. */ + if (handle->cur > nr_meta_pages + 1 && + memory_bm_test_bit(&zero_bm, memory_bm_get_current(&orig_bm))) { + memset(handle->buffer, 0, PAGE_SIZE); + goto next; + } + + return PAGE_SIZE; +} + +/** + * snapshot_write_finalize - Complete the loading of a hibernation image. + * + * Must be called after the last call to snapshot_write_next() in case the last + * page in the image happens to be a highmem page and its contents should be + * stored in highmem. Additionally, it recycles bitmap memory that's not + * necessary any more. 
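+ * An image loader typically drives the pair roughly like this sketch
+ * (error handling omitted, not the actual caller):
+ *
+ *	while ((n = snapshot_write_next(&handle)) > 0)
+ *		... copy up to n bytes of image data to data_of(handle) ...
+ *	snapshot_write_finalize(&handle);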
+ */ +void snapshot_write_finalize(struct snapshot_handle *handle) +{ + copy_last_highmem_page(); + hibernate_restore_protect_page(handle->buffer); + /* Do that only if we have loaded the image entirely */ + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) { + memory_bm_recycle(&orig_bm); + free_highmem_data(); + } +} + +int snapshot_image_loaded(struct snapshot_handle *handle) +{ + return !(!nr_copy_pages || !last_highmem_page_copied() || + handle->cur <= nr_meta_pages + nr_copy_pages + nr_zero_pages); +} + +#ifdef CONFIG_HIGHMEM +/* Assumes that @buf is ready and points to a "safe" page */ +static inline void swap_two_pages_data(struct page *p1, struct page *p2, + void *buf) +{ + void *kaddr1, *kaddr2; + + kaddr1 = kmap_atomic(p1); + kaddr2 = kmap_atomic(p2); + copy_page(buf, kaddr1); + copy_page(kaddr1, kaddr2); + copy_page(kaddr2, buf); + kunmap_atomic(kaddr2); + kunmap_atomic(kaddr1); +} + +/** + * restore_highmem - Put highmem image pages into their original locations. + * + * For each highmem page that was in use before hibernation and is included in + * the image, and also has been allocated by the "restore" kernel, swap its + * current contents with the previous (ie. "before hibernation") ones. + * + * If the restore eventually fails, we can call this function once again and + * restore the highmem state as seen by the restore kernel. + */ +int restore_highmem(void) +{ + struct highmem_pbe *pbe = highmem_pblist; + void *buf; + + if (!pbe) + return 0; + + buf = get_image_page(GFP_ATOMIC, PG_SAFE); + if (!buf) + return -ENOMEM; + + while (pbe) { + swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf); + pbe = pbe->next; + } + free_image_page(buf, PG_UNSAFE_CLEAR); + return 0; +} +#endif /* CONFIG_HIGHMEM */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c new file mode 100644 index 0000000000..fa3bf161d1 --- /dev/null +++ b/kernel/power/suspend.c @@ -0,0 +1,629 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kernel/power/suspend.c - Suspend to RAM and standby functionality. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. 
+ */ + +#define pr_fmt(fmt) "PM: " fmt + +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/console.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/gfp.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/suspend.h> +#include <linux/syscore_ops.h> +#include <linux/swait.h> +#include <linux/ftrace.h> +#include <trace/events/power.h> +#include <linux/compiler.h> +#include <linux/moduleparam.h> + +#include "power.h" + +const char * const pm_labels[] = { + [PM_SUSPEND_TO_IDLE] = "freeze", + [PM_SUSPEND_STANDBY] = "standby", + [PM_SUSPEND_MEM] = "mem", +}; +const char *pm_states[PM_SUSPEND_MAX]; +static const char * const mem_sleep_labels[] = { + [PM_SUSPEND_TO_IDLE] = "s2idle", + [PM_SUSPEND_STANDBY] = "shallow", + [PM_SUSPEND_MEM] = "deep", +}; +const char *mem_sleep_states[PM_SUSPEND_MAX]; + +suspend_state_t mem_sleep_current = PM_SUSPEND_TO_IDLE; +suspend_state_t mem_sleep_default = PM_SUSPEND_MAX; +suspend_state_t pm_suspend_target_state; +EXPORT_SYMBOL_GPL(pm_suspend_target_state); + +unsigned int pm_suspend_global_flags; +EXPORT_SYMBOL_GPL(pm_suspend_global_flags); + +static const struct platform_suspend_ops *suspend_ops; +static const struct platform_s2idle_ops *s2idle_ops; +static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head); + +enum s2idle_states __read_mostly s2idle_state; +static DEFINE_RAW_SPINLOCK(s2idle_lock); + +/** + * pm_suspend_default_s2idle - Check if suspend-to-idle is the default suspend. + * + * Return 'true' if suspend-to-idle has been selected as the default system + * suspend method. + */ +bool pm_suspend_default_s2idle(void) +{ + return mem_sleep_current == PM_SUSPEND_TO_IDLE; +} +EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle); + +void s2idle_set_ops(const struct platform_s2idle_ops *ops) +{ + unsigned int sleep_flags; + + sleep_flags = lock_system_sleep(); + s2idle_ops = ops; + unlock_system_sleep(sleep_flags); +} + +static void s2idle_begin(void) +{ + s2idle_state = S2IDLE_STATE_NONE; +} + +static void s2idle_enter(void) +{ + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); + + raw_spin_lock_irq(&s2idle_lock); + if (pm_wakeup_pending()) + goto out; + + s2idle_state = S2IDLE_STATE_ENTER; + raw_spin_unlock_irq(&s2idle_lock); + + cpus_read_lock(); + + /* Push all the CPUs into the idle loop. */ + wake_up_all_idle_cpus(); + /* Make the current CPU wait so it can enter the idle loop too. */ + swait_event_exclusive(s2idle_wait_head, + s2idle_state == S2IDLE_STATE_WAKE); + + cpus_read_unlock(); + + raw_spin_lock_irq(&s2idle_lock); + + out: + s2idle_state = S2IDLE_STATE_NONE; + raw_spin_unlock_irq(&s2idle_lock); + + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false); +} + +static void s2idle_loop(void) +{ + pm_pr_dbg("suspend-to-idle\n"); + + /* + * Suspend-to-idle equals: + * frozen processes + suspended devices + idle processors. + * Thus s2idle_enter() should be called right after all devices have + * been suspended. + * + * Wakeups during the noirq suspend of devices may be spurious, so try + * to avoid them upfront. 
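+ * The loop below keeps re-entering suspend-to-idle until either the
+ * platform's ->wake() callback or pm_wakeup_pending() reports a genuine
+ * wakeup event.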
+ */ + for (;;) { + if (s2idle_ops && s2idle_ops->wake) { + if (s2idle_ops->wake()) + break; + } else if (pm_wakeup_pending()) { + break; + } + + if (s2idle_ops && s2idle_ops->check) + s2idle_ops->check(); + + s2idle_enter(); + } + + pm_pr_dbg("resume from suspend-to-idle\n"); +} + +void s2idle_wake(void) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&s2idle_lock, flags); + if (s2idle_state > S2IDLE_STATE_NONE) { + s2idle_state = S2IDLE_STATE_WAKE; + swake_up_one(&s2idle_wait_head); + } + raw_spin_unlock_irqrestore(&s2idle_lock, flags); +} +EXPORT_SYMBOL_GPL(s2idle_wake); + +static bool valid_state(suspend_state_t state) +{ + /* + * The PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states require low-level + * support and need to be valid to the low-level implementation. + * + * No ->valid() or ->enter() callback implies that none are valid. + */ + return suspend_ops && suspend_ops->valid && suspend_ops->valid(state) && + suspend_ops->enter; +} + +void __init pm_states_init(void) +{ + /* "mem" and "freeze" are always present in /sys/power/state. */ + pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM]; + pm_states[PM_SUSPEND_TO_IDLE] = pm_labels[PM_SUSPEND_TO_IDLE]; + /* + * Suspend-to-idle should be supported even without any suspend_ops, + * initialize mem_sleep_states[] accordingly here. + */ + mem_sleep_states[PM_SUSPEND_TO_IDLE] = mem_sleep_labels[PM_SUSPEND_TO_IDLE]; +} + +static int __init mem_sleep_default_setup(char *str) +{ + suspend_state_t state; + + for (state = PM_SUSPEND_TO_IDLE; state <= PM_SUSPEND_MEM; state++) + if (mem_sleep_labels[state] && + !strcmp(str, mem_sleep_labels[state])) { + mem_sleep_default = state; + break; + } + + return 1; +} +__setup("mem_sleep_default=", mem_sleep_default_setup); + +/** + * suspend_set_ops - Set the global suspend method table. + * @ops: Suspend operations to use. + */ +void suspend_set_ops(const struct platform_suspend_ops *ops) +{ + unsigned int sleep_flags; + + sleep_flags = lock_system_sleep(); + + suspend_ops = ops; + + if (valid_state(PM_SUSPEND_STANDBY)) { + mem_sleep_states[PM_SUSPEND_STANDBY] = mem_sleep_labels[PM_SUSPEND_STANDBY]; + pm_states[PM_SUSPEND_STANDBY] = pm_labels[PM_SUSPEND_STANDBY]; + if (mem_sleep_default == PM_SUSPEND_STANDBY) + mem_sleep_current = PM_SUSPEND_STANDBY; + } + if (valid_state(PM_SUSPEND_MEM)) { + mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM]; + if (mem_sleep_default >= PM_SUSPEND_MEM) + mem_sleep_current = PM_SUSPEND_MEM; + } + + unlock_system_sleep(sleep_flags); +} +EXPORT_SYMBOL_GPL(suspend_set_ops); + +/** + * suspend_valid_only_mem - Generic memory-only valid callback. + * @state: Target system sleep state. + * + * Platform drivers that implement mem suspend only and only need to check for + * that in their .valid() callback can use this instead of rolling their own + * .valid() callback. + */ +int suspend_valid_only_mem(suspend_state_t state) +{ + return state == PM_SUSPEND_MEM; +} +EXPORT_SYMBOL_GPL(suspend_valid_only_mem); + +static bool sleep_state_supported(suspend_state_t state) +{ + return state == PM_SUSPEND_TO_IDLE || + (valid_state(state) && !cxl_mem_active()); +} + +static int platform_suspend_prepare(suspend_state_t state) +{ + return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare ? + suspend_ops->prepare() : 0; +} + +static int platform_suspend_prepare_late(suspend_state_t state) +{ + return state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->prepare ? 
+ s2idle_ops->prepare() : 0; +} + +static int platform_suspend_prepare_noirq(suspend_state_t state) +{ + if (state == PM_SUSPEND_TO_IDLE) + return s2idle_ops && s2idle_ops->prepare_late ? + s2idle_ops->prepare_late() : 0; + + return suspend_ops->prepare_late ? suspend_ops->prepare_late() : 0; +} + +static void platform_resume_noirq(suspend_state_t state) +{ + if (state == PM_SUSPEND_TO_IDLE) { + if (s2idle_ops && s2idle_ops->restore_early) + s2idle_ops->restore_early(); + } else if (suspend_ops->wake) { + suspend_ops->wake(); + } +} + +static void platform_resume_early(suspend_state_t state) +{ + if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->restore) + s2idle_ops->restore(); +} + +static void platform_resume_finish(suspend_state_t state) +{ + if (state != PM_SUSPEND_TO_IDLE && suspend_ops->finish) + suspend_ops->finish(); +} + +static int platform_suspend_begin(suspend_state_t state) +{ + if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->begin) + return s2idle_ops->begin(); + else if (suspend_ops && suspend_ops->begin) + return suspend_ops->begin(state); + else + return 0; +} + +static void platform_resume_end(suspend_state_t state) +{ + if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->end) + s2idle_ops->end(); + else if (suspend_ops && suspend_ops->end) + suspend_ops->end(); +} + +static void platform_recover(suspend_state_t state) +{ + if (state != PM_SUSPEND_TO_IDLE && suspend_ops->recover) + suspend_ops->recover(); +} + +static bool platform_suspend_again(suspend_state_t state) +{ + return state != PM_SUSPEND_TO_IDLE && suspend_ops->suspend_again ? + suspend_ops->suspend_again() : false; +} + +#ifdef CONFIG_PM_DEBUG +static unsigned int pm_test_delay = 5; +module_param(pm_test_delay, uint, 0644); +MODULE_PARM_DESC(pm_test_delay, + "Number of seconds to wait before resuming from suspend test"); +#endif + +static int suspend_test(int level) +{ +#ifdef CONFIG_PM_DEBUG + if (pm_test_level == level) { + pr_info("suspend debug: Waiting for %d second(s).\n", + pm_test_delay); + mdelay(pm_test_delay * 1000); + return 1; + } +#endif /* !CONFIG_PM_DEBUG */ + return 0; +} + +/** + * suspend_prepare - Prepare for entering system sleep state. + * @state: Target system sleep state. + * + * Common code run for every system sleep state that can be entered (except for + * hibernation). Run suspend notifiers, allocate the "suspend" console and + * freeze processes. + */ +static int suspend_prepare(suspend_state_t state) +{ + int error; + + if (!sleep_state_supported(state)) + return -EPERM; + + pm_prepare_console(); + + error = pm_notifier_call_chain_robust(PM_SUSPEND_PREPARE, PM_POST_SUSPEND); + if (error) + goto Restore; + + trace_suspend_resume(TPS("freeze_processes"), 0, true); + error = suspend_freeze_processes(); + trace_suspend_resume(TPS("freeze_processes"), 0, false); + if (!error) + return 0; + + suspend_stats.failed_freeze++; + dpm_save_failed_step(SUSPEND_FREEZE); + pm_notifier_call_chain(PM_POST_SUSPEND); + Restore: + pm_restore_console(); + return error; +} + +/* default implementation */ +void __weak arch_suspend_disable_irqs(void) +{ + local_irq_disable(); +} + +/* default implementation */ +void __weak arch_suspend_enable_irqs(void) +{ + local_irq_enable(); +} + +/** + * suspend_enter - Make the system enter the given sleep state. + * @state: System sleep state to enter. + * @wakeup: Returns information that the sleep state should not be re-entered. + * + * This function should be called after devices have been suspended. 
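+ * Note: the error paths below fall through the Platform_*/Devices_* labels,
+ * so every preparation step that succeeded is unwound in reverse order on
+ * the way out.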
+ */ +static int suspend_enter(suspend_state_t state, bool *wakeup) +{ + int error; + + error = platform_suspend_prepare(state); + if (error) + goto Platform_finish; + + error = dpm_suspend_late(PMSG_SUSPEND); + if (error) { + pr_err("late suspend of devices failed\n"); + goto Platform_finish; + } + error = platform_suspend_prepare_late(state); + if (error) + goto Devices_early_resume; + + error = dpm_suspend_noirq(PMSG_SUSPEND); + if (error) { + pr_err("noirq suspend of devices failed\n"); + goto Platform_early_resume; + } + error = platform_suspend_prepare_noirq(state); + if (error) + goto Platform_wake; + + if (suspend_test(TEST_PLATFORM)) + goto Platform_wake; + + if (state == PM_SUSPEND_TO_IDLE) { + s2idle_loop(); + goto Platform_wake; + } + + error = pm_sleep_disable_secondary_cpus(); + if (error || suspend_test(TEST_CPUS)) + goto Enable_cpus; + + arch_suspend_disable_irqs(); + BUG_ON(!irqs_disabled()); + + system_state = SYSTEM_SUSPEND; + + error = syscore_suspend(); + if (!error) { + *wakeup = pm_wakeup_pending(); + if (!(suspend_test(TEST_CORE) || *wakeup)) { + trace_suspend_resume(TPS("machine_suspend"), + state, true); + error = suspend_ops->enter(state); + trace_suspend_resume(TPS("machine_suspend"), + state, false); + } else if (*wakeup) { + error = -EBUSY; + } + syscore_resume(); + } + + system_state = SYSTEM_RUNNING; + + arch_suspend_enable_irqs(); + BUG_ON(irqs_disabled()); + + Enable_cpus: + pm_sleep_enable_secondary_cpus(); + + Platform_wake: + platform_resume_noirq(state); + dpm_resume_noirq(PMSG_RESUME); + + Platform_early_resume: + platform_resume_early(state); + + Devices_early_resume: + dpm_resume_early(PMSG_RESUME); + + Platform_finish: + platform_resume_finish(state); + return error; +} + +/** + * suspend_devices_and_enter - Suspend devices and enter system sleep state. + * @state: System sleep state to enter. + */ +int suspend_devices_and_enter(suspend_state_t state) +{ + int error; + bool wakeup = false; + + if (!sleep_state_supported(state)) + return -ENOSYS; + + pm_suspend_target_state = state; + + if (state == PM_SUSPEND_TO_IDLE) + pm_set_suspend_no_platform(); + + error = platform_suspend_begin(state); + if (error) + goto Close; + + suspend_console(); + suspend_test_start(); + error = dpm_suspend_start(PMSG_SUSPEND); + if (error) { + pr_err("Some devices failed to suspend, or early wake event detected\n"); + goto Recover_platform; + } + suspend_test_finish("suspend devices"); + if (suspend_test(TEST_DEVICES)) + goto Recover_platform; + + do { + error = suspend_enter(state, &wakeup); + } while (!error && !wakeup && platform_suspend_again(state)); + + Resume_devices: + suspend_test_start(); + dpm_resume_end(PMSG_RESUME); + suspend_test_finish("resume devices"); + trace_suspend_resume(TPS("resume_console"), state, true); + resume_console(); + trace_suspend_resume(TPS("resume_console"), state, false); + + Close: + platform_resume_end(state); + pm_suspend_target_state = PM_SUSPEND_ON; + return error; + + Recover_platform: + platform_recover(state); + goto Resume_devices; +} + +/** + * suspend_finish - Clean up before finishing the suspend sequence. + * + * Call platform code to clean up, restart processes, and free the console that + * we've allocated. This routine is not called for hibernation. + */ +static void suspend_finish(void) +{ + suspend_thaw_processes(); + pm_notifier_call_chain(PM_POST_SUSPEND); + pm_restore_console(); +} + +/** + * enter_state - Do common work needed to enter system sleep state. + * @state: System sleep state to enter. 
+ * + * Make sure that no one else is trying to put the system into a sleep state. + * Fail if that's not the case. Otherwise, prepare for system suspend, make the + * system enter the given sleep state and clean up after wakeup. + */ +static int enter_state(suspend_state_t state) +{ + int error; + + trace_suspend_resume(TPS("suspend_enter"), state, true); + if (state == PM_SUSPEND_TO_IDLE) { +#ifdef CONFIG_PM_DEBUG + if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { + pr_warn("Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n"); + return -EAGAIN; + } +#endif + } else if (!valid_state(state)) { + return -EINVAL; + } + if (!mutex_trylock(&system_transition_mutex)) + return -EBUSY; + + if (state == PM_SUSPEND_TO_IDLE) + s2idle_begin(); + + if (sync_on_suspend_enabled) { + trace_suspend_resume(TPS("sync_filesystems"), 0, true); + ksys_sync_helper(); + trace_suspend_resume(TPS("sync_filesystems"), 0, false); + } + + pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); + pm_suspend_clear_flags(); + error = suspend_prepare(state); + if (error) + goto Unlock; + + if (suspend_test(TEST_FREEZER)) + goto Finish; + + trace_suspend_resume(TPS("suspend_enter"), state, false); + pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]); + pm_restrict_gfp_mask(); + error = suspend_devices_and_enter(state); + pm_restore_gfp_mask(); + + Finish: + events_check_enabled = false; + pm_pr_dbg("Finishing wakeup.\n"); + suspend_finish(); + Unlock: + mutex_unlock(&system_transition_mutex); + return error; +} + +/** + * pm_suspend - Externally visible function for suspending the system. + * @state: System sleep state to enter. + * + * Check if the value of @state represents one of the supported states, + * execute enter_state() and update system suspend statistics. + */ +int pm_suspend(suspend_state_t state) +{ + int error; + + if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) + return -EINVAL; + + pr_info("suspend entry (%s)\n", mem_sleep_labels[state]); + error = enter_state(state); + if (error) { + suspend_stats.fail++; + dpm_save_failed_errno(error); + } else { + suspend_stats.success++; + } + pr_info("suspend exit\n"); + return error; +} +EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c new file mode 100644 index 0000000000..b663a97f58 --- /dev/null +++ b/kernel/power/suspend_test.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kernel/power/suspend_test.c - Suspend to RAM and standby test facility. + * + * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz> + */ + +#include <linux/init.h> +#include <linux/rtc.h> + +#include "power.h" + +/* + * We test the system suspend code by setting an RTC wakealarm a short + * time in the future, then suspending. Suspending the devices won't + * normally take long ... some systems only need a few milliseconds. + * + * The time it takes is system-specific though, so when we test this + * during system bootup we allow a LOT of time. + */ +#define TEST_SUSPEND_SECONDS 10 + +static unsigned long suspend_test_start_time; +static u32 test_repeat_count_max = 1; +static u32 test_repeat_count_current; + +void suspend_test_start(void) +{ + /* FIXME Use better timebase than "jiffies", ideally a clocksource. + * What we want is a hardware counter that will work correctly even + * during the irqs-are-off stages of the suspend/resume cycle... 
+ */ + suspend_test_start_time = jiffies; +} + +void suspend_test_finish(const char *label) +{ + long nj = jiffies - suspend_test_start_time; + unsigned msec; + + msec = jiffies_to_msecs(abs(nj)); + pr_info("PM: %s took %d.%03d seconds\n", label, + msec / 1000, msec % 1000); + + /* Warning on suspend means the RTC alarm period needs to be + * larger -- the system was sooo slooowwww to suspend that the + * alarm (should have) fired before the system went to sleep! + * + * Warning on either suspend or resume also means the system + * has some performance issues. The stack dump of a WARN_ON + * is more likely to get the right attention than a printk... + */ + WARN(msec > (TEST_SUSPEND_SECONDS * 1000), + "Component: %s, time: %u\n", label, msec); +} + +/* + * To test system suspend, we need a hands-off mechanism to resume the + * system. RTCs wake alarms are a common self-contained mechanism. + */ + +static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) +{ + static char err_readtime[] __initdata = + KERN_ERR "PM: can't read %s time, err %d\n"; + static char err_wakealarm [] __initdata = + KERN_ERR "PM: can't set %s wakealarm, err %d\n"; + static char err_suspend[] __initdata = + KERN_ERR "PM: suspend test failed, error %d\n"; + static char info_test[] __initdata = + KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; + + time64_t now; + struct rtc_wkalrm alm; + int status; + + /* this may fail if the RTC hasn't been initialized */ +repeat: + status = rtc_read_time(rtc, &alm.time); + if (status < 0) { + printk(err_readtime, dev_name(&rtc->dev), status); + return; + } + now = rtc_tm_to_time64(&alm.time); + + memset(&alm, 0, sizeof alm); + rtc_time64_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); + alm.enabled = true; + + status = rtc_set_alarm(rtc, &alm); + if (status < 0) { + printk(err_wakealarm, dev_name(&rtc->dev), status); + return; + } + + if (state == PM_SUSPEND_MEM) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + if (status == -ENODEV) + state = PM_SUSPEND_STANDBY; + } + if (state == PM_SUSPEND_STANDBY) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + if (status < 0) + state = PM_SUSPEND_TO_IDLE; + } + if (state == PM_SUSPEND_TO_IDLE) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + } + + if (status < 0) + printk(err_suspend, status); + + test_repeat_count_current++; + if (test_repeat_count_current < test_repeat_count_max) + goto repeat; + + /* Some platforms can't detect that the alarm triggered the + * wakeup, or (accordingly) disable it after it afterwards. + * It's supposed to give oneshot behavior; cope. + */ + alm.enabled = false; + rtc_set_alarm(rtc, &alm); +} + +static int __init has_wakealarm(struct device *dev, const void *data) +{ + struct rtc_device *candidate = to_rtc_device(dev); + + if (!test_bit(RTC_FEATURE_ALARM, candidate->features)) + return 0; + if (!device_may_wakeup(candidate->dev.parent)) + return 0; + + return 1; +} + +/* + * Kernel options like "test_suspend=mem" force suspend/resume sanity tests + * at startup time. They're normally disabled, for faster boot and because + * we can't know which states really work on this particular system. 
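+ * For example, booting with "test_suspend=mem" runs the RTC-driven mem
+ * suspend test once at startup, while "test_suspend=mem,3" (parsed by
+ * setup_test_suspend() below) repeats it three times.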
+ */ +static const char *test_state_label __initdata; + +static char warn_bad_state[] __initdata = + KERN_WARNING "PM: can't test '%s' suspend state\n"; + +static int __init setup_test_suspend(char *value) +{ + int i; + char *repeat; + char *suspend_type; + + /* example : "=mem[,N]" ==> "mem[,N]" */ + value++; + suspend_type = strsep(&value, ","); + if (!suspend_type) + return 1; + + repeat = strsep(&value, ","); + if (repeat) { + if (kstrtou32(repeat, 0, &test_repeat_count_max)) + return 1; + } + + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (!strcmp(pm_labels[i], suspend_type)) { + test_state_label = pm_labels[i]; + return 1; + } + + printk(warn_bad_state, suspend_type); + return 1; +} +__setup("test_suspend", setup_test_suspend); + +static int __init test_suspend(void) +{ + static char warn_no_rtc[] __initdata = + KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; + + struct rtc_device *rtc = NULL; + struct device *dev; + suspend_state_t test_state; + + /* PM is initialized by now; is that state testable? */ + if (!test_state_label) + return 0; + + for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) { + const char *state_label = pm_states[test_state]; + + if (state_label && !strcmp(test_state_label, state_label)) + break; + } + if (test_state == PM_SUSPEND_MAX) { + printk(warn_bad_state, test_state_label); + return 0; + } + + /* RTCs have initialized by now too ... can we use one? */ + dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); + if (dev) { + rtc = rtc_class_open(dev_name(dev)); + put_device(dev); + } + if (!rtc) { + printk(warn_no_rtc); + return 0; + } + + /* go for it */ + test_wakealarm(rtc, test_state); + rtc_class_close(rtc); + return 0; +} +late_initcall(test_suspend); diff --git a/kernel/power/swap.c b/kernel/power/swap.c new file mode 100644 index 0000000000..d71c590550 --- /dev/null +++ b/kernel/power/swap.c @@ -0,0 +1,1619 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/kernel/power/swap.c + * + * This file provides functions for reading the suspend image from + * and writing it to a swap partition. + * + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> + * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> + * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> + */ + +#define pr_fmt(fmt) "PM: " fmt + +#include <linux/module.h> +#include <linux/file.h> +#include <linux/delay.h> +#include <linux/bitops.h> +#include <linux/device.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/pm.h> +#include <linux/slab.h> +#include <linux/lzo.h> +#include <linux/vmalloc.h> +#include <linux/cpumask.h> +#include <linux/atomic.h> +#include <linux/kthread.h> +#include <linux/crc32.h> +#include <linux/ktime.h> + +#include "power.h" + +#define HIBERNATE_SIG "S1SUSPEND" + +u32 swsusp_hardware_signature; + +/* + * When reading an {un,}compressed image, we may restore pages in place, + * in which case some architectures need these pages cleaning before they + * can be executed. We don't know which pages these may be, so clean the lot. + */ +static bool clean_pages_on_read; +static bool clean_pages_on_decompress; + +/* + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page + * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. 
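+ * (With 4 KiB pages and an 8-byte sector_t, MAP_PAGE_ENTRIES works out to
+ * PAGE_SIZE / sizeof(sector_t) - 1 = 511 entries per swap_map_page.)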
+ * These structures are stored on the swap and linked together with the + * help of the .next_swap member. + * + * The swap map is created during suspend. The swap map pages are + * allocated and populated one at a time, so we only need one memory + * page to set up the entire structure. + * + * During resume we pick up all swap_map_page structures into a list. + */ + +#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) + +/* + * Number of free pages that are not high. + */ +static inline unsigned long low_free_pages(void) +{ + return nr_free_pages() - nr_free_highpages(); +} + +/* + * Number of pages required to be kept free while writing the image. Always + * half of all available low pages before the writing starts. + */ +static inline unsigned long reqd_free_pages(void) +{ + return low_free_pages() / 2; +} + +struct swap_map_page { + sector_t entries[MAP_PAGE_ENTRIES]; + sector_t next_swap; +}; + +struct swap_map_page_list { + struct swap_map_page *map; + struct swap_map_page_list *next; +}; + +/* + * The swap_map_handle structure is used for handling swap in + * a file-alike way + */ + +struct swap_map_handle { + struct swap_map_page *cur; + struct swap_map_page_list *maps; + sector_t cur_swap; + sector_t first_sector; + unsigned int k; + unsigned long reqd_free_pages; + u32 crc32; +}; + +struct swsusp_header { + char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - + sizeof(u32) - sizeof(u32)]; + u32 hw_sig; + u32 crc32; + sector_t image; + unsigned int flags; /* Flags to pass to the "boot" kernel */ + char orig_sig[10]; + char sig[10]; +} __packed; + +static struct swsusp_header *swsusp_header; + +/* + * The following functions are used for tracing the allocated + * swap pages, so that they can be freed in case of an error. + */ + +struct swsusp_extent { + struct rb_node node; + unsigned long start; + unsigned long end; +}; + +static struct rb_root swsusp_extents = RB_ROOT; + +static int swsusp_extents_insert(unsigned long swap_offset) +{ + struct rb_node **new = &(swsusp_extents.rb_node); + struct rb_node *parent = NULL; + struct swsusp_extent *ext; + + /* Figure out where to put the new node */ + while (*new) { + ext = rb_entry(*new, struct swsusp_extent, node); + parent = *new; + if (swap_offset < ext->start) { + /* Try to merge */ + if (swap_offset == ext->start - 1) { + ext->start--; + return 0; + } + new = &((*new)->rb_left); + } else if (swap_offset > ext->end) { + /* Try to merge */ + if (swap_offset == ext->end + 1) { + ext->end++; + return 0; + } + new = &((*new)->rb_right); + } else { + /* It already is in the tree */ + return -EINVAL; + } + } + /* Add the new node and rebalance the tree. */ + ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL); + if (!ext) + return -ENOMEM; + + ext->start = swap_offset; + ext->end = swap_offset; + rb_link_node(&ext->node, parent, new); + rb_insert_color(&ext->node, &swsusp_extents); + return 0; +} + +/* + * alloc_swapdev_block - allocate a swap page and register that it has + * been allocated, so that it can be freed in case of an error. + */ + +sector_t alloc_swapdev_block(int swap) +{ + unsigned long offset; + + offset = swp_offset(get_swap_page_of_type(swap)); + if (offset) { + if (swsusp_extents_insert(offset)) + swap_free(swp_entry(swap, offset)); + else + return swapdev_block(swap, offset); + } + return 0; +} + +/* + * free_all_swap_pages - free swap pages allocated for saving image data. + * It also frees the extents used to register which swap entries had been + * allocated. 
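+ * (Consecutively allocated offsets are merged by swsusp_extents_insert(),
+ * e.g. offsets 10, 11 and 12 end up as a single [10, 12] extent, so the
+ * walk below frees whole runs from one rb-tree node at a time.)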
+ */ + +void free_all_swap_pages(int swap) +{ + struct rb_node *node; + + while ((node = swsusp_extents.rb_node)) { + struct swsusp_extent *ext; + unsigned long offset; + + ext = rb_entry(node, struct swsusp_extent, node); + rb_erase(node, &swsusp_extents); + for (offset = ext->start; offset <= ext->end; offset++) + swap_free(swp_entry(swap, offset)); + + kfree(ext); + } +} + +int swsusp_swap_in_use(void) +{ + return (swsusp_extents.rb_node != NULL); +} + +/* + * General things + */ + +static unsigned short root_swap = 0xffff; +static struct block_device *hib_resume_bdev; + +struct hib_bio_batch { + atomic_t count; + wait_queue_head_t wait; + blk_status_t error; + struct blk_plug plug; +}; + +static void hib_init_batch(struct hib_bio_batch *hb) +{ + atomic_set(&hb->count, 0); + init_waitqueue_head(&hb->wait); + hb->error = BLK_STS_OK; + blk_start_plug(&hb->plug); +} + +static void hib_finish_batch(struct hib_bio_batch *hb) +{ + blk_finish_plug(&hb->plug); +} + +static void hib_end_io(struct bio *bio) +{ + struct hib_bio_batch *hb = bio->bi_private; + struct page *page = bio_first_page_all(bio); + + if (bio->bi_status) { + pr_alert("Read-error on swap-device (%u:%u:%Lu)\n", + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), + (unsigned long long)bio->bi_iter.bi_sector); + } + + if (bio_data_dir(bio) == WRITE) + put_page(page); + else if (clean_pages_on_read) + flush_icache_range((unsigned long)page_address(page), + (unsigned long)page_address(page) + PAGE_SIZE); + + if (bio->bi_status && !hb->error) + hb->error = bio->bi_status; + if (atomic_dec_and_test(&hb->count)) + wake_up(&hb->wait); + + bio_put(bio); +} + +static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr, + struct hib_bio_batch *hb) +{ + struct page *page = virt_to_page(addr); + struct bio *bio; + int error = 0; + + bio = bio_alloc(hib_resume_bdev, 1, opf, GFP_NOIO | __GFP_HIGH); + bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + pr_err("Adding page to bio failed at %llu\n", + (unsigned long long)bio->bi_iter.bi_sector); + bio_put(bio); + return -EFAULT; + } + + if (hb) { + bio->bi_end_io = hib_end_io; + bio->bi_private = hb; + atomic_inc(&hb->count); + submit_bio(bio); + } else { + error = submit_bio_wait(bio); + bio_put(bio); + } + + return error; +} + +static int hib_wait_io(struct hib_bio_batch *hb) +{ + /* + * We are relying on the behavior of blk_plug that a thread with + * a plug will flush the plug list before sleeping. 
+ */ + wait_event(hb->wait, atomic_read(&hb->count) == 0); + return blk_status_to_errno(hb->error); +} + +/* + * Saving part + */ +static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) +{ + int error; + + hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); + if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || + !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { + memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); + memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); + swsusp_header->image = handle->first_sector; + if (swsusp_hardware_signature) { + swsusp_header->hw_sig = swsusp_hardware_signature; + flags |= SF_HW_SIG; + } + swsusp_header->flags = flags; + if (flags & SF_CRC32_MODE) + swsusp_header->crc32 = handle->crc32; + error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, + swsusp_resume_block, swsusp_header, NULL); + } else { + pr_err("Swap header not found!\n"); + error = -ENODEV; + } + return error; +} + +/** + * swsusp_swap_check - check if the resume device is a swap device + * and get its index (if so) + * + * This is called before saving image + */ +static int swsusp_swap_check(void) +{ + int res; + + if (swsusp_resume_device) + res = swap_type_of(swsusp_resume_device, swsusp_resume_block); + else + res = find_first_swap(&swsusp_resume_device); + if (res < 0) + return res; + root_swap = res; + + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, + BLK_OPEN_WRITE, NULL, NULL); + if (IS_ERR(hib_resume_bdev)) + return PTR_ERR(hib_resume_bdev); + + res = set_blocksize(hib_resume_bdev, PAGE_SIZE); + if (res < 0) + blkdev_put(hib_resume_bdev, NULL); + + return res; +} + +/** + * write_page - Write one page to given swap location. + * @buf: Address we're writing. + * @offset: Offset of the swap page we're writing to. 
+ * @hb: bio completion batch + */ + +static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) +{ + void *src; + int ret; + + if (!offset) + return -ENOSPC; + + if (hb) { + src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | + __GFP_NORETRY); + if (src) { + copy_page(src, buf); + } else { + ret = hib_wait_io(hb); /* Free pages */ + if (ret) + return ret; + src = (void *)__get_free_page(GFP_NOIO | + __GFP_NOWARN | + __GFP_NORETRY); + if (src) { + copy_page(src, buf); + } else { + WARN_ON_ONCE(1); + hb = NULL; /* Go synchronous */ + src = buf; + } + } + } else { + src = buf; + } + return hib_submit_io(REQ_OP_WRITE | REQ_SYNC, offset, src, hb); +} + +static void release_swap_writer(struct swap_map_handle *handle) +{ + if (handle->cur) + free_page((unsigned long)handle->cur); + handle->cur = NULL; +} + +static int get_swap_writer(struct swap_map_handle *handle) +{ + int ret; + + ret = swsusp_swap_check(); + if (ret) { + if (ret != -ENOSPC) + pr_err("Cannot find swap device, try swapon -a\n"); + return ret; + } + handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); + if (!handle->cur) { + ret = -ENOMEM; + goto err_close; + } + handle->cur_swap = alloc_swapdev_block(root_swap); + if (!handle->cur_swap) { + ret = -ENOSPC; + goto err_rel; + } + handle->k = 0; + handle->reqd_free_pages = reqd_free_pages(); + handle->first_sector = handle->cur_swap; + return 0; +err_rel: + release_swap_writer(handle); +err_close: + swsusp_close(false); + return ret; +} + +static int swap_write_page(struct swap_map_handle *handle, void *buf, + struct hib_bio_batch *hb) +{ + int error = 0; + sector_t offset; + + if (!handle->cur) + return -EINVAL; + offset = alloc_swapdev_block(root_swap); + error = write_page(buf, offset, hb); + if (error) + return error; + handle->cur->entries[handle->k++] = offset; + if (handle->k >= MAP_PAGE_ENTRIES) { + offset = alloc_swapdev_block(root_swap); + if (!offset) + return -ENOSPC; + handle->cur->next_swap = offset; + error = write_page(handle->cur, handle->cur_swap, hb); + if (error) + goto out; + clear_page(handle->cur); + handle->cur_swap = offset; + handle->k = 0; + + if (hb && low_free_pages() <= handle->reqd_free_pages) { + error = hib_wait_io(hb); + if (error) + goto out; + /* + * Recalculate the number of required free pages, to + * make sure we never take more than half. + */ + handle->reqd_free_pages = reqd_free_pages(); + } + } + out: + return error; +} + +static int flush_swap_writer(struct swap_map_handle *handle) +{ + if (handle->cur && handle->cur_swap) + return write_page(handle->cur, handle->cur_swap, NULL); + else + return -EINVAL; +} + +static int swap_writer_finish(struct swap_map_handle *handle, + unsigned int flags, int error) +{ + if (!error) { + pr_info("S"); + error = mark_swapfiles(handle, flags); + pr_cont("|\n"); + flush_swap_writer(handle); + } + + if (error) + free_all_swap_pages(root_swap); + release_swap_writer(handle); + swsusp_close(false); + + return error; +} + +/* We need to remember how much compressed data we need to read. */ +#define LZO_HEADER sizeof(size_t) + +/* Number of pages/bytes we'll compress at one time. */ +#define LZO_UNC_PAGES 32 +#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE) + +/* Number of pages/bytes we need for compressed data (worst case). */ +#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \ + LZO_HEADER, PAGE_SIZE) +#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) + +/* Maximum number of threads for compression/decompression. 
*/ +#define LZO_THREADS 3 + +/* Minimum/maximum number of pages for read buffering. */ +#define LZO_MIN_RD_PAGES 1024 +#define LZO_MAX_RD_PAGES 8192 + + +/** + * save_image - save the suspend image data + */ + +static int save_image(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_write) +{ + unsigned int m; + int ret; + int nr_pages; + int err2; + struct hib_bio_batch hb; + ktime_t start; + ktime_t stop; + + hib_init_batch(&hb); + + pr_info("Saving image data pages (%u pages)...\n", + nr_to_write); + m = nr_to_write / 10; + if (!m) + m = 1; + nr_pages = 0; + start = ktime_get(); + while (1) { + ret = snapshot_read_next(snapshot); + if (ret <= 0) + break; + ret = swap_write_page(handle, data_of(*snapshot), &hb); + if (ret) + break; + if (!(nr_pages % m)) + pr_info("Image saving progress: %3d%%\n", + nr_pages / m * 10); + nr_pages++; + } + err2 = hib_wait_io(&hb); + hib_finish_batch(&hb); + stop = ktime_get(); + if (!ret) + ret = err2; + if (!ret) + pr_info("Image saving done\n"); + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); + return ret; +} + +/* + * Structure used for CRC32. + */ +struct crc_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + unsigned run_threads; /* nr current threads */ + wait_queue_head_t go; /* start crc update */ + wait_queue_head_t done; /* crc update done */ + u32 *crc32; /* points to handle's crc32 */ + size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */ + unsigned char *unc[LZO_THREADS]; /* uncompressed data */ +}; + +/* + * CRC32 update function that runs in its own thread. + */ +static int crc32_threadfn(void *data) +{ + struct crc_data *d = data; + unsigned i; + + while (1) { + wait_event(d->go, atomic_read_acquire(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + atomic_set_release(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + for (i = 0; i < d->run_threads; i++) + *d->crc32 = crc32_le(*d->crc32, + d->unc[i], *d->unc_len[i]); + atomic_set_release(&d->stop, 1); + wake_up(&d->done); + } + return 0; +} +/* + * Structure used for LZO data compression. + */ +struct cmp_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + int ret; /* return code */ + wait_queue_head_t go; /* start compression */ + wait_queue_head_t done; /* compression done */ + size_t unc_len; /* uncompressed length */ + size_t cmp_len; /* compressed length */ + unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ + unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */ +}; + +/* + * Compression function that runs in its own thread. + */ +static int lzo_compress_threadfn(void *data) +{ + struct cmp_data *d = data; + + while (1) { + wait_event(d->go, atomic_read_acquire(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + d->ret = -1; + atomic_set_release(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + d->ret = lzo1x_1_compress(d->unc, d->unc_len, + d->cmp + LZO_HEADER, &d->cmp_len, + d->wrk); + atomic_set_release(&d->stop, 1); + wake_up(&d->done); + } + return 0; +} + +/** + * save_image_lzo - Save the suspend image data compressed with LZO. + * @handle: Swap map handle to use for saving the image. + * @snapshot: Image to read data from. 
+ * @nr_to_write: Number of pages to save. + */ +static int save_image_lzo(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_write) +{ + unsigned int m; + int ret = 0; + int nr_pages; + int err2; + struct hib_bio_batch hb; + ktime_t start; + ktime_t stop; + size_t off; + unsigned thr, run_threads, nr_threads; + unsigned char *page = NULL; + struct cmp_data *data = NULL; + struct crc_data *crc = NULL; + + hib_init_batch(&hb); + + /* + * We'll limit the number of threads for compression to limit memory + * footprint. + */ + nr_threads = num_online_cpus() - 1; + nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + + page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH); + if (!page) { + pr_err("Failed to allocate LZO page\n"); + ret = -ENOMEM; + goto out_clean; + } + + data = vzalloc(array_size(nr_threads, sizeof(*data))); + if (!data) { + pr_err("Failed to allocate LZO data\n"); + ret = -ENOMEM; + goto out_clean; + } + + crc = kzalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) { + pr_err("Failed to allocate crc\n"); + ret = -ENOMEM; + goto out_clean; + } + + /* + * Start the compression threads. + */ + for (thr = 0; thr < nr_threads; thr++) { + init_waitqueue_head(&data[thr].go); + init_waitqueue_head(&data[thr].done); + + data[thr].thr = kthread_run(lzo_compress_threadfn, + &data[thr], + "image_compress/%u", thr); + if (IS_ERR(data[thr].thr)) { + data[thr].thr = NULL; + pr_err("Cannot start compression threads\n"); + ret = -ENOMEM; + goto out_clean; + } + } + + /* + * Start the CRC32 thread. + */ + init_waitqueue_head(&crc->go); + init_waitqueue_head(&crc->done); + + handle->crc32 = 0; + crc->crc32 = &handle->crc32; + for (thr = 0; thr < nr_threads; thr++) { + crc->unc[thr] = data[thr].unc; + crc->unc_len[thr] = &data[thr].unc_len; + } + + crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); + if (IS_ERR(crc->thr)) { + crc->thr = NULL; + pr_err("Cannot start CRC32 thread\n"); + ret = -ENOMEM; + goto out_clean; + } + + /* + * Adjust the number of required free pages after all allocations have + * been done. We don't want to run out of pages when writing. 
+ */ + handle->reqd_free_pages = reqd_free_pages(); + + pr_info("Using %u thread(s) for compression\n", nr_threads); + pr_info("Compressing and saving image data (%u pages)...\n", + nr_to_write); + m = nr_to_write / 10; + if (!m) + m = 1; + nr_pages = 0; + start = ktime_get(); + for (;;) { + for (thr = 0; thr < nr_threads; thr++) { + for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { + ret = snapshot_read_next(snapshot); + if (ret < 0) + goto out_finish; + + if (!ret) + break; + + memcpy(data[thr].unc + off, + data_of(*snapshot), PAGE_SIZE); + + if (!(nr_pages % m)) + pr_info("Image saving progress: %3d%%\n", + nr_pages / m * 10); + nr_pages++; + } + if (!off) + break; + + data[thr].unc_len = off; + + atomic_set_release(&data[thr].ready, 1); + wake_up(&data[thr].go); + } + + if (!thr) + break; + + crc->run_threads = thr; + atomic_set_release(&crc->ready, 1); + wake_up(&crc->go); + + for (run_threads = thr, thr = 0; thr < run_threads; thr++) { + wait_event(data[thr].done, + atomic_read_acquire(&data[thr].stop)); + atomic_set(&data[thr].stop, 0); + + ret = data[thr].ret; + + if (ret < 0) { + pr_err("LZO compression failed\n"); + goto out_finish; + } + + if (unlikely(!data[thr].cmp_len || + data[thr].cmp_len > + lzo1x_worst_compress(data[thr].unc_len))) { + pr_err("Invalid LZO compressed length\n"); + ret = -1; + goto out_finish; + } + + *(size_t *)data[thr].cmp = data[thr].cmp_len; + + /* + * Given we are writing one page at a time to disk, we + * copy that much from the buffer, although the last + * bit will likely be smaller than full page. This is + * OK - we saved the length of the compressed data, so + * any garbage at the end will be discarded when we + * read it. + */ + for (off = 0; + off < LZO_HEADER + data[thr].cmp_len; + off += PAGE_SIZE) { + memcpy(page, data[thr].cmp + off, PAGE_SIZE); + + ret = swap_write_page(handle, page, &hb); + if (ret) + goto out_finish; + } + } + + wait_event(crc->done, atomic_read_acquire(&crc->stop)); + atomic_set(&crc->stop, 0); + } + +out_finish: + err2 = hib_wait_io(&hb); + stop = ktime_get(); + if (!ret) + ret = err2; + if (!ret) + pr_info("Image saving done\n"); + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); +out_clean: + hib_finish_batch(&hb); + if (crc) { + if (crc->thr) + kthread_stop(crc->thr); + kfree(crc); + } + if (data) { + for (thr = 0; thr < nr_threads; thr++) + if (data[thr].thr) + kthread_stop(data[thr].thr); + vfree(data); + } + if (page) free_page((unsigned long)page); + + return ret; +} + +/** + * enough_swap - Make sure we have enough swap to save the image. + * + * Returns TRUE or FALSE after checking the total amount of swap + * space available from the resume partition. + */ + +static int enough_swap(unsigned int nr_pages) +{ + unsigned int free_swap = count_swap_pages(root_swap, 1); + unsigned int required; + + pr_debug("Free swap pages: %u\n", free_swap); + + required = PAGES_FOR_IO + nr_pages; + return free_swap > required; +} + +/** + * swsusp_write - Write entire image and metadata. + * @flags: flags to pass to the "boot" kernel in the image header + * + * It is important _NOT_ to umount filesystems at this point. We want + * them synced (in case something goes wrong) but we DO not want to mark + * filesystem clean: it is not. (And it does not matter, if we resume + * correctly, we'll mark system clean, anyway.) 
+ */ + +int swsusp_write(unsigned int flags) +{ + struct swap_map_handle handle; + struct snapshot_handle snapshot; + struct swsusp_info *header; + unsigned long pages; + int error; + + pages = snapshot_get_image_size(); + error = get_swap_writer(&handle); + if (error) { + pr_err("Cannot get swap writer\n"); + return error; + } + if (flags & SF_NOCOMPRESS_MODE) { + if (!enough_swap(pages)) { + pr_err("Not enough free swap\n"); + error = -ENOSPC; + goto out_finish; + } + } + memset(&snapshot, 0, sizeof(struct snapshot_handle)); + error = snapshot_read_next(&snapshot); + if (error < (int)PAGE_SIZE) { + if (error >= 0) + error = -EFAULT; + + goto out_finish; + } + header = (struct swsusp_info *)data_of(snapshot); + error = swap_write_page(&handle, header, NULL); + if (!error) { + error = (flags & SF_NOCOMPRESS_MODE) ? + save_image(&handle, &snapshot, pages - 1) : + save_image_lzo(&handle, &snapshot, pages - 1); + } +out_finish: + error = swap_writer_finish(&handle, flags, error); + return error; +} + +/* + * The following functions allow us to read data using a swap map + * in a file-like way. + */ + +static void release_swap_reader(struct swap_map_handle *handle) +{ + struct swap_map_page_list *tmp; + + while (handle->maps) { + if (handle->maps->map) + free_page((unsigned long)handle->maps->map); + tmp = handle->maps; + handle->maps = handle->maps->next; + kfree(tmp); + } + handle->cur = NULL; +} + +static int get_swap_reader(struct swap_map_handle *handle, + unsigned int *flags_p) +{ + int error; + struct swap_map_page_list *tmp, *last; + sector_t offset; + + *flags_p = swsusp_header->flags; + + if (!swsusp_header->image) /* how can this happen? */ + return -EINVAL; + + handle->cur = NULL; + last = handle->maps = NULL; + offset = swsusp_header->image; + while (offset) { + tmp = kzalloc(sizeof(*handle->maps), GFP_KERNEL); + if (!tmp) { + release_swap_reader(handle); + return -ENOMEM; + } + if (!handle->maps) + handle->maps = tmp; + if (last) + last->next = tmp; + last = tmp; + + tmp->map = (struct swap_map_page *) + __get_free_page(GFP_NOIO | __GFP_HIGH); + if (!tmp->map) { + release_swap_reader(handle); + return -ENOMEM; + } + + error = hib_submit_io(REQ_OP_READ, offset, tmp->map, NULL); + if (error) { + release_swap_reader(handle); + return error; + } + offset = tmp->map->next_swap; + } + handle->k = 0; + handle->cur = handle->maps->map; + return 0; +} + +static int swap_read_page(struct swap_map_handle *handle, void *buf, + struct hib_bio_batch *hb) +{ + sector_t offset; + int error; + struct swap_map_page_list *tmp; + + if (!handle->cur) + return -EINVAL; + offset = handle->cur->entries[handle->k]; + if (!offset) + return -EFAULT; + error = hib_submit_io(REQ_OP_READ, offset, buf, hb); + if (error) + return error; + if (++handle->k >= MAP_PAGE_ENTRIES) { + handle->k = 0; + free_page((unsigned long)handle->maps->map); + tmp = handle->maps; + handle->maps = handle->maps->next; + kfree(tmp); + if (!handle->maps) + release_swap_reader(handle); + else + handle->cur = handle->maps->map; + } + return error; +} + +static int swap_reader_finish(struct swap_map_handle *handle) +{ + release_swap_reader(handle); + + return 0; +} + +/** + * load_image - load the image using the swap map handle + * @handle and the snapshot handle @snapshot + * (assume there are @nr_pages pages to load) + */ + +static int load_image(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_read) +{ + unsigned int m; + int ret = 0; + ktime_t start; + ktime_t stop; + struct hib_bio_batch hb; 
+ int err2; + unsigned nr_pages; + + hib_init_batch(&hb); + + clean_pages_on_read = true; + pr_info("Loading image data pages (%u pages)...\n", nr_to_read); + m = nr_to_read / 10; + if (!m) + m = 1; + nr_pages = 0; + start = ktime_get(); + for ( ; ; ) { + ret = snapshot_write_next(snapshot); + if (ret <= 0) + break; + ret = swap_read_page(handle, data_of(*snapshot), &hb); + if (ret) + break; + if (snapshot->sync_read) + ret = hib_wait_io(&hb); + if (ret) + break; + if (!(nr_pages % m)) + pr_info("Image loading progress: %3d%%\n", + nr_pages / m * 10); + nr_pages++; + } + err2 = hib_wait_io(&hb); + hib_finish_batch(&hb); + stop = ktime_get(); + if (!ret) + ret = err2; + if (!ret) { + pr_info("Image loading done\n"); + snapshot_write_finalize(snapshot); + if (!snapshot_image_loaded(snapshot)) + ret = -ENODATA; + } + swsusp_show_speed(start, stop, nr_to_read, "Read"); + return ret; +} + +/* + * Structure used for LZO data decompression. + */ +struct dec_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + int ret; /* return code */ + wait_queue_head_t go; /* start decompression */ + wait_queue_head_t done; /* decompression done */ + size_t unc_len; /* uncompressed length */ + size_t cmp_len; /* compressed length */ + unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ +}; + +/* + * Decompression function that runs in its own thread. + */ +static int lzo_decompress_threadfn(void *data) +{ + struct dec_data *d = data; + + while (1) { + wait_event(d->go, atomic_read_acquire(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + d->ret = -1; + atomic_set_release(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + d->unc_len = LZO_UNC_SIZE; + d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, + d->unc, &d->unc_len); + if (clean_pages_on_decompress) + flush_icache_range((unsigned long)d->unc, + (unsigned long)d->unc + d->unc_len); + + atomic_set_release(&d->stop, 1); + wake_up(&d->done); + } + return 0; +} + +/** + * load_image_lzo - Load compressed image data and decompress them with LZO. + * @handle: Swap map handle to use for loading data. + * @snapshot: Image to copy uncompressed data into. + * @nr_to_read: Number of pages to load. + */ +static int load_image_lzo(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_read) +{ + unsigned int m; + int ret = 0; + int eof = 0; + struct hib_bio_batch hb; + ktime_t start; + ktime_t stop; + unsigned nr_pages; + size_t off; + unsigned i, thr, run_threads, nr_threads; + unsigned ring = 0, pg = 0, ring_size = 0, + have = 0, want, need, asked = 0; + unsigned long read_pages = 0; + unsigned char **page = NULL; + struct dec_data *data = NULL; + struct crc_data *crc = NULL; + + hib_init_batch(&hb); + + /* + * We'll limit the number of threads for decompression to limit memory + * footprint. 
+ */ + nr_threads = num_online_cpus() - 1; + nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + + page = vmalloc(array_size(LZO_MAX_RD_PAGES, sizeof(*page))); + if (!page) { + pr_err("Failed to allocate LZO page\n"); + ret = -ENOMEM; + goto out_clean; + } + + data = vzalloc(array_size(nr_threads, sizeof(*data))); + if (!data) { + pr_err("Failed to allocate LZO data\n"); + ret = -ENOMEM; + goto out_clean; + } + + crc = kzalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) { + pr_err("Failed to allocate crc\n"); + ret = -ENOMEM; + goto out_clean; + } + + clean_pages_on_decompress = true; + + /* + * Start the decompression threads. + */ + for (thr = 0; thr < nr_threads; thr++) { + init_waitqueue_head(&data[thr].go); + init_waitqueue_head(&data[thr].done); + + data[thr].thr = kthread_run(lzo_decompress_threadfn, + &data[thr], + "image_decompress/%u", thr); + if (IS_ERR(data[thr].thr)) { + data[thr].thr = NULL; + pr_err("Cannot start decompression threads\n"); + ret = -ENOMEM; + goto out_clean; + } + } + + /* + * Start the CRC32 thread. + */ + init_waitqueue_head(&crc->go); + init_waitqueue_head(&crc->done); + + handle->crc32 = 0; + crc->crc32 = &handle->crc32; + for (thr = 0; thr < nr_threads; thr++) { + crc->unc[thr] = data[thr].unc; + crc->unc_len[thr] = &data[thr].unc_len; + } + + crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); + if (IS_ERR(crc->thr)) { + crc->thr = NULL; + pr_err("Cannot start CRC32 thread\n"); + ret = -ENOMEM; + goto out_clean; + } + + /* + * Set the number of pages for read buffering. + * This is complete guesswork, because we'll only know the real + * picture once prepare_image() is called, which is much later on + * during the image load phase. We'll assume the worst case and + * say that none of the image pages are from high memory. + */ + if (low_free_pages() > snapshot_get_image_size()) + read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; + read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); + + for (i = 0; i < read_pages; i++) { + page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? + GFP_NOIO | __GFP_HIGH : + GFP_NOIO | __GFP_NOWARN | + __GFP_NORETRY); + + if (!page[i]) { + if (i < LZO_CMP_PAGES) { + ring_size = i; + pr_err("Failed to allocate LZO pages\n"); + ret = -ENOMEM; + goto out_clean; + } else { + break; + } + } + } + want = ring_size = i; + + pr_info("Using %u thread(s) for decompression\n", nr_threads); + pr_info("Loading and decompressing image data (%u pages)...\n", + nr_to_read); + m = nr_to_read / 10; + if (!m) + m = 1; + nr_pages = 0; + start = ktime_get(); + + ret = snapshot_write_next(snapshot); + if (ret <= 0) + goto out_finish; + + for(;;) { + for (i = 0; !eof && i < want; i++) { + ret = swap_read_page(handle, page[ring], &hb); + if (ret) { + /* + * On real read error, finish. On end of data, + * set EOF flag and just exit the read loop. + */ + if (handle->cur && + handle->cur->entries[handle->k]) { + goto out_finish; + } else { + eof = 1; + break; + } + } + if (++ring >= ring_size) + ring = 0; + } + asked += i; + want -= i; + + /* + * We are out of data, wait for some more. 
+ */ + if (!have) { + if (!asked) + break; + + ret = hib_wait_io(&hb); + if (ret) + goto out_finish; + have += asked; + asked = 0; + if (eof) + eof = 2; + } + + if (crc->run_threads) { + wait_event(crc->done, atomic_read_acquire(&crc->stop)); + atomic_set(&crc->stop, 0); + crc->run_threads = 0; + } + + for (thr = 0; have && thr < nr_threads; thr++) { + data[thr].cmp_len = *(size_t *)page[pg]; + if (unlikely(!data[thr].cmp_len || + data[thr].cmp_len > + lzo1x_worst_compress(LZO_UNC_SIZE))) { + pr_err("Invalid LZO compressed length\n"); + ret = -1; + goto out_finish; + } + + need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER, + PAGE_SIZE); + if (need > have) { + if (eof > 1) { + ret = -1; + goto out_finish; + } + break; + } + + for (off = 0; + off < LZO_HEADER + data[thr].cmp_len; + off += PAGE_SIZE) { + memcpy(data[thr].cmp + off, + page[pg], PAGE_SIZE); + have--; + want++; + if (++pg >= ring_size) + pg = 0; + } + + atomic_set_release(&data[thr].ready, 1); + wake_up(&data[thr].go); + } + + /* + * Wait for more data while we are decompressing. + */ + if (have < LZO_CMP_PAGES && asked) { + ret = hib_wait_io(&hb); + if (ret) + goto out_finish; + have += asked; + asked = 0; + if (eof) + eof = 2; + } + + for (run_threads = thr, thr = 0; thr < run_threads; thr++) { + wait_event(data[thr].done, + atomic_read_acquire(&data[thr].stop)); + atomic_set(&data[thr].stop, 0); + + ret = data[thr].ret; + + if (ret < 0) { + pr_err("LZO decompression failed\n"); + goto out_finish; + } + + if (unlikely(!data[thr].unc_len || + data[thr].unc_len > LZO_UNC_SIZE || + data[thr].unc_len & (PAGE_SIZE - 1))) { + pr_err("Invalid LZO uncompressed length\n"); + ret = -1; + goto out_finish; + } + + for (off = 0; + off < data[thr].unc_len; off += PAGE_SIZE) { + memcpy(data_of(*snapshot), + data[thr].unc + off, PAGE_SIZE); + + if (!(nr_pages % m)) + pr_info("Image loading progress: %3d%%\n", + nr_pages / m * 10); + nr_pages++; + + ret = snapshot_write_next(snapshot); + if (ret <= 0) { + crc->run_threads = thr + 1; + atomic_set_release(&crc->ready, 1); + wake_up(&crc->go); + goto out_finish; + } + } + } + + crc->run_threads = thr; + atomic_set_release(&crc->ready, 1); + wake_up(&crc->go); + } + +out_finish: + if (crc->run_threads) { + wait_event(crc->done, atomic_read_acquire(&crc->stop)); + atomic_set(&crc->stop, 0); + } + stop = ktime_get(); + if (!ret) { + pr_info("Image loading done\n"); + snapshot_write_finalize(snapshot); + if (!snapshot_image_loaded(snapshot)) + ret = -ENODATA; + if (!ret) { + if (swsusp_header->flags & SF_CRC32_MODE) { + if(handle->crc32 != swsusp_header->crc32) { + pr_err("Invalid image CRC32!\n"); + ret = -ENODATA; + } + } + } + } + swsusp_show_speed(start, stop, nr_to_read, "Read"); +out_clean: + hib_finish_batch(&hb); + for (i = 0; i < ring_size; i++) + free_page((unsigned long)page[i]); + if (crc) { + if (crc->thr) + kthread_stop(crc->thr); + kfree(crc); + } + if (data) { + for (thr = 0; thr < nr_threads; thr++) + if (data[thr].thr) + kthread_stop(data[thr].thr); + vfree(data); + } + vfree(page); + + return ret; +} + +/** + * swsusp_read - read the hibernation image. + * @flags_p: flags passed by the "frozen" kernel in the image header should + * be written into this memory location + */ + +int swsusp_read(unsigned int *flags_p) +{ + int error; + struct swap_map_handle handle; + struct snapshot_handle snapshot; + struct swsusp_info *header; + + memset(&snapshot, 0, sizeof(struct snapshot_handle)); + error = snapshot_write_next(&snapshot); + if (error < (int)PAGE_SIZE) + return error < 0 ? 
error : -EFAULT; + header = (struct swsusp_info *)data_of(snapshot); + error = get_swap_reader(&handle, flags_p); + if (error) + goto end; + if (!error) + error = swap_read_page(&handle, header, NULL); + if (!error) { + error = (*flags_p & SF_NOCOMPRESS_MODE) ? + load_image(&handle, &snapshot, header->pages - 1) : + load_image_lzo(&handle, &snapshot, header->pages - 1); + } + swap_reader_finish(&handle); +end: + if (!error) + pr_debug("Image successfully loaded\n"); + else + pr_debug("Error %d resuming\n", error); + return error; +} + +static void *swsusp_holder; + +/** + * swsusp_check - Check for swsusp signature in the resume device + * @exclusive: Open the resume device exclusively. + */ + +int swsusp_check(bool exclusive) +{ + void *holder = exclusive ? &swsusp_holder : NULL; + int error; + + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ, + holder, NULL); + if (!IS_ERR(hib_resume_bdev)) { + set_blocksize(hib_resume_bdev, PAGE_SIZE); + clear_page(swsusp_header); + error = hib_submit_io(REQ_OP_READ, swsusp_resume_block, + swsusp_header, NULL); + if (error) + goto put; + + if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { + memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); + /* Reset swap signature now */ + error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, + swsusp_resume_block, + swsusp_header, NULL); + } else { + error = -EINVAL; + } + if (!error && swsusp_header->flags & SF_HW_SIG && + swsusp_header->hw_sig != swsusp_hardware_signature) { + pr_info("Suspend image hardware signature mismatch (%08x now %08x); aborting resume.\n", + swsusp_header->hw_sig, swsusp_hardware_signature); + error = -EINVAL; + } + +put: + if (error) + blkdev_put(hib_resume_bdev, holder); + else + pr_debug("Image signature found, resuming\n"); + } else { + error = PTR_ERR(hib_resume_bdev); + } + + if (error) + pr_debug("Image not found (code %d)\n", error); + + return error; +} + +/** + * swsusp_close - close swap device. + * @exclusive: Close the resume device which is exclusively opened. + */ + +void swsusp_close(bool exclusive) +{ + if (IS_ERR(hib_resume_bdev)) { + pr_debug("Image device not initialised\n"); + return; + } + + blkdev_put(hib_resume_bdev, exclusive ? &swsusp_holder : NULL); +} + +/** + * swsusp_unmark - Unmark swsusp signature in the resume device + */ + +#ifdef CONFIG_SUSPEND +int swsusp_unmark(void) +{ + int error; + + hib_submit_io(REQ_OP_READ, swsusp_resume_block, + swsusp_header, NULL); + if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { + memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); + error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, + swsusp_resume_block, + swsusp_header, NULL); + } else { + pr_err("Cannot find swsusp signature!\n"); + error = -ENODEV; + } + + /* + * We just returned from suspend, we don't need the image any more. + */ + free_all_swap_pages(root_swap); + + return error; +} +#endif + +static int __init swsusp_header_init(void) +{ + swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); + if (!swsusp_header) + panic("Could not allocate memory for swsusp_header\n"); + return 0; +} + +core_initcall(swsusp_header_init); diff --git a/kernel/power/user.c b/kernel/power/user.c new file mode 100644 index 0000000000..3a4e70366f --- /dev/null +++ b/kernel/power/user.c @@ -0,0 +1,466 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/kernel/power/user.c + * + * This file provides the user space interface for software suspend/resume. + * + * Copyright (C) 2006 Rafael J. 
Wysocki <rjw@sisk.pl> + */ + +#include <linux/suspend.h> +#include <linux/reboot.h> +#include <linux/string.h> +#include <linux/device.h> +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/pm.h> +#include <linux/fs.h> +#include <linux/compat.h> +#include <linux/console.h> +#include <linux/cpu.h> +#include <linux/freezer.h> + +#include <linux/uaccess.h> + +#include "power.h" + +static bool need_wait; + +static struct snapshot_data { + struct snapshot_handle handle; + int swap; + int mode; + bool frozen; + bool ready; + bool platform_support; + bool free_bitmaps; + dev_t dev; +} snapshot_state; + +int is_hibernate_resume_dev(dev_t dev) +{ + return hibernation_available() && snapshot_state.dev == dev; +} + +static int snapshot_open(struct inode *inode, struct file *filp) +{ + struct snapshot_data *data; + unsigned int sleep_flags; + int error; + + if (!hibernation_available()) + return -EPERM; + + sleep_flags = lock_system_sleep(); + + if (!hibernate_acquire()) { + error = -EBUSY; + goto Unlock; + } + + if ((filp->f_flags & O_ACCMODE) == O_RDWR) { + hibernate_release(); + error = -ENOSYS; + goto Unlock; + } + nonseekable_open(inode, filp); + data = &snapshot_state; + filp->private_data = data; + memset(&data->handle, 0, sizeof(struct snapshot_handle)); + if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { + /* Hibernating. The image device should be accessible. */ + data->swap = swap_type_of(swsusp_resume_device, 0); + data->mode = O_RDONLY; + data->free_bitmaps = false; + error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); + } else { + /* + * Resuming. We may need to wait for the image device to + * appear. + */ + need_wait = true; + + data->swap = -1; + data->mode = O_WRONLY; + error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE); + if (!error) { + error = create_basic_memory_bitmaps(); + data->free_bitmaps = !error; + } + } + if (error) + hibernate_release(); + + data->frozen = false; + data->ready = false; + data->platform_support = false; + data->dev = 0; + + Unlock: + unlock_system_sleep(sleep_flags); + + return error; +} + +static int snapshot_release(struct inode *inode, struct file *filp) +{ + struct snapshot_data *data; + unsigned int sleep_flags; + + sleep_flags = lock_system_sleep(); + + swsusp_free(); + data = filp->private_data; + data->dev = 0; + free_all_swap_pages(data->swap); + if (data->frozen) { + pm_restore_gfp_mask(); + free_basic_memory_bitmaps(); + thaw_processes(); + } else if (data->free_bitmaps) { + free_basic_memory_bitmaps(); + } + pm_notifier_call_chain(data->mode == O_RDONLY ? + PM_POST_HIBERNATION : PM_POST_RESTORE); + hibernate_release(); + + unlock_system_sleep(sleep_flags); + + return 0; +} + +static ssize_t snapshot_read(struct file *filp, char __user *buf, + size_t count, loff_t *offp) +{ + loff_t pg_offp = *offp & ~PAGE_MASK; + struct snapshot_data *data; + unsigned int sleep_flags; + ssize_t res; + + sleep_flags = lock_system_sleep(); + + data = filp->private_data; + if (!data->ready) { + res = -ENODATA; + goto Unlock; + } + if (!pg_offp) { /* on page boundary? 
*/ + res = snapshot_read_next(&data->handle); + if (res <= 0) + goto Unlock; + } else { + res = PAGE_SIZE - pg_offp; + } + + res = simple_read_from_buffer(buf, count, &pg_offp, + data_of(data->handle), res); + if (res > 0) + *offp += res; + + Unlock: + unlock_system_sleep(sleep_flags); + + return res; +} + +static ssize_t snapshot_write(struct file *filp, const char __user *buf, + size_t count, loff_t *offp) +{ + loff_t pg_offp = *offp & ~PAGE_MASK; + struct snapshot_data *data; + unsigned long sleep_flags; + ssize_t res; + + if (need_wait) { + wait_for_device_probe(); + need_wait = false; + } + + sleep_flags = lock_system_sleep(); + + data = filp->private_data; + + if (!pg_offp) { + res = snapshot_write_next(&data->handle); + if (res <= 0) + goto unlock; + } else { + res = PAGE_SIZE; + } + + if (!data_of(data->handle)) { + res = -EINVAL; + goto unlock; + } + + res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, + buf, count); + if (res > 0) + *offp += res; +unlock: + unlock_system_sleep(sleep_flags); + + return res; +} + +struct compat_resume_swap_area { + compat_loff_t offset; + u32 dev; +} __packed; + +static int snapshot_set_swap_area(struct snapshot_data *data, + void __user *argp) +{ + sector_t offset; + dev_t swdev; + + if (swsusp_swap_in_use()) + return -EPERM; + + if (in_compat_syscall()) { + struct compat_resume_swap_area swap_area; + + if (copy_from_user(&swap_area, argp, sizeof(swap_area))) + return -EFAULT; + swdev = new_decode_dev(swap_area.dev); + offset = swap_area.offset; + } else { + struct resume_swap_area swap_area; + + if (copy_from_user(&swap_area, argp, sizeof(swap_area))) + return -EFAULT; + swdev = new_decode_dev(swap_area.dev); + offset = swap_area.offset; + } + + /* + * User space encodes device types as two-byte values, + * so we need to recode them + */ + data->swap = swap_type_of(swdev, offset); + if (data->swap < 0) + return swdev ? 
-ENODEV : -EINVAL; + data->dev = swdev; + return 0; +} + +static long snapshot_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int error = 0; + struct snapshot_data *data; + loff_t size; + sector_t offset; + + if (need_wait) { + wait_for_device_probe(); + need_wait = false; + } + + if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) + return -ENOTTY; + if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) + return -ENOTTY; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!mutex_trylock(&system_transition_mutex)) + return -EBUSY; + + lock_device_hotplug(); + data = filp->private_data; + + switch (cmd) { + + case SNAPSHOT_FREEZE: + if (data->frozen) + break; + + ksys_sync_helper(); + + error = freeze_processes(); + if (error) + break; + + error = create_basic_memory_bitmaps(); + if (error) + thaw_processes(); + else + data->frozen = true; + + break; + + case SNAPSHOT_UNFREEZE: + if (!data->frozen || data->ready) + break; + pm_restore_gfp_mask(); + free_basic_memory_bitmaps(); + data->free_bitmaps = false; + thaw_processes(); + data->frozen = false; + break; + + case SNAPSHOT_CREATE_IMAGE: + if (data->mode != O_RDONLY || !data->frozen || data->ready) { + error = -EPERM; + break; + } + pm_restore_gfp_mask(); + error = hibernation_snapshot(data->platform_support); + if (!error) { + error = put_user(in_suspend, (int __user *)arg); + data->ready = !freezer_test_done && !error; + freezer_test_done = false; + } + break; + + case SNAPSHOT_ATOMIC_RESTORE: + snapshot_write_finalize(&data->handle); + if (data->mode != O_WRONLY || !data->frozen || + !snapshot_image_loaded(&data->handle)) { + error = -EPERM; + break; + } + error = hibernation_restore(data->platform_support); + break; + + case SNAPSHOT_FREE: + swsusp_free(); + memset(&data->handle, 0, sizeof(struct snapshot_handle)); + data->ready = false; + /* + * It is necessary to thaw kernel threads here, because + * SNAPSHOT_CREATE_IMAGE may be invoked directly after + * SNAPSHOT_FREE. In that case, if kernel threads were not + * thawed, the preallocation of memory carried out by + * hibernation_snapshot() might run into problems (i.e. it + * might fail or even deadlock). 
+ */ + thaw_kernel_threads(); + break; + + case SNAPSHOT_PREF_IMAGE_SIZE: + image_size = arg; + break; + + case SNAPSHOT_GET_IMAGE_SIZE: + if (!data->ready) { + error = -ENODATA; + break; + } + size = snapshot_get_image_size(); + size <<= PAGE_SHIFT; + error = put_user(size, (loff_t __user *)arg); + break; + + case SNAPSHOT_AVAIL_SWAP_SIZE: + size = count_swap_pages(data->swap, 1); + size <<= PAGE_SHIFT; + error = put_user(size, (loff_t __user *)arg); + break; + + case SNAPSHOT_ALLOC_SWAP_PAGE: + if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { + error = -ENODEV; + break; + } + offset = alloc_swapdev_block(data->swap); + if (offset) { + offset <<= PAGE_SHIFT; + error = put_user(offset, (loff_t __user *)arg); + } else { + error = -ENOSPC; + } + break; + + case SNAPSHOT_FREE_SWAP_PAGES: + if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { + error = -ENODEV; + break; + } + free_all_swap_pages(data->swap); + break; + + case SNAPSHOT_S2RAM: + if (!data->frozen) { + error = -EPERM; + break; + } + /* + * Tasks are frozen and the notifiers have been called with + * PM_HIBERNATION_PREPARE + */ + error = suspend_devices_and_enter(PM_SUSPEND_MEM); + data->ready = false; + break; + + case SNAPSHOT_PLATFORM_SUPPORT: + data->platform_support = !!arg; + break; + + case SNAPSHOT_POWER_OFF: + if (data->platform_support) + error = hibernation_platform_enter(); + break; + + case SNAPSHOT_SET_SWAP_AREA: + error = snapshot_set_swap_area(data, (void __user *)arg); + break; + + default: + error = -ENOTTY; + + } + + unlock_device_hotplug(); + mutex_unlock(&system_transition_mutex); + + return error; +} + +#ifdef CONFIG_COMPAT +static long +snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t)); + + switch (cmd) { + case SNAPSHOT_GET_IMAGE_SIZE: + case SNAPSHOT_AVAIL_SWAP_SIZE: + case SNAPSHOT_ALLOC_SWAP_PAGE: + case SNAPSHOT_CREATE_IMAGE: + case SNAPSHOT_SET_SWAP_AREA: + return snapshot_ioctl(file, cmd, + (unsigned long) compat_ptr(arg)); + default: + return snapshot_ioctl(file, cmd, arg); + } +} +#endif /* CONFIG_COMPAT */ + +static const struct file_operations snapshot_fops = { + .open = snapshot_open, + .release = snapshot_release, + .read = snapshot_read, + .write = snapshot_write, + .llseek = no_llseek, + .unlocked_ioctl = snapshot_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = snapshot_compat_ioctl, +#endif +}; + +static struct miscdevice snapshot_device = { + .minor = SNAPSHOT_MINOR, + .name = "snapshot", + .fops = &snapshot_fops, +}; + +static int __init snapshot_device_init(void) +{ + return misc_register(&snapshot_device); +}; + +device_initcall(snapshot_device_init); diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c new file mode 100644 index 0000000000..52571dcad7 --- /dev/null +++ b/kernel/power/wakelock.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * kernel/power/wakelock.c + * + * User space wakeup sources support. + * + * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> + * + * This code is based on the analogous interface allowing user space to + * manipulate wakelocks on Android. 
+ */ + +#include <linux/capability.h> +#include <linux/ctype.h> +#include <linux/device.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/slab.h> +#include <linux/workqueue.h> + +#include "power.h" + +static DEFINE_MUTEX(wakelocks_lock); + +struct wakelock { + char *name; + struct rb_node node; + struct wakeup_source *ws; +#ifdef CONFIG_PM_WAKELOCKS_GC + struct list_head lru; +#endif +}; + +static struct rb_root wakelocks_tree = RB_ROOT; + +ssize_t pm_show_wakelocks(char *buf, bool show_active) +{ + struct rb_node *node; + struct wakelock *wl; + int len = 0; + + mutex_lock(&wakelocks_lock); + + for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { + wl = rb_entry(node, struct wakelock, node); + if (wl->ws->active == show_active) + len += sysfs_emit_at(buf, len, "%s ", wl->name); + } + + len += sysfs_emit_at(buf, len, "\n"); + + mutex_unlock(&wakelocks_lock); + return len; +} + +#if CONFIG_PM_WAKELOCKS_LIMIT > 0 +static unsigned int number_of_wakelocks; + +static inline bool wakelocks_limit_exceeded(void) +{ + return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT; +} + +static inline void increment_wakelocks_number(void) +{ + number_of_wakelocks++; +} + +static inline void decrement_wakelocks_number(void) +{ + number_of_wakelocks--; +} +#else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */ +static inline bool wakelocks_limit_exceeded(void) { return false; } +static inline void increment_wakelocks_number(void) {} +static inline void decrement_wakelocks_number(void) {} +#endif /* CONFIG_PM_WAKELOCKS_LIMIT */ + +#ifdef CONFIG_PM_WAKELOCKS_GC +#define WL_GC_COUNT_MAX 100 +#define WL_GC_TIME_SEC 300 + +static void __wakelocks_gc(struct work_struct *work); +static LIST_HEAD(wakelocks_lru_list); +static DECLARE_WORK(wakelock_work, __wakelocks_gc); +static unsigned int wakelocks_gc_count; + +static inline void wakelocks_lru_add(struct wakelock *wl) +{ + list_add(&wl->lru, &wakelocks_lru_list); +} + +static inline void wakelocks_lru_most_recent(struct wakelock *wl) +{ + list_move(&wl->lru, &wakelocks_lru_list); +} + +static void __wakelocks_gc(struct work_struct *work) +{ + struct wakelock *wl, *aux; + ktime_t now; + + mutex_lock(&wakelocks_lock); + + now = ktime_get(); + list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { + u64 idle_time_ns; + bool active; + + spin_lock_irq(&wl->ws->lock); + idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws->last_time)); + active = wl->ws->active; + spin_unlock_irq(&wl->ws->lock); + + if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC)) + break; + + if (!active) { + wakeup_source_unregister(wl->ws); + rb_erase(&wl->node, &wakelocks_tree); + list_del(&wl->lru); + kfree(wl->name); + kfree(wl); + decrement_wakelocks_number(); + } + } + wakelocks_gc_count = 0; + + mutex_unlock(&wakelocks_lock); +} + +static void wakelocks_gc(void) +{ + if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) + return; + + schedule_work(&wakelock_work); +} +#else /* !CONFIG_PM_WAKELOCKS_GC */ +static inline void wakelocks_lru_add(struct wakelock *wl) {} +static inline void wakelocks_lru_most_recent(struct wakelock *wl) {} +static inline void wakelocks_gc(void) {} +#endif /* !CONFIG_PM_WAKELOCKS_GC */ + +static struct wakelock *wakelock_lookup_add(const char *name, size_t len, + bool add_if_not_found) +{ + struct rb_node **node = &wakelocks_tree.rb_node; + struct rb_node *parent = *node; + struct wakelock *wl; + + while (*node) { + int diff; + + parent = *node; + wl = rb_entry(*node, struct wakelock, 
node); + diff = strncmp(name, wl->name, len); + if (diff == 0) { + if (wl->name[len]) + diff = -1; + else + return wl; + } + if (diff < 0) + node = &(*node)->rb_left; + else + node = &(*node)->rb_right; + } + if (!add_if_not_found) + return ERR_PTR(-EINVAL); + + if (wakelocks_limit_exceeded()) + return ERR_PTR(-ENOSPC); + + /* Not found, we have to add a new one. */ + wl = kzalloc(sizeof(*wl), GFP_KERNEL); + if (!wl) + return ERR_PTR(-ENOMEM); + + wl->name = kstrndup(name, len, GFP_KERNEL); + if (!wl->name) { + kfree(wl); + return ERR_PTR(-ENOMEM); + } + + wl->ws = wakeup_source_register(NULL, wl->name); + if (!wl->ws) { + kfree(wl->name); + kfree(wl); + return ERR_PTR(-ENOMEM); + } + wl->ws->last_time = ktime_get(); + + rb_link_node(&wl->node, parent, node); + rb_insert_color(&wl->node, &wakelocks_tree); + wakelocks_lru_add(wl); + increment_wakelocks_number(); + return wl; +} + +int pm_wake_lock(const char *buf) +{ + const char *str = buf; + struct wakelock *wl; + u64 timeout_ns = 0; + size_t len; + int ret = 0; + + if (!capable(CAP_BLOCK_SUSPEND)) + return -EPERM; + + while (*str && !isspace(*str)) + str++; + + len = str - buf; + if (!len) + return -EINVAL; + + if (*str && *str != '\n') { + /* Find out if there's a valid timeout string appended. */ + ret = kstrtou64(skip_spaces(str), 10, &timeout_ns); + if (ret) + return -EINVAL; + } + + mutex_lock(&wakelocks_lock); + + wl = wakelock_lookup_add(buf, len, true); + if (IS_ERR(wl)) { + ret = PTR_ERR(wl); + goto out; + } + if (timeout_ns) { + u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1; + + do_div(timeout_ms, NSEC_PER_MSEC); + __pm_wakeup_event(wl->ws, timeout_ms); + } else { + __pm_stay_awake(wl->ws); + } + + wakelocks_lru_most_recent(wl); + + out: + mutex_unlock(&wakelocks_lock); + return ret; +} + +int pm_wake_unlock(const char *buf) +{ + struct wakelock *wl; + size_t len; + int ret = 0; + + if (!capable(CAP_BLOCK_SUSPEND)) + return -EPERM; + + len = strlen(buf); + if (!len) + return -EINVAL; + + if (buf[len-1] == '\n') + len--; + + if (!len) + return -EINVAL; + + mutex_lock(&wakelocks_lock); + + wl = wakelock_lookup_add(buf, len, false); + if (IS_ERR(wl)) { + ret = PTR_ERR(wl); + goto out; + } + __pm_relax(wl->ws); + + wakelocks_lru_most_recent(wl); + wakelocks_gc(); + + out: + mutex_unlock(&wakelocks_lock); + return ret; +} |
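
For reference, pm_wake_lock() in the wakelock.c hunk above parses a string of the form "<name> [timeout]", with the optional timeout given in nanoseconds, while pm_wake_unlock() takes only the name. The sketch below is an illustrative user-space exercise of that interface and is not part of the diff itself; it assumes these helpers are exposed through the conventional /sys/power/wake_lock and /sys/power/wake_unlock attributes, which are wired up outside the files shown here.

/*
 * Minimal user-space sketch, assuming /sys/power/wake_lock and
 * /sys/power/wake_unlock feed pm_wake_lock()/pm_wake_unlock().
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *s)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, s, strlen(s));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* "<name> [timeout_ns]": hold "example_lock" for 5 seconds. */
	if (write_str("/sys/power/wake_lock", "example_lock 5000000000"))
		perror("wake_lock");

	/* Releasing early takes just the name. */
	if (write_str("/sys/power/wake_unlock", "example_lock"))
		perror("wake_unlock");

	return 0;
}

If the timeout is supplied, the lock is armed with __pm_wakeup_event() and expires on its own; writing the bare name keeps the wakeup source active until it is explicitly released via wake_unlock.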