diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
commit | ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch) | |
tree | b2d64bc10158fdd5497876388cd68142ca374ed3 /kernel/time | |
parent | Initial commit. (diff) | |
download | linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip |
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'kernel/time')
39 files changed, 23522 insertions, 0 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig new file mode 100644 index 0000000000..bae8f11070 --- /dev/null +++ b/kernel/time/Kconfig @@ -0,0 +1,213 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Timer subsystem related configuration options +# + +# Options selectable by arch Kconfig + +# Watchdog function for clocksources to detect instabilities +config CLOCKSOURCE_WATCHDOG + bool + +# Architecture has extra clocksource data +config ARCH_CLOCKSOURCE_DATA + bool + +# Architecture has extra clocksource init called from registration +config ARCH_CLOCKSOURCE_INIT + bool + +# Clocksources require validation of the clocksource against the last +# cycle update - x86/TSC misfeature +config CLOCKSOURCE_VALIDATE_LAST_CYCLE + bool + +# Timekeeping vsyscall support +config GENERIC_TIME_VSYSCALL + bool + +# The generic clock events infrastructure +config GENERIC_CLOCKEVENTS + def_bool !LEGACY_TIMER_TICK + +# Architecture can handle broadcast in a driver-agnostic way +config ARCH_HAS_TICK_BROADCAST + bool + +# Clockevents broadcasting infrastructure +config GENERIC_CLOCKEVENTS_BROADCAST + bool + depends on GENERIC_CLOCKEVENTS + +# Automatically adjust the min. reprogramming time for +# clock event device +config GENERIC_CLOCKEVENTS_MIN_ADJUST + bool + +# Generic update of CMOS clock +config GENERIC_CMOS_UPDATE + bool + +# Select to handle posix CPU timers from task_work +# and not from the timer interrupt context +config HAVE_POSIX_CPU_TIMERS_TASK_WORK + bool + +config POSIX_CPU_TIMERS_TASK_WORK + bool + default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK + +config LEGACY_TIMER_TICK + bool + help + The legacy timer tick helper is used by platforms that + lack support for the generic clockevent framework. + New platforms should use generic clockevents instead. + +config TIME_KUNIT_TEST + tristate "KUnit test for kernel/time functions" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + Enable this option to test RTC library functions. + + If unsure, say N. + +config CONTEXT_TRACKING + bool + +config CONTEXT_TRACKING_IDLE + bool + select CONTEXT_TRACKING + help + Tracks idle state on behalf of RCU. + +if GENERIC_CLOCKEVENTS +menu "Timers subsystem" + +# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is +# only related to the tick functionality. Oneshot clockevent devices +# are supported independent of this. +config TICK_ONESHOT + bool + +config NO_HZ_COMMON + bool + select TICK_ONESHOT + +choice + prompt "Timer tick handling" + default NO_HZ_IDLE if NO_HZ + +config HZ_PERIODIC + bool "Periodic timer ticks (constant rate, no dynticks)" + help + This option keeps the tick running periodically at a constant + rate, even when the CPU doesn't need it. + +config NO_HZ_IDLE + bool "Idle dynticks system (tickless idle)" + select NO_HZ_COMMON + help + This option enables a tickless idle system: timer interrupts + will only trigger on an as-needed basis when the system is idle. + This is usually interesting for energy saving. + + Most of the time you want to say Y here. + +config NO_HZ_FULL + bool "Full dynticks system (tickless)" + # NO_HZ_COMMON dependency + # We need at least one periodic CPU for timekeeping + depends on SMP + depends on HAVE_CONTEXT_TRACKING_USER + # VIRT_CPU_ACCOUNTING_GEN dependency + depends on HAVE_VIRT_CPU_ACCOUNTING_GEN + select NO_HZ_COMMON + select RCU_NOCB_CPU + select VIRT_CPU_ACCOUNTING_GEN + select IRQ_WORK + select CPU_ISOLATION + help + Adaptively try to shutdown the tick whenever possible, even when + the CPU is running tasks. Typically this requires running a single + task on the CPU. Chances for running tickless are maximized when + the task mostly runs in userspace and has few kernel activity. + + You need to fill up the nohz_full boot parameter with the + desired range of dynticks CPUs to use it. This is implemented at + the expense of some overhead in user <-> kernel transitions: + syscalls, exceptions and interrupts. + + By default, without passing the nohz_full parameter, this behaves just + like NO_HZ_IDLE. + + If you're a distro say Y. + +endchoice + +config CONTEXT_TRACKING_USER + bool + depends on HAVE_CONTEXT_TRACKING_USER + select CONTEXT_TRACKING + help + Track transitions between kernel and user on behalf of RCU and + tickless cputime accounting. The former case relies on context + tracking to enter/exit RCU extended quiescent states. + +config CONTEXT_TRACKING_USER_FORCE + bool "Force user context tracking" + depends on CONTEXT_TRACKING_USER + default y if !NO_HZ_FULL + help + The major pre-requirement for full dynticks to work is to + support the user context tracking subsystem. But there are also + other dependencies to provide in order to make the full + dynticks working. + + This option stands for testing when an arch implements the + user context tracking backend but doesn't yet fulfill all the + requirements to make the full dynticks feature working. + Without the full dynticks, there is no way to test the support + for user context tracking and the subsystems that rely on it: RCU + userspace extended quiescent state and tickless cputime + accounting. This option copes with the absence of the full + dynticks subsystem by forcing the user context tracking on all + CPUs in the system. + + Say Y only if you're working on the development of an + architecture backend for the user context tracking. + + Say N otherwise, this option brings an overhead that you + don't want in production. + +config NO_HZ + bool "Old Idle dynticks config" + help + This is the old config entry that enables dynticks idle. + We keep it around for a little while to enforce backward + compatibility with older config files. + +config HIGH_RES_TIMERS + bool "High Resolution Timer Support" + select TICK_ONESHOT + help + This option enables high resolution timer support. If your + hardware is not capable then this option only increases + the size of the kernel image. + +config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US + int "Clocksource watchdog maximum allowable skew (in μs)" + depends on CLOCKSOURCE_WATCHDOG + range 50 1000 + default 125 + help + Specify the maximum amount of allowable watchdog skew in + microseconds before reporting the clocksource to be unstable. + The default is based on a half-second clocksource watchdog + interval and NTP's maximum frequency drift of 500 parts + per million. If the clocksource is good enough for NTP, + it is good enough for the clocksource watchdog! + +endmenu +endif diff --git a/kernel/time/Makefile b/kernel/time/Makefile new file mode 100644 index 0000000000..7e875e63ff --- /dev/null +++ b/kernel/time/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y += time.o timer.o hrtimer.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o +obj-y += timeconv.o timecounter.o alarmtimer.o + +ifeq ($(CONFIG_POSIX_TIMERS),y) + obj-y += posix-timers.o posix-cpu-timers.o posix-clock.o itimer.o +else + obj-y += posix-stubs.o +endif + +obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o +ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) + obj-y += tick-broadcast.o + obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o +endif +obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o +obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o +obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o +obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o +obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o +obj-$(CONFIG_TEST_UDELAY) += test_udelay.o +obj-$(CONFIG_TIME_NS) += namespace.o +obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) += clocksource-wdtest.o +obj-$(CONFIG_TIME_KUNIT_TEST) += time_test.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c new file mode 100644 index 0000000000..8d9f13d847 --- /dev/null +++ b/kernel/time/alarmtimer.c @@ -0,0 +1,963 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Alarmtimer interface + * + * This interface provides a timer which is similar to hrtimers, + * but triggers a RTC alarm if the box is suspend. + * + * This interface is influenced by the Android RTC Alarm timer + * interface. + * + * Copyright (C) 2010 IBM Corporation + * + * Author: John Stultz <john.stultz@linaro.org> + */ +#include <linux/time.h> +#include <linux/hrtimer.h> +#include <linux/timerqueue.h> +#include <linux/rtc.h> +#include <linux/sched/signal.h> +#include <linux/sched/debug.h> +#include <linux/alarmtimer.h> +#include <linux/mutex.h> +#include <linux/platform_device.h> +#include <linux/posix-timers.h> +#include <linux/workqueue.h> +#include <linux/freezer.h> +#include <linux/compat.h> +#include <linux/module.h> +#include <linux/time_namespace.h> + +#include "posix-timers.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/alarmtimer.h> + +/** + * struct alarm_base - Alarm timer bases + * @lock: Lock for syncrhonized access to the base + * @timerqueue: Timerqueue head managing the list of events + * @get_ktime: Function to read the time correlating to the base + * @get_timespec: Function to read the namespace time correlating to the base + * @base_clockid: clockid for the base + */ +static struct alarm_base { + spinlock_t lock; + struct timerqueue_head timerqueue; + ktime_t (*get_ktime)(void); + void (*get_timespec)(struct timespec64 *tp); + clockid_t base_clockid; +} alarm_bases[ALARM_NUMTYPE]; + +#if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS) +/* freezer information to handle clock_nanosleep triggered wakeups */ +static enum alarmtimer_type freezer_alarmtype; +static ktime_t freezer_expires; +static ktime_t freezer_delta; +static DEFINE_SPINLOCK(freezer_delta_lock); +#endif + +#ifdef CONFIG_RTC_CLASS +/* rtc timer and device for setting alarm wakeups at suspend */ +static struct rtc_timer rtctimer; +static struct rtc_device *rtcdev; +static DEFINE_SPINLOCK(rtcdev_lock); + +/** + * alarmtimer_get_rtcdev - Return selected rtcdevice + * + * This function returns the rtc device to use for wakealarms. + */ +struct rtc_device *alarmtimer_get_rtcdev(void) +{ + unsigned long flags; + struct rtc_device *ret; + + spin_lock_irqsave(&rtcdev_lock, flags); + ret = rtcdev; + spin_unlock_irqrestore(&rtcdev_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); + +static int alarmtimer_rtc_add_device(struct device *dev) +{ + unsigned long flags; + struct rtc_device *rtc = to_rtc_device(dev); + struct platform_device *pdev; + int ret = 0; + + if (rtcdev) + return -EBUSY; + + if (!test_bit(RTC_FEATURE_ALARM, rtc->features)) + return -1; + if (!device_may_wakeup(rtc->dev.parent)) + return -1; + + pdev = platform_device_register_data(dev, "alarmtimer", + PLATFORM_DEVID_AUTO, NULL, 0); + if (!IS_ERR(pdev)) + device_init_wakeup(&pdev->dev, true); + + spin_lock_irqsave(&rtcdev_lock, flags); + if (!IS_ERR(pdev) && !rtcdev) { + if (!try_module_get(rtc->owner)) { + ret = -1; + goto unlock; + } + + rtcdev = rtc; + /* hold a reference so it doesn't go away */ + get_device(dev); + pdev = NULL; + } else { + ret = -1; + } +unlock: + spin_unlock_irqrestore(&rtcdev_lock, flags); + + platform_device_unregister(pdev); + + return ret; +} + +static inline void alarmtimer_rtc_timer_init(void) +{ + rtc_timer_init(&rtctimer, NULL, NULL); +} + +static struct class_interface alarmtimer_rtc_interface = { + .add_dev = &alarmtimer_rtc_add_device, +}; + +static int alarmtimer_rtc_interface_setup(void) +{ + alarmtimer_rtc_interface.class = rtc_class; + return class_interface_register(&alarmtimer_rtc_interface); +} +static void alarmtimer_rtc_interface_remove(void) +{ + class_interface_unregister(&alarmtimer_rtc_interface); +} +#else +static inline int alarmtimer_rtc_interface_setup(void) { return 0; } +static inline void alarmtimer_rtc_interface_remove(void) { } +static inline void alarmtimer_rtc_timer_init(void) { } +#endif + +/** + * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue + * @base: pointer to the base where the timer is being run + * @alarm: pointer to alarm being enqueued. + * + * Adds alarm to a alarm_base timerqueue + * + * Must hold base->lock when calling. + */ +static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) +{ + if (alarm->state & ALARMTIMER_STATE_ENQUEUED) + timerqueue_del(&base->timerqueue, &alarm->node); + + timerqueue_add(&base->timerqueue, &alarm->node); + alarm->state |= ALARMTIMER_STATE_ENQUEUED; +} + +/** + * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue + * @base: pointer to the base where the timer is running + * @alarm: pointer to alarm being removed + * + * Removes alarm to a alarm_base timerqueue + * + * Must hold base->lock when calling. + */ +static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) +{ + if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) + return; + + timerqueue_del(&base->timerqueue, &alarm->node); + alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; +} + + +/** + * alarmtimer_fired - Handles alarm hrtimer being fired. + * @timer: pointer to hrtimer being run + * + * When a alarm timer fires, this runs through the timerqueue to + * see which alarms expired, and runs those. If there are more alarm + * timers queued for the future, we set the hrtimer to fire when + * the next future alarm timer expires. + */ +static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) +{ + struct alarm *alarm = container_of(timer, struct alarm, timer); + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + int ret = HRTIMER_NORESTART; + int restart = ALARMTIMER_NORESTART; + + spin_lock_irqsave(&base->lock, flags); + alarmtimer_dequeue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); + + if (alarm->function) + restart = alarm->function(alarm, base->get_ktime()); + + spin_lock_irqsave(&base->lock, flags); + if (restart != ALARMTIMER_NORESTART) { + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + alarmtimer_enqueue(base, alarm); + ret = HRTIMER_RESTART; + } + spin_unlock_irqrestore(&base->lock, flags); + + trace_alarmtimer_fired(alarm, base->get_ktime()); + return ret; + +} + +ktime_t alarm_expires_remaining(const struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + return ktime_sub(alarm->node.expires, base->get_ktime()); +} +EXPORT_SYMBOL_GPL(alarm_expires_remaining); + +#ifdef CONFIG_RTC_CLASS +/** + * alarmtimer_suspend - Suspend time callback + * @dev: unused + * + * When we are going into suspend, we look through the bases + * to see which is the soonest timer to expire. We then + * set an rtc timer to fire that far into the future, which + * will wake us from suspend. + */ +static int alarmtimer_suspend(struct device *dev) +{ + ktime_t min, now, expires; + int i, ret, type; + struct rtc_device *rtc; + unsigned long flags; + struct rtc_time tm; + + spin_lock_irqsave(&freezer_delta_lock, flags); + min = freezer_delta; + expires = freezer_expires; + type = freezer_alarmtype; + freezer_delta = 0; + spin_unlock_irqrestore(&freezer_delta_lock, flags); + + rtc = alarmtimer_get_rtcdev(); + /* If we have no rtcdev, just return */ + if (!rtc) + return 0; + + /* Find the soonest timer to expire*/ + for (i = 0; i < ALARM_NUMTYPE; i++) { + struct alarm_base *base = &alarm_bases[i]; + struct timerqueue_node *next; + ktime_t delta; + + spin_lock_irqsave(&base->lock, flags); + next = timerqueue_getnext(&base->timerqueue); + spin_unlock_irqrestore(&base->lock, flags); + if (!next) + continue; + delta = ktime_sub(next->expires, base->get_ktime()); + if (!min || (delta < min)) { + expires = next->expires; + min = delta; + type = i; + } + } + if (min == 0) + return 0; + + if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { + pm_wakeup_event(dev, 2 * MSEC_PER_SEC); + return -EBUSY; + } + + trace_alarmtimer_suspend(expires, type); + + /* Setup an rtc timer to fire that far in the future */ + rtc_timer_cancel(rtc, &rtctimer); + rtc_read_time(rtc, &tm); + now = rtc_tm_to_ktime(tm); + now = ktime_add(now, min); + + /* Set alarm, if in the past reject suspend briefly to handle */ + ret = rtc_timer_start(rtc, &rtctimer, now, 0); + if (ret < 0) + pm_wakeup_event(dev, MSEC_PER_SEC); + return ret; +} + +static int alarmtimer_resume(struct device *dev) +{ + struct rtc_device *rtc; + + rtc = alarmtimer_get_rtcdev(); + if (rtc) + rtc_timer_cancel(rtc, &rtctimer); + return 0; +} + +#else +static int alarmtimer_suspend(struct device *dev) +{ + return 0; +} + +static int alarmtimer_resume(struct device *dev) +{ + return 0; +} +#endif + +static void +__alarm_init(struct alarm *alarm, enum alarmtimer_type type, + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) +{ + timerqueue_init(&alarm->node); + alarm->timer.function = alarmtimer_fired; + alarm->function = function; + alarm->type = type; + alarm->state = ALARMTIMER_STATE_INACTIVE; +} + +/** + * alarm_init - Initialize an alarm structure + * @alarm: ptr to alarm to be initialized + * @type: the type of the alarm + * @function: callback that is run when the alarm fires + */ +void alarm_init(struct alarm *alarm, enum alarmtimer_type type, + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) +{ + hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); + __alarm_init(alarm, type, function); +} +EXPORT_SYMBOL_GPL(alarm_init); + +/** + * alarm_start - Sets an absolute alarm to fire + * @alarm: ptr to alarm to set + * @start: time to run the alarm + */ +void alarm_start(struct alarm *alarm, ktime_t start) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + alarm->node.expires = start; + alarmtimer_enqueue(base, alarm); + hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); + spin_unlock_irqrestore(&base->lock, flags); + + trace_alarmtimer_start(alarm, base->get_ktime()); +} +EXPORT_SYMBOL_GPL(alarm_start); + +/** + * alarm_start_relative - Sets a relative alarm to fire + * @alarm: ptr to alarm to set + * @start: time relative to now to run the alarm + */ +void alarm_start_relative(struct alarm *alarm, ktime_t start) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + start = ktime_add_safe(start, base->get_ktime()); + alarm_start(alarm, start); +} +EXPORT_SYMBOL_GPL(alarm_start_relative); + +void alarm_restart(struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + hrtimer_restart(&alarm->timer); + alarmtimer_enqueue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); +} +EXPORT_SYMBOL_GPL(alarm_restart); + +/** + * alarm_try_to_cancel - Tries to cancel an alarm timer + * @alarm: ptr to alarm to be canceled + * + * Returns 1 if the timer was canceled, 0 if it was not running, + * and -1 if the callback was running + */ +int alarm_try_to_cancel(struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + int ret; + + spin_lock_irqsave(&base->lock, flags); + ret = hrtimer_try_to_cancel(&alarm->timer); + if (ret >= 0) + alarmtimer_dequeue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); + + trace_alarmtimer_cancel(alarm, base->get_ktime()); + return ret; +} +EXPORT_SYMBOL_GPL(alarm_try_to_cancel); + + +/** + * alarm_cancel - Spins trying to cancel an alarm timer until it is done + * @alarm: ptr to alarm to be canceled + * + * Returns 1 if the timer was canceled, 0 if it was not active. + */ +int alarm_cancel(struct alarm *alarm) +{ + for (;;) { + int ret = alarm_try_to_cancel(alarm); + if (ret >= 0) + return ret; + hrtimer_cancel_wait_running(&alarm->timer); + } +} +EXPORT_SYMBOL_GPL(alarm_cancel); + + +u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) +{ + u64 overrun = 1; + ktime_t delta; + + delta = ktime_sub(now, alarm->node.expires); + + if (delta < 0) + return 0; + + if (unlikely(delta >= interval)) { + s64 incr = ktime_to_ns(interval); + + overrun = ktime_divns(delta, incr); + + alarm->node.expires = ktime_add_ns(alarm->node.expires, + incr*overrun); + + if (alarm->node.expires > now) + return overrun; + /* + * This (and the ktime_add() below) is the + * correction for exact: + */ + overrun++; + } + + alarm->node.expires = ktime_add_safe(alarm->node.expires, interval); + return overrun; +} +EXPORT_SYMBOL_GPL(alarm_forward); + +static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throttle) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + ktime_t now = base->get_ktime(); + + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && throttle) { + /* + * Same issue as with posix_timer_fn(). Timers which are + * periodic but the signal is ignored can starve the system + * with a very small interval. The real fix which was + * promised in the context of posix_timer_fn() never + * materialized, but someone should really work on it. + * + * To prevent DOS fake @now to be 1 jiffie out which keeps + * the overrun accounting correct but creates an + * inconsistency vs. timer_gettime(2). + */ + ktime_t kj = NSEC_PER_SEC / HZ; + + if (interval < kj) + now = ktime_add(now, kj); + } + + return alarm_forward(alarm, now, interval); +} + +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ + return __alarm_forward_now(alarm, interval, false); +} +EXPORT_SYMBOL_GPL(alarm_forward_now); + +#ifdef CONFIG_POSIX_TIMERS + +static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) +{ + struct alarm_base *base; + unsigned long flags; + ktime_t delta; + + switch(type) { + case ALARM_REALTIME: + base = &alarm_bases[ALARM_REALTIME]; + type = ALARM_REALTIME_FREEZER; + break; + case ALARM_BOOTTIME: + base = &alarm_bases[ALARM_BOOTTIME]; + type = ALARM_BOOTTIME_FREEZER; + break; + default: + WARN_ONCE(1, "Invalid alarm type: %d\n", type); + return; + } + + delta = ktime_sub(absexp, base->get_ktime()); + + spin_lock_irqsave(&freezer_delta_lock, flags); + if (!freezer_delta || (delta < freezer_delta)) { + freezer_delta = delta; + freezer_expires = absexp; + freezer_alarmtype = type; + } + spin_unlock_irqrestore(&freezer_delta_lock, flags); +} + +/** + * clock2alarm - helper that converts from clockid to alarmtypes + * @clockid: clockid. + */ +static enum alarmtimer_type clock2alarm(clockid_t clockid) +{ + if (clockid == CLOCK_REALTIME_ALARM) + return ALARM_REALTIME; + if (clockid == CLOCK_BOOTTIME_ALARM) + return ALARM_BOOTTIME; + return -1; +} + +/** + * alarm_handle_timer - Callback for posix timers + * @alarm: alarm that fired + * @now: time at the timer expiration + * + * Posix timer callback for expired alarm timers. + * + * Return: whether the timer is to be restarted + */ +static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, + ktime_t now) +{ + struct k_itimer *ptr = container_of(alarm, struct k_itimer, + it.alarm.alarmtimer); + enum alarmtimer_restart result = ALARMTIMER_NORESTART; + unsigned long flags; + int si_private = 0; + + spin_lock_irqsave(&ptr->it_lock, flags); + + ptr->it_active = 0; + if (ptr->it_interval) + si_private = ++ptr->it_requeue_pending; + + if (posix_timer_event(ptr, si_private) && ptr->it_interval) { + /* + * Handle ignored signals and rearm the timer. This will go + * away once we handle ignored signals proper. Ensure that + * small intervals cannot starve the system. + */ + ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true); + ++ptr->it_requeue_pending; + ptr->it_active = 1; + result = ALARMTIMER_RESTART; + } + spin_unlock_irqrestore(&ptr->it_lock, flags); + + return result; +} + +/** + * alarm_timer_rearm - Posix timer callback for rearming timer + * @timr: Pointer to the posixtimer data struct + */ +static void alarm_timer_rearm(struct k_itimer *timr) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + + timr->it_overrun += alarm_forward_now(alarm, timr->it_interval); + alarm_start(alarm, alarm->node.expires); +} + +/** + * alarm_timer_forward - Posix timer callback for forwarding timer + * @timr: Pointer to the posixtimer data struct + * @now: Current time to forward the timer against + */ +static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + + return alarm_forward(alarm, timr->it_interval, now); +} + +/** + * alarm_timer_remaining - Posix timer callback to retrieve remaining time + * @timr: Pointer to the posixtimer data struct + * @now: Current time to calculate against + */ +static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + + return ktime_sub(alarm->node.expires, now); +} + +/** + * alarm_timer_try_to_cancel - Posix timer callback to cancel a timer + * @timr: Pointer to the posixtimer data struct + */ +static int alarm_timer_try_to_cancel(struct k_itimer *timr) +{ + return alarm_try_to_cancel(&timr->it.alarm.alarmtimer); +} + +/** + * alarm_timer_wait_running - Posix timer callback to wait for a timer + * @timr: Pointer to the posixtimer data struct + * + * Called from the core code when timer cancel detected that the callback + * is running. @timr is unlocked and rcu read lock is held to prevent it + * from being freed. + */ +static void alarm_timer_wait_running(struct k_itimer *timr) +{ + hrtimer_cancel_wait_running(&timr->it.alarm.alarmtimer.timer); +} + +/** + * alarm_timer_arm - Posix timer callback to arm a timer + * @timr: Pointer to the posixtimer data struct + * @expires: The new expiry time + * @absolute: Expiry value is absolute time + * @sigev_none: Posix timer does not deliver signals + */ +static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, + bool absolute, bool sigev_none) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + struct alarm_base *base = &alarm_bases[alarm->type]; + + if (!absolute) + expires = ktime_add_safe(expires, base->get_ktime()); + if (sigev_none) + alarm->node.expires = expires; + else + alarm_start(&timr->it.alarm.alarmtimer, expires); +} + +/** + * alarm_clock_getres - posix getres interface + * @which_clock: clockid + * @tp: timespec to fill + * + * Returns the granularity of underlying alarm base clock + */ +static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp) +{ + if (!alarmtimer_get_rtcdev()) + return -EINVAL; + + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; + return 0; +} + +/** + * alarm_clock_get_timespec - posix clock_get_timespec interface + * @which_clock: clockid + * @tp: timespec to fill. + * + * Provides the underlying alarm base time in a tasks time namespace. + */ +static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp) +{ + struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; + + if (!alarmtimer_get_rtcdev()) + return -EINVAL; + + base->get_timespec(tp); + + return 0; +} + +/** + * alarm_clock_get_ktime - posix clock_get_ktime interface + * @which_clock: clockid + * + * Provides the underlying alarm base time in the root namespace. + */ +static ktime_t alarm_clock_get_ktime(clockid_t which_clock) +{ + struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; + + if (!alarmtimer_get_rtcdev()) + return -EINVAL; + + return base->get_ktime(); +} + +/** + * alarm_timer_create - posix timer_create interface + * @new_timer: k_itimer pointer to manage + * + * Initializes the k_itimer structure. + */ +static int alarm_timer_create(struct k_itimer *new_timer) +{ + enum alarmtimer_type type; + + if (!alarmtimer_get_rtcdev()) + return -EOPNOTSUPP; + + if (!capable(CAP_WAKE_ALARM)) + return -EPERM; + + type = clock2alarm(new_timer->it_clock); + alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); + return 0; +} + +/** + * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep + * @alarm: ptr to alarm that fired + * @now: time at the timer expiration + * + * Wakes up the task that set the alarmtimer + * + * Return: ALARMTIMER_NORESTART + */ +static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, + ktime_t now) +{ + struct task_struct *task = alarm->data; + + alarm->data = NULL; + if (task) + wake_up_process(task); + return ALARMTIMER_NORESTART; +} + +/** + * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation + * @alarm: ptr to alarmtimer + * @absexp: absolute expiration time + * @type: alarm type (BOOTTIME/REALTIME). + * + * Sets the alarm timer and sleeps until it is fired or interrupted. + */ +static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, + enum alarmtimer_type type) +{ + struct restart_block *restart; + alarm->data = (void *)current; + do { + set_current_state(TASK_INTERRUPTIBLE); + alarm_start(alarm, absexp); + if (likely(alarm->data)) + schedule(); + + alarm_cancel(alarm); + } while (alarm->data && !signal_pending(current)); + + __set_current_state(TASK_RUNNING); + + destroy_hrtimer_on_stack(&alarm->timer); + + if (!alarm->data) + return 0; + + if (freezing(current)) + alarmtimer_freezerset(absexp, type); + restart = ¤t->restart_block; + if (restart->nanosleep.type != TT_NONE) { + struct timespec64 rmt; + ktime_t rem; + + rem = ktime_sub(absexp, alarm_bases[type].get_ktime()); + + if (rem <= 0) + return 0; + rmt = ktime_to_timespec64(rem); + + return nanosleep_copyout(restart, &rmt); + } + return -ERESTART_RESTARTBLOCK; +} + +static void +alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type, + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) +{ + hrtimer_init_on_stack(&alarm->timer, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); + __alarm_init(alarm, type, function); +} + +/** + * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep + * @restart: ptr to restart block + * + * Handles restarted clock_nanosleep calls + */ +static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) +{ + enum alarmtimer_type type = restart->nanosleep.clockid; + ktime_t exp = restart->nanosleep.expires; + struct alarm alarm; + + alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup); + + return alarmtimer_do_nsleep(&alarm, exp, type); +} + +/** + * alarm_timer_nsleep - alarmtimer nanosleep + * @which_clock: clockid + * @flags: determines abstime or relative + * @tsreq: requested sleep time (abs or rel) + * + * Handles clock_nanosleep calls against _ALARM clockids + */ +static int alarm_timer_nsleep(const clockid_t which_clock, int flags, + const struct timespec64 *tsreq) +{ + enum alarmtimer_type type = clock2alarm(which_clock); + struct restart_block *restart = ¤t->restart_block; + struct alarm alarm; + ktime_t exp; + int ret; + + if (!alarmtimer_get_rtcdev()) + return -EOPNOTSUPP; + + if (flags & ~TIMER_ABSTIME) + return -EINVAL; + + if (!capable(CAP_WAKE_ALARM)) + return -EPERM; + + alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup); + + exp = timespec64_to_ktime(*tsreq); + /* Convert (if necessary) to absolute time */ + if (flags != TIMER_ABSTIME) { + ktime_t now = alarm_bases[type].get_ktime(); + + exp = ktime_add_safe(now, exp); + } else { + exp = timens_ktime_to_host(which_clock, exp); + } + + ret = alarmtimer_do_nsleep(&alarm, exp, type); + if (ret != -ERESTART_RESTARTBLOCK) + return ret; + + /* abs timers don't set remaining time or restart */ + if (flags == TIMER_ABSTIME) + return -ERESTARTNOHAND; + + restart->nanosleep.clockid = type; + restart->nanosleep.expires = exp; + set_restart_fn(restart, alarm_timer_nsleep_restart); + return ret; +} + +const struct k_clock alarm_clock = { + .clock_getres = alarm_clock_getres, + .clock_get_ktime = alarm_clock_get_ktime, + .clock_get_timespec = alarm_clock_get_timespec, + .timer_create = alarm_timer_create, + .timer_set = common_timer_set, + .timer_del = common_timer_del, + .timer_get = common_timer_get, + .timer_arm = alarm_timer_arm, + .timer_rearm = alarm_timer_rearm, + .timer_forward = alarm_timer_forward, + .timer_remaining = alarm_timer_remaining, + .timer_try_to_cancel = alarm_timer_try_to_cancel, + .timer_wait_running = alarm_timer_wait_running, + .nsleep = alarm_timer_nsleep, +}; +#endif /* CONFIG_POSIX_TIMERS */ + + +/* Suspend hook structures */ +static const struct dev_pm_ops alarmtimer_pm_ops = { + .suspend = alarmtimer_suspend, + .resume = alarmtimer_resume, +}; + +static struct platform_driver alarmtimer_driver = { + .driver = { + .name = "alarmtimer", + .pm = &alarmtimer_pm_ops, + } +}; + +static void get_boottime_timespec(struct timespec64 *tp) +{ + ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); +} + +/** + * alarmtimer_init - Initialize alarm timer code + * + * This function initializes the alarm bases and registers + * the posix clock ids. + */ +static int __init alarmtimer_init(void) +{ + int error; + int i; + + alarmtimer_rtc_timer_init(); + + /* Initialize alarm bases */ + alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; + alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real; + alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64; + alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; + alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime; + alarm_bases[ALARM_BOOTTIME].get_timespec = get_boottime_timespec; + for (i = 0; i < ALARM_NUMTYPE; i++) { + timerqueue_init_head(&alarm_bases[i].timerqueue); + spin_lock_init(&alarm_bases[i].lock); + } + + error = alarmtimer_rtc_interface_setup(); + if (error) + return error; + + error = platform_driver_register(&alarmtimer_driver); + if (error) + goto out_if; + + return 0; +out_if: + alarmtimer_rtc_interface_remove(); + return error; +} +device_initcall(alarmtimer_init); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c new file mode 100644 index 0000000000..960143b183 --- /dev/null +++ b/kernel/time/clockevents.c @@ -0,0 +1,778 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains functions which manage clock event devices. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + */ + +#include <linux/clockchips.h> +#include <linux/hrtimer.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/device.h> + +#include "tick-internal.h" + +/* The registered clock event devices */ +static LIST_HEAD(clockevent_devices); +static LIST_HEAD(clockevents_released); +/* Protection for the above */ +static DEFINE_RAW_SPINLOCK(clockevents_lock); +/* Protection for unbind operations */ +static DEFINE_MUTEX(clockevents_mutex); + +struct ce_unbind { + struct clock_event_device *ce; + int res; +}; + +static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, + bool ismax) +{ + u64 clc = (u64) latch << evt->shift; + u64 rnd; + + if (WARN_ON(!evt->mult)) + evt->mult = 1; + rnd = (u64) evt->mult - 1; + + /* + * Upper bound sanity check. If the backwards conversion is + * not equal latch, we know that the above shift overflowed. + */ + if ((clc >> evt->shift) != (u64)latch) + clc = ~0ULL; + + /* + * Scaled math oddities: + * + * For mult <= (1 << shift) we can safely add mult - 1 to + * prevent integer rounding loss. So the backwards conversion + * from nsec to device ticks will be correct. + * + * For mult > (1 << shift), i.e. device frequency is > 1GHz we + * need to be careful. Adding mult - 1 will result in a value + * which when converted back to device ticks can be larger + * than latch by up to (mult - 1) >> shift. For the min_delta + * calculation we still want to apply this in order to stay + * above the minimum device ticks limit. For the upper limit + * we would end up with a latch value larger than the upper + * limit of the device, so we omit the add to stay below the + * device upper boundary. + * + * Also omit the add if it would overflow the u64 boundary. + */ + if ((~0ULL - clc > rnd) && + (!ismax || evt->mult <= (1ULL << evt->shift))) + clc += rnd; + + do_div(clc, evt->mult); + + /* Deltas less than 1usec are pointless noise */ + return clc > 1000 ? clc : 1000; +} + +/** + * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds + * @latch: value to convert + * @evt: pointer to clock event device descriptor + * + * Math helper, returns latch value converted to nanoseconds (bound checked) + */ +u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +{ + return cev_delta2ns(latch, evt, false); +} +EXPORT_SYMBOL_GPL(clockevent_delta2ns); + +static int __clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state) +{ + if (dev->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + /* Transition with new state-specific callbacks */ + switch (state) { + case CLOCK_EVT_STATE_DETACHED: + /* The clockevent device is getting replaced. Shut it down. */ + + case CLOCK_EVT_STATE_SHUTDOWN: + if (dev->set_state_shutdown) + return dev->set_state_shutdown(dev); + return 0; + + case CLOCK_EVT_STATE_PERIODIC: + /* Core internal bug */ + if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) + return -ENOSYS; + if (dev->set_state_periodic) + return dev->set_state_periodic(dev); + return 0; + + case CLOCK_EVT_STATE_ONESHOT: + /* Core internal bug */ + if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return -ENOSYS; + if (dev->set_state_oneshot) + return dev->set_state_oneshot(dev); + return 0; + + case CLOCK_EVT_STATE_ONESHOT_STOPPED: + /* Core internal bug */ + if (WARN_ONCE(!clockevent_state_oneshot(dev), + "Current state: %d\n", + clockevent_get_state(dev))) + return -EINVAL; + + if (dev->set_state_oneshot_stopped) + return dev->set_state_oneshot_stopped(dev); + else + return -ENOSYS; + + default: + return -ENOSYS; + } +} + +/** + * clockevents_switch_state - set the operating state of a clock event device + * @dev: device to modify + * @state: new state + * + * Must be called with interrupts disabled ! + */ +void clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state) +{ + if (clockevent_get_state(dev) != state) { + if (__clockevents_switch_state(dev, state)) + return; + + clockevent_set_state(dev, state); + + /* + * A nsec2cyc multiplicator of 0 is invalid and we'd crash + * on it, so fix it up and emit a warning: + */ + if (clockevent_state_oneshot(dev)) { + if (WARN_ON(!dev->mult)) + dev->mult = 1; + } + } +} + +/** + * clockevents_shutdown - shutdown the device and clear next_event + * @dev: device to shutdown + */ +void clockevents_shutdown(struct clock_event_device *dev) +{ + clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); + dev->next_event = KTIME_MAX; +} + +/** + * clockevents_tick_resume - Resume the tick device before using it again + * @dev: device to resume + */ +int clockevents_tick_resume(struct clock_event_device *dev) +{ + int ret = 0; + + if (dev->tick_resume) + ret = dev->tick_resume(dev); + + return ret; +} + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + +/* Limit min_delta to a jiffie */ +#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) + +/** + * clockevents_increase_min_delta - raise minimum delta of a clock event device + * @dev: device to increase the minimum delta + * + * Returns 0 on success, -ETIME when the minimum delta reached the limit. + */ +static int clockevents_increase_min_delta(struct clock_event_device *dev) +{ + /* Nothing to do if we already reached the limit */ + if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { + printk_deferred(KERN_WARNING + "CE: Reprogramming failure. Giving up\n"); + dev->next_event = KTIME_MAX; + return -ETIME; + } + + if (dev->min_delta_ns < 5000) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns += dev->min_delta_ns >> 1; + + if (dev->min_delta_ns > MIN_DELTA_LIMIT) + dev->min_delta_ns = MIN_DELTA_LIMIT; + + printk_deferred(KERN_WARNING + "CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? dev->name : "?", + (unsigned long long) dev->min_delta_ns); + return 0; +} + +/** + * clockevents_program_min_delta - Set clock event device to the minimum delay. + * @dev: device to program + * + * Returns 0 on success, -ETIME when the retry loop failed. + */ +static int clockevents_program_min_delta(struct clock_event_device *dev) +{ + unsigned long long clc; + int64_t delta; + int i; + + for (i = 0;;) { + delta = dev->min_delta_ns; + dev->next_event = ktime_add_ns(ktime_get(), delta); + + if (clockevent_state_shutdown(dev)) + return 0; + + dev->retries++; + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + if (dev->set_next_event((unsigned long) clc, dev) == 0) + return 0; + + if (++i > 2) { + /* + * We tried 3 times to program the device with the + * given min_delta_ns. Try to increase the minimum + * delta, if that fails as well get out of here. + */ + if (clockevents_increase_min_delta(dev)) + return -ETIME; + i = 0; + } + } +} + +#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ + +/** + * clockevents_program_min_delta - Set clock event device to the minimum delay. + * @dev: device to program + * + * Returns 0 on success, -ETIME when the retry loop failed. + */ +static int clockevents_program_min_delta(struct clock_event_device *dev) +{ + unsigned long long clc; + int64_t delta = 0; + int i; + + for (i = 0; i < 10; i++) { + delta += dev->min_delta_ns; + dev->next_event = ktime_add_ns(ktime_get(), delta); + + if (clockevent_state_shutdown(dev)) + return 0; + + dev->retries++; + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + if (dev->set_next_event((unsigned long) clc, dev) == 0) + return 0; + } + return -ETIME; +} + +#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ + +/** + * clockevents_program_event - Reprogram the clock event device. + * @dev: device to program + * @expires: absolute expiry time (monotonic clock) + * @force: program minimum delay if expires can not be set + * + * Returns 0 on success, -ETIME when the event is in the past. + */ +int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, + bool force) +{ + unsigned long long clc; + int64_t delta; + int rc; + + if (WARN_ON_ONCE(expires < 0)) + return -ETIME; + + dev->next_event = expires; + + if (clockevent_state_shutdown(dev)) + return 0; + + /* We must be in ONESHOT state here */ + WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", + clockevent_get_state(dev)); + + /* Shortcut for clockevent devices that can deal with ktime. */ + if (dev->features & CLOCK_EVT_FEAT_KTIME) + return dev->set_next_ktime(expires, dev); + + delta = ktime_to_ns(ktime_sub(expires, ktime_get())); + if (delta <= 0) + return force ? clockevents_program_min_delta(dev) : -ETIME; + + delta = min(delta, (int64_t) dev->max_delta_ns); + delta = max(delta, (int64_t) dev->min_delta_ns); + + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + rc = dev->set_next_event((unsigned long) clc, dev); + + return (rc && force) ? clockevents_program_min_delta(dev) : rc; +} + +/* + * Called after a notify add to make devices available which were + * released from the notifier call. + */ +static void clockevents_notify_released(void) +{ + struct clock_event_device *dev; + + while (!list_empty(&clockevents_released)) { + dev = list_entry(clockevents_released.next, + struct clock_event_device, list); + list_move(&dev->list, &clockevent_devices); + tick_check_new_device(dev); + } +} + +/* + * Try to install a replacement clock event device + */ +static int clockevents_replace(struct clock_event_device *ced) +{ + struct clock_event_device *dev, *newdev = NULL; + + list_for_each_entry(dev, &clockevent_devices, list) { + if (dev == ced || !clockevent_state_detached(dev)) + continue; + + if (!tick_check_replacement(newdev, dev)) + continue; + + if (!try_module_get(dev->owner)) + continue; + + if (newdev) + module_put(newdev->owner); + newdev = dev; + } + if (newdev) { + tick_install_replacement(newdev); + list_del_init(&ced->list); + } + return newdev ? 0 : -EBUSY; +} + +/* + * Called with clockevents_mutex and clockevents_lock held + */ +static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) +{ + /* Fast track. Device is unused */ + if (clockevent_state_detached(ced)) { + list_del_init(&ced->list); + return 0; + } + + return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; +} + +/* + * SMP function call to unbind a device + */ +static void __clockevents_unbind(void *arg) +{ + struct ce_unbind *cu = arg; + int res; + + raw_spin_lock(&clockevents_lock); + res = __clockevents_try_unbind(cu->ce, smp_processor_id()); + if (res == -EAGAIN) + res = clockevents_replace(cu->ce); + cu->res = res; + raw_spin_unlock(&clockevents_lock); +} + +/* + * Issues smp function call to unbind a per cpu device. Called with + * clockevents_mutex held. + */ +static int clockevents_unbind(struct clock_event_device *ced, int cpu) +{ + struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; + + smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); + return cu.res; +} + +/* + * Unbind a clockevents device. + */ +int clockevents_unbind_device(struct clock_event_device *ced, int cpu) +{ + int ret; + + mutex_lock(&clockevents_mutex); + ret = clockevents_unbind(ced, cpu); + mutex_unlock(&clockevents_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(clockevents_unbind_device); + +/** + * clockevents_register_device - register a clock event device + * @dev: device to register + */ +void clockevents_register_device(struct clock_event_device *dev) +{ + unsigned long flags; + + /* Initialize state to DETACHED */ + clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); + + if (!dev->cpumask) { + WARN_ON(num_possible_cpus() > 1); + dev->cpumask = cpumask_of(smp_processor_id()); + } + + if (dev->cpumask == cpu_all_mask) { + WARN(1, "%s cpumask == cpu_all_mask, using cpu_possible_mask instead\n", + dev->name); + dev->cpumask = cpu_possible_mask; + } + + raw_spin_lock_irqsave(&clockevents_lock, flags); + + list_add(&dev->list, &clockevent_devices); + tick_check_new_device(dev); + clockevents_notify_released(); + + raw_spin_unlock_irqrestore(&clockevents_lock, flags); +} +EXPORT_SYMBOL_GPL(clockevents_register_device); + +static void clockevents_config(struct clock_event_device *dev, u32 freq) +{ + u64 sec; + + if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return; + + /* + * Calculate the maximum number of seconds we can sleep. Limit + * to 10 minutes for hardware which can program more than + * 32bit ticks so we still get reasonable conversion values. + */ + sec = dev->max_delta_ticks; + do_div(sec, freq); + if (!sec) + sec = 1; + else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) + sec = 600; + + clockevents_calc_mult_shift(dev, freq, sec); + dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); + dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); +} + +/** + * clockevents_config_and_register - Configure and register a clock event device + * @dev: device to register + * @freq: The clock frequency + * @min_delta: The minimum clock ticks to program in oneshot mode + * @max_delta: The maximum clock ticks to program in oneshot mode + * + * min/max_delta can be 0 for devices which do not support oneshot mode. + */ +void clockevents_config_and_register(struct clock_event_device *dev, + u32 freq, unsigned long min_delta, + unsigned long max_delta) +{ + dev->min_delta_ticks = min_delta; + dev->max_delta_ticks = max_delta; + clockevents_config(dev, freq); + clockevents_register_device(dev); +} +EXPORT_SYMBOL_GPL(clockevents_config_and_register); + +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ + clockevents_config(dev, freq); + + if (clockevent_state_oneshot(dev)) + return clockevents_program_event(dev, dev->next_event, false); + + if (clockevent_state_periodic(dev)) + return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); + + return 0; +} + +/** + * clockevents_update_freq - Update frequency and reprogram a clock event device. + * @dev: device to modify + * @freq: new device frequency + * + * Reconfigure and reprogram a clock event device in oneshot + * mode. Must be called on the cpu for which the device delivers per + * cpu timer events. If called for the broadcast device the core takes + * care of serialization. + * + * Returns 0 on success, -ETIME when the event is in the past. + */ +int clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = tick_broadcast_update_freq(dev, freq); + if (ret == -ENODEV) + ret = __clockevents_update_freq(dev, freq); + local_irq_restore(flags); + return ret; +} + +/* + * Noop handler when we shut down an event device + */ +void clockevents_handle_noop(struct clock_event_device *dev) +{ +} + +/** + * clockevents_exchange_device - release and request clock devices + * @old: device to release (can be NULL) + * @new: device to request (can be NULL) + * + * Called from various tick functions with clockevents_lock held and + * interrupts disabled. + */ +void clockevents_exchange_device(struct clock_event_device *old, + struct clock_event_device *new) +{ + /* + * Caller releases a clock event device. We queue it into the + * released list and do a notify add later. + */ + if (old) { + module_put(old->owner); + clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED); + list_move(&old->list, &clockevents_released); + } + + if (new) { + BUG_ON(!clockevent_state_detached(new)); + clockevents_shutdown(new); + } +} + +/** + * clockevents_suspend - suspend clock devices + */ +void clockevents_suspend(void) +{ + struct clock_event_device *dev; + + list_for_each_entry_reverse(dev, &clockevent_devices, list) + if (dev->suspend && !clockevent_state_detached(dev)) + dev->suspend(dev); +} + +/** + * clockevents_resume - resume clock devices + */ +void clockevents_resume(void) +{ + struct clock_event_device *dev; + + list_for_each_entry(dev, &clockevent_devices, list) + if (dev->resume && !clockevent_state_detached(dev)) + dev->resume(dev); +} + +#ifdef CONFIG_HOTPLUG_CPU + +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +/** + * tick_offline_cpu - Take CPU out of the broadcast mechanism + * @cpu: The outgoing CPU + * + * Called on the outgoing CPU after it took itself offline. + */ +void tick_offline_cpu(unsigned int cpu) +{ + raw_spin_lock(&clockevents_lock); + tick_broadcast_offline(cpu); + raw_spin_unlock(&clockevents_lock); +} +# endif + +/** + * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu + * @cpu: The dead CPU + */ +void tick_cleanup_dead_cpu(int cpu) +{ + struct clock_event_device *dev, *tmp; + unsigned long flags; + + raw_spin_lock_irqsave(&clockevents_lock, flags); + + tick_shutdown(cpu); + /* + * Unregister the clock event devices which were + * released from the users in the notify chain. + */ + list_for_each_entry_safe(dev, tmp, &clockevents_released, list) + list_del(&dev->list); + /* + * Now check whether the CPU has left unused per cpu devices + */ + list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { + if (cpumask_test_cpu(cpu, dev->cpumask) && + cpumask_weight(dev->cpumask) == 1 && + !tick_is_broadcast_device(dev)) { + BUG_ON(!clockevent_state_detached(dev)); + list_del(&dev->list); + } + } + raw_spin_unlock_irqrestore(&clockevents_lock, flags); +} +#endif + +#ifdef CONFIG_SYSFS +static struct bus_type clockevents_subsys = { + .name = "clockevents", + .dev_name = "clockevent", +}; + +static DEFINE_PER_CPU(struct device, tick_percpu_dev); +static struct tick_device *tick_get_tick_dev(struct device *dev); + +static ssize_t current_device_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct tick_device *td; + ssize_t count = 0; + + raw_spin_lock_irq(&clockevents_lock); + td = tick_get_tick_dev(dev); + if (td && td->evtdev) + count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); + raw_spin_unlock_irq(&clockevents_lock); + return count; +} +static DEVICE_ATTR_RO(current_device); + +/* We don't support the abomination of removable broadcast devices */ +static ssize_t unbind_device_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + char name[CS_NAME_LEN]; + ssize_t ret = sysfs_get_uname(buf, name, count); + struct clock_event_device *ce = NULL, *iter; + + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clockevents_mutex); + raw_spin_lock_irq(&clockevents_lock); + list_for_each_entry(iter, &clockevent_devices, list) { + if (!strcmp(iter->name, name)) { + ret = __clockevents_try_unbind(iter, dev->id); + ce = iter; + break; + } + } + raw_spin_unlock_irq(&clockevents_lock); + /* + * We hold clockevents_mutex, so ce can't go away + */ + if (ret == -EAGAIN) + ret = clockevents_unbind(ce, dev->id); + mutex_unlock(&clockevents_mutex); + return ret ? ret : count; +} +static DEVICE_ATTR_WO(unbind_device); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +static struct device tick_bc_dev = { + .init_name = "broadcast", + .id = 0, + .bus = &clockevents_subsys, +}; + +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return dev == &tick_bc_dev ? tick_get_broadcast_device() : + &per_cpu(tick_cpu_device, dev->id); +} + +static __init int tick_broadcast_init_sysfs(void) +{ + int err = device_register(&tick_bc_dev); + + if (!err) + err = device_create_file(&tick_bc_dev, &dev_attr_current_device); + return err; +} +#else +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return &per_cpu(tick_cpu_device, dev->id); +} +static inline int tick_broadcast_init_sysfs(void) { return 0; } +#endif + +static int __init tick_init_sysfs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct device *dev = &per_cpu(tick_percpu_dev, cpu); + int err; + + dev->id = cpu; + dev->bus = &clockevents_subsys; + err = device_register(dev); + if (!err) + err = device_create_file(dev, &dev_attr_current_device); + if (!err) + err = device_create_file(dev, &dev_attr_unbind_device); + if (err) + return err; + } + return tick_broadcast_init_sysfs(); +} + +static int __init clockevents_init_sysfs(void) +{ + int err = subsys_system_register(&clockevents_subsys, NULL); + + if (!err) + err = tick_init_sysfs(); + return err; +} +device_initcall(clockevents_init_sysfs); +#endif /* SYSFS */ diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c new file mode 100644 index 0000000000..df922f49d1 --- /dev/null +++ b/kernel/time/clocksource-wdtest.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Unit test for the clocksource watchdog. + * + * Copyright (C) 2021 Facebook, Inc. + * + * Author: Paul E. McKenney <paulmck@kernel.org> + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/device.h> +#include <linux/clocksource.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ +#include <linux/tick.h> +#include <linux/kthread.h> +#include <linux/delay.h> +#include <linux/prandom.h> +#include <linux/cpu.h> + +#include "tick-internal.h" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>"); + +static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0; +module_param(holdoff, int, 0444); +MODULE_PARM_DESC(holdoff, "Time to wait to start test (s)."); + +/* Watchdog kthread's task_struct pointer for debug purposes. */ +static struct task_struct *wdtest_task; + +static u64 wdtest_jiffies_read(struct clocksource *cs) +{ + return (u64)jiffies; +} + +static struct clocksource clocksource_wdtest_jiffies = { + .name = "wdtest-jiffies", + .rating = 1, /* lowest valid rating*/ + .uncertainty_margin = TICK_NSEC, + .read = wdtest_jiffies_read, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_MUST_VERIFY, + .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ + .shift = JIFFIES_SHIFT, + .max_cycles = 10, +}; + +static int wdtest_ktime_read_ndelays; +static bool wdtest_ktime_read_fuzz; + +static u64 wdtest_ktime_read(struct clocksource *cs) +{ + int wkrn = READ_ONCE(wdtest_ktime_read_ndelays); + static int sign = 1; + u64 ret; + + if (wkrn) { + udelay(cs->uncertainty_margin / 250); + WRITE_ONCE(wdtest_ktime_read_ndelays, wkrn - 1); + } + ret = ktime_get_real_fast_ns(); + if (READ_ONCE(wdtest_ktime_read_fuzz)) { + sign = -sign; + ret = ret + sign * 100 * NSEC_PER_MSEC; + } + return ret; +} + +static void wdtest_ktime_cs_mark_unstable(struct clocksource *cs) +{ + pr_info("--- Marking %s unstable due to clocksource watchdog.\n", cs->name); +} + +#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \ + CLOCK_SOURCE_VALID_FOR_HRES | \ + CLOCK_SOURCE_MUST_VERIFY | \ + CLOCK_SOURCE_VERIFY_PERCPU) + +static struct clocksource clocksource_wdtest_ktime = { + .name = "wdtest-ktime", + .rating = 300, + .read = wdtest_ktime_read, + .mask = CLOCKSOURCE_MASK(64), + .flags = KTIME_FLAGS, + .mark_unstable = wdtest_ktime_cs_mark_unstable, + .list = LIST_HEAD_INIT(clocksource_wdtest_ktime.list), +}; + +/* Reset the clocksource if needed. */ +static void wdtest_ktime_clocksource_reset(void) +{ + if (clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE) { + clocksource_unregister(&clocksource_wdtest_ktime); + clocksource_wdtest_ktime.flags = KTIME_FLAGS; + schedule_timeout_uninterruptible(HZ / 10); + clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); + } +} + +/* Run the specified series of watchdog tests. */ +static int wdtest_func(void *arg) +{ + unsigned long j1, j2; + char *s; + int i; + + schedule_timeout_uninterruptible(holdoff * HZ); + + /* + * Verify that jiffies-like clocksources get the manually + * specified uncertainty margin. + */ + pr_info("--- Verify jiffies-like uncertainty margin.\n"); + __clocksource_register(&clocksource_wdtest_jiffies); + WARN_ON_ONCE(clocksource_wdtest_jiffies.uncertainty_margin != TICK_NSEC); + + j1 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies); + schedule_timeout_uninterruptible(HZ); + j2 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies); + WARN_ON_ONCE(j1 == j2); + + clocksource_unregister(&clocksource_wdtest_jiffies); + + /* + * Verify that tsc-like clocksources are assigned a reasonable + * uncertainty margin. + */ + pr_info("--- Verify tsc-like uncertainty margin.\n"); + clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); + WARN_ON_ONCE(clocksource_wdtest_ktime.uncertainty_margin < NSEC_PER_USEC); + + j1 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); + udelay(1); + j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); + pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1); + WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC)); + + /* Verify tsc-like stability with various numbers of errors injected. */ + for (i = 0; i <= max_cswd_read_retries + 1; i++) { + if (i <= 1 && i < max_cswd_read_retries) + s = ""; + else if (i <= max_cswd_read_retries) + s = ", expect message"; + else + s = ", expect clock skew"; + pr_info("--- Watchdog with %dx error injection, %lu retries%s.\n", i, max_cswd_read_retries, s); + WRITE_ONCE(wdtest_ktime_read_ndelays, i); + schedule_timeout_uninterruptible(2 * HZ); + WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays)); + WARN_ON_ONCE((i <= max_cswd_read_retries) != + !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE)); + wdtest_ktime_clocksource_reset(); + } + + /* Verify tsc-like stability with clock-value-fuzz error injection. */ + pr_info("--- Watchdog clock-value-fuzz error injection, expect clock skew and per-CPU mismatches.\n"); + WRITE_ONCE(wdtest_ktime_read_fuzz, true); + schedule_timeout_uninterruptible(2 * HZ); + WARN_ON_ONCE(!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE)); + clocksource_verify_percpu(&clocksource_wdtest_ktime); + WRITE_ONCE(wdtest_ktime_read_fuzz, false); + + clocksource_unregister(&clocksource_wdtest_ktime); + + pr_info("--- Done with test.\n"); + return 0; +} + +static void wdtest_print_module_parms(void) +{ + pr_alert("--- holdoff=%d\n", holdoff); +} + +/* Cleanup function. */ +static void clocksource_wdtest_cleanup(void) +{ +} + +static int __init clocksource_wdtest_init(void) +{ + int ret = 0; + + wdtest_print_module_parms(); + + /* Create watchdog-test task. */ + wdtest_task = kthread_run(wdtest_func, NULL, "wdtest"); + if (IS_ERR(wdtest_task)) { + ret = PTR_ERR(wdtest_task); + pr_warn("%s: Failed to create wdtest kthread.\n", __func__); + wdtest_task = NULL; + return ret; + } + + return 0; +} + +module_init(clocksource_wdtest_init); +module_exit(clocksource_wdtest_cleanup); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c new file mode 100644 index 0000000000..3052b1f116 --- /dev/null +++ b/kernel/time/clocksource.c @@ -0,0 +1,1530 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * This file contains the functions which manage clocksource drivers. + * + * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/device.h> +#include <linux/clocksource.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ +#include <linux/tick.h> +#include <linux/kthread.h> +#include <linux/prandom.h> +#include <linux/cpu.h> + +#include "tick-internal.h" +#include "timekeeping_internal.h" + +/** + * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks + * @mult: pointer to mult variable + * @shift: pointer to shift variable + * @from: frequency to convert from + * @to: frequency to convert to + * @maxsec: guaranteed runtime conversion range in seconds + * + * The function evaluates the shift/mult pair for the scaled math + * operations of clocksources and clockevents. + * + * @to and @from are frequency values in HZ. For clock sources @to is + * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock + * event @to is the counter frequency and @from is NSEC_PER_SEC. + * + * The @maxsec conversion range argument controls the time frame in + * seconds which must be covered by the runtime conversion with the + * calculated mult and shift factors. This guarantees that no 64bit + * overflow happens when the input value of the conversion is + * multiplied with the calculated mult factor. Larger ranges may + * reduce the conversion accuracy by choosing smaller mult and shift + * factors. + */ +void +clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) +{ + u64 tmp; + u32 sft, sftacc= 32; + + /* + * Calculate the shift factor which is limiting the conversion + * range: + */ + tmp = ((u64)maxsec * from) >> 32; + while (tmp) { + tmp >>=1; + sftacc--; + } + + /* + * Find the conversion shift/mult pair which has the best + * accuracy and fits the maxsec conversion range: + */ + for (sft = 32; sft > 0; sft--) { + tmp = (u64) to << sft; + tmp += from / 2; + do_div(tmp, from); + if ((tmp >> sftacc) == 0) + break; + } + *mult = tmp; + *shift = sft; +} +EXPORT_SYMBOL_GPL(clocks_calc_mult_shift); + +/*[Clocksource internal variables]--------- + * curr_clocksource: + * currently selected clocksource. + * suspend_clocksource: + * used to calculate the suspend time. + * clocksource_list: + * linked list with the registered clocksources + * clocksource_mutex: + * protects manipulations to curr_clocksource and the clocksource_list + * override_name: + * Name of the user-specified clocksource. + */ +static struct clocksource *curr_clocksource; +static struct clocksource *suspend_clocksource; +static LIST_HEAD(clocksource_list); +static DEFINE_MUTEX(clocksource_mutex); +static char override_name[CS_NAME_LEN]; +static int finished_booting; +static u64 suspend_start; + +/* + * Interval: 0.5sec. + */ +#define WATCHDOG_INTERVAL (HZ >> 1) +#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ)) + +/* + * Threshold: 0.0312s, when doubled: 0.0625s. + * Also a default for cs->uncertainty_margin when registering clocks. + */ +#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5) + +/* + * Maximum permissible delay between two readouts of the watchdog + * clocksource surrounding a read of the clocksource being validated. + * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as + * a lower bound for cs->uncertainty_margin values when registering clocks. + * + * The default of 500 parts per million is based on NTP's limits. + * If a clocksource is good enough for NTP, it is good enough for us! + */ +#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US +#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US +#else +#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ) +#endif + +#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) + +#ifdef CONFIG_CLOCKSOURCE_WATCHDOG +static void clocksource_watchdog_work(struct work_struct *work); +static void clocksource_select(void); + +static LIST_HEAD(watchdog_list); +static struct clocksource *watchdog; +static struct timer_list watchdog_timer; +static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); +static DEFINE_SPINLOCK(watchdog_lock); +static int watchdog_running; +static atomic_t watchdog_reset_pending; +static int64_t watchdog_max_interval; + +static inline void clocksource_watchdog_lock(unsigned long *flags) +{ + spin_lock_irqsave(&watchdog_lock, *flags); +} + +static inline void clocksource_watchdog_unlock(unsigned long *flags) +{ + spin_unlock_irqrestore(&watchdog_lock, *flags); +} + +static int clocksource_watchdog_kthread(void *data); +static void __clocksource_change_rating(struct clocksource *cs, int rating); + +static void clocksource_watchdog_work(struct work_struct *work) +{ + /* + * We cannot directly run clocksource_watchdog_kthread() here, because + * clocksource_select() calls timekeeping_notify() which uses + * stop_machine(). One cannot use stop_machine() from a workqueue() due + * lock inversions wrt CPU hotplug. + * + * Also, we only ever run this work once or twice during the lifetime + * of the kernel, so there is no point in creating a more permanent + * kthread for this. + * + * If kthread_run fails the next watchdog scan over the + * watchdog_list will find the unstable clock again. + */ + kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); +} + +static void __clocksource_unstable(struct clocksource *cs) +{ + cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); + cs->flags |= CLOCK_SOURCE_UNSTABLE; + + /* + * If the clocksource is registered clocksource_watchdog_kthread() will + * re-rate and re-select. + */ + if (list_empty(&cs->list)) { + cs->rating = 0; + return; + } + + if (cs->mark_unstable) + cs->mark_unstable(cs); + + /* kick clocksource_watchdog_kthread() */ + if (finished_booting) + schedule_work(&watchdog_work); +} + +/** + * clocksource_mark_unstable - mark clocksource unstable via watchdog + * @cs: clocksource to be marked unstable + * + * This function is called by the x86 TSC code to mark clocksources as unstable; + * it defers demotion and re-selection to a kthread. + */ +void clocksource_mark_unstable(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { + if (!list_empty(&cs->list) && list_empty(&cs->wd_list)) + list_add(&cs->wd_list, &watchdog_list); + __clocksource_unstable(cs); + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +ulong max_cswd_read_retries = 2; +module_param(max_cswd_read_retries, ulong, 0644); +EXPORT_SYMBOL_GPL(max_cswd_read_retries); +static int verify_n_cpus = 8; +module_param(verify_n_cpus, int, 0644); + +enum wd_read_status { + WD_READ_SUCCESS, + WD_READ_UNSTABLE, + WD_READ_SKIP +}; + +static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) +{ + unsigned int nretries; + u64 wd_end, wd_end2, wd_delta; + int64_t wd_delay, wd_seq_delay; + + for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) { + local_irq_disable(); + *wdnow = watchdog->read(watchdog); + *csnow = cs->read(cs); + wd_end = watchdog->read(watchdog); + wd_end2 = watchdog->read(watchdog); + local_irq_enable(); + + wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask); + wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, + watchdog->shift); + if (wd_delay <= WATCHDOG_MAX_SKEW) { + if (nretries > 1 || nretries >= max_cswd_read_retries) { + pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", + smp_processor_id(), watchdog->name, nretries); + } + return WD_READ_SUCCESS; + } + + /* + * Now compute delay in consecutive watchdog read to see if + * there is too much external interferences that cause + * significant delay in reading both clocksource and watchdog. + * + * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2, + * report system busy, reinit the watchdog and skip the current + * watchdog test. + */ + wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask); + wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift); + if (wd_seq_delay > WATCHDOG_MAX_SKEW/2) + goto skip_test; + } + + pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n", + smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name); + return WD_READ_UNSTABLE; + +skip_test: + pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n", + smp_processor_id(), watchdog->name, wd_seq_delay); + pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n", + cs->name, wd_delay); + return WD_READ_SKIP; +} + +static u64 csnow_mid; +static cpumask_t cpus_ahead; +static cpumask_t cpus_behind; +static cpumask_t cpus_chosen; + +static void clocksource_verify_choose_cpus(void) +{ + int cpu, i, n = verify_n_cpus; + + if (n < 0) { + /* Check all of the CPUs. */ + cpumask_copy(&cpus_chosen, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); + return; + } + + /* If no checking desired, or no other CPU to check, leave. */ + cpumask_clear(&cpus_chosen); + if (n == 0 || num_online_cpus() <= 1) + return; + + /* Make sure to select at least one CPU other than the current CPU. */ + cpu = cpumask_first(cpu_online_mask); + if (cpu == smp_processor_id()) + cpu = cpumask_next(cpu, cpu_online_mask); + if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + return; + cpumask_set_cpu(cpu, &cpus_chosen); + + /* Force a sane value for the boot parameter. */ + if (n > nr_cpu_ids) + n = nr_cpu_ids; + + /* + * Randomly select the specified number of CPUs. If the same + * CPU is selected multiple times, that CPU is checked only once, + * and no replacement CPU is selected. This gracefully handles + * situations where verify_n_cpus is greater than the number of + * CPUs that are currently online. + */ + for (i = 1; i < n; i++) { + cpu = get_random_u32_below(nr_cpu_ids); + cpu = cpumask_next(cpu - 1, cpu_online_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(cpu_online_mask); + if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) + cpumask_set_cpu(cpu, &cpus_chosen); + } + + /* Don't verify ourselves. */ + cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); +} + +static void clocksource_verify_one_cpu(void *csin) +{ + struct clocksource *cs = (struct clocksource *)csin; + + csnow_mid = cs->read(cs); +} + +void clocksource_verify_percpu(struct clocksource *cs) +{ + int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX; + u64 csnow_begin, csnow_end; + int cpu, testcpu; + s64 delta; + + if (verify_n_cpus == 0) + return; + cpumask_clear(&cpus_ahead); + cpumask_clear(&cpus_behind); + cpus_read_lock(); + preempt_disable(); + clocksource_verify_choose_cpus(); + if (cpumask_empty(&cpus_chosen)) { + preempt_enable(); + cpus_read_unlock(); + pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); + return; + } + testcpu = smp_processor_id(); + pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + for_each_cpu(cpu, &cpus_chosen) { + if (cpu == testcpu) + continue; + csnow_begin = cs->read(cs); + smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1); + csnow_end = cs->read(cs); + delta = (s64)((csnow_mid - csnow_begin) & cs->mask); + if (delta < 0) + cpumask_set_cpu(cpu, &cpus_behind); + delta = (csnow_end - csnow_mid) & cs->mask; + if (delta < 0) + cpumask_set_cpu(cpu, &cpus_ahead); + delta = clocksource_delta(csnow_end, csnow_begin, cs->mask); + cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); + if (cs_nsec > cs_nsec_max) + cs_nsec_max = cs_nsec; + if (cs_nsec < cs_nsec_min) + cs_nsec_min = cs_nsec; + } + preempt_enable(); + cpus_read_unlock(); + if (!cpumask_empty(&cpus_ahead)) + pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", + cpumask_pr_args(&cpus_ahead), testcpu, cs->name); + if (!cpumask_empty(&cpus_behind)) + pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n", + cpumask_pr_args(&cpus_behind), testcpu, cs->name); + if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind)) + pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", + testcpu, cs_nsec_min, cs_nsec_max, cs->name); +} +EXPORT_SYMBOL_GPL(clocksource_verify_percpu); + +static inline void clocksource_reset_watchdog(void) +{ + struct clocksource *cs; + + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + + +static void clocksource_watchdog(struct timer_list *unused) +{ + u64 csnow, wdnow, cslast, wdlast, delta; + int64_t wd_nsec, cs_nsec, interval; + int next_cpu, reset_pending; + struct clocksource *cs; + enum wd_read_status read_ret; + unsigned long extra_wait = 0; + u32 md; + + spin_lock(&watchdog_lock); + if (!watchdog_running) + goto out; + + reset_pending = atomic_read(&watchdog_reset_pending); + + list_for_each_entry(cs, &watchdog_list, wd_list) { + + /* Clocksource already marked unstable? */ + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + if (finished_booting) + schedule_work(&watchdog_work); + continue; + } + + read_ret = cs_watchdog_read(cs, &csnow, &wdnow); + + if (read_ret == WD_READ_UNSTABLE) { + /* Clock readout unreliable, so give it up. */ + __clocksource_unstable(cs); + continue; + } + + /* + * When WD_READ_SKIP is returned, it means the system is likely + * under very heavy load, where the latency of reading + * watchdog/clocksource is very big, and affect the accuracy of + * watchdog check. So give system some space and suspend the + * watchdog check for 5 minutes. + */ + if (read_ret == WD_READ_SKIP) { + /* + * As the watchdog timer will be suspended, and + * cs->last could keep unchanged for 5 minutes, reset + * the counters. + */ + clocksource_reset_watchdog(); + extra_wait = HZ * 300; + break; + } + + /* Clocksource initialized ? */ + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || + atomic_read(&watchdog_reset_pending)) { + cs->flags |= CLOCK_SOURCE_WATCHDOG; + cs->wd_last = wdnow; + cs->cs_last = csnow; + continue; + } + + delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask); + wd_nsec = clocksource_cyc2ns(delta, watchdog->mult, + watchdog->shift); + + delta = clocksource_delta(csnow, cs->cs_last, cs->mask); + cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); + wdlast = cs->wd_last; /* save these in case we print them */ + cslast = cs->cs_last; + cs->cs_last = csnow; + cs->wd_last = wdnow; + + if (atomic_read(&watchdog_reset_pending)) + continue; + + /* + * The processing of timer softirqs can get delayed (usually + * on account of ksoftirqd not getting to run in a timely + * manner), which causes the watchdog interval to stretch. + * Skew detection may fail for longer watchdog intervals + * on account of fixed margins being used. + * Some clocksources, e.g. acpi_pm, cannot tolerate + * watchdog intervals longer than a few seconds. + */ + interval = max(cs_nsec, wd_nsec); + if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) { + if (system_state > SYSTEM_SCHEDULING && + interval > 2 * watchdog_max_interval) { + watchdog_max_interval = interval; + pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n", + cs_nsec, wd_nsec); + } + watchdog_timer.expires = jiffies; + continue; + } + + /* Check the deviation from the watchdog clocksource. */ + md = cs->uncertainty_margin + watchdog->uncertainty_margin; + if (abs(cs_nsec - wd_nsec) > md) { + s64 cs_wd_msec; + s64 wd_msec; + u32 wd_rem; + + pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", + smp_processor_id(), cs->name); + pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n", + watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask); + pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n", + cs->name, cs_nsec, csnow, cslast, cs->mask); + cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem); + wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem); + pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n", + cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec); + if (curr_clocksource == cs) + pr_warn(" '%s' is current clocksource.\n", cs->name); + else if (curr_clocksource) + pr_warn(" '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name); + else + pr_warn(" No current clocksource.\n"); + __clocksource_unstable(cs); + continue; + } + + if (cs == curr_clocksource && cs->tick_stable) + cs->tick_stable(cs); + + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && + (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && + (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + /* Mark it valid for high-res. */ + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + + /* + * clocksource_done_booting() will sort it if + * finished_booting is not set yet. + */ + if (!finished_booting) + continue; + + /* + * If this is not the current clocksource let + * the watchdog thread reselect it. Due to the + * change to high res this clocksource might + * be preferred now. If it is the current + * clocksource let the tick code know about + * that change. + */ + if (cs != curr_clocksource) { + cs->flags |= CLOCK_SOURCE_RESELECT; + schedule_work(&watchdog_work); + } else { + tick_clock_notify(); + } + } + } + + /* + * We only clear the watchdog_reset_pending, when we did a + * full cycle through all clocksources. + */ + if (reset_pending) + atomic_dec(&watchdog_reset_pending); + + /* + * Cycle through CPUs to check if the CPUs stay synchronized + * to each other. + */ + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + + /* + * Arm timer if not already pending: could race with concurrent + * pair clocksource_stop_watchdog() clocksource_start_watchdog(). + */ + if (!timer_pending(&watchdog_timer)) { + watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait; + add_timer_on(&watchdog_timer, next_cpu); + } +out: + spin_unlock(&watchdog_lock); +} + +static inline void clocksource_start_watchdog(void) +{ + if (watchdog_running || !watchdog || list_empty(&watchdog_list)) + return; + timer_setup(&watchdog_timer, clocksource_watchdog, 0); + watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); + watchdog_running = 1; +} + +static inline void clocksource_stop_watchdog(void) +{ + if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) + return; + del_timer(&watchdog_timer); + watchdog_running = 0; +} + +static void clocksource_resume_watchdog(void) +{ + atomic_inc(&watchdog_reset_pending); +} + +static void clocksource_enqueue_watchdog(struct clocksource *cs) +{ + INIT_LIST_HEAD(&cs->wd_list); + + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a clocksource to be watched. */ + list_add(&cs->wd_list, &watchdog_list); + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; + } else { + /* cs is a watchdog. */ + if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + } +} + +static void clocksource_select_watchdog(bool fallback) +{ + struct clocksource *cs, *old_wd; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + /* save current watchdog */ + old_wd = watchdog; + if (fallback) + watchdog = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* cs is a clocksource to be watched. */ + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + + /* Skip current if we were requested for a fallback. */ + if (fallback && cs == old_wd) + continue; + + /* Pick the best watchdog. */ + if (!watchdog || cs->rating > watchdog->rating) + watchdog = cs; + } + /* If we failed to find a fallback restore the old one. */ + if (!watchdog) + watchdog = old_wd; + + /* If we changed the watchdog we need to reset cycles. */ + if (watchdog != old_wd) + clocksource_reset_watchdog(); + + /* Check if the watchdog timer needs to be started. */ + clocksource_start_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_dequeue_watchdog(struct clocksource *cs) +{ + if (cs != watchdog) { + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a watched clocksource. */ + list_del_init(&cs->wd_list); + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); + } + } +} + +static int __clocksource_watchdog_kthread(void) +{ + struct clocksource *cs, *tmp; + unsigned long flags; + int select = 0; + + /* Do any required per-CPU skew verification. */ + if (curr_clocksource && + curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE && + curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU) + clocksource_verify_percpu(curr_clocksource); + + spin_lock_irqsave(&watchdog_lock, flags); + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + list_del_init(&cs->wd_list); + __clocksource_change_rating(cs, 0); + select = 1; + } + if (cs->flags & CLOCK_SOURCE_RESELECT) { + cs->flags &= ~CLOCK_SOURCE_RESELECT; + select = 1; + } + } + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); + + return select; +} + +static int clocksource_watchdog_kthread(void *data) +{ + mutex_lock(&clocksource_mutex); + if (__clocksource_watchdog_kthread()) + clocksource_select(); + mutex_unlock(&clocksource_mutex); + return 0; +} + +static bool clocksource_is_watchdog(struct clocksource *cs) +{ + return cs == watchdog; +} + +#else /* CONFIG_CLOCKSOURCE_WATCHDOG */ + +static void clocksource_enqueue_watchdog(struct clocksource *cs) +{ + if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; +} + +static void clocksource_select_watchdog(bool fallback) { } +static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } +static inline void clocksource_resume_watchdog(void) { } +static inline int __clocksource_watchdog_kthread(void) { return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } +void clocksource_mark_unstable(struct clocksource *cs) { } + +static inline void clocksource_watchdog_lock(unsigned long *flags) { } +static inline void clocksource_watchdog_unlock(unsigned long *flags) { } + +#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ + +static bool clocksource_is_suspend(struct clocksource *cs) +{ + return cs == suspend_clocksource; +} + +static void __clocksource_suspend_select(struct clocksource *cs) +{ + /* + * Skip the clocksource which will be stopped in suspend state. + */ + if (!(cs->flags & CLOCK_SOURCE_SUSPEND_NONSTOP)) + return; + + /* + * The nonstop clocksource can be selected as the suspend clocksource to + * calculate the suspend time, so it should not supply suspend/resume + * interfaces to suspend the nonstop clocksource when system suspends. + */ + if (cs->suspend || cs->resume) { + pr_warn("Nonstop clocksource %s should not supply suspend/resume interfaces\n", + cs->name); + } + + /* Pick the best rating. */ + if (!suspend_clocksource || cs->rating > suspend_clocksource->rating) + suspend_clocksource = cs; +} + +/** + * clocksource_suspend_select - Select the best clocksource for suspend timing + * @fallback: if select a fallback clocksource + */ +static void clocksource_suspend_select(bool fallback) +{ + struct clocksource *cs, *old_suspend; + + old_suspend = suspend_clocksource; + if (fallback) + suspend_clocksource = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* Skip current if we were requested for a fallback. */ + if (fallback && cs == old_suspend) + continue; + + __clocksource_suspend_select(cs); + } +} + +/** + * clocksource_start_suspend_timing - Start measuring the suspend timing + * @cs: current clocksource from timekeeping + * @start_cycles: current cycles from timekeeping + * + * This function will save the start cycle values of suspend timer to calculate + * the suspend time when resuming system. + * + * This function is called late in the suspend process from timekeeping_suspend(), + * that means processes are frozen, non-boot cpus and interrupts are disabled + * now. It is therefore possible to start the suspend timer without taking the + * clocksource mutex. + */ +void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles) +{ + if (!suspend_clocksource) + return; + + /* + * If current clocksource is the suspend timer, we should use the + * tkr_mono.cycle_last value as suspend_start to avoid same reading + * from suspend timer. + */ + if (clocksource_is_suspend(cs)) { + suspend_start = start_cycles; + return; + } + + if (suspend_clocksource->enable && + suspend_clocksource->enable(suspend_clocksource)) { + pr_warn_once("Failed to enable the non-suspend-able clocksource.\n"); + return; + } + + suspend_start = suspend_clocksource->read(suspend_clocksource); +} + +/** + * clocksource_stop_suspend_timing - Stop measuring the suspend timing + * @cs: current clocksource from timekeeping + * @cycle_now: current cycles from timekeeping + * + * This function will calculate the suspend time from suspend timer. + * + * Returns nanoseconds since suspend started, 0 if no usable suspend clocksource. + * + * This function is called early in the resume process from timekeeping_resume(), + * that means there is only one cpu, no processes are running and the interrupts + * are disabled. It is therefore possible to stop the suspend timer without + * taking the clocksource mutex. + */ +u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now) +{ + u64 now, delta, nsec = 0; + + if (!suspend_clocksource) + return 0; + + /* + * If current clocksource is the suspend timer, we should use the + * tkr_mono.cycle_last value from timekeeping as current cycle to + * avoid same reading from suspend timer. + */ + if (clocksource_is_suspend(cs)) + now = cycle_now; + else + now = suspend_clocksource->read(suspend_clocksource); + + if (now > suspend_start) { + delta = clocksource_delta(now, suspend_start, + suspend_clocksource->mask); + nsec = mul_u64_u32_shr(delta, suspend_clocksource->mult, + suspend_clocksource->shift); + } + + /* + * Disable the suspend timer to save power if current clocksource is + * not the suspend timer. + */ + if (!clocksource_is_suspend(cs) && suspend_clocksource->disable) + suspend_clocksource->disable(suspend_clocksource); + + return nsec; +} + +/** + * clocksource_suspend - suspend the clocksource(s) + */ +void clocksource_suspend(void) +{ + struct clocksource *cs; + + list_for_each_entry_reverse(cs, &clocksource_list, list) + if (cs->suspend) + cs->suspend(cs); +} + +/** + * clocksource_resume - resume the clocksource(s) + */ +void clocksource_resume(void) +{ + struct clocksource *cs; + + list_for_each_entry(cs, &clocksource_list, list) + if (cs->resume) + cs->resume(cs); + + clocksource_resume_watchdog(); +} + +/** + * clocksource_touch_watchdog - Update watchdog + * + * Update the watchdog after exception contexts such as kgdb so as not + * to incorrectly trip the watchdog. This might fail when the kernel + * was stopped in code which holds watchdog_lock. + */ +void clocksource_touch_watchdog(void) +{ + clocksource_resume_watchdog(); +} + +/** + * clocksource_max_adjustment- Returns max adjustment amount + * @cs: Pointer to clocksource + * + */ +static u32 clocksource_max_adjustment(struct clocksource *cs) +{ + u64 ret; + /* + * We won't try to correct for more than 11% adjustments (110,000 ppm), + */ + ret = (u64)cs->mult * 11; + do_div(ret,100); + return (u32)ret; +} + +/** + * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + * @maxadj: maximum adjustment value to mult (~11%) + * @mask: bitmask for two's complement subtraction of non 64 bit counters + * @max_cyc: maximum cycle value before potential overflow (does not include + * any safety margin) + * + * NOTE: This function includes a safety margin of 50%, in other words, we + * return half the number of nanoseconds the hardware counter can technically + * cover. This is done so that we can potentially detect problems caused by + * delayed timers or bad hardware, which might result in time intervals that + * are larger than what the math used can handle without overflows. + */ +u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) +{ + u64 max_nsecs, max_cycles; + + /* + * Calculate the maximum number of cycles that we can pass to the + * cyc2ns() function without overflowing a 64-bit result. + */ + max_cycles = ULLONG_MAX; + do_div(max_cycles, mult+maxadj); + + /* + * The actual maximum number of cycles we can defer the clocksource is + * determined by the minimum of max_cycles and mask. + * Note: Here we subtract the maxadj to make sure we don't sleep for + * too long if there's a large negative adjustment. + */ + max_cycles = min(max_cycles, mask); + max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + + /* return the max_cycles value as well if requested */ + if (max_cyc) + *max_cyc = max_cycles; + + /* Return 50% of the actual maximum, so we can detect bad values */ + max_nsecs >>= 1; + + return max_nsecs; +} + +/** + * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles + * @cs: Pointer to clocksource to be updated + * + */ +static inline void clocksource_update_max_deferment(struct clocksource *cs) +{ + cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, + cs->maxadj, cs->mask, + &cs->max_cycles); +} + +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) +{ + struct clocksource *cs; + + if (!finished_booting || list_empty(&clocksource_list)) + return NULL; + + /* + * We pick the clocksource with the highest rating. If oneshot + * mode is active, we pick the highres valid clocksource with + * the best rating. + */ + list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; + if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + continue; + return cs; + } + return NULL; +} + +static void __clocksource_select(bool skipcur) +{ + bool oneshot = tick_oneshot_mode_active(); + struct clocksource *best, *cs; + + /* Find the best suitable clocksource */ + best = clocksource_find_best(oneshot, skipcur); + if (!best) + return; + + if (!strlen(override_name)) + goto found; + + /* Check for the override clocksource. */ + list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; + if (strcmp(cs->name, override_name) != 0) + continue; + /* + * Check to make sure we don't switch to a non-highres + * capable clocksource if the tick code is in oneshot + * mode (highres or nohz) + */ + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { + /* Override clocksource cannot be used. */ + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n", + cs->name); + override_name[0] = 0; + } else { + /* + * The override cannot be currently verified. + * Deferring to let the watchdog check. + */ + pr_info("Override clocksource %s is not currently HRT compatible - deferring\n", + cs->name); + } + } else + /* Override clocksource can be used. */ + best = cs; + break; + } + +found: + if (curr_clocksource != best && !timekeeping_notify(best)) { + pr_info("Switched to clocksource %s\n", best->name); + curr_clocksource = best; + } +} + +/** + * clocksource_select - Select the best clocksource available + * + * Private function. Must hold clocksource_mutex when called. + * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. + */ +static void clocksource_select(void) +{ + __clocksource_select(false); +} + +static void clocksource_select_fallback(void) +{ + __clocksource_select(true); +} + +/* + * clocksource_done_booting - Called near the end of core bootup + * + * Hack to avoid lots of clocksource churn at boot time. + * We use fs_initcall because we want this to start before + * device_initcall but after subsys_initcall. + */ +static int __init clocksource_done_booting(void) +{ + mutex_lock(&clocksource_mutex); + curr_clocksource = clocksource_default_clock(); + finished_booting = 1; + /* + * Run the watchdog first to eliminate unstable clock sources + */ + __clocksource_watchdog_kthread(); + clocksource_select(); + mutex_unlock(&clocksource_mutex); + return 0; +} +fs_initcall(clocksource_done_booting); + +/* + * Enqueue the clocksource sorted by rating + */ +static void clocksource_enqueue(struct clocksource *cs) +{ + struct list_head *entry = &clocksource_list; + struct clocksource *tmp; + + list_for_each_entry(tmp, &clocksource_list, list) { + /* Keep track of the place, where to insert */ + if (tmp->rating < cs->rating) + break; + entry = &tmp->list; + } + list_add(&cs->list, entry); +} + +/** + * __clocksource_update_freq_scale - Used update clocksource with new freq + * @cs: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * This should only be called from the clocksource->enable() method. + * + * This *SHOULD NOT* be called directly! Please use the + * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper + * functions. + */ +void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + u64 sec; + + /* + * Default clocksources are *special* and self-define their mult/shift. + * But, you're not special, so you should specify a freq value. + */ + if (freq) { + /* + * Calc the maximum number of seconds which we can run before + * wrapping around. For clocksources which have a mask > 32-bit + * we need to limit the max sleep time to have a good + * conversion precision. 10 minutes is still a reasonable + * amount. That results in a shift value of 24 for a + * clocksource with mask >= 40-bit and f >= 4GHz. That maps to + * ~ 0.06ppm granularity for NTP. + */ + sec = cs->mask; + do_div(sec, freq); + do_div(sec, scale); + if (!sec) + sec = 1; + else if (sec > 600 && cs->mask > UINT_MAX) + sec = 600; + + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, + NSEC_PER_SEC / scale, sec * scale); + } + + /* + * If the uncertainty margin is not specified, calculate it. + * If both scale and freq are non-zero, calculate the clock + * period, but bound below at 2*WATCHDOG_MAX_SKEW. However, + * if either of scale or freq is zero, be very conservative and + * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the + * uncertainty margin. Allow stupidly small uncertainty margins + * to be specified by the caller for testing purposes, but warn + * to discourage production use of this capability. + */ + if (scale && freq && !cs->uncertainty_margin) { + cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq); + if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW) + cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW; + } else if (!cs->uncertainty_margin) { + cs->uncertainty_margin = WATCHDOG_THRESHOLD; + } + WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW); + + /* + * Ensure clocksources that have large 'mult' values don't overflow + * when adjusted. + */ + cs->maxadj = clocksource_max_adjustment(cs); + while (freq && ((cs->mult + cs->maxadj < cs->mult) + || (cs->mult - cs->maxadj > cs->mult))) { + cs->mult >>= 1; + cs->shift--; + cs->maxadj = clocksource_max_adjustment(cs); + } + + /* + * Only warn for *special* clocksources that self-define + * their mult/shift values and don't specify a freq. + */ + WARN_ONCE(cs->mult + cs->maxadj < cs->mult, + "timekeeping: Clocksource %s might overflow on 11%% adjustment\n", + cs->name); + + clocksource_update_max_deferment(cs); + + pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", + cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); +} +EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); + +/** + * __clocksource_register_scale - Used to install new clocksources + * @cs: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * Returns -EBUSY if registration fails, zero otherwise. + * + * This *SHOULD NOT* be called directly! Please use the + * clocksource_register_hz() or clocksource_register_khz helper functions. + */ +int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + unsigned long flags; + + clocksource_arch_init(cs); + + if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX)) + cs->id = CSID_GENERIC; + if (cs->vdso_clock_mode < 0 || + cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { + pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n", + cs->name, cs->vdso_clock_mode); + cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE; + } + + /* Initialize mult/shift and max_idle_ns */ + __clocksource_update_freq_scale(cs, scale, freq); + + /* Add clocksource to the clocksource list */ + mutex_lock(&clocksource_mutex); + + clocksource_watchdog_lock(&flags); + clocksource_enqueue(cs); + clocksource_enqueue_watchdog(cs); + clocksource_watchdog_unlock(&flags); + + clocksource_select(); + clocksource_select_watchdog(false); + __clocksource_suspend_select(cs); + mutex_unlock(&clocksource_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(__clocksource_register_scale); + +static void __clocksource_change_rating(struct clocksource *cs, int rating) +{ + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); +} + +/** + * clocksource_change_rating - Change the rating of a registered clocksource + * @cs: clocksource to be changed + * @rating: new rating + */ +void clocksource_change_rating(struct clocksource *cs, int rating) +{ + unsigned long flags; + + mutex_lock(&clocksource_mutex); + clocksource_watchdog_lock(&flags); + __clocksource_change_rating(cs, rating); + clocksource_watchdog_unlock(&flags); + + clocksource_select(); + clocksource_select_watchdog(false); + clocksource_suspend_select(false); + mutex_unlock(&clocksource_mutex); +} +EXPORT_SYMBOL(clocksource_change_rating); + +/* + * Unbind clocksource @cs. Called with clocksource_mutex held + */ +static int clocksource_unbind(struct clocksource *cs) +{ + unsigned long flags; + + if (clocksource_is_watchdog(cs)) { + /* Select and try to install a replacement watchdog. */ + clocksource_select_watchdog(true); + if (clocksource_is_watchdog(cs)) + return -EBUSY; + } + + if (cs == curr_clocksource) { + /* Select and try to install a replacement clock source */ + clocksource_select_fallback(); + if (curr_clocksource == cs) + return -EBUSY; + } + + if (clocksource_is_suspend(cs)) { + /* + * Select and try to install a replacement suspend clocksource. + * If no replacement suspend clocksource, we will just let the + * clocksource go and have no suspend clocksource. + */ + clocksource_suspend_select(true); + } + + clocksource_watchdog_lock(&flags); + clocksource_dequeue_watchdog(cs); + list_del_init(&cs->list); + clocksource_watchdog_unlock(&flags); + + return 0; +} + +/** + * clocksource_unregister - remove a registered clocksource + * @cs: clocksource to be unregistered + */ +int clocksource_unregister(struct clocksource *cs) +{ + int ret = 0; + + mutex_lock(&clocksource_mutex); + if (!list_empty(&cs->list)) + ret = clocksource_unbind(cs); + mutex_unlock(&clocksource_mutex); + return ret; +} +EXPORT_SYMBOL(clocksource_unregister); + +#ifdef CONFIG_SYSFS +/** + * current_clocksource_show - sysfs interface for current clocksource + * @dev: unused + * @attr: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing current clocksource. + */ +static ssize_t current_clocksource_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t count = 0; + + mutex_lock(&clocksource_mutex); + count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); + mutex_unlock(&clocksource_mutex); + + return count; +} + +ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) +{ + size_t ret = cnt; + + /* strings from sysfs write are not 0 terminated! */ + if (!cnt || cnt >= CS_NAME_LEN) + return -EINVAL; + + /* strip of \n: */ + if (buf[cnt-1] == '\n') + cnt--; + if (cnt > 0) + memcpy(dst, buf, cnt); + dst[cnt] = 0; + return ret; +} + +/** + * current_clocksource_store - interface for manually overriding clocksource + * @dev: unused + * @attr: unused + * @buf: name of override clocksource + * @count: length of buffer + * + * Takes input from sysfs interface for manually overriding the default + * clocksource selection. + */ +static ssize_t current_clocksource_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret; + + mutex_lock(&clocksource_mutex); + + ret = sysfs_get_uname(buf, override_name, count); + if (ret >= 0) + clocksource_select(); + + mutex_unlock(&clocksource_mutex); + + return ret; +} +static DEVICE_ATTR_RW(current_clocksource); + +/** + * unbind_clocksource_store - interface for manually unbinding clocksource + * @dev: unused + * @attr: unused + * @buf: unused + * @count: length of buffer + * + * Takes input from sysfs interface for manually unbinding a clocksource. + */ +static ssize_t unbind_clocksource_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct clocksource *cs; + char name[CS_NAME_LEN]; + ssize_t ret; + + ret = sysfs_get_uname(buf, name, count); + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clocksource_mutex); + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, name)) + continue; + ret = clocksource_unbind(cs); + break; + } + mutex_unlock(&clocksource_mutex); + + return ret ? ret : count; +} +static DEVICE_ATTR_WO(unbind_clocksource); + +/** + * available_clocksource_show - sysfs interface for listing clocksource + * @dev: unused + * @attr: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing registered clocksources + */ +static ssize_t available_clocksource_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct clocksource *src; + ssize_t count = 0; + + mutex_lock(&clocksource_mutex); + list_for_each_entry(src, &clocksource_list, list) { + /* + * Don't show non-HRES clocksource if the tick code is + * in one shot mode (highres=on or nohz=on) + */ + if (!tick_oneshot_mode_active() || + (src->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + count += snprintf(buf + count, + max((ssize_t)PAGE_SIZE - count, (ssize_t)0), + "%s ", src->name); + } + mutex_unlock(&clocksource_mutex); + + count += snprintf(buf + count, + max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); + + return count; +} +static DEVICE_ATTR_RO(available_clocksource); + +static struct attribute *clocksource_attrs[] = { + &dev_attr_current_clocksource.attr, + &dev_attr_unbind_clocksource.attr, + &dev_attr_available_clocksource.attr, + NULL +}; +ATTRIBUTE_GROUPS(clocksource); + +static struct bus_type clocksource_subsys = { + .name = "clocksource", + .dev_name = "clocksource", +}; + +static struct device device_clocksource = { + .id = 0, + .bus = &clocksource_subsys, + .groups = clocksource_groups, +}; + +static int __init init_clocksource_sysfs(void) +{ + int error = subsys_system_register(&clocksource_subsys, NULL); + + if (!error) + error = device_register(&device_clocksource); + + return error; +} + +device_initcall(init_clocksource_sysfs); +#endif /* CONFIG_SYSFS */ + +/** + * boot_override_clocksource - boot clock override + * @str: override name + * + * Takes a clocksource= boot argument and uses it + * as the clocksource override name. + */ +static int __init boot_override_clocksource(char* str) +{ + mutex_lock(&clocksource_mutex); + if (str) + strscpy(override_name, str, sizeof(override_name)); + mutex_unlock(&clocksource_mutex); + return 1; +} + +__setup("clocksource=", boot_override_clocksource); + +/** + * boot_override_clock - Compatibility layer for deprecated boot option + * @str: override name + * + * DEPRECATED! Takes a clock= boot argument and uses it + * as the clocksource override name + */ +static int __init boot_override_clock(char* str) +{ + if (!strcmp(str, "pmtmr")) { + pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n"); + return boot_override_clocksource("acpi_pm"); + } + pr_warn("clock= boot option is deprecated - use clocksource=xyz\n"); + return boot_override_clocksource(str); +} + +__setup("clock=", boot_override_clock); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c new file mode 100644 index 0000000000..760793998c --- /dev/null +++ b/kernel/time/hrtimer.c @@ -0,0 +1,2391 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner + * + * High-resolution kernel timers + * + * In contrast to the low-resolution timeout API, aka timer wheel, + * hrtimers provide finer resolution and accuracy depending on system + * configuration and capabilities. + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * Credits: + * Based on the original timer wheel code + * + * Help, testing, suggestions, bugfixes, improvements were + * provided by: + * + * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel + * et. al. + */ + +#include <linux/cpu.h> +#include <linux/export.h> +#include <linux/percpu.h> +#include <linux/hrtimer.h> +#include <linux/notifier.h> +#include <linux/syscalls.h> +#include <linux/interrupt.h> +#include <linux/tick.h> +#include <linux/err.h> +#include <linux/debugobjects.h> +#include <linux/sched/signal.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/rt.h> +#include <linux/sched/deadline.h> +#include <linux/sched/nohz.h> +#include <linux/sched/debug.h> +#include <linux/timer.h> +#include <linux/freezer.h> +#include <linux/compat.h> + +#include <linux/uaccess.h> + +#include <trace/events/timer.h> + +#include "tick-internal.h" + +/* + * Masks for selecting the soft and hard context timers from + * cpu_base->active + */ +#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) +#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) +#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) +#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) + +/* + * The timer bases: + * + * There are more clockids than hrtimer bases. Thus, we index + * into the timer bases by the hrtimer_base_type enum. When trying + * to reach a base using a clockid, hrtimer_clockid_to_base() + * is used to convert from clockid to the proper hrtimer_base_type. + */ +DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = +{ + .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), + .clock_base = + { + { + .index = HRTIMER_BASE_MONOTONIC, + .clockid = CLOCK_MONOTONIC, + .get_time = &ktime_get, + }, + { + .index = HRTIMER_BASE_REALTIME, + .clockid = CLOCK_REALTIME, + .get_time = &ktime_get_real, + }, + { + .index = HRTIMER_BASE_BOOTTIME, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + }, + { + .index = HRTIMER_BASE_TAI, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + }, + { + .index = HRTIMER_BASE_MONOTONIC_SOFT, + .clockid = CLOCK_MONOTONIC, + .get_time = &ktime_get, + }, + { + .index = HRTIMER_BASE_REALTIME_SOFT, + .clockid = CLOCK_REALTIME, + .get_time = &ktime_get_real, + }, + { + .index = HRTIMER_BASE_BOOTTIME_SOFT, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + }, + { + .index = HRTIMER_BASE_TAI_SOFT, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + }, + } +}; + +static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { + /* Make sure we catch unsupported clockids */ + [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, + + [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, + [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, + [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, + [CLOCK_TAI] = HRTIMER_BASE_TAI, +}; + +/* + * Functions and macros which are different for UP/SMP systems are kept in a + * single place + */ +#ifdef CONFIG_SMP + +/* + * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() + * such that hrtimer_callback_running() can unconditionally dereference + * timer->base->cpu_base + */ +static struct hrtimer_cpu_base migration_cpu_base = { + .clock_base = { { + .cpu_base = &migration_cpu_base, + .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, + &migration_cpu_base.lock), + }, }, +}; + +#define migration_base migration_cpu_base.clock_base[0] + +static inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return base == &migration_base; +} + +/* + * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock + * means that all timers which are tied to this base via timer->base are + * locked, and the base itself is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found on the lists/queues. + * + * When the timer's base is locked, and the timer removed from list, it is + * possible to set timer->base = &migration_base and drop the lock: the timer + * remains locked. + */ +static +struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) + __acquires(&timer->base->lock) +{ + struct hrtimer_clock_base *base; + + for (;;) { + base = READ_ONCE(timer->base); + if (likely(base != &migration_base)) { + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); + if (likely(base == timer->base)) + return base; + /* The timer has migrated to another CPU: */ + raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); + } + cpu_relax(); + } +} + +/* + * We do not migrate the timer when it is expiring before the next + * event on the target cpu. When high resolution is enabled, we cannot + * reprogram the target cpu hardware and we would cause it to fire + * late. To keep it simple, we handle the high resolution enabled and + * disabled case similar. + * + * Called with cpu_base->lock of target cpu held. + */ +static int +hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +{ + ktime_t expires; + + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); + return expires < new_base->cpu_base->expires_next; +} + +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + if (static_branch_likely(&timers_migration_enabled) && !pinned) + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); +#endif + return base; +} + +/* + * We switch the timer base to a power-optimized selected CPU target, + * if: + * - NO_HZ_COMMON is enabled + * - timer migration is enabled + * - the timer callback is not running + * - the timer is not the first expiring timer on the new target + * + * If one of the above requirements is not fulfilled we move the timer + * to the current CPU or leave it on the previously assigned CPU if + * the timer callback is currently running. + */ +static inline struct hrtimer_clock_base * +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, + int pinned) +{ + struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; + struct hrtimer_clock_base *new_base; + int basenum = base->index; + + this_cpu_base = this_cpu_ptr(&hrtimer_bases); + new_cpu_base = get_target_base(this_cpu_base, pinned); +again: + new_base = &new_cpu_base->clock_base[basenum]; + + if (base != new_base) { + /* + * We are trying to move timer to new_base. + * However we can't change timer's base while it is running, + * so we keep it on the same CPU. No hassle vs. reprogramming + * the event source in the high resolution case. The softirq + * code will take care of this when the timer function has + * completed. There is no conflict as we hold the lock until + * the timer is enqueued. + */ + if (unlikely(hrtimer_callback_running(timer))) + return base; + + /* See the comment in lock_hrtimer_base() */ + WRITE_ONCE(timer->base, &migration_base); + raw_spin_unlock(&base->cpu_base->lock); + raw_spin_lock(&new_base->cpu_base->lock); + + if (new_cpu_base != this_cpu_base && + hrtimer_check_target(timer, new_base)) { + raw_spin_unlock(&new_base->cpu_base->lock); + raw_spin_lock(&base->cpu_base->lock); + new_cpu_base = this_cpu_base; + WRITE_ONCE(timer->base, base); + goto again; + } + WRITE_ONCE(timer->base, new_base); + } else { + if (new_cpu_base != this_cpu_base && + hrtimer_check_target(timer, new_base)) { + new_cpu_base = this_cpu_base; + goto again; + } + } + return new_base; +} + +#else /* CONFIG_SMP */ + +static inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return false; +} + +static inline struct hrtimer_clock_base * +lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) + __acquires(&timer->base->cpu_base->lock) +{ + struct hrtimer_clock_base *base = timer->base; + + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); + + return base; +} + +# define switch_hrtimer_base(t, b, p) (b) + +#endif /* !CONFIG_SMP */ + +/* + * Functions for the union type storage format of ktime_t which are + * too large for inlining: + */ +#if BITS_PER_LONG < 64 +/* + * Divide a ktime value by a nanosecond value + */ +s64 __ktime_divns(const ktime_t kt, s64 div) +{ + int sft = 0; + s64 dclc; + u64 tmp; + + dclc = ktime_to_ns(kt); + tmp = dclc < 0 ? -dclc : dclc; + + /* Make sure the divisor is less than 2^32: */ + while (div >> 32) { + sft++; + div >>= 1; + } + tmp >>= sft; + do_div(tmp, (u32) div); + return dclc < 0 ? -tmp : tmp; +} +EXPORT_SYMBOL_GPL(__ktime_divns); +#endif /* BITS_PER_LONG >= 64 */ + +/* + * Add two ktime values and do a safety check for overflow: + */ +ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) +{ + ktime_t res = ktime_add_unsafe(lhs, rhs); + + /* + * We use KTIME_SEC_MAX here, the maximum timeout which we can + * return to user space in a timespec: + */ + if (res < 0 || res < lhs || res < rhs) + res = ktime_set(KTIME_SEC_MAX, 0); + + return res; +} + +EXPORT_SYMBOL_GPL(ktime_add_safe); + +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS + +static const struct debug_obj_descr hrtimer_debug_descr; + +static void *hrtimer_debug_hint(void *addr) +{ + return ((struct hrtimer *) addr)->function; +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) +{ + struct hrtimer *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + hrtimer_cancel(timer); + debug_object_init(timer, &hrtimer_debug_descr); + return true; + default: + return false; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown non-static object is activated + */ +static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) +{ + switch (state) { + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + fallthrough; + default: + return false; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) +{ + struct hrtimer *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + hrtimer_cancel(timer); + debug_object_free(timer, &hrtimer_debug_descr); + return true; + default: + return false; + } +} + +static const struct debug_obj_descr hrtimer_debug_descr = { + .name = "hrtimer", + .debug_hint = hrtimer_debug_hint, + .fixup_init = hrtimer_fixup_init, + .fixup_activate = hrtimer_fixup_activate, + .fixup_free = hrtimer_fixup_free, +}; + +static inline void debug_hrtimer_init(struct hrtimer *timer) +{ + debug_object_init(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_activate(struct hrtimer *timer, + enum hrtimer_mode mode) +{ + debug_object_activate(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_deactivate(struct hrtimer *timer) +{ + debug_object_deactivate(timer, &hrtimer_debug_descr); +} + +static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode); + +void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_object_init_on_stack(timer, &hrtimer_debug_descr); + __hrtimer_init(timer, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); + +static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode); + +void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) +{ + debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr); + __hrtimer_init_sleeper(sl, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); + +void destroy_hrtimer_on_stack(struct hrtimer *timer) +{ + debug_object_free(timer, &hrtimer_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); + +#else + +static inline void debug_hrtimer_init(struct hrtimer *timer) { } +static inline void debug_hrtimer_activate(struct hrtimer *timer, + enum hrtimer_mode mode) { } +static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } +#endif + +static inline void +debug_init(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) +{ + debug_hrtimer_init(timer); + trace_hrtimer_init(timer, clockid, mode); +} + +static inline void debug_activate(struct hrtimer *timer, + enum hrtimer_mode mode) +{ + debug_hrtimer_activate(timer, mode); + trace_hrtimer_start(timer, mode); +} + +static inline void debug_deactivate(struct hrtimer *timer) +{ + debug_hrtimer_deactivate(timer); + trace_hrtimer_cancel(timer); +} + +static struct hrtimer_clock_base * +__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) +{ + unsigned int idx; + + if (!*active) + return NULL; + + idx = __ffs(*active); + *active &= ~(1U << idx); + + return &cpu_base->clock_base[idx]; +} + +#define for_each_active_base(base, cpu_base, active) \ + while ((base = __next_base((cpu_base), &(active)))) + +static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, + const struct hrtimer *exclude, + unsigned int active, + ktime_t expires_next) +{ + struct hrtimer_clock_base *base; + ktime_t expires; + + for_each_active_base(base, cpu_base, active) { + struct timerqueue_node *next; + struct hrtimer *timer; + + next = timerqueue_getnext(&base->active); + timer = container_of(next, struct hrtimer, node); + if (timer == exclude) { + /* Get to the next timer in the queue. */ + next = timerqueue_iterate_next(next); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + } + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + if (expires < expires_next) { + expires_next = expires; + + /* Skip cpu_base update if a timer is being excluded. */ + if (exclude) + continue; + + if (timer->is_soft) + cpu_base->softirq_next_timer = timer; + else + cpu_base->next_timer = timer; + } + } + /* + * clock_was_set() might have changed base->offset of any of + * the clock bases so the result might be negative. Fix it up + * to prevent a false positive in clockevents_program_event(). + */ + if (expires_next < 0) + expires_next = 0; + return expires_next; +} + +/* + * Recomputes cpu_base::*next_timer and returns the earliest expires_next + * but does not set cpu_base::*expires_next, that is done by + * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating + * cpu_base::*expires_next right away, reprogramming logic would no longer + * work. + * + * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, + * those timers will get run whenever the softirq gets handled, at the end of + * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. + * + * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. + * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual + * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. + * + * @active_mask must be one of: + * - HRTIMER_ACTIVE_ALL, + * - HRTIMER_ACTIVE_SOFT, or + * - HRTIMER_ACTIVE_HARD. + */ +static ktime_t +__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) +{ + unsigned int active; + struct hrtimer *next_timer = NULL; + ktime_t expires_next = KTIME_MAX; + + if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + cpu_base->softirq_next_timer = NULL; + expires_next = __hrtimer_next_event_base(cpu_base, NULL, + active, KTIME_MAX); + + next_timer = cpu_base->softirq_next_timer; + } + + if (active_mask & HRTIMER_ACTIVE_HARD) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + cpu_base->next_timer = next_timer; + expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, + expires_next); + } + + return expires_next; +} + +static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) +{ + ktime_t expires_next, soft = KTIME_MAX; + + /* + * If the soft interrupt has already been activated, ignore the + * soft bases. They will be handled in the already raised soft + * interrupt. + */ + if (!cpu_base->softirq_activated) { + soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); + /* + * Update the soft expiry time. clock_settime() might have + * affected it. + */ + cpu_base->softirq_expires_next = soft; + } + + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); + /* + * If a softirq timer is expiring first, update cpu_base->next_timer + * and program the hardware with the soft expiry time. + */ + if (expires_next > soft) { + cpu_base->next_timer = cpu_base->softirq_next_timer; + expires_next = soft; + } + + return expires_next; +} + +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; + + ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, + offs_real, offs_boot, offs_tai); + + base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; + base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; + base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; + + return now; +} + +/* + * Is the high resolution mode active ? + */ +static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) +{ + return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? + cpu_base->hres_active : 0; +} + +static inline int hrtimer_hres_active(void) +{ + return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); +} + +static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, + struct hrtimer *next_timer, + ktime_t expires_next) +{ + cpu_base->expires_next = expires_next; + + /* + * If hres is not active, hardware does not have to be + * reprogrammed yet. + * + * If a hang was detected in the last timer interrupt then we + * leave the hang delay active in the hardware. We want the + * system to make progress. That also prevents the following + * scenario: + * T1 expires 50ms from now + * T2 expires 5s from now + * + * T1 is removed, so this code is called and would reprogram + * the hardware to 5s from now. Any hrtimer_start after that + * will not reprogram the hardware due to hang_detected being + * set. So we'd effectively block all timers until the T2 event + * fires. + */ + if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) + return; + + tick_program_event(expires_next, 1); +} + +/* + * Reprogram the event source with checking both queues for the + * next event + * Called with interrupts disabled and base->lock held + */ +static void +hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +{ + ktime_t expires_next; + + expires_next = hrtimer_update_next_event(cpu_base); + + if (skip_equal && expires_next == cpu_base->expires_next) + return; + + __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); +} + +/* High resolution timer related functions */ +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer enabled ? + */ +static bool hrtimer_hres_enabled __read_mostly = true; +unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; +EXPORT_SYMBOL_GPL(hrtimer_resolution); + +/* + * Enable / Disable high resolution mode + */ +static int __init setup_hrtimer_hres(char *str) +{ + return (kstrtobool(str, &hrtimer_hres_enabled) == 0); +} + +__setup("highres=", setup_hrtimer_hres); + +/* + * hrtimer_high_res_enabled - query, if the highres mode is enabled + */ +static inline int hrtimer_is_hres_enabled(void) +{ + return hrtimer_hres_enabled; +} + +static void retrigger_next_event(void *arg); + +/* + * Switch to high resolution mode + */ +static void hrtimer_switch_to_hres(void) +{ + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); + + if (tick_init_highres()) { + pr_warn("Could not switch to high resolution mode on CPU %u\n", + base->cpu); + return; + } + base->hres_active = 1; + hrtimer_resolution = HIGH_RES_NSEC; + + tick_setup_sched_timer(); + /* "Retrigger" the interrupt to get things going */ + retrigger_next_event(NULL); +} + +#else + +static inline int hrtimer_is_hres_enabled(void) { return 0; } +static inline void hrtimer_switch_to_hres(void) { } + +#endif /* CONFIG_HIGH_RES_TIMERS */ +/* + * Retrigger next event is called after clock was set with interrupts + * disabled through an SMP function call or directly from low level + * resume code. + * + * This is only invoked when: + * - CONFIG_HIGH_RES_TIMERS is enabled. + * - CONFIG_NOHZ_COMMON is enabled + * + * For the other cases this function is empty and because the call sites + * are optimized out it vanishes as well, i.e. no need for lots of + * #ifdeffery. + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); + + /* + * When high resolution mode or nohz is active, then the offsets of + * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the + * next tick will take care of that. + * + * If high resolution mode is active then the next expiring timer + * must be reevaluated and the clock event device reprogrammed if + * necessary. + * + * In the NOHZ case the update of the offset and the reevaluation + * of the next expiring timer is enough. The return from the SMP + * function call will take care of the reprogramming in case the + * CPU was in a NOHZ idle sleep. + */ + if (!__hrtimer_hres_active(base) && !tick_nohz_active) + return; + + raw_spin_lock(&base->lock); + hrtimer_update_base(base); + if (__hrtimer_hres_active(base)) + hrtimer_force_reprogram(base, 0); + else + hrtimer_update_next_event(base); + raw_spin_unlock(&base->lock); +} + +/* + * When a timer is enqueued and expires earlier than the already enqueued + * timers, we have to check, whether it expires earlier than the timer for + * which the clock event device was armed. + * + * Called with interrupts disabled and base->cpu_base.lock held + */ +static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + struct hrtimer_clock_base *base = timer->base; + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + + WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); + + /* + * CLOCK_REALTIME timer might be requested with an absolute + * expiry time which is less than base->offset. Set it to 0. + */ + if (expires < 0) + expires = 0; + + if (timer->is_soft) { + /* + * soft hrtimer could be started on a remote CPU. In this + * case softirq_expires_next needs to be updated on the + * remote CPU. The soft hrtimer will not expire before the + * first hard hrtimer on the remote CPU - + * hrtimer_check_target() prevents this case. + */ + struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; + + if (timer_cpu_base->softirq_activated) + return; + + if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) + return; + + timer_cpu_base->softirq_next_timer = timer; + timer_cpu_base->softirq_expires_next = expires; + + if (!ktime_before(expires, timer_cpu_base->expires_next) || + !reprogram) + return; + } + + /* + * If the timer is not on the current cpu, we cannot reprogram + * the other cpus clock event device. + */ + if (base->cpu_base != cpu_base) + return; + + if (expires >= cpu_base->expires_next) + return; + + /* + * If the hrtimer interrupt is running, then it will reevaluate the + * clock bases and reprogram the clock event device. + */ + if (cpu_base->in_hrtirq) + return; + + cpu_base->next_timer = timer; + + __hrtimer_reprogram(cpu_base, timer, expires); +} + +static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, + unsigned int active) +{ + struct hrtimer_clock_base *base; + unsigned int seq; + ktime_t expires; + + /* + * Update the base offsets unconditionally so the following + * checks whether the SMP function call is required works. + * + * The update is safe even when the remote CPU is in the hrtimer + * interrupt or the hrtimer soft interrupt and expiring affected + * bases. Either it will see the update before handling a base or + * it will see it when it finishes the processing and reevaluates + * the next expiring timer. + */ + seq = cpu_base->clock_was_set_seq; + hrtimer_update_base(cpu_base); + + /* + * If the sequence did not change over the update then the + * remote CPU already handled it. + */ + if (seq == cpu_base->clock_was_set_seq) + return false; + + /* + * If the remote CPU is currently handling an hrtimer interrupt, it + * will reevaluate the first expiring timer of all clock bases + * before reprogramming. Nothing to do here. + */ + if (cpu_base->in_hrtirq) + return false; + + /* + * Walk the affected clock bases and check whether the first expiring + * timer in a clock base is moving ahead of the first expiring timer of + * @cpu_base. If so, the IPI must be invoked because per CPU clock + * event devices cannot be remotely reprogrammed. + */ + active &= cpu_base->active_bases; + + for_each_active_base(base, cpu_base, active) { + struct timerqueue_node *next; + + next = timerqueue_getnext(&base->active); + expires = ktime_sub(next->expires, base->offset); + if (expires < cpu_base->expires_next) + return true; + + /* Extra check for softirq clock bases */ + if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT) + continue; + if (cpu_base->softirq_activated) + continue; + if (expires < cpu_base->softirq_expires_next) + return true; + } + return false; +} + +/* + * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and + * CLOCK_BOOTTIME (for late sleep time injection). + * + * This requires to update the offsets for these clocks + * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this + * also requires to eventually reprogram the per CPU clock event devices + * when the change moves an affected timer ahead of the first expiring + * timer on that CPU. Obviously remote per CPU clock event devices cannot + * be reprogrammed. The other reason why an IPI has to be sent is when the + * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets + * in the tick, which obviously might be stopped, so this has to bring out + * the remote CPU which might sleep in idle to get this sorted. + */ +void clock_was_set(unsigned int bases) +{ + struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); + cpumask_var_t mask; + int cpu; + + if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active) + goto out_timerfd; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + on_each_cpu(retrigger_next_event, NULL, 1); + goto out_timerfd; + } + + /* Avoid interrupting CPUs if possible */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + unsigned long flags; + + cpu_base = &per_cpu(hrtimer_bases, cpu); + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (update_needs_ipi(cpu_base, bases)) + cpumask_set_cpu(cpu, mask); + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + } + + preempt_disable(); + smp_call_function_many(mask, retrigger_next_event, NULL, 1); + preempt_enable(); + cpus_read_unlock(); + free_cpumask_var(mask); + +out_timerfd: + timerfd_clock_was_set(); +} + +static void clock_was_set_work(struct work_struct *work) +{ + clock_was_set(CLOCK_SET_WALL); +} + +static DECLARE_WORK(hrtimer_work, clock_was_set_work); + +/* + * Called from timekeeping code to reprogram the hrtimer interrupt device + * on all cpus and to notify timerfd. + */ +void clock_was_set_delayed(void) +{ + schedule_work(&hrtimer_work); +} + +/* + * Called during resume either directly from via timekeeping_resume() + * or in the case of s2idle from tick_unfreeze() to ensure that the + * hrtimers are up to date. + */ +void hrtimers_resume_local(void) +{ + lockdep_assert_irqs_disabled(); + /* Retrigger on the local CPU */ + retrigger_next_event(NULL); +} + +/* + * Counterpart to lock_hrtimer_base above: + */ +static inline +void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) + __releases(&timer->base->cpu_base->lock) +{ + raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); +} + +/** + * hrtimer_forward - forward the timer expiry + * @timer: hrtimer to forward + * @now: forward past this time + * @interval: the interval to forward + * + * Forward the timer expiry so it will expire in the future. + * Returns the number of overruns. + * + * Can be safely called from the callback function of @timer. If + * called from other contexts @timer must neither be enqueued nor + * running the callback and the caller needs to take care of + * serialization. + * + * Note: This only updates the timer expiry value and does not requeue + * the timer. + */ +u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) +{ + u64 orun = 1; + ktime_t delta; + + delta = ktime_sub(now, hrtimer_get_expires(timer)); + + if (delta < 0) + return 0; + + if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) + return 0; + + if (interval < hrtimer_resolution) + interval = hrtimer_resolution; + + if (unlikely(delta >= interval)) { + s64 incr = ktime_to_ns(interval); + + orun = ktime_divns(delta, incr); + hrtimer_add_expires_ns(timer, incr * orun); + if (hrtimer_get_expires_tv64(timer) > now) + return orun; + /* + * This (and the ktime_add() below) is the + * correction for exact: + */ + orun++; + } + hrtimer_add_expires(timer, interval); + + return orun; +} +EXPORT_SYMBOL_GPL(hrtimer_forward); + +/* + * enqueue_hrtimer - internal function to (re)start a timer + * + * The timer is inserted in expiry order. Insertion into the + * red black tree is O(log(n)). Must hold the base lock. + * + * Returns 1 when the new timer is the leftmost timer in the tree. + */ +static int enqueue_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base, + enum hrtimer_mode mode) +{ + debug_activate(timer, mode); + + base->cpu_base->active_bases |= 1 << base->index; + + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); + + return timerqueue_add(&base->active, &timer->node); +} + +/* + * __remove_hrtimer - internal function to remove a timer + * + * Caller must hold the base lock. + * + * High resolution timer mode reprograms the clock event device when the + * timer is the one which expires next. The caller can disable this by setting + * reprogram to zero. This is useful, when the context does a reprogramming + * anyway (e.g. timer interrupt) + */ +static void __remove_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base, + u8 newstate, int reprogram) +{ + struct hrtimer_cpu_base *cpu_base = base->cpu_base; + u8 state = timer->state; + + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, newstate); + if (!(state & HRTIMER_STATE_ENQUEUED)) + return; + + if (!timerqueue_del(&base->active, &timer->node)) + cpu_base->active_bases &= ~(1 << base->index); + + /* + * Note: If reprogram is false we do not update + * cpu_base->next_timer. This happens when we remove the first + * timer on a remote cpu. No harm as we never dereference + * cpu_base->next_timer. So the worst thing what can happen is + * an superfluous call to hrtimer_force_reprogram() on the + * remote cpu later on if the same timer gets enqueued again. + */ + if (reprogram && timer == cpu_base->next_timer) + hrtimer_force_reprogram(cpu_base, 1); +} + +/* + * remove hrtimer, called with base lock held + */ +static inline int +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + bool restart, bool keep_local) +{ + u8 state = timer->state; + + if (state & HRTIMER_STATE_ENQUEUED) { + bool reprogram; + + /* + * Remove the timer and force reprogramming when high + * resolution mode is active and the timer is on the current + * CPU. If we remove a timer on another CPU, reprogramming is + * skipped. The interrupt event on this CPU is fired and + * reprogramming happens in the interrupt handler. This is a + * rare case and less expensive than a smp call. + */ + debug_deactivate(timer); + reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + + /* + * If the timer is not restarted then reprogramming is + * required if the timer is local. If it is local and about + * to be restarted, avoid programming it twice (on removal + * and a moment later when it's requeued). + */ + if (!restart) + state = HRTIMER_STATE_INACTIVE; + else + reprogram &= !keep_local; + + __remove_hrtimer(timer, base, state, reprogram); + return 1; + } + return 0; +} + +static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) +{ +#ifdef CONFIG_TIME_LOW_RES + /* + * CONFIG_TIME_LOW_RES indicates that the system has no way to return + * granular time values. For relative timers we add hrtimer_resolution + * (i.e. one jiffie) to prevent short timeouts. + */ + timer->is_rel = mode & HRTIMER_MODE_REL; + if (timer->is_rel) + tim = ktime_add_safe(tim, hrtimer_resolution); +#endif + return tim; +} + +static void +hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) +{ + ktime_t expires; + + /* + * Find the next SOFT expiration. + */ + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); + + /* + * reprogramming needs to be triggered, even if the next soft + * hrtimer expires at the same time than the next hard + * hrtimer. cpu_base->softirq_expires_next needs to be updated! + */ + if (expires == KTIME_MAX) + return; + + /* + * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() + * cpu_base->*expires_next is only set by hrtimer_reprogram() + */ + hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); +} + +static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode, + struct hrtimer_clock_base *base) +{ + struct hrtimer_clock_base *new_base; + bool force_local, first; + + /* + * If the timer is on the local cpu base and is the first expiring + * timer then this might end up reprogramming the hardware twice + * (on removal and on enqueue). To avoid that by prevent the + * reprogram on removal, keep the timer local to the current CPU + * and enforce reprogramming after it is queued no matter whether + * it is the new first expiring timer again or not. + */ + force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local &= base->cpu_base->next_timer == timer; + + /* + * Remove an active timer from the queue. In case it is not queued + * on the current CPU, make sure that remove_hrtimer() updates the + * remote data correctly. + * + * If it's on the current CPU and the first expiring timer, then + * skip reprogramming, keep the timer local and enforce + * reprogramming later if it was the first expiring timer. This + * avoids programming the underlying clock event twice (once at + * removal and once after enqueue). + */ + remove_hrtimer(timer, base, true, force_local); + + if (mode & HRTIMER_MODE_REL) + tim = ktime_add_safe(tim, base->get_time()); + + tim = hrtimer_update_lowres(timer, tim, mode); + + hrtimer_set_expires_range_ns(timer, tim, delta_ns); + + /* Switch the timer base, if necessary: */ + if (!force_local) { + new_base = switch_hrtimer_base(timer, base, + mode & HRTIMER_MODE_PINNED); + } else { + new_base = base; + } + + first = enqueue_hrtimer(timer, new_base, mode); + if (!force_local) + return first; + + /* + * Timer was forced to stay on the current CPU to avoid + * reprogramming on removal and enqueue. Force reprogram the + * hardware by evaluating the new first expiring timer. + */ + hrtimer_force_reprogram(new_base->cpu_base, 1); + return 0; +} + +/** + * hrtimer_start_range_ns - (re)start an hrtimer + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); + * softirq based mode is considered for debug purpose only! + */ +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode) +{ + struct hrtimer_clock_base *base; + unsigned long flags; + + /* + * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft + * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard + * expiry mode because unmarked timers are moved to softirq expiry. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); + else + WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); + + base = lock_hrtimer_base(timer, &flags); + + if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) + hrtimer_reprogram(timer, true); + + unlock_hrtimer_base(timer, &flags); +} +EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); + +/** + * hrtimer_try_to_cancel - try to deactivate a timer + * @timer: hrtimer to stop + * + * Returns: + * + * * 0 when the timer was not active + * * 1 when the timer was active + * * -1 when the timer is currently executing the callback function and + * cannot be stopped + */ +int hrtimer_try_to_cancel(struct hrtimer *timer) +{ + struct hrtimer_clock_base *base; + unsigned long flags; + int ret = -1; + + /* + * Check lockless first. If the timer is not active (neither + * enqueued nor running the callback, nothing to do here. The + * base lock does not serialize against a concurrent enqueue, + * so we can avoid taking it. + */ + if (!hrtimer_active(timer)) + return 0; + + base = lock_hrtimer_base(timer, &flags); + + if (!hrtimer_callback_running(timer)) + ret = remove_hrtimer(timer, base, false, false); + + unlock_hrtimer_base(timer, &flags); + + return ret; + +} +EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); + +#ifdef CONFIG_PREEMPT_RT +static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) +{ + spin_lock_init(&base->softirq_expiry_lock); +} + +static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) +{ + spin_lock(&base->softirq_expiry_lock); +} + +static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) +{ + spin_unlock(&base->softirq_expiry_lock); +} + +/* + * The counterpart to hrtimer_cancel_wait_running(). + * + * If there is a waiter for cpu_base->expiry_lock, then it was waiting for + * the timer callback to finish. Drop expiry_lock and reacquire it. That + * allows the waiter to acquire the lock and make progress. + */ +static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, + unsigned long flags) +{ + if (atomic_read(&cpu_base->timer_waiters)) { + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + spin_unlock(&cpu_base->softirq_expiry_lock); + spin_lock(&cpu_base->softirq_expiry_lock); + raw_spin_lock_irq(&cpu_base->lock); + } +} + +/* + * This function is called on PREEMPT_RT kernels when the fast path + * deletion of a timer failed because the timer callback function was + * running. + * + * This prevents priority inversion: if the soft irq thread is preempted + * in the middle of a timer callback, then calling del_timer_sync() can + * lead to two issues: + * + * - If the caller is on a remote CPU then it has to spin wait for the timer + * handler to complete. This can result in unbound priority inversion. + * + * - If the caller originates from the task which preempted the timer + * handler on the same CPU, then spin waiting for the timer handler to + * complete is never going to end. + */ +void hrtimer_cancel_wait_running(const struct hrtimer *timer) +{ + /* Lockless read. Prevent the compiler from reloading it below */ + struct hrtimer_clock_base *base = READ_ONCE(timer->base); + + /* + * Just relax if the timer expires in hard interrupt context or if + * it is currently on the migration base. + */ + if (!timer->is_soft || is_migration_base(base)) { + cpu_relax(); + return; + } + + /* + * Mark the base as contended and grab the expiry lock, which is + * held by the softirq across the timer callback. Drop the lock + * immediately so the softirq can expire the next timer. In theory + * the timer could already be running again, but that's more than + * unlikely and just causes another wait loop. + */ + atomic_inc(&base->cpu_base->timer_waiters); + spin_lock_bh(&base->cpu_base->softirq_expiry_lock); + atomic_dec(&base->cpu_base->timer_waiters); + spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); +} +#else +static inline void +hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, + unsigned long flags) { } +#endif + +/** + * hrtimer_cancel - cancel a timer and wait for the handler to finish. + * @timer: the timer to be cancelled + * + * Returns: + * 0 when the timer was not active + * 1 when the timer was active + */ +int hrtimer_cancel(struct hrtimer *timer) +{ + int ret; + + do { + ret = hrtimer_try_to_cancel(timer); + + if (ret < 0) + hrtimer_cancel_wait_running(timer); + } while (ret < 0); + return ret; +} +EXPORT_SYMBOL_GPL(hrtimer_cancel); + +/** + * __hrtimer_get_remaining - get remaining time for the timer + * @timer: the timer to read + * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y + */ +ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) +{ + unsigned long flags; + ktime_t rem; + + lock_hrtimer_base(timer, &flags); + if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) + rem = hrtimer_expires_remaining_adjusted(timer); + else + rem = hrtimer_expires_remaining(timer); + unlock_hrtimer_base(timer, &flags); + + return rem; +} +EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); + +#ifdef CONFIG_NO_HZ_COMMON +/** + * hrtimer_get_next_event - get the time until next expiry event + * + * Returns the next expiry time or KTIME_MAX if no timer is pending. + */ +u64 hrtimer_get_next_event(void) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + u64 expires = KTIME_MAX; + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (!__hrtimer_hres_active(cpu_base)) + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + return expires; +} + +/** + * hrtimer_next_event_without - time until next expiry event w/o one timer + * @exclude: timer to exclude + * + * Returns the next expiry time over all timers except for the @exclude one or + * KTIME_MAX if none of them is pending. + */ +u64 hrtimer_next_event_without(const struct hrtimer *exclude) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + u64 expires = KTIME_MAX; + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (__hrtimer_hres_active(cpu_base)) { + unsigned int active; + + if (!cpu_base->softirq_activated) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + expires = __hrtimer_next_event_base(cpu_base, exclude, + active, KTIME_MAX); + } + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + expires = __hrtimer_next_event_base(cpu_base, exclude, active, + expires); + } + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + return expires; +} +#endif + +static inline int hrtimer_clockid_to_base(clockid_t clock_id) +{ + if (likely(clock_id < MAX_CLOCKS)) { + int base = hrtimer_clock_to_base_table[clock_id]; + + if (likely(base != HRTIMER_MAX_CLOCK_BASES)) + return base; + } + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return HRTIMER_BASE_MONOTONIC; +} + +static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + bool softtimer = !!(mode & HRTIMER_MODE_SOFT); + struct hrtimer_cpu_base *cpu_base; + int base; + + /* + * On PREEMPT_RT enabled kernels hrtimers which are not explicitly + * marked for hard interrupt expiry mode are moved into soft + * interrupt context for latency reasons and because the callbacks + * can invoke functions which might sleep on RT, e.g. spin_lock(). + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) + softtimer = true; + + memset(timer, 0, sizeof(struct hrtimer)); + + cpu_base = raw_cpu_ptr(&hrtimer_bases); + + /* + * POSIX magic: Relative CLOCK_REALTIME timers are not affected by + * clock modifications, so they needs to become CLOCK_MONOTONIC to + * ensure POSIX compliance. + */ + if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) + clock_id = CLOCK_MONOTONIC; + + base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; + base += hrtimer_clockid_to_base(clock_id); + timer->is_soft = softtimer; + timer->is_hard = !!(mode & HRTIMER_MODE_HARD); + timer->base = &cpu_base->clock_base[base]; + timerqueue_init(&timer->node); +} + +/** + * hrtimer_init - initialize a timer to the given clock + * @timer: the timer to be initialized + * @clock_id: the clock to be used + * @mode: The modes which are relevant for initialization: + * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, + * HRTIMER_MODE_REL_SOFT + * + * The PINNED variants of the above can be handed in, + * but the PINNED bit is ignored as pinning happens + * when the hrtimer is started + */ +void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_init(timer, clock_id, mode); + __hrtimer_init(timer, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init); + +/* + * A timer is active, when it is enqueued into the rbtree or the + * callback function is running or it's in the state of being migrated + * to another cpu. + * + * It is important for this function to not return a false negative. + */ +bool hrtimer_active(const struct hrtimer *timer) +{ + struct hrtimer_clock_base *base; + unsigned int seq; + + do { + base = READ_ONCE(timer->base); + seq = raw_read_seqcount_begin(&base->seq); + + if (timer->state != HRTIMER_STATE_INACTIVE || + base->running == timer) + return true; + + } while (read_seqcount_retry(&base->seq, seq) || + base != READ_ONCE(timer->base)); + + return false; +} +EXPORT_SYMBOL_GPL(hrtimer_active); + +/* + * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 + * distinct sections: + * + * - queued: the timer is queued + * - callback: the timer is being ran + * - post: the timer is inactive or (re)queued + * + * On the read side we ensure we observe timer->state and cpu_base->running + * from the same section, if anything changed while we looked at it, we retry. + * This includes timer->base changing because sequence numbers alone are + * insufficient for that. + * + * The sequence numbers are required because otherwise we could still observe + * a false negative if the read side got smeared over multiple consecutive + * __run_hrtimer() invocations. + */ + +static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, + struct hrtimer_clock_base *base, + struct hrtimer *timer, ktime_t *now, + unsigned long flags) __must_hold(&cpu_base->lock) +{ + enum hrtimer_restart (*fn)(struct hrtimer *); + bool expires_in_hardirq; + int restart; + + lockdep_assert_held(&cpu_base->lock); + + debug_deactivate(timer); + base->running = timer; + + /* + * Separate the ->running assignment from the ->state assignment. + * + * As with a regular write barrier, this ensures the read side in + * hrtimer_active() cannot observe base->running == NULL && + * timer->state == INACTIVE. + */ + raw_write_seqcount_barrier(&base->seq); + + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); + fn = timer->function; + + /* + * Clear the 'is relative' flag for the TIME_LOW_RES case. If the + * timer is restarted with a period then it becomes an absolute + * timer. If its not restarted it does not matter. + */ + if (IS_ENABLED(CONFIG_TIME_LOW_RES)) + timer->is_rel = false; + + /* + * The timer is marked as running in the CPU base, so it is + * protected against migration to a different CPU even if the lock + * is dropped. + */ + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + trace_hrtimer_expire_entry(timer, now); + expires_in_hardirq = lockdep_hrtimer_enter(timer); + + restart = fn(timer); + + lockdep_hrtimer_exit(expires_in_hardirq); + trace_hrtimer_expire_exit(timer); + raw_spin_lock_irq(&cpu_base->lock); + + /* + * Note: We clear the running state after enqueue_hrtimer and + * we do not reprogram the event hardware. Happens either in + * hrtimer_start_range_ns() or in hrtimer_interrupt() + * + * Note: Because we dropped the cpu_base->lock above, + * hrtimer_start_range_ns() can have popped in and enqueued the timer + * for us already. + */ + if (restart != HRTIMER_NORESTART && + !(timer->state & HRTIMER_STATE_ENQUEUED)) + enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); + + /* + * Separate the ->running assignment from the ->state assignment. + * + * As with a regular write barrier, this ensures the read side in + * hrtimer_active() cannot observe base->running.timer == NULL && + * timer->state == INACTIVE. + */ + raw_write_seqcount_barrier(&base->seq); + + WARN_ON_ONCE(base->running != timer); + base->running = NULL; +} + +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, + unsigned long flags, unsigned int active_mask) +{ + struct hrtimer_clock_base *base; + unsigned int active = cpu_base->active_bases & active_mask; + + for_each_active_base(base, cpu_base, active) { + struct timerqueue_node *node; + ktime_t basenow; + + basenow = ktime_add(now, base->offset); + + while ((node = timerqueue_getnext(&base->active))) { + struct hrtimer *timer; + + timer = container_of(node, struct hrtimer, node); + + /* + * The immediate goal for using the softexpires is + * minimizing wakeups, not running timers at the + * earliest interrupt after their soft expiration. + * This allows us to avoid using a Priority Search + * Tree, which can answer a stabbing query for + * overlapping intervals and instead use the simple + * BST we already have. + * We don't add extra wakeups by delaying timers that + * are right-of a not yet expired timer, because that + * timer will have to trigger a wakeup anyway. + */ + if (basenow < hrtimer_get_softexpires_tv64(timer)) + break; + + __run_hrtimer(cpu_base, base, timer, &basenow, flags); + if (active_mask == HRTIMER_ACTIVE_SOFT) + hrtimer_sync_wait_running(cpu_base, flags); + } + } +} + +static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + unsigned long flags; + ktime_t now; + + hrtimer_cpu_base_lock_expiry(cpu_base); + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + now = hrtimer_update_base(cpu_base); + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); + + cpu_base->softirq_activated = 0; + hrtimer_update_softirq_timer(cpu_base, true); + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + hrtimer_cpu_base_unlock_expiry(cpu_base); +} + +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct clock_event_device *dev) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t expires_next, now, entry_time, delta; + unsigned long flags; + int retries = 0; + + BUG_ON(!cpu_base->hres_active); + cpu_base->nr_events++; + dev->next_event = KTIME_MAX; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + entry_time = now = hrtimer_update_base(cpu_base); +retry: + cpu_base->in_hrtirq = 1; + /* + * We set expires_next to KTIME_MAX here with cpu_base->lock + * held to prevent that a timer is enqueued in our queue via + * the migration code. This does not affect enqueueing of + * timers which run their callback and need to be requeued on + * this CPU. + */ + cpu_base->expires_next = KTIME_MAX; + + if (!ktime_before(now, cpu_base->softirq_expires_next)) { + cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->softirq_activated = 1; + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); + + /* Reevaluate the clock bases for the [soft] next expiry */ + expires_next = hrtimer_update_next_event(cpu_base); + /* + * Store the new expiry value so the migration code can verify + * against it. + */ + cpu_base->expires_next = expires_next; + cpu_base->in_hrtirq = 0; + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + /* Reprogramming necessary ? */ + if (!tick_program_event(expires_next, 0)) { + cpu_base->hang_detected = 0; + return; + } + + /* + * The next timer was already expired due to: + * - tracing + * - long lasting callbacks + * - being scheduled away when running in a VM + * + * We need to prevent that we loop forever in the hrtimer + * interrupt routine. We give it 3 attempts to avoid + * overreacting on some spurious event. + * + * Acquire base lock for updating the offsets and retrieving + * the current time. + */ + raw_spin_lock_irqsave(&cpu_base->lock, flags); + now = hrtimer_update_base(cpu_base); + cpu_base->nr_retries++; + if (++retries < 3) + goto retry; + /* + * Give the system a chance to do something else than looping + * here. We stored the entry time, so we know exactly how long + * we spent here. We schedule the next event this amount of + * time away. + */ + cpu_base->nr_hangs++; + cpu_base->hang_detected = 1; + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + delta = ktime_sub(now, entry_time); + if ((unsigned int)delta > cpu_base->max_hang_time) + cpu_base->max_hang_time = (unsigned int) delta; + /* + * Limit it to a sensible value as we enforce a longer + * delay. Give the CPU at least 100ms to catch up. + */ + if (delta > 100 * NSEC_PER_MSEC) + expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); + else + expires_next = ktime_add(now, delta); + tick_program_event(expires_next, 1); + pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); +} + +/* called with interrupts disabled */ +static inline void __hrtimer_peek_ahead_timers(void) +{ + struct tick_device *td; + + if (!hrtimer_hres_active()) + return; + + td = this_cpu_ptr(&tick_cpu_device); + if (td && td->evtdev) + hrtimer_interrupt(td->evtdev); +} + +#else /* CONFIG_HIGH_RES_TIMERS */ + +static inline void __hrtimer_peek_ahead_timers(void) { } + +#endif /* !CONFIG_HIGH_RES_TIMERS */ + +/* + * Called from run_local_timers in hardirq context every jiffy + */ +void hrtimer_run_queues(void) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + unsigned long flags; + ktime_t now; + + if (__hrtimer_hres_active(cpu_base)) + return; + + /* + * This _is_ ugly: We have to check periodically, whether we + * can switch to highres and / or nohz mode. The clocksource + * switch happens with xtime_lock held. Notification from + * there only sets the check bit in the tick_oneshot code, + * otherwise we might deadlock vs. xtime_lock. + */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { + hrtimer_switch_to_hres(); + return; + } + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + now = hrtimer_update_base(cpu_base); + + if (!ktime_before(now, cpu_base->softirq_expires_next)) { + cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->softirq_activated = 1; + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); +} + +/* + * Sleep related functions: + */ +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) +{ + struct hrtimer_sleeper *t = + container_of(timer, struct hrtimer_sleeper, timer); + struct task_struct *task = t->task; + + t->task = NULL; + if (task) + wake_up_process(task); + + return HRTIMER_NORESTART; +} + +/** + * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer + * @sl: sleeper to be started + * @mode: timer mode abs/rel + * + * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers + * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) + */ +void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, + enum hrtimer_mode mode) +{ + /* + * Make the enqueue delivery mode check work on RT. If the sleeper + * was initialized for hard interrupt delivery, force the mode bit. + * This is a special case for hrtimer_sleepers because + * hrtimer_init_sleeper() determines the delivery mode on RT so the + * fiddling with this decision is avoided at the call sites. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) + mode |= HRTIMER_MODE_HARD; + + hrtimer_start_expires(&sl->timer, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); + +static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) +{ + /* + * On PREEMPT_RT enabled kernels hrtimers which are not explicitly + * marked for hard interrupt expiry mode are moved into soft + * interrupt context either for latency reasons or because the + * hrtimer callback takes regular spinlocks or invokes other + * functions which are not suitable for hard interrupt context on + * PREEMPT_RT. + * + * The hrtimer_sleeper callback is RT compatible in hard interrupt + * context, but there is a latency concern: Untrusted userspace can + * spawn many threads which arm timers for the same expiry time on + * the same CPU. That causes a latency spike due to the wakeup of + * a gazillion threads. + * + * OTOH, privileged real-time user space applications rely on the + * low latency of hard interrupt wakeups. If the current task is in + * a real-time scheduling class, mark the mode for hard interrupt + * expiry. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) + mode |= HRTIMER_MODE_HARD; + } + + __hrtimer_init(&sl->timer, clock_id, mode); + sl->timer.function = hrtimer_wakeup; + sl->task = current; +} + +/** + * hrtimer_init_sleeper - initialize sleeper to the given clock + * @sl: sleeper to be initialized + * @clock_id: the clock to be used + * @mode: timer mode abs/rel + */ +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_init(&sl->timer, clock_id, mode); + __hrtimer_init_sleeper(sl, clock_id, mode); + +} +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); + +int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) +{ + switch(restart->nanosleep.type) { +#ifdef CONFIG_COMPAT_32BIT_TIME + case TT_COMPAT: + if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) + return -EFAULT; + break; +#endif + case TT_NATIVE: + if (put_timespec64(ts, restart->nanosleep.rmtp)) + return -EFAULT; + break; + default: + BUG(); + } + return -ERESTART_RESTARTBLOCK; +} + +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) +{ + struct restart_block *restart; + + do { + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + hrtimer_sleeper_start_expires(t, mode); + + if (likely(t->task)) + schedule(); + + hrtimer_cancel(&t->timer); + mode = HRTIMER_MODE_ABS; + + } while (t->task && !signal_pending(current)); + + __set_current_state(TASK_RUNNING); + + if (!t->task) + return 0; + + restart = ¤t->restart_block; + if (restart->nanosleep.type != TT_NONE) { + ktime_t rem = hrtimer_expires_remaining(&t->timer); + struct timespec64 rmt; + + if (rem <= 0) + return 0; + rmt = ktime_to_timespec64(rem); + + return nanosleep_copyout(restart, &rmt); + } + return -ERESTART_RESTARTBLOCK; +} + +static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) +{ + struct hrtimer_sleeper t; + int ret; + + hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid, + HRTIMER_MODE_ABS); + hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); + ret = do_nanosleep(&t, HRTIMER_MODE_ABS); + destroy_hrtimer_on_stack(&t.timer); + return ret; +} + +long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, + const clockid_t clockid) +{ + struct restart_block *restart; + struct hrtimer_sleeper t; + int ret = 0; + u64 slack; + + slack = current->timer_slack_ns; + if (rt_task(current)) + slack = 0; + + hrtimer_init_sleeper_on_stack(&t, clockid, mode); + hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); + ret = do_nanosleep(&t, mode); + if (ret != -ERESTART_RESTARTBLOCK) + goto out; + + /* Absolute timers do not update the rmtp value and restart: */ + if (mode == HRTIMER_MODE_ABS) { + ret = -ERESTARTNOHAND; + goto out; + } + + restart = ¤t->restart_block; + restart->nanosleep.clockid = t.timer.base->clockid; + restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); + set_restart_fn(restart, hrtimer_nanosleep_restart); +out: + destroy_hrtimer_on_stack(&t.timer); + return ret; +} + +#ifdef CONFIG_64BIT + +SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) +{ + struct timespec64 tu; + + if (get_timespec64(&tu, rqtp)) + return -EFAULT; + + if (!timespec64_valid(&tu)) + return -EINVAL; + + current->restart_block.fn = do_no_restart_syscall; + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; + current->restart_block.nanosleep.rmtp = rmtp; + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, + CLOCK_MONOTONIC); +} + +#endif + +#ifdef CONFIG_COMPAT_32BIT_TIME + +SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) +{ + struct timespec64 tu; + + if (get_old_timespec32(&tu, rqtp)) + return -EFAULT; + + if (!timespec64_valid(&tu)) + return -EINVAL; + + current->restart_block.fn = do_no_restart_syscall; + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.nanosleep.compat_rmtp = rmtp; + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, + CLOCK_MONOTONIC); +} +#endif + +/* + * Functions related to boot-time initialization: + */ +int hrtimers_prepare_cpu(unsigned int cpu) +{ + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + int i; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; + + clock_b->cpu_base = cpu_base; + seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); + timerqueue_init_head(&clock_b->active); + } + + cpu_base->cpu = cpu; + cpu_base->active_bases = 0; + cpu_base->hres_active = 0; + cpu_base->hang_detected = 0; + cpu_base->next_timer = NULL; + cpu_base->softirq_next_timer = NULL; + cpu_base->expires_next = KTIME_MAX; + cpu_base->softirq_expires_next = KTIME_MAX; + hrtimer_cpu_base_init_expiry_lock(cpu_base); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base) +{ + struct hrtimer *timer; + struct timerqueue_node *node; + + while ((node = timerqueue_getnext(&old_base->active))) { + timer = container_of(node, struct hrtimer, node); + BUG_ON(hrtimer_callback_running(timer)); + debug_deactivate(timer); + + /* + * Mark it as ENQUEUED not INACTIVE otherwise the + * timer could be seen as !active and just vanish away + * under us on another CPU + */ + __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); + timer->base = new_base; + /* + * Enqueue the timers on the new cpu. This does not + * reprogram the event device in case the timer + * expires before the earliest on this CPU, but we run + * hrtimer_interrupt after we migrated everything to + * sort out already expired timers and reprogram the + * event device. + */ + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); + } +} + +int hrtimers_cpu_dying(unsigned int dying_cpu) +{ + struct hrtimer_cpu_base *old_base, *new_base; + int i, ncpu = cpumask_first(cpu_active_mask); + + tick_cancel_sched_timer(dying_cpu); + + old_base = this_cpu_ptr(&hrtimer_bases); + new_base = &per_cpu(hrtimer_bases, ncpu); + + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + raw_spin_lock(&old_base->lock); + raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i]); + } + + /* + * The migration might have changed the first expiring softirq + * timer on this CPU. Update it. + */ + __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); + /* Tell the other CPU to retrigger the next event */ + smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); + + raw_spin_unlock(&new_base->lock); + raw_spin_unlock(&old_base->lock); + + return 0; +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +void __init hrtimers_init(void) +{ + hrtimers_prepare_cpu(smp_processor_id()); + open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); +} + +/** + * schedule_hrtimeout_range_clock - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks + * @mode: timer mode + * @clock_id: timer clock to be used + */ +int __sched +schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, + const enum hrtimer_mode mode, clockid_t clock_id) +{ + struct hrtimer_sleeper t; + + /* + * Optimize when a zero timeout value is given. It does not + * matter whether this is an absolute or a relative time. + */ + if (expires && *expires == 0) { + __set_current_state(TASK_RUNNING); + return 0; + } + + /* + * A NULL parameter means "infinite" + */ + if (!expires) { + schedule(); + return -EINTR; + } + + /* + * Override any slack passed by the user if under + * rt contraints. + */ + if (rt_task(current)) + delta = 0; + + hrtimer_init_sleeper_on_stack(&t, clock_id, mode); + hrtimer_set_expires_range_ns(&t.timer, *expires, delta); + hrtimer_sleeper_start_expires(&t, mode); + + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + + __set_current_state(TASK_RUNNING); + + return !t.task ? 0 : -EINTR; +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); + +/** + * schedule_hrtimeout_range - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks + * @mode: timer mode + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * The @delta argument gives the kernel the freedom to schedule the + * actual wakeup to a time that is both power and performance friendly + * for regular (non RT/DL) tasks. + * The kernel give the normal best effort behavior for "@expires+@delta", + * but may decide to fire the timer earlier, but no earlier than @expires. + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task or the current task is explicitly woken + * up. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. + */ +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range_clock(expires, delta, mode, + CLOCK_MONOTONIC); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); + +/** + * schedule_hrtimeout - sleep until timeout + * @expires: timeout value (ktime_t) + * @mode: timer mode + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task or the current task is explicitly woken + * up. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. + */ +int __sched schedule_hrtimeout(ktime_t *expires, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range(expires, 0, mode); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout); diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c new file mode 100644 index 0000000000..00629e658c --- /dev/null +++ b/kernel/time/itimer.c @@ -0,0 +1,403 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1992 Darren Senn + */ + +/* These are all the functions necessary to implement itimers */ + +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/syscalls.h> +#include <linux/time.h> +#include <linux/sched/signal.h> +#include <linux/sched/cputime.h> +#include <linux/posix-timers.h> +#include <linux/hrtimer.h> +#include <trace/events/timer.h> +#include <linux/compat.h> + +#include <linux/uaccess.h> + +/** + * itimer_get_remtime - get remaining time for the timer + * + * @timer: the timer to read + * + * Returns the delta between the expiry time and now, which can be + * less than zero or 1usec for an pending expired timer + */ +static struct timespec64 itimer_get_remtime(struct hrtimer *timer) +{ + ktime_t rem = __hrtimer_get_remaining(timer, true); + + /* + * Racy but safe: if the itimer expires after the above + * hrtimer_get_remtime() call but before this condition + * then we return 0 - which is correct. + */ + if (hrtimer_active(timer)) { + if (rem <= 0) + rem = NSEC_PER_USEC; + } else + rem = 0; + + return ktime_to_timespec64(rem); +} + +static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + struct itimerspec64 *const value) +{ + u64 val, interval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + spin_lock_irq(&tsk->sighand->siglock); + + val = it->expires; + interval = it->incr; + if (val) { + u64 t, samples[CPUCLOCK_MAX]; + + thread_group_sample_cputime(tsk, samples); + t = samples[clock_id]; + + if (val < t) + /* about to fire */ + val = TICK_NSEC; + else + val -= t; + } + + spin_unlock_irq(&tsk->sighand->siglock); + + value->it_value = ns_to_timespec64(val); + value->it_interval = ns_to_timespec64(interval); +} + +static int do_getitimer(int which, struct itimerspec64 *value) +{ + struct task_struct *tsk = current; + + switch (which) { + case ITIMER_REAL: + spin_lock_irq(&tsk->sighand->siglock); + value->it_value = itimer_get_remtime(&tsk->signal->real_timer); + value->it_interval = + ktime_to_timespec64(tsk->signal->it_real_incr); + spin_unlock_irq(&tsk->sighand->siglock); + break; + case ITIMER_VIRTUAL: + get_cpu_itimer(tsk, CPUCLOCK_VIRT, value); + break; + case ITIMER_PROF: + get_cpu_itimer(tsk, CPUCLOCK_PROF, value); + break; + default: + return(-EINVAL); + } + return 0; +} + +static int put_itimerval(struct __kernel_old_itimerval __user *o, + const struct itimerspec64 *i) +{ + struct __kernel_old_itimerval v; + + v.it_interval.tv_sec = i->it_interval.tv_sec; + v.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC; + v.it_value.tv_sec = i->it_value.tv_sec; + v.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC; + return copy_to_user(o, &v, sizeof(struct __kernel_old_itimerval)) ? -EFAULT : 0; +} + + +SYSCALL_DEFINE2(getitimer, int, which, struct __kernel_old_itimerval __user *, value) +{ + struct itimerspec64 get_buffer; + int error = do_getitimer(which, &get_buffer); + + if (!error && put_itimerval(value, &get_buffer)) + error = -EFAULT; + return error; +} + +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) +struct old_itimerval32 { + struct old_timeval32 it_interval; + struct old_timeval32 it_value; +}; + +static int put_old_itimerval32(struct old_itimerval32 __user *o, + const struct itimerspec64 *i) +{ + struct old_itimerval32 v32; + + v32.it_interval.tv_sec = i->it_interval.tv_sec; + v32.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC; + v32.it_value.tv_sec = i->it_value.tv_sec; + v32.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC; + return copy_to_user(o, &v32, sizeof(struct old_itimerval32)) ? -EFAULT : 0; +} + +COMPAT_SYSCALL_DEFINE2(getitimer, int, which, + struct old_itimerval32 __user *, value) +{ + struct itimerspec64 get_buffer; + int error = do_getitimer(which, &get_buffer); + + if (!error && put_old_itimerval32(value, &get_buffer)) + error = -EFAULT; + return error; +} +#endif + +/* + * The timer is automagically restarted, when interval != 0 + */ +enum hrtimer_restart it_real_fn(struct hrtimer *timer) +{ + struct signal_struct *sig = + container_of(timer, struct signal_struct, real_timer); + struct pid *leader_pid = sig->pids[PIDTYPE_TGID]; + + trace_itimer_expire(ITIMER_REAL, leader_pid, 0); + kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid); + + return HRTIMER_NORESTART; +} + +static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + const struct itimerspec64 *const value, + struct itimerspec64 *const ovalue) +{ + u64 oval, nval, ointerval, ninterval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + nval = timespec64_to_ns(&value->it_value); + ninterval = timespec64_to_ns(&value->it_interval); + + spin_lock_irq(&tsk->sighand->siglock); + + oval = it->expires; + ointerval = it->incr; + if (oval || nval) { + if (nval > 0) + nval += TICK_NSEC; + set_process_cpu_timer(tsk, clock_id, &nval, &oval); + } + it->expires = nval; + it->incr = ninterval; + trace_itimer_state(clock_id == CPUCLOCK_VIRT ? + ITIMER_VIRTUAL : ITIMER_PROF, value, nval); + + spin_unlock_irq(&tsk->sighand->siglock); + + if (ovalue) { + ovalue->it_value = ns_to_timespec64(oval); + ovalue->it_interval = ns_to_timespec64(ointerval); + } +} + +/* + * Returns true if the timeval is in canonical form + */ +#define timeval_valid(t) \ + (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) + +static int do_setitimer(int which, struct itimerspec64 *value, + struct itimerspec64 *ovalue) +{ + struct task_struct *tsk = current; + struct hrtimer *timer; + ktime_t expires; + + switch (which) { + case ITIMER_REAL: +again: + spin_lock_irq(&tsk->sighand->siglock); + timer = &tsk->signal->real_timer; + if (ovalue) { + ovalue->it_value = itimer_get_remtime(timer); + ovalue->it_interval + = ktime_to_timespec64(tsk->signal->it_real_incr); + } + /* We are sharing ->siglock with it_real_fn() */ + if (hrtimer_try_to_cancel(timer) < 0) { + spin_unlock_irq(&tsk->sighand->siglock); + hrtimer_cancel_wait_running(timer); + goto again; + } + expires = timespec64_to_ktime(value->it_value); + if (expires != 0) { + tsk->signal->it_real_incr = + timespec64_to_ktime(value->it_interval); + hrtimer_start(timer, expires, HRTIMER_MODE_REL); + } else + tsk->signal->it_real_incr = 0; + + trace_itimer_state(ITIMER_REAL, value, 0); + spin_unlock_irq(&tsk->sighand->siglock); + break; + case ITIMER_VIRTUAL: + set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue); + break; + case ITIMER_PROF: + set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue); + break; + default: + return -EINVAL; + } + return 0; +} + +#ifdef CONFIG_SECURITY_SELINUX +void clear_itimer(void) +{ + struct itimerspec64 v = {}; + int i; + + for (i = 0; i < 3; i++) + do_setitimer(i, &v, NULL); +} +#endif + +#ifdef __ARCH_WANT_SYS_ALARM + +/** + * alarm_setitimer - set alarm in seconds + * + * @seconds: number of seconds until alarm + * 0 disables the alarm + * + * Returns the remaining time in seconds of a pending timer or 0 when + * the timer is not active. + * + * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid + * negative timeval settings which would cause immediate expiry. + */ +static unsigned int alarm_setitimer(unsigned int seconds) +{ + struct itimerspec64 it_new, it_old; + +#if BITS_PER_LONG < 64 + if (seconds > INT_MAX) + seconds = INT_MAX; +#endif + it_new.it_value.tv_sec = seconds; + it_new.it_value.tv_nsec = 0; + it_new.it_interval.tv_sec = it_new.it_interval.tv_nsec = 0; + + do_setitimer(ITIMER_REAL, &it_new, &it_old); + + /* + * We can't return 0 if we have an alarm pending ... And we'd + * better return too much than too little anyway + */ + if ((!it_old.it_value.tv_sec && it_old.it_value.tv_nsec) || + it_old.it_value.tv_nsec >= (NSEC_PER_SEC / 2)) + it_old.it_value.tv_sec++; + + return it_old.it_value.tv_sec; +} + +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ +SYSCALL_DEFINE1(alarm, unsigned int, seconds) +{ + return alarm_setitimer(seconds); +} + +#endif + +static int get_itimerval(struct itimerspec64 *o, const struct __kernel_old_itimerval __user *i) +{ + struct __kernel_old_itimerval v; + + if (copy_from_user(&v, i, sizeof(struct __kernel_old_itimerval))) + return -EFAULT; + + /* Validate the timevals in value. */ + if (!timeval_valid(&v.it_value) || + !timeval_valid(&v.it_interval)) + return -EINVAL; + + o->it_interval.tv_sec = v.it_interval.tv_sec; + o->it_interval.tv_nsec = v.it_interval.tv_usec * NSEC_PER_USEC; + o->it_value.tv_sec = v.it_value.tv_sec; + o->it_value.tv_nsec = v.it_value.tv_usec * NSEC_PER_USEC; + return 0; +} + +SYSCALL_DEFINE3(setitimer, int, which, struct __kernel_old_itimerval __user *, value, + struct __kernel_old_itimerval __user *, ovalue) +{ + struct itimerspec64 set_buffer, get_buffer; + int error; + + if (value) { + error = get_itimerval(&set_buffer, value); + if (error) + return error; + } else { + memset(&set_buffer, 0, sizeof(set_buffer)); + printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." + " Misfeature support will be removed\n", + current->comm); + } + + error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); + if (error || !ovalue) + return error; + + if (put_itimerval(ovalue, &get_buffer)) + return -EFAULT; + return 0; +} + +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) +static int get_old_itimerval32(struct itimerspec64 *o, const struct old_itimerval32 __user *i) +{ + struct old_itimerval32 v32; + + if (copy_from_user(&v32, i, sizeof(struct old_itimerval32))) + return -EFAULT; + + /* Validate the timevals in value. */ + if (!timeval_valid(&v32.it_value) || + !timeval_valid(&v32.it_interval)) + return -EINVAL; + + o->it_interval.tv_sec = v32.it_interval.tv_sec; + o->it_interval.tv_nsec = v32.it_interval.tv_usec * NSEC_PER_USEC; + o->it_value.tv_sec = v32.it_value.tv_sec; + o->it_value.tv_nsec = v32.it_value.tv_usec * NSEC_PER_USEC; + return 0; +} + +COMPAT_SYSCALL_DEFINE3(setitimer, int, which, + struct old_itimerval32 __user *, value, + struct old_itimerval32 __user *, ovalue) +{ + struct itimerspec64 set_buffer, get_buffer; + int error; + + if (value) { + error = get_old_itimerval32(&set_buffer, value); + if (error) + return error; + } else { + memset(&set_buffer, 0, sizeof(set_buffer)); + printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." + " Misfeature support will be removed\n", + current->comm); + } + + error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); + if (error || !ovalue) + return error; + if (put_old_itimerval32(ovalue, &get_buffer)) + return -EFAULT; + return 0; +} +#endif diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c new file mode 100644 index 0000000000..bc4db9e5ab --- /dev/null +++ b/kernel/time/jiffies.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * This file contains the jiffies based clocksource. + * + * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) + */ +#include <linux/clocksource.h> +#include <linux/jiffies.h> +#include <linux/module.h> +#include <linux/init.h> + +#include "timekeeping.h" +#include "tick-internal.h" + + +static u64 jiffies_read(struct clocksource *cs) +{ + return (u64) jiffies; +} + +/* + * The Jiffies based clocksource is the lowest common + * denominator clock source which should function on + * all systems. It has the same coarse resolution as + * the timer interrupt frequency HZ and it suffers + * inaccuracies caused by missed or lost timer + * interrupts and the inability for the timer + * interrupt hardware to accurately tick at the + * requested HZ value. It is also not recommended + * for "tick-less" systems. + */ +static struct clocksource clocksource_jiffies = { + .name = "jiffies", + .rating = 1, /* lowest valid rating*/ + .uncertainty_margin = 32 * NSEC_PER_MSEC, + .read = jiffies_read, + .mask = CLOCKSOURCE_MASK(32), + .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ + .shift = JIFFIES_SHIFT, + .max_cycles = 10, +}; + +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); +__cacheline_aligned_in_smp seqcount_raw_spinlock_t jiffies_seq = + SEQCNT_RAW_SPINLOCK_ZERO(jiffies_seq, &jiffies_lock); + +#if (BITS_PER_LONG < 64) +u64 get_jiffies_64(void) +{ + unsigned int seq; + u64 ret; + + do { + seq = read_seqcount_begin(&jiffies_seq); + ret = jiffies_64; + } while (read_seqcount_retry(&jiffies_seq, seq)); + return ret; +} +EXPORT_SYMBOL(get_jiffies_64); +#endif + +EXPORT_SYMBOL(jiffies); + +static int __init init_jiffies_clocksource(void) +{ + return __clocksource_register(&clocksource_jiffies); +} + +core_initcall(init_jiffies_clocksource); + +struct clocksource * __init __weak clocksource_default_clock(void) +{ + return &clocksource_jiffies; +} + +static struct clocksource refined_jiffies; + +int register_refined_jiffies(long cycles_per_second) +{ + u64 nsec_per_tick, shift_hz; + long cycles_per_tick; + + + + refined_jiffies = clocksource_jiffies; + refined_jiffies.name = "refined-jiffies"; + refined_jiffies.rating++; + + /* Calc cycles per tick */ + cycles_per_tick = (cycles_per_second + HZ/2)/HZ; + /* shift_hz stores hz<<8 for extra accuracy */ + shift_hz = (u64)cycles_per_second << 8; + shift_hz += cycles_per_tick/2; + do_div(shift_hz, cycles_per_tick); + /* Calculate nsec_per_tick using shift_hz */ + nsec_per_tick = (u64)NSEC_PER_SEC << 8; + nsec_per_tick += (u32)shift_hz/2; + do_div(nsec_per_tick, (u32)shift_hz); + + refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; + + __clocksource_register(&refined_jiffies); + return 0; +} diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c new file mode 100644 index 0000000000..0775b9ec95 --- /dev/null +++ b/kernel/time/namespace.c @@ -0,0 +1,485 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: Andrei Vagin <avagin@openvz.org> + * Author: Dmitry Safonov <dima@arista.com> + */ + +#include <linux/time_namespace.h> +#include <linux/user_namespace.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> +#include <linux/clocksource.h> +#include <linux/seq_file.h> +#include <linux/proc_ns.h> +#include <linux/export.h> +#include <linux/time.h> +#include <linux/slab.h> +#include <linux/cred.h> +#include <linux/err.h> +#include <linux/mm.h> + +#include <vdso/datapage.h> + +ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, + struct timens_offsets *ns_offsets) +{ + ktime_t offset; + + switch (clockid) { + case CLOCK_MONOTONIC: + offset = timespec64_to_ktime(ns_offsets->monotonic); + break; + case CLOCK_BOOTTIME: + case CLOCK_BOOTTIME_ALARM: + offset = timespec64_to_ktime(ns_offsets->boottime); + break; + default: + return tim; + } + + /* + * Check that @tim value is in [offset, KTIME_MAX + offset] + * and subtract offset. + */ + if (tim < offset) { + /* + * User can specify @tim *absolute* value - if it's lesser than + * the time namespace's offset - it's already expired. + */ + tim = 0; + } else { + tim = ktime_sub(tim, offset); + if (unlikely(tim > KTIME_MAX)) + tim = KTIME_MAX; + } + + return tim; +} + +static struct ucounts *inc_time_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); +} + +static void dec_time_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES); +} + +/** + * clone_time_ns - Clone a time namespace + * @user_ns: User namespace which owns a new namespace. + * @old_ns: Namespace to clone + * + * Clone @old_ns and set the clone refcount to 1 + * + * Return: The new namespace or ERR_PTR. + */ +static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, + struct time_namespace *old_ns) +{ + struct time_namespace *ns; + struct ucounts *ucounts; + int err; + + err = -ENOSPC; + ucounts = inc_time_namespaces(user_ns); + if (!ucounts) + goto fail; + + err = -ENOMEM; + ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); + if (!ns) + goto fail_dec; + + refcount_set(&ns->ns.count, 1); + + ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!ns->vvar_page) + goto fail_free; + + err = ns_alloc_inum(&ns->ns); + if (err) + goto fail_free_page; + + ns->ucounts = ucounts; + ns->ns.ops = &timens_operations; + ns->user_ns = get_user_ns(user_ns); + ns->offsets = old_ns->offsets; + ns->frozen_offsets = false; + return ns; + +fail_free_page: + __free_page(ns->vvar_page); +fail_free: + kfree(ns); +fail_dec: + dec_time_namespaces(ucounts); +fail: + return ERR_PTR(err); +} + +/** + * copy_time_ns - Create timens_for_children from @old_ns + * @flags: Cloning flags + * @user_ns: User namespace which owns a new namespace. + * @old_ns: Namespace to clone + * + * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children; + * adds a refcounter to @old_ns otherwise. + * + * Return: timens_for_children namespace or ERR_PTR. + */ +struct time_namespace *copy_time_ns(unsigned long flags, + struct user_namespace *user_ns, struct time_namespace *old_ns) +{ + if (!(flags & CLONE_NEWTIME)) + return get_time_ns(old_ns); + + return clone_time_ns(user_ns, old_ns); +} + +static struct timens_offset offset_from_ts(struct timespec64 off) +{ + struct timens_offset ret; + + ret.sec = off.tv_sec; + ret.nsec = off.tv_nsec; + + return ret; +} + +/* + * A time namespace VVAR page has the same layout as the VVAR page which + * contains the system wide VDSO data. + * + * For a normal task the VVAR pages are installed in the normal ordering: + * VVAR + * PVCLOCK + * HVCLOCK + * TIMENS <- Not really required + * + * Now for a timens task the pages are installed in the following order: + * TIMENS + * PVCLOCK + * HVCLOCK + * VVAR + * + * The check for vdso_data->clock_mode is in the unlikely path of + * the seq begin magic. So for the non-timens case most of the time + * 'seq' is even, so the branch is not taken. + * + * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check + * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the + * update to finish and for 'seq' to become even anyway. + * + * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * enforces the time namespace handling path. + */ +static void timens_setup_vdso_data(struct vdso_data *vdata, + struct time_namespace *ns) +{ + struct timens_offset *offset = vdata->offset; + struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); + struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); + + vdata->seq = 1; + vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; + offset[CLOCK_MONOTONIC] = monotonic; + offset[CLOCK_MONOTONIC_RAW] = monotonic; + offset[CLOCK_MONOTONIC_COARSE] = monotonic; + offset[CLOCK_BOOTTIME] = boottime; + offset[CLOCK_BOOTTIME_ALARM] = boottime; +} + +struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + if (likely(vma->vm_mm == current->mm)) + return current->nsproxy->time_ns->vvar_page; + + /* + * VM_PFNMAP | VM_IO protect .fault() handler from being called + * through interfaces like /proc/$pid/mem or + * process_vm_{readv,writev}() as long as there's no .access() + * in special_mapping_vmops(). + * For more details check_vma_flags() and __access_remote_vm() + */ + + WARN(1, "vvar_page accessed remotely"); + + return NULL; +} + +/* + * Protects possibly multiple offsets writers racing each other + * and tasks entering the namespace. + */ +static DEFINE_MUTEX(offset_lock); + +static void timens_set_vvar_page(struct task_struct *task, + struct time_namespace *ns) +{ + struct vdso_data *vdata; + unsigned int i; + + if (ns == &init_time_ns) + return; + + /* Fast-path, taken by every task in namespace except the first. */ + if (likely(ns->frozen_offsets)) + return; + + mutex_lock(&offset_lock); + /* Nothing to-do: vvar_page has been already initialized. */ + if (ns->frozen_offsets) + goto out; + + ns->frozen_offsets = true; + vdata = arch_get_vdso_data(page_address(ns->vvar_page)); + + for (i = 0; i < CS_BASES; i++) + timens_setup_vdso_data(&vdata[i], ns); + +out: + mutex_unlock(&offset_lock); +} + +void free_time_ns(struct time_namespace *ns) +{ + dec_time_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + __free_page(ns->vvar_page); + kfree(ns); +} + +static struct time_namespace *to_time_ns(struct ns_common *ns) +{ + return container_of(ns, struct time_namespace, ns); +} + +static struct ns_common *timens_get(struct task_struct *task) +{ + struct time_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->time_ns; + get_time_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static struct ns_common *timens_for_children_get(struct task_struct *task) +{ + struct time_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->time_ns_for_children; + get_time_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void timens_put(struct ns_common *ns) +{ + put_time_ns(to_time_ns(ns)); +} + +void timens_commit(struct task_struct *tsk, struct time_namespace *ns) +{ + timens_set_vvar_page(tsk, ns); + vdso_join_timens(tsk, ns); +} + +static int timens_install(struct nsset *nsset, struct ns_common *new) +{ + struct nsproxy *nsproxy = nsset->nsproxy; + struct time_namespace *ns = to_time_ns(new); + + if (!current_is_single_threaded()) + return -EUSERS; + + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns); + nsproxy->time_ns = ns; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns_for_children); + nsproxy->time_ns_for_children = ns; + return 0; +} + +void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) +{ + struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; + struct time_namespace *ns = to_time_ns(nsc); + + /* create_new_namespaces() already incremented the ref counter */ + if (nsproxy->time_ns == nsproxy->time_ns_for_children) + return; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns); + nsproxy->time_ns = ns; + + timens_commit(tsk, ns); +} + +static struct user_namespace *timens_owner(struct ns_common *ns) +{ + return to_time_ns(ns)->user_ns; +} + +static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) +{ + char *clock; + + switch (clockid) { + case CLOCK_BOOTTIME: + clock = "boottime"; + break; + case CLOCK_MONOTONIC: + clock = "monotonic"; + break; + default: + clock = "unknown"; + break; + } + seq_printf(m, "%-10s %10lld %9ld\n", clock, ts->tv_sec, ts->tv_nsec); +} + +void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) +{ + struct ns_common *ns; + struct time_namespace *time_ns; + + ns = timens_for_children_get(p); + if (!ns) + return; + time_ns = to_time_ns(ns); + + show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); + show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); + put_time_ns(time_ns); +} + +int proc_timens_set_offset(struct file *file, struct task_struct *p, + struct proc_timens_offset *offsets, int noffsets) +{ + struct ns_common *ns; + struct time_namespace *time_ns; + struct timespec64 tp; + int i, err; + + ns = timens_for_children_get(p); + if (!ns) + return -ESRCH; + time_ns = to_time_ns(ns); + + if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { + put_time_ns(time_ns); + return -EPERM; + } + + for (i = 0; i < noffsets; i++) { + struct proc_timens_offset *off = &offsets[i]; + + switch (off->clockid) { + case CLOCK_MONOTONIC: + ktime_get_ts64(&tp); + break; + case CLOCK_BOOTTIME: + ktime_get_boottime_ts64(&tp); + break; + default: + err = -EINVAL; + goto out; + } + + err = -ERANGE; + + if (off->val.tv_sec > KTIME_SEC_MAX || + off->val.tv_sec < -KTIME_SEC_MAX) + goto out; + + tp = timespec64_add(tp, off->val); + /* + * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is + * still unreachable. + */ + if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) + goto out; + } + + mutex_lock(&offset_lock); + if (time_ns->frozen_offsets) { + err = -EACCES; + goto out_unlock; + } + + err = 0; + /* Don't report errors after this line */ + for (i = 0; i < noffsets; i++) { + struct proc_timens_offset *off = &offsets[i]; + struct timespec64 *offset = NULL; + + switch (off->clockid) { + case CLOCK_MONOTONIC: + offset = &time_ns->offsets.monotonic; + break; + case CLOCK_BOOTTIME: + offset = &time_ns->offsets.boottime; + break; + } + + *offset = off->val; + } + +out_unlock: + mutex_unlock(&offset_lock); +out: + put_time_ns(time_ns); + + return err; +} + +const struct proc_ns_operations timens_operations = { + .name = "time", + .type = CLONE_NEWTIME, + .get = timens_get, + .put = timens_put, + .install = timens_install, + .owner = timens_owner, +}; + +const struct proc_ns_operations timens_for_children_operations = { + .name = "time_for_children", + .real_ns_name = "time", + .type = CLONE_NEWTIME, + .get = timens_for_children_get, + .put = timens_put, + .install = timens_install, + .owner = timens_owner, +}; + +struct time_namespace init_time_ns = { + .ns.count = REFCOUNT_INIT(3), + .user_ns = &init_user_ns, + .ns.inum = PROC_TIME_INIT_INO, + .ns.ops = &timens_operations, + .frozen_offsets = true, +}; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c new file mode 100644 index 0000000000..406dccb79c --- /dev/null +++ b/kernel/time/ntp.c @@ -0,0 +1,1096 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NTP state machine interfaces and logic. + * + * This code was mainly moved from kernel/timer.c and kernel/time.c + * Please see those files for relevant copyright info and historical + * changelogs. + */ +#include <linux/capability.h> +#include <linux/clocksource.h> +#include <linux/workqueue.h> +#include <linux/hrtimer.h> +#include <linux/jiffies.h> +#include <linux/math64.h> +#include <linux/timex.h> +#include <linux/time.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/rtc.h> +#include <linux/audit.h> + +#include "ntp_internal.h" +#include "timekeeping_internal.h" + + +/* + * NTP timekeeping variables: + * + * Note: All of the NTP state is protected by the timekeeping locks. + */ + + +/* USER_HZ period (usecs): */ +unsigned long tick_usec = USER_TICK_USEC; + +/* SHIFTED_HZ period (nsecs): */ +unsigned long tick_nsec; + +static u64 tick_length; +static u64 tick_length_base; + +#define SECS_PER_DAY 86400 +#define MAX_TICKADJ 500LL /* usecs */ +#define MAX_TICKADJ_SCALED \ + (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) +#define MAX_TAI_OFFSET 100000 + +/* + * phase-lock loop variables + */ + +/* + * clock synchronization status + * + * (TIME_ERROR prevents overwriting the CMOS clock) + */ +static int time_state = TIME_OK; + +/* clock status bits: */ +static int time_status = STA_UNSYNC; + +/* time adjustment (nsecs): */ +static s64 time_offset; + +/* pll time constant: */ +static long time_constant = 2; + +/* maximum error (usecs): */ +static long time_maxerror = NTP_PHASE_LIMIT; + +/* estimated error (usecs): */ +static long time_esterror = NTP_PHASE_LIMIT; + +/* frequency offset (scaled nsecs/secs): */ +static s64 time_freq; + +/* time at last adjustment (secs): */ +static time64_t time_reftime; + +static long time_adjust; + +/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ +static s64 ntp_tick_adj; + +/* second value of the next pending leapsecond, or TIME64_MAX if no leap */ +static time64_t ntp_next_leap_sec = TIME64_MAX; + +#ifdef CONFIG_NTP_PPS + +/* + * The following variables are used when a pulse-per-second (PPS) signal + * is available. They establish the engineering parameters of the clock + * discipline loop when controlled by the PPS signal. + */ +#define PPS_VALID 10 /* PPS signal watchdog max (s) */ +#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */ +#define PPS_INTMIN 2 /* min freq interval (s) (shift) */ +#define PPS_INTMAX 8 /* max freq interval (s) (shift) */ +#define PPS_INTCOUNT 4 /* number of consecutive good intervals to + increase pps_shift or consecutive bad + intervals to decrease it */ +#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ + +static int pps_valid; /* signal watchdog counter */ +static long pps_tf[3]; /* phase median filter */ +static long pps_jitter; /* current jitter (ns) */ +static struct timespec64 pps_fbase; /* beginning of the last freq interval */ +static int pps_shift; /* current interval duration (s) (shift) */ +static int pps_intcnt; /* interval counter */ +static s64 pps_freq; /* frequency offset (scaled ns/s) */ +static long pps_stabil; /* current stability (scaled ns/s) */ + +/* + * PPS signal quality monitors + */ +static long pps_calcnt; /* calibration intervals */ +static long pps_jitcnt; /* jitter limit exceeded */ +static long pps_stbcnt; /* stability limit exceeded */ +static long pps_errcnt; /* calibration errors */ + + +/* PPS kernel consumer compensates the whole phase error immediately. + * Otherwise, reduce the offset by a fixed factor times the time constant. + */ +static inline s64 ntp_offset_chunk(s64 offset) +{ + if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + return offset; + else + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) +{ + /* the PPS calibration interval may end + surprisingly early */ + pps_shift = PPS_INTMIN; + pps_intcnt = 0; +} + +/** + * pps_clear - Clears the PPS state variables + */ +static inline void pps_clear(void) +{ + pps_reset_freq_interval(); + pps_tf[0] = 0; + pps_tf[1] = 0; + pps_tf[2] = 0; + pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; + pps_freq = 0; +} + +/* Decrease pps_valid to indicate that another second has passed since + * the last PPS signal. When it reaches 0, indicate that PPS signal is + * missing. + */ +static inline void pps_dec_valid(void) +{ + if (pps_valid > 0) + pps_valid--; + else { + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + pps_clear(); + } +} + +static inline void pps_set_freq(s64 freq) +{ + pps_freq = freq; +} + +static inline int is_error_status(int status) +{ + return (status & (STA_UNSYNC|STA_CLOCKERR)) + /* PPS signal lost when either PPS time or + * PPS frequency synchronization requested + */ + || ((status & (STA_PPSFREQ|STA_PPSTIME)) + && !(status & STA_PPSSIGNAL)) + /* PPS jitter exceeded when + * PPS time synchronization requested */ + || ((status & (STA_PPSTIME|STA_PPSJITTER)) + == (STA_PPSTIME|STA_PPSJITTER)) + /* PPS wander exceeded or calibration error when + * PPS frequency synchronization requested + */ + || ((status & STA_PPSFREQ) + && (status & (STA_PPSWANDER|STA_PPSERROR))); +} + +static inline void pps_fill_timex(struct __kernel_timex *txc) +{ + txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * + PPM_SCALE_INV, NTP_SCALE_SHIFT); + txc->jitter = pps_jitter; + if (!(time_status & STA_NANO)) + txc->jitter = pps_jitter / NSEC_PER_USEC; + txc->shift = pps_shift; + txc->stabil = pps_stabil; + txc->jitcnt = pps_jitcnt; + txc->calcnt = pps_calcnt; + txc->errcnt = pps_errcnt; + txc->stbcnt = pps_stbcnt; +} + +#else /* !CONFIG_NTP_PPS */ + +static inline s64 ntp_offset_chunk(s64 offset) +{ + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) {} +static inline void pps_clear(void) {} +static inline void pps_dec_valid(void) {} +static inline void pps_set_freq(s64 freq) {} + +static inline int is_error_status(int status) +{ + return status & (STA_UNSYNC|STA_CLOCKERR); +} + +static inline void pps_fill_timex(struct __kernel_timex *txc) +{ + /* PPS is not implemented, so these are zero */ + txc->ppsfreq = 0; + txc->jitter = 0; + txc->shift = 0; + txc->stabil = 0; + txc->jitcnt = 0; + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; +} + +#endif /* CONFIG_NTP_PPS */ + + +/** + * ntp_synced - Returns 1 if the NTP status is not UNSYNC + * + */ +static inline int ntp_synced(void) +{ + return !(time_status & STA_UNSYNC); +} + + +/* + * NTP methods: + */ + +/* + * Update (tick_length, tick_length_base, tick_nsec), based + * on (tick_usec, ntp_tick_adj, time_freq): + */ +static void ntp_update_frequency(void) +{ + u64 second_length; + u64 new_base; + + second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) + << NTP_SCALE_SHIFT; + + second_length += ntp_tick_adj; + second_length += time_freq; + + tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; + new_base = div_u64(second_length, NTP_INTERVAL_FREQ); + + /* + * Don't wait for the next second_overflow, apply + * the change to the tick length immediately: + */ + tick_length += new_base - tick_length_base; + tick_length_base = new_base; +} + +static inline s64 ntp_update_offset_fll(s64 offset64, long secs) +{ + time_status &= ~STA_MODE; + + if (secs < MINSEC) + return 0; + + if (!(time_status & STA_FLL) && (secs <= MAXSEC)) + return 0; + + time_status |= STA_MODE; + + return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); +} + +static void ntp_update_offset(long offset) +{ + s64 freq_adj; + s64 offset64; + long secs; + + if (!(time_status & STA_PLL)) + return; + + if (!(time_status & STA_NANO)) { + /* Make sure the multiplication below won't overflow */ + offset = clamp(offset, -USEC_PER_SEC, USEC_PER_SEC); + offset *= NSEC_PER_USEC; + } + + /* + * Scale the phase adjustment and + * clamp to the operating range. + */ + offset = clamp(offset, -MAXPHASE, MAXPHASE); + + /* + * Select how the frequency is to be controlled + * and in which mode (PLL or FLL). + */ + secs = (long)(__ktime_get_real_seconds() - time_reftime); + if (unlikely(time_status & STA_FREQHOLD)) + secs = 0; + + time_reftime = __ktime_get_real_seconds(); + + offset64 = offset; + freq_adj = ntp_update_offset_fll(offset64, secs); + + /* + * Clamp update interval to reduce PLL gain with low + * sampling rate (e.g. intermittent network connection) + * to avoid instability. + */ + if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) + secs = 1 << (SHIFT_PLL + 1 + time_constant); + + freq_adj += (offset64 * secs) << + (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); + + freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); + + time_freq = max(freq_adj, -MAXFREQ_SCALED); + + time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); +} + +/** + * ntp_clear - Clears the NTP state variables + */ +void ntp_clear(void) +{ + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; + + ntp_update_frequency(); + + tick_length = tick_length_base; + time_offset = 0; + + ntp_next_leap_sec = TIME64_MAX; + /* Clear PPS state variables */ + pps_clear(); +} + + +u64 ntp_tick_length(void) +{ + return tick_length; +} + +/** + * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t + * + * Provides the time of the next leapsecond against CLOCK_REALTIME in + * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending. + */ +ktime_t ntp_get_next_leap(void) +{ + ktime_t ret; + + if ((time_state == TIME_INS) && (time_status & STA_INS)) + return ktime_set(ntp_next_leap_sec, 0); + ret = KTIME_MAX; + return ret; +} + +/* + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + * + * Also handles leap second processing, and returns leap offset + */ +int second_overflow(time64_t secs) +{ + s64 delta; + int leap = 0; + s32 rem; + + /* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. + */ + switch (time_state) { + case TIME_OK: + if (time_status & STA_INS) { + time_state = TIME_INS; + div_s64_rem(secs, SECS_PER_DAY, &rem); + ntp_next_leap_sec = secs + SECS_PER_DAY - rem; + } else if (time_status & STA_DEL) { + time_state = TIME_DEL; + div_s64_rem(secs + 1, SECS_PER_DAY, &rem); + ntp_next_leap_sec = secs + SECS_PER_DAY - rem; + } + break; + case TIME_INS: + if (!(time_status & STA_INS)) { + ntp_next_leap_sec = TIME64_MAX; + time_state = TIME_OK; + } else if (secs == ntp_next_leap_sec) { + leap = -1; + time_state = TIME_OOP; + printk(KERN_NOTICE + "Clock: inserting leap second 23:59:60 UTC\n"); + } + break; + case TIME_DEL: + if (!(time_status & STA_DEL)) { + ntp_next_leap_sec = TIME64_MAX; + time_state = TIME_OK; + } else if (secs == ntp_next_leap_sec) { + leap = 1; + ntp_next_leap_sec = TIME64_MAX; + time_state = TIME_WAIT; + printk(KERN_NOTICE + "Clock: deleting leap second 23:59:59 UTC\n"); + } + break; + case TIME_OOP: + ntp_next_leap_sec = TIME64_MAX; + time_state = TIME_WAIT; + break; + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + break; + } + + + /* Bump the maxerror field */ + time_maxerror += MAXFREQ / NSEC_PER_USEC; + if (time_maxerror > NTP_PHASE_LIMIT) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; + } + + /* Compute the phase adjustment for the next second */ + tick_length = tick_length_base; + + delta = ntp_offset_chunk(time_offset); + time_offset -= delta; + tick_length += delta; + + /* Check PPS signal */ + pps_dec_valid(); + + if (!time_adjust) + goto out; + + if (time_adjust > MAX_TICKADJ) { + time_adjust -= MAX_TICKADJ; + tick_length += MAX_TICKADJ_SCALED; + goto out; + } + + if (time_adjust < -MAX_TICKADJ) { + time_adjust += MAX_TICKADJ; + tick_length -= MAX_TICKADJ_SCALED; + goto out; + } + + tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) + << NTP_SCALE_SHIFT; + time_adjust = 0; + +out: + return leap; +} + +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) +static void sync_hw_clock(struct work_struct *work); +static DECLARE_WORK(sync_work, sync_hw_clock); +static struct hrtimer sync_hrtimer; +#define SYNC_PERIOD_NS (11ULL * 60 * NSEC_PER_SEC) + +static enum hrtimer_restart sync_timer_callback(struct hrtimer *timer) +{ + queue_work(system_freezable_power_efficient_wq, &sync_work); + + return HRTIMER_NORESTART; +} + +static void sched_sync_hw_clock(unsigned long offset_nsec, bool retry) +{ + ktime_t exp = ktime_set(ktime_get_real_seconds(), 0); + + if (retry) + exp = ktime_add_ns(exp, 2ULL * NSEC_PER_SEC - offset_nsec); + else + exp = ktime_add_ns(exp, SYNC_PERIOD_NS - offset_nsec); + + hrtimer_start(&sync_hrtimer, exp, HRTIMER_MODE_ABS); +} + +/* + * Check whether @now is correct versus the required time to update the RTC + * and calculate the value which needs to be written to the RTC so that the + * next seconds increment of the RTC after the write is aligned with the next + * seconds increment of clock REALTIME. + * + * tsched t1 write(t2.tv_sec - 1sec)) t2 RTC increments seconds + * + * t2.tv_nsec == 0 + * tsched = t2 - set_offset_nsec + * newval = t2 - NSEC_PER_SEC + * + * ==> neval = tsched + set_offset_nsec - NSEC_PER_SEC + * + * As the execution of this code is not guaranteed to happen exactly at + * tsched this allows it to happen within a fuzzy region: + * + * abs(now - tsched) < FUZZ + * + * If @now is not inside the allowed window the function returns false. + */ +static inline bool rtc_tv_nsec_ok(unsigned long set_offset_nsec, + struct timespec64 *to_set, + const struct timespec64 *now) +{ + /* Allowed error in tv_nsec, arbitrarily set to 5 jiffies in ns. */ + const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5; + struct timespec64 delay = {.tv_sec = -1, + .tv_nsec = set_offset_nsec}; + + *to_set = timespec64_add(*now, delay); + + if (to_set->tv_nsec < TIME_SET_NSEC_FUZZ) { + to_set->tv_nsec = 0; + return true; + } + + if (to_set->tv_nsec > NSEC_PER_SEC - TIME_SET_NSEC_FUZZ) { + to_set->tv_sec++; + to_set->tv_nsec = 0; + return true; + } + return false; +} + +#ifdef CONFIG_GENERIC_CMOS_UPDATE +int __weak update_persistent_clock64(struct timespec64 now64) +{ + return -ENODEV; +} +#else +static inline int update_persistent_clock64(struct timespec64 now64) +{ + return -ENODEV; +} +#endif + +#ifdef CONFIG_RTC_SYSTOHC +/* Save NTP synchronized time to the RTC */ +static int update_rtc(struct timespec64 *to_set, unsigned long *offset_nsec) +{ + struct rtc_device *rtc; + struct rtc_time tm; + int err = -ENODEV; + + rtc = rtc_class_open(CONFIG_RTC_SYSTOHC_DEVICE); + if (!rtc) + return -ENODEV; + + if (!rtc->ops || !rtc->ops->set_time) + goto out_close; + + /* First call might not have the correct offset */ + if (*offset_nsec == rtc->set_offset_nsec) { + rtc_time64_to_tm(to_set->tv_sec, &tm); + err = rtc_set_time(rtc, &tm); + } else { + /* Store the update offset and let the caller try again */ + *offset_nsec = rtc->set_offset_nsec; + err = -EAGAIN; + } +out_close: + rtc_class_close(rtc); + return err; +} +#else +static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_nsec) +{ + return -ENODEV; +} +#endif + +/* + * If we have an externally synchronized Linux clock, then update RTC clock + * accordingly every ~11 minutes. Generally RTCs can only store second + * precision, but many RTCs will adjust the phase of their second tick to + * match the moment of update. This infrastructure arranges to call to the RTC + * set at the correct moment to phase synchronize the RTC second tick over + * with the kernel clock. + */ +static void sync_hw_clock(struct work_struct *work) +{ + /* + * The default synchronization offset is 500ms for the deprecated + * update_persistent_clock64() under the assumption that it uses + * the infamous CMOS clock (MC146818). + */ + static unsigned long offset_nsec = NSEC_PER_SEC / 2; + struct timespec64 now, to_set; + int res = -EAGAIN; + + /* + * Don't update if STA_UNSYNC is set and if ntp_notify_cmos_timer() + * managed to schedule the work between the timer firing and the + * work being able to rearm the timer. Wait for the timer to expire. + */ + if (!ntp_synced() || hrtimer_is_queued(&sync_hrtimer)) + return; + + ktime_get_real_ts64(&now); + /* If @now is not in the allowed window, try again */ + if (!rtc_tv_nsec_ok(offset_nsec, &to_set, &now)) + goto rearm; + + /* Take timezone adjusted RTCs into account */ + if (persistent_clock_is_local) + to_set.tv_sec -= (sys_tz.tz_minuteswest * 60); + + /* Try the legacy RTC first. */ + res = update_persistent_clock64(to_set); + if (res != -ENODEV) + goto rearm; + + /* Try the RTC class */ + res = update_rtc(&to_set, &offset_nsec); + if (res == -ENODEV) + return; +rearm: + sched_sync_hw_clock(offset_nsec, res != 0); +} + +void ntp_notify_cmos_timer(void) +{ + /* + * When the work is currently executed but has not yet the timer + * rearmed this queues the work immediately again. No big issue, + * just a pointless work scheduled. + */ + if (ntp_synced() && !hrtimer_is_queued(&sync_hrtimer)) + queue_work(system_freezable_power_efficient_wq, &sync_work); +} + +static void __init ntp_init_cmos_sync(void) +{ + hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS); + sync_hrtimer.function = sync_timer_callback; +} +#else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ +static inline void __init ntp_init_cmos_sync(void) { } +#endif /* !CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ + +/* + * Propagate a new txc->status value into the NTP state: + */ +static inline void process_adj_status(const struct __kernel_timex *txc) +{ + if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { + time_state = TIME_OK; + time_status = STA_UNSYNC; + ntp_next_leap_sec = TIME64_MAX; + /* restart PPS frequency calibration */ + pps_reset_freq_interval(); + } + + /* + * If we turn on PLL adjustments then reset the + * reference time to current time. + */ + if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) + time_reftime = __ktime_get_real_seconds(); + + /* only set allowed bits */ + time_status &= STA_RONLY; + time_status |= txc->status & ~STA_RONLY; +} + + +static inline void process_adjtimex_modes(const struct __kernel_timex *txc, + s32 *time_tai) +{ + if (txc->modes & ADJ_STATUS) + process_adj_status(txc); + + if (txc->modes & ADJ_NANO) + time_status |= STA_NANO; + + if (txc->modes & ADJ_MICRO) + time_status &= ~STA_NANO; + + if (txc->modes & ADJ_FREQUENCY) { + time_freq = txc->freq * PPM_SCALE; + time_freq = min(time_freq, MAXFREQ_SCALED); + time_freq = max(time_freq, -MAXFREQ_SCALED); + /* update pps_freq */ + pps_set_freq(time_freq); + } + + if (txc->modes & ADJ_MAXERROR) + time_maxerror = txc->maxerror; + + if (txc->modes & ADJ_ESTERROR) + time_esterror = txc->esterror; + + if (txc->modes & ADJ_TIMECONST) { + time_constant = txc->constant; + if (!(time_status & STA_NANO)) + time_constant += 4; + time_constant = min(time_constant, (long)MAXTC); + time_constant = max(time_constant, 0l); + } + + if (txc->modes & ADJ_TAI && + txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) + *time_tai = txc->constant; + + if (txc->modes & ADJ_OFFSET) + ntp_update_offset(txc->offset); + + if (txc->modes & ADJ_TICK) + tick_usec = txc->tick; + + if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) + ntp_update_frequency(); +} + + +/* + * adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. + */ +int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad) +{ + int result; + + if (txc->modes & ADJ_ADJTIME) { + long save_adjust = time_adjust; + + if (!(txc->modes & ADJ_OFFSET_READONLY)) { + /* adjtime() is independent from ntp_adjtime() */ + time_adjust = txc->offset; + ntp_update_frequency(); + + audit_ntp_set_old(ad, AUDIT_NTP_ADJUST, save_adjust); + audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, time_adjust); + } + txc->offset = save_adjust; + } else { + /* If there are input parameters, then process them: */ + if (txc->modes) { + audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, time_offset); + audit_ntp_set_old(ad, AUDIT_NTP_FREQ, time_freq); + audit_ntp_set_old(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_old(ad, AUDIT_NTP_TAI, *time_tai); + audit_ntp_set_old(ad, AUDIT_NTP_TICK, tick_usec); + + process_adjtimex_modes(txc, time_tai); + + audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, time_offset); + audit_ntp_set_new(ad, AUDIT_NTP_FREQ, time_freq); + audit_ntp_set_new(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_new(ad, AUDIT_NTP_TAI, *time_tai); + audit_ntp_set_new(ad, AUDIT_NTP_TICK, tick_usec); + } + + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, + NTP_SCALE_SHIFT); + if (!(time_status & STA_NANO)) + txc->offset = (u32)txc->offset / NSEC_PER_USEC; + } + + result = time_state; /* mostly `TIME_OK' */ + /* check for errors */ + if (is_error_status(time_status)) + result = TIME_ERROR; + + txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * + PPM_SCALE_INV, NTP_SCALE_SHIFT); + txc->maxerror = time_maxerror; + txc->esterror = time_esterror; + txc->status = time_status; + txc->constant = time_constant; + txc->precision = 1; + txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; + txc->tick = tick_usec; + txc->tai = *time_tai; + + /* fill PPS status fields */ + pps_fill_timex(txc); + + txc->time.tv_sec = ts->tv_sec; + txc->time.tv_usec = ts->tv_nsec; + if (!(time_status & STA_NANO)) + txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC; + + /* Handle leapsec adjustments */ + if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { + if ((time_state == TIME_INS) && (time_status & STA_INS)) { + result = TIME_OOP; + txc->tai++; + txc->time.tv_sec--; + } + if ((time_state == TIME_DEL) && (time_status & STA_DEL)) { + result = TIME_WAIT; + txc->tai--; + txc->time.tv_sec++; + } + if ((time_state == TIME_OOP) && + (ts->tv_sec == ntp_next_leap_sec)) { + result = TIME_WAIT; + } + } + + return result; +} + +#ifdef CONFIG_NTP_PPS + +/* actually struct pps_normtime is good old struct timespec, but it is + * semantically different (and it is the reason why it was invented): + * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] + * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ +struct pps_normtime { + s64 sec; /* seconds */ + long nsec; /* nanoseconds */ +}; + +/* normalize the timestamp so that nsec is in the + ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ +static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts) +{ + struct pps_normtime norm = { + .sec = ts.tv_sec, + .nsec = ts.tv_nsec + }; + + if (norm.nsec > (NSEC_PER_SEC >> 1)) { + norm.nsec -= NSEC_PER_SEC; + norm.sec++; + } + + return norm; +} + +/* get current phase correction and jitter */ +static inline long pps_phase_filter_get(long *jitter) +{ + *jitter = pps_tf[0] - pps_tf[1]; + if (*jitter < 0) + *jitter = -*jitter; + + /* TODO: test various filters */ + return pps_tf[0]; +} + +/* add the sample to the phase filter */ +static inline void pps_phase_filter_add(long err) +{ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = err; +} + +/* decrease frequency calibration interval length. + * It is halved after four consecutive unstable intervals. + */ +static inline void pps_dec_freq_interval(void) +{ + if (--pps_intcnt <= -PPS_INTCOUNT) { + pps_intcnt = -PPS_INTCOUNT; + if (pps_shift > PPS_INTMIN) { + pps_shift--; + pps_intcnt = 0; + } + } +} + +/* increase frequency calibration interval length. + * It is doubled after four consecutive stable intervals. + */ +static inline void pps_inc_freq_interval(void) +{ + if (++pps_intcnt >= PPS_INTCOUNT) { + pps_intcnt = PPS_INTCOUNT; + if (pps_shift < PPS_INTMAX) { + pps_shift++; + pps_intcnt = 0; + } + } +} + +/* update clock frequency based on MONOTONIC_RAW clock PPS signal + * timestamps + * + * At the end of the calibration interval the difference between the + * first and last MONOTONIC_RAW clock timestamps divided by the length + * of the interval becomes the frequency update. If the interval was + * too long, the data are discarded. + * Returns the difference between old and new frequency values. + */ +static long hardpps_update_freq(struct pps_normtime freq_norm) +{ + long delta, delta_mod; + s64 ftemp; + + /* check if the frequency interval was too long */ + if (freq_norm.sec > (2 << pps_shift)) { + time_status |= STA_PPSERROR; + pps_errcnt++; + pps_dec_freq_interval(); + printk_deferred(KERN_ERR + "hardpps: PPSERROR: interval too long - %lld s\n", + freq_norm.sec); + return 0; + } + + /* here the raw frequency offset and wander (stability) is + * calculated. If the wander is less than the wander threshold + * the interval is increased; otherwise it is decreased. + */ + ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, + freq_norm.sec); + delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); + pps_freq = ftemp; + if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { + printk_deferred(KERN_WARNING + "hardpps: PPSWANDER: change=%ld\n", delta); + time_status |= STA_PPSWANDER; + pps_stbcnt++; + pps_dec_freq_interval(); + } else { /* good sample */ + pps_inc_freq_interval(); + } + + /* the stability metric is calculated as the average of recent + * frequency changes, but is used only for performance + * monitoring + */ + delta_mod = delta; + if (delta_mod < 0) + delta_mod = -delta_mod; + pps_stabil += (div_s64(((s64)delta_mod) << + (NTP_SCALE_SHIFT - SHIFT_USEC), + NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; + + /* if enabled, the system clock frequency is updated */ + if ((time_status & STA_PPSFREQ) != 0 && + (time_status & STA_FREQHOLD) == 0) { + time_freq = pps_freq; + ntp_update_frequency(); + } + + return delta; +} + +/* correct REALTIME clock phase error against PPS signal */ +static void hardpps_update_phase(long error) +{ + long correction = -error; + long jitter; + + /* add the sample to the median filter */ + pps_phase_filter_add(correction); + correction = pps_phase_filter_get(&jitter); + + /* Nominal jitter is due to PPS signal noise. If it exceeds the + * threshold, the sample is discarded; otherwise, if so enabled, + * the time offset is updated. + */ + if (jitter > (pps_jitter << PPS_POPCORN)) { + printk_deferred(KERN_WARNING + "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (pps_jitter << PPS_POPCORN)); + time_status |= STA_PPSJITTER; + pps_jitcnt++; + } else if (time_status & STA_PPSTIME) { + /* correct the time using the phase offset */ + time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, + NTP_INTERVAL_FREQ); + /* cancel running adjtime() */ + time_adjust = 0; + } + /* update jitter */ + pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; +} + +/* + * __hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS signal arrival in order to + * discipline the CPU clock oscillator to the PPS signal. It takes two + * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former + * is used to correct clock phase error and the latter is used to + * correct the frequency. + * + * This code is based on David Mills's reference nanokernel + * implementation. It was mostly rewritten but keeps the same idea. + */ +void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) +{ + struct pps_normtime pts_norm, freq_norm; + + pts_norm = pps_normalize_ts(*phase_ts); + + /* clear the error bits, they will be set again if needed */ + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + + /* indicate signal presence */ + time_status |= STA_PPSSIGNAL; + pps_valid = PPS_VALID; + + /* when called for the first time, + * just start the frequency interval */ + if (unlikely(pps_fbase.tv_sec == 0)) { + pps_fbase = *raw_ts; + return; + } + + /* ok, now we have a base for frequency calculation */ + freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase)); + + /* check that the signal is in the range + * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ + if ((freq_norm.sec == 0) || + (freq_norm.nsec > MAXFREQ * freq_norm.sec) || + (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { + time_status |= STA_PPSJITTER; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); + return; + } + + /* signal is ok */ + + /* check if the current frequency interval is finished */ + if (freq_norm.sec >= (1 << pps_shift)) { + pps_calcnt++; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + hardpps_update_freq(freq_norm); + } + + hardpps_update_phase(pts_norm.nsec); + +} +#endif /* CONFIG_NTP_PPS */ + +static int __init ntp_tick_adj_setup(char *str) +{ + int rc = kstrtos64(str, 0, &ntp_tick_adj); + if (rc) + return rc; + + ntp_tick_adj <<= NTP_SCALE_SHIFT; + return 1; +} + +__setup("ntp_tick_adj=", ntp_tick_adj_setup); + +void __init ntp_init(void) +{ + ntp_clear(); + ntp_init_cmos_sync(); +} diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h new file mode 100644 index 0000000000..23d1b74c30 --- /dev/null +++ b/kernel/time/ntp_internal.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NTP_INTERNAL_H +#define _LINUX_NTP_INTERNAL_H + +extern void ntp_init(void); +extern void ntp_clear(void); +/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ +extern u64 ntp_tick_length(void); +extern ktime_t ntp_get_next_leap(void); +extern int second_overflow(time64_t secs); +extern int __do_adjtimex(struct __kernel_timex *txc, + const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad); +extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); + +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) +extern void ntp_notify_cmos_timer(void); +#else +static inline void ntp_notify_cmos_timer(void) { } +#endif + +#endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c new file mode 100644 index 0000000000..77c0c2370b --- /dev/null +++ b/kernel/time/posix-clock.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Support for dynamic clock devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + */ +#include <linux/device.h> +#include <linux/export.h> +#include <linux/file.h> +#include <linux/posix-clock.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/uaccess.h> + +#include "posix-timers.h" + +/* + * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. + */ +static struct posix_clock *get_posix_clock(struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + + down_read(&clk->rwsem); + + if (!clk->zombie) + return clk; + + up_read(&clk->rwsem); + + return NULL; +} + +static void put_posix_clock(struct posix_clock *clk) +{ + up_read(&clk->rwsem); +} + +static ssize_t posix_clock_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -EINVAL; + + if (!clk) + return -ENODEV; + + if (clk->ops.read) + err = clk->ops.read(clk, fp->f_flags, buf, count); + + put_posix_clock(clk); + + return err; +} + +static __poll_t posix_clock_poll(struct file *fp, poll_table *wait) +{ + struct posix_clock *clk = get_posix_clock(fp); + __poll_t result = 0; + + if (!clk) + return EPOLLERR; + + if (clk->ops.poll) + result = clk->ops.poll(clk, fp, wait); + + put_posix_clock(clk); + + return result; +} + +static long posix_clock_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} + +#ifdef CONFIG_COMPAT +static long posix_clock_compat_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} +#endif + +static int posix_clock_open(struct inode *inode, struct file *fp) +{ + int err; + struct posix_clock *clk = + container_of(inode->i_cdev, struct posix_clock, cdev); + + down_read(&clk->rwsem); + + if (clk->zombie) { + err = -ENODEV; + goto out; + } + if (clk->ops.open) + err = clk->ops.open(clk, fp->f_mode); + else + err = 0; + + if (!err) { + get_device(clk->dev); + fp->private_data = clk; + } +out: + up_read(&clk->rwsem); + return err; +} + +static int posix_clock_release(struct inode *inode, struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + int err = 0; + + if (clk->ops.release) + err = clk->ops.release(clk); + + put_device(clk->dev); + + fp->private_data = NULL; + + return err; +} + +static const struct file_operations posix_clock_file_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = posix_clock_read, + .poll = posix_clock_poll, + .unlocked_ioctl = posix_clock_ioctl, + .open = posix_clock_open, + .release = posix_clock_release, +#ifdef CONFIG_COMPAT + .compat_ioctl = posix_clock_compat_ioctl, +#endif +}; + +int posix_clock_register(struct posix_clock *clk, struct device *dev) +{ + int err; + + init_rwsem(&clk->rwsem); + + cdev_init(&clk->cdev, &posix_clock_file_operations); + err = cdev_device_add(&clk->cdev, dev); + if (err) { + pr_err("%s unable to add device %d:%d\n", + dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt)); + return err; + } + clk->cdev.owner = clk->ops.owner; + clk->dev = dev; + + return 0; +} +EXPORT_SYMBOL_GPL(posix_clock_register); + +void posix_clock_unregister(struct posix_clock *clk) +{ + cdev_device_del(&clk->cdev, clk->dev); + + down_write(&clk->rwsem); + clk->zombie = true; + up_write(&clk->rwsem); + + put_device(clk->dev); +} +EXPORT_SYMBOL_GPL(posix_clock_unregister); + +struct posix_clock_desc { + struct file *fp; + struct posix_clock *clk; +}; + +static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) +{ + struct file *fp = fget(clockid_to_fd(id)); + int err = -EINVAL; + + if (!fp) + return err; + + if (fp->f_op->open != posix_clock_open || !fp->private_data) + goto out; + + cd->fp = fp; + cd->clk = get_posix_clock(fp); + + err = cd->clk ? 0 : -ENODEV; +out: + if (err) + fput(fp); + return err; +} + +static void put_clock_desc(struct posix_clock_desc *cd) +{ + put_posix_clock(cd->clk); + fput(cd->fp); +} + +static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + + if (cd.clk->ops.clock_adjtime) + err = cd.clk->ops.clock_adjtime(cd.clk, tx); + else + err = -EOPNOTSUPP; +out: + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_gettime(clockid_t id, struct timespec64 *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_gettime) + err = cd.clk->ops.clock_gettime(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_getres(clockid_t id, struct timespec64 *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_getres) + err = cd.clk->ops.clock_getres(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + + if (cd.clk->ops.clock_settime) + err = cd.clk->ops.clock_settime(cd.clk, ts); + else + err = -EOPNOTSUPP; +out: + put_clock_desc(&cd); + + return err; +} + +const struct k_clock clock_posix_dynamic = { + .clock_getres = pc_clock_getres, + .clock_set = pc_clock_settime, + .clock_get_timespec = pc_clock_gettime, + .clock_adj = pc_clock_adjtime, +}; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c new file mode 100644 index 0000000000..e9c6f9d0e4 --- /dev/null +++ b/kernel/time/posix-cpu-timers.c @@ -0,0 +1,1692 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement CPU time clocks for the POSIX clock interface. + */ + +#include <linux/sched/signal.h> +#include <linux/sched/cputime.h> +#include <linux/posix-timers.h> +#include <linux/errno.h> +#include <linux/math64.h> +#include <linux/uaccess.h> +#include <linux/kernel_stat.h> +#include <trace/events/timer.h> +#include <linux/tick.h> +#include <linux/workqueue.h> +#include <linux/compat.h> +#include <linux/sched/deadline.h> +#include <linux/task_work.h> + +#include "posix-timers.h" + +static void posix_cpu_timer_rearm(struct k_itimer *timer); + +void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) +{ + posix_cputimers_init(pct); + if (cpu_limit != RLIM_INFINITY) { + pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC; + pct->timers_active = true; + } +} + +/* + * Called after updating RLIMIT_CPU to run cpu timer and update + * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if + * necessary. Needs siglock protection since other code may update the + * expiration cache as well. + * + * Returns 0 on success, -ESRCH on failure. Can fail if the task is exiting and + * we cannot lock_task_sighand. Cannot fail if task is current. + */ +int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) +{ + u64 nsecs = rlim_new * NSEC_PER_SEC; + unsigned long irq_fl; + + if (!lock_task_sighand(task, &irq_fl)) + return -ESRCH; + set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL); + unlock_task_sighand(task, &irq_fl); + return 0; +} + +/* + * Functions for validating access to tasks. + */ +static struct pid *pid_for_clock(const clockid_t clock, bool gettime) +{ + const bool thread = !!CPUCLOCK_PERTHREAD(clock); + const pid_t upid = CPUCLOCK_PID(clock); + struct pid *pid; + + if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) + return NULL; + + /* + * If the encoded PID is 0, then the timer is targeted at current + * or the process to which current belongs. + */ + if (upid == 0) + return thread ? task_pid(current) : task_tgid(current); + + pid = find_vpid(upid); + if (!pid) + return NULL; + + if (thread) { + struct task_struct *tsk = pid_task(pid, PIDTYPE_PID); + return (tsk && same_thread_group(tsk, current)) ? pid : NULL; + } + + /* + * For clock_gettime(PROCESS) allow finding the process by + * with the pid of the current task. The code needs the tgid + * of the process so that pid_task(pid, PIDTYPE_TGID) can be + * used to find the process. + */ + if (gettime && (pid == task_pid(current))) + return task_tgid(current); + + /* + * For processes require that pid identifies a process. + */ + return pid_has_task(pid, PIDTYPE_TGID) ? pid : NULL; +} + +static inline int validate_clock_permissions(const clockid_t clock) +{ + int ret; + + rcu_read_lock(); + ret = pid_for_clock(clock, false) ? 0 : -EINVAL; + rcu_read_unlock(); + + return ret; +} + +static inline enum pid_type clock_pid_type(const clockid_t clock) +{ + return CPUCLOCK_PERTHREAD(clock) ? PIDTYPE_PID : PIDTYPE_TGID; +} + +static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer) +{ + return pid_task(timer->it.cpu.pid, clock_pid_type(timer->it_clock)); +} + +/* + * Update expiry time from increment, and increase overrun count, + * given the current clock sample. + */ +static u64 bump_cpu_timer(struct k_itimer *timer, u64 now) +{ + u64 delta, incr, expires = timer->it.cpu.node.expires; + int i; + + if (!timer->it_interval) + return expires; + + if (now < expires) + return expires; + + incr = timer->it_interval; + delta = now + incr - expires; + + /* Don't use (incr*2 < delta), incr*2 might overflow. */ + for (i = 0; incr < delta - incr; i++) + incr = incr << 1; + + for (; i >= 0; incr >>= 1, i--) { + if (delta < incr) + continue; + + timer->it.cpu.node.expires += incr; + timer->it_overrun += 1LL << i; + delta -= incr; + } + return timer->it.cpu.node.expires; +} + +/* Check whether all cache entries contain U64_MAX, i.e. eternal expiry time */ +static inline bool expiry_cache_is_inactive(const struct posix_cputimers *pct) +{ + return !(~pct->bases[CPUCLOCK_PROF].nextevt | + ~pct->bases[CPUCLOCK_VIRT].nextevt | + ~pct->bases[CPUCLOCK_SCHED].nextevt); +} + +static int +posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) +{ + int error = validate_clock_permissions(which_clock); + + if (!error) { + tp->tv_sec = 0; + tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + /* + * If sched_clock is using a cycle counter, we + * don't have any idea of its true resolution + * exported, but it is much more than 1s/HZ. + */ + tp->tv_nsec = 1; + } + } + return error; +} + +static int +posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp) +{ + int error = validate_clock_permissions(clock); + + /* + * You can never reset a CPU clock, but we check for other errors + * in the call before failing with EPERM. + */ + return error ? : -EPERM; +} + +/* + * Sample a per-thread clock for the given task. clkid is validated. + */ +static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p) +{ + u64 utime, stime; + + if (clkid == CPUCLOCK_SCHED) + return task_sched_runtime(p); + + task_cputime(p, &utime, &stime); + + switch (clkid) { + case CPUCLOCK_PROF: + return utime + stime; + case CPUCLOCK_VIRT: + return utime; + default: + WARN_ON_ONCE(1); + } + return 0; +} + +static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime) +{ + samples[CPUCLOCK_PROF] = stime + utime; + samples[CPUCLOCK_VIRT] = utime; + samples[CPUCLOCK_SCHED] = rtime; +} + +static void task_sample_cputime(struct task_struct *p, u64 *samples) +{ + u64 stime, utime; + + task_cputime(p, &utime, &stime); + store_samples(samples, stime, utime, p->se.sum_exec_runtime); +} + +static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, + u64 *samples) +{ + u64 stime, utime, rtime; + + utime = atomic64_read(&at->utime); + stime = atomic64_read(&at->stime); + rtime = atomic64_read(&at->sum_exec_runtime); + store_samples(samples, stime, utime, rtime); +} + +/* + * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg + * to avoid race conditions with concurrent updates to cputime. + */ +static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime) +{ + u64 curr_cputime = atomic64_read(cputime); + + do { + if (sum_cputime <= curr_cputime) + return; + } while (!atomic64_try_cmpxchg(cputime, &curr_cputime, sum_cputime)); +} + +static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, + struct task_cputime *sum) +{ + __update_gt_cputime(&cputime_atomic->utime, sum->utime); + __update_gt_cputime(&cputime_atomic->stime, sum->stime); + __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime); +} + +/** + * thread_group_sample_cputime - Sample cputime for a given task + * @tsk: Task for which cputime needs to be started + * @samples: Storage for time samples + * + * Called from sys_getitimer() to calculate the expiry time of an active + * timer. That means group cputime accounting is already active. Called + * with task sighand lock held. + * + * Updates @times with an uptodate sample of the thread group cputimes. + */ +void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct posix_cputimers *pct = &tsk->signal->posix_cputimers; + + WARN_ON_ONCE(!pct->timers_active); + + proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); +} + +/** + * thread_group_start_cputime - Start cputime and return a sample + * @tsk: Task for which cputime needs to be started + * @samples: Storage for time samples + * + * The thread group cputime accounting is avoided when there are no posix + * CPU timers armed. Before starting a timer it's required to check whether + * the time accounting is active. If not, a full update of the atomic + * accounting store needs to be done and the accounting enabled. + * + * Updates @times with an uptodate sample of the thread group cputimes. + */ +static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct posix_cputimers *pct = &tsk->signal->posix_cputimers; + + lockdep_assert_task_sighand_held(tsk); + + /* Check if cputimer isn't running. This is accessed without locking. */ + if (!READ_ONCE(pct->timers_active)) { + struct task_cputime sum; + + /* + * The POSIX timer interface allows for absolute time expiry + * values through the TIMER_ABSTIME flag, therefore we have + * to synchronize the timer to the clock every time we start it. + */ + thread_group_cputime(tsk, &sum); + update_gt_cputime(&cputimer->cputime_atomic, &sum); + + /* + * We're setting timers_active without a lock. Ensure this + * only gets written to in one operation. We set it after + * update_gt_cputime() as a small optimization, but + * barriers are not required because update_gt_cputime() + * can handle concurrent updates. + */ + WRITE_ONCE(pct->timers_active, true); + } + proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); +} + +static void __thread_group_cputime(struct task_struct *tsk, u64 *samples) +{ + struct task_cputime ct; + + thread_group_cputime(tsk, &ct); + store_samples(samples, ct.stime, ct.utime, ct.sum_exec_runtime); +} + +/* + * Sample a process (thread group) clock for the given task clkid. If the + * group's cputime accounting is already enabled, read the atomic + * store. Otherwise a full update is required. clkid is already validated. + */ +static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, + bool start) +{ + struct thread_group_cputimer *cputimer = &p->signal->cputimer; + struct posix_cputimers *pct = &p->signal->posix_cputimers; + u64 samples[CPUCLOCK_MAX]; + + if (!READ_ONCE(pct->timers_active)) { + if (start) + thread_group_start_cputime(p, samples); + else + __thread_group_cputime(p, samples); + } else { + proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); + } + + return samples[clkid]; +} + +static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) +{ + const clockid_t clkid = CPUCLOCK_WHICH(clock); + struct task_struct *tsk; + u64 t; + + rcu_read_lock(); + tsk = pid_task(pid_for_clock(clock, true), clock_pid_type(clock)); + if (!tsk) { + rcu_read_unlock(); + return -EINVAL; + } + + if (CPUCLOCK_PERTHREAD(clock)) + t = cpu_clock_sample(clkid, tsk); + else + t = cpu_clock_sample_group(clkid, tsk, false); + rcu_read_unlock(); + + *tp = ns_to_timespec64(t); + return 0; +} + +/* + * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. + * This is called from sys_timer_create() and do_cpu_nanosleep() with the + * new timer already all-zeros initialized. + */ +static int posix_cpu_timer_create(struct k_itimer *new_timer) +{ + static struct lock_class_key posix_cpu_timers_key; + struct pid *pid; + + rcu_read_lock(); + pid = pid_for_clock(new_timer->it_clock, false); + if (!pid) { + rcu_read_unlock(); + return -EINVAL; + } + + /* + * If posix timer expiry is handled in task work context then + * timer::it_lock can be taken without disabling interrupts as all + * other locking happens in task context. This requires a separate + * lock class key otherwise regular posix timer expiry would record + * the lock class being taken in interrupt context and generate a + * false positive warning. + */ + if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK)) + lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key); + + new_timer->kclock = &clock_posix_cpu; + timerqueue_init(&new_timer->it.cpu.node); + new_timer->it.cpu.pid = get_pid(pid); + rcu_read_unlock(); + return 0; +} + +static struct posix_cputimer_base *timer_base(struct k_itimer *timer, + struct task_struct *tsk) +{ + int clkidx = CPUCLOCK_WHICH(timer->it_clock); + + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + return tsk->posix_cputimers.bases + clkidx; + else + return tsk->signal->posix_cputimers.bases + clkidx; +} + +/* + * Force recalculating the base earliest expiration on the next tick. + * This will also re-evaluate the need to keep around the process wide + * cputime counter and tick dependency and eventually shut these down + * if necessary. + */ +static void trigger_base_recalc_expires(struct k_itimer *timer, + struct task_struct *tsk) +{ + struct posix_cputimer_base *base = timer_base(timer, tsk); + + base->nextevt = 0; +} + +/* + * Dequeue the timer and reset the base if it was its earliest expiration. + * It makes sure the next tick recalculates the base next expiration so we + * don't keep the costly process wide cputime counter around for a random + * amount of time, along with the tick dependency. + * + * If another timer gets queued between this and the next tick, its + * expiration will update the base next event if necessary on the next + * tick. + */ +static void disarm_timer(struct k_itimer *timer, struct task_struct *p) +{ + struct cpu_timer *ctmr = &timer->it.cpu; + struct posix_cputimer_base *base; + + if (!cpu_timer_dequeue(ctmr)) + return; + + base = timer_base(timer, p); + if (cpu_timer_getexpires(ctmr) == base->nextevt) + trigger_base_recalc_expires(timer, p); +} + + +/* + * Clean up a CPU-clock timer that is about to be destroyed. + * This is called from timer deletion with the timer already locked. + * If we return TIMER_RETRY, it's necessary to release the timer's lock + * and try again. (This happens when the timer is in the middle of firing.) + */ +static int posix_cpu_timer_del(struct k_itimer *timer) +{ + struct cpu_timer *ctmr = &timer->it.cpu; + struct sighand_struct *sighand; + struct task_struct *p; + unsigned long flags; + int ret = 0; + + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) + goto out; + + /* + * Protect against sighand release/switch in exit/exec and process/ + * thread timer list entry concurrent read/writes. + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { + /* + * This raced with the reaping of the task. The exit cleanup + * should have removed this timer from the timer queue. + */ + WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node)); + } else { + if (timer->it.cpu.firing) + ret = TIMER_RETRY; + else + disarm_timer(timer, p); + + unlock_task_sighand(p, &flags); + } + +out: + rcu_read_unlock(); + if (!ret) + put_pid(ctmr->pid); + + return ret; +} + +static void cleanup_timerqueue(struct timerqueue_head *head) +{ + struct timerqueue_node *node; + struct cpu_timer *ctmr; + + while ((node = timerqueue_getnext(head))) { + timerqueue_del(head, node); + ctmr = container_of(node, struct cpu_timer, node); + ctmr->head = NULL; + } +} + +/* + * Clean out CPU timers which are still armed when a thread exits. The + * timers are only removed from the list. No other updates are done. The + * corresponding posix timers are still accessible, but cannot be rearmed. + * + * This must be called with the siglock held. + */ +static void cleanup_timers(struct posix_cputimers *pct) +{ + cleanup_timerqueue(&pct->bases[CPUCLOCK_PROF].tqhead); + cleanup_timerqueue(&pct->bases[CPUCLOCK_VIRT].tqhead); + cleanup_timerqueue(&pct->bases[CPUCLOCK_SCHED].tqhead); +} + +/* + * These are both called with the siglock held, when the current thread + * is being reaped. When the final (leader) thread in the group is reaped, + * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit. + */ +void posix_cpu_timers_exit(struct task_struct *tsk) +{ + cleanup_timers(&tsk->posix_cputimers); +} +void posix_cpu_timers_exit_group(struct task_struct *tsk) +{ + cleanup_timers(&tsk->signal->posix_cputimers); +} + +/* + * Insert the timer on the appropriate list before any timers that + * expire later. This must be called with the sighand lock held. + */ +static void arm_timer(struct k_itimer *timer, struct task_struct *p) +{ + struct posix_cputimer_base *base = timer_base(timer, p); + struct cpu_timer *ctmr = &timer->it.cpu; + u64 newexp = cpu_timer_getexpires(ctmr); + + if (!cpu_timer_enqueue(&base->tqhead, ctmr)) + return; + + /* + * We are the new earliest-expiring POSIX 1.b timer, hence + * need to update expiration cache. Take into account that + * for process timers we share expiration cache with itimers + * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. + */ + if (newexp < base->nextevt) + base->nextevt = newexp; + + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); + else + tick_dep_set_signal(p, TICK_DEP_BIT_POSIX_TIMER); +} + +/* + * The timer is locked, fire it and arrange for its reload. + */ +static void cpu_timer_fire(struct k_itimer *timer) +{ + struct cpu_timer *ctmr = &timer->it.cpu; + + if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { + /* + * User don't want any signal. + */ + cpu_timer_setexpires(ctmr, 0); + } else if (unlikely(timer->sigq == NULL)) { + /* + * This a special case for clock_nanosleep, + * not a normal timer from sys_timer_create. + */ + wake_up_process(timer->it_process); + cpu_timer_setexpires(ctmr, 0); + } else if (!timer->it_interval) { + /* + * One-shot timer. Clear it as soon as it's fired. + */ + posix_timer_event(timer, 0); + cpu_timer_setexpires(ctmr, 0); + } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { + /* + * The signal did not get queued because the signal + * was ignored, so we won't get any callback to + * reload the timer. But we need to keep it + * ticking in case the signal is deliverable next time. + */ + posix_cpu_timer_rearm(timer); + ++timer->it_requeue_pending; + } +} + +/* + * Guts of sys_timer_settime for CPU timers. + * This is called with the timer locked and interrupts disabled. + * If we return TIMER_RETRY, it's necessary to release the timer's lock + * and try again. (This happens when the timer is in the middle of firing.) + */ +static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, + struct itimerspec64 *new, struct itimerspec64 *old) +{ + clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); + u64 old_expires, new_expires, old_incr, val; + struct cpu_timer *ctmr = &timer->it.cpu; + struct sighand_struct *sighand; + struct task_struct *p; + unsigned long flags; + int ret = 0; + + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) { + /* + * If p has just been reaped, we can no + * longer get any information about it at all. + */ + rcu_read_unlock(); + return -ESRCH; + } + + /* + * Use the to_ktime conversion because that clamps the maximum + * value to KTIME_MAX and avoid multiplication overflows. + */ + new_expires = ktime_to_ns(timespec64_to_ktime(new->it_value)); + + /* + * Protect against sighand release/switch in exit/exec and p->cpu_timers + * and p->signal->cpu_timers read/write in arm_timer() + */ + sighand = lock_task_sighand(p, &flags); + /* + * If p has just been reaped, we can no + * longer get any information about it at all. + */ + if (unlikely(sighand == NULL)) { + rcu_read_unlock(); + return -ESRCH; + } + + /* + * Disarm any old timer after extracting its expiry time. + */ + old_incr = timer->it_interval; + old_expires = cpu_timer_getexpires(ctmr); + + if (unlikely(timer->it.cpu.firing)) { + timer->it.cpu.firing = -1; + ret = TIMER_RETRY; + } else { + cpu_timer_dequeue(ctmr); + } + + /* + * We need to sample the current value to convert the new + * value from to relative and absolute, and to convert the + * old value from absolute to relative. To set a process + * timer, we need a sample to balance the thread expiry + * times (in arm_timer). With an absolute time, we must + * check if it's already passed. In short, we need a sample. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + val = cpu_clock_sample(clkid, p); + else + val = cpu_clock_sample_group(clkid, p, true); + + if (old) { + if (old_expires == 0) { + old->it_value.tv_sec = 0; + old->it_value.tv_nsec = 0; + } else { + /* + * Update the timer in case it has overrun already. + * If it has, we'll report it as having overrun and + * with the next reloaded timer already ticking, + * though we are swallowing that pending + * notification here to install the new setting. + */ + u64 exp = bump_cpu_timer(timer, val); + + if (val < exp) { + old_expires = exp - val; + old->it_value = ns_to_timespec64(old_expires); + } else { + old->it_value.tv_nsec = 1; + old->it_value.tv_sec = 0; + } + } + } + + if (unlikely(ret)) { + /* + * We are colliding with the timer actually firing. + * Punt after filling in the timer's old value, and + * disable this firing since we are already reporting + * it as an overrun (thanks to bump_cpu_timer above). + */ + unlock_task_sighand(p, &flags); + goto out; + } + + if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) { + new_expires += val; + } + + /* + * Install the new expiry time (or zero). + * For a timer with no notification action, we don't actually + * arm the timer (we'll just fake it for timer_gettime). + */ + cpu_timer_setexpires(ctmr, new_expires); + if (new_expires != 0 && val < new_expires) { + arm_timer(timer, p); + } + + unlock_task_sighand(p, &flags); + /* + * Install the new reload setting, and + * set up the signal and overrun bookkeeping. + */ + timer->it_interval = timespec64_to_ktime(new->it_interval); + + /* + * This acts as a modification timestamp for the timer, + * so any automatic reload attempt will punt on seeing + * that we have reset the timer manually. + */ + timer->it_requeue_pending = (timer->it_requeue_pending + 2) & + ~REQUEUE_PENDING; + timer->it_overrun_last = 0; + timer->it_overrun = -1; + + if (val >= new_expires) { + if (new_expires != 0) { + /* + * The designated time already passed, so we notify + * immediately, even if the thread never runs to + * accumulate more time on this clock. + */ + cpu_timer_fire(timer); + } + + /* + * Make sure we don't keep around the process wide cputime + * counter or the tick dependency if they are not necessary. + */ + sighand = lock_task_sighand(p, &flags); + if (!sighand) + goto out; + + if (!cpu_timer_queued(ctmr)) + trigger_base_recalc_expires(timer, p); + + unlock_task_sighand(p, &flags); + } + out: + rcu_read_unlock(); + if (old) + old->it_interval = ns_to_timespec64(old_incr); + + return ret; +} + +static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) +{ + clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); + struct cpu_timer *ctmr = &timer->it.cpu; + u64 now, expires = cpu_timer_getexpires(ctmr); + struct task_struct *p; + + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) + goto out; + + /* + * Easy part: convert the reload time. + */ + itp->it_interval = ktime_to_timespec64(timer->it_interval); + + if (!expires) + goto out; + + /* + * Sample the clock to take the difference with the expiry time. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + now = cpu_clock_sample(clkid, p); + else + now = cpu_clock_sample_group(clkid, p, false); + + if (now < expires) { + itp->it_value = ns_to_timespec64(expires - now); + } else { + /* + * The timer should have expired already, but the firing + * hasn't taken place yet. Say it's just about to expire. + */ + itp->it_value.tv_nsec = 1; + itp->it_value.tv_sec = 0; + } +out: + rcu_read_unlock(); +} + +#define MAX_COLLECTED 20 + +static u64 collect_timerqueue(struct timerqueue_head *head, + struct list_head *firing, u64 now) +{ + struct timerqueue_node *next; + int i = 0; + + while ((next = timerqueue_getnext(head))) { + struct cpu_timer *ctmr; + u64 expires; + + ctmr = container_of(next, struct cpu_timer, node); + expires = cpu_timer_getexpires(ctmr); + /* Limit the number of timers to expire at once */ + if (++i == MAX_COLLECTED || now < expires) + return expires; + + ctmr->firing = 1; + /* See posix_cpu_timer_wait_running() */ + rcu_assign_pointer(ctmr->handling, current); + cpu_timer_dequeue(ctmr); + list_add_tail(&ctmr->elist, firing); + } + + return U64_MAX; +} + +static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + struct list_head *firing) +{ + struct posix_cputimer_base *base = pct->bases; + int i; + + for (i = 0; i < CPUCLOCK_MAX; i++, base++) { + base->nextevt = collect_timerqueue(&base->tqhead, firing, + samples[i]); + } +} + +static inline void check_dl_overrun(struct task_struct *tsk) +{ + if (tsk->dl.dl_overrun) { + tsk->dl.dl_overrun = 0; + send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID); + } +} + +static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) +{ + if (time < limit) + return false; + + if (print_fatal_signals) { + pr_info("%s Watchdog Timeout (%s): %s[%d]\n", + rt ? "RT" : "CPU", hard ? "hard" : "soft", + current->comm, task_pid_nr(current)); + } + send_signal_locked(signo, SEND_SIG_PRIV, current, PIDTYPE_TGID); + return true; +} + +/* + * Check for any per-thread CPU timers that have fired and move them off + * the tsk->cpu_timers[N] list onto the firing list. Here we update the + * tsk->it_*_expires values to reflect the remaining thread CPU timers. + */ +static void check_thread_timers(struct task_struct *tsk, + struct list_head *firing) +{ + struct posix_cputimers *pct = &tsk->posix_cputimers; + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + + if (dl_task(tsk)) + check_dl_overrun(tsk); + + if (expiry_cache_is_inactive(pct)) + return; + + task_sample_cputime(tsk, samples); + collect_posix_cputimers(pct, samples, firing); + + /* + * Check for the special case thread timers. + */ + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ + unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. */ + if (hard != RLIM_INFINITY && + check_rlimit(rttime, hard, SIGKILL, true, true)) + return; + + /* At the soft limit, send a SIGXCPU every second */ + if (check_rlimit(rttime, soft, SIGXCPU, true, false)) { + soft += USEC_PER_SEC; + tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft; + } + } + + if (expiry_cache_is_inactive(pct)) + tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); +} + +static inline void stop_process_timers(struct signal_struct *sig) +{ + struct posix_cputimers *pct = &sig->posix_cputimers; + + /* Turn off the active flag. This is done without locking. */ + WRITE_ONCE(pct->timers_active, false); + tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); +} + +static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, + u64 *expires, u64 cur_time, int signo) +{ + if (!it->expires) + return; + + if (cur_time >= it->expires) { + if (it->incr) + it->expires += it->incr; + else + it->expires = 0; + + trace_itimer_expire(signo == SIGPROF ? + ITIMER_PROF : ITIMER_VIRTUAL, + task_tgid(tsk), cur_time); + send_signal_locked(signo, SEND_SIG_PRIV, tsk, PIDTYPE_TGID); + } + + if (it->expires && it->expires < *expires) + *expires = it->expires; +} + +/* + * Check for any per-thread CPU timers that have fired and move them + * off the tsk->*_timers list onto the firing list. Per-thread timers + * have already been taken off. + */ +static void check_process_timers(struct task_struct *tsk, + struct list_head *firing) +{ + struct signal_struct *const sig = tsk->signal; + struct posix_cputimers *pct = &sig->posix_cputimers; + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + + /* + * If there are no active process wide timers (POSIX 1.b, itimers, + * RLIMIT_CPU) nothing to check. Also skip the process wide timer + * processing when there is already another task handling them. + */ + if (!READ_ONCE(pct->timers_active) || pct->expiry_active) + return; + + /* + * Signify that a thread is checking for process timers. + * Write access to this field is protected by the sighand lock. + */ + pct->expiry_active = true; + + /* + * Collect the current process totals. Group accounting is active + * so the sample can be taken directly. + */ + proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples); + collect_posix_cputimers(pct, samples, firing); + + /* + * Check for the special case process timers. + */ + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], + &pct->bases[CPUCLOCK_PROF].nextevt, + samples[CPUCLOCK_PROF], SIGPROF); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], + &pct->bases[CPUCLOCK_VIRT].nextevt, + samples[CPUCLOCK_VIRT], SIGVTALRM); + + soft = task_rlimit(tsk, RLIMIT_CPU); + if (soft != RLIM_INFINITY) { + /* RLIMIT_CPU is in seconds. Samples are nanoseconds */ + unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU); + u64 ptime = samples[CPUCLOCK_PROF]; + u64 softns = (u64)soft * NSEC_PER_SEC; + u64 hardns = (u64)hard * NSEC_PER_SEC; + + /* At the hard limit, send SIGKILL. No further action. */ + if (hard != RLIM_INFINITY && + check_rlimit(ptime, hardns, SIGKILL, false, true)) + return; + + /* At the soft limit, send a SIGXCPU every second */ + if (check_rlimit(ptime, softns, SIGXCPU, false, false)) { + sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1; + softns += NSEC_PER_SEC; + } + + /* Update the expiry cache */ + if (softns < pct->bases[CPUCLOCK_PROF].nextevt) + pct->bases[CPUCLOCK_PROF].nextevt = softns; + } + + if (expiry_cache_is_inactive(pct)) + stop_process_timers(sig); + + pct->expiry_active = false; +} + +/* + * This is called from the signal code (via posixtimer_rearm) + * when the last timer signal was delivered and we have to reload the timer. + */ +static void posix_cpu_timer_rearm(struct k_itimer *timer) +{ + clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); + struct task_struct *p; + struct sighand_struct *sighand; + unsigned long flags; + u64 now; + + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) + goto out; + + /* Protect timer list r/w in arm_timer() */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) + goto out; + + /* + * Fetch the current sample and update the timer's expiry time. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + now = cpu_clock_sample(clkid, p); + else + now = cpu_clock_sample_group(clkid, p, true); + + bump_cpu_timer(timer, now); + + /* + * Now re-arm for the new expiry time. + */ + arm_timer(timer, p); + unlock_task_sighand(p, &flags); +out: + rcu_read_unlock(); +} + +/** + * task_cputimers_expired - Check whether posix CPU timers are expired + * + * @samples: Array of current samples for the CPUCLOCK clocks + * @pct: Pointer to a posix_cputimers container + * + * Returns true if any member of @samples is greater than the corresponding + * member of @pct->bases[CLK].nextevt. False otherwise + */ +static inline bool +task_cputimers_expired(const u64 *samples, struct posix_cputimers *pct) +{ + int i; + + for (i = 0; i < CPUCLOCK_MAX; i++) { + if (samples[i] >= pct->bases[i].nextevt) + return true; + } + return false; +} + +/** + * fastpath_timer_check - POSIX CPU timers fast path. + * + * @tsk: The task (thread) being checked. + * + * Check the task and thread group timers. If both are zero (there are no + * timers set) return false. Otherwise snapshot the task and thread group + * timers and compare them with the corresponding expiration times. Return + * true if a timer has expired, else return false. + */ +static inline bool fastpath_timer_check(struct task_struct *tsk) +{ + struct posix_cputimers *pct = &tsk->posix_cputimers; + struct signal_struct *sig; + + if (!expiry_cache_is_inactive(pct)) { + u64 samples[CPUCLOCK_MAX]; + + task_sample_cputime(tsk, samples); + if (task_cputimers_expired(samples, pct)) + return true; + } + + sig = tsk->signal; + pct = &sig->posix_cputimers; + /* + * Check if thread group timers expired when timers are active and + * no other thread in the group is already handling expiry for + * thread group cputimers. These fields are read without the + * sighand lock. However, this is fine because this is meant to be + * a fastpath heuristic to determine whether we should try to + * acquire the sighand lock to handle timer expiry. + * + * In the worst case scenario, if concurrently timers_active is set + * or expiry_active is cleared, but the current thread doesn't see + * the change yet, the timer checks are delayed until the next + * thread in the group gets a scheduler interrupt to handle the + * timer. This isn't an issue in practice because these types of + * delays with signals actually getting sent are expected. + */ + if (READ_ONCE(pct->timers_active) && !READ_ONCE(pct->expiry_active)) { + u64 samples[CPUCLOCK_MAX]; + + proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, + samples); + + if (task_cputimers_expired(samples, pct)) + return true; + } + + if (dl_task(tsk) && tsk->dl.dl_overrun) + return true; + + return false; +} + +static void handle_posix_cpu_timers(struct task_struct *tsk); + +#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK +static void posix_cpu_timers_work(struct callback_head *work) +{ + struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work); + + mutex_lock(&cw->mutex); + handle_posix_cpu_timers(current); + mutex_unlock(&cw->mutex); +} + +/* + * Invoked from the posix-timer core when a cancel operation failed because + * the timer is marked firing. The caller holds rcu_read_lock(), which + * protects the timer and the task which is expiring it from being freed. + */ +static void posix_cpu_timer_wait_running(struct k_itimer *timr) +{ + struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling); + + /* Has the handling task completed expiry already? */ + if (!tsk) + return; + + /* Ensure that the task cannot go away */ + get_task_struct(tsk); + /* Now drop the RCU protection so the mutex can be locked */ + rcu_read_unlock(); + /* Wait on the expiry mutex */ + mutex_lock(&tsk->posix_cputimers_work.mutex); + /* Release it immediately again. */ + mutex_unlock(&tsk->posix_cputimers_work.mutex); + /* Drop the task reference. */ + put_task_struct(tsk); + /* Relock RCU so the callsite is balanced */ + rcu_read_lock(); +} + +static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr) +{ + /* Ensure that timr->it.cpu.handling task cannot go away */ + rcu_read_lock(); + spin_unlock_irq(&timr->it_lock); + posix_cpu_timer_wait_running(timr); + rcu_read_unlock(); + /* @timr is on stack and is valid */ + spin_lock_irq(&timr->it_lock); +} + +/* + * Clear existing posix CPU timers task work. + */ +void clear_posix_cputimers_work(struct task_struct *p) +{ + /* + * A copied work entry from the old task is not meaningful, clear it. + * N.B. init_task_work will not do this. + */ + memset(&p->posix_cputimers_work.work, 0, + sizeof(p->posix_cputimers_work.work)); + init_task_work(&p->posix_cputimers_work.work, + posix_cpu_timers_work); + mutex_init(&p->posix_cputimers_work.mutex); + p->posix_cputimers_work.scheduled = false; +} + +/* + * Initialize posix CPU timers task work in init task. Out of line to + * keep the callback static and to avoid header recursion hell. + */ +void __init posix_cputimers_init_work(void) +{ + clear_posix_cputimers_work(current); +} + +/* + * Note: All operations on tsk->posix_cputimer_work.scheduled happen either + * in hard interrupt context or in task context with interrupts + * disabled. Aside of that the writer/reader interaction is always in the + * context of the current task, which means they are strict per CPU. + */ +static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) +{ + return tsk->posix_cputimers_work.scheduled; +} + +static inline void __run_posix_cpu_timers(struct task_struct *tsk) +{ + if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled)) + return; + + /* Schedule task work to actually expire the timers */ + tsk->posix_cputimers_work.scheduled = true; + task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME); +} + +static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, + unsigned long start) +{ + bool ret = true; + + /* + * On !RT kernels interrupts are disabled while collecting expired + * timers, so no tick can happen and the fast path check can be + * reenabled without further checks. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + tsk->posix_cputimers_work.scheduled = false; + return true; + } + + /* + * On RT enabled kernels ticks can happen while the expired timers + * are collected under sighand lock. But any tick which observes + * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath + * checks. So reenabling the tick work has do be done carefully: + * + * Disable interrupts and run the fast path check if jiffies have + * advanced since the collecting of expired timers started. If + * jiffies have not advanced or the fast path check did not find + * newly expired timers, reenable the fast path check in the timer + * interrupt. If there are newly expired timers, return false and + * let the collection loop repeat. + */ + local_irq_disable(); + if (start != jiffies && fastpath_timer_check(tsk)) + ret = false; + else + tsk->posix_cputimers_work.scheduled = false; + local_irq_enable(); + + return ret; +} +#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ +static inline void __run_posix_cpu_timers(struct task_struct *tsk) +{ + lockdep_posixtimer_enter(); + handle_posix_cpu_timers(tsk); + lockdep_posixtimer_exit(); +} + +static void posix_cpu_timer_wait_running(struct k_itimer *timr) +{ + cpu_relax(); +} + +static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr) +{ + spin_unlock_irq(&timr->it_lock); + cpu_relax(); + spin_lock_irq(&timr->it_lock); +} + +static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) +{ + return false; +} + +static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, + unsigned long start) +{ + return true; +} +#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ + +static void handle_posix_cpu_timers(struct task_struct *tsk) +{ + struct k_itimer *timer, *next; + unsigned long flags, start; + LIST_HEAD(firing); + + if (!lock_task_sighand(tsk, &flags)) + return; + + do { + /* + * On RT locking sighand lock does not disable interrupts, + * so this needs to be careful vs. ticks. Store the current + * jiffies value. + */ + start = READ_ONCE(jiffies); + barrier(); + + /* + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. + */ + check_thread_timers(tsk, &firing); + + check_process_timers(tsk, &firing); + + /* + * The above timer checks have updated the expiry cache and + * because nothing can have queued or modified timers after + * sighand lock was taken above it is guaranteed to be + * consistent. So the next timer interrupt fastpath check + * will find valid data. + * + * If timer expiry runs in the timer interrupt context then + * the loop is not relevant as timers will be directly + * expired in interrupt context. The stub function below + * returns always true which allows the compiler to + * optimize the loop out. + * + * If timer expiry is deferred to task work context then + * the following rules apply: + * + * - On !RT kernels no tick can have happened on this CPU + * after sighand lock was acquired because interrupts are + * disabled. So reenabling task work before dropping + * sighand lock and reenabling interrupts is race free. + * + * - On RT kernels ticks might have happened but the tick + * work ignored posix CPU timer handling because the + * CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work + * must be done very carefully including a check whether + * ticks have happened since the start of the timer + * expiry checks. posix_cpu_timers_enable_work() takes + * care of that and eventually lets the expiry checks + * run again. + */ + } while (!posix_cpu_timers_enable_work(tsk, start)); + + /* + * We must release sighand lock before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + unlock_task_sighand(tsk, &flags); + + /* + * Now that all the timers on our list have the firing flag, + * no one will touch their list entries but us. We'll take + * each timer's lock before clearing its firing flag, so no + * timer call will interfere. + */ + list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) { + int cpu_firing; + + /* + * spin_lock() is sufficient here even independent of the + * expiry context. If expiry happens in hard interrupt + * context it's obvious. For task work context it's safe + * because all other operations on timer::it_lock happen in + * task context (syscall or exit). + */ + spin_lock(&timer->it_lock); + list_del_init(&timer->it.cpu.elist); + cpu_firing = timer->it.cpu.firing; + timer->it.cpu.firing = 0; + /* + * The firing flag is -1 if we collided with a reset + * of the timer, which already reported this + * almost-firing as an overrun. So don't generate an event. + */ + if (likely(cpu_firing >= 0)) + cpu_timer_fire(timer); + /* See posix_cpu_timer_wait_running() */ + rcu_assign_pointer(timer->it.cpu.handling, NULL); + spin_unlock(&timer->it_lock); + } +} + +/* + * This is called from the timer interrupt handler. The irq handler has + * already updated our counts. We need to check if any timers fire now. + * Interrupts are disabled. + */ +void run_posix_cpu_timers(void) +{ + struct task_struct *tsk = current; + + lockdep_assert_irqs_disabled(); + + /* + * If the actual expiry is deferred to task work context and the + * work is already scheduled there is no point to do anything here. + */ + if (posix_cpu_timers_work_scheduled(tsk)) + return; + + /* + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. + */ + if (!fastpath_timer_check(tsk)) + return; + + __run_posix_cpu_timers(tsk); +} + +/* + * Set one of the process-wide special case CPU timers or RLIMIT_CPU. + * The tsk->sighand->siglock must be held by the caller. + */ +void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, + u64 *newval, u64 *oldval) +{ + u64 now, *nextevt; + + if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED)) + return; + + nextevt = &tsk->signal->posix_cputimers.bases[clkid].nextevt; + now = cpu_clock_sample_group(clkid, tsk, true); + + if (oldval) { + /* + * We are setting itimer. The *oldval is absolute and we update + * it to be relative, *newval argument is relative and we update + * it to be absolute. + */ + if (*oldval) { + if (*oldval <= now) { + /* Just about to fire. */ + *oldval = TICK_NSEC; + } else { + *oldval -= now; + } + } + + if (*newval) + *newval += now; + } + + /* + * Update expiration cache if this is the earliest timer. CPUCLOCK_PROF + * expiry cache is also used by RLIMIT_CPU!. + */ + if (*newval < *nextevt) + *nextevt = *newval; + + tick_dep_set_signal(tsk, TICK_DEP_BIT_POSIX_TIMER); +} + +static int do_cpu_nanosleep(const clockid_t which_clock, int flags, + const struct timespec64 *rqtp) +{ + struct itimerspec64 it; + struct k_itimer timer; + u64 expires; + int error; + + /* + * Set up a temporary timer and then wait for it to go off. + */ + memset(&timer, 0, sizeof timer); + spin_lock_init(&timer.it_lock); + timer.it_clock = which_clock; + timer.it_overrun = -1; + error = posix_cpu_timer_create(&timer); + timer.it_process = current; + + if (!error) { + static struct itimerspec64 zero_it; + struct restart_block *restart; + + memset(&it, 0, sizeof(it)); + it.it_value = *rqtp; + + spin_lock_irq(&timer.it_lock); + error = posix_cpu_timer_set(&timer, flags, &it, NULL); + if (error) { + spin_unlock_irq(&timer.it_lock); + return error; + } + + while (!signal_pending(current)) { + if (!cpu_timer_getexpires(&timer.it.cpu)) { + /* + * Our timer fired and was reset, below + * deletion can not fail. + */ + posix_cpu_timer_del(&timer); + spin_unlock_irq(&timer.it_lock); + return 0; + } + + /* + * Block until cpu_timer_fire (or a signal) wakes us. + */ + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&timer.it_lock); + schedule(); + spin_lock_irq(&timer.it_lock); + } + + /* + * We were interrupted by a signal. + */ + expires = cpu_timer_getexpires(&timer.it.cpu); + error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); + if (!error) { + /* Timer is now unarmed, deletion can not fail. */ + posix_cpu_timer_del(&timer); + } else { + while (error == TIMER_RETRY) { + posix_cpu_timer_wait_running_nsleep(&timer); + error = posix_cpu_timer_del(&timer); + } + } + + spin_unlock_irq(&timer.it_lock); + + if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { + /* + * It actually did fire already. + */ + return 0; + } + + error = -ERESTART_RESTARTBLOCK; + /* + * Report back to the user the time still remaining. + */ + restart = ¤t->restart_block; + restart->nanosleep.expires = expires; + if (restart->nanosleep.type != TT_NONE) + error = nanosleep_copyout(restart, &it.it_value); + } + + return error; +} + +static long posix_cpu_nsleep_restart(struct restart_block *restart_block); + +static int posix_cpu_nsleep(const clockid_t which_clock, int flags, + const struct timespec64 *rqtp) +{ + struct restart_block *restart_block = ¤t->restart_block; + int error; + + /* + * Diagnose required errors first. + */ + if (CPUCLOCK_PERTHREAD(which_clock) && + (CPUCLOCK_PID(which_clock) == 0 || + CPUCLOCK_PID(which_clock) == task_pid_vnr(current))) + return -EINVAL; + + error = do_cpu_nanosleep(which_clock, flags, rqtp); + + if (error == -ERESTART_RESTARTBLOCK) { + + if (flags & TIMER_ABSTIME) + return -ERESTARTNOHAND; + + restart_block->nanosleep.clockid = which_clock; + set_restart_fn(restart_block, posix_cpu_nsleep_restart); + } + return error; +} + +static long posix_cpu_nsleep_restart(struct restart_block *restart_block) +{ + clockid_t which_clock = restart_block->nanosleep.clockid; + struct timespec64 t; + + t = ns_to_timespec64(restart_block->nanosleep.expires); + + return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); +} + +#define PROCESS_CLOCK make_process_cpuclock(0, CPUCLOCK_SCHED) +#define THREAD_CLOCK make_thread_cpuclock(0, CPUCLOCK_SCHED) + +static int process_cpu_clock_getres(const clockid_t which_clock, + struct timespec64 *tp) +{ + return posix_cpu_clock_getres(PROCESS_CLOCK, tp); +} +static int process_cpu_clock_get(const clockid_t which_clock, + struct timespec64 *tp) +{ + return posix_cpu_clock_get(PROCESS_CLOCK, tp); +} +static int process_cpu_timer_create(struct k_itimer *timer) +{ + timer->it_clock = PROCESS_CLOCK; + return posix_cpu_timer_create(timer); +} +static int process_cpu_nsleep(const clockid_t which_clock, int flags, + const struct timespec64 *rqtp) +{ + return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); +} +static int thread_cpu_clock_getres(const clockid_t which_clock, + struct timespec64 *tp) +{ + return posix_cpu_clock_getres(THREAD_CLOCK, tp); +} +static int thread_cpu_clock_get(const clockid_t which_clock, + struct timespec64 *tp) +{ + return posix_cpu_clock_get(THREAD_CLOCK, tp); +} +static int thread_cpu_timer_create(struct k_itimer *timer) +{ + timer->it_clock = THREAD_CLOCK; + return posix_cpu_timer_create(timer); +} + +const struct k_clock clock_posix_cpu = { + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get_timespec = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, + .timer_rearm = posix_cpu_timer_rearm, + .timer_wait_running = posix_cpu_timer_wait_running, +}; + +const struct k_clock clock_process = { + .clock_getres = process_cpu_clock_getres, + .clock_get_timespec = process_cpu_clock_get, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, +}; + +const struct k_clock clock_thread = { + .clock_getres = thread_cpu_clock_getres, + .clock_get_timespec = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, +}; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c new file mode 100644 index 0000000000..9b6fcb8d85 --- /dev/null +++ b/kernel/time/posix-stubs.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Dummy stubs used when CONFIG_POSIX_TIMERS=n + * + * Created by: Nicolas Pitre, July 2016 + * Copyright: (C) 2016 Linaro Limited + */ + +#include <linux/linkage.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/syscalls.h> +#include <linux/ktime.h> +#include <linux/timekeeping.h> +#include <linux/posix-timers.h> +#include <linux/time_namespace.h> +#include <linux/compat.h> + +/* + * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC + * as it is easy to remain compatible with little code. CLOCK_BOOTTIME + * is also included for convenience as at least systemd uses it. + */ + +SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + const struct __kernel_timespec __user *, tp) +{ + struct timespec64 new_tp; + + if (which_clock != CLOCK_REALTIME) + return -EINVAL; + if (get_timespec64(&new_tp, tp)) + return -EFAULT; + + return do_sys_settimeofday64(&new_tp, NULL); +} + +static int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) +{ + switch (which_clock) { + case CLOCK_REALTIME: + ktime_get_real_ts64(tp); + break; + case CLOCK_MONOTONIC: + ktime_get_ts64(tp); + timens_add_monotonic(tp); + break; + case CLOCK_BOOTTIME: + ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); + break; + default: + return -EINVAL; + } + + return 0; +} + +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct __kernel_timespec __user *, tp) +{ + int ret; + struct timespec64 kernel_tp; + + ret = do_clock_gettime(which_clock, &kernel_tp); + if (ret) + return ret; + + if (put_timespec64(&kernel_tp, tp)) + return -EFAULT; + return 0; +} + +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) +{ + struct timespec64 rtn_tp = { + .tv_sec = 0, + .tv_nsec = hrtimer_resolution, + }; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + if (put_timespec64(&rtn_tp, tp)) + return -EFAULT; + return 0; + default: + return -EINVAL; + } +} + +SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, + const struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) +{ + struct timespec64 t; + ktime_t texp; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + break; + default: + return -EINVAL; + } + + if (get_timespec64(&t, rqtp)) + return -EFAULT; + if (!timespec64_valid(&t)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; + current->restart_block.nanosleep.rmtp = rmtp; + texp = timespec64_to_ktime(t); + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); +} + +#ifdef CONFIG_COMPAT_32BIT_TIME + +SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock, + struct old_timespec32 __user *, tp) +{ + struct timespec64 new_tp; + + if (which_clock != CLOCK_REALTIME) + return -EINVAL; + if (get_old_timespec32(&new_tp, tp)) + return -EFAULT; + + return do_sys_settimeofday64(&new_tp, NULL); +} + +SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) +{ + int ret; + struct timespec64 kernel_tp; + + ret = do_clock_gettime(which_clock, &kernel_tp); + if (ret) + return ret; + + if (put_old_timespec32(&kernel_tp, tp)) + return -EFAULT; + return 0; +} + +SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) +{ + struct timespec64 rtn_tp = { + .tv_sec = 0, + .tv_nsec = hrtimer_resolution, + }; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + if (put_old_timespec32(&rtn_tp, tp)) + return -EFAULT; + return 0; + default: + return -EINVAL; + } +} + +SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) +{ + struct timespec64 t; + ktime_t texp; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + break; + default: + return -EINVAL; + } + + if (get_old_timespec32(&t, rqtp)) + return -EFAULT; + if (!timespec64_valid(&t)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.nanosleep.compat_rmtp = rmtp; + texp = timespec64_to_ktime(t); + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); +} +#endif diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c new file mode 100644 index 0000000000..b924f0f096 --- /dev/null +++ b/kernel/time/posix-timers.c @@ -0,0 +1,1541 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * 2002-10-15 Posix Clocks & timers + * by George Anzinger george@mvista.com + * Copyright (C) 2002 2003 by MontaVista Software. + * + * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug. + * Copyright (C) 2004 Boris Hu + * + * These are all the functions necessary to implement POSIX clocks & timers + */ +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/mutex.h> +#include <linux/sched/task.h> + +#include <linux/uaccess.h> +#include <linux/list.h> +#include <linux/init.h> +#include <linux/compiler.h> +#include <linux/hash.h> +#include <linux/posix-clock.h> +#include <linux/posix-timers.h> +#include <linux/syscalls.h> +#include <linux/wait.h> +#include <linux/workqueue.h> +#include <linux/export.h> +#include <linux/hashtable.h> +#include <linux/compat.h> +#include <linux/nospec.h> +#include <linux/time_namespace.h> + +#include "timekeeping.h" +#include "posix-timers.h" + +static struct kmem_cache *posix_timers_cache; + +/* + * Timers are managed in a hash table for lockless lookup. The hash key is + * constructed from current::signal and the timer ID and the timer is + * matched against current::signal and the timer ID when walking the hash + * bucket list. + * + * This allows checkpoint/restore to reconstruct the exact timer IDs for + * a process. + */ +static DEFINE_HASHTABLE(posix_timers_hashtable, 9); +static DEFINE_SPINLOCK(hash_lock); + +static const struct k_clock * const posix_clocks[]; +static const struct k_clock *clockid_to_kclock(const clockid_t id); +static const struct k_clock clock_realtime, clock_monotonic; + +/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ +#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ + ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) +#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" +#endif + +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); + +#define lock_timer(tid, flags) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ + __timr; \ +}) + +static int hash(struct signal_struct *sig, unsigned int nr) +{ + return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); +} + +static struct k_itimer *__posix_timers_find(struct hlist_head *head, + struct signal_struct *sig, + timer_t id) +{ + struct k_itimer *timer; + + hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) { + /* timer->it_signal can be set concurrently */ + if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) + return timer; + } + return NULL; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + + return __posix_timers_find(head, sig, id); +} + +static int posix_timer_add(struct k_itimer *timer) +{ + struct signal_struct *sig = current->signal; + struct hlist_head *head; + unsigned int cnt, id; + + /* + * FIXME: Replace this by a per signal struct xarray once there is + * a plan to handle the resulting CRIU regression gracefully. + */ + for (cnt = 0; cnt <= INT_MAX; cnt++) { + spin_lock(&hash_lock); + id = sig->next_posix_timer_id; + + /* Write the next ID back. Clamp it to the positive space */ + sig->next_posix_timer_id = (id + 1) & INT_MAX; + + head = &posix_timers_hashtable[hash(sig, id)]; + if (!__posix_timers_find(head, sig, id)) { + hlist_add_head_rcu(&timer->t_hash, head); + spin_unlock(&hash_lock); + return id; + } + spin_unlock(&hash_lock); + } + /* POSIX return code when no timer ID could be allocated */ + return -EAGAIN; +} + +static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) +{ + spin_unlock_irqrestore(&timr->it_lock, flags); +} + +static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) +{ + ktime_get_real_ts64(tp); + return 0; +} + +static ktime_t posix_get_realtime_ktime(clockid_t which_clock) +{ + return ktime_get_real(); +} + +static int posix_clock_realtime_set(const clockid_t which_clock, + const struct timespec64 *tp) +{ + return do_sys_settimeofday64(tp, NULL); +} + +static int posix_clock_realtime_adj(const clockid_t which_clock, + struct __kernel_timex *t) +{ + return do_adjtimex(t); +} + +static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) +{ + ktime_get_ts64(tp); + timens_add_monotonic(tp); + return 0; +} + +static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) +{ + return ktime_get(); +} + +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) +{ + ktime_get_raw_ts64(tp); + timens_add_monotonic(tp); + return 0; +} + +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp) +{ + ktime_get_coarse_real_ts64(tp); + return 0; +} + +static int posix_get_monotonic_coarse(clockid_t which_clock, + struct timespec64 *tp) +{ + ktime_get_coarse_ts64(tp); + timens_add_monotonic(tp); + return 0; +} + +static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp) +{ + *tp = ktime_to_timespec64(KTIME_LOW_RES); + return 0; +} + +static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) +{ + ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); + return 0; +} + +static ktime_t posix_get_boottime_ktime(const clockid_t which_clock) +{ + return ktime_get_boottime(); +} + +static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp) +{ + ktime_get_clocktai_ts64(tp); + return 0; +} + +static ktime_t posix_get_tai_ktime(clockid_t which_clock) +{ + return ktime_get_clocktai(); +} + +static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) +{ + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; + return 0; +} + +static __init int init_posix_timers(void) +{ + posix_timers_cache = kmem_cache_create("posix_timers_cache", + sizeof(struct k_itimer), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); + return 0; +} +__initcall(init_posix_timers); + +/* + * The siginfo si_overrun field and the return value of timer_getoverrun(2) + * are of type int. Clamp the overrun value to INT_MAX + */ +static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval) +{ + s64 sum = timr->it_overrun_last + (s64)baseval; + + return sum > (s64)INT_MAX ? INT_MAX : (int)sum; +} + +static void common_hrtimer_rearm(struct k_itimer *timr) +{ + struct hrtimer *timer = &timr->it.real.timer; + + timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), + timr->it_interval); + hrtimer_restart(timer); +} + +/* + * This function is called from the signal delivery code if + * info->si_sys_private is not zero, which indicates that the timer has to + * be rearmed. Restart the timer and update info::si_overrun. + */ +void posixtimer_rearm(struct kernel_siginfo *info) +{ + struct k_itimer *timr; + unsigned long flags; + + timr = lock_timer(info->si_tid, &flags); + if (!timr) + return; + + if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) { + timr->kclock->timer_rearm(timr); + + timr->it_active = 1; + timr->it_overrun_last = timr->it_overrun; + timr->it_overrun = -1LL; + ++timr->it_requeue_pending; + + info->si_overrun = timer_overrun_to_int(timr, info->si_overrun); + } + + unlock_timer(timr, flags); +} + +int posix_timer_event(struct k_itimer *timr, int si_private) +{ + enum pid_type type; + int ret; + /* + * FIXME: if ->sigq is queued we can race with + * dequeue_signal()->posixtimer_rearm(). + * + * If dequeue_signal() sees the "right" value of + * si_sys_private it calls posixtimer_rearm(). + * We re-queue ->sigq and drop ->it_lock(). + * posixtimer_rearm() locks the timer + * and re-schedules it while ->sigq is pending. + * Not really bad, but not that we want. + */ + timr->sigq->info.si_sys_private = si_private; + + type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; + ret = send_sigqueue(timr->sigq, timr->it_pid, type); + /* If we failed to send the signal the timer stops. */ + return ret > 0; +} + +/* + * This function gets called when a POSIX.1b interval timer expires from + * the HRTIMER interrupt (soft interrupt on RT kernels). + * + * Handles CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI + * based timers. + */ +static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) +{ + enum hrtimer_restart ret = HRTIMER_NORESTART; + struct k_itimer *timr; + unsigned long flags; + int si_private = 0; + + timr = container_of(timer, struct k_itimer, it.real.timer); + spin_lock_irqsave(&timr->it_lock, flags); + + timr->it_active = 0; + if (timr->it_interval != 0) + si_private = ++timr->it_requeue_pending; + + if (posix_timer_event(timr, si_private)) { + /* + * The signal was not queued due to SIG_IGN. As a + * consequence the timer is not going to be rearmed from + * the signal delivery path. But as a real signal handler + * can be installed later the timer must be rearmed here. + */ + if (timr->it_interval != 0) { + ktime_t now = hrtimer_cb_get_time(timer); + + /* + * FIXME: What we really want, is to stop this + * timer completely and restart it in case the + * SIG_IGN is removed. This is a non trivial + * change to the signal handling code. + * + * For now let timers with an interval less than a + * jiffie expire every jiffie and recheck for a + * valid signal handler. + * + * This avoids interrupt starvation in case of a + * very small interval, which would expire the + * timer immediately again. + * + * Moving now ahead of time by one jiffie tricks + * hrtimer_forward() to expire the timer later, + * while it still maintains the overrun accuracy + * for the price of a slight inconsistency in the + * timer_gettime() case. This is at least better + * than a timer storm. + * + * Only required when high resolution timers are + * enabled as the periodic tick based timers are + * automatically aligned to the next tick. + */ + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS)) { + ktime_t kj = TICK_NSEC; + + if (timr->it_interval < kj) + now = ktime_add(now, kj); + } + + timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval); + ret = HRTIMER_RESTART; + ++timr->it_requeue_pending; + timr->it_active = 1; + } + } + + unlock_timer(timr, flags); + return ret; +} + +static struct pid *good_sigevent(sigevent_t * event) +{ + struct pid *pid = task_tgid(current); + struct task_struct *rtn; + + switch (event->sigev_notify) { + case SIGEV_SIGNAL | SIGEV_THREAD_ID: + pid = find_vpid(event->sigev_notify_thread_id); + rtn = pid_task(pid, PIDTYPE_PID); + if (!rtn || !same_thread_group(rtn, current)) + return NULL; + fallthrough; + case SIGEV_SIGNAL: + case SIGEV_THREAD: + if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) + return NULL; + fallthrough; + case SIGEV_NONE: + return pid; + default: + return NULL; + } +} + +static struct k_itimer * alloc_posix_timer(void) +{ + struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + + if (!tmr) + return tmr; + if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { + kmem_cache_free(posix_timers_cache, tmr); + return NULL; + } + clear_siginfo(&tmr->sigq->info); + return tmr; +} + +static void k_itimer_rcu_free(struct rcu_head *head) +{ + struct k_itimer *tmr = container_of(head, struct k_itimer, rcu); + + kmem_cache_free(posix_timers_cache, tmr); +} + +static void posix_timer_free(struct k_itimer *tmr) +{ + put_pid(tmr->it_pid); + sigqueue_free(tmr->sigq); + call_rcu(&tmr->rcu, k_itimer_rcu_free); +} + +static void posix_timer_unhash_and_free(struct k_itimer *tmr) +{ + spin_lock(&hash_lock); + hlist_del_rcu(&tmr->t_hash); + spin_unlock(&hash_lock); + posix_timer_free(tmr); +} + +static int common_timer_create(struct k_itimer *new_timer) +{ + hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + return 0; +} + +/* Create a POSIX.1b interval timer. */ +static int do_timer_create(clockid_t which_clock, struct sigevent *event, + timer_t __user *created_timer_id) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct k_itimer *new_timer; + int error, new_timer_id; + + if (!kc) + return -EINVAL; + if (!kc->timer_create) + return -EOPNOTSUPP; + + new_timer = alloc_posix_timer(); + if (unlikely(!new_timer)) + return -EAGAIN; + + spin_lock_init(&new_timer->it_lock); + + /* + * Add the timer to the hash table. The timer is not yet valid + * because new_timer::it_signal is still NULL. The timer id is also + * not yet visible to user space. + */ + new_timer_id = posix_timer_add(new_timer); + if (new_timer_id < 0) { + posix_timer_free(new_timer); + return new_timer_id; + } + + new_timer->it_id = (timer_t) new_timer_id; + new_timer->it_clock = which_clock; + new_timer->kclock = kc; + new_timer->it_overrun = -1LL; + + if (event) { + rcu_read_lock(); + new_timer->it_pid = get_pid(good_sigevent(event)); + rcu_read_unlock(); + if (!new_timer->it_pid) { + error = -EINVAL; + goto out; + } + new_timer->it_sigev_notify = event->sigev_notify; + new_timer->sigq->info.si_signo = event->sigev_signo; + new_timer->sigq->info.si_value = event->sigev_value; + } else { + new_timer->it_sigev_notify = SIGEV_SIGNAL; + new_timer->sigq->info.si_signo = SIGALRM; + memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t)); + new_timer->sigq->info.si_value.sival_int = new_timer->it_id; + new_timer->it_pid = get_pid(task_tgid(current)); + } + + new_timer->sigq->info.si_tid = new_timer->it_id; + new_timer->sigq->info.si_code = SI_TIMER; + + if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) { + error = -EFAULT; + goto out; + } + /* + * After succesful copy out, the timer ID is visible to user space + * now but not yet valid because new_timer::signal is still NULL. + * + * Complete the initialization with the clock specific create + * callback. + */ + error = kc->timer_create(new_timer); + if (error) + goto out; + + spin_lock_irq(¤t->sighand->siglock); + /* This makes the timer valid in the hash table */ + WRITE_ONCE(new_timer->it_signal, current->signal); + list_add(&new_timer->list, ¤t->signal->posix_timers); + spin_unlock_irq(¤t->sighand->siglock); + /* + * After unlocking sighand::siglock @new_timer is subject to + * concurrent removal and cannot be touched anymore + */ + return 0; +out: + posix_timer_unhash_and_free(new_timer); + return error; +} + +SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, + struct sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) +{ + if (timer_event_spec) { + sigevent_t event; + + if (copy_from_user(&event, timer_event_spec, sizeof (event))) + return -EFAULT; + return do_timer_create(which_clock, &event, created_timer_id); + } + return do_timer_create(which_clock, NULL, created_timer_id); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, + struct compat_sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) +{ + if (timer_event_spec) { + sigevent_t event; + + if (get_compat_sigevent(&event, timer_event_spec)) + return -EFAULT; + return do_timer_create(which_clock, &event, created_timer_id); + } + return do_timer_create(which_clock, NULL, created_timer_id); +} +#endif + +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) +{ + struct k_itimer *timr; + + /* + * timer_t could be any type >= int and we want to make sure any + * @timer_id outside positive int range fails lookup. + */ + if ((unsigned long long)timer_id > INT_MAX) + return NULL; + + /* + * The hash lookup and the timers are RCU protected. + * + * Timers are added to the hash in invalid state where + * timr::it_signal == NULL. timer::it_signal is only set after the + * rest of the initialization succeeded. + * + * Timer destruction happens in steps: + * 1) Set timr::it_signal to NULL with timr::it_lock held + * 2) Release timr::it_lock + * 3) Remove from the hash under hash_lock + * 4) Call RCU for removal after the grace period + * + * Holding rcu_read_lock() accross the lookup ensures that + * the timer cannot be freed. + * + * The lookup validates locklessly that timr::it_signal == + * current::it_signal and timr::it_id == @timer_id. timr::it_id + * can't change, but timr::it_signal becomes NULL during + * destruction. + */ + rcu_read_lock(); + timr = posix_timer_by_id(timer_id); + if (timr) { + spin_lock_irqsave(&timr->it_lock, *flags); + /* + * Validate under timr::it_lock that timr::it_signal is + * still valid. Pairs with #1 above. + */ + if (timr->it_signal == current->signal) { + rcu_read_unlock(); + return timr; + } + spin_unlock_irqrestore(&timr->it_lock, *flags); + } + rcu_read_unlock(); + + return NULL; +} + +static ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now) +{ + struct hrtimer *timer = &timr->it.real.timer; + + return __hrtimer_expires_remaining_adjusted(timer, now); +} + +static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now) +{ + struct hrtimer *timer = &timr->it.real.timer; + + return hrtimer_forward(timer, now, timr->it_interval); +} + +/* + * Get the time remaining on a POSIX.1b interval timer. + * + * Two issues to handle here: + * + * 1) The timer has a requeue pending. The return value must appear as + * if the timer has been requeued right now. + * + * 2) The timer is a SIGEV_NONE timer. These timers are never enqueued + * into the hrtimer queue and therefore never expired. Emulate expiry + * here taking #1 into account. + */ +void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) +{ + const struct k_clock *kc = timr->kclock; + ktime_t now, remaining, iv; + bool sig_none; + + sig_none = timr->it_sigev_notify == SIGEV_NONE; + iv = timr->it_interval; + + /* interval timer ? */ + if (iv) { + cur_setting->it_interval = ktime_to_timespec64(iv); + } else if (!timr->it_active) { + /* + * SIGEV_NONE oneshot timers are never queued and therefore + * timr->it_active is always false. The check below + * vs. remaining time will handle this case. + * + * For all other timers there is nothing to update here, so + * return. + */ + if (!sig_none) + return; + } + + now = kc->clock_get_ktime(timr->it_clock); + + /* + * If this is an interval timer and either has requeue pending or + * is a SIGEV_NONE timer move the expiry time forward by intervals, + * so expiry is > now. + */ + if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) + timr->it_overrun += kc->timer_forward(timr, now); + + remaining = kc->timer_remaining(timr, now); + /* + * As @now is retrieved before a possible timer_forward() and + * cannot be reevaluated by the compiler @remaining is based on the + * same @now value. Therefore @remaining is consistent vs. @now. + * + * Consequently all interval timers, i.e. @iv > 0, cannot have a + * remaining time <= 0 because timer_forward() guarantees to move + * them forward so that the next timer expiry is > @now. + */ + if (remaining <= 0) { + /* + * A single shot SIGEV_NONE timer must return 0, when it is + * expired! Timers which have a real signal delivery mode + * must return a remaining time greater than 0 because the + * signal has not yet been delivered. + */ + if (!sig_none) + cur_setting->it_value.tv_nsec = 1; + } else { + cur_setting->it_value = ktime_to_timespec64(remaining); + } +} + +static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) +{ + const struct k_clock *kc; + struct k_itimer *timr; + unsigned long flags; + int ret = 0; + + timr = lock_timer(timer_id, &flags); + if (!timr) + return -EINVAL; + + memset(setting, 0, sizeof(*setting)); + kc = timr->kclock; + if (WARN_ON_ONCE(!kc || !kc->timer_get)) + ret = -EINVAL; + else + kc->timer_get(timr, setting); + + unlock_timer(timr, flags); + return ret; +} + +/* Get the time remaining on a POSIX.1b interval timer. */ +SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct __kernel_itimerspec __user *, setting) +{ + struct itimerspec64 cur_setting; + + int ret = do_timer_gettime(timer_id, &cur_setting); + if (!ret) { + if (put_itimerspec64(&cur_setting, setting)) + ret = -EFAULT; + } + return ret; +} + +#ifdef CONFIG_COMPAT_32BIT_TIME + +SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, + struct old_itimerspec32 __user *, setting) +{ + struct itimerspec64 cur_setting; + + int ret = do_timer_gettime(timer_id, &cur_setting); + if (!ret) { + if (put_old_itimerspec32(&cur_setting, setting)) + ret = -EFAULT; + } + return ret; +} + +#endif + +/** + * sys_timer_getoverrun - Get the number of overruns of a POSIX.1b interval timer + * @timer_id: The timer ID which identifies the timer + * + * The "overrun count" of a timer is one plus the number of expiration + * intervals which have elapsed between the first expiry, which queues the + * signal and the actual signal delivery. On signal delivery the "overrun + * count" is calculated and cached, so it can be returned directly here. + * + * As this is relative to the last queued signal the returned overrun count + * is meaningless outside of the signal delivery path and even there it + * does not accurately reflect the current state when user space evaluates + * it. + * + * Returns: + * -EINVAL @timer_id is invalid + * 1..INT_MAX The number of overruns related to the last delivered signal + */ +SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) +{ + struct k_itimer *timr; + unsigned long flags; + int overrun; + + timr = lock_timer(timer_id, &flags); + if (!timr) + return -EINVAL; + + overrun = timer_overrun_to_int(timr, 0); + unlock_timer(timr, flags); + + return overrun; +} + +static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, + bool absolute, bool sigev_none) +{ + struct hrtimer *timer = &timr->it.real.timer; + enum hrtimer_mode mode; + + mode = absolute ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; + /* + * Posix magic: Relative CLOCK_REALTIME timers are not affected by + * clock modifications, so they become CLOCK_MONOTONIC based under the + * hood. See hrtimer_init(). Update timr->kclock, so the generic + * functions which use timr->kclock->clock_get_*() work. + * + * Note: it_clock stays unmodified, because the next timer_set() might + * use ABSTIME, so it needs to switch back. + */ + if (timr->it_clock == CLOCK_REALTIME) + timr->kclock = absolute ? &clock_realtime : &clock_monotonic; + + hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); + timr->it.real.timer.function = posix_timer_fn; + + if (!absolute) + expires = ktime_add_safe(expires, timer->base->get_time()); + hrtimer_set_expires(timer, expires); + + if (!sigev_none) + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); +} + +static int common_hrtimer_try_to_cancel(struct k_itimer *timr) +{ + return hrtimer_try_to_cancel(&timr->it.real.timer); +} + +static void common_timer_wait_running(struct k_itimer *timer) +{ + hrtimer_cancel_wait_running(&timer->it.real.timer); +} + +/* + * On PREEMPT_RT this prevents priority inversion and a potential livelock + * against the ksoftirqd thread in case that ksoftirqd gets preempted while + * executing a hrtimer callback. + * + * See the comments in hrtimer_cancel_wait_running(). For PREEMPT_RT=n this + * just results in a cpu_relax(). + * + * For POSIX CPU timers with CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n this is + * just a cpu_relax(). With CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y this + * prevents spinning on an eventually scheduled out task and a livelock + * when the task which tries to delete or disarm the timer has preempted + * the task which runs the expiry in task work context. + */ +static struct k_itimer *timer_wait_running(struct k_itimer *timer, + unsigned long *flags) +{ + const struct k_clock *kc = READ_ONCE(timer->kclock); + timer_t timer_id = READ_ONCE(timer->it_id); + + /* Prevent kfree(timer) after dropping the lock */ + rcu_read_lock(); + unlock_timer(timer, *flags); + + /* + * kc->timer_wait_running() might drop RCU lock. So @timer + * cannot be touched anymore after the function returns! + */ + if (!WARN_ON_ONCE(!kc->timer_wait_running)) + kc->timer_wait_running(timer); + + rcu_read_unlock(); + /* Relock the timer. It might be not longer hashed. */ + return lock_timer(timer_id, flags); +} + +/* Set a POSIX.1b interval timer. */ +int common_timer_set(struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting) +{ + const struct k_clock *kc = timr->kclock; + bool sigev_none; + ktime_t expires; + + if (old_setting) + common_timer_get(timr, old_setting); + + /* Prevent rearming by clearing the interval */ + timr->it_interval = 0; + /* + * Careful here. On SMP systems the timer expiry function could be + * active and spinning on timr->it_lock. + */ + if (kc->timer_try_to_cancel(timr) < 0) + return TIMER_RETRY; + + timr->it_active = 0; + timr->it_requeue_pending = (timr->it_requeue_pending + 2) & + ~REQUEUE_PENDING; + timr->it_overrun_last = 0; + + /* Switch off the timer when it_value is zero */ + if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) + return 0; + + timr->it_interval = timespec64_to_ktime(new_setting->it_interval); + expires = timespec64_to_ktime(new_setting->it_value); + if (flags & TIMER_ABSTIME) + expires = timens_ktime_to_host(timr->it_clock, expires); + sigev_none = timr->it_sigev_notify == SIGEV_NONE; + + kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); + timr->it_active = !sigev_none; + return 0; +} + +static int do_timer_settime(timer_t timer_id, int tmr_flags, + struct itimerspec64 *new_spec64, + struct itimerspec64 *old_spec64) +{ + const struct k_clock *kc; + struct k_itimer *timr; + unsigned long flags; + int error = 0; + + if (!timespec64_valid(&new_spec64->it_interval) || + !timespec64_valid(&new_spec64->it_value)) + return -EINVAL; + + if (old_spec64) + memset(old_spec64, 0, sizeof(*old_spec64)); + + timr = lock_timer(timer_id, &flags); +retry: + if (!timr) + return -EINVAL; + + kc = timr->kclock; + if (WARN_ON_ONCE(!kc || !kc->timer_set)) + error = -EINVAL; + else + error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); + + if (error == TIMER_RETRY) { + // We already got the old time... + old_spec64 = NULL; + /* Unlocks and relocks the timer if it still exists */ + timr = timer_wait_running(timr, &flags); + goto retry; + } + unlock_timer(timr, flags); + + return error; +} + +/* Set a POSIX.1b interval timer */ +SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + const struct __kernel_itimerspec __user *, new_setting, + struct __kernel_itimerspec __user *, old_setting) +{ + struct itimerspec64 new_spec, old_spec, *rtn; + int error = 0; + + if (!new_setting) + return -EINVAL; + + if (get_itimerspec64(&new_spec, new_setting)) + return -EFAULT; + + rtn = old_setting ? &old_spec : NULL; + error = do_timer_settime(timer_id, flags, &new_spec, rtn); + if (!error && old_setting) { + if (put_itimerspec64(&old_spec, old_setting)) + error = -EFAULT; + } + return error; +} + +#ifdef CONFIG_COMPAT_32BIT_TIME +SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags, + struct old_itimerspec32 __user *, new, + struct old_itimerspec32 __user *, old) +{ + struct itimerspec64 new_spec, old_spec; + struct itimerspec64 *rtn = old ? &old_spec : NULL; + int error = 0; + + if (!new) + return -EINVAL; + if (get_old_itimerspec32(&new_spec, new)) + return -EFAULT; + + error = do_timer_settime(timer_id, flags, &new_spec, rtn); + if (!error && old) { + if (put_old_itimerspec32(&old_spec, old)) + error = -EFAULT; + } + return error; +} +#endif + +int common_timer_del(struct k_itimer *timer) +{ + const struct k_clock *kc = timer->kclock; + + timer->it_interval = 0; + if (kc->timer_try_to_cancel(timer) < 0) + return TIMER_RETRY; + timer->it_active = 0; + return 0; +} + +static inline int timer_delete_hook(struct k_itimer *timer) +{ + const struct k_clock *kc = timer->kclock; + + if (WARN_ON_ONCE(!kc || !kc->timer_del)) + return -EINVAL; + return kc->timer_del(timer); +} + +/* Delete a POSIX.1b interval timer. */ +SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) +{ + struct k_itimer *timer; + unsigned long flags; + + timer = lock_timer(timer_id, &flags); + +retry_delete: + if (!timer) + return -EINVAL; + + if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { + /* Unlocks and relocks the timer if it still exists */ + timer = timer_wait_running(timer, &flags); + goto retry_delete; + } + + spin_lock(¤t->sighand->siglock); + list_del(&timer->list); + spin_unlock(¤t->sighand->siglock); + /* + * A concurrent lookup could check timer::it_signal lockless. It + * will reevaluate with timer::it_lock held and observe the NULL. + */ + WRITE_ONCE(timer->it_signal, NULL); + + unlock_timer(timer, flags); + posix_timer_unhash_and_free(timer); + return 0; +} + +/* + * Delete a timer if it is armed, remove it from the hash and schedule it + * for RCU freeing. + */ +static void itimer_delete(struct k_itimer *timer) +{ + unsigned long flags; + + /* + * irqsave is required to make timer_wait_running() work. + */ + spin_lock_irqsave(&timer->it_lock, flags); + +retry_delete: + /* + * Even if the timer is not longer accessible from other tasks + * it still might be armed and queued in the underlying timer + * mechanism. Worse, that timer mechanism might run the expiry + * function concurrently. + */ + if (timer_delete_hook(timer) == TIMER_RETRY) { + /* + * Timer is expired concurrently, prevent livelocks + * and pointless spinning on RT. + * + * timer_wait_running() drops timer::it_lock, which opens + * the possibility for another task to delete the timer. + * + * That's not possible here because this is invoked from + * do_exit() only for the last thread of the thread group. + * So no other task can access and delete that timer. + */ + if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer)) + return; + + goto retry_delete; + } + list_del(&timer->list); + + /* + * Setting timer::it_signal to NULL is technically not required + * here as nothing can access the timer anymore legitimately via + * the hash table. Set it to NULL nevertheless so that all deletion + * paths are consistent. + */ + WRITE_ONCE(timer->it_signal, NULL); + + spin_unlock_irqrestore(&timer->it_lock, flags); + posix_timer_unhash_and_free(timer); +} + +/* + * Invoked from do_exit() when the last thread of a thread group exits. + * At that point no other task can access the timers of the dying + * task anymore. + */ +void exit_itimers(struct task_struct *tsk) +{ + struct list_head timers; + struct k_itimer *tmr; + + if (list_empty(&tsk->signal->posix_timers)) + return; + + /* Protect against concurrent read via /proc/$PID/timers */ + spin_lock_irq(&tsk->sighand->siglock); + list_replace_init(&tsk->signal->posix_timers, &timers); + spin_unlock_irq(&tsk->sighand->siglock); + + /* The timers are not longer accessible via tsk::signal */ + while (!list_empty(&timers)) { + tmr = list_first_entry(&timers, struct k_itimer, list); + itimer_delete(tmr); + } +} + +SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + const struct __kernel_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 new_tp; + + if (!kc || !kc->clock_set) + return -EINVAL; + + if (get_timespec64(&new_tp, tp)) + return -EFAULT; + + /* + * Permission checks have to be done inside the clock specific + * setter callback. + */ + return kc->clock_set(which_clock, &new_tp); +} + +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct __kernel_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 kernel_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_get_timespec(which_clock, &kernel_tp); + + if (!error && put_timespec64(&kernel_tp, tp)) + error = -EFAULT; + + return error; +} + +int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * ktx) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + + if (!kc) + return -EINVAL; + if (!kc->clock_adj) + return -EOPNOTSUPP; + + return kc->clock_adj(which_clock, ktx); +} + +SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, + struct __kernel_timex __user *, utx) +{ + struct __kernel_timex ktx; + int err; + + if (copy_from_user(&ktx, utx, sizeof(ktx))) + return -EFAULT; + + err = do_clock_adjtime(which_clock, &ktx); + + if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx))) + return -EFAULT; + + return err; +} + +/** + * sys_clock_getres - Get the resolution of a clock + * @which_clock: The clock to get the resolution for + * @tp: Pointer to a a user space timespec64 for storage + * + * POSIX defines: + * + * "The clock_getres() function shall return the resolution of any + * clock. Clock resolutions are implementation-defined and cannot be set by + * a process. If the argument res is not NULL, the resolution of the + * specified clock shall be stored in the location pointed to by res. If + * res is NULL, the clock resolution is not returned. If the time argument + * of clock_settime() is not a multiple of res, then the value is truncated + * to a multiple of res." + * + * Due to the various hardware constraints the real resolution can vary + * wildly and even change during runtime when the underlying devices are + * replaced. The kernel also can use hardware devices with different + * resolutions for reading the time and for arming timers. + * + * The kernel therefore deviates from the POSIX spec in various aspects: + * + * 1) The resolution returned to user space + * + * For CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_TAI, + * CLOCK_REALTIME_ALARM, CLOCK_BOOTTIME_ALAREM and CLOCK_MONOTONIC_RAW + * the kernel differentiates only two cases: + * + * I) Low resolution mode: + * + * When high resolution timers are disabled at compile or runtime + * the resolution returned is nanoseconds per tick, which represents + * the precision at which timers expire. + * + * II) High resolution mode: + * + * When high resolution timers are enabled the resolution returned + * is always one nanosecond independent of the actual resolution of + * the underlying hardware devices. + * + * For CLOCK_*_ALARM the actual resolution depends on system + * state. When system is running the resolution is the same as the + * resolution of the other clocks. During suspend the actual + * resolution is the resolution of the underlying RTC device which + * might be way less precise than the clockevent device used during + * running state. + * + * For CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE the resolution + * returned is always nanoseconds per tick. + * + * For CLOCK_PROCESS_CPUTIME and CLOCK_THREAD_CPUTIME the resolution + * returned is always one nanosecond under the assumption that the + * underlying scheduler clock has a better resolution than nanoseconds + * per tick. + * + * For dynamic POSIX clocks (PTP devices) the resolution returned is + * always one nanosecond. + * + * 2) Affect on sys_clock_settime() + * + * The kernel does not truncate the time which is handed in to + * sys_clock_settime(). The kernel internal timekeeping is always using + * nanoseconds precision independent of the clocksource device which is + * used to read the time from. The resolution of that device only + * affects the presicion of the time returned by sys_clock_gettime(). + * + * Returns: + * 0 Success. @tp contains the resolution + * -EINVAL @which_clock is not a valid clock ID + * -EFAULT Copying the resolution to @tp faulted + * -ENODEV Dynamic POSIX clock is not backed by a device + * -EOPNOTSUPP Dynamic POSIX clock does not support getres() + */ +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, + struct __kernel_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 rtn_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_getres(which_clock, &rtn_tp); + + if (!error && tp && put_timespec64(&rtn_tp, tp)) + error = -EFAULT; + + return error; +} + +#ifdef CONFIG_COMPAT_32BIT_TIME + +SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 ts; + + if (!kc || !kc->clock_set) + return -EINVAL; + + if (get_old_timespec32(&ts, tp)) + return -EFAULT; + + return kc->clock_set(which_clock, &ts); +} + +SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 ts; + int err; + + if (!kc) + return -EINVAL; + + err = kc->clock_get_timespec(which_clock, &ts); + + if (!err && put_old_timespec32(&ts, tp)) + err = -EFAULT; + + return err; +} + +SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock, + struct old_timex32 __user *, utp) +{ + struct __kernel_timex ktx; + int err; + + err = get_old_timex32(&ktx, utp); + if (err) + return err; + + err = do_clock_adjtime(which_clock, &ktx); + + if (err >= 0 && put_old_timex32(utp, &ktx)) + return -EFAULT; + + return err; +} + +SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 ts; + int err; + + if (!kc) + return -EINVAL; + + err = kc->clock_getres(which_clock, &ts); + if (!err && tp && put_old_timespec32(&ts, tp)) + return -EFAULT; + + return err; +} + +#endif + +/* + * sys_clock_nanosleep() for CLOCK_REALTIME and CLOCK_TAI + */ +static int common_nsleep(const clockid_t which_clock, int flags, + const struct timespec64 *rqtp) +{ + ktime_t texp = timespec64_to_ktime(*rqtp); + + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); +} + +/* + * sys_clock_nanosleep() for CLOCK_MONOTONIC and CLOCK_BOOTTIME + * + * Absolute nanosleeps for these clocks are time-namespace adjusted. + */ +static int common_nsleep_timens(const clockid_t which_clock, int flags, + const struct timespec64 *rqtp) +{ + ktime_t texp = timespec64_to_ktime(*rqtp); + + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); +} + +SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, + const struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 t; + + if (!kc) + return -EINVAL; + if (!kc->nsleep) + return -EOPNOTSUPP; + + if (get_timespec64(&t, rqtp)) + return -EFAULT; + + if (!timespec64_valid(&t)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; + current->restart_block.nanosleep.rmtp = rmtp; + + return kc->nsleep(which_clock, flags, &t); +} + +#ifdef CONFIG_COMPAT_32BIT_TIME + +SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 t; + + if (!kc) + return -EINVAL; + if (!kc->nsleep) + return -EOPNOTSUPP; + + if (get_old_timespec32(&t, rqtp)) + return -EFAULT; + + if (!timespec64_valid(&t)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.nanosleep.compat_rmtp = rmtp; + + return kc->nsleep(which_clock, flags, &t); +} + +#endif + +static const struct k_clock clock_realtime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get_timespec = posix_get_realtime_timespec, + .clock_get_ktime = posix_get_realtime_ktime, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = common_nsleep, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, + .timer_arm = common_hrtimer_arm, +}; + +static const struct k_clock clock_monotonic = { + .clock_getres = posix_get_hrtimer_res, + .clock_get_timespec = posix_get_monotonic_timespec, + .clock_get_ktime = posix_get_monotonic_ktime, + .nsleep = common_nsleep_timens, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, + .timer_arm = common_hrtimer_arm, +}; + +static const struct k_clock clock_monotonic_raw = { + .clock_getres = posix_get_hrtimer_res, + .clock_get_timespec = posix_get_monotonic_raw, +}; + +static const struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get_timespec = posix_get_realtime_coarse, +}; + +static const struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get_timespec = posix_get_monotonic_coarse, +}; + +static const struct k_clock clock_tai = { + .clock_getres = posix_get_hrtimer_res, + .clock_get_ktime = posix_get_tai_ktime, + .clock_get_timespec = posix_get_tai_timespec, + .nsleep = common_nsleep, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, + .timer_arm = common_hrtimer_arm, +}; + +static const struct k_clock clock_boottime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get_ktime = posix_get_boottime_ktime, + .clock_get_timespec = posix_get_boottime_timespec, + .nsleep = common_nsleep_timens, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, + .timer_arm = common_hrtimer_arm, +}; + +static const struct k_clock * const posix_clocks[] = { + [CLOCK_REALTIME] = &clock_realtime, + [CLOCK_MONOTONIC] = &clock_monotonic, + [CLOCK_PROCESS_CPUTIME_ID] = &clock_process, + [CLOCK_THREAD_CPUTIME_ID] = &clock_thread, + [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, + [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, + [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, + [CLOCK_BOOTTIME] = &clock_boottime, + [CLOCK_REALTIME_ALARM] = &alarm_clock, + [CLOCK_BOOTTIME_ALARM] = &alarm_clock, + [CLOCK_TAI] = &clock_tai, +}; + +static const struct k_clock *clockid_to_kclock(const clockid_t id) +{ + clockid_t idx = id; + + if (id < 0) { + return (id & CLOCKFD_MASK) == CLOCKFD ? + &clock_posix_dynamic : &clock_posix_cpu; + } + + if (id >= ARRAY_SIZE(posix_clocks)) + return NULL; + + return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; +} diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h new file mode 100644 index 0000000000..f32a2ebba9 --- /dev/null +++ b/kernel/time/posix-timers.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#define TIMER_RETRY 1 + +struct k_clock { + int (*clock_getres)(const clockid_t which_clock, + struct timespec64 *tp); + int (*clock_set)(const clockid_t which_clock, + const struct timespec64 *tp); + /* Returns the clock value in the current time namespace. */ + int (*clock_get_timespec)(const clockid_t which_clock, + struct timespec64 *tp); + /* Returns the clock value in the root time namespace. */ + ktime_t (*clock_get_ktime)(const clockid_t which_clock); + int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx); + int (*timer_create)(struct k_itimer *timer); + int (*nsleep)(const clockid_t which_clock, int flags, + const struct timespec64 *); + int (*timer_set)(struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting); + int (*timer_del)(struct k_itimer *timr); + void (*timer_get)(struct k_itimer *timr, + struct itimerspec64 *cur_setting); + void (*timer_rearm)(struct k_itimer *timr); + s64 (*timer_forward)(struct k_itimer *timr, ktime_t now); + ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now); + int (*timer_try_to_cancel)(struct k_itimer *timr); + void (*timer_arm)(struct k_itimer *timr, ktime_t expires, + bool absolute, bool sigev_none); + void (*timer_wait_running)(struct k_itimer *timr); +}; + +extern const struct k_clock clock_posix_cpu; +extern const struct k_clock clock_posix_dynamic; +extern const struct k_clock clock_process; +extern const struct k_clock clock_thread; +extern const struct k_clock alarm_clock; + +int posix_timer_event(struct k_itimer *timr, int si_private); + +void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting); +int common_timer_set(struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting); +int common_timer_del(struct k_itimer *timer); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c new file mode 100644 index 0000000000..68d6c1190a --- /dev/null +++ b/kernel/time/sched_clock.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generic sched_clock() support, to extend low level hardware time + * counters to full 64-bit ns values. + */ +#include <linux/clocksource.h> +#include <linux/init.h> +#include <linux/jiffies.h> +#include <linux/ktime.h> +#include <linux/kernel.h> +#include <linux/math.h> +#include <linux/moduleparam.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/syscore_ops.h> +#include <linux/hrtimer.h> +#include <linux/sched_clock.h> +#include <linux/seqlock.h> +#include <linux/bitops.h> + +#include "timekeeping.h" + +/** + * struct clock_data - all data needed for sched_clock() (including + * registration of a new clock source) + * + * @seq: Sequence counter for protecting updates. The lowest + * bit is the index for @read_data. + * @read_data: Data required to read from sched_clock. + * @wrap_kt: Duration for which clock can run before wrapping. + * @rate: Tick rate of the registered clock. + * @actual_read_sched_clock: Registered hardware level clock read function. + * + * The ordering of this structure has been chosen to optimize cache + * performance. In particular 'seq' and 'read_data[0]' (combined) should fit + * into a single 64-byte cache line. + */ +struct clock_data { + seqcount_latch_t seq; + struct clock_read_data read_data[2]; + ktime_t wrap_kt; + unsigned long rate; + + u64 (*actual_read_sched_clock)(void); +}; + +static struct hrtimer sched_clock_timer; +static int irqtime = -1; + +core_param(irqtime, irqtime, int, 0400); + +static u64 notrace jiffy_sched_clock_read(void) +{ + /* + * We don't need to use get_jiffies_64 on 32-bit arches here + * because we register with BITS_PER_LONG + */ + return (u64)(jiffies - INITIAL_JIFFIES); +} + +static struct clock_data cd ____cacheline_aligned = { + .read_data[0] = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, + .actual_read_sched_clock = jiffy_sched_clock_read, +}; + +static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift) +{ + return (cyc * mult) >> shift; +} + +notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq) +{ + *seq = raw_read_seqcount_latch(&cd.seq); + return cd.read_data + (*seq & 1); +} + +notrace int sched_clock_read_retry(unsigned int seq) +{ + return raw_read_seqcount_latch_retry(&cd.seq, seq); +} + +unsigned long long noinstr sched_clock_noinstr(void) +{ + struct clock_read_data *rd; + unsigned int seq; + u64 cyc, res; + + do { + seq = raw_read_seqcount_latch(&cd.seq); + rd = cd.read_data + (seq & 1); + + cyc = (rd->read_sched_clock() - rd->epoch_cyc) & + rd->sched_clock_mask; + res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); + } while (raw_read_seqcount_latch_retry(&cd.seq, seq)); + + return res; +} + +unsigned long long notrace sched_clock(void) +{ + unsigned long long ns; + preempt_disable_notrace(); + ns = sched_clock_noinstr(); + preempt_enable_notrace(); + return ns; +} + +/* + * Updating the data required to read the clock. + * + * sched_clock() will never observe mis-matched data even if called from + * an NMI. We do this by maintaining an odd/even copy of the data and + * steering sched_clock() to one or the other using a sequence counter. + * In order to preserve the data cache profile of sched_clock() as much + * as possible the system reverts back to the even copy when the update + * completes; the odd copy is used *only* during an update. + */ +static void update_clock_read_data(struct clock_read_data *rd) +{ + /* update the backup (odd) copy with the new data */ + cd.read_data[1] = *rd; + + /* steer readers towards the odd copy */ + raw_write_seqcount_latch(&cd.seq); + + /* now its safe for us to update the normal (even) copy */ + cd.read_data[0] = *rd; + + /* switch readers back to the even copy */ + raw_write_seqcount_latch(&cd.seq); +} + +/* + * Atomically update the sched_clock() epoch. + */ +static void update_sched_clock(void) +{ + u64 cyc; + u64 ns; + struct clock_read_data rd; + + rd = cd.read_data[0]; + + cyc = cd.actual_read_sched_clock(); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); + + rd.epoch_ns = ns; + rd.epoch_cyc = cyc; + + update_clock_read_data(&rd); +} + +static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) +{ + update_sched_clock(); + hrtimer_forward_now(hrt, cd.wrap_kt); + + return HRTIMER_RESTART; +} + +void __init +sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) +{ + u64 res, wrap, new_mask, new_epoch, cyc, ns; + u32 new_mult, new_shift; + unsigned long r, flags; + char r_unit; + struct clock_read_data rd; + + if (cd.rate > rate) + return; + + /* Cannot register a sched_clock with interrupts on */ + local_irq_save(flags); + + /* Calculate the mult/shift to convert counter ticks to ns. */ + clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); + + new_mask = CLOCKSOURCE_MASK(bits); + cd.rate = rate; + + /* Calculate how many nanosecs until we risk wrapping */ + wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); + cd.wrap_kt = ns_to_ktime(wrap); + + rd = cd.read_data[0]; + + /* Update epoch for new counter and update 'epoch_ns' from old counter*/ + new_epoch = read(); + cyc = cd.actual_read_sched_clock(); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); + cd.actual_read_sched_clock = read; + + rd.read_sched_clock = read; + rd.sched_clock_mask = new_mask; + rd.mult = new_mult; + rd.shift = new_shift; + rd.epoch_cyc = new_epoch; + rd.epoch_ns = ns; + + update_clock_read_data(&rd); + + if (sched_clock_timer.function != NULL) { + /* update timeout for clock wrap */ + hrtimer_start(&sched_clock_timer, cd.wrap_kt, + HRTIMER_MODE_REL_HARD); + } + + r = rate; + if (r >= 4000000) { + r = DIV_ROUND_CLOSEST(r, 1000000); + r_unit = 'M'; + } else if (r >= 4000) { + r = DIV_ROUND_CLOSEST(r, 1000); + r_unit = 'k'; + } else { + r_unit = ' '; + } + + /* Calculate the ns resolution of this counter */ + res = cyc_to_ns(1ULL, new_mult, new_shift); + + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", + bits, r, r_unit, res, wrap); + + /* Enable IRQ time accounting if we have a fast enough sched_clock() */ + if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) + enable_sched_clock_irqtime(); + + local_irq_restore(flags); + + pr_debug("Registered %pS as sched_clock source\n", read); +} + +void __init generic_sched_clock_init(void) +{ + /* + * If no sched_clock() function has been provided at that point, + * make it the final one. + */ + if (cd.actual_read_sched_clock == jiffy_sched_clock_read) + sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); + + update_sched_clock(); + + /* + * Start the timer to keep sched_clock() properly updated and + * sets the initial epoch. + */ + hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); + sched_clock_timer.function = sched_clock_poll; + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); +} + +/* + * Clock read function for use when the clock is suspended. + * + * This function makes it appear to sched_clock() as if the clock + * stopped counting at its last update. + * + * This function must only be called from the critical + * section in sched_clock(). It relies on the read_seqcount_retry() + * at the end of the critical section to be sure we observe the + * correct copy of 'epoch_cyc'. + */ +static u64 notrace suspended_sched_clock_read(void) +{ + unsigned int seq = raw_read_seqcount_latch(&cd.seq); + + return cd.read_data[seq & 1].epoch_cyc; +} + +int sched_clock_suspend(void) +{ + struct clock_read_data *rd = &cd.read_data[0]; + + update_sched_clock(); + hrtimer_cancel(&sched_clock_timer); + rd->read_sched_clock = suspended_sched_clock_read; + + return 0; +} + +void sched_clock_resume(void) +{ + struct clock_read_data *rd = &cd.read_data[0]; + + rd->epoch_cyc = cd.actual_read_sched_clock(); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); + rd->read_sched_clock = cd.actual_read_sched_clock; +} + +static struct syscore_ops sched_clock_ops = { + .suspend = sched_clock_suspend, + .resume = sched_clock_resume, +}; + +static int __init sched_clock_syscore_init(void) +{ + register_syscore_ops(&sched_clock_ops); + + return 0; +} +device_initcall(sched_clock_syscore_init); diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c new file mode 100644 index 0000000000..20d5df6315 --- /dev/null +++ b/kernel/time/test_udelay.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * udelay() test kernel module + * + * Test is executed by writing and reading to /sys/kernel/debug/udelay_test + * Tests are configured by writing: USECS ITERATIONS + * Tests are executed by reading from the same file. + * Specifying usecs of 0 or negative values will run multiples tests. + * + * Copyright (C) 2014 Google, Inc. + */ + +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/ktime.h> +#include <linux/module.h> +#include <linux/uaccess.h> + +#define DEFAULT_ITERATIONS 100 + +#define DEBUGFS_FILENAME "udelay_test" + +static DEFINE_MUTEX(udelay_test_lock); +static int udelay_test_usecs; +static int udelay_test_iterations = DEFAULT_ITERATIONS; + +static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters) +{ + int min = 0, max = 0, fail_count = 0; + uint64_t sum = 0; + uint64_t avg; + int i; + /* Allow udelay to be up to 0.5% fast */ + int allowed_error_ns = usecs * 5; + + for (i = 0; i < iters; ++i) { + s64 kt1, kt2; + int time_passed; + + kt1 = ktime_get_ns(); + udelay(usecs); + kt2 = ktime_get_ns(); + time_passed = kt2 - kt1; + + if (i == 0 || time_passed < min) + min = time_passed; + if (i == 0 || time_passed > max) + max = time_passed; + if ((time_passed + allowed_error_ns) / 1000 < usecs) + ++fail_count; + WARN_ON(time_passed < 0); + sum += time_passed; + } + + avg = sum; + do_div(avg, iters); + seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d", + usecs, iters, usecs * 1000, + (usecs * 1000) - allowed_error_ns, min, avg, max); + if (fail_count) + seq_printf(s, " FAIL=%d", fail_count); + seq_puts(s, "\n"); + + return 0; +} + +static int udelay_test_show(struct seq_file *s, void *v) +{ + int usecs; + int iters; + int ret = 0; + + mutex_lock(&udelay_test_lock); + usecs = udelay_test_usecs; + iters = udelay_test_iterations; + mutex_unlock(&udelay_test_lock); + + if (usecs > 0 && iters > 0) { + return udelay_test_single(s, usecs, iters); + } else if (usecs == 0) { + struct timespec64 ts; + + ktime_get_ts64(&ts); + seq_printf(s, "udelay() test (lpj=%ld kt=%lld.%09ld)\n", + loops_per_jiffy, (s64)ts.tv_sec, ts.tv_nsec); + seq_puts(s, "usage:\n"); + seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); + seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); + } + + return ret; +} + +static int udelay_test_open(struct inode *inode, struct file *file) +{ + return single_open(file, udelay_test_show, inode->i_private); +} + +static ssize_t udelay_test_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + char lbuf[32]; + int ret; + int usecs; + int iters; + + if (count >= sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + lbuf[count] = '\0'; + + ret = sscanf(lbuf, "%d %d", &usecs, &iters); + if (ret < 1) + return -EINVAL; + else if (ret < 2) + iters = DEFAULT_ITERATIONS; + + mutex_lock(&udelay_test_lock); + udelay_test_usecs = usecs; + udelay_test_iterations = iters; + mutex_unlock(&udelay_test_lock); + + return count; +} + +static const struct file_operations udelay_test_debugfs_ops = { + .owner = THIS_MODULE, + .open = udelay_test_open, + .read = seq_read, + .write = udelay_test_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init udelay_test_init(void) +{ + mutex_lock(&udelay_test_lock); + debugfs_create_file(DEBUGFS_FILENAME, S_IRUSR, NULL, NULL, + &udelay_test_debugfs_ops); + mutex_unlock(&udelay_test_lock); + + return 0; +} + +module_init(udelay_test_init); + +static void __exit udelay_test_exit(void) +{ + mutex_lock(&udelay_test_lock); + debugfs_lookup_and_remove(DEBUGFS_FILENAME, NULL); + mutex_unlock(&udelay_test_lock); +} + +module_exit(udelay_test_exit); + +MODULE_AUTHOR("David Riley <davidriley@chromium.org>"); +MODULE_LICENSE("GPL"); diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c new file mode 100644 index 0000000000..e28f9210f8 --- /dev/null +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Emulate a local clock event device via a pseudo clock device. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/clockchips.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/module.h> + +#include "tick-internal.h" + +static struct hrtimer bctimer; + +static int bc_shutdown(struct clock_event_device *evt) +{ + /* + * Note, we cannot cancel the timer here as we might + * run into the following live lock scenario: + * + * cpu 0 cpu1 + * lock(broadcast_lock); + * hrtimer_interrupt() + * bc_handler() + * tick_handle_oneshot_broadcast(); + * lock(broadcast_lock); + * hrtimer_cancel() + * wait_for_callback() + */ + hrtimer_try_to_cancel(&bctimer); + return 0; +} + +/* + * This is called from the guts of the broadcast code when the cpu + * which is about to enter idle has the earliest broadcast timer event. + */ +static int bc_set_next(ktime_t expires, struct clock_event_device *bc) +{ + /* + * This is called either from enter/exit idle code or from the + * broadcast handler. In all cases tick_broadcast_lock is held. + * + * hrtimer_cancel() cannot be called here neither from the + * broadcast handler nor from the enter/exit idle code. The idle + * code can run into the problem described in bc_shutdown() and the + * broadcast handler cannot wait for itself to complete for obvious + * reasons. + * + * Each caller tries to arm the hrtimer on its own CPU, but if the + * hrtimer callback function is currently running, then + * hrtimer_start() cannot move it and the timer stays on the CPU on + * which it is assigned at the moment. + */ + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD); + /* + * The core tick broadcast mode expects bc->bound_on to be set + * correctly to prevent a CPU which has the broadcast hrtimer + * armed from going deep idle. + * + * As tick_broadcast_lock is held, nothing can change the cpu + * base which was just established in hrtimer_start() above. So + * the below access is safe even without holding the hrtimer + * base lock. + */ + bc->bound_on = bctimer.base->cpu_base->cpu; + + return 0; +} + +static struct clock_event_device ce_broadcast_hrtimer = { + .name = "bc_hrtimer", + .set_state_shutdown = bc_shutdown, + .set_next_ktime = bc_set_next, + .features = CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_KTIME | + CLOCK_EVT_FEAT_HRTIMER, + .rating = 0, + .bound_on = -1, + .min_delta_ns = 1, + .max_delta_ns = KTIME_MAX, + .min_delta_ticks = 1, + .max_delta_ticks = ULONG_MAX, + .mult = 1, + .shift = 0, + .cpumask = cpu_possible_mask, +}; + +static enum hrtimer_restart bc_handler(struct hrtimer *t) +{ + ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); + + return HRTIMER_NORESTART; +} + +void tick_setup_hrtimer_broadcast(void) +{ + hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + bctimer.function = bc_handler; + clockevents_register_device(&ce_broadcast_hrtimer); +} diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c new file mode 100644 index 0000000000..771d1e0403 --- /dev/null +++ b/kernel/time/tick-broadcast.c @@ -0,0 +1,1215 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains functions which emulate a local clock-event + * device via a broadcast event source. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/module.h> + +#include "tick-internal.h" + +/* + * Broadcast support for broken x86 hardware, where the local apic + * timer stops in C3 state. + */ + +static struct tick_device tick_broadcast_device; +static cpumask_var_t tick_broadcast_mask __cpumask_var_read_mostly; +static cpumask_var_t tick_broadcast_on __cpumask_var_read_mostly; +static cpumask_var_t tmpmask __cpumask_var_read_mostly; +static int tick_broadcast_forced; + +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); + +#ifdef CONFIG_TICK_ONESHOT +static DEFINE_PER_CPU(struct clock_event_device *, tick_oneshot_wakeup_device); + +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic); +static void tick_broadcast_clear_oneshot(int cpu); +static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); +# ifdef CONFIG_HOTPLUG_CPU +static void tick_broadcast_oneshot_offline(unsigned int cpu); +# endif +#else +static inline void +tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) { BUG(); } +static inline void tick_broadcast_clear_oneshot(int cpu) { } +static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } +# ifdef CONFIG_HOTPLUG_CPU +static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { } +# endif +#endif + +/* + * Debugging: see timer_list.c + */ +struct tick_device *tick_get_broadcast_device(void) +{ + return &tick_broadcast_device; +} + +struct cpumask *tick_get_broadcast_mask(void) +{ + return tick_broadcast_mask; +} + +static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu); + +const struct clock_event_device *tick_get_wakeup_device(int cpu) +{ + return tick_get_oneshot_wakeup_device(cpu); +} + +/* + * Start the device in periodic mode + */ +static void tick_broadcast_start_periodic(struct clock_event_device *bc) +{ + if (bc) + tick_setup_periodic(bc, 1); +} + +/* + * Check, if the device can be utilized as broadcast device: + */ +static bool tick_check_broadcast_device(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || + (newdev->features & CLOCK_EVT_FEAT_PERCPU) || + (newdev->features & CLOCK_EVT_FEAT_C3STOP)) + return false; + + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT && + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + + return !curdev || newdev->rating > curdev->rating; +} + +#ifdef CONFIG_TICK_ONESHOT +static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu) +{ + return per_cpu(tick_oneshot_wakeup_device, cpu); +} + +static void tick_oneshot_wakeup_handler(struct clock_event_device *wd) +{ + /* + * If we woke up early and the tick was reprogrammed in the + * meantime then this may be spurious but harmless. + */ + tick_receive_broadcast(); +} + +static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev, + int cpu) +{ + struct clock_event_device *curdev = tick_get_oneshot_wakeup_device(cpu); + + if (!newdev) + goto set_device; + + if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || + (newdev->features & CLOCK_EVT_FEAT_C3STOP)) + return false; + + if (!(newdev->features & CLOCK_EVT_FEAT_PERCPU) || + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + + if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) + return false; + + if (curdev && newdev->rating <= curdev->rating) + return false; + + if (!try_module_get(newdev->owner)) + return false; + + newdev->event_handler = tick_oneshot_wakeup_handler; +set_device: + clockevents_exchange_device(curdev, newdev); + per_cpu(tick_oneshot_wakeup_device, cpu) = newdev; + return true; +} +#else +static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu) +{ + return NULL; +} + +static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev, + int cpu) +{ + return false; +} +#endif + +/* + * Conditionally install/replace broadcast device + */ +void tick_install_broadcast_device(struct clock_event_device *dev, int cpu) +{ + struct clock_event_device *cur = tick_broadcast_device.evtdev; + + if (tick_set_oneshot_wakeup_device(dev, cpu)) + return; + + if (!tick_check_broadcast_device(cur, dev)) + return; + + if (!try_module_get(dev->owner)) + return; + + clockevents_exchange_device(cur, dev); + if (cur) + cur->event_handler = clockevents_handle_noop; + tick_broadcast_device.evtdev = dev; + if (!cpumask_empty(tick_broadcast_mask)) + tick_broadcast_start_periodic(dev); + + if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return; + + /* + * If the system already runs in oneshot mode, switch the newly + * registered broadcast device to oneshot mode explicitly. + */ + if (tick_broadcast_oneshot_active()) { + tick_broadcast_switch_to_oneshot(); + return; + } + + /* + * Inform all cpus about this. We might be in a situation + * where we did not switch to oneshot mode because the per cpu + * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack + * of a oneshot capable broadcast device. Without that + * notification the systems stays stuck in periodic mode + * forever. + */ + tick_clock_notify(); +} + +/* + * Check, if the device is the broadcast device + */ +int tick_is_broadcast_device(struct clock_event_device *dev) +{ + return (dev && tick_broadcast_device.evtdev == dev); +} + +int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) +{ + int ret = -ENODEV; + + if (tick_is_broadcast_device(dev)) { + raw_spin_lock(&tick_broadcast_lock); + ret = __clockevents_update_freq(dev, freq); + raw_spin_unlock(&tick_broadcast_lock); + } + return ret; +} + + +static void err_broadcast(const struct cpumask *mask) +{ + pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); +} + +static void tick_device_setup_broadcast_func(struct clock_event_device *dev) +{ + if (!dev->broadcast) + dev->broadcast = tick_broadcast; + if (!dev->broadcast) { + pr_warn_once("%s depends on broadcast, but no broadcast function available\n", + dev->name); + dev->broadcast = err_broadcast; + } +} + +/* + * Check, if the device is dysfunctional and a placeholder, which + * needs to be handled by the broadcast device. + */ +int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + unsigned long flags; + int ret = 0; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + /* + * Devices might be registered with both periodic and oneshot + * mode disabled. This signals, that the device needs to be + * operated from the broadcast device and is a placeholder for + * the cpu local device. + */ + if (!tick_device_is_functional(dev)) { + dev->event_handler = tick_handle_periodic; + tick_device_setup_broadcast_func(dev); + cpumask_set_cpu(cpu, tick_broadcast_mask); + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc, false); + ret = 1; + } else { + /* + * Clear the broadcast bit for this cpu if the + * device is not power state affected. + */ + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + cpumask_clear_cpu(cpu, tick_broadcast_mask); + else + tick_device_setup_broadcast_func(dev); + + /* + * Clear the broadcast bit if the CPU is not in + * periodic broadcast on state. + */ + if (!cpumask_test_cpu(cpu, tick_broadcast_on)) + cpumask_clear_cpu(cpu, tick_broadcast_mask); + + switch (tick_broadcast_device.mode) { + case TICKDEV_MODE_ONESHOT: + /* + * If the system is in oneshot mode we can + * unconditionally clear the oneshot mask bit, + * because the CPU is running and therefore + * not in an idle state which causes the power + * state affected device to stop. Let the + * caller initialize the device. + */ + tick_broadcast_clear_oneshot(cpu); + ret = 0; + break; + + case TICKDEV_MODE_PERIODIC: + /* + * If the system is in periodic mode, check + * whether the broadcast device can be + * switched off now. + */ + if (cpumask_empty(tick_broadcast_mask) && bc) + clockevents_shutdown(bc); + /* + * If we kept the cpu in the broadcast mask, + * tell the caller to leave the per cpu device + * in shutdown state. The periodic interrupt + * is delivered by the broadcast device, if + * the broadcast device exists and is not + * hrtimer based. + */ + if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + ret = cpumask_test_cpu(cpu, tick_broadcast_mask); + break; + default: + break; + } + } + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); + return ret; +} + +int tick_receive_broadcast(void) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + struct clock_event_device *evt = td->evtdev; + + if (!evt) + return -ENODEV; + + if (!evt->event_handler) + return -EINVAL; + + evt->event_handler(evt); + return 0; +} + +/* + * Broadcast the event to the cpus, which are set in the mask (mangled). + */ +static bool tick_do_broadcast(struct cpumask *mask) +{ + int cpu = smp_processor_id(); + struct tick_device *td; + bool local = false; + + /* + * Check, if the current cpu is in the mask + */ + if (cpumask_test_cpu(cpu, mask)) { + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + cpumask_clear_cpu(cpu, mask); + /* + * We only run the local handler, if the broadcast + * device is not hrtimer based. Otherwise we run into + * a hrtimer recursion. + * + * local timer_interrupt() + * local_handler() + * expire_hrtimers() + * bc_handler() + * local_handler() + * expire_hrtimers() + */ + local = !(bc->features & CLOCK_EVT_FEAT_HRTIMER); + } + + if (!cpumask_empty(mask)) { + /* + * It might be necessary to actually check whether the devices + * have different broadcast functions. For now, just use the + * one of the first device. This works as long as we have this + * misfeature only on x86 (lapic) + */ + td = &per_cpu(tick_cpu_device, cpumask_first(mask)); + td->evtdev->broadcast(mask); + } + return local; +} + +/* + * Periodic broadcast: + * - invoke the broadcast handlers + */ +static bool tick_do_periodic_broadcast(void) +{ + cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); + return tick_do_broadcast(tmpmask); +} + +/* + * Event handler for periodic broadcast ticks + */ +static void tick_handle_periodic_broadcast(struct clock_event_device *dev) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + bool bc_local; + + raw_spin_lock(&tick_broadcast_lock); + + /* Handle spurious interrupts gracefully */ + if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) { + raw_spin_unlock(&tick_broadcast_lock); + return; + } + + bc_local = tick_do_periodic_broadcast(); + + if (clockevent_state_oneshot(dev)) { + ktime_t next = ktime_add_ns(dev->next_event, TICK_NSEC); + + clockevents_program_event(dev, next, true); + } + raw_spin_unlock(&tick_broadcast_lock); + + /* + * We run the handler of the local cpu after dropping + * tick_broadcast_lock because the handler might deadlock when + * trying to switch to oneshot mode. + */ + if (bc_local) + td->evtdev->event_handler(td->evtdev); +} + +/** + * tick_broadcast_control - Enable/disable or force broadcast mode + * @mode: The selected broadcast mode + * + * Called when the system enters a state where affected tick devices + * might stop. Note: TICK_BROADCAST_FORCE cannot be undone. + */ +void tick_broadcast_control(enum tick_broadcast_mode mode) +{ + struct clock_event_device *bc, *dev; + struct tick_device *td; + int cpu, bc_stopped; + unsigned long flags; + + /* Protects also the local clockevent device. */ + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + td = this_cpu_ptr(&tick_cpu_device); + dev = td->evtdev; + + /* + * Is the device not affected by the powerstate ? + */ + if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) + goto out; + + if (!tick_device_is_functional(dev)) + goto out; + + cpu = smp_processor_id(); + bc = tick_broadcast_device.evtdev; + bc_stopped = cpumask_empty(tick_broadcast_mask); + + switch (mode) { + case TICK_BROADCAST_FORCE: + tick_broadcast_forced = 1; + fallthrough; + case TICK_BROADCAST_ON: + cpumask_set_cpu(cpu, tick_broadcast_on); + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { + /* + * Only shutdown the cpu local device, if: + * + * - the broadcast device exists + * - the broadcast device is not a hrtimer based one + * - the broadcast device is in periodic mode to + * avoid a hiccup during switch to oneshot mode + */ + if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) && + tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + clockevents_shutdown(dev); + } + break; + + case TICK_BROADCAST_OFF: + if (tick_broadcast_forced) + break; + cpumask_clear_cpu(cpu, tick_broadcast_on); + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) + tick_setup_periodic(dev, 0); + } + break; + } + + if (bc) { + if (cpumask_empty(tick_broadcast_mask)) { + if (!bc_stopped) + clockevents_shutdown(bc); + } else if (bc_stopped) { + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc, false); + } + } +out: + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} +EXPORT_SYMBOL_GPL(tick_broadcast_control); + +/* + * Set the periodic handler depending on broadcast on/off + */ +void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) +{ + if (!broadcast) + dev->event_handler = tick_handle_periodic; + else + dev->event_handler = tick_handle_periodic_broadcast; +} + +#ifdef CONFIG_HOTPLUG_CPU +static void tick_shutdown_broadcast(void) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { + if (bc && cpumask_empty(tick_broadcast_mask)) + clockevents_shutdown(bc); + } +} + +/* + * Remove a CPU from broadcasting + */ +void tick_broadcast_offline(unsigned int cpu) +{ + raw_spin_lock(&tick_broadcast_lock); + cpumask_clear_cpu(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_broadcast_on); + tick_broadcast_oneshot_offline(cpu); + tick_shutdown_broadcast(); + raw_spin_unlock(&tick_broadcast_lock); +} + +#endif + +void tick_suspend_broadcast(void) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + if (bc) + clockevents_shutdown(bc); + + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/* + * This is called from tick_resume_local() on a resuming CPU. That's + * called from the core resume function, tick_unfreeze() and the magic XEN + * resume hackery. + * + * In none of these cases the broadcast device mode can change and the + * bit of the resuming CPU in the broadcast mask is safe as well. + */ +bool tick_resume_check_broadcast(void) +{ + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) + return false; + else + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask); +} + +void tick_resume_broadcast(void) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + + if (bc) { + clockevents_tick_resume(bc); + + switch (tick_broadcast_device.mode) { + case TICKDEV_MODE_PERIODIC: + if (!cpumask_empty(tick_broadcast_mask)) + tick_broadcast_start_periodic(bc); + break; + case TICKDEV_MODE_ONESHOT: + if (!cpumask_empty(tick_broadcast_mask)) + tick_resume_broadcast_oneshot(bc); + break; + } + } + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +#ifdef CONFIG_TICK_ONESHOT + +static cpumask_var_t tick_broadcast_oneshot_mask __cpumask_var_read_mostly; +static cpumask_var_t tick_broadcast_pending_mask __cpumask_var_read_mostly; +static cpumask_var_t tick_broadcast_force_mask __cpumask_var_read_mostly; + +/* + * Exposed for debugging: see timer_list.c + */ +struct cpumask *tick_get_broadcast_oneshot_mask(void) +{ + return tick_broadcast_oneshot_mask; +} + +/* + * Called before going idle with interrupts disabled. Checks whether a + * broadcast event from the other core is about to happen. We detected + * that in tick_broadcast_oneshot_control(). The callsite can use this + * to avoid a deep idle transition as we are about to get the + * broadcast IPI right away. + */ +noinstr int tick_check_broadcast_expired(void) +{ +#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H + return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask)); +#else + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +#endif +} + +/* + * Set broadcast interrupt affinity + */ +static void tick_broadcast_set_affinity(struct clock_event_device *bc, + const struct cpumask *cpumask) +{ + if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ)) + return; + + if (cpumask_equal(bc->cpumask, cpumask)) + return; + + bc->cpumask = cpumask; + irq_set_affinity(bc->irq, bc->cpumask); +} + +static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu, + ktime_t expires) +{ + if (!clockevent_state_oneshot(bc)) + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); + + clockevents_program_event(bc, expires, 1); + tick_broadcast_set_affinity(bc, cpumask_of(cpu)); +} + +static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) +{ + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); +} + +/* + * Called from irq_enter() when idle was interrupted to reenable the + * per cpu device. + */ +void tick_check_oneshot_broadcast_this_cpu(void) +{ + if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + + /* + * We might be in the middle of switching over from + * periodic to oneshot. If the CPU has not yet + * switched over, leave the device alone. + */ + if (td->mode == TICKDEV_MODE_ONESHOT) { + clockevents_switch_state(td->evtdev, + CLOCK_EVT_STATE_ONESHOT); + } + } +} + +/* + * Handle oneshot mode broadcasting + */ +static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) +{ + struct tick_device *td; + ktime_t now, next_event; + int cpu, next_cpu = 0; + bool bc_local; + + raw_spin_lock(&tick_broadcast_lock); + dev->next_event = KTIME_MAX; + next_event = KTIME_MAX; + cpumask_clear(tmpmask); + now = ktime_get(); + /* Find all expired events */ + for_each_cpu(cpu, tick_broadcast_oneshot_mask) { + /* + * Required for !SMP because for_each_cpu() reports + * unconditionally CPU0 as set on UP kernels. + */ + if (!IS_ENABLED(CONFIG_SMP) && + cpumask_empty(tick_broadcast_oneshot_mask)) + break; + + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev->next_event <= now) { + cpumask_set_cpu(cpu, tmpmask); + /* + * Mark the remote cpu in the pending mask, so + * it can avoid reprogramming the cpu local + * timer in tick_broadcast_oneshot_control(). + */ + cpumask_set_cpu(cpu, tick_broadcast_pending_mask); + } else if (td->evtdev->next_event < next_event) { + next_event = td->evtdev->next_event; + next_cpu = cpu; + } + } + + /* + * Remove the current cpu from the pending mask. The event is + * delivered immediately in tick_do_broadcast() ! + */ + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask); + + /* Take care of enforced broadcast requests */ + cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); + cpumask_clear(tick_broadcast_force_mask); + + /* + * Sanity check. Catch the case where we try to broadcast to + * offline cpus. + */ + if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask))) + cpumask_and(tmpmask, tmpmask, cpu_online_mask); + + /* + * Wakeup the cpus which have an expired event. + */ + bc_local = tick_do_broadcast(tmpmask); + + /* + * Two reasons for reprogram: + * + * - The global event did not expire any CPU local + * events. This happens in dyntick mode, as the maximum PIT + * delta is quite small. + * + * - There are pending events on sleeping CPUs which were not + * in the event mask + */ + if (next_event != KTIME_MAX) + tick_broadcast_set_event(dev, next_cpu, next_event); + + raw_spin_unlock(&tick_broadcast_lock); + + if (bc_local) { + td = this_cpu_ptr(&tick_cpu_device); + td->evtdev->event_handler(td->evtdev); + } +} + +static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) +{ + if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return 0; + if (bc->next_event == KTIME_MAX) + return 0; + return bc->bound_on == cpu ? -EBUSY : 0; +} + +static void broadcast_shutdown_local(struct clock_event_device *bc, + struct clock_event_device *dev) +{ + /* + * For hrtimer based broadcasting we cannot shutdown the cpu + * local device if our own event is the first one to expire or + * if we own the broadcast timer. + */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) { + if (broadcast_needs_cpu(bc, smp_processor_id())) + return; + if (dev->next_event < bc->next_event) + return; + } + clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); +} + +static int ___tick_broadcast_oneshot_control(enum tick_broadcast_state state, + struct tick_device *td, + int cpu) +{ + struct clock_event_device *bc, *dev = td->evtdev; + int ret = 0; + ktime_t now; + + raw_spin_lock(&tick_broadcast_lock); + bc = tick_broadcast_device.evtdev; + + if (state == TICK_BROADCAST_ENTER) { + /* + * If the current CPU owns the hrtimer broadcast + * mechanism, it cannot go deep idle and we do not add + * the CPU to the broadcast mask. We don't have to go + * through the EXIT path as the local timer is not + * shutdown. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) + goto out; + + /* + * If the broadcast device is in periodic mode, we + * return. + */ + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { + /* If it is a hrtimer based broadcast, return busy */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) + ret = -EBUSY; + goto out; + } + + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { + WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); + + /* Conditionally shut down the local timer. */ + broadcast_shutdown_local(bc, dev); + + /* + * We only reprogram the broadcast timer if we + * did not mark ourself in the force mask and + * if the cpu local event is earlier than the + * broadcast event. If the current CPU is in + * the force mask, then we are going to be + * woken by the IPI right away; we return + * busy, so the CPU does not try to go deep + * idle. + */ + if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) { + ret = -EBUSY; + } else if (dev->next_event < bc->next_event) { + tick_broadcast_set_event(bc, cpu, dev->next_event); + /* + * In case of hrtimer broadcasts the + * programming might have moved the + * timer to this cpu. If yes, remove + * us from the broadcast mask and + * return busy. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) { + cpumask_clear_cpu(cpu, + tick_broadcast_oneshot_mask); + } + } + } + } else { + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); + /* + * The cpu which was handling the broadcast + * timer marked this cpu in the broadcast + * pending mask and fired the broadcast + * IPI. So we are going to handle the expired + * event anyway via the broadcast IPI + * handler. No need to reprogram the timer + * with an already expired event. + */ + if (cpumask_test_and_clear_cpu(cpu, + tick_broadcast_pending_mask)) + goto out; + + /* + * Bail out if there is no next event. + */ + if (dev->next_event == KTIME_MAX) + goto out; + /* + * If the pending bit is not set, then we are + * either the CPU handling the broadcast + * interrupt or we got woken by something else. + * + * We are no longer in the broadcast mask, so + * if the cpu local expiry time is already + * reached, we would reprogram the cpu local + * timer with an already expired event. + * + * This can lead to a ping-pong when we return + * to idle and therefore rearm the broadcast + * timer before the cpu local timer was able + * to fire. This happens because the forced + * reprogramming makes sure that the event + * will happen in the future and depending on + * the min_delta setting this might be far + * enough out that the ping-pong starts. + * + * If the cpu local next_event has expired + * then we know that the broadcast timer + * next_event has expired as well and + * broadcast is about to be handled. So we + * avoid reprogramming and enforce that the + * broadcast handler, which did not run yet, + * will invoke the cpu local handler. + * + * We cannot call the handler directly from + * here, because we might be in a NOHZ phase + * and we did not go through the irq_enter() + * nohz fixups. + */ + now = ktime_get(); + if (dev->next_event <= now) { + cpumask_set_cpu(cpu, tick_broadcast_force_mask); + goto out; + } + /* + * We got woken by something else. Reprogram + * the cpu local timer device. + */ + tick_program_event(dev->next_event, 1); + } + } +out: + raw_spin_unlock(&tick_broadcast_lock); + return ret; +} + +static int tick_oneshot_wakeup_control(enum tick_broadcast_state state, + struct tick_device *td, + int cpu) +{ + struct clock_event_device *dev, *wd; + + dev = td->evtdev; + if (td->mode != TICKDEV_MODE_ONESHOT) + return -EINVAL; + + wd = tick_get_oneshot_wakeup_device(cpu); + if (!wd) + return -ENODEV; + + switch (state) { + case TICK_BROADCAST_ENTER: + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); + clockevents_switch_state(wd, CLOCK_EVT_STATE_ONESHOT); + clockevents_program_event(wd, dev->next_event, 1); + break; + case TICK_BROADCAST_EXIT: + /* We may have transitioned to oneshot mode while idle */ + if (clockevent_get_state(wd) != CLOCK_EVT_STATE_ONESHOT) + return -ENODEV; + } + + return 0; +} + +int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + int cpu = smp_processor_id(); + + if (!tick_oneshot_wakeup_control(state, td, cpu)) + return 0; + + if (tick_broadcast_device.evtdev) + return ___tick_broadcast_oneshot_control(state, td, cpu); + + /* + * If there is no broadcast or wakeup device, tell the caller not + * to go into deep idle. + */ + return -EBUSY; +} + +/* + * Reset the one shot broadcast for a cpu + * + * Called with tick_broadcast_lock held + */ +static void tick_broadcast_clear_oneshot(int cpu) +{ + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); +} + +static void tick_broadcast_init_next_event(struct cpumask *mask, + ktime_t expires) +{ + struct tick_device *td; + int cpu; + + for_each_cpu(cpu, mask) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev) + td->evtdev->next_event = expires; + } +} + +static inline ktime_t tick_get_next_period(void) +{ + ktime_t next; + + /* + * Protect against concurrent updates (store /load tearing on + * 32bit). It does not matter if the time is already in the + * past. The broadcast device which is about to be programmed will + * fire in any case. + */ + raw_spin_lock(&jiffies_lock); + next = tick_next_period; + raw_spin_unlock(&jiffies_lock); + return next; +} + +/** + * tick_broadcast_setup_oneshot - setup the broadcast device + */ +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, + bool from_periodic) +{ + int cpu = smp_processor_id(); + ktime_t nexttick = 0; + + if (!bc) + return; + + /* + * When the broadcast device was switched to oneshot by the first + * CPU handling the NOHZ change, the other CPUs will reach this + * code via hrtimer_run_queues() -> tick_check_oneshot_change() + * too. Set up the broadcast device only once! + */ + if (bc->event_handler == tick_handle_oneshot_broadcast) { + /* + * The CPU which switched from periodic to oneshot mode + * set the broadcast oneshot bit for all other CPUs which + * are in the general (periodic) broadcast mask to ensure + * that CPUs which wait for the periodic broadcast are + * woken up. + * + * Clear the bit for the local CPU as the set bit would + * prevent the first tick_broadcast_enter() after this CPU + * switched to oneshot state to program the broadcast + * device. + * + * This code can also be reached via tick_broadcast_control(), + * but this cannot avoid the tick_broadcast_clear_oneshot() + * as that would break the periodic to oneshot transition of + * secondary CPUs. But that's harmless as the below only + * clears already cleared bits. + */ + tick_broadcast_clear_oneshot(cpu); + return; + } + + + bc->event_handler = tick_handle_oneshot_broadcast; + bc->next_event = KTIME_MAX; + + /* + * When the tick mode is switched from periodic to oneshot it must + * be ensured that CPUs which are waiting for periodic broadcast + * get their wake-up at the next tick. This is achieved by ORing + * tick_broadcast_mask into tick_broadcast_oneshot_mask. + * + * For other callers, e.g. broadcast device replacement, + * tick_broadcast_oneshot_mask must not be touched as this would + * set bits for CPUs which are already NOHZ, but not idle. Their + * next tick_broadcast_enter() would observe the bit set and fail + * to update the expiry time and the broadcast event device. + */ + if (from_periodic) { + cpumask_copy(tmpmask, tick_broadcast_mask); + /* Remove the local CPU as it is obviously not idle */ + cpumask_clear_cpu(cpu, tmpmask); + cpumask_or(tick_broadcast_oneshot_mask, tick_broadcast_oneshot_mask, tmpmask); + + /* + * Ensure that the oneshot broadcast handler will wake the + * CPUs which are still waiting for periodic broadcast. + */ + nexttick = tick_get_next_period(); + tick_broadcast_init_next_event(tmpmask, nexttick); + + /* + * If the underlying broadcast clock event device is + * already in oneshot state, then there is nothing to do. + * The device was already armed for the next tick + * in tick_handle_broadcast_periodic() + */ + if (clockevent_state_oneshot(bc)) + return; + } + + /* + * When switching from periodic to oneshot mode arm the broadcast + * device for the next tick. + * + * If the broadcast device has been replaced in oneshot mode and + * the oneshot broadcast mask is not empty, then arm it to expire + * immediately in order to reevaluate the next expiring timer. + * @nexttick is 0 and therefore in the past which will cause the + * clockevent code to force an event. + * + * For both cases the programming can be avoided when the oneshot + * broadcast mask is empty. + * + * tick_broadcast_set_event() implicitly switches the broadcast + * device to oneshot state. + */ + if (!cpumask_empty(tick_broadcast_oneshot_mask)) + tick_broadcast_set_event(bc, cpu, nexttick); +} + +/* + * Select oneshot operating mode for the broadcast device + */ +void tick_broadcast_switch_to_oneshot(void) +{ + struct clock_event_device *bc; + enum tick_device_mode oldmode; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + oldmode = tick_broadcast_device.mode; + tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; + bc = tick_broadcast_device.evtdev; + if (bc) + tick_broadcast_setup_oneshot(bc, oldmode == TICKDEV_MODE_PERIODIC); + + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +#ifdef CONFIG_HOTPLUG_CPU +void hotplug_cpu__broadcast_tick_pull(int deadcpu) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + bc = tick_broadcast_device.evtdev; + + if (bc && broadcast_needs_cpu(bc, deadcpu)) { + /* This moves the broadcast assignment to this CPU: */ + clockevents_program_event(bc, bc->next_event, 1); + } + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/* + * Remove a dying CPU from broadcasting + */ +static void tick_broadcast_oneshot_offline(unsigned int cpu) +{ + if (tick_get_oneshot_wakeup_device(cpu)) + tick_set_oneshot_wakeup_device(NULL, cpu); + + /* + * Clear the broadcast masks for the dead cpu, but do not stop + * the broadcast device! + */ + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); + cpumask_clear_cpu(cpu, tick_broadcast_force_mask); +} +#endif + +/* + * Check, whether the broadcast device is in one shot mode + */ +int tick_broadcast_oneshot_active(void) +{ + return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; +} + +/* + * Check whether the broadcast device supports oneshot. + */ +bool tick_broadcast_oneshot_available(void) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; +} + +#else +int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return -EBUSY; + + return 0; +} +#endif + +void __init tick_broadcast_init(void) +{ + zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT); + zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); +#ifdef CONFIG_TICK_ONESHOT + zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); +#endif +} diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c new file mode 100644 index 0000000000..e9138cd7a0 --- /dev/null +++ b/kernel/time/tick-common.c @@ -0,0 +1,578 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains the base functions to manage periodic tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/nmi.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <trace/events/power.h> + +#include <asm/irq_regs.h> + +#include "tick-internal.h" + +/* + * Tick devices + */ +DEFINE_PER_CPU(struct tick_device, tick_cpu_device); +/* + * Tick next event: keeps track of the tick time. It's updated by the + * CPU which handles the tick and protected by jiffies_lock. There is + * no requirement to write hold the jiffies seqcount for it. + */ +ktime_t tick_next_period; + +/* + * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR + * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This + * variable has two functions: + * + * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the + * timekeeping lock all at once. Only the CPU which is assigned to do the + * update is handling it. + * + * 2) Hand off the duty in the NOHZ idle case by setting the value to + * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks + * at it will take over and keep the time keeping alive. The handover + * procedure also covers cpu hotplug. + */ +int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; +#ifdef CONFIG_NO_HZ_FULL +/* + * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns + * tick_do_timer_cpu and it should be taken over by an eligible secondary + * when one comes online. + */ +static int tick_do_timer_boot_cpu __read_mostly = -1; +#endif + +/* + * Debugging: see timer_list.c + */ +struct tick_device *tick_get_device(int cpu) +{ + return &per_cpu(tick_cpu_device, cpu); +} + +/** + * tick_is_oneshot_available - check for a oneshot capable event device + */ +int tick_is_oneshot_available(void) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return 0; + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + return 1; + return tick_broadcast_oneshot_available(); +} + +/* + * Periodic tick + */ +static void tick_periodic(int cpu) +{ + if (tick_do_timer_cpu == cpu) { + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); + + /* Keep track of the next tick event */ + tick_next_period = ktime_add_ns(tick_next_period, TICK_NSEC); + + do_timer(1); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); + update_wall_time(); + } + + update_process_times(user_mode(get_irq_regs())); + profile_tick(CPU_PROFILING); +} + +/* + * Event handler for periodic ticks + */ +void tick_handle_periodic(struct clock_event_device *dev) +{ + int cpu = smp_processor_id(); + ktime_t next = dev->next_event; + + tick_periodic(cpu); + +#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON) + /* + * The cpu might have transitioned to HIGHRES or NOHZ mode via + * update_process_times() -> run_local_timers() -> + * hrtimer_run_queues(). + */ + if (dev->event_handler != tick_handle_periodic) + return; +#endif + + if (!clockevent_state_oneshot(dev)) + return; + for (;;) { + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + next = ktime_add_ns(next, TICK_NSEC); + + if (!clockevents_program_event(dev, next, false)) + return; + /* + * Have to be careful here. If we're in oneshot mode, + * before we call tick_periodic() in a loop, we need + * to be sure we're using a real hardware clocksource. + * Otherwise we could get trapped in an infinite + * loop, as the tick_periodic() increments jiffies, + * which then will increment time, possibly causing + * the loop to trigger again and again. + */ + if (timekeeping_valid_for_hres()) + tick_periodic(cpu); + } +} + +/* + * Setup the device for a periodic tick + */ +void tick_setup_periodic(struct clock_event_device *dev, int broadcast) +{ + tick_set_periodic_handler(dev, broadcast); + + /* Broadcast setup ? */ + if (!tick_device_is_functional(dev)) + return; + + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !tick_broadcast_oneshot_active()) { + clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); + } else { + unsigned int seq; + ktime_t next; + + do { + seq = read_seqcount_begin(&jiffies_seq); + next = tick_next_period; + } while (read_seqcount_retry(&jiffies_seq, seq)); + + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); + + for (;;) { + if (!clockevents_program_event(dev, next, false)) + return; + next = ktime_add_ns(next, TICK_NSEC); + } + } +} + +#ifdef CONFIG_NO_HZ_FULL +static void giveup_do_timer(void *info) +{ + int cpu = *(unsigned int *)info; + + WARN_ON(tick_do_timer_cpu != smp_processor_id()); + + tick_do_timer_cpu = cpu; +} + +static void tick_take_do_timer_from_boot(void) +{ + int cpu = smp_processor_id(); + int from = tick_do_timer_boot_cpu; + + if (from >= 0 && from != cpu) + smp_call_function_single(from, giveup_do_timer, &cpu, 1); +} +#endif + +/* + * Setup the tick device + */ +static void tick_setup_device(struct tick_device *td, + struct clock_event_device *newdev, int cpu, + const struct cpumask *cpumask) +{ + void (*handler)(struct clock_event_device *) = NULL; + ktime_t next_event = 0; + + /* + * First device setup ? + */ + if (!td->evtdev) { + /* + * If no cpu took the do_timer update, assign it to + * this cpu: + */ + if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { + tick_do_timer_cpu = cpu; + tick_next_period = ktime_get(); +#ifdef CONFIG_NO_HZ_FULL + /* + * The boot CPU may be nohz_full, in which case set + * tick_do_timer_boot_cpu so the first housekeeping + * secondary that comes up will take do_timer from + * us. + */ + if (tick_nohz_full_cpu(cpu)) + tick_do_timer_boot_cpu = cpu; + + } else if (tick_do_timer_boot_cpu != -1 && + !tick_nohz_full_cpu(cpu)) { + tick_take_do_timer_from_boot(); + tick_do_timer_boot_cpu = -1; + WARN_ON(tick_do_timer_cpu != cpu); +#endif + } + + /* + * Startup in periodic mode first. + */ + td->mode = TICKDEV_MODE_PERIODIC; + } else { + handler = td->evtdev->event_handler; + next_event = td->evtdev->next_event; + td->evtdev->event_handler = clockevents_handle_noop; + } + + td->evtdev = newdev; + + /* + * When the device is not per cpu, pin the interrupt to the + * current cpu: + */ + if (!cpumask_equal(newdev->cpumask, cpumask)) + irq_set_affinity(newdev->irq, cpumask); + + /* + * When global broadcasting is active, check if the current + * device is registered as a placeholder for broadcast mode. + * This allows us to handle this x86 misfeature in a generic + * way. This function also returns !=0 when we keep the + * current active broadcast state for this CPU. + */ + if (tick_device_uses_broadcast(newdev, cpu)) + return; + + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(newdev, 0); + else + tick_setup_oneshot(newdev, handler, next_event); +} + +void tick_install_replacement(struct clock_event_device *newdev) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + int cpu = smp_processor_id(); + + clockevents_exchange_device(td->evtdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); +} + +static bool tick_check_percpu(struct clock_event_device *curdev, + struct clock_event_device *newdev, int cpu) +{ + if (!cpumask_test_cpu(cpu, newdev->cpumask)) + return false; + if (cpumask_equal(newdev->cpumask, cpumask_of(cpu))) + return true; + /* Check if irq affinity can be set */ + if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq)) + return false; + /* Prefer an existing cpu local device */ + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) + return false; + return true; +} + +static bool tick_check_preferred(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + /* Prefer oneshot capable device */ + if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) { + if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + if (tick_oneshot_mode_active()) + return false; + } + + /* + * Use the higher rated one, but prefer a CPU local device with a lower + * rating than a non-CPU local device + */ + return !curdev || + newdev->rating > curdev->rating || + !cpumask_equal(curdev->cpumask, newdev->cpumask); +} + +/* + * Check whether the new device is a better fit than curdev. curdev + * can be NULL ! + */ +bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if (!tick_check_percpu(curdev, newdev, smp_processor_id())) + return false; + + return tick_check_preferred(curdev, newdev); +} + +/* + * Check, if the new registered device should be used. Called with + * clockevents_lock held and interrupts disabled. + */ +void tick_check_new_device(struct clock_event_device *newdev) +{ + struct clock_event_device *curdev; + struct tick_device *td; + int cpu; + + cpu = smp_processor_id(); + td = &per_cpu(tick_cpu_device, cpu); + curdev = td->evtdev; + + if (!tick_check_replacement(curdev, newdev)) + goto out_bc; + + if (!try_module_get(newdev->owner)) + return; + + /* + * Replace the eventually existing device by the new + * device. If the current device is the broadcast device, do + * not give it back to the clockevents layer ! + */ + if (tick_is_broadcast_device(curdev)) { + clockevents_shutdown(curdev); + curdev = NULL; + } + clockevents_exchange_device(curdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); + return; + +out_bc: + /* + * Can the new device be used as a broadcast device ? + */ + tick_install_broadcast_device(newdev, cpu); +} + +/** + * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode + * @state: The target state (enter/exit) + * + * The system enters/leaves a state, where affected devices might stop + * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. + * + * Called with interrupts disabled, so clockevents_lock is not + * required here because the local clock event device cannot go away + * under us. + */ +int tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + + if (!(td->evtdev->features & CLOCK_EVT_FEAT_C3STOP)) + return 0; + + return __tick_broadcast_oneshot_control(state); +} +EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Transfer the do_timer job away from a dying cpu. + * + * Called with interrupts disabled. No locking required. If + * tick_do_timer_cpu is owned by this cpu, nothing can change it. + */ +void tick_handover_do_timer(void) +{ + if (tick_do_timer_cpu == smp_processor_id()) + tick_do_timer_cpu = cpumask_first(cpu_online_mask); +} + +/* + * Shutdown an event device on a given cpu: + * + * This is called on a life CPU, when a CPU is dead. So we cannot + * access the hardware device itself. + * We just set the mode and remove it from the lists. + */ +void tick_shutdown(unsigned int cpu) +{ + struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + struct clock_event_device *dev = td->evtdev; + + td->mode = TICKDEV_MODE_PERIODIC; + if (dev) { + /* + * Prevent that the clock events layer tries to call + * the set mode function! + */ + clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); + clockevents_exchange_device(dev, NULL); + dev->event_handler = clockevents_handle_noop; + td->evtdev = NULL; + } +} +#endif + +/** + * tick_suspend_local - Suspend the local tick device + * + * Called from the local cpu for freeze with interrupts disabled. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_suspend_local(void) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + + clockevents_shutdown(td->evtdev); +} + +/** + * tick_resume_local - Resume the local tick device + * + * Called from the local CPU for unfreeze or XEN resume magic. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_resume_local(void) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + bool broadcast = tick_resume_check_broadcast(); + + clockevents_tick_resume(td->evtdev); + if (!broadcast) { + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(td->evtdev, 0); + else + tick_resume_oneshot(); + } + + /* + * Ensure that hrtimers are up to date and the clockevents device + * is reprogrammed correctly when high resolution timers are + * enabled. + */ + hrtimers_resume_local(); +} + +/** + * tick_suspend - Suspend the tick and the broadcast device + * + * Called from syscore_suspend() via timekeeping_suspend with only one + * CPU online and interrupts disabled or from tick_unfreeze() under + * tick_freeze_lock. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_suspend(void) +{ + tick_suspend_local(); + tick_suspend_broadcast(); +} + +/** + * tick_resume - Resume the tick and the broadcast device + * + * Called from syscore_resume() via timekeeping_resume with only one + * CPU online and interrupts disabled. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_resume(void) +{ + tick_resume_broadcast(); + tick_resume_local(); +} + +#ifdef CONFIG_SUSPEND +static DEFINE_RAW_SPINLOCK(tick_freeze_lock); +static unsigned int tick_freeze_depth; + +/** + * tick_freeze - Suspend the local tick and (possibly) timekeeping. + * + * Check if this is the last online CPU executing the function and if so, + * suspend timekeeping. Otherwise suspend the local tick. + * + * Call with interrupts disabled. Must be balanced with %tick_unfreeze(). + * Interrupts must not be enabled before the subsequent %tick_unfreeze(). + */ +void tick_freeze(void) +{ + raw_spin_lock(&tick_freeze_lock); + + tick_freeze_depth++; + if (tick_freeze_depth == num_online_cpus()) { + trace_suspend_resume(TPS("timekeeping_freeze"), + smp_processor_id(), true); + system_state = SYSTEM_SUSPEND; + sched_clock_suspend(); + timekeeping_suspend(); + } else { + tick_suspend_local(); + } + + raw_spin_unlock(&tick_freeze_lock); +} + +/** + * tick_unfreeze - Resume the local tick and (possibly) timekeeping. + * + * Check if this is the first CPU executing the function and if so, resume + * timekeeping. Otherwise resume the local tick. + * + * Call with interrupts disabled. Must be balanced with %tick_freeze(). + * Interrupts must not be enabled after the preceding %tick_freeze(). + */ +void tick_unfreeze(void) +{ + raw_spin_lock(&tick_freeze_lock); + + if (tick_freeze_depth == num_online_cpus()) { + timekeeping_resume(); + sched_clock_resume(); + system_state = SYSTEM_RUNNING; + trace_suspend_resume(TPS("timekeeping_freeze"), + smp_processor_id(), false); + } else { + touch_softlockup_watchdog(); + tick_resume_local(); + } + + tick_freeze_depth--; + + raw_spin_unlock(&tick_freeze_lock); +} +#endif /* CONFIG_SUSPEND */ + +/** + * tick_init - initialize the tick control + */ +void __init tick_init(void) +{ + tick_broadcast_init(); + tick_nohz_init(); +} diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h new file mode 100644 index 0000000000..649f2b48e8 --- /dev/null +++ b/kernel/time/tick-internal.h @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * tick internal variable and functions used by low/high res code + */ +#include <linux/hrtimer.h> +#include <linux/tick.h> + +#include "timekeeping.h" +#include "tick-sched.h" + +#ifdef CONFIG_GENERIC_CLOCKEVENTS + +# define TICK_DO_TIMER_NONE -1 +# define TICK_DO_TIMER_BOOT -2 + +DECLARE_PER_CPU(struct tick_device, tick_cpu_device); +extern ktime_t tick_next_period; +extern int tick_do_timer_cpu __read_mostly; + +extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); +extern void tick_handle_periodic(struct clock_event_device *dev); +extern void tick_check_new_device(struct clock_event_device *dev); +extern void tick_shutdown(unsigned int cpu); +extern void tick_suspend(void); +extern void tick_resume(void); +extern bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev); +extern void tick_install_replacement(struct clock_event_device *dev); +extern int tick_is_oneshot_available(void); +extern struct tick_device *tick_get_device(int cpu); + +extern int clockevents_tick_resume(struct clock_event_device *dev); +/* Check, if the device is functional or a dummy for broadcast */ +static inline int tick_device_is_functional(struct clock_event_device *dev) +{ + return !(dev->features & CLOCK_EVT_FEAT_DUMMY); +} + +static inline enum clock_event_state clockevent_get_state(struct clock_event_device *dev) +{ + return dev->state_use_accessors; +} + +static inline void clockevent_set_state(struct clock_event_device *dev, + enum clock_event_state state) +{ + dev->state_use_accessors = state; +} + +extern void clockevents_shutdown(struct clock_event_device *dev); +extern void clockevents_exchange_device(struct clock_event_device *old, + struct clock_event_device *new); +extern void clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state); +extern int clockevents_program_event(struct clock_event_device *dev, + ktime_t expires, bool force); +extern void clockevents_handle_noop(struct clock_event_device *dev); +extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); +extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); + +/* Broadcasting support */ +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); +extern void tick_install_broadcast_device(struct clock_event_device *dev, int cpu); +extern int tick_is_broadcast_device(struct clock_event_device *dev); +extern void tick_suspend_broadcast(void); +extern void tick_resume_broadcast(void); +extern bool tick_resume_check_broadcast(void); +extern void tick_broadcast_init(void); +extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); +extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); +extern struct tick_device *tick_get_broadcast_device(void); +extern struct cpumask *tick_get_broadcast_mask(void); +extern const struct clock_event_device *tick_get_wakeup_device(int cpu); +# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */ +static inline void tick_install_broadcast_device(struct clock_event_device *dev, int cpu) { } +static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } +static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } +static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } +static inline void tick_suspend_broadcast(void) { } +static inline void tick_resume_broadcast(void) { } +static inline bool tick_resume_check_broadcast(void) { return false; } +static inline void tick_broadcast_init(void) { } +static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; } + +/* Set the periodic handler in non broadcast mode */ +static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) +{ + dev->event_handler = tick_handle_periodic; +} +# endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */ + +#else /* !GENERIC_CLOCKEVENTS: */ +static inline void tick_suspend(void) { } +static inline void tick_resume(void) { } +#endif /* !GENERIC_CLOCKEVENTS */ + +/* Oneshot related functions */ +#ifdef CONFIG_TICK_ONESHOT +extern void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt); +extern int tick_program_event(ktime_t expires, int force); +extern void tick_oneshot_notify(void); +extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); +extern void tick_resume_oneshot(void); +static inline bool tick_oneshot_possible(void) { return true; } +extern int tick_oneshot_mode_active(void); +extern void tick_clock_notify(void); +extern int tick_check_oneshot_change(int allow_nohz); +extern int tick_init_highres(void); +#else /* !CONFIG_TICK_ONESHOT: */ +static inline +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt) { BUG(); } +static inline void tick_resume_oneshot(void) { BUG(); } +static inline int tick_program_event(ktime_t expires, int force) { return 0; } +static inline void tick_oneshot_notify(void) { } +static inline bool tick_oneshot_possible(void) { return false; } +static inline int tick_oneshot_mode_active(void) { return 0; } +static inline void tick_clock_notify(void) { } +static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +#endif /* !CONFIG_TICK_ONESHOT */ + +/* Functions related to oneshot broadcasting */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) +extern void tick_broadcast_switch_to_oneshot(void); +extern int tick_broadcast_oneshot_active(void); +extern void tick_check_oneshot_broadcast_this_cpu(void); +bool tick_broadcast_oneshot_available(void); +extern struct cpumask *tick_get_broadcast_oneshot_mask(void); +#else /* !(BROADCAST && ONESHOT): */ +static inline void tick_broadcast_switch_to_oneshot(void) { } +static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline void tick_check_oneshot_broadcast_this_cpu(void) { } +static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } +#endif /* !(BROADCAST && ONESHOT) */ + +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) +extern void tick_broadcast_offline(unsigned int cpu); +#else +static inline void tick_broadcast_offline(unsigned int cpu) { } +#endif + +/* NO_HZ_FULL internal */ +#ifdef CONFIG_NO_HZ_FULL +extern void tick_nohz_init(void); +# else +static inline void tick_nohz_init(void) { } +#endif + +#ifdef CONFIG_NO_HZ_COMMON +extern unsigned long tick_nohz_active; +extern void timers_update_nohz(void); +# ifdef CONFIG_SMP +extern struct static_key_false timers_migration_enabled; +# endif +#else /* CONFIG_NO_HZ_COMMON */ +static inline void timers_update_nohz(void) { } +#define tick_nohz_active (0) +#endif + +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + +extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); +void timer_clear_idle(void); + +#define CLOCK_SET_WALL \ + (BIT(HRTIMER_BASE_REALTIME) | BIT(HRTIMER_BASE_REALTIME_SOFT) | \ + BIT(HRTIMER_BASE_TAI) | BIT(HRTIMER_BASE_TAI_SOFT)) + +#define CLOCK_SET_BOOT \ + (BIT(HRTIMER_BASE_BOOTTIME) | BIT(HRTIMER_BASE_BOOTTIME_SOFT)) + +void clock_was_set(unsigned int bases); +void clock_was_set_delayed(void); + +void hrtimers_resume_local(void); + +/* Since jiffies uses a simple TICK_NSEC multiplier + * conversion, the .shift value could be zero. However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the nominator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. TICK_NSEC grows as HZ + * shrinks, so values greater than 8 overflow 32bits when + * HZ=100. + */ +#if HZ < 34 +#define JIFFIES_SHIFT 6 +#elif HZ < 67 +#define JIFFIES_SHIFT 7 +#else +#define JIFFIES_SHIFT 8 +#endif diff --git a/kernel/time/tick-legacy.c b/kernel/time/tick-legacy.c new file mode 100644 index 0000000000..af225b32f5 --- /dev/null +++ b/kernel/time/tick-legacy.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Timer tick function for architectures that lack generic clockevents, + * consolidated here from m68k/ia64/parisc/arm. + */ + +#include <linux/irq.h> +#include <linux/profile.h> +#include <linux/timekeeper_internal.h> + +#include "tick-internal.h" + +/** + * legacy_timer_tick() - advances the timekeeping infrastructure + * @ticks: number of ticks, that have elapsed since the last call. + * + * This is used by platforms that have not been converted to + * generic clockevents. + * + * If 'ticks' is zero, the CPU is not handling timekeeping, so + * only perform process accounting and profiling. + * + * Must be called with interrupts disabled. + */ +void legacy_timer_tick(unsigned long ticks) +{ + if (ticks) { + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); + do_timer(ticks); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); + update_wall_time(); + } + update_process_times(user_mode(get_irq_regs())); + profile_tick(CPU_PROFILING); +} diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c new file mode 100644 index 0000000000..5e2c2c26b3 --- /dev/null +++ b/kernel/time/tick-oneshot.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains functions which manage high resolution tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> + +#include "tick-internal.h" + +/** + * tick_program_event - program the CPU local timer device for the next event + */ +int tick_program_event(ktime_t expires, int force) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + + if (unlikely(expires == KTIME_MAX)) { + /* + * We don't need the clock event device any more, stop it. + */ + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); + dev->next_event = KTIME_MAX; + return 0; + } + + if (unlikely(clockevent_state_oneshot_stopped(dev))) { + /* + * We need the clock event again, configure it in ONESHOT mode + * before using it. + */ + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); + } + + return clockevents_program_event(dev, expires, force); +} + +/** + * tick_resume_oneshot - resume oneshot mode + */ +void tick_resume_oneshot(void) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_program_event(dev, ktime_get(), true); +} + +/** + * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) + */ +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t next_event) +{ + newdev->event_handler = handler; + clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT); + clockevents_program_event(newdev, next_event, true); +} + +/** + * tick_switch_to_oneshot - switch to oneshot mode + */ +int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + struct clock_event_device *dev = td->evtdev; + + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || + !tick_device_is_functional(dev)) { + + pr_info("Clockevents: could not switch to one-shot mode:"); + if (!dev) { + pr_cont(" no tick device\n"); + } else { + if (!tick_device_is_functional(dev)) + pr_cont(" %s is not functional.\n", dev->name); + else + pr_cont(" %s does not support one-shot mode.\n", + dev->name); + } + return -EINVAL; + } + + td->mode = TICKDEV_MODE_ONESHOT; + dev->event_handler = handler; + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); + tick_broadcast_switch_to_oneshot(); + return 0; +} + +/** + * tick_oneshot_mode_active - check whether the system is in oneshot mode + * + * returns 1 when either nohz or highres are enabled. otherwise 0. + */ +int tick_oneshot_mode_active(void) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT; + local_irq_restore(flags); + + return ret; +} + +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * tick_init_highres - switch to high resolution mode + * + * Called with interrupts disabled. + */ +int tick_init_highres(void) +{ + return tick_switch_to_oneshot(hrtimer_interrupt); +} +#endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c new file mode 100644 index 0000000000..55cbc49f70 --- /dev/null +++ b/kernel/time/tick-sched.c @@ -0,0 +1,1617 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner + * + * No idle tick implementation for low and high resolution timers + * + * Started by: Thomas Gleixner and Ingo Molnar + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/nmi.h> +#include <linux/profile.h> +#include <linux/sched/signal.h> +#include <linux/sched/clock.h> +#include <linux/sched/stat.h> +#include <linux/sched/nohz.h> +#include <linux/sched/loadavg.h> +#include <linux/module.h> +#include <linux/irq_work.h> +#include <linux/posix-timers.h> +#include <linux/context_tracking.h> +#include <linux/mm.h> + +#include <asm/irq_regs.h> + +#include "tick-internal.h" + +#include <trace/events/timer.h> + +/* + * Per-CPU nohz control structure + */ +static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); + +struct tick_sched *tick_get_tick_sched(int cpu) +{ + return &per_cpu(tick_cpu_sched, cpu); +} + +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +/* + * The time, when the last jiffy update happened. Write access must hold + * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a + * consistent view of jiffies and last_jiffies_update. + */ +static ktime_t last_jiffies_update; + +/* + * Must be called with interrupts disabled ! + */ +static void tick_do_update_jiffies64(ktime_t now) +{ + unsigned long ticks = 1; + ktime_t delta, nextp; + + /* + * 64bit can do a quick check without holding jiffies lock and + * without looking at the sequence count. The smp_load_acquire() + * pairs with the update done later in this function. + * + * 32bit cannot do that because the store of tick_next_period + * consists of two 32bit stores and the first store could move it + * to a random point in the future. + */ + if (IS_ENABLED(CONFIG_64BIT)) { + if (ktime_before(now, smp_load_acquire(&tick_next_period))) + return; + } else { + unsigned int seq; + + /* + * Avoid contention on jiffies_lock and protect the quick + * check with the sequence count. + */ + do { + seq = read_seqcount_begin(&jiffies_seq); + nextp = tick_next_period; + } while (read_seqcount_retry(&jiffies_seq, seq)); + + if (ktime_before(now, nextp)) + return; + } + + /* Quick check failed, i.e. update is required. */ + raw_spin_lock(&jiffies_lock); + /* + * Reevaluate with the lock held. Another CPU might have done the + * update already. + */ + if (ktime_before(now, tick_next_period)) { + raw_spin_unlock(&jiffies_lock); + return; + } + + write_seqcount_begin(&jiffies_seq); + + delta = ktime_sub(now, tick_next_period); + if (unlikely(delta >= TICK_NSEC)) { + /* Slow path for long idle sleep times */ + s64 incr = TICK_NSEC; + + ticks += ktime_divns(delta, incr); + + last_jiffies_update = ktime_add_ns(last_jiffies_update, + incr * ticks); + } else { + last_jiffies_update = ktime_add_ns(last_jiffies_update, + TICK_NSEC); + } + + /* Advance jiffies to complete the jiffies_seq protected job */ + jiffies_64 += ticks; + + /* + * Keep the tick_next_period variable up to date. + */ + nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); + + if (IS_ENABLED(CONFIG_64BIT)) { + /* + * Pairs with smp_load_acquire() in the lockless quick + * check above and ensures that the update to jiffies_64 is + * not reordered vs. the store to tick_next_period, neither + * by the compiler nor by the CPU. + */ + smp_store_release(&tick_next_period, nextp); + } else { + /* + * A plain store is good enough on 32bit as the quick check + * above is protected by the sequence count. + */ + tick_next_period = nextp; + } + + /* + * Release the sequence count. calc_global_load() below is not + * protected by it, but jiffies_lock needs to be held to prevent + * concurrent invocations. + */ + write_seqcount_end(&jiffies_seq); + + calc_global_load(); + + raw_spin_unlock(&jiffies_lock); + update_wall_time(); +} + +/* + * Initialize and return retrieve the jiffies update. + */ +static ktime_t tick_init_jiffy_update(void) +{ + ktime_t period; + + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); + /* Did we start the jiffies update yet ? */ + if (last_jiffies_update == 0) { + u32 rem; + + /* + * Ensure that the tick is aligned to a multiple of + * TICK_NSEC. + */ + div_u64_rem(tick_next_period, TICK_NSEC, &rem); + if (rem) + tick_next_period += TICK_NSEC - rem; + + last_jiffies_update = tick_next_period; + } + period = last_jiffies_update; + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); + return period; +} + +#define MAX_STALLED_JIFFIES 5 + +static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) +{ + int cpu = smp_processor_id(); + +#ifdef CONFIG_NO_HZ_COMMON + /* + * Check if the do_timer duty was dropped. We don't care about + * concurrency: This happens only when the CPU in charge went + * into a long sleep. If two CPUs happen to assign themselves to + * this duty, then the jiffies update is still serialized by + * jiffies_lock. + * + * If nohz_full is enabled, this should not happen because the + * tick_do_timer_cpu never relinquishes. + */ + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { +#ifdef CONFIG_NO_HZ_FULL + WARN_ON_ONCE(tick_nohz_full_running); +#endif + tick_do_timer_cpu = cpu; + } +#endif + + /* Check, if the jiffies need an update */ + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); + + /* + * If jiffies update stalled for too long (timekeeper in stop_machine() + * or VMEXIT'ed for several msecs), force an update. + */ + if (ts->last_tick_jiffies != jiffies) { + ts->stalled_jiffies = 0; + ts->last_tick_jiffies = READ_ONCE(jiffies); + } else { + if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { + tick_do_update_jiffies64(now); + ts->stalled_jiffies = 0; + ts->last_tick_jiffies = READ_ONCE(jiffies); + } + } + + if (ts->inidle) + ts->got_idle_tick = 1; +} + +static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) +{ +#ifdef CONFIG_NO_HZ_COMMON + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on complete idle SMP systems while + * waiting on the login prompt. We also increment the "start of + * idle" jiffy stamp so the idle accounting adjustment we do + * when we go busy again does not account too much ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog_sched(); + if (is_idle_task(current)) + ts->idle_jiffies++; + /* + * In case the current tick fired too early past its expected + * expiration, make sure we don't bypass the next clock reprogramming + * to the same deadline. + */ + ts->next_tick = 0; + } +#endif + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); +} +#endif + +#ifdef CONFIG_NO_HZ_FULL +cpumask_var_t tick_nohz_full_mask; +EXPORT_SYMBOL_GPL(tick_nohz_full_mask); +bool tick_nohz_full_running; +EXPORT_SYMBOL_GPL(tick_nohz_full_running); +static atomic_t tick_dep_mask; + +static bool check_tick_dependency(atomic_t *dep) +{ + int val = atomic_read(dep); + + if (val & TICK_DEP_MASK_POSIX_TIMER) { + trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); + return true; + } + + if (val & TICK_DEP_MASK_PERF_EVENTS) { + trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); + return true; + } + + if (val & TICK_DEP_MASK_SCHED) { + trace_tick_stop(0, TICK_DEP_MASK_SCHED); + return true; + } + + if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { + trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); + return true; + } + + if (val & TICK_DEP_MASK_RCU) { + trace_tick_stop(0, TICK_DEP_MASK_RCU); + return true; + } + + if (val & TICK_DEP_MASK_RCU_EXP) { + trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP); + return true; + } + + return false; +} + +static bool can_stop_full_tick(int cpu, struct tick_sched *ts) +{ + lockdep_assert_irqs_disabled(); + + if (unlikely(!cpu_online(cpu))) + return false; + + if (check_tick_dependency(&tick_dep_mask)) + return false; + + if (check_tick_dependency(&ts->tick_dep_mask)) + return false; + + if (check_tick_dependency(¤t->tick_dep_mask)) + return false; + + if (check_tick_dependency(¤t->signal->tick_dep_mask)) + return false; + + return true; +} + +static void nohz_full_kick_func(struct irq_work *work) +{ + /* Empty, the tick restart happens on tick_nohz_irq_exit() */ +} + +static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = + IRQ_WORK_INIT_HARD(nohz_full_kick_func); + +/* + * Kick this CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), + * is NMI safe. + */ +static void tick_nohz_full_kick(void) +{ + if (!tick_nohz_full_cpu(smp_processor_id())) + return; + + irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); +} + +/* + * Kick the CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick_cpu(int cpu) +{ + if (!tick_nohz_full_cpu(cpu)) + return; + + irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); +} + +static void tick_nohz_kick_task(struct task_struct *tsk) +{ + int cpu; + + /* + * If the task is not running, run_posix_cpu_timers() + * has nothing to elapse, IPI can then be spared. + * + * activate_task() STORE p->tick_dep_mask + * STORE p->on_rq + * __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or()) + * LOCK rq->lock LOAD p->on_rq + * smp_mb__after_spin_lock() + * tick_nohz_task_switch() + * LOAD p->tick_dep_mask + */ + if (!sched_task_on_rq(tsk)) + return; + + /* + * If the task concurrently migrates to another CPU, + * we guarantee it sees the new tick dependency upon + * schedule. + * + * set_task_cpu(p, cpu); + * STORE p->cpu = @cpu + * __schedule() (switch to task 'p') + * LOCK rq->lock + * smp_mb__after_spin_lock() STORE p->tick_dep_mask + * tick_nohz_task_switch() smp_mb() (atomic_fetch_or()) + * LOAD p->tick_dep_mask LOAD p->cpu + */ + cpu = task_cpu(tsk); + + preempt_disable(); + if (cpu_online(cpu)) + tick_nohz_full_kick_cpu(cpu); + preempt_enable(); +} + +/* + * Kick all full dynticks CPUs in order to force these to re-evaluate + * their dependency on the tick and restart it if necessary. + */ +static void tick_nohz_full_kick_all(void) +{ + int cpu; + + if (!tick_nohz_full_running) + return; + + preempt_disable(); + for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) + tick_nohz_full_kick_cpu(cpu); + preempt_enable(); +} + +static void tick_nohz_dep_set_all(atomic_t *dep, + enum tick_dep_bits bit) +{ + int prev; + + prev = atomic_fetch_or(BIT(bit), dep); + if (!prev) + tick_nohz_full_kick_all(); +} + +/* + * Set a global tick dependency. Used by perf events that rely on freq and + * by unstable clock. + */ +void tick_nohz_dep_set(enum tick_dep_bits bit) +{ + tick_nohz_dep_set_all(&tick_dep_mask, bit); +} + +void tick_nohz_dep_clear(enum tick_dep_bits bit) +{ + atomic_andnot(BIT(bit), &tick_dep_mask); +} + +/* + * Set per-CPU tick dependency. Used by scheduler and perf events in order to + * manage events throttling. + */ +void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) +{ + int prev; + struct tick_sched *ts; + + ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); + if (!prev) { + preempt_disable(); + /* Perf needs local kick that is NMI safe */ + if (cpu == smp_processor_id()) { + tick_nohz_full_kick(); + } else { + /* Remote irq work not NMI-safe */ + if (!WARN_ON_ONCE(in_nmi())) + tick_nohz_full_kick_cpu(cpu); + } + preempt_enable(); + } +} +EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu); + +void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) +{ + struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + atomic_andnot(BIT(bit), &ts->tick_dep_mask); +} +EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); + +/* + * Set a per-task tick dependency. RCU need this. Also posix CPU timers + * in order to elapse per task timers. + */ +void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) + tick_nohz_kick_task(tsk); +} +EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); + +void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + atomic_andnot(BIT(bit), &tsk->tick_dep_mask); +} +EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); + +/* + * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse + * per process timers. + */ +void tick_nohz_dep_set_signal(struct task_struct *tsk, + enum tick_dep_bits bit) +{ + int prev; + struct signal_struct *sig = tsk->signal; + + prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask); + if (!prev) { + struct task_struct *t; + + lockdep_assert_held(&tsk->sighand->siglock); + __for_each_thread(sig, t) + tick_nohz_kick_task(t); + } +} + +void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) +{ + atomic_andnot(BIT(bit), &sig->tick_dep_mask); +} + +/* + * Re-evaluate the need for the tick as we switch the current task. + * It might need the tick due to per task/process properties: + * perf events, posix CPU timers, ... + */ +void __tick_nohz_task_switch(void) +{ + struct tick_sched *ts; + + if (!tick_nohz_full_cpu(smp_processor_id())) + return; + + ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->tick_stopped) { + if (atomic_read(¤t->tick_dep_mask) || + atomic_read(¤t->signal->tick_dep_mask)) + tick_nohz_full_kick(); + } +} + +/* Get the boot-time nohz CPU list from the kernel parameters. */ +void __init tick_nohz_full_setup(cpumask_var_t cpumask) +{ + alloc_bootmem_cpumask_var(&tick_nohz_full_mask); + cpumask_copy(tick_nohz_full_mask, cpumask); + tick_nohz_full_running = true; +} + +bool tick_nohz_cpu_hotpluggable(unsigned int cpu) +{ + /* + * The tick_do_timer_cpu CPU handles housekeeping duty (unbound + * timers, workqueues, timekeeping, ...) on behalf of full dynticks + * CPUs. It must remain online when nohz full is enabled. + */ + if (tick_nohz_full_running && tick_do_timer_cpu == cpu) + return false; + return true; +} + +static int tick_nohz_cpu_down(unsigned int cpu) +{ + return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY; +} + +void __init tick_nohz_init(void) +{ + int cpu, ret; + + if (!tick_nohz_full_running) + return; + + /* + * Full dynticks uses irq work to drive the tick rescheduling on safe + * locking contexts. But then we need irq work to raise its own + * interrupts to avoid circular dependency on the tick + */ + if (!arch_irq_work_has_interrupt()) { + pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); + cpumask_clear(tick_nohz_full_mask); + tick_nohz_full_running = false; + return; + } + + if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && + !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { + cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { + pr_warn("NO_HZ: Clearing %d from nohz_full range " + "for timekeeping\n", cpu); + cpumask_clear_cpu(cpu, tick_nohz_full_mask); + } + } + + for_each_cpu(cpu, tick_nohz_full_mask) + ct_cpu_track_user(cpu); + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "kernel/nohz:predown", NULL, + tick_nohz_cpu_down); + WARN_ON(ret < 0); + pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", + cpumask_pr_args(tick_nohz_full_mask)); +} +#endif + +/* + * NOHZ - aka dynamic tick functionality + */ +#ifdef CONFIG_NO_HZ_COMMON +/* + * NO HZ enabled ? + */ +bool tick_nohz_enabled __read_mostly = true; +unsigned long tick_nohz_active __read_mostly; +/* + * Enable / Disable tickless mode + */ +static int __init setup_tick_nohz(char *str) +{ + return (kstrtobool(str, &tick_nohz_enabled) == 0); +} + +__setup("nohz=", setup_tick_nohz); + +bool tick_nohz_tick_stopped(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->tick_stopped; +} + +bool tick_nohz_tick_stopped_cpu(int cpu) +{ + struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + return ts->tick_stopped; +} + +/** + * tick_nohz_update_jiffies - update jiffies when idle was interrupted + * + * Called from interrupt entry when the CPU was idle + * + * In case the sched_tick was stopped on this CPU, we have to check if jiffies + * must be updated. Otherwise an interrupt handler could use a stale jiffy + * value. We do this unconditionally on any CPU, as we don't know whether the + * CPU, which has the update task assigned is in a long sleep. + */ +static void tick_nohz_update_jiffies(ktime_t now) +{ + unsigned long flags; + + __this_cpu_write(tick_cpu_sched.idle_waketime, now); + + local_irq_save(flags); + tick_do_update_jiffies64(now); + local_irq_restore(flags); + + touch_softlockup_watchdog_sched(); +} + +static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) +{ + ktime_t delta; + + if (WARN_ON_ONCE(!ts->idle_active)) + return; + + delta = ktime_sub(now, ts->idle_entrytime); + + write_seqcount_begin(&ts->idle_sleeptime_seq); + if (nr_iowait_cpu(smp_processor_id()) > 0) + ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); + else + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + + ts->idle_entrytime = now; + ts->idle_active = 0; + write_seqcount_end(&ts->idle_sleeptime_seq); + + sched_clock_idle_wakeup_event(); +} + +static void tick_nohz_start_idle(struct tick_sched *ts) +{ + write_seqcount_begin(&ts->idle_sleeptime_seq); + ts->idle_entrytime = ktime_get(); + ts->idle_active = 1; + write_seqcount_end(&ts->idle_sleeptime_seq); + + sched_clock_idle_sleep_event(); +} + +static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, + bool compute_delta, u64 *last_update_time) +{ + ktime_t now, idle; + unsigned int seq; + + if (!tick_nohz_active) + return -1; + + now = ktime_get(); + if (last_update_time) + *last_update_time = ktime_to_us(now); + + do { + seq = read_seqcount_begin(&ts->idle_sleeptime_seq); + + if (ts->idle_active && compute_delta) { + ktime_t delta = ktime_sub(now, ts->idle_entrytime); + + idle = ktime_add(*sleeptime, delta); + } else { + idle = *sleeptime; + } + } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq)); + + return ktime_to_us(idle); + +} + +/** + * get_cpu_idle_time_us - get the total idle time of a CPU + * @cpu: CPU number to query + * @last_update_time: variable to store update time in. Do not update + * counters if NULL. + * + * Return the cumulative idle time (since boot) for a given + * CPU, in microseconds. Note this is partially broken due to + * the counter of iowait tasks that can be remotely updated without + * any synchronization. Therefore it is possible to observe backward + * values within two consecutive reads. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. + */ +u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, + !nr_iowait_cpu(cpu), last_update_time); +} +EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); + +/** + * get_cpu_iowait_time_us - get the total iowait time of a CPU + * @cpu: CPU number to query + * @last_update_time: variable to store update time in. Do not update + * counters if NULL. + * + * Return the cumulative iowait time (since boot) for a given + * CPU, in microseconds. Note this is partially broken due to + * the counter of iowait tasks that can be remotely updated without + * any synchronization. Therefore it is possible to observe backward + * values within two consecutive reads. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. + */ +u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime, + nr_iowait_cpu(cpu), last_update_time); +} +EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); + +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ + hrtimer_cancel(&ts->sched_timer); + hrtimer_set_expires(&ts->sched_timer, ts->last_tick); + + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start_expires(&ts->sched_timer, + HRTIMER_MODE_ABS_PINNED_HARD); + } else { + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + } + + /* + * Reset to make sure next tick stop doesn't get fooled by past + * cached clock deadline. + */ + ts->next_tick = 0; +} + +static inline bool local_timer_softirq_pending(void) +{ + return local_softirq_pending() & BIT(TIMER_SOFTIRQ); +} + +static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) +{ + u64 basemono, next_tick, delta, expires; + unsigned long basejiff; + unsigned int seq; + + /* Read jiffies and the time when jiffies were updated last */ + do { + seq = read_seqcount_begin(&jiffies_seq); + basemono = last_jiffies_update; + basejiff = jiffies; + } while (read_seqcount_retry(&jiffies_seq, seq)); + ts->last_jiffies = basejiff; + ts->timer_expires_base = basemono; + + /* + * Keep the periodic tick, when RCU, architecture or irq_work + * requests it. + * Aside of that check whether the local timer softirq is + * pending. If so its a bad idea to call get_next_timer_interrupt() + * because there is an already expired timer, so it will request + * immediate expiry, which rearms the hardware timer with a + * minimal delta which brings us back to this place + * immediately. Lather, rinse and repeat... + */ + if (rcu_needs_cpu() || arch_needs_cpu() || + irq_work_needs_cpu() || local_timer_softirq_pending()) { + next_tick = basemono + TICK_NSEC; + } else { + /* + * Get the next pending timer. If high resolution + * timers are enabled this only takes the timer wheel + * timers into account. If high resolution timers are + * disabled this also looks at the next expiring + * hrtimer. + */ + next_tick = get_next_timer_interrupt(basejiff, basemono); + ts->next_timer = next_tick; + } + + /* + * If the tick is due in the next period, keep it ticking or + * force prod the timer. + */ + delta = next_tick - basemono; + if (delta <= (u64)TICK_NSEC) { + /* + * Tell the timer code that the base is not idle, i.e. undo + * the effect of get_next_timer_interrupt(): + */ + timer_clear_idle(); + /* + * We've not stopped the tick yet, and there's a timer in the + * next period, so no point in stopping it either, bail. + */ + if (!ts->tick_stopped) { + ts->timer_expires = 0; + goto out; + } + } + + /* + * If this CPU is the one which had the do_timer() duty last, we limit + * the sleep time to the timekeeping max_deferment value. + * Otherwise we can sleep as long as we want. + */ + delta = timekeeping_max_deferment(); + if (cpu != tick_do_timer_cpu && + (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last)) + delta = KTIME_MAX; + + /* Calculate the next expiry time */ + if (delta < (KTIME_MAX - basemono)) + expires = basemono + delta; + else + expires = KTIME_MAX; + + ts->timer_expires = min_t(u64, expires, next_tick); + +out: + return ts->timer_expires; +} + +static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + u64 basemono = ts->timer_expires_base; + u64 expires = ts->timer_expires; + ktime_t tick = expires; + + /* Make sure we won't be trying to stop it twice in a row. */ + ts->timer_expires_base = 0; + + /* + * If this CPU is the one which updates jiffies, then give up + * the assignment and let it be taken by the CPU which runs + * the tick timer next, which might be this CPU as well. If we + * don't drop this here the jiffies might be stale and + * do_timer() never invoked. Keep track of the fact that it + * was the one which had the do_timer() duty last. + */ + if (cpu == tick_do_timer_cpu) { + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + ts->do_timer_last = 1; + } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { + ts->do_timer_last = 0; + } + + /* Skip reprogram of event if its not changed */ + if (ts->tick_stopped && (expires == ts->next_tick)) { + /* Sanity check: make sure clockevent is actually programmed */ + if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) + return; + + WARN_ON_ONCE(1); + printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", + basemono, ts->next_tick, dev->next_event, + hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); + } + + /* + * nohz_stop_sched_tick can be called several times before + * the nohz_restart_sched_tick is called. This happens when + * interrupts arrive which do not cause a reschedule. In the + * first call we save the current tick time, so we can restart + * the scheduler tick in nohz_restart_sched_tick. + */ + if (!ts->tick_stopped) { + calc_load_nohz_start(); + quiet_vmstat(); + + ts->last_tick = hrtimer_get_expires(&ts->sched_timer); + ts->tick_stopped = 1; + trace_tick_stop(1, TICK_DEP_MASK_NONE); + } + + ts->next_tick = tick; + + /* + * If the expiration time == KTIME_MAX, then we simply stop + * the tick timer. + */ + if (unlikely(expires == KTIME_MAX)) { + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_cancel(&ts->sched_timer); + else + tick_program_event(KTIME_MAX, 1); + return; + } + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, tick, + HRTIMER_MODE_ABS_PINNED_HARD); + } else { + hrtimer_set_expires(&ts->sched_timer, tick); + tick_program_event(tick, 1); + } +} + +static void tick_nohz_retain_tick(struct tick_sched *ts) +{ + ts->timer_expires_base = 0; +} + +#ifdef CONFIG_NO_HZ_FULL +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu) +{ + if (tick_nohz_next_event(ts, cpu)) + tick_nohz_stop_tick(ts, cpu); + else + tick_nohz_retain_tick(ts); +} +#endif /* CONFIG_NO_HZ_FULL */ + +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) +{ + /* Update jiffies first */ + tick_do_update_jiffies64(now); + + /* + * Clear the timer idle flag, so we avoid IPIs on remote queueing and + * the clock forward checks in the enqueue path: + */ + timer_clear_idle(); + + calc_load_nohz_stop(); + touch_softlockup_watchdog_sched(); + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + tick_nohz_restart(ts, now); +} + +static void __tick_nohz_full_update_tick(struct tick_sched *ts, + ktime_t now) +{ +#ifdef CONFIG_NO_HZ_FULL + int cpu = smp_processor_id(); + + if (can_stop_full_tick(cpu, ts)) + tick_nohz_stop_sched_tick(ts, cpu); + else if (ts->tick_stopped) + tick_nohz_restart_sched_tick(ts, now); +#endif +} + +static void tick_nohz_full_update_tick(struct tick_sched *ts) +{ + if (!tick_nohz_full_cpu(smp_processor_id())) + return; + + if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) + return; + + __tick_nohz_full_update_tick(ts, ktime_get()); +} + +/* + * A pending softirq outside an IRQ (or softirq disabled section) context + * should be waiting for ksoftirqd to handle it. Therefore we shouldn't + * reach here due to the need_resched() early check in can_stop_idle_tick(). + * + * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the + * cpu_down() process, softirqs can still be raised while ksoftirqd is parked, + * triggering the below since wakep_softirqd() is ignored. + * + */ +static bool report_idle_softirq(void) +{ + static int ratelimit; + unsigned int pending = local_softirq_pending(); + + if (likely(!pending)) + return false; + + /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */ + if (!cpu_active(smp_processor_id())) { + pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK; + if (!pending) + return false; + } + + if (ratelimit >= 10) + return false; + + /* On RT, softirqs handling may be waiting on some lock */ + if (local_bh_blocked()) + return false; + + pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", + pending); + ratelimit++; + + return true; +} + +static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) +{ + /* + * If this CPU is offline and it is the one which updates + * jiffies, then give up the assignment and let it be taken by + * the CPU which runs the tick timer next. If we don't drop + * this here the jiffies might be stale and do_timer() never + * invoked. + */ + if (unlikely(!cpu_online(cpu))) { + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + /* + * Make sure the CPU doesn't get fooled by obsolete tick + * deadline if it comes back online later. + */ + ts->next_tick = 0; + return false; + } + + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + return false; + + if (need_resched()) + return false; + + if (unlikely(report_idle_softirq())) + return false; + + if (tick_nohz_full_enabled()) { + /* + * Keep the tick alive to guarantee timekeeping progression + * if there are full dynticks CPUs around + */ + if (tick_do_timer_cpu == cpu) + return false; + + /* Should not happen for nohz-full */ + if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) + return false; + } + + return true; +} + +/** + * tick_nohz_idle_stop_tick - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + */ +void tick_nohz_idle_stop_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + int cpu = smp_processor_id(); + ktime_t expires; + + /* + * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the + * tick timer expiration time is known already. + */ + if (ts->timer_expires_base) + expires = ts->timer_expires; + else if (can_stop_idle_tick(cpu, ts)) + expires = tick_nohz_next_event(ts, cpu); + else + return; + + ts->idle_calls++; + + if (expires > 0LL) { + int was_stopped = ts->tick_stopped; + + tick_nohz_stop_tick(ts, cpu); + + ts->idle_sleeps++; + ts->idle_expires = expires; + + if (!was_stopped && ts->tick_stopped) { + ts->idle_jiffies = ts->last_jiffies; + nohz_balance_enter_idle(cpu); + } + } else { + tick_nohz_retain_tick(ts); + } +} + +void tick_nohz_idle_retain_tick(void) +{ + tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); + /* + * Undo the effect of get_next_timer_interrupt() called from + * tick_nohz_next_event(). + */ + timer_clear_idle(); +} + +/** + * tick_nohz_idle_enter - prepare for entering idle on the current CPU + * + * Called when we start the idle loop. + */ +void tick_nohz_idle_enter(void) +{ + struct tick_sched *ts; + + lockdep_assert_irqs_enabled(); + + local_irq_disable(); + + ts = this_cpu_ptr(&tick_cpu_sched); + + WARN_ON_ONCE(ts->timer_expires_base); + + ts->inidle = 1; + tick_nohz_start_idle(ts); + + local_irq_enable(); +} + +/** + * tick_nohz_irq_exit - update next tick event from interrupt exit + * + * When an interrupt fires while we are idle and it doesn't cause + * a reschedule, it may still add, modify or delete a timer, enqueue + * an RCU callback, etc... + * So we need to re-calculate and reprogram the next tick event. + */ +void tick_nohz_irq_exit(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->inidle) + tick_nohz_start_idle(ts); + else + tick_nohz_full_update_tick(ts); +} + +/** + * tick_nohz_idle_got_tick - Check whether or not the tick handler has run + */ +bool tick_nohz_idle_got_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->got_idle_tick) { + ts->got_idle_tick = 0; + return true; + } + return false; +} + +/** + * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer + * or the tick, whatever that expires first. Note that, if the tick has been + * stopped, it returns the next hrtimer. + * + * Called from power state control code with interrupts disabled + */ +ktime_t tick_nohz_get_next_hrtimer(void) +{ + return __this_cpu_read(tick_cpu_device.evtdev)->next_event; +} + +/** + * tick_nohz_get_sleep_length - return the expected length of the current sleep + * @delta_next: duration until the next event if the tick cannot be stopped + * + * Called from power state control code with interrupts disabled. + * + * The return value of this function and/or the value returned by it through the + * @delta_next pointer can be negative which must be taken into account by its + * callers. + */ +ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + int cpu = smp_processor_id(); + /* + * The idle entry time is expected to be a sufficient approximation of + * the current time at this point. + */ + ktime_t now = ts->idle_entrytime; + ktime_t next_event; + + WARN_ON_ONCE(!ts->inidle); + + *delta_next = ktime_sub(dev->next_event, now); + + if (!can_stop_idle_tick(cpu, ts)) + return *delta_next; + + next_event = tick_nohz_next_event(ts, cpu); + if (!next_event) + return *delta_next; + + /* + * If the next highres timer to expire is earlier than next_event, the + * idle governor needs to know that. + */ + next_event = min_t(u64, next_event, + hrtimer_next_event_without(&ts->sched_timer)); + + return ktime_sub(next_event, now); +} + +/** + * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value + * for a particular CPU. + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls_cpu(int cpu) +{ + struct tick_sched *ts = tick_get_tick_sched(cpu); + + return ts->idle_calls; +} + +/** + * tick_nohz_get_idle_calls - return the current idle calls counter value + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->idle_calls; +} + +static void tick_nohz_account_idle_time(struct tick_sched *ts, + ktime_t now) +{ + unsigned long ticks; + + ts->idle_exittime = now; + + if (vtime_accounting_enabled_this_cpu()) + return; + /* + * We stopped the tick in idle. Update process times would miss the + * time we slept as update_process_times does only a 1 tick + * accounting. Enforce that this is accounted to idle ! + */ + ticks = jiffies - ts->idle_jiffies; + /* + * We might be one off. Do not randomly account a huge number of ticks! + */ + if (ticks && ticks < LONG_MAX) + account_idle_ticks(ticks); +} + +void tick_nohz_idle_restart_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->tick_stopped) { + ktime_t now = ktime_get(); + tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_time(ts, now); + } +} + +static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) +{ + if (tick_nohz_full_cpu(smp_processor_id())) + __tick_nohz_full_update_tick(ts, now); + else + tick_nohz_restart_sched_tick(ts, now); + + tick_nohz_account_idle_time(ts, now); +} + +/** + * tick_nohz_idle_exit - restart the idle tick from the idle task + * + * Restart the idle tick when the CPU is woken up from idle + * This also exit the RCU extended quiescent state. The CPU + * can use RCU again after this function is called. + */ +void tick_nohz_idle_exit(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + bool idle_active, tick_stopped; + ktime_t now; + + local_irq_disable(); + + WARN_ON_ONCE(!ts->inidle); + WARN_ON_ONCE(ts->timer_expires_base); + + ts->inidle = 0; + idle_active = ts->idle_active; + tick_stopped = ts->tick_stopped; + + if (idle_active || tick_stopped) + now = ktime_get(); + + if (idle_active) + tick_nohz_stop_idle(ts, now); + + if (tick_stopped) + tick_nohz_idle_update_tick(ts, now); + + local_irq_enable(); +} + +/* + * The nohz low res interrupt handler + */ +static void tick_nohz_handler(struct clock_event_device *dev) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + + dev->next_event = KTIME_MAX; + + tick_sched_do_timer(ts, now); + tick_sched_handle(ts, regs); + + if (unlikely(ts->tick_stopped)) { + /* + * The clockevent device is not reprogrammed, so change the + * clock event device to ONESHOT_STOPPED to avoid spurious + * interrupts on devices which might not be truly one shot. + */ + tick_program_event(KTIME_MAX, 1); + return; + } + + hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); +} + +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) +{ + if (!tick_nohz_enabled) + return; + ts->nohz_mode = mode; + /* One update is enough */ + if (!test_and_set_bit(0, &tick_nohz_active)) + timers_update_nohz(); +} + +/** + * tick_nohz_switch_to_nohz - switch to nohz mode + */ +static void tick_nohz_switch_to_nohz(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + ktime_t next; + + if (!tick_nohz_enabled) + return; + + if (tick_switch_to_oneshot(tick_nohz_handler)) + return; + + /* + * Recycle the hrtimer in ts, so we can share the + * hrtimer_forward with the highres code. + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + /* Get the next period */ + next = tick_init_jiffy_update(); + + hrtimer_set_expires(&ts->sched_timer, next); + hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + tick_nohz_activate(ts, NOHZ_MODE_LOWRES); +} + +static inline void tick_nohz_irq_enter(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + ktime_t now; + + if (!ts->idle_active && !ts->tick_stopped) + return; + now = ktime_get(); + if (ts->idle_active) + tick_nohz_stop_idle(ts, now); + /* + * If all CPUs are idle. We may need to update a stale jiffies value. + * Note nohz_full is a special case: a timekeeper is guaranteed to stay + * alive but it might be busy looping with interrupts disabled in some + * rare case (typically stop machine). So we must make sure we have a + * last resort. + */ + if (ts->tick_stopped) + tick_nohz_update_jiffies(now); +} + +#else + +static inline void tick_nohz_switch_to_nohz(void) { } +static inline void tick_nohz_irq_enter(void) { } +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } + +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * Called from irq_enter to notify about the possible interruption of idle() + */ +void tick_irq_enter(void) +{ + tick_check_oneshot_broadcast_this_cpu(); + tick_nohz_irq_enter(); +} + +/* + * High resolution timer specific code + */ +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * We rearm the timer until we get disabled by the idle code. + * Called with interrupts disabled. + */ +static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) +{ + struct tick_sched *ts = + container_of(timer, struct tick_sched, sched_timer); + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + + tick_sched_do_timer(ts, now); + + /* + * Do not call, when we are not in irq context and have + * no valid regs pointer + */ + if (regs) + tick_sched_handle(ts, regs); + else + ts->next_tick = 0; + + /* No need to reprogram if we are in idle or full dynticks mode */ + if (unlikely(ts->tick_stopped)) + return HRTIMER_NORESTART; + + hrtimer_forward(timer, now, TICK_NSEC); + + return HRTIMER_RESTART; +} + +static int sched_skew_tick; + +static int __init skew_tick(char *str) +{ + get_option(&str, &sched_skew_tick); + + return 0; +} +early_param("skew_tick", skew_tick); + +/** + * tick_setup_sched_timer - setup the tick emulation timer + */ +void tick_setup_sched_timer(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + ktime_t now = ktime_get(); + + /* + * Emulate tick processing via per-CPU hrtimers: + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + ts->sched_timer.function = tick_sched_timer; + + /* Get the next period (per-CPU) */ + hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); + + /* Offset the tick to avert jiffies_lock contention. */ + if (sched_skew_tick) { + u64 offset = TICK_NSEC >> 1; + do_div(offset, num_possible_cpus()); + offset *= smp_processor_id(); + hrtimer_add_expires_ns(&ts->sched_timer, offset); + } + + hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); + tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); +} +#endif /* HIGH_RES_TIMERS */ + +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS +void tick_cancel_sched_timer(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t idle_sleeptime, iowait_sleeptime; + unsigned long idle_calls, idle_sleeps; + +# ifdef CONFIG_HIGH_RES_TIMERS + if (ts->sched_timer.base) + hrtimer_cancel(&ts->sched_timer); +# endif + + idle_sleeptime = ts->idle_sleeptime; + iowait_sleeptime = ts->iowait_sleeptime; + idle_calls = ts->idle_calls; + idle_sleeps = ts->idle_sleeps; + memset(ts, 0, sizeof(*ts)); + ts->idle_sleeptime = idle_sleeptime; + ts->iowait_sleeptime = iowait_sleeptime; + ts->idle_calls = idle_calls; + ts->idle_sleeps = idle_sleeps; +} +#endif + +/* + * Async notification about clocksource changes + */ +void tick_clock_notify(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); +} + +/* + * Async notification about clock event changes + */ +void tick_oneshot_notify(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + set_bit(0, &ts->check_clocks); +} + +/* + * Check, if a change happened, which makes oneshot possible. + * + * Called cyclic from the hrtimer softirq (driven by the timer + * softirq) allow_nohz signals, that we can switch into low-res nohz + * mode, because high resolution timers are disabled (either compile + * or runtime). Called with interrupts disabled. + */ +int tick_check_oneshot_change(int allow_nohz) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (!test_and_clear_bit(0, &ts->check_clocks)) + return 0; + + if (ts->nohz_mode != NOHZ_MODE_INACTIVE) + return 0; + + if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) + return 0; + + if (!allow_nohz) + return 1; + + tick_nohz_switch_to_nohz(); + return 0; +} diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h new file mode 100644 index 0000000000..5ed5a9d41d --- /dev/null +++ b/kernel/time/tick-sched.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TICK_SCHED_H +#define _TICK_SCHED_H + +#include <linux/hrtimer.h> + +enum tick_device_mode { + TICKDEV_MODE_PERIODIC, + TICKDEV_MODE_ONESHOT, +}; + +struct tick_device { + struct clock_event_device *evtdev; + enum tick_device_mode mode; +}; + +enum tick_nohz_mode { + NOHZ_MODE_INACTIVE, + NOHZ_MODE_LOWRES, + NOHZ_MODE_HIGHRES, +}; + +/** + * struct tick_sched - sched tick emulation and no idle tick control/stats + * + * @inidle: Indicator that the CPU is in the tick idle mode + * @tick_stopped: Indicator that the idle tick has been stopped + * @idle_active: Indicator that the CPU is actively in the tick idle mode; + * it is reset during irq handling phases. + * @do_timer_last: CPU was the last one doing do_timer before going idle + * @got_idle_tick: Tick timer function has run with @inidle set + * @stalled_jiffies: Number of stalled jiffies detected across ticks + * @last_tick_jiffies: Value of jiffies seen on last tick + * @sched_timer: hrtimer to schedule the periodic tick in high + * resolution mode + * @last_tick: Store the last tick expiry time when the tick + * timer is modified for nohz sleeps. This is necessary + * to resume the tick timer operation in the timeline + * when the CPU returns from nohz sleep. + * @next_tick: Next tick to be fired when in dynticks mode. + * @idle_jiffies: jiffies at the entry to idle for idle time accounting + * @idle_waketime: Time when the idle was interrupted + * @idle_entrytime: Time when the idle call was entered + * @nohz_mode: Mode - one state of tick_nohz_mode + * @last_jiffies: Base jiffies snapshot when next event was last computed + * @timer_expires_base: Base time clock monotonic for @timer_expires + * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) + * @next_timer: Expiry time of next expiring timer for debugging purpose only + * @idle_expires: Next tick in idle, for debugging purpose only + * @idle_calls: Total number of idle calls + * @idle_sleeps: Number of idle calls, where the sched tick was stopped + * @idle_exittime: Time when the idle state was left + * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped + * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding + * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick + * @check_clocks: Notification mechanism about clocksource changes + */ +struct tick_sched { + /* Common flags */ + unsigned int inidle : 1; + unsigned int tick_stopped : 1; + unsigned int idle_active : 1; + unsigned int do_timer_last : 1; + unsigned int got_idle_tick : 1; + + /* Tick handling: jiffies stall check */ + unsigned int stalled_jiffies; + unsigned long last_tick_jiffies; + + /* Tick handling */ + struct hrtimer sched_timer; + ktime_t last_tick; + ktime_t next_tick; + unsigned long idle_jiffies; + ktime_t idle_waketime; + + /* Idle entry */ + seqcount_t idle_sleeptime_seq; + ktime_t idle_entrytime; + + /* Tick stop */ + enum tick_nohz_mode nohz_mode; + unsigned long last_jiffies; + u64 timer_expires_base; + u64 timer_expires; + u64 next_timer; + ktime_t idle_expires; + unsigned long idle_calls; + unsigned long idle_sleeps; + + /* Idle exit */ + ktime_t idle_exittime; + ktime_t idle_sleeptime; + ktime_t iowait_sleeptime; + + /* Full dynticks handling */ + atomic_t tick_dep_mask; + + /* Clocksource changes */ + unsigned long check_clocks; +}; + +extern struct tick_sched *tick_get_tick_sched(int cpu); + +extern void tick_setup_sched_timer(void); +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS +extern void tick_cancel_sched_timer(int cpu); +#else +static inline void tick_cancel_sched_timer(int cpu) { } +#endif + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int __tick_broadcast_oneshot_control(enum tick_broadcast_state state); +#else +static inline int +__tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + return -EBUSY; +} +#endif + +#endif diff --git a/kernel/time/time.c b/kernel/time/time.c new file mode 100644 index 0000000000..642647f504 --- /dev/null +++ b/kernel/time/time.c @@ -0,0 +1,1056 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file contains the interface functions for the various time related + * system calls: time, stime, gettimeofday, settimeofday, adjtime + * + * Modification history: + * + * 1993-09-02 Philip Gladstone + * Created file with time related functions from sched/core.c and adjtimex() + * 1993-10-08 Torsten Duwe + * adjtime interface update and CMOS clock write code + * 1995-08-13 Torsten Duwe + * kernel PLL updated to 1994-12-13 specs (rfc-1589) + * 1999-01-16 Ulrich Windl + * Introduced error checking for many cases in adjtimex(). + * Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) + * (Even though the technical memorandum forbids it) + * 2004-07-14 Christoph Lameter + * Added getnstimeofday to allow the posix timer functions to return + * with nanosecond accuracy + */ + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/timex.h> +#include <linux/capability.h> +#include <linux/timekeeper_internal.h> +#include <linux/errno.h> +#include <linux/syscalls.h> +#include <linux/security.h> +#include <linux/fs.h> +#include <linux/math64.h> +#include <linux/ptrace.h> + +#include <linux/uaccess.h> +#include <linux/compat.h> +#include <asm/unistd.h> + +#include <generated/timeconst.h> +#include "timekeeping.h" + +/* + * The timezone where the local system is located. Used as a default by some + * programs who obtain this value by using gettimeofday. + */ +struct timezone sys_tz; + +EXPORT_SYMBOL(sys_tz); + +#ifdef __ARCH_WANT_SYS_TIME + +/* + * sys_time() can be implemented in user-level using + * sys_gettimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ +SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc) +{ + __kernel_old_time_t i = (__kernel_old_time_t)ktime_get_real_seconds(); + + if (tloc) { + if (put_user(i,tloc)) + return -EFAULT; + } + force_successful_syscall_return(); + return i; +} + +/* + * sys_stime() can be implemented in user-level using + * sys_settimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ + +SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr) +{ + struct timespec64 tv; + int err; + + if (get_user(tv.tv_sec, tptr)) + return -EFAULT; + + tv.tv_nsec = 0; + + err = security_settime64(&tv, NULL); + if (err) + return err; + + do_settimeofday64(&tv); + return 0; +} + +#endif /* __ARCH_WANT_SYS_TIME */ + +#ifdef CONFIG_COMPAT_32BIT_TIME +#ifdef __ARCH_WANT_SYS_TIME32 + +/* old_time32_t is a 32 bit "long" and needs to get converted. */ +SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc) +{ + old_time32_t i; + + i = (old_time32_t)ktime_get_real_seconds(); + + if (tloc) { + if (put_user(i,tloc)) + return -EFAULT; + } + force_successful_syscall_return(); + return i; +} + +SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr) +{ + struct timespec64 tv; + int err; + + if (get_user(tv.tv_sec, tptr)) + return -EFAULT; + + tv.tv_nsec = 0; + + err = security_settime64(&tv, NULL); + if (err) + return err; + + do_settimeofday64(&tv); + return 0; +} + +#endif /* __ARCH_WANT_SYS_TIME32 */ +#endif + +SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv, + struct timezone __user *, tz) +{ + if (likely(tv != NULL)) { + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + if (put_user(ts.tv_sec, &tv->tv_sec) || + put_user(ts.tv_nsec / 1000, &tv->tv_usec)) + return -EFAULT; + } + if (unlikely(tz != NULL)) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + return 0; +} + +/* + * In case for some reason the CMOS clock has not already been running + * in UTC, but in some local time: The first time we set the timezone, + * we will warp the clock so that it is ticking UTC time instead of + * local time. Presumably, if someone is setting the timezone then we + * are running in an environment where the programs understand about + * timezones. This should be done at boot time in the /etc/rc script, + * as soon as possible, so that the clock can be set right. Otherwise, + * various programs will get confused when the clock gets warped. + */ + +int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz) +{ + static int firsttime = 1; + int error = 0; + + if (tv && !timespec64_valid_settod(tv)) + return -EINVAL; + + error = security_settime64(tv, tz); + if (error) + return error; + + if (tz) { + /* Verify we're within the +-15 hrs range */ + if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) + return -EINVAL; + + sys_tz = *tz; + update_vsyscall_tz(); + if (firsttime) { + firsttime = 0; + if (!tv) + timekeeping_warp_clock(); + } + } + if (tv) + return do_settimeofday64(tv); + return 0; +} + +SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv, + struct timezone __user *, tz) +{ + struct timespec64 new_ts; + struct timezone new_tz; + + if (tv) { + if (get_user(new_ts.tv_sec, &tv->tv_sec) || + get_user(new_ts.tv_nsec, &tv->tv_usec)) + return -EFAULT; + + if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) + return -EINVAL; + + new_ts.tv_nsec *= NSEC_PER_USEC; + } + if (tz) { + if (copy_from_user(&new_tz, tz, sizeof(*tz))) + return -EFAULT; + } + + return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(gettimeofday, struct old_timeval32 __user *, tv, + struct timezone __user *, tz) +{ + if (tv) { + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + if (put_user(ts.tv_sec, &tv->tv_sec) || + put_user(ts.tv_nsec / 1000, &tv->tv_usec)) + return -EFAULT; + } + if (tz) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + + return 0; +} + +COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, + struct timezone __user *, tz) +{ + struct timespec64 new_ts; + struct timezone new_tz; + + if (tv) { + if (get_user(new_ts.tv_sec, &tv->tv_sec) || + get_user(new_ts.tv_nsec, &tv->tv_usec)) + return -EFAULT; + + if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) + return -EINVAL; + + new_ts.tv_nsec *= NSEC_PER_USEC; + } + if (tz) { + if (copy_from_user(&new_tz, tz, sizeof(*tz))) + return -EFAULT; + } + + return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); +} +#endif + +#ifdef CONFIG_64BIT +SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p) +{ + struct __kernel_timex txc; /* Local copy of parameter */ + int ret; + + /* Copy the user data space into the kernel copy + * structure. But bear in mind that the structures + * may change + */ + if (copy_from_user(&txc, txc_p, sizeof(struct __kernel_timex))) + return -EFAULT; + ret = do_adjtimex(&txc); + return copy_to_user(txc_p, &txc, sizeof(struct __kernel_timex)) ? -EFAULT : ret; +} +#endif + +#ifdef CONFIG_COMPAT_32BIT_TIME +int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp) +{ + struct old_timex32 tx32; + + memset(txc, 0, sizeof(struct __kernel_timex)); + if (copy_from_user(&tx32, utp, sizeof(struct old_timex32))) + return -EFAULT; + + txc->modes = tx32.modes; + txc->offset = tx32.offset; + txc->freq = tx32.freq; + txc->maxerror = tx32.maxerror; + txc->esterror = tx32.esterror; + txc->status = tx32.status; + txc->constant = tx32.constant; + txc->precision = tx32.precision; + txc->tolerance = tx32.tolerance; + txc->time.tv_sec = tx32.time.tv_sec; + txc->time.tv_usec = tx32.time.tv_usec; + txc->tick = tx32.tick; + txc->ppsfreq = tx32.ppsfreq; + txc->jitter = tx32.jitter; + txc->shift = tx32.shift; + txc->stabil = tx32.stabil; + txc->jitcnt = tx32.jitcnt; + txc->calcnt = tx32.calcnt; + txc->errcnt = tx32.errcnt; + txc->stbcnt = tx32.stbcnt; + + return 0; +} + +int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc) +{ + struct old_timex32 tx32; + + memset(&tx32, 0, sizeof(struct old_timex32)); + tx32.modes = txc->modes; + tx32.offset = txc->offset; + tx32.freq = txc->freq; + tx32.maxerror = txc->maxerror; + tx32.esterror = txc->esterror; + tx32.status = txc->status; + tx32.constant = txc->constant; + tx32.precision = txc->precision; + tx32.tolerance = txc->tolerance; + tx32.time.tv_sec = txc->time.tv_sec; + tx32.time.tv_usec = txc->time.tv_usec; + tx32.tick = txc->tick; + tx32.ppsfreq = txc->ppsfreq; + tx32.jitter = txc->jitter; + tx32.shift = txc->shift; + tx32.stabil = txc->stabil; + tx32.jitcnt = txc->jitcnt; + tx32.calcnt = txc->calcnt; + tx32.errcnt = txc->errcnt; + tx32.stbcnt = txc->stbcnt; + tx32.tai = txc->tai; + if (copy_to_user(utp, &tx32, sizeof(struct old_timex32))) + return -EFAULT; + return 0; +} + +SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp) +{ + struct __kernel_timex txc; + int err, ret; + + err = get_old_timex32(&txc, utp); + if (err) + return err; + + ret = do_adjtimex(&txc); + + err = put_old_timex32(utp, &txc); + if (err) + return err; + + return ret; +} +#endif + +/** + * jiffies_to_msecs - Convert jiffies to milliseconds + * @j: jiffies value + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases. + * + * Return: milliseconds value + */ +unsigned int jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else +# if BITS_PER_LONG == 32 + return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >> + HZ_TO_MSEC_SHR32; +# else + return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); +# endif +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +/** + * jiffies_to_usecs - Convert jiffies to microseconds + * @j: jiffies value + * + * Return: microseconds value + */ +unsigned int jiffies_to_usecs(const unsigned long j) +{ + /* + * Hz usually doesn't go much further MSEC_PER_SEC. + * jiffies_to_usecs() and usecs_to_jiffies() depend on that. + */ + BUILD_BUG_ON(HZ > USEC_PER_SEC); + +#if !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#else +# if BITS_PER_LONG == 32 + return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; +# else + return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; +# endif +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); + +/** + * mktime64 - Converts date to seconds. + * @year0: year to convert + * @mon0: month to convert + * @day: day to convert + * @hour: hour to convert + * @min: minute to convert + * @sec: second to convert + * + * Converts Gregorian date to seconds since 1970-01-01 00:00:00. + * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 + * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. + * + * [For the Julian calendar (which was used in Russia before 1917, + * Britain & colonies before 1752, anywhere else before 1582, + * and is still in use by some communities) leave out the + * -year/100+year/400 terms, and add 10.] + * + * This algorithm was first published by Gauss (I think). + * + * A leap second can be indicated by calling this function with sec as + * 60 (allowable under ISO 8601). The leap second is treated the same + * as the following second since they don't exist in UNIX time. + * + * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight + * tomorrow - (allowable under ISO 8601) is supported. + * + * Return: seconds since the epoch time for the given input date + */ +time64_t mktime64(const unsigned int year0, const unsigned int mon0, + const unsigned int day, const unsigned int hour, + const unsigned int min, const unsigned int sec) +{ + unsigned int mon = mon0, year = year0; + + /* 1..12 -> 11,12,1..10 */ + if (0 >= (int) (mon -= 2)) { + mon += 12; /* Puts Feb last since it has leap day */ + year -= 1; + } + + return ((((time64_t) + (year/4 - year/100 + year/400 + 367*mon/12 + day) + + year*365 - 719499 + )*24 + hour /* now have hours - midnight tomorrow handled here */ + )*60 + min /* now have minutes */ + )*60 + sec; /* finally seconds */ +} +EXPORT_SYMBOL(mktime64); + +struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec) +{ + struct timespec64 ts = ns_to_timespec64(nsec); + struct __kernel_old_timeval tv; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = (suseconds_t)ts.tv_nsec / 1000; + + return tv; +} +EXPORT_SYMBOL(ns_to_kernel_old_timeval); + +/** + * set_normalized_timespec64 - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of 0 <= tv_nsec < NSEC_PER_SEC. + * For negative values only the tv_sec field is negative ! + */ +void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) +{ + while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + asm("" : "+rm"(nsec)); + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} +EXPORT_SYMBOL(set_normalized_timespec64); + +/** + * ns_to_timespec64 - Convert nanoseconds to timespec64 + * @nsec: the nanoseconds value to be converted + * + * Return: the timespec64 representation of the nsec parameter. + */ +struct timespec64 ns_to_timespec64(s64 nsec) +{ + struct timespec64 ts = { 0, 0 }; + s32 rem; + + if (likely(nsec > 0)) { + ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + ts.tv_nsec = rem; + } else if (nsec < 0) { + /* + * With negative times, tv_sec points to the earlier + * second, and tv_nsec counts the nanoseconds since + * then, so tv_nsec is always a positive number. + */ + ts.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1; + ts.tv_nsec = NSEC_PER_SEC - rem - 1; + } + + return ts; +} +EXPORT_SYMBOL(ns_to_timespec64); + +/** + * __msecs_to_jiffies: - convert milliseconds to jiffies + * @m: time in milliseconds + * + * conversion is done as follows: + * + * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor and + * handling any 32-bit overflows. + * for the details see __msecs_to_jiffies() + * + * __msecs_to_jiffies() checks for the passed in value being a constant + * via __builtin_constant_p() allowing gcc to eliminate most of the + * code, __msecs_to_jiffies() is called if the value passed does not + * allow constant folding and the actual conversion must be done at + * runtime. + * The _msecs_to_jiffies helpers are the HZ dependent conversion + * routines found in include/linux/jiffies.h + * + * Return: jiffies value + */ +unsigned long __msecs_to_jiffies(const unsigned int m) +{ + /* + * Negative value, means infinite timeout: + */ + if ((int)m < 0) + return MAX_JIFFY_OFFSET; + return _msecs_to_jiffies(m); +} +EXPORT_SYMBOL(__msecs_to_jiffies); + +/** + * __usecs_to_jiffies: - convert microseconds to jiffies + * @u: time in milliseconds + * + * Return: jiffies value + */ +unsigned long __usecs_to_jiffies(const unsigned int u) +{ + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + return _usecs_to_jiffies(u); +} +EXPORT_SYMBOL(__usecs_to_jiffies); + +/** + * timespec64_to_jiffies - convert a timespec64 value to jiffies + * @value: pointer to &struct timespec64 + * + * The TICK_NSEC - 1 rounds up the value to the next resolution. Note + * that a remainder subtract here would not do the right thing as the + * resolution values don't fall on second boundaries. I.e. the line: + * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * Note that due to the small error in the multiplier here, this + * rounding is incorrect for sufficiently large values of tv_nsec, but + * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're + * OK. + * + * Rather, we just shift the bits off the right. + * + * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec + * value to a scaled second value. + * + * Return: jiffies value + */ +unsigned long +timespec64_to_jiffies(const struct timespec64 *value) +{ + u64 sec = value->tv_sec; + long nsec = value->tv_nsec + TICK_NSEC - 1; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + nsec = 0; + } + return ((sec * SEC_CONVERSION) + + (((u64)nsec * NSEC_CONVERSION) >> + (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + +} +EXPORT_SYMBOL(timespec64_to_jiffies); + +/** + * jiffies_to_timespec64 - convert jiffies value to &struct timespec64 + * @jiffies: jiffies value + * @value: pointer to &struct timespec64 + */ +void +jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u32 rem; + value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_nsec = rem; +} +EXPORT_SYMBOL(jiffies_to_timespec64); + +/* + * Convert jiffies/jiffies_64 to clock_t and back. + */ + +/** + * jiffies_to_clock_t - Convert jiffies to clock_t + * @x: jiffies value + * + * Return: jiffies converted to clock_t (CLOCKS_PER_SEC) + */ +clock_t jiffies_to_clock_t(unsigned long x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + return x * (USER_HZ / HZ); +# else + return x / (HZ / USER_HZ); +# endif +#else + return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); +#endif +} +EXPORT_SYMBOL(jiffies_to_clock_t); + +/** + * clock_t_to_jiffies - Convert clock_t to jiffies + * @x: clock_t value + * + * Return: clock_t value converted to jiffies + */ +unsigned long clock_t_to_jiffies(unsigned long x) +{ +#if (HZ % USER_HZ)==0 + if (x >= ~0UL / (HZ / USER_HZ)) + return ~0UL; + return x * (HZ / USER_HZ); +#else + /* Don't worry about loss of precision here .. */ + if (x >= ~0UL / HZ * USER_HZ) + return ~0UL; + + /* .. but do try to contain it here */ + return div_u64((u64)x * HZ, USER_HZ); +#endif +} +EXPORT_SYMBOL(clock_t_to_jiffies); + +/** + * jiffies_64_to_clock_t - Convert jiffies_64 to clock_t + * @x: jiffies_64 value + * + * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC) + */ +u64 jiffies_64_to_clock_t(u64 x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + x = div_u64(x * USER_HZ, HZ); +# elif HZ > USER_HZ + x = div_u64(x, HZ / USER_HZ); +# else + /* Nothing to do */ +# endif +#else + /* + * There are better ways that don't overflow early, + * but even this doesn't overflow in hundreds of years + * in 64 bits, so.. + */ + x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); +#endif + return x; +} +EXPORT_SYMBOL(jiffies_64_to_clock_t); + +/** + * nsec_to_clock_t - Convert nsec value to clock_t + * @x: nsec value + * + * Return: nsec value converted to 64-bit "clock_t" (CLOCKS_PER_SEC) + */ +u64 nsec_to_clock_t(u64 x) +{ +#if (NSEC_PER_SEC % USER_HZ) == 0 + return div_u64(x, NSEC_PER_SEC / USER_HZ); +#elif (USER_HZ % 512) == 0 + return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, + * overflow after 64.99 years. + * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... + */ + return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); +#endif +} + +/** + * jiffies64_to_nsecs - Convert jiffies64 to nanoseconds + * @j: jiffies64 value + * + * Return: nanoseconds value + */ +u64 jiffies64_to_nsecs(u64 j) +{ +#if !(NSEC_PER_SEC % HZ) + return (NSEC_PER_SEC / HZ) * j; +# else + return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN); +#endif +} +EXPORT_SYMBOL(jiffies64_to_nsecs); + +/** + * jiffies64_to_msecs - Convert jiffies64 to milliseconds + * @j: jiffies64 value + * + * Return: milliseconds value + */ +u64 jiffies64_to_msecs(const u64 j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#else + return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); +#endif +} +EXPORT_SYMBOL(jiffies64_to_msecs); + +/** + * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + * + * Return: nsecs converted to jiffies64 value + */ +u64 nsecs_to_jiffies64(u64 n) +{ +#if (NSEC_PER_SEC % HZ) == 0 + /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ + return div_u64(n, NSEC_PER_SEC / HZ); +#elif (HZ % 512) == 0 + /* overflow after 292 years if HZ = 1024 */ + return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * Generic case - optimized for cases where HZ is a multiple of 3. + * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. + */ + return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); +#endif +} +EXPORT_SYMBOL(nsecs_to_jiffies64); + +/** + * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + * + * Return: nsecs converted to jiffies value + */ +unsigned long nsecs_to_jiffies(u64 n) +{ + return (unsigned long)nsecs_to_jiffies64(n); +} +EXPORT_SYMBOL_GPL(nsecs_to_jiffies); + +/** + * timespec64_add_safe - Add two timespec64 values and do a safety check + * for overflow. + * @lhs: first (left) timespec64 to add + * @rhs: second (right) timespec64 to add + * + * It's assumed that both values are valid (>= 0). + * And, each timespec64 is in normalized form. + * + * Return: sum of @lhs + @rhs + */ +struct timespec64 timespec64_add_safe(const struct timespec64 lhs, + const struct timespec64 rhs) +{ + struct timespec64 res; + + set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { + res.tv_sec = TIME64_MAX; + res.tv_nsec = 0; + } + + return res; +} + +/** + * get_timespec64 - get user's time value into kernel space + * @ts: destination &struct timespec64 + * @uts: user's time value as &struct __kernel_timespec + * + * Handles compat or 32-bit modes. + * + * Return: %0 on success or negative errno on error + */ +int get_timespec64(struct timespec64 *ts, + const struct __kernel_timespec __user *uts) +{ + struct __kernel_timespec kts; + int ret; + + ret = copy_from_user(&kts, uts, sizeof(kts)); + if (ret) + return -EFAULT; + + ts->tv_sec = kts.tv_sec; + + /* Zero out the padding in compat mode */ + if (in_compat_syscall()) + kts.tv_nsec &= 0xFFFFFFFFUL; + + /* In 32-bit mode, this drops the padding */ + ts->tv_nsec = kts.tv_nsec; + + return 0; +} +EXPORT_SYMBOL_GPL(get_timespec64); + +/** + * put_timespec64 - convert timespec64 value to __kernel_timespec format and + * copy the latter to userspace + * @ts: input &struct timespec64 + * @uts: user's &struct __kernel_timespec + * + * Return: %0 on success or negative errno on error + */ +int put_timespec64(const struct timespec64 *ts, + struct __kernel_timespec __user *uts) +{ + struct __kernel_timespec kts = { + .tv_sec = ts->tv_sec, + .tv_nsec = ts->tv_nsec + }; + + return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(put_timespec64); + +static int __get_old_timespec32(struct timespec64 *ts64, + const struct old_timespec32 __user *cts) +{ + struct old_timespec32 ts; + int ret; + + ret = copy_from_user(&ts, cts, sizeof(ts)); + if (ret) + return -EFAULT; + + ts64->tv_sec = ts.tv_sec; + ts64->tv_nsec = ts.tv_nsec; + + return 0; +} + +static int __put_old_timespec32(const struct timespec64 *ts64, + struct old_timespec32 __user *cts) +{ + struct old_timespec32 ts = { + .tv_sec = ts64->tv_sec, + .tv_nsec = ts64->tv_nsec + }; + return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; +} + +/** + * get_old_timespec32 - get user's old-format time value into kernel space + * @ts: destination &struct timespec64 + * @uts: user's old-format time value (&struct old_timespec32) + * + * Handles X86_X32_ABI compatibility conversion. + * + * Return: %0 on success or negative errno on error + */ +int get_old_timespec32(struct timespec64 *ts, const void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; + else + return __get_old_timespec32(ts, uts); +} +EXPORT_SYMBOL_GPL(get_old_timespec32); + +/** + * put_old_timespec32 - convert timespec64 value to &struct old_timespec32 and + * copy the latter to userspace + * @ts: input &struct timespec64 + * @uts: user's &struct old_timespec32 + * + * Handles X86_X32_ABI compatibility conversion. + * + * Return: %0 on success or negative errno on error + */ +int put_old_timespec32(const struct timespec64 *ts, void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; + else + return __put_old_timespec32(ts, uts); +} +EXPORT_SYMBOL_GPL(put_old_timespec32); + +/** + * get_itimerspec64 - get user's &struct __kernel_itimerspec into kernel space + * @it: destination &struct itimerspec64 + * @uit: user's &struct __kernel_itimerspec + * + * Return: %0 on success or negative errno on error + */ +int get_itimerspec64(struct itimerspec64 *it, + const struct __kernel_itimerspec __user *uit) +{ + int ret; + + ret = get_timespec64(&it->it_interval, &uit->it_interval); + if (ret) + return ret; + + ret = get_timespec64(&it->it_value, &uit->it_value); + + return ret; +} +EXPORT_SYMBOL_GPL(get_itimerspec64); + +/** + * put_itimerspec64 - convert &struct itimerspec64 to __kernel_itimerspec format + * and copy the latter to userspace + * @it: input &struct itimerspec64 + * @uit: user's &struct __kernel_itimerspec + * + * Return: %0 on success or negative errno on error + */ +int put_itimerspec64(const struct itimerspec64 *it, + struct __kernel_itimerspec __user *uit) +{ + int ret; + + ret = put_timespec64(&it->it_interval, &uit->it_interval); + if (ret) + return ret; + + ret = put_timespec64(&it->it_value, &uit->it_value); + + return ret; +} +EXPORT_SYMBOL_GPL(put_itimerspec64); + +/** + * get_old_itimerspec32 - get user's &struct old_itimerspec32 into kernel space + * @its: destination &struct itimerspec64 + * @uits: user's &struct old_itimerspec32 + * + * Return: %0 on success or negative errno on error + */ +int get_old_itimerspec32(struct itimerspec64 *its, + const struct old_itimerspec32 __user *uits) +{ + + if (__get_old_timespec32(&its->it_interval, &uits->it_interval) || + __get_old_timespec32(&its->it_value, &uits->it_value)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(get_old_itimerspec32); + +/** + * put_old_itimerspec32 - convert &struct itimerspec64 to &struct + * old_itimerspec32 and copy the latter to userspace + * @its: input &struct itimerspec64 + * @uits: user's &struct old_itimerspec32 + * + * Return: %0 on success or negative errno on error + */ +int put_old_itimerspec32(const struct itimerspec64 *its, + struct old_itimerspec32 __user *uits) +{ + if (__put_old_timespec32(&its->it_interval, &uits->it_interval) || + __put_old_timespec32(&its->it_value, &uits->it_value)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(put_old_itimerspec32); diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c new file mode 100644 index 0000000000..ca058c8af6 --- /dev/null +++ b/kernel/time/time_test.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: LGPL-2.1+ + +#include <kunit/test.h> +#include <linux/time.h> + +/* + * Traditional implementation of leap year evaluation. + */ +static bool is_leap(long year) +{ + return year % 4 == 0 && (year % 100 != 0 || year % 400 == 0); +} + +/* + * Gets the last day of a month. + */ +static int last_day_of_month(long year, int month) +{ + if (month == 2) + return 28 + is_leap(year); + if (month == 4 || month == 6 || month == 9 || month == 11) + return 30; + return 31; +} + +/* + * Advances a date by one day. + */ +static void advance_date(long *year, int *month, int *mday, int *yday) +{ + if (*mday != last_day_of_month(*year, *month)) { + ++*mday; + ++*yday; + return; + } + + *mday = 1; + if (*month != 12) { + ++*month; + ++*yday; + return; + } + + *month = 1; + *yday = 0; + ++*year; +} + +/* + * Checks every day in a 160000 years interval centered at 1970-01-01 + * against the expected result. + */ +static void time64_to_tm_test_date_range(struct kunit *test) +{ + /* + * 80000 years = (80000 / 400) * 400 years + * = (80000 / 400) * 146097 days + * = (80000 / 400) * 146097 * 86400 seconds + */ + time64_t total_secs = ((time64_t) 80000) / 400 * 146097 * 86400; + long year = 1970 - 80000; + int month = 1; + int mdday = 1; + int yday = 0; + + struct tm result; + time64_t secs; + s64 days; + + for (secs = -total_secs; secs <= total_secs; secs += 86400) { + + time64_to_tm(secs, 0, &result); + + days = div_s64(secs, 86400); + + #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %ld", \ + year, month, mdday, yday, days + + KUNIT_ASSERT_EQ_MSG(test, year - 1900, result.tm_year, FAIL_MSG); + KUNIT_ASSERT_EQ_MSG(test, month - 1, result.tm_mon, FAIL_MSG); + KUNIT_ASSERT_EQ_MSG(test, mdday, result.tm_mday, FAIL_MSG); + KUNIT_ASSERT_EQ_MSG(test, yday, result.tm_yday, FAIL_MSG); + + advance_date(&year, &month, &mdday, &yday); + } +} + +static struct kunit_case time_test_cases[] = { + KUNIT_CASE_SLOW(time64_to_tm_test_date_range), + {} +}; + +static struct kunit_suite time_test_suite = { + .name = "time_test_cases", + .test_cases = time_test_cases, +}; + +kunit_test_suite(time_test_suite); +MODULE_LICENSE("GPL"); diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc new file mode 100644 index 0000000000..7ed0e0fb58 --- /dev/null +++ b/kernel/time/timeconst.bc @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +scale=0 + +define gcd(a,b) { + auto t; + while (b) { + t = b; + b = a % b; + a = t; + } + return a; +} + +/* Division by reciprocal multiplication. */ +define fmul(b,n,d) { + return (2^b*n+d-1)/d; +} + +/* Adjustment factor when a ceiling value is used. Use as: + (imul * n) + (fmulxx * n + fadjxx) >> xx) */ +define fadj(b,n,d) { + auto v; + d = d/gcd(n,d); + v = 2^b*(d-1)/d; + return v; +} + +/* Compute the appropriate mul/adj values as well as a shift count, + which brings the mul value into the range 2^b-1 <= x < 2^b. Such + a shift value will be correct in the signed integer range and off + by at most one in the upper half of the unsigned range. */ +define fmuls(b,n,d) { + auto s, m; + for (s = 0; 1; s++) { + m = fmul(s,n,d); + if (m >= 2^(b-1)) + return s; + } + return 0; +} + +define timeconst(hz) { + print "/* Automatically generated by kernel/time/timeconst.bc */\n" + print "/* Time conversion constants for HZ == ", hz, " */\n" + print "\n" + + print "#ifndef KERNEL_TIMECONST_H\n" + print "#define KERNEL_TIMECONST_H\n\n" + + print "#include <linux/param.h>\n" + print "#include <linux/types.h>\n\n" + + print "#if HZ != ", hz, "\n" + print "#error \qinclude/generated/timeconst.h has the wrong HZ value!\q\n" + print "#endif\n\n" + + if (hz < 2) { + print "#error Totally bogus HZ value!\n" + } else { + s=fmuls(32,1000,hz) + obase=16 + print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n" + print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n" + obase=10 + print "#define HZ_TO_MSEC_SHR32\t", s, "\n" + + s=fmuls(32,hz,1000) + obase=16 + print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n" + print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n" + obase=10 + print "#define MSEC_TO_HZ_SHR32\t", s, "\n" + + obase=10 + cd=gcd(hz,1000) + print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n" + print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n" + print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n" + print "\n" + + s=fmuls(32,1000000,hz) + obase=16 + print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n" + print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n" + obase=10 + print "#define HZ_TO_USEC_SHR32\t", s, "\n" + + s=fmuls(32,hz,1000000) + obase=16 + print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n" + print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n" + obase=10 + print "#define USEC_TO_HZ_SHR32\t", s, "\n" + + obase=10 + cd=gcd(hz,1000000) + print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n" + print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n" + print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n" + + cd=gcd(hz,1000000000) + print "#define HZ_TO_NSEC_NUM\t\t", 1000000000/cd, "\n" + print "#define HZ_TO_NSEC_DEN\t\t", hz/cd, "\n" + print "#define NSEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define NSEC_TO_HZ_DEN\t\t", 1000000000/cd, "\n" + print "\n" + + print "#endif /* KERNEL_TIMECONST_H */\n" + } + halt +} + +hz = read(); +timeconst(hz) diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c new file mode 100644 index 0000000000..59b922c826 --- /dev/null +++ b/kernel/time/timeconv.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: LGPL-2.0+ +/* + * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + * This file is part of the GNU C Library. + * Contributed by Paul Eggert (eggert@twinsun.com). + * + * The GNU C Library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * The GNU C Library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the GNU C Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Converts the calendar time to broken-down time representation + * + * 2009-7-14: + * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com> + * 2021-06-02: + * Reimplemented by Cassio Neri <cassio.neri@gmail.com> + */ + +#include <linux/time.h> +#include <linux/module.h> +#include <linux/kernel.h> + +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) + +/** + * time64_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs: the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset: offset seconds adding to totalsecs. + * @result: pointer to struct tm variable to receive broken-down time + */ +void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) +{ + u32 u32tmp, day_of_century, year_of_century, day_of_year, month, day; + u64 u64tmp, udays, century, year; + bool is_Jan_or_Feb, is_leap_year; + long days, rem; + int remainder; + + days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder); + rem = remainder; + rem += offset; + while (rem < 0) { + rem += SECS_PER_DAY; + --days; + } + while (rem >= SECS_PER_DAY) { + rem -= SECS_PER_DAY; + ++days; + } + + result->tm_hour = rem / SECS_PER_HOUR; + rem %= SECS_PER_HOUR; + result->tm_min = rem / 60; + result->tm_sec = rem % 60; + + /* January 1, 1970 was a Thursday. */ + result->tm_wday = (4 + days) % 7; + if (result->tm_wday < 0) + result->tm_wday += 7; + + /* + * The following algorithm is, basically, Proposition 6.3 of Neri + * and Schneider [1]. In a few words: it works on the computational + * (fictitious) calendar where the year starts in March, month = 2 + * (*), and finishes in February, month = 13. This calendar is + * mathematically convenient because the day of the year does not + * depend on whether the year is leap or not. For instance: + * + * March 1st 0-th day of the year; + * ... + * April 1st 31-st day of the year; + * ... + * January 1st 306-th day of the year; (Important!) + * ... + * February 28th 364-th day of the year; + * February 29th 365-th day of the year (if it exists). + * + * After having worked out the date in the computational calendar + * (using just arithmetics) it's easy to convert it to the + * corresponding date in the Gregorian calendar. + * + * [1] "Euclidean Affine Functions and Applications to Calendar + * Algorithms". https://arxiv.org/abs/2102.06959 + * + * (*) The numbering of months follows tm more closely and thus, + * is slightly different from [1]. + */ + + udays = ((u64) days) + 2305843009213814918ULL; + + u64tmp = 4 * udays + 3; + century = div64_u64_rem(u64tmp, 146097, &u64tmp); + day_of_century = (u32) (u64tmp / 4); + + u32tmp = 4 * day_of_century + 3; + u64tmp = 2939745ULL * u32tmp; + year_of_century = upper_32_bits(u64tmp); + day_of_year = lower_32_bits(u64tmp) / 2939745 / 4; + + year = 100 * century + year_of_century; + is_leap_year = year_of_century ? !(year_of_century % 4) : !(century % 4); + + u32tmp = 2141 * day_of_year + 132377; + month = u32tmp >> 16; + day = ((u16) u32tmp) / 2141; + + /* + * Recall that January 1st is the 306-th day of the year in the + * computational (not Gregorian) calendar. + */ + is_Jan_or_Feb = day_of_year >= 306; + + /* Convert to the Gregorian calendar and adjust to Unix time. */ + year = year + is_Jan_or_Feb - 6313183731940000ULL; + month = is_Jan_or_Feb ? month - 12 : month; + day = day + 1; + day_of_year += is_Jan_or_Feb ? -306 : 31 + 28 + is_leap_year; + + /* Convert to tm's format. */ + result->tm_year = (long) (year - 1900); + result->tm_mon = (int) month; + result->tm_mday = (int) day; + result->tm_yday = (int) day_of_year; +} +EXPORT_SYMBOL(time64_to_tm); diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c new file mode 100644 index 0000000000..e6285288d7 --- /dev/null +++ b/kernel/time/timecounter.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Based on clocksource code. See commit 74d23cc704d1 + */ +#include <linux/export.h> +#include <linux/timecounter.h> + +void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp) +{ + tc->cc = cc; + tc->cycle_last = cc->read(cc); + tc->nsec = start_tstamp; + tc->mask = (1ULL << cc->shift) - 1; + tc->frac = 0; +} +EXPORT_SYMBOL_GPL(timecounter_init); + +/** + * timecounter_read_delta - get nanoseconds since last call of this function + * @tc: Pointer to time counter + * + * When the underlying cycle counter runs over, this will be handled + * correctly as long as it does not run over more than once between + * calls. + * + * The first call to this function for a new time counter initializes + * the time tracking and returns an undefined result. + */ +static u64 timecounter_read_delta(struct timecounter *tc) +{ + u64 cycle_now, cycle_delta; + u64 ns_offset; + + /* read cycle counter: */ + cycle_now = tc->cc->read(tc->cc); + + /* calculate the delta since the last timecounter_read_delta(): */ + cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; + + /* convert to nanoseconds: */ + ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta, + tc->mask, &tc->frac); + + /* update time stamp of timecounter_read_delta() call: */ + tc->cycle_last = cycle_now; + + return ns_offset; +} + +u64 timecounter_read(struct timecounter *tc) +{ + u64 nsec; + + /* increment time by nanoseconds since last call */ + nsec = timecounter_read_delta(tc); + nsec += tc->nsec; + tc->nsec = nsec; + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_read); + +/* + * This is like cyclecounter_cyc2ns(), but it is used for computing a + * time previous to the time stored in the cycle counter. + */ +static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc, + u64 cycles, u64 mask, u64 frac) +{ + u64 ns = (u64) cycles; + + ns = ((ns * cc->mult) - frac) >> cc->shift; + + return ns; +} + +u64 timecounter_cyc2time(const struct timecounter *tc, + u64 cycle_tstamp) +{ + u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; + u64 nsec = tc->nsec, frac = tc->frac; + + /* + * Instead of always treating cycle_tstamp as more recent + * than tc->cycle_last, detect when it is too far in the + * future and treat it as old time stamp instead. + */ + if (delta > tc->cc->mask / 2) { + delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; + nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac); + } else { + nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac); + } + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_cyc2time); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c new file mode 100644 index 0000000000..266d02809d --- /dev/null +++ b/kernel/time/timekeeping.c @@ -0,0 +1,2503 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Kernel timekeeping code and accessor functions. Based on code from + * timer.c, moved in commit 8524070b7982. + */ +#include <linux/timekeeper_internal.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/nmi.h> +#include <linux/sched.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/clock.h> +#include <linux/syscore_ops.h> +#include <linux/clocksource.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/timex.h> +#include <linux/tick.h> +#include <linux/stop_machine.h> +#include <linux/pvclock_gtod.h> +#include <linux/compiler.h> +#include <linux/audit.h> +#include <linux/random.h> + +#include "tick-internal.h" +#include "ntp_internal.h" +#include "timekeeping_internal.h" + +#define TK_CLEAR_NTP (1 << 0) +#define TK_MIRROR (1 << 1) +#define TK_CLOCK_WAS_SET (1 << 2) + +enum timekeeping_adv_mode { + /* Update timekeeper when a tick has passed */ + TK_ADV_TICK, + + /* Update timekeeper on a direct frequency change */ + TK_ADV_FREQ +}; + +DEFINE_RAW_SPINLOCK(timekeeper_lock); + +/* + * The most important data for readout fits into a single 64 byte + * cache line. + */ +static struct { + seqcount_raw_spinlock_t seq; + struct timekeeper timekeeper; +} tk_core ____cacheline_aligned = { + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock), +}; + +static struct timekeeper shadow_timekeeper; + +/* flag for if timekeeping is suspended */ +int __read_mostly timekeeping_suspended; + +/** + * struct tk_fast - NMI safe timekeeper + * @seq: Sequence counter for protecting updates. The lowest bit + * is the index for the tk_read_base array + * @base: tk_read_base array. Access is indexed by the lowest bit of + * @seq. + * + * See @update_fast_timekeeper() below. + */ +struct tk_fast { + seqcount_latch_t seq; + struct tk_read_base base[2]; +}; + +/* Suspend-time cycles value for halted fast timekeeper. */ +static u64 cycles_at_suspend; + +static u64 dummy_clock_read(struct clocksource *cs) +{ + if (timekeeping_suspended) + return cycles_at_suspend; + return local_clock(); +} + +static struct clocksource dummy_clock = { + .read = dummy_clock_read, +}; + +/* + * Boot time initialization which allows local_clock() to be utilized + * during early boot when clocksources are not available. local_clock() + * returns nanoseconds already so no conversion is required, hence mult=1 + * and shift=0. When the first proper clocksource is installed then + * the fast time keepers are updated with the correct values. + */ +#define FAST_TK_INIT \ + { \ + .clock = &dummy_clock, \ + .mask = CLOCKSOURCE_MASK(64), \ + .mult = 1, \ + .shift = 0, \ + } + +static struct tk_fast tk_fast_mono ____cacheline_aligned = { + .seq = SEQCNT_LATCH_ZERO(tk_fast_mono.seq), + .base[0] = FAST_TK_INIT, + .base[1] = FAST_TK_INIT, +}; + +static struct tk_fast tk_fast_raw ____cacheline_aligned = { + .seq = SEQCNT_LATCH_ZERO(tk_fast_raw.seq), + .base[0] = FAST_TK_INIT, + .base[1] = FAST_TK_INIT, +}; + +static inline void tk_normalize_xtime(struct timekeeper *tk) +{ + while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { + tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; + tk->xtime_sec++; + } + while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) { + tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift; + tk->raw_sec++; + } +} + +static inline struct timespec64 tk_xtime(const struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + return ts; +} + +static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) +{ + tk->xtime_sec = ts->tv_sec; + tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; +} + +static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) +{ + tk->xtime_sec += ts->tv_sec; + tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; + tk_normalize_xtime(tk); +} + +static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) +{ + struct timespec64 tmp; + + /* + * Verify consistency of: offset_real = -wall_to_monotonic + * before modifying anything + */ + set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec, + -tk->wall_to_monotonic.tv_nsec); + WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp)); + tk->wall_to_monotonic = wtm; + set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); + tk->offs_real = timespec64_to_ktime(tmp); + tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); +} + +static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) +{ + tk->offs_boot = ktime_add(tk->offs_boot, delta); + /* + * Timespec representation for VDSO update to avoid 64bit division + * on every update. + */ + tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); +} + +/* + * tk_clock_read - atomic clocksource read() helper + * + * This helper is necessary to use in the read paths because, while the + * seqcount ensures we don't return a bad value while structures are updated, + * it doesn't protect from potential crashes. There is the possibility that + * the tkr's clocksource may change between the read reference, and the + * clock reference passed to the read function. This can cause crashes if + * the wrong clocksource is passed to the wrong read function. + * This isn't necessary to use when holding the timekeeper_lock or doing + * a read of the fast-timekeeper tkrs (which is protected by its own locking + * and update logic). + */ +static inline u64 tk_clock_read(const struct tk_read_base *tkr) +{ + struct clocksource *clock = READ_ONCE(tkr->clock); + + return clock->read(clock); +} + +#ifdef CONFIG_DEBUG_TIMEKEEPING +#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ + +static void timekeeping_check_update(struct timekeeper *tk, u64 offset) +{ + + u64 max_cycles = tk->tkr_mono.clock->max_cycles; + const char *name = tk->tkr_mono.clock->name; + + if (offset > max_cycles) { + printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", + offset, name, max_cycles); + printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); + } else { + if (offset > (max_cycles >> 1)) { + printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", + offset, name, max_cycles >> 1); + printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); + } + } + + if (tk->underflow_seen) { + if (jiffies - tk->last_warning > WARNING_FREQ) { + printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); + printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); + printk_deferred(" Your kernel is probably still fine.\n"); + tk->last_warning = jiffies; + } + tk->underflow_seen = 0; + } + + if (tk->overflow_seen) { + if (jiffies - tk->last_warning > WARNING_FREQ) { + printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); + printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); + printk_deferred(" Your kernel is probably still fine.\n"); + tk->last_warning = jiffies; + } + tk->overflow_seen = 0; + } +} + +static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 now, last, mask, max, delta; + unsigned int seq; + + /* + * Since we're called holding a seqcount, the data may shift + * under us while we're doing the calculation. This can cause + * false positives, since we'd note a problem but throw the + * results away. So nest another seqcount here to atomically + * grab the points we are checking with. + */ + do { + seq = read_seqcount_begin(&tk_core.seq); + now = tk_clock_read(tkr); + last = tkr->cycle_last; + mask = tkr->mask; + max = tkr->clock->max_cycles; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + delta = clocksource_delta(now, last, mask); + + /* + * Try to catch underflows by checking if we are seeing small + * mask-relative negative values. + */ + if (unlikely((~delta & mask) < (mask >> 3))) { + tk->underflow_seen = 1; + delta = 0; + } + + /* Cap delta value to the max_cycles values to avoid mult overflows */ + if (unlikely(delta > max)) { + tk->overflow_seen = 1; + delta = tkr->clock->max_cycles; + } + + return delta; +} +#else +static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset) +{ +} +static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) +{ + u64 cycle_now, delta; + + /* read clocksource */ + cycle_now = tk_clock_read(tkr); + + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + + return delta; +} +#endif + +/** + * tk_setup_internals - Set up internals to use clocksource clock. + * + * @tk: The target timekeeper to setup. + * @clock: Pointer to clocksource. + * + * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment + * pair and interval request. + * + * Unless you're the timekeeping code, you should not be using this! + */ +static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) +{ + u64 interval; + u64 tmp, ntpinterval; + struct clocksource *old_clock; + + ++tk->cs_was_changed_seq; + old_clock = tk->tkr_mono.clock; + tk->tkr_mono.clock = clock; + tk->tkr_mono.mask = clock->mask; + tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono); + + tk->tkr_raw.clock = clock; + tk->tkr_raw.mask = clock->mask; + tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; + + /* Do the ns -> cycle conversion first, using original mult */ + tmp = NTP_INTERVAL_LENGTH; + tmp <<= clock->shift; + ntpinterval = tmp; + tmp += clock->mult/2; + do_div(tmp, clock->mult); + if (tmp == 0) + tmp = 1; + + interval = (u64) tmp; + tk->cycle_interval = interval; + + /* Go back from cycles -> shifted ns */ + tk->xtime_interval = interval * clock->mult; + tk->xtime_remainder = ntpinterval - tk->xtime_interval; + tk->raw_interval = interval * clock->mult; + + /* if changing clocks, convert xtime_nsec shift units */ + if (old_clock) { + int shift_change = clock->shift - old_clock->shift; + if (shift_change < 0) { + tk->tkr_mono.xtime_nsec >>= -shift_change; + tk->tkr_raw.xtime_nsec >>= -shift_change; + } else { + tk->tkr_mono.xtime_nsec <<= shift_change; + tk->tkr_raw.xtime_nsec <<= shift_change; + } + } + + tk->tkr_mono.shift = clock->shift; + tk->tkr_raw.shift = clock->shift; + + tk->ntp_error = 0; + tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + tk->ntp_tick = ntpinterval << tk->ntp_error_shift; + + /* + * The timekeeper keeps its own mult values for the currently + * active clocksource. These value will be adjusted via NTP + * to counteract clock drifting. + */ + tk->tkr_mono.mult = clock->mult; + tk->tkr_raw.mult = clock->mult; + tk->ntp_err_mult = 0; + tk->skip_second_overflow = 0; +} + +/* Timekeeper helper functions. */ + +static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta) +{ + u64 nsec; + + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; + + return nsec; +} + +static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) +{ + u64 delta; + + delta = timekeeping_get_delta(tkr); + return timekeeping_delta_to_ns(tkr, delta); +} + +static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) +{ + u64 delta; + + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); +} + +/** + * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. + * @tkr: Timekeeping readout base from which we take the update + * @tkf: Pointer to NMI safe timekeeper + * + * We want to use this from any context including NMI and tracing / + * instrumenting the timekeeping code itself. + * + * Employ the latch technique; see @raw_write_seqcount_latch. + * + * So if a NMI hits the update of base[0] then it will use base[1] + * which is still consistent. In the worst case this can result is a + * slightly wrong timestamp (a few nanoseconds). See + * @ktime_get_mono_fast_ns. + */ +static void update_fast_timekeeper(const struct tk_read_base *tkr, + struct tk_fast *tkf) +{ + struct tk_read_base *base = tkf->base; + + /* Force readers off to base[1] */ + raw_write_seqcount_latch(&tkf->seq); + + /* Update base[0] */ + memcpy(base, tkr, sizeof(*base)); + + /* Force readers back to base[0] */ + raw_write_seqcount_latch(&tkf->seq); + + /* Update base[1] */ + memcpy(base + 1, base, sizeof(*base)); +} + +static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr) +{ + u64 delta, cycles = tk_clock_read(tkr); + + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); +} + +static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) +{ + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount_latch(&tkf->seq); + tkr = tkf->base + (seq & 0x01); + now = ktime_to_ns(tkr->base); + now += fast_tk_get_delta_ns(tkr); + } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); + + return now; +} + +/** + * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic + * + * This timestamp is not guaranteed to be monotonic across an update. + * The timestamp is calculated by: + * + * now = base_mono + clock_delta * slope + * + * So if the update lowers the slope, readers who are forced to the + * not yet updated second array are still using the old steeper slope. + * + * tmono + * ^ + * | o n + * | o n + * | u + * | o + * |o + * |12345678---> reader order + * + * o = old slope + * u = update + * n = new slope + * + * So reader 6 will observe time going backwards versus reader 5. + * + * While other CPUs are likely to be able to observe that, the only way + * for a CPU local observation is when an NMI hits in the middle of + * the update. Timestamps taken from that NMI context might be ahead + * of the following timestamps. Callers need to be aware of that and + * deal with it. + */ +u64 notrace ktime_get_mono_fast_ns(void) +{ + return __ktime_get_fast_ns(&tk_fast_mono); +} +EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); + +/** + * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw + * + * Contrary to ktime_get_mono_fast_ns() this is always correct because the + * conversion factor is not affected by NTP/PTP correction. + */ +u64 notrace ktime_get_raw_fast_ns(void) +{ + return __ktime_get_fast_ns(&tk_fast_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); + +/** + * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. + * + * To keep it NMI safe since we're accessing from tracing, we're not using a + * separate timekeeper with updates to monotonic clock and boot offset + * protected with seqcounts. This has the following minor side effects: + * + * (1) Its possible that a timestamp be taken after the boot offset is updated + * but before the timekeeper is updated. If this happens, the new boot offset + * is added to the old timekeeping making the clock appear to update slightly + * earlier: + * CPU 0 CPU 1 + * timekeeping_inject_sleeptime64() + * __timekeeping_inject_sleeptime(tk, delta); + * timestamp(); + * timekeeping_update(tk, TK_CLEAR_NTP...); + * + * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be + * partially updated. Since the tk->offs_boot update is a rare event, this + * should be a rare occurrence which postprocessing should be able to handle. + * + * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns() + * apply as well. + */ +u64 notrace ktime_get_boot_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot))); +} +EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + +/** + * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock. + * + * The same limitations as described for ktime_get_boot_fast_ns() apply. The + * mono time and the TAI offset are not read atomically which may yield wrong + * readouts. However, an update of the TAI offset is an rare event e.g., caused + * by settime or adjtimex with an offset. The user of this function has to deal + * with the possibility of wrong timestamps in post processing. + */ +u64 notrace ktime_get_tai_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai))); +} +EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); + +static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) +{ + struct tk_read_base *tkr; + u64 basem, baser, delta; + unsigned int seq; + + do { + seq = raw_read_seqcount_latch(&tkf->seq); + tkr = tkf->base + (seq & 0x01); + basem = ktime_to_ns(tkr->base); + baser = ktime_to_ns(tkr->base_real); + delta = fast_tk_get_delta_ns(tkr); + } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); + + if (mono) + *mono = basem + delta; + return baser + delta; +} + +/** + * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + * + * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. + */ +u64 ktime_get_real_fast_ns(void) +{ + return __ktime_get_real_fast(&tk_fast_mono, NULL); +} +EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); + +/** + * ktime_get_fast_timestamps: - NMI safe timestamps + * @snapshot: Pointer to timestamp storage + * + * Stores clock monotonic, boottime and realtime timestamps. + * + * Boot time is a racy access on 32bit systems if the sleep time injection + * happens late during resume and not in timekeeping_resume(). That could + * be avoided by expanding struct tk_read_base with boot offset for 32bit + * and adding more overhead to the update. As this is a hard to observe + * once per resume event which can be filtered with reasonable effort using + * the accurate mono/real timestamps, it's probably not worth the trouble. + * + * Aside of that it might be possible on 32 and 64 bit to observe the + * following when the sleep time injection happens late: + * + * CPU 0 CPU 1 + * timekeeping_resume() + * ktime_get_fast_timestamps() + * mono, real = __ktime_get_real_fast() + * inject_sleep_time() + * update boot offset + * boot = mono + bootoffset; + * + * That means that boot time already has the sleep time adjustment, but + * real time does not. On the next readout both are in sync again. + * + * Preventing this for 64bit is not really feasible without destroying the + * careful cache layout of the timekeeper because the sequence count and + * struct tk_read_base would then need two cache lines instead of one. + * + * Access to the time keeper clock source is disabled across the innermost + * steps of suspend/resume. The accessors still work, but the timestamps + * are frozen until time keeping is resumed which happens very early. + * + * For regular suspend/resume there is no observable difference vs. sched + * clock, but it might affect some of the nasty low level debug printks. + * + * OTOH, access to sched clock is not guaranteed across suspend/resume on + * all systems either so it depends on the hardware in use. + * + * If that turns out to be a real problem then this could be mitigated by + * using sched clock in a similar way as during early boot. But it's not as + * trivial as on early boot because it needs some careful protection + * against the clock monotonic timestamp jumping backwards on resume. + */ +void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono); + snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot)); +} + +/** + * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. + * @tk: Timekeeper to snapshot. + * + * It generally is unsafe to access the clocksource after timekeeping has been + * suspended, so take a snapshot of the readout base of @tk and use it as the + * fast timekeeper's readout base while suspended. It will return the same + * number of cycles every time until timekeeping is resumed at which time the + * proper readout base for the fast timekeeper will be restored automatically. + */ +static void halt_fast_timekeeper(const struct timekeeper *tk) +{ + static struct tk_read_base tkr_dummy; + const struct tk_read_base *tkr = &tk->tkr_mono; + + memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); + cycles_at_suspend = tk_clock_read(tkr); + tkr_dummy.clock = &dummy_clock; + tkr_dummy.base_real = tkr->base + tk->offs_real; + update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); + + tkr = &tk->tkr_raw; + memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); + tkr_dummy.clock = &dummy_clock; + update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); +} + +static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); + +static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) +{ + raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk); +} + +/** + * pvclock_gtod_register_notifier - register a pvclock timedata update listener + * @nb: Pointer to the notifier block to register + */ +int pvclock_gtod_register_notifier(struct notifier_block *nb) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); + update_pvclock_gtod(tk, true); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); + +/** + * pvclock_gtod_unregister_notifier - unregister a pvclock + * timedata update listener + * @nb: Pointer to the notifier block to unregister + */ +int pvclock_gtod_unregister_notifier(struct notifier_block *nb) +{ + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); + +/* + * tk_update_leap_state - helper to update the next_leap_ktime + */ +static inline void tk_update_leap_state(struct timekeeper *tk) +{ + tk->next_leap_ktime = ntp_get_next_leap(); + if (tk->next_leap_ktime != KTIME_MAX) + /* Convert to monotonic time */ + tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); +} + +/* + * Update the ktime_t based scalar nsec members of the timekeeper + */ +static inline void tk_update_ktime_data(struct timekeeper *tk) +{ + u64 seconds; + u32 nsec; + + /* + * The xtime based monotonic readout is: + * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); + * The ktime based monotonic readout is: + * nsec = base_mono + now(); + * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + */ + seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); + nsec = (u32) tk->wall_to_monotonic.tv_nsec; + tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); + + /* + * The sum of the nanoseconds portions of xtime and + * wall_to_monotonic can be greater/equal one second. Take + * this into account before updating tk->ktime_sec. + */ + nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + if (nsec >= NSEC_PER_SEC) + seconds++; + tk->ktime_sec = seconds; + + /* Update the monotonic raw base */ + tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); +} + +/* must hold timekeeper_lock */ +static void timekeeping_update(struct timekeeper *tk, unsigned int action) +{ + if (action & TK_CLEAR_NTP) { + tk->ntp_error = 0; + ntp_clear(); + } + + tk_update_leap_state(tk); + tk_update_ktime_data(tk); + + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + + tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + + if (action & TK_CLOCK_WAS_SET) + tk->clock_was_set_seq++; + /* + * The mirroring of the data to the shadow-timekeeper needs + * to happen last here to ensure we don't over-write the + * timekeeper structure on the next update with stale data + */ + if (action & TK_MIRROR) + memcpy(&shadow_timekeeper, &tk_core.timekeeper, + sizeof(tk_core.timekeeper)); +} + +/** + * timekeeping_forward_now - update clock to the current time + * @tk: Pointer to the timekeeper to update + * + * Forward the current clock to update its state since the last call to + * update_wall_time(). This is useful before significant clock changes, + * as it avoids having to deal with this time offset explicitly. + */ +static void timekeeping_forward_now(struct timekeeper *tk) +{ + u64 cycle_now, delta; + + cycle_now = tk_clock_read(&tk->tkr_mono); + delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + tk->tkr_mono.cycle_last = cycle_now; + tk->tkr_raw.cycle_last = cycle_now; + + tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; + tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult; + + tk_normalize_xtime(tk); +} + +/** + * ktime_get_real_ts64 - Returns the time of day in a timespec64. + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec64 (WARN if suspended). + */ +void ktime_get_real_ts64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + + ts->tv_sec = tk->xtime_sec; + nsecs = timekeeping_get_ns(&tk->tkr_mono); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); +} +EXPORT_SYMBOL(ktime_get_real_ts64); + +ktime_t ktime_get(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get); + +u32 ktime_get_resolution_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + u32 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return nsecs; +} +EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); + +static ktime_t *offsets[TK_OFFS_MAX] = { + [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, + [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, + [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, +}; + +ktime_t ktime_get_with_offset(enum tk_offsets offs) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base, *offset = offsets[offs]; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = ktime_add(tk->tkr_mono.base, *offset); + nsecs = timekeeping_get_ns(&tk->tkr_mono); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); + +} +EXPORT_SYMBOL_GPL(ktime_get_with_offset); + +ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base, *offset = offsets[offs]; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = ktime_add(tk->tkr_mono.base, *offset); + nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); + +/** + * ktime_mono_to_any() - convert monotonic time to any other time + * @tmono: time to convert. + * @offs: which offset to use + */ +ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) +{ + ktime_t *offset = offsets[offs]; + unsigned int seq; + ktime_t tconv; + + do { + seq = read_seqcount_begin(&tk_core.seq); + tconv = ktime_add(tmono, *offset); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return tconv; +} +EXPORT_SYMBOL_GPL(ktime_mono_to_any); + +/** + * ktime_get_raw - Returns the raw monotonic time in ktime_t format + */ +ktime_t ktime_get_raw(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + u64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = tk->tkr_raw.base; + nsecs = timekeeping_get_ns(&tk->tkr_raw); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_raw); + +/** + * ktime_get_ts64 - get the monotonic clock in timespec64 format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec64 format in the variable pointed to by @ts. + */ +void ktime_get_ts64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 tomono; + unsigned int seq; + u64 nsec; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + ts->tv_sec = tk->xtime_sec; + nsec = timekeeping_get_ns(&tk->tkr_mono); + tomono = tk->wall_to_monotonic; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + ts->tv_sec += tomono.tv_sec; + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsec + tomono.tv_nsec); +} +EXPORT_SYMBOL_GPL(ktime_get_ts64); + +/** + * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC + * + * Returns the seconds portion of CLOCK_MONOTONIC with a single non + * serialized read. tk->ktime_sec is of type 'unsigned long' so this + * works on both 32 and 64 bit systems. On 32 bit systems the readout + * covers ~136 years of uptime which should be enough to prevent + * premature wrap arounds. + */ +time64_t ktime_get_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + WARN_ON(timekeeping_suspended); + return tk->ktime_sec; +} +EXPORT_SYMBOL_GPL(ktime_get_seconds); + +/** + * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME + * + * Returns the wall clock seconds since 1970. + * + * For 64bit systems the fast access to tk->xtime_sec is preserved. On + * 32bit systems the access must be protected with the sequence + * counter to provide "atomic" access to the 64bit tk->xtime_sec + * value. + */ +time64_t ktime_get_real_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + time64_t seconds; + unsigned int seq; + + if (IS_ENABLED(CONFIG_64BIT)) + return tk->xtime_sec; + + do { + seq = read_seqcount_begin(&tk_core.seq); + seconds = tk->xtime_sec; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return seconds; +} +EXPORT_SYMBOL_GPL(ktime_get_real_seconds); + +/** + * __ktime_get_real_seconds - The same as ktime_get_real_seconds + * but without the sequence counter protect. This internal function + * is called just when timekeeping lock is already held. + */ +noinstr time64_t __ktime_get_real_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return tk->xtime_sec; +} + +/** + * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter + * @systime_snapshot: pointer to struct receiving the system time snapshot + */ +void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base_raw; + ktime_t base_real; + u64 nsec_raw; + u64 nsec_real; + u64 now; + + WARN_ON_ONCE(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + now = tk_clock_read(&tk->tkr_mono); + systime_snapshot->cs_id = tk->tkr_mono.clock->id; + systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; + systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + systime_snapshot->cycles = now; + systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_snapshot); + +/* Scale base by mult/div checking for overflow */ +static int scale64_check_overflow(u64 mult, u64 div, u64 *base) +{ + u64 tmp, rem; + + tmp = div64_u64_rem(*base, div, &rem); + + if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || + ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) + return -EOVERFLOW; + tmp *= mult; + + rem = div64_u64(rem * mult, div); + *base = tmp + rem; + return 0; +} + +/** + * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval + * @history: Snapshot representing start of history + * @partial_history_cycles: Cycle offset into history (fractional part) + * @total_history_cycles: Total history length in cycles + * @discontinuity: True indicates clock was set on history period + * @ts: Cross timestamp that should be adjusted using + * partial/total ratio + * + * Helper function used by get_device_system_crosststamp() to correct the + * crosstimestamp corresponding to the start of the current interval to the + * system counter value (timestamp point) provided by the driver. The + * total_history_* quantities are the total history starting at the provided + * reference point and ending at the start of the current interval. The cycle + * count between the driver timestamp point and the start of the current + * interval is partial_history_cycles. + */ +static int adjust_historical_crosststamp(struct system_time_snapshot *history, + u64 partial_history_cycles, + u64 total_history_cycles, + bool discontinuity, + struct system_device_crosststamp *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 corr_raw, corr_real; + bool interp_forward; + int ret; + + if (total_history_cycles == 0 || partial_history_cycles == 0) + return 0; + + /* Interpolate shortest distance from beginning or end of history */ + interp_forward = partial_history_cycles > total_history_cycles / 2; + partial_history_cycles = interp_forward ? + total_history_cycles - partial_history_cycles : + partial_history_cycles; + + /* + * Scale the monotonic raw time delta by: + * partial_history_cycles / total_history_cycles + */ + corr_raw = (u64)ktime_to_ns( + ktime_sub(ts->sys_monoraw, history->raw)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_raw); + if (ret) + return ret; + + /* + * If there is a discontinuity in the history, scale monotonic raw + * correction by: + * mult(real)/mult(raw) yielding the realtime correction + * Otherwise, calculate the realtime correction similar to monotonic + * raw calculation + */ + if (discontinuity) { + corr_real = mul_u64_u32_div + (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); + } else { + corr_real = (u64)ktime_to_ns( + ktime_sub(ts->sys_realtime, history->real)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_real); + if (ret) + return ret; + } + + /* Fixup monotonic raw and real time time values */ + if (interp_forward) { + ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); + ts->sys_realtime = ktime_add_ns(history->real, corr_real); + } else { + ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); + ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); + } + + return 0; +} + +/* + * cycle_between - true if test occurs chronologically between before and after + */ +static bool cycle_between(u64 before, u64 test, u64 after) +{ + if (test > before && test < after) + return true; + if (test < before && before > after) + return true; + return false; +} + +/** + * get_device_system_crosststamp - Synchronously capture system/device timestamp + * @get_time_fn: Callback to get simultaneous device time and + * system counter from the device driver + * @ctx: Context passed to get_time_fn() + * @history_begin: Historical reference point used to interpolate system + * time when counter provided by the driver is before the current interval + * @xtstamp: Receives simultaneously captured system and device time + * + * Reads a timestamp from a device and correlates it to system time + */ +int get_device_system_crosststamp(int (*get_time_fn) + (ktime_t *device_time, + struct system_counterval_t *sys_counterval, + void *ctx), + void *ctx, + struct system_time_snapshot *history_begin, + struct system_device_crosststamp *xtstamp) +{ + struct system_counterval_t system_counterval; + struct timekeeper *tk = &tk_core.timekeeper; + u64 cycles, now, interval_start; + unsigned int clock_was_set_seq = 0; + ktime_t base_real, base_raw; + u64 nsec_real, nsec_raw; + u8 cs_was_changed_seq; + unsigned int seq; + bool do_interp; + int ret; + + do { + seq = read_seqcount_begin(&tk_core.seq); + /* + * Try to synchronously capture device time and a system + * counter value calling back into the device driver + */ + ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); + if (ret) + return ret; + + /* + * Verify that the clocksource associated with the captured + * system counter value is the same as the currently installed + * timekeeper clocksource + */ + if (tk->tkr_mono.clock != system_counterval.cs) + return -ENODEV; + cycles = system_counterval.cycles; + + /* + * Check whether the system counter value provided by the + * device driver is on the current timekeeping interval. + */ + now = tk_clock_read(&tk->tkr_mono); + interval_start = tk->tkr_mono.cycle_last; + if (!cycle_between(interval_start, cycles, now)) { + clock_was_set_seq = tk->clock_was_set_seq; + cs_was_changed_seq = tk->cs_was_changed_seq; + cycles = interval_start; + do_interp = true; + } else { + do_interp = false; + } + + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, + system_counterval.cycles); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, + system_counterval.cycles); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); + xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); + + /* + * Interpolate if necessary, adjusting back from the start of the + * current interval + */ + if (do_interp) { + u64 partial_history_cycles, total_history_cycles; + bool discontinuity; + + /* + * Check that the counter value occurs after the provided + * history reference and that the history doesn't cross a + * clocksource change + */ + if (!history_begin || + !cycle_between(history_begin->cycles, + system_counterval.cycles, cycles) || + history_begin->cs_was_changed_seq != cs_was_changed_seq) + return -EINVAL; + partial_history_cycles = cycles - system_counterval.cycles; + total_history_cycles = cycles - history_begin->cycles; + discontinuity = + history_begin->clock_was_set_seq != clock_was_set_seq; + + ret = adjust_historical_crosststamp(history_begin, + partial_history_cycles, + total_history_cycles, + discontinuity, xtstamp); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(get_device_system_crosststamp); + +/** + * do_settimeofday64 - Sets the time of day. + * @ts: pointer to the timespec64 variable containing the new time + * + * Sets the time of day to the new time and update NTP and notify hrtimers + */ +int do_settimeofday64(const struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 ts_delta, xt; + unsigned long flags; + int ret = 0; + + if (!timespec64_valid_settod(ts)) + return -EINVAL; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + timekeeping_forward_now(tk); + + xt = tk_xtime(tk); + ts_delta = timespec64_sub(*ts, xt); + + if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) { + ret = -EINVAL; + goto out; + } + + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); + + tk_set_xtime(tk, ts); +out: + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL); + + if (!ret) { + audit_tk_injoffset(ts_delta); + add_device_randomness(ts, sizeof(*ts)); + } + + return ret; +} +EXPORT_SYMBOL(do_settimeofday64); + +/** + * timekeeping_inject_offset - Adds or subtracts from the current time. + * @ts: Pointer to the timespec variable containing the offset + * + * Adds or subtracts an offset value from the current time. + */ +static int timekeeping_inject_offset(const struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long flags; + struct timespec64 tmp; + int ret = 0; + + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + timekeeping_forward_now(tk); + + /* Make sure the proposed value is valid */ + tmp = timespec64_add(tk_xtime(tk), *ts); + if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || + !timespec64_valid_settod(&tmp)) { + ret = -EINVAL; + goto error; + } + + tk_xtime_add(tk, ts); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts)); + +error: /* even if we error out, we forwarded the time, so call update */ + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL); + + return ret; +} + +/* + * Indicates if there is an offset between the system clock and the hardware + * clock/persistent clock/rtc. + */ +int persistent_clock_is_local; + +/* + * Adjust the time obtained from the CMOS to be UTC time instead of + * local time. + * + * This is ugly, but preferable to the alternatives. Otherwise we + * would either need to write a program to do it in /etc/rc (and risk + * confusion if the program gets run more than once; it would also be + * hard to make the program warp the clock precisely n hours) or + * compile in the timezone information into the kernel. Bad, bad.... + * + * - TYT, 1992-01-01 + * + * The best thing to do is to keep the CMOS clock in universal time (UTC) + * as real UNIX machines always do it. This avoids all headaches about + * daylight saving times and warping kernel clocks. + */ +void timekeeping_warp_clock(void) +{ + if (sys_tz.tz_minuteswest != 0) { + struct timespec64 adjust; + + persistent_clock_is_local = 1; + adjust.tv_sec = sys_tz.tz_minuteswest * 60; + adjust.tv_nsec = 0; + timekeeping_inject_offset(&adjust); + } +} + +/* + * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic + */ +static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) +{ + tk->tai_offset = tai_offset; + tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); +} + +/* + * change_clocksource - Swaps clocksources if a new one is available + * + * Accumulates current time interval and initializes new clocksource + */ +static int change_clocksource(void *data) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct clocksource *new, *old = NULL; + unsigned long flags; + bool change = false; + + new = (struct clocksource *) data; + + /* + * If the cs is in module, get a module reference. Succeeds + * for built-in code (owner == NULL) as well. + */ + if (try_module_get(new->owner)) { + if (!new->enable || new->enable(new) == 0) + change = true; + else + module_put(new->owner); + } + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + timekeeping_forward_now(tk); + + if (change) { + old = tk->tkr_mono.clock; + tk_setup_internals(tk, new); + } + + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + if (old) { + if (old->disable) + old->disable(old); + + module_put(old->owner); + } + + return 0; +} + +/** + * timekeeping_notify - Install a new clock source + * @clock: pointer to the clock source + * + * This function is called from clocksource.c after a new, better clock + * source has been registered. The caller holds the clocksource_mutex. + */ +int timekeeping_notify(struct clocksource *clock) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + if (tk->tkr_mono.clock == clock) + return 0; + stop_machine(change_clocksource, clock, NULL); + tick_clock_notify(); + return tk->tkr_mono.clock == clock ? 0 : -1; +} + +/** + * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec + * @ts: pointer to the timespec64 to be set + * + * Returns the raw monotonic time (completely un-modified by ntp) + */ +void ktime_get_raw_ts64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + u64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + ts->tv_sec = tk->raw_sec; + nsecs = timekeeping_get_ns(&tk->tkr_raw); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); +} +EXPORT_SYMBOL(ktime_get_raw_ts64); + + +/** + * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres + */ +int timekeeping_valid_for_hres(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + int ret; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ret; +} + +/** + * timekeeping_max_deferment - Returns max time the clocksource can be deferred + */ +u64 timekeeping_max_deferment(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + u64 ret; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + ret = tk->tkr_mono.clock->max_idle_ns; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ret; +} + +/** + * read_persistent_clock64 - Return time from the persistent clock. + * @ts: Pointer to the storage for the readout value + * + * Weak dummy function for arches that do not yet support it. + * Reads the time from the battery backed persistent clock. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. + */ +void __weak read_persistent_clock64(struct timespec64 *ts) +{ + ts->tv_sec = 0; + ts->tv_nsec = 0; +} + +/** + * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset + * from the boot. + * @wall_time: current time as returned by persistent clock + * @boot_offset: offset that is defined as wall_time - boot_time + * + * Weak dummy function for arches that do not yet support it. + * + * The default function calculates offset based on the current value of + * local_clock(). This way architectures that support sched_clock() but don't + * support dedicated boot time clock will provide the best estimate of the + * boot time. + */ +void __weak __init +read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, + struct timespec64 *boot_offset) +{ + read_persistent_clock64(wall_time); + *boot_offset = ns_to_timespec64(local_clock()); +} + +/* + * Flag reflecting whether timekeeping_resume() has injected sleeptime. + * + * The flag starts of false and is only set when a suspend reaches + * timekeeping_suspend(), timekeeping_resume() sets it to false when the + * timekeeper clocksource is not stopping across suspend and has been + * used to update sleep time. If the timekeeper clocksource has stopped + * then the flag stays true and is used by the RTC resume code to decide + * whether sleeptime must be injected and if so the flag gets false then. + * + * If a suspend fails before reaching timekeeping_resume() then the flag + * stays false and prevents erroneous sleeptime injection. + */ +static bool suspend_timing_needed; + +/* Flag for if there is a persistent clock on this platform */ +static bool persistent_clock_exists; + +/* + * timekeeping_init - Initializes the clocksource and common timekeeping values + */ +void __init timekeeping_init(void) +{ + struct timespec64 wall_time, boot_offset, wall_to_mono; + struct timekeeper *tk = &tk_core.timekeeper; + struct clocksource *clock; + unsigned long flags; + + read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); + if (timespec64_valid_settod(&wall_time) && + timespec64_to_ns(&wall_time) > 0) { + persistent_clock_exists = true; + } else if (timespec64_to_ns(&wall_time) != 0) { + pr_warn("Persistent clock returned invalid value"); + wall_time = (struct timespec64){0}; + } + + if (timespec64_compare(&wall_time, &boot_offset) < 0) + boot_offset = (struct timespec64){0}; + + /* + * We want set wall_to_mono, so the following is true: + * wall time + wall_to_mono = boot time + */ + wall_to_mono = timespec64_sub(boot_offset, wall_time); + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + ntp_init(); + + clock = clocksource_default_clock(); + if (clock->enable) + clock->enable(clock); + tk_setup_internals(tk, clock); + + tk_set_xtime(tk, &wall_time); + tk->raw_sec = 0; + + tk_set_wall_to_mono(tk, wall_to_mono); + + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} + +/* time in seconds when suspend began for persistent clock */ +static struct timespec64 timekeeping_suspend_time; + +/** + * __timekeeping_inject_sleeptime - Internal function to add sleep interval + * @tk: Pointer to the timekeeper to be updated + * @delta: Pointer to the delta value in timespec64 format + * + * Takes a timespec offset measuring a suspend interval and properly + * adds the sleep offset to the timekeeping variables. + */ +static void __timekeeping_inject_sleeptime(struct timekeeper *tk, + const struct timespec64 *delta) +{ + if (!timespec64_valid_strict(delta)) { + printk_deferred(KERN_WARNING + "__timekeeping_inject_sleeptime: Invalid " + "sleep delta value!\n"); + return; + } + tk_xtime_add(tk, delta); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); + tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); + tk_debug_account_sleep_time(delta); +} + +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) +/* + * We have three kinds of time sources to use for sleep time + * injection, the preference order is: + * 1) non-stop clocksource + * 2) persistent clock (ie: RTC accessible when irqs are off) + * 3) RTC + * + * 1) and 2) are used by timekeeping, 3) by RTC subsystem. + * If system has neither 1) nor 2), 3) will be used finally. + * + * + * If timekeeping has injected sleeptime via either 1) or 2), + * 3) becomes needless, so in this case we don't need to call + * rtc_resume(), and this is what timekeeping_rtc_skipresume() + * means. + */ +bool timekeeping_rtc_skipresume(void) +{ + return !suspend_timing_needed; +} + +/* + * 1) can be determined whether to use or not only when doing + * timekeeping_resume() which is invoked after rtc_suspend(), + * so we can't skip rtc_suspend() surely if system has 1). + * + * But if system has 2), 2) will definitely be used, so in this + * case we don't need to call rtc_suspend(), and this is what + * timekeeping_rtc_skipsuspend() means. + */ +bool timekeeping_rtc_skipsuspend(void) +{ + return persistent_clock_exists; +} + +/** + * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values + * @delta: pointer to a timespec64 delta value + * + * This hook is for architectures that cannot support read_persistent_clock64 + * because their RTC/persistent clock is only accessible when irqs are enabled. + * and also don't have an effective nonstop clocksource. + * + * This function should only be called by rtc_resume(), and allows + * a suspend offset to be injected into the timekeeping values. + */ +void timekeeping_inject_sleeptime64(const struct timespec64 *delta) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + suspend_timing_needed = false; + + timekeeping_forward_now(tk); + + __timekeeping_inject_sleeptime(tk, delta); + + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); +} +#endif + +/** + * timekeeping_resume - Resumes the generic timekeeping subsystem. + */ +void timekeeping_resume(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct clocksource *clock = tk->tkr_mono.clock; + unsigned long flags; + struct timespec64 ts_new, ts_delta; + u64 cycle_now, nsec; + bool inject_sleeptime = false; + + read_persistent_clock64(&ts_new); + + clockevents_resume(); + clocksource_resume(); + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + /* + * After system resumes, we need to calculate the suspended time and + * compensate it for the OS time. There are 3 sources that could be + * used: Nonstop clocksource during suspend, persistent clock and rtc + * device. + * + * One specific platform may have 1 or 2 or all of them, and the + * preference will be: + * suspend-nonstop clocksource -> persistent clock -> rtc + * The less preferred source will only be tried if there is no better + * usable source. The rtc part is handled separately in rtc core code. + */ + cycle_now = tk_clock_read(&tk->tkr_mono); + nsec = clocksource_stop_suspend_timing(clock, cycle_now); + if (nsec > 0) { + ts_delta = ns_to_timespec64(nsec); + inject_sleeptime = true; + } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { + ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); + inject_sleeptime = true; + } + + if (inject_sleeptime) { + suspend_timing_needed = false; + __timekeeping_inject_sleeptime(tk, &ts_delta); + } + + /* Re-base the last cycle value */ + tk->tkr_mono.cycle_last = cycle_now; + tk->tkr_raw.cycle_last = cycle_now; + + tk->ntp_error = 0; + timekeeping_suspended = 0; + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + touch_softlockup_watchdog(); + + /* Resume the clockevent device(s) and hrtimers */ + tick_resume(); + /* Notify timerfd as resume is equivalent to clock_was_set() */ + timerfd_resume(); +} + +int timekeeping_suspend(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long flags; + struct timespec64 delta, delta_delta; + static struct timespec64 old_delta; + struct clocksource *curr_clock; + u64 cycle_now; + + read_persistent_clock64(&timekeeping_suspend_time); + + /* + * On some systems the persistent_clock can not be detected at + * timekeeping_init by its return value, so if we see a valid + * value returned, update the persistent_clock_exists flag. + */ + if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) + persistent_clock_exists = true; + + suspend_timing_needed = true; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + timekeeping_forward_now(tk); + timekeeping_suspended = 1; + + /* + * Since we've called forward_now, cycle_last stores the value + * just read from the current clocksource. Save this to potentially + * use in suspend timing. + */ + curr_clock = tk->tkr_mono.clock; + cycle_now = tk->tkr_mono.cycle_last; + clocksource_start_suspend_timing(curr_clock, cycle_now); + + if (persistent_clock_exists) { + /* + * To avoid drift caused by repeated suspend/resumes, + * which each can add ~1 second drift error, + * try to compensate so the difference in system time + * and persistent_clock time stays close to constant. + */ + delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); + delta_delta = timespec64_sub(delta, old_delta); + if (abs(delta_delta.tv_sec) >= 2) { + /* + * if delta_delta is too large, assume time correction + * has occurred and set old_delta to the current delta. + */ + old_delta = delta; + } else { + /* Otherwise try to adjust old_system to compensate */ + timekeeping_suspend_time = + timespec64_add(timekeeping_suspend_time, delta_delta); + } + } + + timekeeping_update(tk, TK_MIRROR); + halt_fast_timekeeper(tk); + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + tick_suspend(); + clocksource_suspend(); + clockevents_suspend(); + + return 0; +} + +/* sysfs resume/suspend bits for timekeeping */ +static struct syscore_ops timekeeping_syscore_ops = { + .resume = timekeeping_resume, + .suspend = timekeeping_suspend, +}; + +static int __init timekeeping_init_ops(void) +{ + register_syscore_ops(&timekeeping_syscore_ops); + return 0; +} +device_initcall(timekeeping_init_ops); + +/* + * Apply a multiplier adjustment to the timekeeper + */ +static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, + s64 offset, + s32 mult_adj) +{ + s64 interval = tk->cycle_interval; + + if (mult_adj == 0) { + return; + } else if (mult_adj == -1) { + interval = -interval; + offset = -offset; + } else if (mult_adj != 1) { + interval *= mult_adj; + offset *= mult_adj; + } + + /* + * So the following can be confusing. + * + * To keep things simple, lets assume mult_adj == 1 for now. + * + * When mult_adj != 1, remember that the interval and offset values + * have been appropriately scaled so the math is the same. + * + * The basic idea here is that we're increasing the multiplier + * by one, this causes the xtime_interval to be incremented by + * one cycle_interval. This is because: + * xtime_interval = cycle_interval * mult + * So if mult is being incremented by one: + * xtime_interval = cycle_interval * (mult + 1) + * Its the same as: + * xtime_interval = (cycle_interval * mult) + cycle_interval + * Which can be shortened to: + * xtime_interval += cycle_interval + * + * So offset stores the non-accumulated cycles. Thus the current + * time (in shifted nanoseconds) is: + * now = (offset * adj) + xtime_nsec + * Now, even though we're adjusting the clock frequency, we have + * to keep time consistent. In other words, we can't jump back + * in time, and we also want to avoid jumping forward in time. + * + * So given the same offset value, we need the time to be the same + * both before and after the freq adjustment. + * now = (offset * adj_1) + xtime_nsec_1 + * now = (offset * adj_2) + xtime_nsec_2 + * So: + * (offset * adj_1) + xtime_nsec_1 = + * (offset * adj_2) + xtime_nsec_2 + * And we know: + * adj_2 = adj_1 + 1 + * So: + * (offset * adj_1) + xtime_nsec_1 = + * (offset * (adj_1+1)) + xtime_nsec_2 + * (offset * adj_1) + xtime_nsec_1 = + * (offset * adj_1) + offset + xtime_nsec_2 + * Canceling the sides: + * xtime_nsec_1 = offset + xtime_nsec_2 + * Which gives us: + * xtime_nsec_2 = xtime_nsec_1 - offset + * Which simplifies to: + * xtime_nsec -= offset + */ + if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { + /* NTP adjustment caused clocksource mult overflow */ + WARN_ON_ONCE(1); + return; + } + + tk->tkr_mono.mult += mult_adj; + tk->xtime_interval += interval; + tk->tkr_mono.xtime_nsec -= offset; +} + +/* + * Adjust the timekeeper's multiplier to the correct frequency + * and also to reduce the accumulated error value. + */ +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) +{ + u32 mult; + + /* + * Determine the multiplier from the current NTP tick length. + * Avoid expensive division when the tick length doesn't change. + */ + if (likely(tk->ntp_tick == ntp_tick_length())) { + mult = tk->tkr_mono.mult - tk->ntp_err_mult; + } else { + tk->ntp_tick = ntp_tick_length(); + mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - + tk->xtime_remainder, tk->cycle_interval); + } + + /* + * If the clock is behind the NTP time, increase the multiplier by 1 + * to catch up with it. If it's ahead and there was a remainder in the + * tick division, the clock will slow down. Otherwise it will stay + * ahead until the tick length changes to a non-divisible value. + */ + tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0; + mult += tk->ntp_err_mult; + + timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult); + + if (unlikely(tk->tkr_mono.clock->maxadj && + (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) + > tk->tkr_mono.clock->maxadj))) { + printk_once(KERN_WARNING + "Adjusting %s more than 11%% (%ld vs %ld)\n", + tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, + (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); + } + + /* + * It may be possible that when we entered this function, xtime_nsec + * was very small. Further, if we're slightly speeding the clocksource + * in the code above, its possible the required corrective factor to + * xtime_nsec could cause it to underflow. + * + * Now, since we have already accumulated the second and the NTP + * subsystem has been notified via second_overflow(), we need to skip + * the next update. + */ + if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { + tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC << + tk->tkr_mono.shift; + tk->xtime_sec--; + tk->skip_second_overflow = 1; + } +} + +/* + * accumulate_nsecs_to_secs - Accumulates nsecs into secs + * + * Helper function that accumulates the nsecs greater than a second + * from the xtime_nsec field to the xtime_secs field. + * It also calls into the NTP code to handle leapsecond processing. + */ +static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) +{ + u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; + unsigned int clock_set = 0; + + while (tk->tkr_mono.xtime_nsec >= nsecps) { + int leap; + + tk->tkr_mono.xtime_nsec -= nsecps; + tk->xtime_sec++; + + /* + * Skip NTP update if this second was accumulated before, + * i.e. xtime_nsec underflowed in timekeeping_adjust() + */ + if (unlikely(tk->skip_second_overflow)) { + tk->skip_second_overflow = 0; + continue; + } + + /* Figure out if its a leap sec and apply if needed */ + leap = second_overflow(tk->xtime_sec); + if (unlikely(leap)) { + struct timespec64 ts; + + tk->xtime_sec += leap; + + ts.tv_sec = leap; + ts.tv_nsec = 0; + tk_set_wall_to_mono(tk, + timespec64_sub(tk->wall_to_monotonic, ts)); + + __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); + + clock_set = TK_CLOCK_WAS_SET; + } + } + return clock_set; +} + +/* + * logarithmic_accumulation - shifted accumulation of cycles + * + * This functions accumulates a shifted interval of cycles into + * a shifted interval nanoseconds. Allows for O(log) accumulation + * loop. + * + * Returns the unconsumed cycles. + */ +static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, + u32 shift, unsigned int *clock_set) +{ + u64 interval = tk->cycle_interval << shift; + u64 snsec_per_sec; + + /* If the offset is smaller than a shifted interval, do nothing */ + if (offset < interval) + return offset; + + /* Accumulate one shifted interval */ + offset -= interval; + tk->tkr_mono.cycle_last += interval; + tk->tkr_raw.cycle_last += interval; + + tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; + *clock_set |= accumulate_nsecs_to_secs(tk); + + /* Accumulate raw time */ + tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; + snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; + while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { + tk->tkr_raw.xtime_nsec -= snsec_per_sec; + tk->raw_sec++; + } + + /* Accumulate error between NTP and clock interval */ + tk->ntp_error += tk->ntp_tick << shift; + tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << + (tk->ntp_error_shift + shift); + + return offset; +} + +/* + * timekeeping_advance - Updates the timekeeper to the current time and + * current NTP tick length + */ +static bool timekeeping_advance(enum timekeeping_adv_mode mode) +{ + struct timekeeper *real_tk = &tk_core.timekeeper; + struct timekeeper *tk = &shadow_timekeeper; + u64 offset; + int shift = 0, maxshift; + unsigned int clock_set = 0; + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + + /* Make sure we're fully resumed: */ + if (unlikely(timekeeping_suspended)) + goto out; + + offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), + tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + + /* Check if there's really nothing to do */ + if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) + goto out; + + /* Do some additional sanity checking */ + timekeeping_check_update(tk, offset); + + /* + * With NO_HZ we may have to accumulate many cycle_intervals + * (think "ticks") worth of time at once. To do this efficiently, + * we calculate the largest doubling multiple of cycle_intervals + * that is smaller than the offset. We then accumulate that + * chunk in one go, and then try to consume the next smaller + * doubled multiple. + */ + shift = ilog2(offset) - ilog2(tk->cycle_interval); + shift = max(0, shift); + /* Bound shift to one less than what overflows tick_length */ + maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; + shift = min(shift, maxshift); + while (offset >= tk->cycle_interval) { + offset = logarithmic_accumulation(tk, offset, shift, + &clock_set); + if (offset < tk->cycle_interval<<shift) + shift--; + } + + /* Adjust the multiplier to correct NTP error */ + timekeeping_adjust(tk, offset); + + /* + * Finally, make sure that after the rounding + * xtime_nsec isn't larger than NSEC_PER_SEC + */ + clock_set |= accumulate_nsecs_to_secs(tk); + + write_seqcount_begin(&tk_core.seq); + /* + * Update the real timekeeper. + * + * We could avoid this memcpy by switching pointers, but that + * requires changes to all other timekeeper usage sites as + * well, i.e. move the timekeeper pointer getter into the + * spinlocked/seqcount protected sections. And we trade this + * memcpy under the tk_core.seq against one before we start + * updating. + */ + timekeeping_update(tk, clock_set); + memcpy(real_tk, tk, sizeof(*tk)); + /* The memcpy must come last. Do not put anything here! */ + write_seqcount_end(&tk_core.seq); +out: + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + return !!clock_set; +} + +/** + * update_wall_time - Uses the current clocksource to increment the wall time + * + */ +void update_wall_time(void) +{ + if (timekeeping_advance(TK_ADV_TICK)) + clock_was_set_delayed(); +} + +/** + * getboottime64 - Return the real time of system boot. + * @ts: pointer to the timespec64 to be set + * + * Returns the wall-time of boot in a timespec64. + * + * This is based on the wall_to_monotonic offset and the total suspend + * time. Calls to settimeofday will affect the value returned (which + * basically means that however wrong your real time clock is at boot time, + * you get the right time here). + */ +void getboottime64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); + + *ts = ktime_to_timespec64(t); +} +EXPORT_SYMBOL_GPL(getboottime64); + +void ktime_get_coarse_real_ts64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + *ts = tk_xtime(tk); + } while (read_seqcount_retry(&tk_core.seq, seq)); +} +EXPORT_SYMBOL(ktime_get_coarse_real_ts64); + +void ktime_get_coarse_ts64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 now, mono; + unsigned int seq; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + now = tk_xtime(tk); + mono = tk->wall_to_monotonic; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, + now.tv_nsec + mono.tv_nsec); +} +EXPORT_SYMBOL(ktime_get_coarse_ts64); + +/* + * Must hold jiffies_lock + */ +void do_timer(unsigned long ticks) +{ + jiffies_64 += ticks; + calc_global_load(); +} + +/** + * ktime_get_update_offsets_now - hrtimer helper + * @cwsseq: pointer to check and store the clock was set sequence number + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * @offs_tai: pointer to storage for monotonic -> clock tai offset + * + * Returns current monotonic time and updates the offsets if the + * sequence number in @cwsseq and timekeeper.clock_was_set_seq are + * different. + * + * Called from hrtimer_interrupt() or retrigger_next_event() + */ +ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, + ktime_t *offs_boot, ktime_t *offs_tai) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + u64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + base = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); + base = ktime_add_ns(base, nsecs); + + if (*cwsseq != tk->clock_was_set_seq) { + *cwsseq = tk->clock_was_set_seq; + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; + } + + /* Handle leapsecond insertion adjustments */ + if (unlikely(base >= tk->next_leap_ktime)) + *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0)); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return base; +} + +/* + * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex + */ +static int timekeeping_validate_timex(const struct __kernel_timex *txc) +{ + if (txc->modes & ADJ_ADJTIME) { + /* singleshot must not be used with any other mode bits */ + if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) + return -EINVAL; + if (!(txc->modes & ADJ_OFFSET_READONLY) && + !capable(CAP_SYS_TIME)) + return -EPERM; + } else { + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + /* + * if the quartz is off by more than 10% then + * something is VERY wrong! + */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; + } + + if (txc->modes & ADJ_SETOFFSET) { + /* In order to inject time, you gotta be super-user! */ + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + /* + * Validate if a timespec/timeval used to inject a time + * offset is valid. Offsets can be positive or negative, so + * we don't check tv_sec. The value of the timeval/timespec + * is the sum of its fields,but *NOTE*: + * The field tv_usec/tv_nsec must always be non-negative and + * we can't have more nanoseconds/microseconds than a second. + */ + if (txc->time.tv_usec < 0) + return -EINVAL; + + if (txc->modes & ADJ_NANO) { + if (txc->time.tv_usec >= NSEC_PER_SEC) + return -EINVAL; + } else { + if (txc->time.tv_usec >= USEC_PER_SEC) + return -EINVAL; + } + } + + /* + * Check for potential multiplication overflows that can + * only happen on 64-bit systems: + */ + if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { + if (LLONG_MIN / PPM_SCALE > txc->freq) + return -EINVAL; + if (LLONG_MAX / PPM_SCALE < txc->freq) + return -EINVAL; + } + + return 0; +} + +/** + * random_get_entropy_fallback - Returns the raw clock source value, + * used by random.c for platforms with no valid random_get_entropy(). + */ +unsigned long random_get_entropy_fallback(void) +{ + struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; + struct clocksource *clock = READ_ONCE(tkr->clock); + + if (unlikely(timekeeping_suspended || !clock)) + return 0; + return clock->read(clock); +} +EXPORT_SYMBOL_GPL(random_get_entropy_fallback); + +/** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + */ +int do_adjtimex(struct __kernel_timex *txc) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct audit_ntp_data ad; + bool clock_set = false; + struct timespec64 ts; + unsigned long flags; + s32 orig_tai, tai; + int ret; + + /* Validate the data before disabling interrupts */ + ret = timekeeping_validate_timex(txc); + if (ret) + return ret; + add_device_randomness(txc, sizeof(*txc)); + + if (txc->modes & ADJ_SETOFFSET) { + struct timespec64 delta; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + ret = timekeeping_inject_offset(&delta); + if (ret) + return ret; + + audit_tk_injoffset(delta); + } + + audit_ntp_init(&ad); + + ktime_get_real_ts64(&ts); + add_device_randomness(&ts, sizeof(ts)); + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + orig_tai = tai = tk->tai_offset; + ret = __do_adjtimex(txc, &ts, &tai, &ad); + + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tk, tai); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + clock_set = true; + } + tk_update_leap_state(tk); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + audit_ntp_log(&ad); + + /* Update the multiplier immediately if frequency was set directly */ + if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) + clock_set |= timekeeping_advance(TK_ADV_FREQ); + + if (clock_set) + clock_was_set(CLOCK_REALTIME); + + ntp_notify_cmos_timer(); + + return ret; +} + +#ifdef CONFIG_NTP_PPS +/** + * hardpps() - Accessor function to NTP __hardpps function + */ +void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + __hardpps(phase_ts, raw_ts); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} +EXPORT_SYMBOL(hardpps); +#endif /* CONFIG_NTP_PPS */ diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h new file mode 100644 index 0000000000..543beba096 --- /dev/null +++ b/kernel/time/timekeeping.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_TIME_TIMEKEEPING_H +#define _KERNEL_TIME_TIMEKEEPING_H +/* + * Internal interfaces for kernel/time/ + */ +extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, + ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); + +extern int timekeeping_valid_for_hres(void); +extern u64 timekeeping_max_deferment(void); +extern void timekeeping_warp_clock(void); +extern int timekeeping_suspend(void); +extern void timekeeping_resume(void); +#ifdef CONFIG_GENERIC_SCHED_CLOCK +extern int sched_clock_suspend(void); +extern void sched_clock_resume(void); +#else +static inline int sched_clock_suspend(void) { return 0; } +static inline void sched_clock_resume(void) { } +#endif + +extern void update_process_times(int user); +extern void do_timer(unsigned long ticks); +extern void update_wall_time(void); + +extern raw_spinlock_t jiffies_lock; +extern seqcount_raw_spinlock_t jiffies_seq; + +#define CS_NAME_LEN 32 + +#endif diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c new file mode 100644 index 0000000000..b73e8850e5 --- /dev/null +++ b/kernel/time/timekeeping_debug.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * debugfs file to track time spent in suspend + * + * Copyright (c) 2011, Google, Inc. + */ + +#include <linux/debugfs.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> +#include <linux/suspend.h> +#include <linux/time.h> + +#include "timekeeping_internal.h" + +#define NUM_BINS 32 + +static unsigned int sleep_time_bin[NUM_BINS] = {0}; + +static int tk_debug_sleep_time_show(struct seq_file *s, void *data) +{ + unsigned int bin; + seq_puts(s, " time (secs) count\n"); + seq_puts(s, "------------------------------\n"); + for (bin = 0; bin < 32; bin++) { + if (sleep_time_bin[bin] == 0) + continue; + seq_printf(s, "%10u - %-10u %4u\n", + bin ? 1 << (bin - 1) : 0, 1 << bin, + sleep_time_bin[bin]); + } + return 0; +} +DEFINE_SHOW_ATTRIBUTE(tk_debug_sleep_time); + +static int __init tk_debug_sleep_time_init(void) +{ + debugfs_create_file("sleep_time", 0444, NULL, NULL, + &tk_debug_sleep_time_fops); + return 0; +} +late_initcall(tk_debug_sleep_time_init); + +void tk_debug_account_sleep_time(const struct timespec64 *t) +{ + /* Cap bin index so we don't overflow the array */ + int bin = min(fls(t->tv_sec), NUM_BINS-1); + + sleep_time_bin[bin]++; + pm_deferred_pr_dbg("Timekeeping suspended for %lld.%03lu seconds\n", + (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC); +} + diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h new file mode 100644 index 0000000000..4ca2787d16 --- /dev/null +++ b/kernel/time/timekeeping_internal.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TIMEKEEPING_INTERNAL_H +#define _TIMEKEEPING_INTERNAL_H + +#include <linux/clocksource.h> +#include <linux/spinlock.h> +#include <linux/time.h> + +/* + * timekeeping debug functions + */ +#ifdef CONFIG_DEBUG_FS +extern void tk_debug_account_sleep_time(const struct timespec64 *t); +#else +#define tk_debug_account_sleep_time(x) +#endif + +#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE +static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) +{ + u64 ret = (now - last) & mask; + + /* + * Prevent time going backwards by checking the MSB of mask in + * the result. If set, return 0. + */ + return ret & ~(mask >> 1) ? 0 : ret; +} +#else +static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) +{ + return (now - last) & mask; +} +#endif + +/* Semi public for serialization of non timekeeper VDSO updates. */ +extern raw_spinlock_t timekeeper_lock; + +#endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c new file mode 100644 index 0000000000..63a8ce7177 --- /dev/null +++ b/kernel/time/timer.c @@ -0,0 +1,2367 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Kernel internal timers + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. + * + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to + * serialize accesses to xtime/lost_ticks). + * Copyright (C) 1998 Andrea Arcangeli + * 1999-03-10 Improved NTP compatibility by Ulrich Windl + * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love + * 2000-10-05 Implemented scalable SMP per-CPU timer handling. + * Copyright (C) 2000, 2001, 2002 Ingo Molnar + * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar + */ + +#include <linux/kernel_stat.h> +#include <linux/export.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/pid_namespace.h> +#include <linux/notifier.h> +#include <linux/thread_info.h> +#include <linux/time.h> +#include <linux/jiffies.h> +#include <linux/posix-timers.h> +#include <linux/cpu.h> +#include <linux/syscalls.h> +#include <linux/delay.h> +#include <linux/tick.h> +#include <linux/kallsyms.h> +#include <linux/irq_work.h> +#include <linux/sched/signal.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/nohz.h> +#include <linux/sched/debug.h> +#include <linux/slab.h> +#include <linux/compat.h> +#include <linux/random.h> +#include <linux/sysctl.h> + +#include <linux/uaccess.h> +#include <asm/unistd.h> +#include <asm/div64.h> +#include <asm/timex.h> +#include <asm/io.h> + +#include "tick-internal.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/timer.h> + +__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + +/* + * The timer wheel has LVL_DEPTH array levels. Each level provides an array of + * LVL_SIZE buckets. Each level is driven by its own clock and therefor each + * level has a different granularity. + * + * The level granularity is: LVL_CLK_DIV ^ lvl + * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) + * + * The array level of a newly armed timer depends on the relative expiry + * time. The farther the expiry time is away the higher the array level and + * therefor the granularity becomes. + * + * Contrary to the original timer wheel implementation, which aims for 'exact' + * expiry of the timers, this implementation removes the need for recascading + * the timers into the lower array levels. The previous 'classic' timer wheel + * implementation of the kernel already violated the 'exact' expiry by adding + * slack to the expiry time to provide batched expiration. The granularity + * levels provide implicit batching. + * + * This is an optimization of the original timer wheel implementation for the + * majority of the timer wheel use cases: timeouts. The vast majority of + * timeout timers (networking, disk I/O ...) are canceled before expiry. If + * the timeout expires it indicates that normal operation is disturbed, so it + * does not matter much whether the timeout comes with a slight delay. + * + * The only exception to this are networking timers with a small expiry + * time. They rely on the granularity. Those fit into the first wheel level, + * which has HZ granularity. + * + * We don't have cascading anymore. timers with a expiry time above the + * capacity of the last wheel level are force expired at the maximum timeout + * value of the last wheel level. From data sampling we know that the maximum + * value observed is 5 days (network connection tracking), so this should not + * be an issue. + * + * The currently chosen array constants values are a good compromise between + * array size and granularity. + * + * This results in the following granularity and range levels: + * + * HZ 1000 steps + * Level Offset Granularity Range + * 0 0 1 ms 0 ms - 63 ms + * 1 64 8 ms 64 ms - 511 ms + * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s) + * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s) + * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m) + * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m) + * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h) + * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d) + * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d) + * + * HZ 300 + * Level Offset Granularity Range + * 0 0 3 ms 0 ms - 210 ms + * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s) + * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s) + * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m) + * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m) + * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h) + * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h) + * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d) + * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) + * + * HZ 250 + * Level Offset Granularity Range + * 0 0 4 ms 0 ms - 255 ms + * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s) + * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s) + * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m) + * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m) + * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h) + * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) + * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) + * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) + * + * HZ 100 + * Level Offset Granularity Range + * 0 0 10 ms 0 ms - 630 ms + * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s) + * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s) + * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m) + * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m) + * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h) + * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d) + * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d) + */ + +/* Clock divisor for the next level */ +#define LVL_CLK_SHIFT 3 +#define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT) +#define LVL_CLK_MASK (LVL_CLK_DIV - 1) +#define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT) +#define LVL_GRAN(n) (1UL << LVL_SHIFT(n)) + +/* + * The time start value for each level to select the bucket at enqueue + * time. We start from the last possible delta of the previous level + * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()). + */ +#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) + +/* Size of each clock level */ +#define LVL_BITS 6 +#define LVL_SIZE (1UL << LVL_BITS) +#define LVL_MASK (LVL_SIZE - 1) +#define LVL_OFFS(n) ((n) * LVL_SIZE) + +/* Level depth */ +#if HZ > 100 +# define LVL_DEPTH 9 +# else +# define LVL_DEPTH 8 +#endif + +/* The cutoff (max. capacity of the wheel) */ +#define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH)) +#define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) + +/* + * The resulting wheel size. If NOHZ is configured we allocate two + * wheels so we have a separate storage for the deferrable timers. + */ +#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) + +#ifdef CONFIG_NO_HZ_COMMON +# define NR_BASES 2 +# define BASE_STD 0 +# define BASE_DEF 1 +#else +# define NR_BASES 1 +# define BASE_STD 0 +# define BASE_DEF 0 +#endif + +struct timer_base { + raw_spinlock_t lock; + struct timer_list *running_timer; +#ifdef CONFIG_PREEMPT_RT + spinlock_t expiry_lock; + atomic_t timer_waiters; +#endif + unsigned long clk; + unsigned long next_expiry; + unsigned int cpu; + bool next_expiry_recalc; + bool is_idle; + bool timers_pending; + DECLARE_BITMAP(pending_map, WHEEL_SIZE); + struct hlist_head vectors[WHEEL_SIZE]; +} ____cacheline_aligned; + +static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); + +#ifdef CONFIG_NO_HZ_COMMON + +static DEFINE_STATIC_KEY_FALSE(timers_nohz_active); +static DEFINE_MUTEX(timer_keys_mutex); + +static void timer_update_keys(struct work_struct *work); +static DECLARE_WORK(timer_update_work, timer_update_keys); + +#ifdef CONFIG_SMP +static unsigned int sysctl_timer_migration = 1; + +DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); + +static void timers_update_migration(void) +{ + if (sysctl_timer_migration && tick_nohz_active) + static_branch_enable(&timers_migration_enabled); + else + static_branch_disable(&timers_migration_enabled); +} + +#ifdef CONFIG_SYSCTL +static int timer_migration_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + mutex_lock(&timer_keys_mutex); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) + timers_update_migration(); + mutex_unlock(&timer_keys_mutex); + return ret; +} + +static struct ctl_table timer_sysctl[] = { + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = timer_migration_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init timer_sysctl_init(void) +{ + register_sysctl("kernel", timer_sysctl); + return 0; +} +device_initcall(timer_sysctl_init); +#endif /* CONFIG_SYSCTL */ +#else /* CONFIG_SMP */ +static inline void timers_update_migration(void) { } +#endif /* !CONFIG_SMP */ + +static void timer_update_keys(struct work_struct *work) +{ + mutex_lock(&timer_keys_mutex); + timers_update_migration(); + static_branch_enable(&timers_nohz_active); + mutex_unlock(&timer_keys_mutex); +} + +void timers_update_nohz(void) +{ + schedule_work(&timer_update_work); +} + +static inline bool is_timers_nohz_active(void) +{ + return static_branch_unlikely(&timers_nohz_active); +} +#else +static inline bool is_timers_nohz_active(void) { return false; } +#endif /* NO_HZ_COMMON */ + +static unsigned long round_jiffies_common(unsigned long j, int cpu, + bool force_up) +{ + int rem; + unsigned long original = j; + + /* + * We don't want all cpus firing their timers at once hitting the + * same lock or cachelines, so we skew each extra cpu with an extra + * 3 jiffies. This 3 jiffies came originally from the mm/ code which + * already did this. + * The skew is done by adding 3*cpunr, then round, then subtract this + * extra offset again. + */ + j += cpu * 3; + + rem = j % HZ; + + /* + * If the target jiffie is just after a whole second (which can happen + * due to delays of the timer irq, long irq off times etc etc) then + * we should round down to the whole second, not up. Use 1/4th second + * as cutoff for this rounding as an extreme upper bound for this. + * But never round down if @force_up is set. + */ + if (rem < HZ/4 && !force_up) /* round down */ + j = j - rem; + else /* round up */ + j = j - rem + HZ; + + /* now that we have rounded, subtract the extra skew again */ + j -= cpu * 3; + + /* + * Make sure j is still in the future. Otherwise return the + * unmodified value. + */ + return time_is_after_jiffies(j) ? j : original; +} + +/** + * __round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, false); +} +EXPORT_SYMBOL_GPL(__round_jiffies); + +/** + * __round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, false) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_relative); + +/** + * round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long round_jiffies(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), false); +} +EXPORT_SYMBOL_GPL(round_jiffies); + +/** + * round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long round_jiffies_relative(unsigned long j) +{ + return __round_jiffies_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_relative); + +/** + * __round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, true); +} +EXPORT_SYMBOL_GPL(__round_jiffies_up); + +/** + * __round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, true) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); + +/** + * round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * This is the same as round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), true); +} +EXPORT_SYMBOL_GPL(round_jiffies_up); + +/** + * round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * This is the same as round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up_relative(unsigned long j) +{ + return __round_jiffies_up_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_up_relative); + + +static inline unsigned int timer_get_idx(struct timer_list *timer) +{ + return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT; +} + +static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) +{ + timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | + idx << TIMER_ARRAYSHIFT; +} + +/* + * Helper function to calculate the array index for a given expiry + * time. + */ +static inline unsigned calc_index(unsigned long expires, unsigned lvl, + unsigned long *bucket_expiry) +{ + + /* + * The timer wheel has to guarantee that a timer does not fire + * early. Early expiry can happen due to: + * - Timer is armed at the edge of a tick + * - Truncation of the expiry time in the outer wheel levels + * + * Round up with level granularity to prevent this. + */ + expires = (expires >> LVL_SHIFT(lvl)) + 1; + *bucket_expiry = expires << LVL_SHIFT(lvl); + return LVL_OFFS(lvl) + (expires & LVL_MASK); +} + +static int calc_wheel_index(unsigned long expires, unsigned long clk, + unsigned long *bucket_expiry) +{ + unsigned long delta = expires - clk; + unsigned int idx; + + if (delta < LVL_START(1)) { + idx = calc_index(expires, 0, bucket_expiry); + } else if (delta < LVL_START(2)) { + idx = calc_index(expires, 1, bucket_expiry); + } else if (delta < LVL_START(3)) { + idx = calc_index(expires, 2, bucket_expiry); + } else if (delta < LVL_START(4)) { + idx = calc_index(expires, 3, bucket_expiry); + } else if (delta < LVL_START(5)) { + idx = calc_index(expires, 4, bucket_expiry); + } else if (delta < LVL_START(6)) { + idx = calc_index(expires, 5, bucket_expiry); + } else if (delta < LVL_START(7)) { + idx = calc_index(expires, 6, bucket_expiry); + } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { + idx = calc_index(expires, 7, bucket_expiry); + } else if ((long) delta < 0) { + idx = clk & LVL_MASK; + *bucket_expiry = clk; + } else { + /* + * Force expire obscene large timeouts to expire at the + * capacity limit of the wheel. + */ + if (delta >= WHEEL_TIMEOUT_CUTOFF) + expires = clk + WHEEL_TIMEOUT_MAX; + + idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry); + } + return idx; +} + +static void +trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) +{ + if (!is_timers_nohz_active()) + return; + + /* + * TODO: This wants some optimizing similar to the code below, but we + * will do that when we switch from push to pull for deferrable timers. + */ + if (timer->flags & TIMER_DEFERRABLE) { + if (tick_nohz_full_cpu(base->cpu)) + wake_up_nohz_cpu(base->cpu); + return; + } + + /* + * We might have to IPI the remote CPU if the base is idle and the + * timer is not deferrable. If the other CPU is on the way to idle + * then it can't set base->is_idle as we hold the base lock: + */ + if (base->is_idle) + wake_up_nohz_cpu(base->cpu); +} + +/* + * Enqueue the timer into the hash bucket, mark it pending in + * the bitmap, store the index in the timer flags then wake up + * the target CPU if needed. + */ +static void enqueue_timer(struct timer_base *base, struct timer_list *timer, + unsigned int idx, unsigned long bucket_expiry) +{ + + hlist_add_head(&timer->entry, base->vectors + idx); + __set_bit(idx, base->pending_map); + timer_set_idx(timer, idx); + + trace_timer_start(timer, timer->expires, timer->flags); + + /* + * Check whether this is the new first expiring timer. The + * effective expiry time of the timer is required here + * (bucket_expiry) instead of timer->expires. + */ + if (time_before(bucket_expiry, base->next_expiry)) { + /* + * Set the next expiry time and kick the CPU so it + * can reevaluate the wheel: + */ + base->next_expiry = bucket_expiry; + base->timers_pending = true; + base->next_expiry_recalc = false; + trigger_dyntick_cpu(base, timer); + } +} + +static void internal_add_timer(struct timer_base *base, struct timer_list *timer) +{ + unsigned long bucket_expiry; + unsigned int idx; + + idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry); + enqueue_timer(base, timer, idx, bucket_expiry); +} + +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS + +static const struct debug_obj_descr timer_debug_descr; + +struct timer_hint { + void (*function)(struct timer_list *t); + long offset; +}; + +#define TIMER_HINT(fn, container, timr, hintfn) \ + { \ + .function = fn, \ + .offset = offsetof(container, hintfn) - \ + offsetof(container, timr) \ + } + +static const struct timer_hint timer_hints[] = { + TIMER_HINT(delayed_work_timer_fn, + struct delayed_work, timer, work.func), + TIMER_HINT(kthread_delayed_work_timer_fn, + struct kthread_delayed_work, timer, work.func), +}; + +static void *timer_debug_hint(void *addr) +{ + struct timer_list *timer = addr; + int i; + + for (i = 0; i < ARRAY_SIZE(timer_hints); i++) { + if (timer_hints[i].function == timer->function) { + void (**fn)(void) = addr + timer_hints[i].offset; + + return *fn; + } + } + + return timer->function; +} + +static bool timer_is_static_object(void *addr) +{ + struct timer_list *timer = addr; + + return (timer->entry.pprev == NULL && + timer->entry.next == TIMER_ENTRY_STATIC); +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static bool timer_fixup_init(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_init(timer, &timer_debug_descr); + return true; + default: + return false; + } +} + +/* Stub timer callback for improperly used timers. */ +static void stub_timer(struct timer_list *unused) +{ + WARN_ON(1); +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown non-static object is activated + */ +static bool timer_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_NOTAVAILABLE: + timer_setup(timer, stub_timer, 0); + return true; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + fallthrough; + default: + return false; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static bool timer_fixup_free(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_free(timer, &timer_debug_descr); + return true; + default: + return false; + } +} + +/* + * fixup_assert_init is called when: + * - an untracked/uninit-ed object is found + */ +static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_NOTAVAILABLE: + timer_setup(timer, stub_timer, 0); + return true; + default: + return false; + } +} + +static const struct debug_obj_descr timer_debug_descr = { + .name = "timer_list", + .debug_hint = timer_debug_hint, + .is_static_object = timer_is_static_object, + .fixup_init = timer_fixup_init, + .fixup_activate = timer_fixup_activate, + .fixup_free = timer_fixup_free, + .fixup_assert_init = timer_fixup_assert_init, +}; + +static inline void debug_timer_init(struct timer_list *timer) +{ + debug_object_init(timer, &timer_debug_descr); +} + +static inline void debug_timer_activate(struct timer_list *timer) +{ + debug_object_activate(timer, &timer_debug_descr); +} + +static inline void debug_timer_deactivate(struct timer_list *timer) +{ + debug_object_deactivate(timer, &timer_debug_descr); +} + +static inline void debug_timer_assert_init(struct timer_list *timer) +{ + debug_object_assert_init(timer, &timer_debug_descr); +} + +static void do_init_timer(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags, + const char *name, struct lock_class_key *key); + +void init_timer_on_stack_key(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags, + const char *name, struct lock_class_key *key) +{ + debug_object_init_on_stack(timer, &timer_debug_descr); + do_init_timer(timer, func, flags, name, key); +} +EXPORT_SYMBOL_GPL(init_timer_on_stack_key); + +void destroy_timer_on_stack(struct timer_list *timer) +{ + debug_object_free(timer, &timer_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_timer_on_stack); + +#else +static inline void debug_timer_init(struct timer_list *timer) { } +static inline void debug_timer_activate(struct timer_list *timer) { } +static inline void debug_timer_deactivate(struct timer_list *timer) { } +static inline void debug_timer_assert_init(struct timer_list *timer) { } +#endif + +static inline void debug_init(struct timer_list *timer) +{ + debug_timer_init(timer); + trace_timer_init(timer); +} + +static inline void debug_deactivate(struct timer_list *timer) +{ + debug_timer_deactivate(timer); + trace_timer_cancel(timer); +} + +static inline void debug_assert_init(struct timer_list *timer) +{ + debug_timer_assert_init(timer); +} + +static void do_init_timer(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags, + const char *name, struct lock_class_key *key) +{ + timer->entry.pprev = NULL; + timer->function = func; + if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS)) + flags &= TIMER_INIT_FLAGS; + timer->flags = flags | raw_smp_processor_id(); + lockdep_init_map(&timer->lockdep_map, name, key, 0); +} + +/** + * init_timer_key - initialize a timer + * @timer: the timer to be initialized + * @func: timer callback function + * @flags: timer flags + * @name: name of the timer + * @key: lockdep class key of the fake lock used for tracking timer + * sync lock dependencies + * + * init_timer_key() must be done to a timer prior calling *any* of the + * other timer functions. + */ +void init_timer_key(struct timer_list *timer, + void (*func)(struct timer_list *), unsigned int flags, + const char *name, struct lock_class_key *key) +{ + debug_init(timer); + do_init_timer(timer, func, flags, name, key); +} +EXPORT_SYMBOL(init_timer_key); + +static inline void detach_timer(struct timer_list *timer, bool clear_pending) +{ + struct hlist_node *entry = &timer->entry; + + debug_deactivate(timer); + + __hlist_del(entry); + if (clear_pending) + entry->pprev = NULL; + entry->next = LIST_POISON2; +} + +static int detach_if_pending(struct timer_list *timer, struct timer_base *base, + bool clear_pending) +{ + unsigned idx = timer_get_idx(timer); + + if (!timer_pending(timer)) + return 0; + + if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) { + __clear_bit(idx, base->pending_map); + base->next_expiry_recalc = true; + } + + detach_timer(timer, clear_pending); + return 1; +} + +static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) +{ + struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); + + /* + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. + */ + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) + base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); + return base; +} + +static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + /* + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. + */ + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) + base = this_cpu_ptr(&timer_bases[BASE_DEF]); + return base; +} + +static inline struct timer_base *get_timer_base(u32 tflags) +{ + return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); +} + +static inline struct timer_base * +get_target_base(struct timer_base *base, unsigned tflags) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + if (static_branch_likely(&timers_migration_enabled) && + !(tflags & TIMER_PINNED)) + return get_timer_cpu_base(tflags, get_nohz_timer_target()); +#endif + return get_timer_this_cpu_base(tflags); +} + +static inline void forward_timer_base(struct timer_base *base) +{ + unsigned long jnow = READ_ONCE(jiffies); + + /* + * No need to forward if we are close enough below jiffies. + * Also while executing timers, base->clk is 1 offset ahead + * of jiffies to avoid endless requeuing to current jiffies. + */ + if ((long)(jnow - base->clk) < 1) + return; + + /* + * If the next expiry value is > jiffies, then we fast forward to + * jiffies otherwise we forward to the next expiry value. + */ + if (time_after(base->next_expiry, jnow)) { + base->clk = jnow; + } else { + if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) + return; + base->clk = base->next_expiry; + } +} + + +/* + * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means + * that all timers which are tied to this base are locked, and the base itself + * is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found in the base->vectors array. + * + * When a timer is migrating then the TIMER_MIGRATING flag is set and we need + * to wait until the migration is done. + */ +static struct timer_base *lock_timer_base(struct timer_list *timer, + unsigned long *flags) + __acquires(timer->base->lock) +{ + for (;;) { + struct timer_base *base; + u32 tf; + + /* + * We need to use READ_ONCE() here, otherwise the compiler + * might re-read @tf between the check for TIMER_MIGRATING + * and spin_lock(). + */ + tf = READ_ONCE(timer->flags); + + if (!(tf & TIMER_MIGRATING)) { + base = get_timer_base(tf); + raw_spin_lock_irqsave(&base->lock, *flags); + if (timer->flags == tf) + return base; + raw_spin_unlock_irqrestore(&base->lock, *flags); + } + cpu_relax(); + } +} + +#define MOD_TIMER_PENDING_ONLY 0x01 +#define MOD_TIMER_REDUCE 0x02 +#define MOD_TIMER_NOTPENDING 0x04 + +static inline int +__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) +{ + unsigned long clk = 0, flags, bucket_expiry; + struct timer_base *base, *new_base; + unsigned int idx = UINT_MAX; + int ret = 0; + + debug_assert_init(timer); + + /* + * This is a common optimization triggered by the networking code - if + * the timer is re-modified to have the same timeout or ends up in the + * same array bucket then just return: + */ + if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) { + /* + * The downside of this optimization is that it can result in + * larger granularity than you would get from adding a new + * timer with this expiry. + */ + long diff = timer->expires - expires; + + if (!diff) + return 1; + if (options & MOD_TIMER_REDUCE && diff <= 0) + return 1; + + /* + * We lock timer base and calculate the bucket index right + * here. If the timer ends up in the same bucket, then we + * just update the expiry time and avoid the whole + * dequeue/enqueue dance. + */ + base = lock_timer_base(timer, &flags); + /* + * Has @timer been shutdown? This needs to be evaluated + * while holding base lock to prevent a race against the + * shutdown code. + */ + if (!timer->function) + goto out_unlock; + + forward_timer_base(base); + + if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) && + time_before_eq(timer->expires, expires)) { + ret = 1; + goto out_unlock; + } + + clk = base->clk; + idx = calc_wheel_index(expires, clk, &bucket_expiry); + + /* + * Retrieve and compare the array index of the pending + * timer. If it matches set the expiry to the new value so a + * subsequent call will exit in the expires check above. + */ + if (idx == timer_get_idx(timer)) { + if (!(options & MOD_TIMER_REDUCE)) + timer->expires = expires; + else if (time_after(timer->expires, expires)) + timer->expires = expires; + ret = 1; + goto out_unlock; + } + } else { + base = lock_timer_base(timer, &flags); + /* + * Has @timer been shutdown? This needs to be evaluated + * while holding base lock to prevent a race against the + * shutdown code. + */ + if (!timer->function) + goto out_unlock; + + forward_timer_base(base); + } + + ret = detach_if_pending(timer, base, false); + if (!ret && (options & MOD_TIMER_PENDING_ONLY)) + goto out_unlock; + + new_base = get_target_base(base, timer->flags); + + if (base != new_base) { + /* + * We are trying to schedule the timer on the new base. + * However we can't change timer's base while it is running, + * otherwise timer_delete_sync() can't detect that the timer's + * handler yet has not finished. This also guarantees that the + * timer is serialized wrt itself. + */ + if (likely(base->running_timer != timer)) { + /* See the comment in lock_timer_base() */ + timer->flags |= TIMER_MIGRATING; + + raw_spin_unlock(&base->lock); + base = new_base; + raw_spin_lock(&base->lock); + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | base->cpu); + forward_timer_base(base); + } + } + + debug_timer_activate(timer); + + timer->expires = expires; + /* + * If 'idx' was calculated above and the base time did not advance + * between calculating 'idx' and possibly switching the base, only + * enqueue_timer() is required. Otherwise we need to (re)calculate + * the wheel index via internal_add_timer(). + */ + if (idx != UINT_MAX && clk == base->clk) + enqueue_timer(base, timer, idx, bucket_expiry); + else + internal_add_timer(base, timer); + +out_unlock: + raw_spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + +/** + * mod_timer_pending - Modify a pending timer's timeout + * @timer: The pending timer to be modified + * @expires: New absolute timeout in jiffies + * + * mod_timer_pending() is the same for pending timers as mod_timer(), but + * will not activate inactive timers. + * + * If @timer->function == NULL then the start operation is silently + * discarded. + * + * Return: + * * %0 - The timer was inactive and not modified or was in + * shutdown state and the operation was discarded + * * %1 - The timer was active and requeued to expire at @expires + */ +int mod_timer_pending(struct timer_list *timer, unsigned long expires) +{ + return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY); +} +EXPORT_SYMBOL(mod_timer_pending); + +/** + * mod_timer - Modify a timer's timeout + * @timer: The timer to be modified + * @expires: New absolute timeout in jiffies + * + * mod_timer(timer, expires) is equivalent to: + * + * del_timer(timer); timer->expires = expires; add_timer(timer); + * + * mod_timer() is more efficient than the above open coded sequence. In + * case that the timer is inactive, the del_timer() part is a NOP. The + * timer is in any case activated with the new expiry time @expires. + * + * Note that if there are multiple unserialized concurrent users of the + * same timer, then mod_timer() is the only safe way to modify the timeout, + * since add_timer() cannot modify an already running timer. + * + * If @timer->function == NULL then the start operation is silently + * discarded. In this case the return value is 0 and meaningless. + * + * Return: + * * %0 - The timer was inactive and started or was in shutdown + * state and the operation was discarded + * * %1 - The timer was active and requeued to expire at @expires or + * the timer was active and not modified because @expires did + * not change the effective expiry time + */ +int mod_timer(struct timer_list *timer, unsigned long expires) +{ + return __mod_timer(timer, expires, 0); +} +EXPORT_SYMBOL(mod_timer); + +/** + * timer_reduce - Modify a timer's timeout if it would reduce the timeout + * @timer: The timer to be modified + * @expires: New absolute timeout in jiffies + * + * timer_reduce() is very similar to mod_timer(), except that it will only + * modify an enqueued timer if that would reduce the expiration time. If + * @timer is not enqueued it starts the timer. + * + * If @timer->function == NULL then the start operation is silently + * discarded. + * + * Return: + * * %0 - The timer was inactive and started or was in shutdown + * state and the operation was discarded + * * %1 - The timer was active and requeued to expire at @expires or + * the timer was active and not modified because @expires + * did not change the effective expiry time such that the + * timer would expire earlier than already scheduled + */ +int timer_reduce(struct timer_list *timer, unsigned long expires) +{ + return __mod_timer(timer, expires, MOD_TIMER_REDUCE); +} +EXPORT_SYMBOL(timer_reduce); + +/** + * add_timer - Start a timer + * @timer: The timer to be started + * + * Start @timer to expire at @timer->expires in the future. @timer->expires + * is the absolute expiry time measured in 'jiffies'. When the timer expires + * timer->function(timer) will be invoked from soft interrupt context. + * + * The @timer->expires and @timer->function fields must be set prior + * to calling this function. + * + * If @timer->function == NULL then the start operation is silently + * discarded. + * + * If @timer->expires is already in the past @timer will be queued to + * expire at the next timer tick. + * + * This can only operate on an inactive timer. Attempts to invoke this on + * an active timer are rejected with a warning. + */ +void add_timer(struct timer_list *timer) +{ + if (WARN_ON_ONCE(timer_pending(timer))) + return; + __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); +} +EXPORT_SYMBOL(add_timer); + +/** + * add_timer_on - Start a timer on a particular CPU + * @timer: The timer to be started + * @cpu: The CPU to start it on + * + * Same as add_timer() except that it starts the timer on the given CPU. + * + * See add_timer() for further details. + */ +void add_timer_on(struct timer_list *timer, int cpu) +{ + struct timer_base *new_base, *base; + unsigned long flags; + + debug_assert_init(timer); + + if (WARN_ON_ONCE(timer_pending(timer))) + return; + + new_base = get_timer_cpu_base(timer->flags, cpu); + + /* + * If @timer was on a different CPU, it should be migrated with the + * old base locked to prevent other operations proceeding with the + * wrong base locked. See lock_timer_base(). + */ + base = lock_timer_base(timer, &flags); + /* + * Has @timer been shutdown? This needs to be evaluated while + * holding base lock to prevent a race against the shutdown code. + */ + if (!timer->function) + goto out_unlock; + + if (base != new_base) { + timer->flags |= TIMER_MIGRATING; + + raw_spin_unlock(&base->lock); + base = new_base; + raw_spin_lock(&base->lock); + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | cpu); + } + forward_timer_base(base); + + debug_timer_activate(timer); + internal_add_timer(base, timer); +out_unlock: + raw_spin_unlock_irqrestore(&base->lock, flags); +} +EXPORT_SYMBOL_GPL(add_timer_on); + +/** + * __timer_delete - Internal function: Deactivate a timer + * @timer: The timer to be deactivated + * @shutdown: If true, this indicates that the timer is about to be + * shutdown permanently. + * + * If @shutdown is true then @timer->function is set to NULL under the + * timer base lock which prevents further rearming of the time. In that + * case any attempt to rearm @timer after this function returns will be + * silently ignored. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated + */ +static int __timer_delete(struct timer_list *timer, bool shutdown) +{ + struct timer_base *base; + unsigned long flags; + int ret = 0; + + debug_assert_init(timer); + + /* + * If @shutdown is set then the lock has to be taken whether the + * timer is pending or not to protect against a concurrent rearm + * which might hit between the lockless pending check and the lock + * aquisition. By taking the lock it is ensured that such a newly + * enqueued timer is dequeued and cannot end up with + * timer->function == NULL in the expiry code. + * + * If timer->function is currently executed, then this makes sure + * that the callback cannot requeue the timer. + */ + if (timer_pending(timer) || shutdown) { + base = lock_timer_base(timer, &flags); + ret = detach_if_pending(timer, base, true); + if (shutdown) + timer->function = NULL; + raw_spin_unlock_irqrestore(&base->lock, flags); + } + + return ret; +} + +/** + * timer_delete - Deactivate a timer + * @timer: The timer to be deactivated + * + * The function only deactivates a pending timer, but contrary to + * timer_delete_sync() it does not take into account whether the timer's + * callback function is concurrently executed on a different CPU or not. + * It neither prevents rearming of the timer. If @timer can be rearmed + * concurrently then the return value of this function is meaningless. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated + */ +int timer_delete(struct timer_list *timer) +{ + return __timer_delete(timer, false); +} +EXPORT_SYMBOL(timer_delete); + +/** + * timer_shutdown - Deactivate a timer and prevent rearming + * @timer: The timer to be deactivated + * + * The function does not wait for an eventually running timer callback on a + * different CPU but it prevents rearming of the timer. Any attempt to arm + * @timer after this function returns will be silently ignored. + * + * This function is useful for teardown code and should only be used when + * timer_shutdown_sync() cannot be invoked due to locking or context constraints. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending + */ +int timer_shutdown(struct timer_list *timer) +{ + return __timer_delete(timer, true); +} +EXPORT_SYMBOL_GPL(timer_shutdown); + +/** + * __try_to_del_timer_sync - Internal function: Try to deactivate a timer + * @timer: Timer to deactivate + * @shutdown: If true, this indicates that the timer is about to be + * shutdown permanently. + * + * If @shutdown is true then @timer->function is set to NULL under the + * timer base lock which prevents further rearming of the timer. Any + * attempt to rearm @timer after this function returns will be silently + * ignored. + * + * This function cannot guarantee that the timer cannot be rearmed + * right after dropping the base lock if @shutdown is false. That + * needs to be prevented by the calling code if necessary. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated + * * %-1 - The timer callback function is running on a different CPU + */ +static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) +{ + struct timer_base *base; + unsigned long flags; + int ret = -1; + + debug_assert_init(timer); + + base = lock_timer_base(timer, &flags); + + if (base->running_timer != timer) + ret = detach_if_pending(timer, base, true); + if (shutdown) + timer->function = NULL; + + raw_spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + +/** + * try_to_del_timer_sync - Try to deactivate a timer + * @timer: Timer to deactivate + * + * This function tries to deactivate a timer. On success the timer is not + * queued and the timer callback function is not running on any CPU. + * + * This function does not guarantee that the timer cannot be rearmed right + * after dropping the base lock. That needs to be prevented by the calling + * code if necessary. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated + * * %-1 - The timer callback function is running on a different CPU + */ +int try_to_del_timer_sync(struct timer_list *timer) +{ + return __try_to_del_timer_sync(timer, false); +} +EXPORT_SYMBOL(try_to_del_timer_sync); + +#ifdef CONFIG_PREEMPT_RT +static __init void timer_base_init_expiry_lock(struct timer_base *base) +{ + spin_lock_init(&base->expiry_lock); +} + +static inline void timer_base_lock_expiry(struct timer_base *base) +{ + spin_lock(&base->expiry_lock); +} + +static inline void timer_base_unlock_expiry(struct timer_base *base) +{ + spin_unlock(&base->expiry_lock); +} + +/* + * The counterpart to del_timer_wait_running(). + * + * If there is a waiter for base->expiry_lock, then it was waiting for the + * timer callback to finish. Drop expiry_lock and reacquire it. That allows + * the waiter to acquire the lock and make progress. + */ +static void timer_sync_wait_running(struct timer_base *base) +{ + if (atomic_read(&base->timer_waiters)) { + raw_spin_unlock_irq(&base->lock); + spin_unlock(&base->expiry_lock); + spin_lock(&base->expiry_lock); + raw_spin_lock_irq(&base->lock); + } +} + +/* + * This function is called on PREEMPT_RT kernels when the fast path + * deletion of a timer failed because the timer callback function was + * running. + * + * This prevents priority inversion, if the softirq thread on a remote CPU + * got preempted, and it prevents a life lock when the task which tries to + * delete a timer preempted the softirq thread running the timer callback + * function. + */ +static void del_timer_wait_running(struct timer_list *timer) +{ + u32 tf; + + tf = READ_ONCE(timer->flags); + if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) { + struct timer_base *base = get_timer_base(tf); + + /* + * Mark the base as contended and grab the expiry lock, + * which is held by the softirq across the timer + * callback. Drop the lock immediately so the softirq can + * expire the next timer. In theory the timer could already + * be running again, but that's more than unlikely and just + * causes another wait loop. + */ + atomic_inc(&base->timer_waiters); + spin_lock_bh(&base->expiry_lock); + atomic_dec(&base->timer_waiters); + spin_unlock_bh(&base->expiry_lock); + } +} +#else +static inline void timer_base_init_expiry_lock(struct timer_base *base) { } +static inline void timer_base_lock_expiry(struct timer_base *base) { } +static inline void timer_base_unlock_expiry(struct timer_base *base) { } +static inline void timer_sync_wait_running(struct timer_base *base) { } +static inline void del_timer_wait_running(struct timer_list *timer) { } +#endif + +/** + * __timer_delete_sync - Internal function: Deactivate a timer and wait + * for the handler to finish. + * @timer: The timer to be deactivated + * @shutdown: If true, @timer->function will be set to NULL under the + * timer base lock which prevents rearming of @timer + * + * If @shutdown is not set the timer can be rearmed later. If the timer can + * be rearmed concurrently, i.e. after dropping the base lock then the + * return value is meaningless. + * + * If @shutdown is set then @timer->function is set to NULL under timer + * base lock which prevents rearming of the timer. Any attempt to rearm + * a shutdown timer is silently ignored. + * + * If the timer should be reused after shutdown it has to be initialized + * again. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated + */ +static int __timer_delete_sync(struct timer_list *timer, bool shutdown) +{ + int ret; + +#ifdef CONFIG_LOCKDEP + unsigned long flags; + + /* + * If lockdep gives a backtrace here, please reference + * the synchronization rules above. + */ + local_irq_save(flags); + lock_map_acquire(&timer->lockdep_map); + lock_map_release(&timer->lockdep_map); + local_irq_restore(flags); +#endif + /* + * don't use it in hardirq context, because it + * could lead to deadlock. + */ + WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE)); + + /* + * Must be able to sleep on PREEMPT_RT because of the slowpath in + * del_timer_wait_running(). + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE)) + lockdep_assert_preemption_enabled(); + + do { + ret = __try_to_del_timer_sync(timer, shutdown); + + if (unlikely(ret < 0)) { + del_timer_wait_running(timer); + cpu_relax(); + } + } while (ret < 0); + + return ret; +} + +/** + * timer_delete_sync - Deactivate a timer and wait for the handler to finish. + * @timer: The timer to be deactivated + * + * Synchronization rules: Callers must prevent restarting of the timer, + * otherwise this function is meaningless. It must not be called from + * interrupt contexts unless the timer is an irqsafe one. The caller must + * not hold locks which would prevent completion of the timer's callback + * function. The timer's handler must not call add_timer_on(). Upon exit + * the timer is not queued and the handler is not running on any CPU. + * + * For !irqsafe timers, the caller must not hold locks that are held in + * interrupt context. Even if the lock has nothing to do with the timer in + * question. Here's why:: + * + * CPU0 CPU1 + * ---- ---- + * <SOFTIRQ> + * call_timer_fn(); + * base->running_timer = mytimer; + * spin_lock_irq(somelock); + * <IRQ> + * spin_lock(somelock); + * timer_delete_sync(mytimer); + * while (base->running_timer == mytimer); + * + * Now timer_delete_sync() will never return and never release somelock. + * The interrupt on the other CPU is waiting to grab somelock but it has + * interrupted the softirq that CPU0 is waiting to finish. + * + * This function cannot guarantee that the timer is not rearmed again by + * some concurrent or preempting code, right after it dropped the base + * lock. If there is the possibility of a concurrent rearm then the return + * value of the function is meaningless. + * + * If such a guarantee is needed, e.g. for teardown situations then use + * timer_shutdown_sync() instead. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated + */ +int timer_delete_sync(struct timer_list *timer) +{ + return __timer_delete_sync(timer, false); +} +EXPORT_SYMBOL(timer_delete_sync); + +/** + * timer_shutdown_sync - Shutdown a timer and prevent rearming + * @timer: The timer to be shutdown + * + * When the function returns it is guaranteed that: + * - @timer is not queued + * - The callback function of @timer is not running + * - @timer cannot be enqueued again. Any attempt to rearm + * @timer is silently ignored. + * + * See timer_delete_sync() for synchronization rules. + * + * This function is useful for final teardown of an infrastructure where + * the timer is subject to a circular dependency problem. + * + * A common pattern for this is a timer and a workqueue where the timer can + * schedule work and work can arm the timer. On shutdown the workqueue must + * be destroyed and the timer must be prevented from rearming. Unless the + * code has conditionals like 'if (mything->in_shutdown)' to prevent that + * there is no way to get this correct with timer_delete_sync(). + * + * timer_shutdown_sync() is solving the problem. The correct ordering of + * calls in this case is: + * + * timer_shutdown_sync(&mything->timer); + * workqueue_destroy(&mything->workqueue); + * + * After this 'mything' can be safely freed. + * + * This obviously implies that the timer is not required to be functional + * for the rest of the shutdown operation. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending + */ +int timer_shutdown_sync(struct timer_list *timer) +{ + return __timer_delete_sync(timer, true); +} +EXPORT_SYMBOL_GPL(timer_shutdown_sync); + +static void call_timer_fn(struct timer_list *timer, + void (*fn)(struct timer_list *), + unsigned long baseclk) +{ + int count = preempt_count(); + +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the timer from inside the + * function that is called from it, this we need to take into + * account for lockdep too. To avoid bogus "held lock freed" + * warnings as well as problems when looking into + * timer->lockdep_map, make a copy and use that here. + */ + struct lockdep_map lockdep_map; + + lockdep_copy_map(&lockdep_map, &timer->lockdep_map); +#endif + /* + * Couple the lock chain with the lock chain at + * timer_delete_sync() by acquiring the lock_map around the fn() + * call here and in timer_delete_sync(). + */ + lock_map_acquire(&lockdep_map); + + trace_timer_expire_entry(timer, baseclk); + fn(timer); + trace_timer_expire_exit(timer); + + lock_map_release(&lockdep_map); + + if (count != preempt_count()) { + WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", + fn, count, preempt_count()); + /* + * Restore the preempt count. That gives us a decent + * chance to survive and extract information. If the + * callback kept a lock held, bad luck, but not worse + * than the BUG() we had. + */ + preempt_count_set(count); + } +} + +static void expire_timers(struct timer_base *base, struct hlist_head *head) +{ + /* + * This value is required only for tracing. base->clk was + * incremented directly before expire_timers was called. But expiry + * is related to the old base->clk value. + */ + unsigned long baseclk = base->clk - 1; + + while (!hlist_empty(head)) { + struct timer_list *timer; + void (*fn)(struct timer_list *); + + timer = hlist_entry(head->first, struct timer_list, entry); + + base->running_timer = timer; + detach_timer(timer, true); + + fn = timer->function; + + if (WARN_ON_ONCE(!fn)) { + /* Should never happen. Emphasis on should! */ + base->running_timer = NULL; + continue; + } + + if (timer->flags & TIMER_IRQSAFE) { + raw_spin_unlock(&base->lock); + call_timer_fn(timer, fn, baseclk); + raw_spin_lock(&base->lock); + base->running_timer = NULL; + } else { + raw_spin_unlock_irq(&base->lock); + call_timer_fn(timer, fn, baseclk); + raw_spin_lock_irq(&base->lock); + base->running_timer = NULL; + timer_sync_wait_running(base); + } + } +} + +static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + unsigned long clk = base->clk = base->next_expiry; + struct hlist_head *vec; + int i, levels = 0; + unsigned int idx; + + for (i = 0; i < LVL_DEPTH; i++) { + idx = (clk & LVL_MASK) + i * LVL_SIZE; + + if (__test_and_clear_bit(idx, base->pending_map)) { + vec = base->vectors + idx; + hlist_move_list(vec, heads++); + levels++; + } + /* Is it time to look at the next level? */ + if (clk & LVL_CLK_MASK) + break; + /* Shift clock for the next level granularity */ + clk >>= LVL_CLK_SHIFT; + } + return levels; +} + +/* + * Find the next pending bucket of a level. Search from level start (@offset) + * + @clk upwards and if nothing there, search from start of the level + * (@offset) up to @offset + clk. + */ +static int next_pending_bucket(struct timer_base *base, unsigned offset, + unsigned clk) +{ + unsigned pos, start = offset + clk; + unsigned end = offset + LVL_SIZE; + + pos = find_next_bit(base->pending_map, end, start); + if (pos < end) + return pos - start; + + pos = find_next_bit(base->pending_map, start, offset); + return pos < start ? pos + LVL_SIZE - start : -1; +} + +/* + * Search the first expiring timer in the various clock levels. Caller must + * hold base->lock. + */ +static unsigned long __next_timer_interrupt(struct timer_base *base) +{ + unsigned long clk, next, adj; + unsigned lvl, offset = 0; + + next = base->clk + NEXT_TIMER_MAX_DELTA; + clk = base->clk; + for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { + int pos = next_pending_bucket(base, offset, clk & LVL_MASK); + unsigned long lvl_clk = clk & LVL_CLK_MASK; + + if (pos >= 0) { + unsigned long tmp = clk + (unsigned long) pos; + + tmp <<= LVL_SHIFT(lvl); + if (time_before(tmp, next)) + next = tmp; + + /* + * If the next expiration happens before we reach + * the next level, no need to check further. + */ + if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK)) + break; + } + /* + * Clock for the next level. If the current level clock lower + * bits are zero, we look at the next level as is. If not we + * need to advance it by one because that's going to be the + * next expiring bucket in that level. base->clk is the next + * expiring jiffie. So in case of: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 0 0 + * + * we have to look at all levels @index 0. With + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 0 2 + * + * LVL0 has the next expiring bucket @index 2. The upper + * levels have the next expiring bucket @index 1. + * + * In case that the propagation wraps the next level the same + * rules apply: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 F 2 + * + * So after looking at LVL0 we get: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 + * 0 0 0 1 0 + * + * So no propagation from LVL1 to LVL2 because that happened + * with the add already, but then we need to propagate further + * from LVL2 to LVL3. + * + * So the simple check whether the lower bits of the current + * level are 0 or not is sufficient for all cases. + */ + adj = lvl_clk ? 1 : 0; + clk >>= LVL_CLK_SHIFT; + clk += adj; + } + + base->next_expiry_recalc = false; + base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); + + return next; +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) +{ + u64 nextevt = hrtimer_get_next_event(); + + /* + * If high resolution timers are enabled + * hrtimer_get_next_event() returns KTIME_MAX. + */ + if (expires <= nextevt) + return expires; + + /* + * If the next timer is already expired, return the tick base + * time so the tick is fired immediately. + */ + if (nextevt <= basem) + return basem; + + /* + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to + * make sure that this tick really expires the timer to avoid + * a ping pong of the nohz stop code. + * + * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3 + */ + return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; +} + +/** + * get_next_timer_interrupt - return the time (clock mono) of the next timer + * @basej: base time jiffies + * @basem: base time clock monotonic + * + * Returns the tick aligned clock monotonic time of the next pending + * timer or KTIME_MAX if no timer is pending. + */ +u64 get_next_timer_interrupt(unsigned long basej, u64 basem) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + u64 expires = KTIME_MAX; + unsigned long nextevt; + + /* + * Pretend that there is no timer pending if the cpu is offline. + * Possible pending timers will be migrated later to an active cpu. + */ + if (cpu_is_offline(smp_processor_id())) + return expires; + + raw_spin_lock(&base->lock); + if (base->next_expiry_recalc) + base->next_expiry = __next_timer_interrupt(base); + nextevt = base->next_expiry; + + /* + * We have a fresh next event. Check whether we can forward the + * base. We can only do that when @basej is past base->clk + * otherwise we might rewind base->clk. + */ + if (time_after(basej, base->clk)) { + if (time_after(nextevt, basej)) + base->clk = basej; + else if (time_after(nextevt, base->clk)) + base->clk = nextevt; + } + + if (time_before_eq(nextevt, basej)) { + expires = basem; + base->is_idle = false; + } else { + if (base->timers_pending) + expires = basem + (u64)(nextevt - basej) * TICK_NSEC; + /* + * If we expect to sleep more than a tick, mark the base idle. + * Also the tick is stopped so any added timer must forward + * the base clk itself to keep granularity small. This idle + * logic is only maintained for the BASE_STD base, deferrable + * timers may still see large granularity skew (by design). + */ + if ((expires - basem) > TICK_NSEC) + base->is_idle = true; + } + raw_spin_unlock(&base->lock); + + return cmp_next_hrtimer_event(basem, expires); +} + +/** + * timer_clear_idle - Clear the idle state of the timer base + * + * Called with interrupts disabled + */ +void timer_clear_idle(void) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + /* + * We do this unlocked. The worst outcome is a remote enqueue sending + * a pointless IPI, but taking the lock would just make the window for + * sending the IPI a few instructions smaller for the cost of taking + * the lock in the exit from idle path. + */ + base->is_idle = false; +} +#endif + +/** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. + */ +static inline void __run_timers(struct timer_base *base) +{ + struct hlist_head heads[LVL_DEPTH]; + int levels; + + if (time_before(jiffies, base->next_expiry)) + return; + + timer_base_lock_expiry(base); + raw_spin_lock_irq(&base->lock); + + while (time_after_eq(jiffies, base->clk) && + time_after_eq(jiffies, base->next_expiry)) { + levels = collect_expired_timers(base, heads); + /* + * The two possible reasons for not finding any expired + * timer at this clk are that all matching timers have been + * dequeued or no timer has been queued since + * base::next_expiry was set to base::clk + + * NEXT_TIMER_MAX_DELTA. + */ + WARN_ON_ONCE(!levels && !base->next_expiry_recalc + && base->timers_pending); + base->clk++; + base->next_expiry = __next_timer_interrupt(base); + + while (levels--) + expire_timers(base, heads + levels); + } + raw_spin_unlock_irq(&base->lock); + timer_base_unlock_expiry(base); +} + +/* + * This function runs timers and the timer-tq in bottom half context. + */ +static __latent_entropy void run_timer_softirq(struct softirq_action *h) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + __run_timers(base); + if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) + __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); +} + +/* + * Called by the local, per-CPU timer interrupt on SMP. + */ +static void run_local_timers(void) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + hrtimer_run_queues(); + /* Raise the softirq only if required. */ + if (time_before(jiffies, base->next_expiry)) { + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) + return; + /* CPU is awake, so check the deferrable base. */ + base++; + if (time_before(jiffies, base->next_expiry)) + return; + } + raise_softirq(TIMER_SOFTIRQ); +} + +/* + * Called from the timer interrupt handler to charge one tick to the current + * process. user_tick is 1 if the tick is user time, 0 for system. + */ +void update_process_times(int user_tick) +{ + struct task_struct *p = current; + + /* Note: this timer irq context must be accounted for as well. */ + account_process_tick(p, user_tick); + run_local_timers(); + rcu_sched_clock_irq(user_tick); +#ifdef CONFIG_IRQ_WORK + if (in_irq()) + irq_work_tick(); +#endif + scheduler_tick(); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) + run_posix_cpu_timers(); +} + +/* + * Since schedule_timeout()'s timer is defined on the stack, it must store + * the target task on the stack as well. + */ +struct process_timer { + struct timer_list timer; + struct task_struct *task; +}; + +static void process_timeout(struct timer_list *t) +{ + struct process_timer *timeout = from_timer(timeout, t, timer); + + wake_up_process(timeout->task); +} + +/** + * schedule_timeout - sleep until timeout + * @timeout: timeout value in jiffies + * + * Make the current task sleep until @timeout jiffies have elapsed. + * The function behavior depends on the current task state + * (see also set_current_state() description): + * + * %TASK_RUNNING - the scheduler is called, but the task does not sleep + * at all. That happens because sched_submit_work() does nothing for + * tasks in %TASK_RUNNING state. + * + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task or the current task is explicitly woken + * up. + * + * The current task state is guaranteed to be %TASK_RUNNING when this + * routine returns. + * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. + * + * Returns 0 when the timer has expired otherwise the remaining time in + * jiffies will be returned. In all cases the return value is guaranteed + * to be non-negative. + */ +signed long __sched schedule_timeout(signed long timeout) +{ + struct process_timer timer; + unsigned long expire; + + switch (timeout) + { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable + * in the caller. Nothing more. We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative value + * but I' d like to return a valid offset (>=0) to allow + * the caller to do everything it want with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. Note that the retval will be + * 0 since no piece of kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happens anyway). You just have the printk() + * that will tell you if something is gone wrong and where. + */ + if (timeout < 0) { + printk(KERN_ERR "schedule_timeout: wrong timeout " + "value %lx\n", timeout); + dump_stack(); + __set_current_state(TASK_RUNNING); + goto out; + } + } + + expire = timeout + jiffies; + + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); + schedule(); + del_timer_sync(&timer.timer); + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer.timer); + + timeout = expire - jiffies; + + out: + return timeout < 0 ? 0 : timeout; +} +EXPORT_SYMBOL(schedule_timeout); + +/* + * We can use __set_current_state() here because schedule_timeout() calls + * schedule() unconditionally. + */ +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); + +signed long __sched schedule_timeout_killable(signed long timeout) +{ + __set_current_state(TASK_KILLABLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_killable); + +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_uninterruptible); + +/* + * Like schedule_timeout_uninterruptible(), except this task will not contribute + * to load average. + */ +signed long __sched schedule_timeout_idle(signed long timeout) +{ + __set_current_state(TASK_IDLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_idle); + +#ifdef CONFIG_HOTPLUG_CPU +static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) +{ + struct timer_list *timer; + int cpu = new_base->cpu; + + while (!hlist_empty(head)) { + timer = hlist_entry(head->first, struct timer_list, entry); + detach_timer(timer, false); + timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; + internal_add_timer(new_base, timer); + } +} + +int timers_prepare_cpu(unsigned int cpu) +{ + struct timer_base *base; + int b; + + for (b = 0; b < NR_BASES; b++) { + base = per_cpu_ptr(&timer_bases[b], cpu); + base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry_recalc = false; + base->timers_pending = false; + base->is_idle = false; + } + return 0; +} + +int timers_dead_cpu(unsigned int cpu) +{ + struct timer_base *old_base; + struct timer_base *new_base; + int b, i; + + for (b = 0; b < NR_BASES; b++) { + old_base = per_cpu_ptr(&timer_bases[b], cpu); + new_base = get_cpu_ptr(&timer_bases[b]); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + raw_spin_lock_irq(&new_base->lock); + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + /* + * The current CPUs base clock might be stale. Update it + * before moving the timers over. + */ + forward_timer_base(new_base); + + WARN_ON_ONCE(old_base->running_timer); + old_base->running_timer = NULL; + + for (i = 0; i < WHEEL_SIZE; i++) + migrate_timer_list(new_base, old_base->vectors + i); + + raw_spin_unlock(&old_base->lock); + raw_spin_unlock_irq(&new_base->lock); + put_cpu_ptr(&timer_bases); + } + return 0; +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +static void __init init_timer_cpu(int cpu) +{ + struct timer_base *base; + int i; + + for (i = 0; i < NR_BASES; i++) { + base = per_cpu_ptr(&timer_bases[i], cpu); + base->cpu = cpu; + raw_spin_lock_init(&base->lock); + base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + timer_base_init_expiry_lock(base); + } +} + +static void __init init_timer_cpus(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + init_timer_cpu(cpu); +} + +void __init init_timers(void) +{ + init_timer_cpus(); + posix_cputimers_init_work(); + open_softirq(TIMER_SOFTIRQ, run_timer_softirq); +} + +/** + * msleep - sleep safely even with waitqueue interruptions + * @msecs: Time in milliseconds to sleep for + */ +void msleep(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +} + +EXPORT_SYMBOL(msleep); + +/** + * msleep_interruptible - sleep waiting for signals + * @msecs: Time in milliseconds to sleep for + */ +unsigned long msleep_interruptible(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); + return jiffies_to_msecs(timeout); +} + +EXPORT_SYMBOL(msleep_interruptible); + +/** + * usleep_range_state - Sleep for an approximate time in a given state + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + * @state: State of the current task that will be while sleeping + * + * In non-atomic context where the exact wakeup time is flexible, use + * usleep_range_state() instead of udelay(). The sleep improves responsiveness + * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces + * power usage by allowing hrtimers to take advantage of an already- + * scheduled interrupt instead of scheduling a new one just for this sleep. + */ +void __sched usleep_range_state(unsigned long min, unsigned long max, + unsigned int state) +{ + ktime_t exp = ktime_add_us(ktime_get(), min); + u64 delta = (u64)(max - min) * NSEC_PER_USEC; + + for (;;) { + __set_current_state(state); + /* Do not return before the requested sleep time has elapsed */ + if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) + break; + } +} +EXPORT_SYMBOL(usleep_range_state); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c new file mode 100644 index 0000000000..ed7d6ad694 --- /dev/null +++ b/kernel/time/timer_list.c @@ -0,0 +1,360 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * List pending timers + * + * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar + */ + +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/nmi.h> + +#include <linux/uaccess.h> + +#include "tick-internal.h" + +struct timer_list_iter { + int cpu; + bool second_pass; + u64 now; +}; + +/* + * This allows printing both to /proc/timer_list and + * to the console (on SysRq-Q): + */ +__printf(2, 3) +static void SEQ_printf(struct seq_file *m, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + + if (m) + seq_vprintf(m, fmt, args); + else + vprintk(fmt, args); + + va_end(args); +} + +static void +print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, + int idx, u64 now) +{ + SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); + SEQ_printf(m, ", S:%02x", timer->state); + SEQ_printf(m, "\n"); + SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", + (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), + (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)), + (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now), + (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); +} + +static void +print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, + u64 now) +{ + struct hrtimer *timer, tmp; + unsigned long next = 0, i; + struct timerqueue_node *curr; + unsigned long flags; + +next_one: + i = 0; + + touch_nmi_watchdog(); + + raw_spin_lock_irqsave(&base->cpu_base->lock, flags); + + curr = timerqueue_getnext(&base->active); + /* + * Crude but we have to do this O(N*N) thing, because + * we have to unlock the base when printing: + */ + while (curr && i < next) { + curr = timerqueue_iterate_next(curr); + i++; + } + + if (curr) { + + timer = container_of(curr, struct hrtimer, node); + tmp = *timer; + raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); + + print_timer(m, timer, &tmp, i, now); + next++; + goto next_one; + } + raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); +} + +static void +print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) +{ + SEQ_printf(m, " .base: %pK\n", base); + SEQ_printf(m, " .index: %d\n", base->index); + + SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); + + SEQ_printf(m, " .get_time: %ps\n", base->get_time); +#ifdef CONFIG_HIGH_RES_TIMERS + SEQ_printf(m, " .offset: %Lu nsecs\n", + (unsigned long long) ktime_to_ns(base->offset)); +#endif + SEQ_printf(m, "active timers:\n"); + print_active_timers(m, base, now + ktime_to_ns(base->offset)); +} + +static void print_cpu(struct seq_file *m, int cpu, u64 now) +{ + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + int i; + + SEQ_printf(m, "cpu: %d\n", cpu); + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + SEQ_printf(m, " clock %d:\n", i); + print_base(m, cpu_base->clock_base + i, now); + } +#define P(x) \ + SEQ_printf(m, " .%-15s: %Lu\n", #x, \ + (unsigned long long)(cpu_base->x)) +#define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ + (unsigned long long)(ktime_to_ns(cpu_base->x))) + +#ifdef CONFIG_HIGH_RES_TIMERS + P_ns(expires_next); + P(hres_active); + P(nr_events); + P(nr_retries); + P(nr_hangs); + P(max_hang_time); +#endif +#undef P +#undef P_ns + +#ifdef CONFIG_TICK_ONESHOT +# define P(x) \ + SEQ_printf(m, " .%-15s: %Lu\n", #x, \ + (unsigned long long)(ts->x)) +# define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ + (unsigned long long)(ktime_to_ns(ts->x))) + { + struct tick_sched *ts = tick_get_tick_sched(cpu); + P(nohz_mode); + P_ns(last_tick); + P(tick_stopped); + P(idle_jiffies); + P(idle_calls); + P(idle_sleeps); + P_ns(idle_entrytime); + P_ns(idle_waketime); + P_ns(idle_exittime); + P_ns(idle_sleeptime); + P_ns(iowait_sleeptime); + P(last_jiffies); + P(next_timer); + P_ns(idle_expires); + SEQ_printf(m, "jiffies: %Lu\n", + (unsigned long long)jiffies); + } +#endif + +#undef P +#undef P_ns + SEQ_printf(m, "\n"); +} + +#ifdef CONFIG_GENERIC_CLOCKEVENTS +static void +print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) +{ + struct clock_event_device *dev = td->evtdev; + + touch_nmi_watchdog(); + + SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); + if (cpu < 0) + SEQ_printf(m, "Broadcast device\n"); + else + SEQ_printf(m, "Per CPU device: %d\n", cpu); + + SEQ_printf(m, "Clock Event Device: "); + if (!dev) { + SEQ_printf(m, "<NULL>\n"); + return; + } + SEQ_printf(m, "%s\n", dev->name); + SEQ_printf(m, " max_delta_ns: %llu\n", + (unsigned long long) dev->max_delta_ns); + SEQ_printf(m, " min_delta_ns: %llu\n", + (unsigned long long) dev->min_delta_ns); + SEQ_printf(m, " mult: %u\n", dev->mult); + SEQ_printf(m, " shift: %u\n", dev->shift); + SEQ_printf(m, " mode: %d\n", clockevent_get_state(dev)); + SEQ_printf(m, " next_event: %Ld nsecs\n", + (unsigned long long) ktime_to_ns(dev->next_event)); + + SEQ_printf(m, " set_next_event: %ps\n", dev->set_next_event); + + if (dev->set_state_shutdown) + SEQ_printf(m, " shutdown: %ps\n", + dev->set_state_shutdown); + + if (dev->set_state_periodic) + SEQ_printf(m, " periodic: %ps\n", + dev->set_state_periodic); + + if (dev->set_state_oneshot) + SEQ_printf(m, " oneshot: %ps\n", + dev->set_state_oneshot); + + if (dev->set_state_oneshot_stopped) + SEQ_printf(m, " oneshot stopped: %ps\n", + dev->set_state_oneshot_stopped); + + if (dev->tick_resume) + SEQ_printf(m, " resume: %ps\n", + dev->tick_resume); + + SEQ_printf(m, " event_handler: %ps\n", dev->event_handler); + SEQ_printf(m, "\n"); + SEQ_printf(m, " retries: %lu\n", dev->retries); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST + if (cpu >= 0) { + const struct clock_event_device *wd = tick_get_wakeup_device(cpu); + + SEQ_printf(m, "Wakeup Device: %s\n", wd ? wd->name : "<NULL>"); + } +#endif + SEQ_printf(m, "\n"); +} + +static void timer_list_show_tickdevices_header(struct seq_file *m) +{ +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST + print_tickdevice(m, tick_get_broadcast_device(), -1); + SEQ_printf(m, "tick_broadcast_mask: %*pb\n", + cpumask_pr_args(tick_get_broadcast_mask())); +#ifdef CONFIG_TICK_ONESHOT + SEQ_printf(m, "tick_broadcast_oneshot_mask: %*pb\n", + cpumask_pr_args(tick_get_broadcast_oneshot_mask())); +#endif + SEQ_printf(m, "\n"); +#endif +} +#endif + +static inline void timer_list_header(struct seq_file *m, u64 now) +{ + SEQ_printf(m, "Timer List Version: v0.9\n"); + SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); + SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + SEQ_printf(m, "\n"); +} + +void sysrq_timer_list_show(void) +{ + u64 now = ktime_to_ns(ktime_get()); + int cpu; + + timer_list_header(NULL, now); + + for_each_online_cpu(cpu) + print_cpu(NULL, cpu, now); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS + timer_list_show_tickdevices_header(NULL); + for_each_online_cpu(cpu) + print_tickdevice(NULL, tick_get_device(cpu), cpu); +#endif + return; +} + +#ifdef CONFIG_PROC_FS +static int timer_list_show(struct seq_file *m, void *v) +{ + struct timer_list_iter *iter = v; + + if (iter->cpu == -1 && !iter->second_pass) + timer_list_header(m, iter->now); + else if (!iter->second_pass) + print_cpu(m, iter->cpu, iter->now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + else if (iter->cpu == -1 && iter->second_pass) + timer_list_show_tickdevices_header(m); + else + print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); +#endif + return 0; +} + +static void *move_iter(struct timer_list_iter *iter, loff_t offset) +{ + for (; offset; offset--) { + iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); + if (iter->cpu >= nr_cpu_ids) { +#ifdef CONFIG_GENERIC_CLOCKEVENTS + if (!iter->second_pass) { + iter->cpu = -1; + iter->second_pass = true; + } else + return NULL; +#else + return NULL; +#endif + } + } + return iter; +} + +static void *timer_list_start(struct seq_file *file, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + + if (!*offset) + iter->now = ktime_to_ns(ktime_get()); + iter->cpu = -1; + iter->second_pass = false; + return move_iter(iter, *offset); +} + +static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + ++*offset; + return move_iter(iter, 1); +} + +static void timer_list_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations timer_list_sops = { + .start = timer_list_start, + .next = timer_list_next, + .stop = timer_list_stop, + .show = timer_list_show, +}; + +static int __init init_timer_list_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = proc_create_seq_private("timer_list", 0400, NULL, &timer_list_sops, + sizeof(struct timer_list_iter), NULL); + if (!pe) + return -ENOMEM; + return 0; +} +__initcall(init_timer_list_procfs); +#endif diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c new file mode 100644 index 0000000000..f0d5062d9c --- /dev/null +++ b/kernel/time/vsyscall.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 ARM Ltd. + * + * Generic implementation of update_vsyscall and update_vsyscall_tz. + * + * Based on the x86 specific implementation. + */ + +#include <linux/hrtimer.h> +#include <linux/timekeeper_internal.h> +#include <vdso/datapage.h> +#include <vdso/helpers.h> +#include <vdso/vsyscall.h> + +#include "timekeeping_internal.h" + +static inline void update_vdso_data(struct vdso_data *vdata, + struct timekeeper *tk) +{ + struct vdso_timestamp *vdso_ts; + u64 nsec, sec; + + vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; + vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; + vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; + vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; + vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; + vdata[CS_RAW].mask = tk->tkr_raw.mask; + vdata[CS_RAW].mult = tk->tkr_raw.mult; + vdata[CS_RAW].shift = tk->tkr_raw.shift; + + /* CLOCK_MONOTONIC */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; + vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + + nsec = tk->tkr_mono.xtime_nsec; + nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); + vdso_ts->sec++; + } + vdso_ts->nsec = nsec; + + /* Copy MONOTONIC time for BOOTTIME */ + sec = vdso_ts->sec; + /* Add the boot offset */ + sec += tk->monotonic_to_boot.tv_sec; + nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; + + /* CLOCK_BOOTTIME */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; + vdso_ts->sec = sec; + + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); + vdso_ts->sec++; + } + vdso_ts->nsec = nsec; + + /* CLOCK_MONOTONIC_RAW */ + vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts->sec = tk->raw_sec; + vdso_ts->nsec = tk->tkr_raw.xtime_nsec; + + /* CLOCK_TAI */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; + vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; + vdso_ts->nsec = tk->tkr_mono.xtime_nsec; +} + +void update_vsyscall(struct timekeeper *tk) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_timestamp *vdso_ts; + s32 clock_mode; + u64 nsec; + + /* copy vsyscall data */ + vdso_write_begin(vdata); + + clock_mode = tk->tkr_mono.clock->vdso_clock_mode; + vdata[CS_HRES_COARSE].clock_mode = clock_mode; + vdata[CS_RAW].clock_mode = clock_mode; + + /* CLOCK_REALTIME also required for time() */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts->sec = tk->xtime_sec; + vdso_ts->nsec = tk->tkr_mono.xtime_nsec; + + /* CLOCK_REALTIME_COARSE */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; + vdso_ts->sec = tk->xtime_sec; + vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + + /* CLOCK_MONOTONIC_COARSE */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; + vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec = nsec + tk->wall_to_monotonic.tv_nsec; + vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); + + /* + * Read without the seqlock held by clock_getres(). + * Note: No need to have a second copy. + */ + WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); + + /* + * If the current clocksource is not VDSO capable, then spare the + * update of the high resolution parts. + */ + if (clock_mode != VDSO_CLOCKMODE_NONE) + update_vdso_data(vdata, tk); + + __arch_update_vsyscall(vdata, tk); + + vdso_write_end(vdata); + + __arch_sync_vdso_data(vdata); +} + +void update_vsyscall_tz(void) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + + vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; + vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; + + __arch_sync_vdso_data(vdata); +} + +/** + * vdso_update_begin - Start of a VDSO update section + * + * Allows architecture code to safely update the architecture specific VDSO + * data. Disables interrupts, acquires timekeeper lock to serialize against + * concurrent updates from timekeeping and invalidates the VDSO data + * sequence counter to prevent concurrent readers from accessing + * inconsistent data. + * + * Returns: Saved interrupt flags which need to be handed in to + * vdso_update_end(). + */ +unsigned long vdso_update_begin(void) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + vdso_write_begin(vdata); + return flags; +} + +/** + * vdso_update_end - End of a VDSO update section + * @flags: Interrupt flags as returned from vdso_update_begin() + * + * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data + * synchronization if the architecture requires it, drops timekeeper lock + * and restores interrupt flags. + */ +void vdso_update_end(unsigned long flags) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + + vdso_write_end(vdata); + __arch_sync_vdso_data(vdata); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} |