diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 01:02:30 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 01:02:30 +0000 |
commit | 76cb841cb886eef6b3bee341a2266c76578724ad (patch) | |
tree | f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /kernel/time | |
parent | Initial commit. (diff) | |
download | linux-76cb841cb886eef6b3bee341a2266c76578724ad.tar.xz linux-76cb841cb886eef6b3bee341a2266c76578724ad.zip |
Adding upstream version 4.19.249.upstream/4.19.249
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'kernel/time')
34 files changed, 20406 insertions, 0 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig new file mode 100644 index 000000000..78eabc41e --- /dev/null +++ b/kernel/time/Kconfig @@ -0,0 +1,134 @@ +# +# Timer subsystem related configuration options +# + +# Options selectable by arch Kconfig + +# Watchdog function for clocksources to detect instabilities +config CLOCKSOURCE_WATCHDOG + bool + +# Architecture has extra clocksource data +config ARCH_CLOCKSOURCE_DATA + bool + +# Clocksources require validation of the clocksource against the last +# cycle update - x86/TSC misfeature +config CLOCKSOURCE_VALIDATE_LAST_CYCLE + bool + +# Timekeeping vsyscall support +config GENERIC_TIME_VSYSCALL + bool + +# Old style timekeeping +config ARCH_USES_GETTIMEOFFSET + bool + +# The generic clock events infrastructure +config GENERIC_CLOCKEVENTS + bool + +# Architecture can handle broadcast in a driver-agnostic way +config ARCH_HAS_TICK_BROADCAST + bool + +# Clockevents broadcasting infrastructure +config GENERIC_CLOCKEVENTS_BROADCAST + bool + depends on GENERIC_CLOCKEVENTS + +# Automatically adjust the min. reprogramming time for +# clock event device +config GENERIC_CLOCKEVENTS_MIN_ADJUST + bool + +# Generic update of CMOS clock +config GENERIC_CMOS_UPDATE + bool + +if GENERIC_CLOCKEVENTS +menu "Timers subsystem" + +# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is +# only related to the tick functionality. Oneshot clockevent devices +# are supported independent of this. +config TICK_ONESHOT + bool + +config NO_HZ_COMMON + bool + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + +choice + prompt "Timer tick handling" + default NO_HZ_IDLE if NO_HZ + +config HZ_PERIODIC + bool "Periodic timer ticks (constant rate, no dynticks)" + help + This option keeps the tick running periodically at a constant + rate, even when the CPU doesn't need it. + +config NO_HZ_IDLE + bool "Idle dynticks system (tickless idle)" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select NO_HZ_COMMON + help + This option enables a tickless idle system: timer interrupts + will only trigger on an as-needed basis when the system is idle. + This is usually interesting for energy saving. + + Most of the time you want to say Y here. + +config NO_HZ_FULL + bool "Full dynticks system (tickless)" + # NO_HZ_COMMON dependency + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + # We need at least one periodic CPU for timekeeping + depends on SMP + depends on HAVE_CONTEXT_TRACKING + # VIRT_CPU_ACCOUNTING_GEN dependency + depends on HAVE_VIRT_CPU_ACCOUNTING_GEN + select NO_HZ_COMMON + select RCU_NOCB_CPU + select VIRT_CPU_ACCOUNTING_GEN + select IRQ_WORK + select CPU_ISOLATION + help + Adaptively try to shutdown the tick whenever possible, even when + the CPU is running tasks. Typically this requires running a single + task on the CPU. Chances for running tickless are maximized when + the task mostly runs in userspace and has few kernel activity. + + You need to fill up the nohz_full boot parameter with the + desired range of dynticks CPUs. + + This is implemented at the expense of some overhead in user <-> kernel + transitions: syscalls, exceptions and interrupts. Even when it's + dynamically off. + + Say N. + +endchoice + +config NO_HZ + bool "Old Idle dynticks config" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + help + This is the old config entry that enables dynticks idle. + We keep it around for a little while to enforce backward + compatibility with older config files. + +config HIGH_RES_TIMERS + bool "High Resolution Timer Support" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + help + This option enables high resolution timer support. If your + hardware is not capable then this option only increases + the size of the kernel image. + +endmenu +endif diff --git a/kernel/time/Makefile b/kernel/time/Makefile new file mode 100644 index 000000000..f1e46f338 --- /dev/null +++ b/kernel/time/Makefile @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y += time.o timer.o hrtimer.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o +obj-y += timeconv.o timecounter.o alarmtimer.o + +ifeq ($(CONFIG_POSIX_TIMERS),y) + obj-y += posix-timers.o posix-cpu-timers.o posix-clock.o itimer.o +else + obj-y += posix-stubs.o +endif + +obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o +ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) + obj-y += tick-broadcast.o + obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o +endif +obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o +obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o +obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o +obj-$(CONFIG_TEST_UDELAY) += test_udelay.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c new file mode 100644 index 000000000..6a2ba3988 --- /dev/null +++ b/kernel/time/alarmtimer.c @@ -0,0 +1,906 @@ +/* + * Alarmtimer interface + * + * This interface provides a timer which is similarto hrtimers, + * but triggers a RTC alarm if the box is suspend. + * + * This interface is influenced by the Android RTC Alarm timer + * interface. + * + * Copyright (C) 2010 IBM Corperation + * + * Author: John Stultz <john.stultz@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/time.h> +#include <linux/hrtimer.h> +#include <linux/timerqueue.h> +#include <linux/rtc.h> +#include <linux/sched/signal.h> +#include <linux/sched/debug.h> +#include <linux/alarmtimer.h> +#include <linux/mutex.h> +#include <linux/platform_device.h> +#include <linux/posix-timers.h> +#include <linux/workqueue.h> +#include <linux/freezer.h> +#include <linux/compat.h> +#include <linux/module.h> + +#include "posix-timers.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/alarmtimer.h> + +/** + * struct alarm_base - Alarm timer bases + * @lock: Lock for syncrhonized access to the base + * @timerqueue: Timerqueue head managing the list of events + * @gettime: Function to read the time correlating to the base + * @base_clockid: clockid for the base + */ +static struct alarm_base { + spinlock_t lock; + struct timerqueue_head timerqueue; + ktime_t (*gettime)(void); + clockid_t base_clockid; +} alarm_bases[ALARM_NUMTYPE]; + +#if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS) +/* freezer information to handle clock_nanosleep triggered wakeups */ +static enum alarmtimer_type freezer_alarmtype; +static ktime_t freezer_expires; +static ktime_t freezer_delta; +static DEFINE_SPINLOCK(freezer_delta_lock); +#endif + +#ifdef CONFIG_RTC_CLASS +static struct wakeup_source *ws; + +/* rtc timer and device for setting alarm wakeups at suspend */ +static struct rtc_timer rtctimer; +static struct rtc_device *rtcdev; +static DEFINE_SPINLOCK(rtcdev_lock); + +/** + * alarmtimer_get_rtcdev - Return selected rtcdevice + * + * This function returns the rtc device to use for wakealarms. + * If one has not already been chosen, it checks to see if a + * functional rtc device is available. + */ +struct rtc_device *alarmtimer_get_rtcdev(void) +{ + unsigned long flags; + struct rtc_device *ret; + + spin_lock_irqsave(&rtcdev_lock, flags); + ret = rtcdev; + spin_unlock_irqrestore(&rtcdev_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); + +static int alarmtimer_rtc_add_device(struct device *dev, + struct class_interface *class_intf) +{ + unsigned long flags; + struct rtc_device *rtc = to_rtc_device(dev); + struct wakeup_source *__ws; + int ret = 0; + + if (rtcdev) + return -EBUSY; + + if (!rtc->ops->set_alarm) + return -1; + if (!device_may_wakeup(rtc->dev.parent)) + return -1; + + __ws = wakeup_source_register("alarmtimer"); + + spin_lock_irqsave(&rtcdev_lock, flags); + if (!rtcdev) { + if (!try_module_get(rtc->owner)) { + ret = -1; + goto unlock; + } + + rtcdev = rtc; + /* hold a reference so it doesn't go away */ + get_device(dev); + ws = __ws; + __ws = NULL; + } +unlock: + spin_unlock_irqrestore(&rtcdev_lock, flags); + + wakeup_source_unregister(__ws); + + return ret; +} + +static inline void alarmtimer_rtc_timer_init(void) +{ + rtc_timer_init(&rtctimer, NULL, NULL); +} + +static struct class_interface alarmtimer_rtc_interface = { + .add_dev = &alarmtimer_rtc_add_device, +}; + +static int alarmtimer_rtc_interface_setup(void) +{ + alarmtimer_rtc_interface.class = rtc_class; + return class_interface_register(&alarmtimer_rtc_interface); +} +static void alarmtimer_rtc_interface_remove(void) +{ + class_interface_unregister(&alarmtimer_rtc_interface); +} +#else +struct rtc_device *alarmtimer_get_rtcdev(void) +{ + return NULL; +} +#define rtcdev (NULL) +static inline int alarmtimer_rtc_interface_setup(void) { return 0; } +static inline void alarmtimer_rtc_interface_remove(void) { } +static inline void alarmtimer_rtc_timer_init(void) { } +#endif + +/** + * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue + * @base: pointer to the base where the timer is being run + * @alarm: pointer to alarm being enqueued. + * + * Adds alarm to a alarm_base timerqueue + * + * Must hold base->lock when calling. + */ +static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) +{ + if (alarm->state & ALARMTIMER_STATE_ENQUEUED) + timerqueue_del(&base->timerqueue, &alarm->node); + + timerqueue_add(&base->timerqueue, &alarm->node); + alarm->state |= ALARMTIMER_STATE_ENQUEUED; +} + +/** + * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue + * @base: pointer to the base where the timer is running + * @alarm: pointer to alarm being removed + * + * Removes alarm to a alarm_base timerqueue + * + * Must hold base->lock when calling. + */ +static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) +{ + if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) + return; + + timerqueue_del(&base->timerqueue, &alarm->node); + alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; +} + + +/** + * alarmtimer_fired - Handles alarm hrtimer being fired. + * @timer: pointer to hrtimer being run + * + * When a alarm timer fires, this runs through the timerqueue to + * see which alarms expired, and runs those. If there are more alarm + * timers queued for the future, we set the hrtimer to fire when + * when the next future alarm timer expires. + */ +static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) +{ + struct alarm *alarm = container_of(timer, struct alarm, timer); + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + int ret = HRTIMER_NORESTART; + int restart = ALARMTIMER_NORESTART; + + spin_lock_irqsave(&base->lock, flags); + alarmtimer_dequeue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); + + if (alarm->function) + restart = alarm->function(alarm, base->gettime()); + + spin_lock_irqsave(&base->lock, flags); + if (restart != ALARMTIMER_NORESTART) { + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + alarmtimer_enqueue(base, alarm); + ret = HRTIMER_RESTART; + } + spin_unlock_irqrestore(&base->lock, flags); + + trace_alarmtimer_fired(alarm, base->gettime()); + return ret; + +} + +ktime_t alarm_expires_remaining(const struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + return ktime_sub(alarm->node.expires, base->gettime()); +} +EXPORT_SYMBOL_GPL(alarm_expires_remaining); + +#ifdef CONFIG_RTC_CLASS +/** + * alarmtimer_suspend - Suspend time callback + * @dev: unused + * @state: unused + * + * When we are going into suspend, we look through the bases + * to see which is the soonest timer to expire. We then + * set an rtc timer to fire that far into the future, which + * will wake us from suspend. + */ +static int alarmtimer_suspend(struct device *dev) +{ + ktime_t min, now, expires; + int i, ret, type; + struct rtc_device *rtc; + unsigned long flags; + struct rtc_time tm; + + spin_lock_irqsave(&freezer_delta_lock, flags); + min = freezer_delta; + expires = freezer_expires; + type = freezer_alarmtype; + freezer_delta = 0; + spin_unlock_irqrestore(&freezer_delta_lock, flags); + + rtc = alarmtimer_get_rtcdev(); + /* If we have no rtcdev, just return */ + if (!rtc) + return 0; + + /* Find the soonest timer to expire*/ + for (i = 0; i < ALARM_NUMTYPE; i++) { + struct alarm_base *base = &alarm_bases[i]; + struct timerqueue_node *next; + ktime_t delta; + + spin_lock_irqsave(&base->lock, flags); + next = timerqueue_getnext(&base->timerqueue); + spin_unlock_irqrestore(&base->lock, flags); + if (!next) + continue; + delta = ktime_sub(next->expires, base->gettime()); + if (!min || (delta < min)) { + expires = next->expires; + min = delta; + type = i; + } + } + if (min == 0) + return 0; + + if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { + __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); + return -EBUSY; + } + + trace_alarmtimer_suspend(expires, type); + + /* Setup an rtc timer to fire that far in the future */ + rtc_timer_cancel(rtc, &rtctimer); + rtc_read_time(rtc, &tm); + now = rtc_tm_to_ktime(tm); + now = ktime_add(now, min); + + /* Set alarm, if in the past reject suspend briefly to handle */ + ret = rtc_timer_start(rtc, &rtctimer, now, 0); + if (ret < 0) + __pm_wakeup_event(ws, MSEC_PER_SEC); + return ret; +} + +static int alarmtimer_resume(struct device *dev) +{ + struct rtc_device *rtc; + + rtc = alarmtimer_get_rtcdev(); + if (rtc) + rtc_timer_cancel(rtc, &rtctimer); + return 0; +} + +#else +static int alarmtimer_suspend(struct device *dev) +{ + return 0; +} + +static int alarmtimer_resume(struct device *dev) +{ + return 0; +} +#endif + +static void +__alarm_init(struct alarm *alarm, enum alarmtimer_type type, + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) +{ + timerqueue_init(&alarm->node); + alarm->timer.function = alarmtimer_fired; + alarm->function = function; + alarm->type = type; + alarm->state = ALARMTIMER_STATE_INACTIVE; +} + +/** + * alarm_init - Initialize an alarm structure + * @alarm: ptr to alarm to be initialized + * @type: the type of the alarm + * @function: callback that is run when the alarm fires + */ +void alarm_init(struct alarm *alarm, enum alarmtimer_type type, + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) +{ + hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); + __alarm_init(alarm, type, function); +} +EXPORT_SYMBOL_GPL(alarm_init); + +/** + * alarm_start - Sets an absolute alarm to fire + * @alarm: ptr to alarm to set + * @start: time to run the alarm + */ +void alarm_start(struct alarm *alarm, ktime_t start) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + alarm->node.expires = start; + alarmtimer_enqueue(base, alarm); + hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); + spin_unlock_irqrestore(&base->lock, flags); + + trace_alarmtimer_start(alarm, base->gettime()); +} +EXPORT_SYMBOL_GPL(alarm_start); + +/** + * alarm_start_relative - Sets a relative alarm to fire + * @alarm: ptr to alarm to set + * @start: time relative to now to run the alarm + */ +void alarm_start_relative(struct alarm *alarm, ktime_t start) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + start = ktime_add_safe(start, base->gettime()); + alarm_start(alarm, start); +} +EXPORT_SYMBOL_GPL(alarm_start_relative); + +void alarm_restart(struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + hrtimer_restart(&alarm->timer); + alarmtimer_enqueue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); +} +EXPORT_SYMBOL_GPL(alarm_restart); + +/** + * alarm_try_to_cancel - Tries to cancel an alarm timer + * @alarm: ptr to alarm to be canceled + * + * Returns 1 if the timer was canceled, 0 if it was not running, + * and -1 if the callback was running + */ +int alarm_try_to_cancel(struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + int ret; + + spin_lock_irqsave(&base->lock, flags); + ret = hrtimer_try_to_cancel(&alarm->timer); + if (ret >= 0) + alarmtimer_dequeue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); + + trace_alarmtimer_cancel(alarm, base->gettime()); + return ret; +} +EXPORT_SYMBOL_GPL(alarm_try_to_cancel); + + +/** + * alarm_cancel - Spins trying to cancel an alarm timer until it is done + * @alarm: ptr to alarm to be canceled + * + * Returns 1 if the timer was canceled, 0 if it was not active. + */ +int alarm_cancel(struct alarm *alarm) +{ + for (;;) { + int ret = alarm_try_to_cancel(alarm); + if (ret >= 0) + return ret; + cpu_relax(); + } +} +EXPORT_SYMBOL_GPL(alarm_cancel); + + +u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) +{ + u64 overrun = 1; + ktime_t delta; + + delta = ktime_sub(now, alarm->node.expires); + + if (delta < 0) + return 0; + + if (unlikely(delta >= interval)) { + s64 incr = ktime_to_ns(interval); + + overrun = ktime_divns(delta, incr); + + alarm->node.expires = ktime_add_ns(alarm->node.expires, + incr*overrun); + + if (alarm->node.expires > now) + return overrun; + /* + * This (and the ktime_add() below) is the + * correction for exact: + */ + overrun++; + } + + alarm->node.expires = ktime_add_safe(alarm->node.expires, interval); + return overrun; +} +EXPORT_SYMBOL_GPL(alarm_forward); + +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + return alarm_forward(alarm, base->gettime(), interval); +} +EXPORT_SYMBOL_GPL(alarm_forward_now); + +#ifdef CONFIG_POSIX_TIMERS + +static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) +{ + struct alarm_base *base; + unsigned long flags; + ktime_t delta; + + switch(type) { + case ALARM_REALTIME: + base = &alarm_bases[ALARM_REALTIME]; + type = ALARM_REALTIME_FREEZER; + break; + case ALARM_BOOTTIME: + base = &alarm_bases[ALARM_BOOTTIME]; + type = ALARM_BOOTTIME_FREEZER; + break; + default: + WARN_ONCE(1, "Invalid alarm type: %d\n", type); + return; + } + + delta = ktime_sub(absexp, base->gettime()); + + spin_lock_irqsave(&freezer_delta_lock, flags); + if (!freezer_delta || (delta < freezer_delta)) { + freezer_delta = delta; + freezer_expires = absexp; + freezer_alarmtype = type; + } + spin_unlock_irqrestore(&freezer_delta_lock, flags); +} + +/** + * clock2alarm - helper that converts from clockid to alarmtypes + * @clockid: clockid. + */ +static enum alarmtimer_type clock2alarm(clockid_t clockid) +{ + if (clockid == CLOCK_REALTIME_ALARM) + return ALARM_REALTIME; + if (clockid == CLOCK_BOOTTIME_ALARM) + return ALARM_BOOTTIME; + return -1; +} + +/** + * alarm_handle_timer - Callback for posix timers + * @alarm: alarm that fired + * + * Posix timer callback for expired alarm timers. + */ +static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, + ktime_t now) +{ + struct k_itimer *ptr = container_of(alarm, struct k_itimer, + it.alarm.alarmtimer); + enum alarmtimer_restart result = ALARMTIMER_NORESTART; + unsigned long flags; + int si_private = 0; + + spin_lock_irqsave(&ptr->it_lock, flags); + + ptr->it_active = 0; + if (ptr->it_interval) + si_private = ++ptr->it_requeue_pending; + + if (posix_timer_event(ptr, si_private) && ptr->it_interval) { + /* + * Handle ignored signals and rearm the timer. This will go + * away once we handle ignored signals proper. + */ + ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval); + ++ptr->it_requeue_pending; + ptr->it_active = 1; + result = ALARMTIMER_RESTART; + } + spin_unlock_irqrestore(&ptr->it_lock, flags); + + return result; +} + +/** + * alarm_timer_rearm - Posix timer callback for rearming timer + * @timr: Pointer to the posixtimer data struct + */ +static void alarm_timer_rearm(struct k_itimer *timr) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + + timr->it_overrun += alarm_forward_now(alarm, timr->it_interval); + alarm_start(alarm, alarm->node.expires); +} + +/** + * alarm_timer_forward - Posix timer callback for forwarding timer + * @timr: Pointer to the posixtimer data struct + * @now: Current time to forward the timer against + */ +static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + + return alarm_forward(alarm, timr->it_interval, now); +} + +/** + * alarm_timer_remaining - Posix timer callback to retrieve remaining time + * @timr: Pointer to the posixtimer data struct + * @now: Current time to calculate against + */ +static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + + return ktime_sub(alarm->node.expires, now); +} + +/** + * alarm_timer_try_to_cancel - Posix timer callback to cancel a timer + * @timr: Pointer to the posixtimer data struct + */ +static int alarm_timer_try_to_cancel(struct k_itimer *timr) +{ + return alarm_try_to_cancel(&timr->it.alarm.alarmtimer); +} + +/** + * alarm_timer_arm - Posix timer callback to arm a timer + * @timr: Pointer to the posixtimer data struct + * @expires: The new expiry time + * @absolute: Expiry value is absolute time + * @sigev_none: Posix timer does not deliver signals + */ +static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, + bool absolute, bool sigev_none) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + struct alarm_base *base = &alarm_bases[alarm->type]; + + if (!absolute) + expires = ktime_add_safe(expires, base->gettime()); + if (sigev_none) + alarm->node.expires = expires; + else + alarm_start(&timr->it.alarm.alarmtimer, expires); +} + +/** + * alarm_clock_getres - posix getres interface + * @which_clock: clockid + * @tp: timespec to fill + * + * Returns the granularity of underlying alarm base clock + */ +static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp) +{ + if (!alarmtimer_get_rtcdev()) + return -EINVAL; + + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; + return 0; +} + +/** + * alarm_clock_get - posix clock_get interface + * @which_clock: clockid + * @tp: timespec to fill. + * + * Provides the underlying alarm base time. + */ +static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp) +{ + struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; + + if (!alarmtimer_get_rtcdev()) + return -EINVAL; + + *tp = ktime_to_timespec64(base->gettime()); + return 0; +} + +/** + * alarm_timer_create - posix timer_create interface + * @new_timer: k_itimer pointer to manage + * + * Initializes the k_itimer structure. + */ +static int alarm_timer_create(struct k_itimer *new_timer) +{ + enum alarmtimer_type type; + + if (!alarmtimer_get_rtcdev()) + return -EOPNOTSUPP; + + if (!capable(CAP_WAKE_ALARM)) + return -EPERM; + + type = clock2alarm(new_timer->it_clock); + alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); + return 0; +} + +/** + * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep + * @alarm: ptr to alarm that fired + * + * Wakes up the task that set the alarmtimer + */ +static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, + ktime_t now) +{ + struct task_struct *task = (struct task_struct *)alarm->data; + + alarm->data = NULL; + if (task) + wake_up_process(task); + return ALARMTIMER_NORESTART; +} + +/** + * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation + * @alarm: ptr to alarmtimer + * @absexp: absolute expiration time + * + * Sets the alarm timer and sleeps until it is fired or interrupted. + */ +static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, + enum alarmtimer_type type) +{ + struct restart_block *restart; + alarm->data = (void *)current; + do { + set_current_state(TASK_INTERRUPTIBLE); + alarm_start(alarm, absexp); + if (likely(alarm->data)) + schedule(); + + alarm_cancel(alarm); + } while (alarm->data && !signal_pending(current)); + + __set_current_state(TASK_RUNNING); + + destroy_hrtimer_on_stack(&alarm->timer); + + if (!alarm->data) + return 0; + + if (freezing(current)) + alarmtimer_freezerset(absexp, type); + restart = ¤t->restart_block; + if (restart->nanosleep.type != TT_NONE) { + struct timespec64 rmt; + ktime_t rem; + + rem = ktime_sub(absexp, alarm_bases[type].gettime()); + + if (rem <= 0) + return 0; + rmt = ktime_to_timespec64(rem); + + return nanosleep_copyout(restart, &rmt); + } + return -ERESTART_RESTARTBLOCK; +} + +static void +alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type, + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) +{ + hrtimer_init_on_stack(&alarm->timer, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); + __alarm_init(alarm, type, function); +} + +/** + * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep + * @restart: ptr to restart block + * + * Handles restarted clock_nanosleep calls + */ +static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) +{ + enum alarmtimer_type type = restart->nanosleep.clockid; + ktime_t exp = restart->nanosleep.expires; + struct alarm alarm; + + alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup); + + return alarmtimer_do_nsleep(&alarm, exp, type); +} + +/** + * alarm_timer_nsleep - alarmtimer nanosleep + * @which_clock: clockid + * @flags: determins abstime or relative + * @tsreq: requested sleep time (abs or rel) + * @rmtp: remaining sleep time saved + * + * Handles clock_nanosleep calls against _ALARM clockids + */ +static int alarm_timer_nsleep(const clockid_t which_clock, int flags, + const struct timespec64 *tsreq) +{ + enum alarmtimer_type type = clock2alarm(which_clock); + struct restart_block *restart = ¤t->restart_block; + struct alarm alarm; + ktime_t exp; + int ret = 0; + + if (!alarmtimer_get_rtcdev()) + return -EOPNOTSUPP; + + if (flags & ~TIMER_ABSTIME) + return -EINVAL; + + if (!capable(CAP_WAKE_ALARM)) + return -EPERM; + + alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup); + + exp = timespec64_to_ktime(*tsreq); + /* Convert (if necessary) to absolute time */ + if (flags != TIMER_ABSTIME) { + ktime_t now = alarm_bases[type].gettime(); + + exp = ktime_add_safe(now, exp); + } + + ret = alarmtimer_do_nsleep(&alarm, exp, type); + if (ret != -ERESTART_RESTARTBLOCK) + return ret; + + /* abs timers don't set remaining time or restart */ + if (flags == TIMER_ABSTIME) + return -ERESTARTNOHAND; + + restart->nanosleep.clockid = type; + restart->nanosleep.expires = exp; + set_restart_fn(restart, alarm_timer_nsleep_restart); + return ret; +} + +const struct k_clock alarm_clock = { + .clock_getres = alarm_clock_getres, + .clock_get = alarm_clock_get, + .timer_create = alarm_timer_create, + .timer_set = common_timer_set, + .timer_del = common_timer_del, + .timer_get = common_timer_get, + .timer_arm = alarm_timer_arm, + .timer_rearm = alarm_timer_rearm, + .timer_forward = alarm_timer_forward, + .timer_remaining = alarm_timer_remaining, + .timer_try_to_cancel = alarm_timer_try_to_cancel, + .nsleep = alarm_timer_nsleep, +}; +#endif /* CONFIG_POSIX_TIMERS */ + + +/* Suspend hook structures */ +static const struct dev_pm_ops alarmtimer_pm_ops = { + .suspend = alarmtimer_suspend, + .resume = alarmtimer_resume, +}; + +static struct platform_driver alarmtimer_driver = { + .driver = { + .name = "alarmtimer", + .pm = &alarmtimer_pm_ops, + } +}; + +/** + * alarmtimer_init - Initialize alarm timer code + * + * This function initializes the alarm bases and registers + * the posix clock ids. + */ +static int __init alarmtimer_init(void) +{ + struct platform_device *pdev; + int error = 0; + int i; + + alarmtimer_rtc_timer_init(); + + /* Initialize alarm bases */ + alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; + alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; + alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; + alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime; + for (i = 0; i < ALARM_NUMTYPE; i++) { + timerqueue_init_head(&alarm_bases[i].timerqueue); + spin_lock_init(&alarm_bases[i].lock); + } + + error = alarmtimer_rtc_interface_setup(); + if (error) + return error; + + error = platform_driver_register(&alarmtimer_driver); + if (error) + goto out_if; + + pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0); + if (IS_ERR(pdev)) { + error = PTR_ERR(pdev); + goto out_drv; + } + return 0; + +out_drv: + platform_driver_unregister(&alarmtimer_driver); +out_if: + alarmtimer_rtc_interface_remove(); + return error; +} +device_initcall(alarmtimer_init); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c new file mode 100644 index 000000000..8c0e4092f --- /dev/null +++ b/kernel/time/clockevents.c @@ -0,0 +1,774 @@ +/* + * linux/kernel/time/clockevents.c + * + * This file contains functions which manage clock event devices. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ + +#include <linux/clockchips.h> +#include <linux/hrtimer.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/device.h> + +#include "tick-internal.h" + +/* The registered clock event devices */ +static LIST_HEAD(clockevent_devices); +static LIST_HEAD(clockevents_released); +/* Protection for the above */ +static DEFINE_RAW_SPINLOCK(clockevents_lock); +/* Protection for unbind operations */ +static DEFINE_MUTEX(clockevents_mutex); + +struct ce_unbind { + struct clock_event_device *ce; + int res; +}; + +static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, + bool ismax) +{ + u64 clc = (u64) latch << evt->shift; + u64 rnd; + + if (unlikely(!evt->mult)) { + evt->mult = 1; + WARN_ON(1); + } + rnd = (u64) evt->mult - 1; + + /* + * Upper bound sanity check. If the backwards conversion is + * not equal latch, we know that the above shift overflowed. + */ + if ((clc >> evt->shift) != (u64)latch) + clc = ~0ULL; + + /* + * Scaled math oddities: + * + * For mult <= (1 << shift) we can safely add mult - 1 to + * prevent integer rounding loss. So the backwards conversion + * from nsec to device ticks will be correct. + * + * For mult > (1 << shift), i.e. device frequency is > 1GHz we + * need to be careful. Adding mult - 1 will result in a value + * which when converted back to device ticks can be larger + * than latch by up to (mult - 1) >> shift. For the min_delta + * calculation we still want to apply this in order to stay + * above the minimum device ticks limit. For the upper limit + * we would end up with a latch value larger than the upper + * limit of the device, so we omit the add to stay below the + * device upper boundary. + * + * Also omit the add if it would overflow the u64 boundary. + */ + if ((~0ULL - clc > rnd) && + (!ismax || evt->mult <= (1ULL << evt->shift))) + clc += rnd; + + do_div(clc, evt->mult); + + /* Deltas less than 1usec are pointless noise */ + return clc > 1000 ? clc : 1000; +} + +/** + * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds + * @latch: value to convert + * @evt: pointer to clock event device descriptor + * + * Math helper, returns latch value converted to nanoseconds (bound checked) + */ +u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +{ + return cev_delta2ns(latch, evt, false); +} +EXPORT_SYMBOL_GPL(clockevent_delta2ns); + +static int __clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state) +{ + if (dev->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + /* Transition with new state-specific callbacks */ + switch (state) { + case CLOCK_EVT_STATE_DETACHED: + /* The clockevent device is getting replaced. Shut it down. */ + + case CLOCK_EVT_STATE_SHUTDOWN: + if (dev->set_state_shutdown) + return dev->set_state_shutdown(dev); + return 0; + + case CLOCK_EVT_STATE_PERIODIC: + /* Core internal bug */ + if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) + return -ENOSYS; + if (dev->set_state_periodic) + return dev->set_state_periodic(dev); + return 0; + + case CLOCK_EVT_STATE_ONESHOT: + /* Core internal bug */ + if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return -ENOSYS; + if (dev->set_state_oneshot) + return dev->set_state_oneshot(dev); + return 0; + + case CLOCK_EVT_STATE_ONESHOT_STOPPED: + /* Core internal bug */ + if (WARN_ONCE(!clockevent_state_oneshot(dev), + "Current state: %d\n", + clockevent_get_state(dev))) + return -EINVAL; + + if (dev->set_state_oneshot_stopped) + return dev->set_state_oneshot_stopped(dev); + else + return -ENOSYS; + + default: + return -ENOSYS; + } +} + +/** + * clockevents_switch_state - set the operating state of a clock event device + * @dev: device to modify + * @state: new state + * + * Must be called with interrupts disabled ! + */ +void clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state) +{ + if (clockevent_get_state(dev) != state) { + if (__clockevents_switch_state(dev, state)) + return; + + clockevent_set_state(dev, state); + + /* + * A nsec2cyc multiplicator of 0 is invalid and we'd crash + * on it, so fix it up and emit a warning: + */ + if (clockevent_state_oneshot(dev)) { + if (unlikely(!dev->mult)) { + dev->mult = 1; + WARN_ON(1); + } + } + } +} + +/** + * clockevents_shutdown - shutdown the device and clear next_event + * @dev: device to shutdown + */ +void clockevents_shutdown(struct clock_event_device *dev) +{ + clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); + dev->next_event = KTIME_MAX; +} + +/** + * clockevents_tick_resume - Resume the tick device before using it again + * @dev: device to resume + */ +int clockevents_tick_resume(struct clock_event_device *dev) +{ + int ret = 0; + + if (dev->tick_resume) + ret = dev->tick_resume(dev); + + return ret; +} + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + +/* Limit min_delta to a jiffie */ +#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) + +/** + * clockevents_increase_min_delta - raise minimum delta of a clock event device + * @dev: device to increase the minimum delta + * + * Returns 0 on success, -ETIME when the minimum delta reached the limit. + */ +static int clockevents_increase_min_delta(struct clock_event_device *dev) +{ + /* Nothing to do if we already reached the limit */ + if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { + printk_deferred(KERN_WARNING + "CE: Reprogramming failure. Giving up\n"); + dev->next_event = KTIME_MAX; + return -ETIME; + } + + if (dev->min_delta_ns < 5000) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns += dev->min_delta_ns >> 1; + + if (dev->min_delta_ns > MIN_DELTA_LIMIT) + dev->min_delta_ns = MIN_DELTA_LIMIT; + + printk_deferred(KERN_WARNING + "CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? dev->name : "?", + (unsigned long long) dev->min_delta_ns); + return 0; +} + +/** + * clockevents_program_min_delta - Set clock event device to the minimum delay. + * @dev: device to program + * + * Returns 0 on success, -ETIME when the retry loop failed. + */ +static int clockevents_program_min_delta(struct clock_event_device *dev) +{ + unsigned long long clc; + int64_t delta; + int i; + + for (i = 0;;) { + delta = dev->min_delta_ns; + dev->next_event = ktime_add_ns(ktime_get(), delta); + + if (clockevent_state_shutdown(dev)) + return 0; + + dev->retries++; + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + if (dev->set_next_event((unsigned long) clc, dev) == 0) + return 0; + + if (++i > 2) { + /* + * We tried 3 times to program the device with the + * given min_delta_ns. Try to increase the minimum + * delta, if that fails as well get out of here. + */ + if (clockevents_increase_min_delta(dev)) + return -ETIME; + i = 0; + } + } +} + +#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ + +/** + * clockevents_program_min_delta - Set clock event device to the minimum delay. + * @dev: device to program + * + * Returns 0 on success, -ETIME when the retry loop failed. + */ +static int clockevents_program_min_delta(struct clock_event_device *dev) +{ + unsigned long long clc; + int64_t delta = 0; + int i; + + for (i = 0; i < 10; i++) { + delta += dev->min_delta_ns; + dev->next_event = ktime_add_ns(ktime_get(), delta); + + if (clockevent_state_shutdown(dev)) + return 0; + + dev->retries++; + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + if (dev->set_next_event((unsigned long) clc, dev) == 0) + return 0; + } + return -ETIME; +} + +#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ + +/** + * clockevents_program_event - Reprogram the clock event device. + * @dev: device to program + * @expires: absolute expiry time (monotonic clock) + * @force: program minimum delay if expires can not be set + * + * Returns 0 on success, -ETIME when the event is in the past. + */ +int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, + bool force) +{ + unsigned long long clc; + int64_t delta; + int rc; + + if (unlikely(expires < 0)) { + WARN_ON_ONCE(1); + return -ETIME; + } + + dev->next_event = expires; + + if (clockevent_state_shutdown(dev)) + return 0; + + /* We must be in ONESHOT state here */ + WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", + clockevent_get_state(dev)); + + /* Shortcut for clockevent devices that can deal with ktime. */ + if (dev->features & CLOCK_EVT_FEAT_KTIME) + return dev->set_next_ktime(expires, dev); + + delta = ktime_to_ns(ktime_sub(expires, ktime_get())); + if (delta <= 0) + return force ? clockevents_program_min_delta(dev) : -ETIME; + + delta = min(delta, (int64_t) dev->max_delta_ns); + delta = max(delta, (int64_t) dev->min_delta_ns); + + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + rc = dev->set_next_event((unsigned long) clc, dev); + + return (rc && force) ? clockevents_program_min_delta(dev) : rc; +} + +/* + * Called after a notify add to make devices available which were + * released from the notifier call. + */ +static void clockevents_notify_released(void) +{ + struct clock_event_device *dev; + + while (!list_empty(&clockevents_released)) { + dev = list_entry(clockevents_released.next, + struct clock_event_device, list); + list_del(&dev->list); + list_add(&dev->list, &clockevent_devices); + tick_check_new_device(dev); + } +} + +/* + * Try to install a replacement clock event device + */ +static int clockevents_replace(struct clock_event_device *ced) +{ + struct clock_event_device *dev, *newdev = NULL; + + list_for_each_entry(dev, &clockevent_devices, list) { + if (dev == ced || !clockevent_state_detached(dev)) + continue; + + if (!tick_check_replacement(newdev, dev)) + continue; + + if (!try_module_get(dev->owner)) + continue; + + if (newdev) + module_put(newdev->owner); + newdev = dev; + } + if (newdev) { + tick_install_replacement(newdev); + list_del_init(&ced->list); + } + return newdev ? 0 : -EBUSY; +} + +/* + * Called with clockevents_mutex and clockevents_lock held + */ +static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) +{ + /* Fast track. Device is unused */ + if (clockevent_state_detached(ced)) { + list_del_init(&ced->list); + return 0; + } + + return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; +} + +/* + * SMP function call to unbind a device + */ +static void __clockevents_unbind(void *arg) +{ + struct ce_unbind *cu = arg; + int res; + + raw_spin_lock(&clockevents_lock); + res = __clockevents_try_unbind(cu->ce, smp_processor_id()); + if (res == -EAGAIN) + res = clockevents_replace(cu->ce); + cu->res = res; + raw_spin_unlock(&clockevents_lock); +} + +/* + * Issues smp function call to unbind a per cpu device. Called with + * clockevents_mutex held. + */ +static int clockevents_unbind(struct clock_event_device *ced, int cpu) +{ + struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; + + smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); + return cu.res; +} + +/* + * Unbind a clockevents device. + */ +int clockevents_unbind_device(struct clock_event_device *ced, int cpu) +{ + int ret; + + mutex_lock(&clockevents_mutex); + ret = clockevents_unbind(ced, cpu); + mutex_unlock(&clockevents_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(clockevents_unbind_device); + +/** + * clockevents_register_device - register a clock event device + * @dev: device to register + */ +void clockevents_register_device(struct clock_event_device *dev) +{ + unsigned long flags; + + /* Initialize state to DETACHED */ + clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); + + if (!dev->cpumask) { + WARN_ON(num_possible_cpus() > 1); + dev->cpumask = cpumask_of(smp_processor_id()); + } + + if (dev->cpumask == cpu_all_mask) { + WARN(1, "%s cpumask == cpu_all_mask, using cpu_possible_mask instead\n", + dev->name); + dev->cpumask = cpu_possible_mask; + } + + raw_spin_lock_irqsave(&clockevents_lock, flags); + + list_add(&dev->list, &clockevent_devices); + tick_check_new_device(dev); + clockevents_notify_released(); + + raw_spin_unlock_irqrestore(&clockevents_lock, flags); +} +EXPORT_SYMBOL_GPL(clockevents_register_device); + +static void clockevents_config(struct clock_event_device *dev, u32 freq) +{ + u64 sec; + + if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return; + + /* + * Calculate the maximum number of seconds we can sleep. Limit + * to 10 minutes for hardware which can program more than + * 32bit ticks so we still get reasonable conversion values. + */ + sec = dev->max_delta_ticks; + do_div(sec, freq); + if (!sec) + sec = 1; + else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) + sec = 600; + + clockevents_calc_mult_shift(dev, freq, sec); + dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); + dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); +} + +/** + * clockevents_config_and_register - Configure and register a clock event device + * @dev: device to register + * @freq: The clock frequency + * @min_delta: The minimum clock ticks to program in oneshot mode + * @max_delta: The maximum clock ticks to program in oneshot mode + * + * min/max_delta can be 0 for devices which do not support oneshot mode. + */ +void clockevents_config_and_register(struct clock_event_device *dev, + u32 freq, unsigned long min_delta, + unsigned long max_delta) +{ + dev->min_delta_ticks = min_delta; + dev->max_delta_ticks = max_delta; + clockevents_config(dev, freq); + clockevents_register_device(dev); +} +EXPORT_SYMBOL_GPL(clockevents_config_and_register); + +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ + clockevents_config(dev, freq); + + if (clockevent_state_oneshot(dev)) + return clockevents_program_event(dev, dev->next_event, false); + + if (clockevent_state_periodic(dev)) + return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); + + return 0; +} + +/** + * clockevents_update_freq - Update frequency and reprogram a clock event device. + * @dev: device to modify + * @freq: new device frequency + * + * Reconfigure and reprogram a clock event device in oneshot + * mode. Must be called on the cpu for which the device delivers per + * cpu timer events. If called for the broadcast device the core takes + * care of serialization. + * + * Returns 0 on success, -ETIME when the event is in the past. + */ +int clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = tick_broadcast_update_freq(dev, freq); + if (ret == -ENODEV) + ret = __clockevents_update_freq(dev, freq); + local_irq_restore(flags); + return ret; +} + +/* + * Noop handler when we shut down an event device + */ +void clockevents_handle_noop(struct clock_event_device *dev) +{ +} + +/** + * clockevents_exchange_device - release and request clock devices + * @old: device to release (can be NULL) + * @new: device to request (can be NULL) + * + * Called from various tick functions with clockevents_lock held and + * interrupts disabled. + */ +void clockevents_exchange_device(struct clock_event_device *old, + struct clock_event_device *new) +{ + /* + * Caller releases a clock event device. We queue it into the + * released list and do a notify add later. + */ + if (old) { + module_put(old->owner); + clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED); + list_del(&old->list); + list_add(&old->list, &clockevents_released); + } + + if (new) { + BUG_ON(!clockevent_state_detached(new)); + clockevents_shutdown(new); + } +} + +/** + * clockevents_suspend - suspend clock devices + */ +void clockevents_suspend(void) +{ + struct clock_event_device *dev; + + list_for_each_entry_reverse(dev, &clockevent_devices, list) + if (dev->suspend && !clockevent_state_detached(dev)) + dev->suspend(dev); +} + +/** + * clockevents_resume - resume clock devices + */ +void clockevents_resume(void) +{ + struct clock_event_device *dev; + + list_for_each_entry(dev, &clockevent_devices, list) + if (dev->resume && !clockevent_state_detached(dev)) + dev->resume(dev); +} + +#ifdef CONFIG_HOTPLUG_CPU +/** + * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu + */ +void tick_cleanup_dead_cpu(int cpu) +{ + struct clock_event_device *dev, *tmp; + unsigned long flags; + + raw_spin_lock_irqsave(&clockevents_lock, flags); + + tick_shutdown_broadcast_oneshot(cpu); + tick_shutdown_broadcast(cpu); + tick_shutdown(cpu); + /* + * Unregister the clock event devices which were + * released from the users in the notify chain. + */ + list_for_each_entry_safe(dev, tmp, &clockevents_released, list) + list_del(&dev->list); + /* + * Now check whether the CPU has left unused per cpu devices + */ + list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { + if (cpumask_test_cpu(cpu, dev->cpumask) && + cpumask_weight(dev->cpumask) == 1 && + !tick_is_broadcast_device(dev)) { + BUG_ON(!clockevent_state_detached(dev)); + list_del(&dev->list); + } + } + raw_spin_unlock_irqrestore(&clockevents_lock, flags); +} +#endif + +#ifdef CONFIG_SYSFS +static struct bus_type clockevents_subsys = { + .name = "clockevents", + .dev_name = "clockevent", +}; + +static DEFINE_PER_CPU(struct device, tick_percpu_dev); +static struct tick_device *tick_get_tick_dev(struct device *dev); + +static ssize_t sysfs_show_current_tick_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct tick_device *td; + ssize_t count = 0; + + raw_spin_lock_irq(&clockevents_lock); + td = tick_get_tick_dev(dev); + if (td && td->evtdev) + count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); + raw_spin_unlock_irq(&clockevents_lock); + return count; +} +static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); + +/* We don't support the abomination of removable broadcast devices */ +static ssize_t sysfs_unbind_tick_dev(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + char name[CS_NAME_LEN]; + ssize_t ret = sysfs_get_uname(buf, name, count); + struct clock_event_device *ce; + + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clockevents_mutex); + raw_spin_lock_irq(&clockevents_lock); + list_for_each_entry(ce, &clockevent_devices, list) { + if (!strcmp(ce->name, name)) { + ret = __clockevents_try_unbind(ce, dev->id); + break; + } + } + raw_spin_unlock_irq(&clockevents_lock); + /* + * We hold clockevents_mutex, so ce can't go away + */ + if (ret == -EAGAIN) + ret = clockevents_unbind(ce, dev->id); + mutex_unlock(&clockevents_mutex); + return ret ? ret : count; +} +static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +static struct device tick_bc_dev = { + .init_name = "broadcast", + .id = 0, + .bus = &clockevents_subsys, +}; + +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return dev == &tick_bc_dev ? tick_get_broadcast_device() : + &per_cpu(tick_cpu_device, dev->id); +} + +static __init int tick_broadcast_init_sysfs(void) +{ + int err = device_register(&tick_bc_dev); + + if (!err) + err = device_create_file(&tick_bc_dev, &dev_attr_current_device); + return err; +} +#else +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return &per_cpu(tick_cpu_device, dev->id); +} +static inline int tick_broadcast_init_sysfs(void) { return 0; } +#endif + +static int __init tick_init_sysfs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct device *dev = &per_cpu(tick_percpu_dev, cpu); + int err; + + dev->id = cpu; + dev->bus = &clockevents_subsys; + err = device_register(dev); + if (!err) + err = device_create_file(dev, &dev_attr_current_device); + if (!err) + err = device_create_file(dev, &dev_attr_unbind_device); + if (err) + return err; + } + return tick_broadcast_init_sysfs(); +} + +static int __init clockevents_init_sysfs(void) +{ + int err = subsys_system_register(&clockevents_subsys, NULL); + + if (!err) + err = tick_init_sysfs(); + return err; +} +device_initcall(clockevents_init_sysfs); +#endif /* SYSFS */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c new file mode 100644 index 000000000..221f8e746 --- /dev/null +++ b/kernel/time/clocksource.c @@ -0,0 +1,1294 @@ +/* + * linux/kernel/time/clocksource.c + * + * This file contains the functions which manage clocksource drivers. + * + * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * TODO WishList: + * o Allow clocksource drivers to be unregistered + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/device.h> +#include <linux/clocksource.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ +#include <linux/tick.h> +#include <linux/kthread.h> + +#include "tick-internal.h" +#include "timekeeping_internal.h" + +/** + * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks + * @mult: pointer to mult variable + * @shift: pointer to shift variable + * @from: frequency to convert from + * @to: frequency to convert to + * @maxsec: guaranteed runtime conversion range in seconds + * + * The function evaluates the shift/mult pair for the scaled math + * operations of clocksources and clockevents. + * + * @to and @from are frequency values in HZ. For clock sources @to is + * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock + * event @to is the counter frequency and @from is NSEC_PER_SEC. + * + * The @maxsec conversion range argument controls the time frame in + * seconds which must be covered by the runtime conversion with the + * calculated mult and shift factors. This guarantees that no 64bit + * overflow happens when the input value of the conversion is + * multiplied with the calculated mult factor. Larger ranges may + * reduce the conversion accuracy by chosing smaller mult and shift + * factors. + */ +void +clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) +{ + u64 tmp; + u32 sft, sftacc= 32; + + /* + * Calculate the shift factor which is limiting the conversion + * range: + */ + tmp = ((u64)maxsec * from) >> 32; + while (tmp) { + tmp >>=1; + sftacc--; + } + + /* + * Find the conversion shift/mult pair which has the best + * accuracy and fits the maxsec conversion range: + */ + for (sft = 32; sft > 0; sft--) { + tmp = (u64) to << sft; + tmp += from / 2; + do_div(tmp, from); + if ((tmp >> sftacc) == 0) + break; + } + *mult = tmp; + *shift = sft; +} +EXPORT_SYMBOL_GPL(clocks_calc_mult_shift); + +/*[Clocksource internal variables]--------- + * curr_clocksource: + * currently selected clocksource. + * suspend_clocksource: + * used to calculate the suspend time. + * clocksource_list: + * linked list with the registered clocksources + * clocksource_mutex: + * protects manipulations to curr_clocksource and the clocksource_list + * override_name: + * Name of the user-specified clocksource. + */ +static struct clocksource *curr_clocksource; +static struct clocksource *suspend_clocksource; +static LIST_HEAD(clocksource_list); +static DEFINE_MUTEX(clocksource_mutex); +static char override_name[CS_NAME_LEN]; +static int finished_booting; +static u64 suspend_start; + +#ifdef CONFIG_CLOCKSOURCE_WATCHDOG +static void clocksource_watchdog_work(struct work_struct *work); +static void clocksource_select(void); + +static LIST_HEAD(watchdog_list); +static struct clocksource *watchdog; +static struct timer_list watchdog_timer; +static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); +static DEFINE_SPINLOCK(watchdog_lock); +static int watchdog_running; +static atomic_t watchdog_reset_pending; + +static void inline clocksource_watchdog_lock(unsigned long *flags) +{ + spin_lock_irqsave(&watchdog_lock, *flags); +} + +static void inline clocksource_watchdog_unlock(unsigned long *flags) +{ + spin_unlock_irqrestore(&watchdog_lock, *flags); +} + +static int clocksource_watchdog_kthread(void *data); +static void __clocksource_change_rating(struct clocksource *cs, int rating); + +/* + * Interval: 0.5sec Threshold: 0.0625s + */ +#define WATCHDOG_INTERVAL (HZ >> 1) +#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) + +/* + * Maximum permissible delay between two readouts of the watchdog + * clocksource surrounding a read of the clocksource being validated. + * This delay could be due to SMIs, NMIs, or to VCPU preemptions. + */ +#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC) + +static void clocksource_watchdog_work(struct work_struct *work) +{ + /* + * We cannot directly run clocksource_watchdog_kthread() here, because + * clocksource_select() calls timekeeping_notify() which uses + * stop_machine(). One cannot use stop_machine() from a workqueue() due + * lock inversions wrt CPU hotplug. + * + * Also, we only ever run this work once or twice during the lifetime + * of the kernel, so there is no point in creating a more permanent + * kthread for this. + * + * If kthread_run fails the next watchdog scan over the + * watchdog_list will find the unstable clock again. + */ + kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); +} + +static void __clocksource_unstable(struct clocksource *cs) +{ + cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); + cs->flags |= CLOCK_SOURCE_UNSTABLE; + + /* + * If the clocksource is registered clocksource_watchdog_kthread() will + * re-rate and re-select. + */ + if (list_empty(&cs->list)) { + cs->rating = 0; + return; + } + + if (cs->mark_unstable) + cs->mark_unstable(cs); + + /* kick clocksource_watchdog_kthread() */ + if (finished_booting) + schedule_work(&watchdog_work); +} + +/** + * clocksource_mark_unstable - mark clocksource unstable via watchdog + * @cs: clocksource to be marked unstable + * + * This function is called by the x86 TSC code to mark clocksources as unstable; + * it defers demotion and re-selection to a kthread. + */ +void clocksource_mark_unstable(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { + if (!list_empty(&cs->list) && list_empty(&cs->wd_list)) + list_add(&cs->wd_list, &watchdog_list); + __clocksource_unstable(cs); + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static ulong max_cswd_read_retries = 3; +module_param(max_cswd_read_retries, ulong, 0644); + +static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) +{ + unsigned int nretries; + u64 wd_end, wd_delta; + int64_t wd_delay; + + for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) { + local_irq_disable(); + *wdnow = watchdog->read(watchdog); + *csnow = cs->read(cs); + wd_end = watchdog->read(watchdog); + local_irq_enable(); + + wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask); + wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, + watchdog->shift); + if (wd_delay <= WATCHDOG_MAX_SKEW) { + if (nretries > 1 || nretries >= max_cswd_read_retries) { + pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", + smp_processor_id(), watchdog->name, nretries); + } + return true; + } + } + + pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n", + smp_processor_id(), watchdog->name, wd_delay, nretries); + return false; +} + +static void clocksource_watchdog(struct timer_list *unused) +{ + u64 csnow, wdnow, cslast, wdlast, delta; + int next_cpu, reset_pending; + int64_t wd_nsec, cs_nsec; + struct clocksource *cs; + + spin_lock(&watchdog_lock); + if (!watchdog_running) + goto out; + + reset_pending = atomic_read(&watchdog_reset_pending); + + list_for_each_entry(cs, &watchdog_list, wd_list) { + + /* Clocksource already marked unstable? */ + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + if (finished_booting) + schedule_work(&watchdog_work); + continue; + } + + if (!cs_watchdog_read(cs, &csnow, &wdnow)) { + /* Clock readout unreliable, so give it up. */ + __clocksource_unstable(cs); + continue; + } + + /* Clocksource initialized ? */ + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || + atomic_read(&watchdog_reset_pending)) { + cs->flags |= CLOCK_SOURCE_WATCHDOG; + cs->wd_last = wdnow; + cs->cs_last = csnow; + continue; + } + + delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask); + wd_nsec = clocksource_cyc2ns(delta, watchdog->mult, + watchdog->shift); + + delta = clocksource_delta(csnow, cs->cs_last, cs->mask); + cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); + wdlast = cs->wd_last; /* save these in case we print them */ + cslast = cs->cs_last; + cs->cs_last = csnow; + cs->wd_last = wdnow; + + if (atomic_read(&watchdog_reset_pending)) + continue; + + /* Check the deviation from the watchdog clocksource. */ + if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", + smp_processor_id(), cs->name); + pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", + watchdog->name, wdnow, wdlast, watchdog->mask); + pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", + cs->name, csnow, cslast, cs->mask); + __clocksource_unstable(cs); + continue; + } + + if (cs == curr_clocksource && cs->tick_stable) + cs->tick_stable(cs); + + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && + (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && + (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + /* Mark it valid for high-res. */ + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + + /* + * clocksource_done_booting() will sort it if + * finished_booting is not set yet. + */ + if (!finished_booting) + continue; + + /* + * If this is not the current clocksource let + * the watchdog thread reselect it. Due to the + * change to high res this clocksource might + * be preferred now. If it is the current + * clocksource let the tick code know about + * that change. + */ + if (cs != curr_clocksource) { + cs->flags |= CLOCK_SOURCE_RESELECT; + schedule_work(&watchdog_work); + } else { + tick_clock_notify(); + } + } + } + + /* + * We only clear the watchdog_reset_pending, when we did a + * full cycle through all clocksources. + */ + if (reset_pending) + atomic_dec(&watchdog_reset_pending); + + /* + * Cycle through CPUs to check if the CPUs stay synchronized + * to each other. + */ + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + + /* + * Arm timer if not already pending: could race with concurrent + * pair clocksource_stop_watchdog() clocksource_start_watchdog(). + */ + if (!timer_pending(&watchdog_timer)) { + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, next_cpu); + } +out: + spin_unlock(&watchdog_lock); +} + +static inline void clocksource_start_watchdog(void) +{ + if (watchdog_running || !watchdog || list_empty(&watchdog_list)) + return; + timer_setup(&watchdog_timer, clocksource_watchdog, 0); + watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); + watchdog_running = 1; +} + +static inline void clocksource_stop_watchdog(void) +{ + if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) + return; + del_timer(&watchdog_timer); + watchdog_running = 0; +} + +static inline void clocksource_reset_watchdog(void) +{ + struct clocksource *cs; + + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + +static void clocksource_resume_watchdog(void) +{ + atomic_inc(&watchdog_reset_pending); +} + +static void clocksource_enqueue_watchdog(struct clocksource *cs) +{ + INIT_LIST_HEAD(&cs->wd_list); + + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a clocksource to be watched. */ + list_add(&cs->wd_list, &watchdog_list); + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; + } else { + /* cs is a watchdog. */ + if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + } +} + +static void clocksource_select_watchdog(bool fallback) +{ + struct clocksource *cs, *old_wd; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + /* save current watchdog */ + old_wd = watchdog; + if (fallback) + watchdog = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* cs is a clocksource to be watched. */ + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + + /* Skip current if we were requested for a fallback. */ + if (fallback && cs == old_wd) + continue; + + /* Pick the best watchdog. */ + if (!watchdog || cs->rating > watchdog->rating) + watchdog = cs; + } + /* If we failed to find a fallback restore the old one. */ + if (!watchdog) + watchdog = old_wd; + + /* If we changed the watchdog we need to reset cycles. */ + if (watchdog != old_wd) + clocksource_reset_watchdog(); + + /* Check if the watchdog timer needs to be started. */ + clocksource_start_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_dequeue_watchdog(struct clocksource *cs) +{ + if (cs != watchdog) { + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a watched clocksource. */ + list_del_init(&cs->wd_list); + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); + } + } +} + +static int __clocksource_watchdog_kthread(void) +{ + struct clocksource *cs, *tmp; + unsigned long flags; + int select = 0; + + spin_lock_irqsave(&watchdog_lock, flags); + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + list_del_init(&cs->wd_list); + __clocksource_change_rating(cs, 0); + select = 1; + } + if (cs->flags & CLOCK_SOURCE_RESELECT) { + cs->flags &= ~CLOCK_SOURCE_RESELECT; + select = 1; + } + } + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); + + return select; +} + +static int clocksource_watchdog_kthread(void *data) +{ + mutex_lock(&clocksource_mutex); + if (__clocksource_watchdog_kthread()) + clocksource_select(); + mutex_unlock(&clocksource_mutex); + return 0; +} + +static bool clocksource_is_watchdog(struct clocksource *cs) +{ + return cs == watchdog; +} + +#else /* CONFIG_CLOCKSOURCE_WATCHDOG */ + +static void clocksource_enqueue_watchdog(struct clocksource *cs) +{ + if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; +} + +static void clocksource_select_watchdog(bool fallback) { } +static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } +static inline void clocksource_resume_watchdog(void) { } +static inline int __clocksource_watchdog_kthread(void) { return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } +void clocksource_mark_unstable(struct clocksource *cs) { } + +static inline void clocksource_watchdog_lock(unsigned long *flags) { } +static inline void clocksource_watchdog_unlock(unsigned long *flags) { } + +#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ + +static bool clocksource_is_suspend(struct clocksource *cs) +{ + return cs == suspend_clocksource; +} + +static void __clocksource_suspend_select(struct clocksource *cs) +{ + /* + * Skip the clocksource which will be stopped in suspend state. + */ + if (!(cs->flags & CLOCK_SOURCE_SUSPEND_NONSTOP)) + return; + + /* + * The nonstop clocksource can be selected as the suspend clocksource to + * calculate the suspend time, so it should not supply suspend/resume + * interfaces to suspend the nonstop clocksource when system suspends. + */ + if (cs->suspend || cs->resume) { + pr_warn("Nonstop clocksource %s should not supply suspend/resume interfaces\n", + cs->name); + } + + /* Pick the best rating. */ + if (!suspend_clocksource || cs->rating > suspend_clocksource->rating) + suspend_clocksource = cs; +} + +/** + * clocksource_suspend_select - Select the best clocksource for suspend timing + * @fallback: if select a fallback clocksource + */ +static void clocksource_suspend_select(bool fallback) +{ + struct clocksource *cs, *old_suspend; + + old_suspend = suspend_clocksource; + if (fallback) + suspend_clocksource = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* Skip current if we were requested for a fallback. */ + if (fallback && cs == old_suspend) + continue; + + __clocksource_suspend_select(cs); + } +} + +/** + * clocksource_start_suspend_timing - Start measuring the suspend timing + * @cs: current clocksource from timekeeping + * @start_cycles: current cycles from timekeeping + * + * This function will save the start cycle values of suspend timer to calculate + * the suspend time when resuming system. + * + * This function is called late in the suspend process from timekeeping_suspend(), + * that means processes are freezed, non-boot cpus and interrupts are disabled + * now. It is therefore possible to start the suspend timer without taking the + * clocksource mutex. + */ +void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles) +{ + if (!suspend_clocksource) + return; + + /* + * If current clocksource is the suspend timer, we should use the + * tkr_mono.cycle_last value as suspend_start to avoid same reading + * from suspend timer. + */ + if (clocksource_is_suspend(cs)) { + suspend_start = start_cycles; + return; + } + + if (suspend_clocksource->enable && + suspend_clocksource->enable(suspend_clocksource)) { + pr_warn_once("Failed to enable the non-suspend-able clocksource.\n"); + return; + } + + suspend_start = suspend_clocksource->read(suspend_clocksource); +} + +/** + * clocksource_stop_suspend_timing - Stop measuring the suspend timing + * @cs: current clocksource from timekeeping + * @cycle_now: current cycles from timekeeping + * + * This function will calculate the suspend time from suspend timer. + * + * Returns nanoseconds since suspend started, 0 if no usable suspend clocksource. + * + * This function is called early in the resume process from timekeeping_resume(), + * that means there is only one cpu, no processes are running and the interrupts + * are disabled. It is therefore possible to stop the suspend timer without + * taking the clocksource mutex. + */ +u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now) +{ + u64 now, delta, nsec = 0; + + if (!suspend_clocksource) + return 0; + + /* + * If current clocksource is the suspend timer, we should use the + * tkr_mono.cycle_last value from timekeeping as current cycle to + * avoid same reading from suspend timer. + */ + if (clocksource_is_suspend(cs)) + now = cycle_now; + else + now = suspend_clocksource->read(suspend_clocksource); + + if (now > suspend_start) { + delta = clocksource_delta(now, suspend_start, + suspend_clocksource->mask); + nsec = mul_u64_u32_shr(delta, suspend_clocksource->mult, + suspend_clocksource->shift); + } + + /* + * Disable the suspend timer to save power if current clocksource is + * not the suspend timer. + */ + if (!clocksource_is_suspend(cs) && suspend_clocksource->disable) + suspend_clocksource->disable(suspend_clocksource); + + return nsec; +} + +/** + * clocksource_suspend - suspend the clocksource(s) + */ +void clocksource_suspend(void) +{ + struct clocksource *cs; + + list_for_each_entry_reverse(cs, &clocksource_list, list) + if (cs->suspend) + cs->suspend(cs); +} + +/** + * clocksource_resume - resume the clocksource(s) + */ +void clocksource_resume(void) +{ + struct clocksource *cs; + + list_for_each_entry(cs, &clocksource_list, list) + if (cs->resume) + cs->resume(cs); + + clocksource_resume_watchdog(); +} + +/** + * clocksource_touch_watchdog - Update watchdog + * + * Update the watchdog after exception contexts such as kgdb so as not + * to incorrectly trip the watchdog. This might fail when the kernel + * was stopped in code which holds watchdog_lock. + */ +void clocksource_touch_watchdog(void) +{ + clocksource_resume_watchdog(); +} + +/** + * clocksource_max_adjustment- Returns max adjustment amount + * @cs: Pointer to clocksource + * + */ +static u32 clocksource_max_adjustment(struct clocksource *cs) +{ + u64 ret; + /* + * We won't try to correct for more than 11% adjustments (110,000 ppm), + */ + ret = (u64)cs->mult * 11; + do_div(ret,100); + return (u32)ret; +} + +/** + * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + * @maxadj: maximum adjustment value to mult (~11%) + * @mask: bitmask for two's complement subtraction of non 64 bit counters + * @max_cyc: maximum cycle value before potential overflow (does not include + * any safety margin) + * + * NOTE: This function includes a safety margin of 50%, in other words, we + * return half the number of nanoseconds the hardware counter can technically + * cover. This is done so that we can potentially detect problems caused by + * delayed timers or bad hardware, which might result in time intervals that + * are larger than what the math used can handle without overflows. + */ +u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) +{ + u64 max_nsecs, max_cycles; + + /* + * Calculate the maximum number of cycles that we can pass to the + * cyc2ns() function without overflowing a 64-bit result. + */ + max_cycles = ULLONG_MAX; + do_div(max_cycles, mult+maxadj); + + /* + * The actual maximum number of cycles we can defer the clocksource is + * determined by the minimum of max_cycles and mask. + * Note: Here we subtract the maxadj to make sure we don't sleep for + * too long if there's a large negative adjustment. + */ + max_cycles = min(max_cycles, mask); + max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + + /* return the max_cycles value as well if requested */ + if (max_cyc) + *max_cyc = max_cycles; + + /* Return 50% of the actual maximum, so we can detect bad values */ + max_nsecs >>= 1; + + return max_nsecs; +} + +/** + * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles + * @cs: Pointer to clocksource to be updated + * + */ +static inline void clocksource_update_max_deferment(struct clocksource *cs) +{ + cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, + cs->maxadj, cs->mask, + &cs->max_cycles); +} + +#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET + +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) +{ + struct clocksource *cs; + + if (!finished_booting || list_empty(&clocksource_list)) + return NULL; + + /* + * We pick the clocksource with the highest rating. If oneshot + * mode is active, we pick the highres valid clocksource with + * the best rating. + */ + list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; + if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + continue; + return cs; + } + return NULL; +} + +static void __clocksource_select(bool skipcur) +{ + bool oneshot = tick_oneshot_mode_active(); + struct clocksource *best, *cs; + + /* Find the best suitable clocksource */ + best = clocksource_find_best(oneshot, skipcur); + if (!best) + return; + + if (!strlen(override_name)) + goto found; + + /* Check for the override clocksource. */ + list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; + if (strcmp(cs->name, override_name) != 0) + continue; + /* + * Check to make sure we don't switch to a non-highres + * capable clocksource if the tick code is in oneshot + * mode (highres or nohz) + */ + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { + /* Override clocksource cannot be used. */ + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n", + cs->name); + override_name[0] = 0; + } else { + /* + * The override cannot be currently verified. + * Deferring to let the watchdog check. + */ + pr_info("Override clocksource %s is not currently HRT compatible - deferring\n", + cs->name); + } + } else + /* Override clocksource can be used. */ + best = cs; + break; + } + +found: + if (curr_clocksource != best && !timekeeping_notify(best)) { + pr_info("Switched to clocksource %s\n", best->name); + curr_clocksource = best; + } +} + +/** + * clocksource_select - Select the best clocksource available + * + * Private function. Must hold clocksource_mutex when called. + * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. + */ +static void clocksource_select(void) +{ + __clocksource_select(false); +} + +static void clocksource_select_fallback(void) +{ + __clocksource_select(true); +} + +#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ +static inline void clocksource_select(void) { } +static inline void clocksource_select_fallback(void) { } + +#endif + +/* + * clocksource_done_booting - Called near the end of core bootup + * + * Hack to avoid lots of clocksource churn at boot time. + * We use fs_initcall because we want this to start before + * device_initcall but after subsys_initcall. + */ +static int __init clocksource_done_booting(void) +{ + mutex_lock(&clocksource_mutex); + curr_clocksource = clocksource_default_clock(); + finished_booting = 1; + /* + * Run the watchdog first to eliminate unstable clock sources + */ + __clocksource_watchdog_kthread(); + clocksource_select(); + mutex_unlock(&clocksource_mutex); + return 0; +} +fs_initcall(clocksource_done_booting); + +/* + * Enqueue the clocksource sorted by rating + */ +static void clocksource_enqueue(struct clocksource *cs) +{ + struct list_head *entry = &clocksource_list; + struct clocksource *tmp; + + list_for_each_entry(tmp, &clocksource_list, list) { + /* Keep track of the place, where to insert */ + if (tmp->rating < cs->rating) + break; + entry = &tmp->list; + } + list_add(&cs->list, entry); +} + +/** + * __clocksource_update_freq_scale - Used update clocksource with new freq + * @cs: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * This should only be called from the clocksource->enable() method. + * + * This *SHOULD NOT* be called directly! Please use the + * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper + * functions. + */ +void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + u64 sec; + + /* + * Default clocksources are *special* and self-define their mult/shift. + * But, you're not special, so you should specify a freq value. + */ + if (freq) { + /* + * Calc the maximum number of seconds which we can run before + * wrapping around. For clocksources which have a mask > 32-bit + * we need to limit the max sleep time to have a good + * conversion precision. 10 minutes is still a reasonable + * amount. That results in a shift value of 24 for a + * clocksource with mask >= 40-bit and f >= 4GHz. That maps to + * ~ 0.06ppm granularity for NTP. + */ + sec = cs->mask; + do_div(sec, freq); + do_div(sec, scale); + if (!sec) + sec = 1; + else if (sec > 600 && cs->mask > UINT_MAX) + sec = 600; + + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, + NSEC_PER_SEC / scale, sec * scale); + } + /* + * Ensure clocksources that have large 'mult' values don't overflow + * when adjusted. + */ + cs->maxadj = clocksource_max_adjustment(cs); + while (freq && ((cs->mult + cs->maxadj < cs->mult) + || (cs->mult - cs->maxadj > cs->mult))) { + cs->mult >>= 1; + cs->shift--; + cs->maxadj = clocksource_max_adjustment(cs); + } + + /* + * Only warn for *special* clocksources that self-define + * their mult/shift values and don't specify a freq. + */ + WARN_ONCE(cs->mult + cs->maxadj < cs->mult, + "timekeeping: Clocksource %s might overflow on 11%% adjustment\n", + cs->name); + + clocksource_update_max_deferment(cs); + + pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", + cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); +} +EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); + +/** + * __clocksource_register_scale - Used to install new clocksources + * @cs: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * Returns -EBUSY if registration fails, zero otherwise. + * + * This *SHOULD NOT* be called directly! Please use the + * clocksource_register_hz() or clocksource_register_khz helper functions. + */ +int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + unsigned long flags; + + /* Initialize mult/shift and max_idle_ns */ + __clocksource_update_freq_scale(cs, scale, freq); + + /* Add clocksource to the clocksource list */ + mutex_lock(&clocksource_mutex); + + clocksource_watchdog_lock(&flags); + clocksource_enqueue(cs); + clocksource_enqueue_watchdog(cs); + clocksource_watchdog_unlock(&flags); + + clocksource_select(); + clocksource_select_watchdog(false); + __clocksource_suspend_select(cs); + mutex_unlock(&clocksource_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(__clocksource_register_scale); + +static void __clocksource_change_rating(struct clocksource *cs, int rating) +{ + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); +} + +/** + * clocksource_change_rating - Change the rating of a registered clocksource + * @cs: clocksource to be changed + * @rating: new rating + */ +void clocksource_change_rating(struct clocksource *cs, int rating) +{ + unsigned long flags; + + mutex_lock(&clocksource_mutex); + clocksource_watchdog_lock(&flags); + __clocksource_change_rating(cs, rating); + clocksource_watchdog_unlock(&flags); + + clocksource_select(); + clocksource_select_watchdog(false); + clocksource_suspend_select(false); + mutex_unlock(&clocksource_mutex); +} +EXPORT_SYMBOL(clocksource_change_rating); + +/* + * Unbind clocksource @cs. Called with clocksource_mutex held + */ +static int clocksource_unbind(struct clocksource *cs) +{ + unsigned long flags; + + if (clocksource_is_watchdog(cs)) { + /* Select and try to install a replacement watchdog. */ + clocksource_select_watchdog(true); + if (clocksource_is_watchdog(cs)) + return -EBUSY; + } + + if (cs == curr_clocksource) { + /* Select and try to install a replacement clock source */ + clocksource_select_fallback(); + if (curr_clocksource == cs) + return -EBUSY; + } + + if (clocksource_is_suspend(cs)) { + /* + * Select and try to install a replacement suspend clocksource. + * If no replacement suspend clocksource, we will just let the + * clocksource go and have no suspend clocksource. + */ + clocksource_suspend_select(true); + } + + clocksource_watchdog_lock(&flags); + clocksource_dequeue_watchdog(cs); + list_del_init(&cs->list); + clocksource_watchdog_unlock(&flags); + + return 0; +} + +/** + * clocksource_unregister - remove a registered clocksource + * @cs: clocksource to be unregistered + */ +int clocksource_unregister(struct clocksource *cs) +{ + int ret = 0; + + mutex_lock(&clocksource_mutex); + if (!list_empty(&cs->list)) + ret = clocksource_unbind(cs); + mutex_unlock(&clocksource_mutex); + return ret; +} +EXPORT_SYMBOL(clocksource_unregister); + +#ifdef CONFIG_SYSFS +/** + * current_clocksource_show - sysfs interface for current clocksource + * @dev: unused + * @attr: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing current clocksource. + */ +static ssize_t current_clocksource_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + ssize_t count = 0; + + mutex_lock(&clocksource_mutex); + count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); + mutex_unlock(&clocksource_mutex); + + return count; +} + +ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) +{ + size_t ret = cnt; + + /* strings from sysfs write are not 0 terminated! */ + if (!cnt || cnt >= CS_NAME_LEN) + return -EINVAL; + + /* strip of \n: */ + if (buf[cnt-1] == '\n') + cnt--; + if (cnt > 0) + memcpy(dst, buf, cnt); + dst[cnt] = 0; + return ret; +} + +/** + * current_clocksource_store - interface for manually overriding clocksource + * @dev: unused + * @attr: unused + * @buf: name of override clocksource + * @count: length of buffer + * + * Takes input from sysfs interface for manually overriding the default + * clocksource selection. + */ +static ssize_t current_clocksource_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret; + + mutex_lock(&clocksource_mutex); + + ret = sysfs_get_uname(buf, override_name, count); + if (ret >= 0) + clocksource_select(); + + mutex_unlock(&clocksource_mutex); + + return ret; +} +static DEVICE_ATTR_RW(current_clocksource); + +/** + * unbind_clocksource_store - interface for manually unbinding clocksource + * @dev: unused + * @attr: unused + * @buf: unused + * @count: length of buffer + * + * Takes input from sysfs interface for manually unbinding a clocksource. + */ +static ssize_t unbind_clocksource_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct clocksource *cs; + char name[CS_NAME_LEN]; + ssize_t ret; + + ret = sysfs_get_uname(buf, name, count); + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clocksource_mutex); + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, name)) + continue; + ret = clocksource_unbind(cs); + break; + } + mutex_unlock(&clocksource_mutex); + + return ret ? ret : count; +} +static DEVICE_ATTR_WO(unbind_clocksource); + +/** + * available_clocksource_show - sysfs interface for listing clocksource + * @dev: unused + * @attr: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing registered clocksources + */ +static ssize_t available_clocksource_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct clocksource *src; + ssize_t count = 0; + + mutex_lock(&clocksource_mutex); + list_for_each_entry(src, &clocksource_list, list) { + /* + * Don't show non-HRES clocksource if the tick code is + * in one shot mode (highres=on or nohz=on) + */ + if (!tick_oneshot_mode_active() || + (src->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + count += snprintf(buf + count, + max((ssize_t)PAGE_SIZE - count, (ssize_t)0), + "%s ", src->name); + } + mutex_unlock(&clocksource_mutex); + + count += snprintf(buf + count, + max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); + + return count; +} +static DEVICE_ATTR_RO(available_clocksource); + +static struct attribute *clocksource_attrs[] = { + &dev_attr_current_clocksource.attr, + &dev_attr_unbind_clocksource.attr, + &dev_attr_available_clocksource.attr, + NULL +}; +ATTRIBUTE_GROUPS(clocksource); + +static struct bus_type clocksource_subsys = { + .name = "clocksource", + .dev_name = "clocksource", +}; + +static struct device device_clocksource = { + .id = 0, + .bus = &clocksource_subsys, + .groups = clocksource_groups, +}; + +static int __init init_clocksource_sysfs(void) +{ + int error = subsys_system_register(&clocksource_subsys, NULL); + + if (!error) + error = device_register(&device_clocksource); + + return error; +} + +device_initcall(init_clocksource_sysfs); +#endif /* CONFIG_SYSFS */ + +/** + * boot_override_clocksource - boot clock override + * @str: override name + * + * Takes a clocksource= boot argument and uses it + * as the clocksource override name. + */ +static int __init boot_override_clocksource(char* str) +{ + mutex_lock(&clocksource_mutex); + if (str) + strlcpy(override_name, str, sizeof(override_name)); + mutex_unlock(&clocksource_mutex); + return 1; +} + +__setup("clocksource=", boot_override_clocksource); + +/** + * boot_override_clock - Compatibility layer for deprecated boot option + * @str: override name + * + * DEPRECATED! Takes a clock= boot argument and uses it + * as the clocksource override name + */ +static int __init boot_override_clock(char* str) +{ + if (!strcmp(str, "pmtmr")) { + pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n"); + return boot_override_clocksource("acpi_pm"); + } + pr_warn("clock= boot option is deprecated - use clocksource=xyz\n"); + return boot_override_clocksource(str); +} + +__setup("clock=", boot_override_clock); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c new file mode 100644 index 000000000..32ee24f51 --- /dev/null +++ b/kernel/time/hrtimer.c @@ -0,0 +1,2095 @@ +/* + * linux/kernel/hrtimer.c + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner + * + * High-resolution kernel timers + * + * In contrast to the low-resolution timeout API implemented in + * kernel/timer.c, hrtimers provide finer resolution and accuracy + * depending on system configuration and capabilities. + * + * These timers are currently used for: + * - itimers + * - POSIX timers + * - nanosleep + * - precise in-kernel timing + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * Credits: + * based on kernel/timer.c + * + * Help, testing, suggestions, bugfixes, improvements were + * provided by: + * + * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel + * et. al. + * + * For licencing details see kernel-base/COPYING + */ + +#include <linux/cpu.h> +#include <linux/export.h> +#include <linux/percpu.h> +#include <linux/hrtimer.h> +#include <linux/notifier.h> +#include <linux/syscalls.h> +#include <linux/interrupt.h> +#include <linux/tick.h> +#include <linux/seq_file.h> +#include <linux/err.h> +#include <linux/debugobjects.h> +#include <linux/sched/signal.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/rt.h> +#include <linux/sched/deadline.h> +#include <linux/sched/nohz.h> +#include <linux/sched/debug.h> +#include <linux/timer.h> +#include <linux/freezer.h> +#include <linux/compat.h> + +#include <linux/uaccess.h> + +#include <trace/events/timer.h> + +#include "tick-internal.h" + +/* + * Masks for selecting the soft and hard context timers from + * cpu_base->active + */ +#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) +#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) +#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) +#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) + +/* + * The timer bases: + * + * There are more clockids than hrtimer bases. Thus, we index + * into the timer bases by the hrtimer_base_type enum. When trying + * to reach a base using a clockid, hrtimer_clockid_to_base() + * is used to convert from clockid to the proper hrtimer_base_type. + */ +DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = +{ + .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), + .clock_base = + { + { + .index = HRTIMER_BASE_MONOTONIC, + .clockid = CLOCK_MONOTONIC, + .get_time = &ktime_get, + }, + { + .index = HRTIMER_BASE_REALTIME, + .clockid = CLOCK_REALTIME, + .get_time = &ktime_get_real, + }, + { + .index = HRTIMER_BASE_BOOTTIME, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + }, + { + .index = HRTIMER_BASE_TAI, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + }, + { + .index = HRTIMER_BASE_MONOTONIC_SOFT, + .clockid = CLOCK_MONOTONIC, + .get_time = &ktime_get, + }, + { + .index = HRTIMER_BASE_REALTIME_SOFT, + .clockid = CLOCK_REALTIME, + .get_time = &ktime_get_real, + }, + { + .index = HRTIMER_BASE_BOOTTIME_SOFT, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + }, + { + .index = HRTIMER_BASE_TAI_SOFT, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + }, + } +}; + +static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { + /* Make sure we catch unsupported clockids */ + [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, + + [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, + [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, + [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, + [CLOCK_TAI] = HRTIMER_BASE_TAI, +}; + +/* + * Functions and macros which are different for UP/SMP systems are kept in a + * single place + */ +#ifdef CONFIG_SMP + +/* + * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() + * such that hrtimer_callback_running() can unconditionally dereference + * timer->base->cpu_base + */ +static struct hrtimer_cpu_base migration_cpu_base = { + .clock_base = { { .cpu_base = &migration_cpu_base, }, }, +}; + +#define migration_base migration_cpu_base.clock_base[0] + +/* + * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock + * means that all timers which are tied to this base via timer->base are + * locked, and the base itself is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found on the lists/queues. + * + * When the timer's base is locked, and the timer removed from list, it is + * possible to set timer->base = &migration_base and drop the lock: the timer + * remains locked. + */ +static +struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) +{ + struct hrtimer_clock_base *base; + + for (;;) { + base = timer->base; + if (likely(base != &migration_base)) { + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); + if (likely(base == timer->base)) + return base; + /* The timer has migrated to another CPU: */ + raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); + } + cpu_relax(); + } +} + +/* + * We do not migrate the timer when it is expiring before the next + * event on the target cpu. When high resolution is enabled, we cannot + * reprogram the target cpu hardware and we would cause it to fire + * late. To keep it simple, we handle the high resolution enabled and + * disabled case similar. + * + * Called with cpu_base->lock of target cpu held. + */ +static int +hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +{ + ktime_t expires; + + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); + return expires < new_base->cpu_base->expires_next; +} + +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + if (static_branch_likely(&timers_migration_enabled) && !pinned) + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); +#endif + return base; +} + +/* + * We switch the timer base to a power-optimized selected CPU target, + * if: + * - NO_HZ_COMMON is enabled + * - timer migration is enabled + * - the timer callback is not running + * - the timer is not the first expiring timer on the new target + * + * If one of the above requirements is not fulfilled we move the timer + * to the current CPU or leave it on the previously assigned CPU if + * the timer callback is currently running. + */ +static inline struct hrtimer_clock_base * +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, + int pinned) +{ + struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; + struct hrtimer_clock_base *new_base; + int basenum = base->index; + + this_cpu_base = this_cpu_ptr(&hrtimer_bases); + new_cpu_base = get_target_base(this_cpu_base, pinned); +again: + new_base = &new_cpu_base->clock_base[basenum]; + + if (base != new_base) { + /* + * We are trying to move timer to new_base. + * However we can't change timer's base while it is running, + * so we keep it on the same CPU. No hassle vs. reprogramming + * the event source in the high resolution case. The softirq + * code will take care of this when the timer function has + * completed. There is no conflict as we hold the lock until + * the timer is enqueued. + */ + if (unlikely(hrtimer_callback_running(timer))) + return base; + + /* See the comment in lock_hrtimer_base() */ + timer->base = &migration_base; + raw_spin_unlock(&base->cpu_base->lock); + raw_spin_lock(&new_base->cpu_base->lock); + + if (new_cpu_base != this_cpu_base && + hrtimer_check_target(timer, new_base)) { + raw_spin_unlock(&new_base->cpu_base->lock); + raw_spin_lock(&base->cpu_base->lock); + new_cpu_base = this_cpu_base; + timer->base = base; + goto again; + } + timer->base = new_base; + } else { + if (new_cpu_base != this_cpu_base && + hrtimer_check_target(timer, new_base)) { + new_cpu_base = this_cpu_base; + goto again; + } + } + return new_base; +} + +#else /* CONFIG_SMP */ + +static inline struct hrtimer_clock_base * +lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +{ + struct hrtimer_clock_base *base = timer->base; + + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); + + return base; +} + +# define switch_hrtimer_base(t, b, p) (b) + +#endif /* !CONFIG_SMP */ + +/* + * Functions for the union type storage format of ktime_t which are + * too large for inlining: + */ +#if BITS_PER_LONG < 64 +/* + * Divide a ktime value by a nanosecond value + */ +s64 __ktime_divns(const ktime_t kt, s64 div) +{ + int sft = 0; + s64 dclc; + u64 tmp; + + dclc = ktime_to_ns(kt); + tmp = dclc < 0 ? -dclc : dclc; + + /* Make sure the divisor is less than 2^32: */ + while (div >> 32) { + sft++; + div >>= 1; + } + tmp >>= sft; + do_div(tmp, (unsigned long) div); + return dclc < 0 ? -tmp : tmp; +} +EXPORT_SYMBOL_GPL(__ktime_divns); +#endif /* BITS_PER_LONG >= 64 */ + +/* + * Add two ktime values and do a safety check for overflow: + */ +ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) +{ + ktime_t res = ktime_add_unsafe(lhs, rhs); + + /* + * We use KTIME_SEC_MAX here, the maximum timeout which we can + * return to user space in a timespec: + */ + if (res < 0 || res < lhs || res < rhs) + res = ktime_set(KTIME_SEC_MAX, 0); + + return res; +} + +EXPORT_SYMBOL_GPL(ktime_add_safe); + +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS + +static struct debug_obj_descr hrtimer_debug_descr; + +static void *hrtimer_debug_hint(void *addr) +{ + return ((struct hrtimer *) addr)->function; +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) +{ + struct hrtimer *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + hrtimer_cancel(timer); + debug_object_init(timer, &hrtimer_debug_descr); + return true; + default: + return false; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown non-static object is activated + */ +static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) +{ + switch (state) { + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return false; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) +{ + struct hrtimer *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + hrtimer_cancel(timer); + debug_object_free(timer, &hrtimer_debug_descr); + return true; + default: + return false; + } +} + +static struct debug_obj_descr hrtimer_debug_descr = { + .name = "hrtimer", + .debug_hint = hrtimer_debug_hint, + .fixup_init = hrtimer_fixup_init, + .fixup_activate = hrtimer_fixup_activate, + .fixup_free = hrtimer_fixup_free, +}; + +static inline void debug_hrtimer_init(struct hrtimer *timer) +{ + debug_object_init(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_activate(struct hrtimer *timer, + enum hrtimer_mode mode) +{ + debug_object_activate(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_deactivate(struct hrtimer *timer) +{ + debug_object_deactivate(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_free(struct hrtimer *timer) +{ + debug_object_free(timer, &hrtimer_debug_descr); +} + +static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode); + +void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_object_init_on_stack(timer, &hrtimer_debug_descr); + __hrtimer_init(timer, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); + +void destroy_hrtimer_on_stack(struct hrtimer *timer) +{ + debug_object_free(timer, &hrtimer_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); + +#else + +static inline void debug_hrtimer_init(struct hrtimer *timer) { } +static inline void debug_hrtimer_activate(struct hrtimer *timer, + enum hrtimer_mode mode) { } +static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } +#endif + +static inline void +debug_init(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) +{ + debug_hrtimer_init(timer); + trace_hrtimer_init(timer, clockid, mode); +} + +static inline void debug_activate(struct hrtimer *timer, + enum hrtimer_mode mode) +{ + debug_hrtimer_activate(timer, mode); + trace_hrtimer_start(timer, mode); +} + +static inline void debug_deactivate(struct hrtimer *timer) +{ + debug_hrtimer_deactivate(timer); + trace_hrtimer_cancel(timer); +} + +static struct hrtimer_clock_base * +__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) +{ + unsigned int idx; + + if (!*active) + return NULL; + + idx = __ffs(*active); + *active &= ~(1U << idx); + + return &cpu_base->clock_base[idx]; +} + +#define for_each_active_base(base, cpu_base, active) \ + while ((base = __next_base((cpu_base), &(active)))) + +static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, + const struct hrtimer *exclude, + unsigned int active, + ktime_t expires_next) +{ + struct hrtimer_clock_base *base; + ktime_t expires; + + for_each_active_base(base, cpu_base, active) { + struct timerqueue_node *next; + struct hrtimer *timer; + + next = timerqueue_getnext(&base->active); + timer = container_of(next, struct hrtimer, node); + if (timer == exclude) { + /* Get to the next timer in the queue. */ + next = timerqueue_iterate_next(next); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + } + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + if (expires < expires_next) { + expires_next = expires; + + /* Skip cpu_base update if a timer is being excluded. */ + if (exclude) + continue; + + if (timer->is_soft) + cpu_base->softirq_next_timer = timer; + else + cpu_base->next_timer = timer; + } + } + /* + * clock_was_set() might have changed base->offset of any of + * the clock bases so the result might be negative. Fix it up + * to prevent a false positive in clockevents_program_event(). + */ + if (expires_next < 0) + expires_next = 0; + return expires_next; +} + +/* + * Recomputes cpu_base::*next_timer and returns the earliest expires_next + * but does not set cpu_base::*expires_next, that is done by + * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating + * cpu_base::*expires_next right away, reprogramming logic would no longer + * work. + * + * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, + * those timers will get run whenever the softirq gets handled, at the end of + * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. + * + * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. + * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual + * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. + * + * @active_mask must be one of: + * - HRTIMER_ACTIVE_ALL, + * - HRTIMER_ACTIVE_SOFT, or + * - HRTIMER_ACTIVE_HARD. + */ +static ktime_t +__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) +{ + unsigned int active; + struct hrtimer *next_timer = NULL; + ktime_t expires_next = KTIME_MAX; + + if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + cpu_base->softirq_next_timer = NULL; + expires_next = __hrtimer_next_event_base(cpu_base, NULL, + active, KTIME_MAX); + + next_timer = cpu_base->softirq_next_timer; + } + + if (active_mask & HRTIMER_ACTIVE_HARD) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + cpu_base->next_timer = next_timer; + expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, + expires_next); + } + + return expires_next; +} + +static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) +{ + ktime_t expires_next, soft = KTIME_MAX; + + /* + * If the soft interrupt has already been activated, ignore the + * soft bases. They will be handled in the already raised soft + * interrupt. + */ + if (!cpu_base->softirq_activated) { + soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); + /* + * Update the soft expiry time. clock_settime() might have + * affected it. + */ + cpu_base->softirq_expires_next = soft; + } + + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); + /* + * If a softirq timer is expiring first, update cpu_base->next_timer + * and program the hardware with the soft expiry time. + */ + if (expires_next > soft) { + cpu_base->next_timer = cpu_base->softirq_next_timer; + expires_next = soft; + } + + return expires_next; +} + +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; + + ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, + offs_real, offs_boot, offs_tai); + + base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; + base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; + base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; + + return now; +} + +/* + * Is the high resolution mode active ? + */ +static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) +{ + return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? + cpu_base->hres_active : 0; +} + +static inline int hrtimer_hres_active(void) +{ + return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); +} + +/* + * Reprogram the event source with checking both queues for the + * next event + * Called with interrupts disabled and base->lock held + */ +static void +hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +{ + ktime_t expires_next; + + expires_next = hrtimer_update_next_event(cpu_base); + + if (skip_equal && expires_next == cpu_base->expires_next) + return; + + cpu_base->expires_next = expires_next; + + /* + * If hres is not active, hardware does not have to be + * reprogrammed yet. + * + * If a hang was detected in the last timer interrupt then we + * leave the hang delay active in the hardware. We want the + * system to make progress. That also prevents the following + * scenario: + * T1 expires 50ms from now + * T2 expires 5s from now + * + * T1 is removed, so this code is called and would reprogram + * the hardware to 5s from now. Any hrtimer_start after that + * will not reprogram the hardware due to hang_detected being + * set. So we'd effectivly block all timers until the T2 event + * fires. + */ + if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) + return; + + tick_program_event(cpu_base->expires_next, 1); +} + +/* High resolution timer related functions */ +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer enabled ? + */ +static bool hrtimer_hres_enabled __read_mostly = true; +unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; +EXPORT_SYMBOL_GPL(hrtimer_resolution); + +/* + * Enable / Disable high resolution mode + */ +static int __init setup_hrtimer_hres(char *str) +{ + return (kstrtobool(str, &hrtimer_hres_enabled) == 0); +} + +__setup("highres=", setup_hrtimer_hres); + +/* + * hrtimer_high_res_enabled - query, if the highres mode is enabled + */ +static inline int hrtimer_is_hres_enabled(void) +{ + return hrtimer_hres_enabled; +} + +/* + * Retrigger next event is called after clock was set + * + * Called with interrupts disabled via on_each_cpu() + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); + + if (!__hrtimer_hres_active(base)) + return; + + raw_spin_lock(&base->lock); + hrtimer_update_base(base); + hrtimer_force_reprogram(base, 0); + raw_spin_unlock(&base->lock); +} + +/* + * Switch to high resolution mode + */ +static void hrtimer_switch_to_hres(void) +{ + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); + + if (tick_init_highres()) { + pr_warn("Could not switch to high resolution mode on CPU %u\n", + base->cpu); + return; + } + base->hres_active = 1; + hrtimer_resolution = HIGH_RES_NSEC; + + tick_setup_sched_timer(); + /* "Retrigger" the interrupt to get things going */ + retrigger_next_event(NULL); +} + +static void clock_was_set_work(struct work_struct *work) +{ + clock_was_set(); +} + +static DECLARE_WORK(hrtimer_work, clock_was_set_work); + +/* + * Called from timekeeping and resume code to reprogram the hrtimer + * interrupt device on all cpus. + */ +void clock_was_set_delayed(void) +{ + schedule_work(&hrtimer_work); +} + +#else + +static inline int hrtimer_is_hres_enabled(void) { return 0; } +static inline void hrtimer_switch_to_hres(void) { } +static inline void retrigger_next_event(void *arg) { } + +#endif /* CONFIG_HIGH_RES_TIMERS */ + +/* + * When a timer is enqueued and expires earlier than the already enqueued + * timers, we have to check, whether it expires earlier than the timer for + * which the clock event device was armed. + * + * Called with interrupts disabled and base->cpu_base.lock held + */ +static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + struct hrtimer_clock_base *base = timer->base; + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + + WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); + + /* + * CLOCK_REALTIME timer might be requested with an absolute + * expiry time which is less than base->offset. Set it to 0. + */ + if (expires < 0) + expires = 0; + + if (timer->is_soft) { + /* + * soft hrtimer could be started on a remote CPU. In this + * case softirq_expires_next needs to be updated on the + * remote CPU. The soft hrtimer will not expire before the + * first hard hrtimer on the remote CPU - + * hrtimer_check_target() prevents this case. + */ + struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; + + if (timer_cpu_base->softirq_activated) + return; + + if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) + return; + + timer_cpu_base->softirq_next_timer = timer; + timer_cpu_base->softirq_expires_next = expires; + + if (!ktime_before(expires, timer_cpu_base->expires_next) || + !reprogram) + return; + } + + /* + * If the timer is not on the current cpu, we cannot reprogram + * the other cpus clock event device. + */ + if (base->cpu_base != cpu_base) + return; + + /* + * If the hrtimer interrupt is running, then it will + * reevaluate the clock bases and reprogram the clock event + * device. The callbacks are always executed in hard interrupt + * context so we don't need an extra check for a running + * callback. + */ + if (cpu_base->in_hrtirq) + return; + + if (expires >= cpu_base->expires_next) + return; + + /* Update the pointer to the next expiring timer */ + cpu_base->next_timer = timer; + cpu_base->expires_next = expires; + + /* + * If hres is not active, hardware does not have to be + * programmed yet. + * + * If a hang was detected in the last timer interrupt then we + * do not schedule a timer which is earlier than the expiry + * which we enforced in the hang detection. We want the system + * to make progress. + */ + if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) + return; + + /* + * Program the timer hardware. We enforce the expiry for + * events which are already in the past. + */ + tick_program_event(expires, 1); +} + +/* + * Clock realtime was set + * + * Change the offset of the realtime clock vs. the monotonic + * clock. + * + * We might have to reprogram the high resolution timer interrupt. On + * SMP we call the architecture specific code to retrigger _all_ high + * resolution timer interrupts. On UP we just disable interrupts and + * call the high resolution interrupt code. + */ +void clock_was_set(void) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + /* Retrigger the CPU local events everywhere */ + on_each_cpu(retrigger_next_event, NULL, 1); +#endif + timerfd_clock_was_set(); +} + +/* + * During resume we might have to reprogram the high resolution timer + * interrupt on all online CPUs. However, all other CPUs will be + * stopped with IRQs interrupts disabled so the clock_was_set() call + * must be deferred. + */ +void hrtimers_resume(void) +{ + lockdep_assert_irqs_disabled(); + /* Retrigger on the local CPU */ + retrigger_next_event(NULL); + /* And schedule a retrigger for all others */ + clock_was_set_delayed(); +} + +/* + * Counterpart to lock_hrtimer_base above: + */ +static inline +void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +{ + raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); +} + +/** + * hrtimer_forward - forward the timer expiry + * @timer: hrtimer to forward + * @now: forward past this time + * @interval: the interval to forward + * + * Forward the timer expiry so it will expire in the future. + * Returns the number of overruns. + * + * Can be safely called from the callback function of @timer. If + * called from other contexts @timer must neither be enqueued nor + * running the callback and the caller needs to take care of + * serialization. + * + * Note: This only updates the timer expiry value and does not requeue + * the timer. + */ +u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) +{ + u64 orun = 1; + ktime_t delta; + + delta = ktime_sub(now, hrtimer_get_expires(timer)); + + if (delta < 0) + return 0; + + if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) + return 0; + + if (interval < hrtimer_resolution) + interval = hrtimer_resolution; + + if (unlikely(delta >= interval)) { + s64 incr = ktime_to_ns(interval); + + orun = ktime_divns(delta, incr); + hrtimer_add_expires_ns(timer, incr * orun); + if (hrtimer_get_expires_tv64(timer) > now) + return orun; + /* + * This (and the ktime_add() below) is the + * correction for exact: + */ + orun++; + } + hrtimer_add_expires(timer, interval); + + return orun; +} +EXPORT_SYMBOL_GPL(hrtimer_forward); + +/* + * enqueue_hrtimer - internal function to (re)start a timer + * + * The timer is inserted in expiry order. Insertion into the + * red black tree is O(log(n)). Must hold the base lock. + * + * Returns 1 when the new timer is the leftmost timer in the tree. + */ +static int enqueue_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base, + enum hrtimer_mode mode) +{ + debug_activate(timer, mode); + + base->cpu_base->active_bases |= 1 << base->index; + + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); + + return timerqueue_add(&base->active, &timer->node); +} + +/* + * __remove_hrtimer - internal function to remove a timer + * + * Caller must hold the base lock. + * + * High resolution timer mode reprograms the clock event device when the + * timer is the one which expires next. The caller can disable this by setting + * reprogram to zero. This is useful, when the context does a reprogramming + * anyway (e.g. timer interrupt) + */ +static void __remove_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base, + u8 newstate, int reprogram) +{ + struct hrtimer_cpu_base *cpu_base = base->cpu_base; + u8 state = timer->state; + + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, newstate); + if (!(state & HRTIMER_STATE_ENQUEUED)) + return; + + if (!timerqueue_del(&base->active, &timer->node)) + cpu_base->active_bases &= ~(1 << base->index); + + /* + * Note: If reprogram is false we do not update + * cpu_base->next_timer. This happens when we remove the first + * timer on a remote cpu. No harm as we never dereference + * cpu_base->next_timer. So the worst thing what can happen is + * an superflous call to hrtimer_force_reprogram() on the + * remote cpu later on if the same timer gets enqueued again. + */ + if (reprogram && timer == cpu_base->next_timer) + hrtimer_force_reprogram(cpu_base, 1); +} + +/* + * remove hrtimer, called with base lock held + */ +static inline int +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + bool restart, bool keep_local) +{ + u8 state = timer->state; + + if (state & HRTIMER_STATE_ENQUEUED) { + bool reprogram; + + /* + * Remove the timer and force reprogramming when high + * resolution mode is active and the timer is on the current + * CPU. If we remove a timer on another CPU, reprogramming is + * skipped. The interrupt event on this CPU is fired and + * reprogramming happens in the interrupt handler. This is a + * rare case and less expensive than a smp call. + */ + debug_deactivate(timer); + reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + + /* + * If the timer is not restarted then reprogramming is + * required if the timer is local. If it is local and about + * to be restarted, avoid programming it twice (on removal + * and a moment later when it's requeued). + */ + if (!restart) + state = HRTIMER_STATE_INACTIVE; + else + reprogram &= !keep_local; + + __remove_hrtimer(timer, base, state, reprogram); + return 1; + } + return 0; +} + +static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) +{ +#ifdef CONFIG_TIME_LOW_RES + /* + * CONFIG_TIME_LOW_RES indicates that the system has no way to return + * granular time values. For relative timers we add hrtimer_resolution + * (i.e. one jiffie) to prevent short timeouts. + */ + timer->is_rel = mode & HRTIMER_MODE_REL; + if (timer->is_rel) + tim = ktime_add_safe(tim, hrtimer_resolution); +#endif + return tim; +} + +static void +hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) +{ + ktime_t expires; + + /* + * Find the next SOFT expiration. + */ + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); + + /* + * reprogramming needs to be triggered, even if the next soft + * hrtimer expires at the same time than the next hard + * hrtimer. cpu_base->softirq_expires_next needs to be updated! + */ + if (expires == KTIME_MAX) + return; + + /* + * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() + * cpu_base->*expires_next is only set by hrtimer_reprogram() + */ + hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); +} + +static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode, + struct hrtimer_clock_base *base) +{ + struct hrtimer_clock_base *new_base; + bool force_local, first; + + /* + * If the timer is on the local cpu base and is the first expiring + * timer then this might end up reprogramming the hardware twice + * (on removal and on enqueue). To avoid that by prevent the + * reprogram on removal, keep the timer local to the current CPU + * and enforce reprogramming after it is queued no matter whether + * it is the new first expiring timer again or not. + */ + force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local &= base->cpu_base->next_timer == timer; + + /* + * Remove an active timer from the queue. In case it is not queued + * on the current CPU, make sure that remove_hrtimer() updates the + * remote data correctly. + * + * If it's on the current CPU and the first expiring timer, then + * skip reprogramming, keep the timer local and enforce + * reprogramming later if it was the first expiring timer. This + * avoids programming the underlying clock event twice (once at + * removal and once after enqueue). + */ + remove_hrtimer(timer, base, true, force_local); + + if (mode & HRTIMER_MODE_REL) + tim = ktime_add_safe(tim, base->get_time()); + + tim = hrtimer_update_lowres(timer, tim, mode); + + hrtimer_set_expires_range_ns(timer, tim, delta_ns); + + /* Switch the timer base, if necessary: */ + if (!force_local) { + new_base = switch_hrtimer_base(timer, base, + mode & HRTIMER_MODE_PINNED); + } else { + new_base = base; + } + + first = enqueue_hrtimer(timer, new_base, mode); + if (!force_local) + return first; + + /* + * Timer was forced to stay on the current CPU to avoid + * reprogramming on removal and enqueue. Force reprogram the + * hardware by evaluating the new first expiring timer. + */ + hrtimer_force_reprogram(new_base->cpu_base, 1); + return 0; +} + +/** + * hrtimer_start_range_ns - (re)start an hrtimer + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); + * softirq based mode is considered for debug purpose only! + */ +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode) +{ + struct hrtimer_clock_base *base; + unsigned long flags; + + /* + * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft + * match. + */ + WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); + + base = lock_hrtimer_base(timer, &flags); + + if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) + hrtimer_reprogram(timer, true); + + unlock_hrtimer_base(timer, &flags); +} +EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); + +/** + * hrtimer_try_to_cancel - try to deactivate a timer + * @timer: hrtimer to stop + * + * Returns: + * 0 when the timer was not active + * 1 when the timer was active + * -1 when the timer is currently executing the callback function and + * cannot be stopped + */ +int hrtimer_try_to_cancel(struct hrtimer *timer) +{ + struct hrtimer_clock_base *base; + unsigned long flags; + int ret = -1; + + /* + * Check lockless first. If the timer is not active (neither + * enqueued nor running the callback, nothing to do here. The + * base lock does not serialize against a concurrent enqueue, + * so we can avoid taking it. + */ + if (!hrtimer_active(timer)) + return 0; + + base = lock_hrtimer_base(timer, &flags); + + if (!hrtimer_callback_running(timer)) + ret = remove_hrtimer(timer, base, false, false); + + unlock_hrtimer_base(timer, &flags); + + return ret; + +} +EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); + +/** + * hrtimer_cancel - cancel a timer and wait for the handler to finish. + * @timer: the timer to be cancelled + * + * Returns: + * 0 when the timer was not active + * 1 when the timer was active + */ +int hrtimer_cancel(struct hrtimer *timer) +{ + for (;;) { + int ret = hrtimer_try_to_cancel(timer); + + if (ret >= 0) + return ret; + cpu_relax(); + } +} +EXPORT_SYMBOL_GPL(hrtimer_cancel); + +/** + * hrtimer_get_remaining - get remaining time for the timer + * @timer: the timer to read + * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y + */ +ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) +{ + unsigned long flags; + ktime_t rem; + + lock_hrtimer_base(timer, &flags); + if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) + rem = hrtimer_expires_remaining_adjusted(timer); + else + rem = hrtimer_expires_remaining(timer); + unlock_hrtimer_base(timer, &flags); + + return rem; +} +EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); + +#ifdef CONFIG_NO_HZ_COMMON +/** + * hrtimer_get_next_event - get the time until next expiry event + * + * Returns the next expiry time or KTIME_MAX if no timer is pending. + */ +u64 hrtimer_get_next_event(void) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + u64 expires = KTIME_MAX; + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (!__hrtimer_hres_active(cpu_base)) + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + return expires; +} + +/** + * hrtimer_next_event_without - time until next expiry event w/o one timer + * @exclude: timer to exclude + * + * Returns the next expiry time over all timers except for the @exclude one or + * KTIME_MAX if none of them is pending. + */ +u64 hrtimer_next_event_without(const struct hrtimer *exclude) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + u64 expires = KTIME_MAX; + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (__hrtimer_hres_active(cpu_base)) { + unsigned int active; + + if (!cpu_base->softirq_activated) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + expires = __hrtimer_next_event_base(cpu_base, exclude, + active, KTIME_MAX); + } + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + expires = __hrtimer_next_event_base(cpu_base, exclude, active, + expires); + } + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + return expires; +} +#endif + +static inline int hrtimer_clockid_to_base(clockid_t clock_id) +{ + if (likely(clock_id < MAX_CLOCKS)) { + int base = hrtimer_clock_to_base_table[clock_id]; + + if (likely(base != HRTIMER_MAX_CLOCK_BASES)) + return base; + } + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return HRTIMER_BASE_MONOTONIC; +} + +static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + bool softtimer = !!(mode & HRTIMER_MODE_SOFT); + int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; + struct hrtimer_cpu_base *cpu_base; + + memset(timer, 0, sizeof(struct hrtimer)); + + cpu_base = raw_cpu_ptr(&hrtimer_bases); + + /* + * POSIX magic: Relative CLOCK_REALTIME timers are not affected by + * clock modifications, so they needs to become CLOCK_MONOTONIC to + * ensure POSIX compliance. + */ + if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) + clock_id = CLOCK_MONOTONIC; + + base += hrtimer_clockid_to_base(clock_id); + timer->is_soft = softtimer; + timer->base = &cpu_base->clock_base[base]; + timerqueue_init(&timer->node); +} + +/** + * hrtimer_init - initialize a timer to the given clock + * @timer: the timer to be initialized + * @clock_id: the clock to be used + * @mode: The modes which are relevant for intitialization: + * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, + * HRTIMER_MODE_REL_SOFT + * + * The PINNED variants of the above can be handed in, + * but the PINNED bit is ignored as pinning happens + * when the hrtimer is started + */ +void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_init(timer, clock_id, mode); + __hrtimer_init(timer, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init); + +/* + * A timer is active, when it is enqueued into the rbtree or the + * callback function is running or it's in the state of being migrated + * to another cpu. + * + * It is important for this function to not return a false negative. + */ +bool hrtimer_active(const struct hrtimer *timer) +{ + struct hrtimer_clock_base *base; + unsigned int seq; + + do { + base = READ_ONCE(timer->base); + seq = raw_read_seqcount_begin(&base->seq); + + if (timer->state != HRTIMER_STATE_INACTIVE || + base->running == timer) + return true; + + } while (read_seqcount_retry(&base->seq, seq) || + base != READ_ONCE(timer->base)); + + return false; +} +EXPORT_SYMBOL_GPL(hrtimer_active); + +/* + * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 + * distinct sections: + * + * - queued: the timer is queued + * - callback: the timer is being ran + * - post: the timer is inactive or (re)queued + * + * On the read side we ensure we observe timer->state and cpu_base->running + * from the same section, if anything changed while we looked at it, we retry. + * This includes timer->base changing because sequence numbers alone are + * insufficient for that. + * + * The sequence numbers are required because otherwise we could still observe + * a false negative if the read side got smeared over multiple consequtive + * __run_hrtimer() invocations. + */ + +static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, + struct hrtimer_clock_base *base, + struct hrtimer *timer, ktime_t *now, + unsigned long flags) +{ + enum hrtimer_restart (*fn)(struct hrtimer *); + int restart; + + lockdep_assert_held(&cpu_base->lock); + + debug_deactivate(timer); + base->running = timer; + + /* + * Separate the ->running assignment from the ->state assignment. + * + * As with a regular write barrier, this ensures the read side in + * hrtimer_active() cannot observe base->running == NULL && + * timer->state == INACTIVE. + */ + raw_write_seqcount_barrier(&base->seq); + + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); + fn = timer->function; + + /* + * Clear the 'is relative' flag for the TIME_LOW_RES case. If the + * timer is restarted with a period then it becomes an absolute + * timer. If its not restarted it does not matter. + */ + if (IS_ENABLED(CONFIG_TIME_LOW_RES)) + timer->is_rel = false; + + /* + * The timer is marked as running in the CPU base, so it is + * protected against migration to a different CPU even if the lock + * is dropped. + */ + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + trace_hrtimer_expire_entry(timer, now); + restart = fn(timer); + trace_hrtimer_expire_exit(timer); + raw_spin_lock_irq(&cpu_base->lock); + + /* + * Note: We clear the running state after enqueue_hrtimer and + * we do not reprogram the event hardware. Happens either in + * hrtimer_start_range_ns() or in hrtimer_interrupt() + * + * Note: Because we dropped the cpu_base->lock above, + * hrtimer_start_range_ns() can have popped in and enqueued the timer + * for us already. + */ + if (restart != HRTIMER_NORESTART && + !(timer->state & HRTIMER_STATE_ENQUEUED)) + enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); + + /* + * Separate the ->running assignment from the ->state assignment. + * + * As with a regular write barrier, this ensures the read side in + * hrtimer_active() cannot observe base->running.timer == NULL && + * timer->state == INACTIVE. + */ + raw_write_seqcount_barrier(&base->seq); + + WARN_ON_ONCE(base->running != timer); + base->running = NULL; +} + +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, + unsigned long flags, unsigned int active_mask) +{ + struct hrtimer_clock_base *base; + unsigned int active = cpu_base->active_bases & active_mask; + + for_each_active_base(base, cpu_base, active) { + struct timerqueue_node *node; + ktime_t basenow; + + basenow = ktime_add(now, base->offset); + + while ((node = timerqueue_getnext(&base->active))) { + struct hrtimer *timer; + + timer = container_of(node, struct hrtimer, node); + + /* + * The immediate goal for using the softexpires is + * minimizing wakeups, not running timers at the + * earliest interrupt after their soft expiration. + * This allows us to avoid using a Priority Search + * Tree, which can answer a stabbing querry for + * overlapping intervals and instead use the simple + * BST we already have. + * We don't add extra wakeups by delaying timers that + * are right-of a not yet expired timer, because that + * timer will have to trigger a wakeup anyway. + */ + if (basenow < hrtimer_get_softexpires_tv64(timer)) + break; + + __run_hrtimer(cpu_base, base, timer, &basenow, flags); + } + } +} + +static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + unsigned long flags; + ktime_t now; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + now = hrtimer_update_base(cpu_base); + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); + + cpu_base->softirq_activated = 0; + hrtimer_update_softirq_timer(cpu_base, true); + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); +} + +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct clock_event_device *dev) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t expires_next, now, entry_time, delta; + unsigned long flags; + int retries = 0; + + BUG_ON(!cpu_base->hres_active); + cpu_base->nr_events++; + dev->next_event = KTIME_MAX; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + entry_time = now = hrtimer_update_base(cpu_base); +retry: + cpu_base->in_hrtirq = 1; + /* + * We set expires_next to KTIME_MAX here with cpu_base->lock + * held to prevent that a timer is enqueued in our queue via + * the migration code. This does not affect enqueueing of + * timers which run their callback and need to be requeued on + * this CPU. + */ + cpu_base->expires_next = KTIME_MAX; + + if (!ktime_before(now, cpu_base->softirq_expires_next)) { + cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->softirq_activated = 1; + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); + + /* Reevaluate the clock bases for the [soft] next expiry */ + expires_next = hrtimer_update_next_event(cpu_base); + /* + * Store the new expiry value so the migration code can verify + * against it. + */ + cpu_base->expires_next = expires_next; + cpu_base->in_hrtirq = 0; + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + /* Reprogramming necessary ? */ + if (!tick_program_event(expires_next, 0)) { + cpu_base->hang_detected = 0; + return; + } + + /* + * The next timer was already expired due to: + * - tracing + * - long lasting callbacks + * - being scheduled away when running in a VM + * + * We need to prevent that we loop forever in the hrtimer + * interrupt routine. We give it 3 attempts to avoid + * overreacting on some spurious event. + * + * Acquire base lock for updating the offsets and retrieving + * the current time. + */ + raw_spin_lock_irqsave(&cpu_base->lock, flags); + now = hrtimer_update_base(cpu_base); + cpu_base->nr_retries++; + if (++retries < 3) + goto retry; + /* + * Give the system a chance to do something else than looping + * here. We stored the entry time, so we know exactly how long + * we spent here. We schedule the next event this amount of + * time away. + */ + cpu_base->nr_hangs++; + cpu_base->hang_detected = 1; + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + delta = ktime_sub(now, entry_time); + if ((unsigned int)delta > cpu_base->max_hang_time) + cpu_base->max_hang_time = (unsigned int) delta; + /* + * Limit it to a sensible value as we enforce a longer + * delay. Give the CPU at least 100ms to catch up. + */ + if (delta > 100 * NSEC_PER_MSEC) + expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); + else + expires_next = ktime_add(now, delta); + tick_program_event(expires_next, 1); + pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); +} + +/* called with interrupts disabled */ +static inline void __hrtimer_peek_ahead_timers(void) +{ + struct tick_device *td; + + if (!hrtimer_hres_active()) + return; + + td = this_cpu_ptr(&tick_cpu_device); + if (td && td->evtdev) + hrtimer_interrupt(td->evtdev); +} + +#else /* CONFIG_HIGH_RES_TIMERS */ + +static inline void __hrtimer_peek_ahead_timers(void) { } + +#endif /* !CONFIG_HIGH_RES_TIMERS */ + +/* + * Called from run_local_timers in hardirq context every jiffy + */ +void hrtimer_run_queues(void) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + unsigned long flags; + ktime_t now; + + if (__hrtimer_hres_active(cpu_base)) + return; + + /* + * This _is_ ugly: We have to check periodically, whether we + * can switch to highres and / or nohz mode. The clocksource + * switch happens with xtime_lock held. Notification from + * there only sets the check bit in the tick_oneshot code, + * otherwise we might deadlock vs. xtime_lock. + */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { + hrtimer_switch_to_hres(); + return; + } + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + now = hrtimer_update_base(cpu_base); + + if (!ktime_before(now, cpu_base->softirq_expires_next)) { + cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->softirq_activated = 1; + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); +} + +/* + * Sleep related functions: + */ +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) +{ + struct hrtimer_sleeper *t = + container_of(timer, struct hrtimer_sleeper, timer); + struct task_struct *task = t->task; + + t->task = NULL; + if (task) + wake_up_process(task); + + return HRTIMER_NORESTART; +} + +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) +{ + sl->timer.function = hrtimer_wakeup; + sl->task = task; +} +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); + +int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) +{ + switch(restart->nanosleep.type) { +#ifdef CONFIG_COMPAT_32BIT_TIME + case TT_COMPAT: + if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp)) + return -EFAULT; + break; +#endif + case TT_NATIVE: + if (put_timespec64(ts, restart->nanosleep.rmtp)) + return -EFAULT; + break; + default: + BUG(); + } + return -ERESTART_RESTARTBLOCK; +} + +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) +{ + struct restart_block *restart; + + hrtimer_init_sleeper(t, current); + + do { + set_current_state(TASK_INTERRUPTIBLE); + hrtimer_start_expires(&t->timer, mode); + + if (likely(t->task)) + freezable_schedule(); + + hrtimer_cancel(&t->timer); + mode = HRTIMER_MODE_ABS; + + } while (t->task && !signal_pending(current)); + + __set_current_state(TASK_RUNNING); + + if (!t->task) + return 0; + + restart = ¤t->restart_block; + if (restart->nanosleep.type != TT_NONE) { + ktime_t rem = hrtimer_expires_remaining(&t->timer); + struct timespec64 rmt; + + if (rem <= 0) + return 0; + rmt = ktime_to_timespec64(rem); + + return nanosleep_copyout(restart, &rmt); + } + return -ERESTART_RESTARTBLOCK; +} + +static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) +{ + struct hrtimer_sleeper t; + int ret; + + hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, + HRTIMER_MODE_ABS); + hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); + + ret = do_nanosleep(&t, HRTIMER_MODE_ABS); + destroy_hrtimer_on_stack(&t.timer); + return ret; +} + +long hrtimer_nanosleep(const struct timespec64 *rqtp, + const enum hrtimer_mode mode, const clockid_t clockid) +{ + struct restart_block *restart; + struct hrtimer_sleeper t; + int ret = 0; + u64 slack; + + slack = current->timer_slack_ns; + if (dl_task(current) || rt_task(current)) + slack = 0; + + hrtimer_init_on_stack(&t.timer, clockid, mode); + hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); + ret = do_nanosleep(&t, mode); + if (ret != -ERESTART_RESTARTBLOCK) + goto out; + + /* Absolute timers do not update the rmtp value and restart: */ + if (mode == HRTIMER_MODE_ABS) { + ret = -ERESTARTNOHAND; + goto out; + } + + restart = ¤t->restart_block; + restart->nanosleep.clockid = t.timer.base->clockid; + restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); + set_restart_fn(restart, hrtimer_nanosleep_restart); +out: + destroy_hrtimer_on_stack(&t.timer); + return ret; +} + +#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) + +SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) +{ + struct timespec64 tu; + + if (get_timespec64(&tu, rqtp)) + return -EFAULT; + + if (!timespec64_valid(&tu)) + return -EINVAL; + + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; + current->restart_block.nanosleep.rmtp = rmtp; + return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); +} + +#endif + +#ifdef CONFIG_COMPAT_32BIT_TIME + +COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) +{ + struct timespec64 tu; + + if (compat_get_timespec64(&tu, rqtp)) + return -EFAULT; + + if (!timespec64_valid(&tu)) + return -EINVAL; + + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.nanosleep.compat_rmtp = rmtp; + return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); +} +#endif + +/* + * Functions related to boot-time initialization: + */ +int hrtimers_prepare_cpu(unsigned int cpu) +{ + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + int i; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + cpu_base->clock_base[i].cpu_base = cpu_base; + timerqueue_init_head(&cpu_base->clock_base[i].active); + } + + cpu_base->cpu = cpu; + cpu_base->active_bases = 0; + cpu_base->hres_active = 0; + cpu_base->hang_detected = 0; + cpu_base->next_timer = NULL; + cpu_base->softirq_next_timer = NULL; + cpu_base->expires_next = KTIME_MAX; + cpu_base->softirq_expires_next = KTIME_MAX; + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base) +{ + struct hrtimer *timer; + struct timerqueue_node *node; + + while ((node = timerqueue_getnext(&old_base->active))) { + timer = container_of(node, struct hrtimer, node); + BUG_ON(hrtimer_callback_running(timer)); + debug_deactivate(timer); + + /* + * Mark it as ENQUEUED not INACTIVE otherwise the + * timer could be seen as !active and just vanish away + * under us on another CPU + */ + __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); + timer->base = new_base; + /* + * Enqueue the timers on the new cpu. This does not + * reprogram the event device in case the timer + * expires before the earliest on this CPU, but we run + * hrtimer_interrupt after we migrated everything to + * sort out already expired timers and reprogram the + * event device. + */ + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); + } +} + +int hrtimers_dead_cpu(unsigned int scpu) +{ + struct hrtimer_cpu_base *old_base, *new_base; + int i; + + BUG_ON(cpu_online(scpu)); + tick_cancel_sched_timer(scpu); + + /* + * this BH disable ensures that raise_softirq_irqoff() does + * not wakeup ksoftirqd (and acquire the pi-lock) while + * holding the cpu_base lock + */ + local_bh_disable(); + local_irq_disable(); + old_base = &per_cpu(hrtimer_bases, scpu); + new_base = this_cpu_ptr(&hrtimer_bases); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + raw_spin_lock(&new_base->lock); + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i]); + } + + /* + * The migration might have changed the first expiring softirq + * timer on this CPU. Update it. + */ + hrtimer_update_softirq_timer(new_base, false); + + raw_spin_unlock(&old_base->lock); + raw_spin_unlock(&new_base->lock); + + /* Check, if we got expired work to do */ + __hrtimer_peek_ahead_timers(); + local_irq_enable(); + local_bh_enable(); + return 0; +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +void __init hrtimers_init(void) +{ + hrtimers_prepare_cpu(smp_processor_id()); + open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); +} + +/** + * schedule_hrtimeout_range_clock - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode + * @clock_id: timer clock to be used + */ +int __sched +schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, + const enum hrtimer_mode mode, clockid_t clock_id) +{ + struct hrtimer_sleeper t; + + /* + * Optimize when a zero timeout value is given. It does not + * matter whether this is an absolute or a relative time. + */ + if (expires && *expires == 0) { + __set_current_state(TASK_RUNNING); + return 0; + } + + /* + * A NULL parameter means "infinite" + */ + if (!expires) { + schedule(); + return -EINTR; + } + + hrtimer_init_on_stack(&t.timer, clock_id, mode); + hrtimer_set_expires_range_ns(&t.timer, *expires, delta); + + hrtimer_init_sleeper(&t, current); + + hrtimer_start_expires(&t.timer, mode); + + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + + __set_current_state(TASK_RUNNING); + + return !t.task ? 0 : -EINTR; +} + +/** + * schedule_hrtimeout_range - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * The @delta argument gives the kernel the freedom to schedule the + * actual wakeup to a time that is both power and performance friendly. + * The kernel give the normal best effort behavior for "@expires+@delta", + * but may decide to fire the timer earlier, but no earlier than @expires. + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task or the current task is explicitly woken + * up. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. + */ +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range_clock(expires, delta, mode, + CLOCK_MONOTONIC); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); + +/** + * schedule_hrtimeout - sleep until timeout + * @expires: timeout value (ktime_t) + * @mode: timer mode + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task or the current task is explicitly woken + * up. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. + */ +int __sched schedule_hrtimeout(ktime_t *expires, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range(expires, 0, mode); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout); diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c new file mode 100644 index 000000000..2e2b335ef --- /dev/null +++ b/kernel/time/itimer.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/kernel/itimer.c + * + * Copyright (C) 1992 Darren Senn + */ + +/* These are all the functions necessary to implement itimers */ + +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/syscalls.h> +#include <linux/time.h> +#include <linux/sched/signal.h> +#include <linux/sched/cputime.h> +#include <linux/posix-timers.h> +#include <linux/hrtimer.h> +#include <trace/events/timer.h> +#include <linux/compat.h> + +#include <linux/uaccess.h> + +/** + * itimer_get_remtime - get remaining time for the timer + * + * @timer: the timer to read + * + * Returns the delta between the expiry time and now, which can be + * less than zero or 1usec for an pending expired timer + */ +static struct timeval itimer_get_remtime(struct hrtimer *timer) +{ + ktime_t rem = __hrtimer_get_remaining(timer, true); + + /* + * Racy but safe: if the itimer expires after the above + * hrtimer_get_remtime() call but before this condition + * then we return 0 - which is correct. + */ + if (hrtimer_active(timer)) { + if (rem <= 0) + rem = NSEC_PER_USEC; + } else + rem = 0; + + return ktime_to_timeval(rem); +} + +static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + struct itimerval *const value) +{ + u64 val, interval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + spin_lock_irq(&tsk->sighand->siglock); + + val = it->expires; + interval = it->incr; + if (val) { + struct task_cputime cputime; + u64 t; + + thread_group_cputimer(tsk, &cputime); + if (clock_id == CPUCLOCK_PROF) + t = cputime.utime + cputime.stime; + else + /* CPUCLOCK_VIRT */ + t = cputime.utime; + + if (val < t) + /* about to fire */ + val = TICK_NSEC; + else + val -= t; + } + + spin_unlock_irq(&tsk->sighand->siglock); + + value->it_value = ns_to_timeval(val); + value->it_interval = ns_to_timeval(interval); +} + +int do_getitimer(int which, struct itimerval *value) +{ + struct task_struct *tsk = current; + + switch (which) { + case ITIMER_REAL: + spin_lock_irq(&tsk->sighand->siglock); + value->it_value = itimer_get_remtime(&tsk->signal->real_timer); + value->it_interval = + ktime_to_timeval(tsk->signal->it_real_incr); + spin_unlock_irq(&tsk->sighand->siglock); + break; + case ITIMER_VIRTUAL: + get_cpu_itimer(tsk, CPUCLOCK_VIRT, value); + break; + case ITIMER_PROF: + get_cpu_itimer(tsk, CPUCLOCK_PROF, value); + break; + default: + return(-EINVAL); + } + return 0; +} + +SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) +{ + int error = -EFAULT; + struct itimerval get_buffer; + + if (value) { + error = do_getitimer(which, &get_buffer); + if (!error && + copy_to_user(value, &get_buffer, sizeof(get_buffer))) + error = -EFAULT; + } + return error; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(getitimer, int, which, + struct compat_itimerval __user *, it) +{ + struct itimerval kit; + int error = do_getitimer(which, &kit); + + if (!error && put_compat_itimerval(it, &kit)) + error = -EFAULT; + return error; +} +#endif + + +/* + * The timer is automagically restarted, when interval != 0 + */ +enum hrtimer_restart it_real_fn(struct hrtimer *timer) +{ + struct signal_struct *sig = + container_of(timer, struct signal_struct, real_timer); + struct pid *leader_pid = sig->pids[PIDTYPE_TGID]; + + trace_itimer_expire(ITIMER_REAL, leader_pid, 0); + kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid); + + return HRTIMER_NORESTART; +} + +static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + const struct itimerval *const value, + struct itimerval *const ovalue) +{ + u64 oval, nval, ointerval, ninterval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + nval = ktime_to_ns(timeval_to_ktime(value->it_value)); + ninterval = ktime_to_ns(timeval_to_ktime(value->it_interval)); + + spin_lock_irq(&tsk->sighand->siglock); + + oval = it->expires; + ointerval = it->incr; + if (oval || nval) { + if (nval > 0) + nval += TICK_NSEC; + set_process_cpu_timer(tsk, clock_id, &nval, &oval); + } + it->expires = nval; + it->incr = ninterval; + trace_itimer_state(clock_id == CPUCLOCK_VIRT ? + ITIMER_VIRTUAL : ITIMER_PROF, value, nval); + + spin_unlock_irq(&tsk->sighand->siglock); + + if (ovalue) { + ovalue->it_value = ns_to_timeval(oval); + ovalue->it_interval = ns_to_timeval(ointerval); + } +} + +/* + * Returns true if the timeval is in canonical form + */ +#define timeval_valid(t) \ + (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) + +int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) +{ + struct task_struct *tsk = current; + struct hrtimer *timer; + ktime_t expires; + + /* + * Validate the timevals in value. + */ + if (!timeval_valid(&value->it_value) || + !timeval_valid(&value->it_interval)) + return -EINVAL; + + switch (which) { + case ITIMER_REAL: +again: + spin_lock_irq(&tsk->sighand->siglock); + timer = &tsk->signal->real_timer; + if (ovalue) { + ovalue->it_value = itimer_get_remtime(timer); + ovalue->it_interval + = ktime_to_timeval(tsk->signal->it_real_incr); + } + /* We are sharing ->siglock with it_real_fn() */ + if (hrtimer_try_to_cancel(timer) < 0) { + spin_unlock_irq(&tsk->sighand->siglock); + goto again; + } + expires = timeval_to_ktime(value->it_value); + if (expires != 0) { + tsk->signal->it_real_incr = + timeval_to_ktime(value->it_interval); + hrtimer_start(timer, expires, HRTIMER_MODE_REL); + } else + tsk->signal->it_real_incr = 0; + + trace_itimer_state(ITIMER_REAL, value, 0); + spin_unlock_irq(&tsk->sighand->siglock); + break; + case ITIMER_VIRTUAL: + set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue); + break; + case ITIMER_PROF: + set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue); + break; + default: + return -EINVAL; + } + return 0; +} + +#ifdef __ARCH_WANT_SYS_ALARM + +/** + * alarm_setitimer - set alarm in seconds + * + * @seconds: number of seconds until alarm + * 0 disables the alarm + * + * Returns the remaining time in seconds of a pending timer or 0 when + * the timer is not active. + * + * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid + * negative timeval settings which would cause immediate expiry. + */ +static unsigned int alarm_setitimer(unsigned int seconds) +{ + struct itimerval it_new, it_old; + +#if BITS_PER_LONG < 64 + if (seconds > INT_MAX) + seconds = INT_MAX; +#endif + it_new.it_value.tv_sec = seconds; + it_new.it_value.tv_usec = 0; + it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + + do_setitimer(ITIMER_REAL, &it_new, &it_old); + + /* + * We can't return 0 if we have an alarm pending ... And we'd + * better return too much than too little anyway + */ + if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) || + it_old.it_value.tv_usec >= 500000) + it_old.it_value.tv_sec++; + + return it_old.it_value.tv_sec; +} + +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ +SYSCALL_DEFINE1(alarm, unsigned int, seconds) +{ + return alarm_setitimer(seconds); +} + +#endif + +SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, + struct itimerval __user *, ovalue) +{ + struct itimerval set_buffer, get_buffer; + int error; + + if (value) { + if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) + return -EFAULT; + } else { + memset(&set_buffer, 0, sizeof(set_buffer)); + printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." + " Misfeature support will be removed\n", + current->comm); + } + + error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); + if (error || !ovalue) + return error; + + if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) + return -EFAULT; + return 0; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(setitimer, int, which, + struct compat_itimerval __user *, in, + struct compat_itimerval __user *, out) +{ + struct itimerval kin, kout; + int error; + + if (in) { + if (get_compat_itimerval(&kin, in)) + return -EFAULT; + } else { + memset(&kin, 0, sizeof(kin)); + } + + error = do_setitimer(which, &kin, out ? &kout : NULL); + if (error || !out) + return error; + if (put_compat_itimerval(out, &kout)) + return -EFAULT; + return 0; +} +#endif diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c new file mode 100644 index 000000000..497719127 --- /dev/null +++ b/kernel/time/jiffies.c @@ -0,0 +1,136 @@ +/*********************************************************************** +* linux/kernel/time/jiffies.c +* +* This file contains the jiffies based clocksource. +* +* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +* +************************************************************************/ +#include <linux/clocksource.h> +#include <linux/jiffies.h> +#include <linux/module.h> +#include <linux/init.h> + +#include "timekeeping.h" + + +/* Since jiffies uses a simple TICK_NSEC multiplier + * conversion, the .shift value could be zero. However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the nominator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. TICK_NSEC grows as HZ + * shrinks, so values greater than 8 overflow 32bits when + * HZ=100. + */ +#if HZ < 34 +#define JIFFIES_SHIFT 6 +#elif HZ < 67 +#define JIFFIES_SHIFT 7 +#else +#define JIFFIES_SHIFT 8 +#endif + +static u64 jiffies_read(struct clocksource *cs) +{ + return (u64) jiffies; +} + +/* + * The Jiffies based clocksource is the lowest common + * denominator clock source which should function on + * all systems. It has the same coarse resolution as + * the timer interrupt frequency HZ and it suffers + * inaccuracies caused by missed or lost timer + * interrupts and the inability for the timer + * interrupt hardware to accuratly tick at the + * requested HZ value. It is also not recommended + * for "tick-less" systems. + */ +static struct clocksource clocksource_jiffies = { + .name = "jiffies", + .rating = 1, /* lowest valid rating*/ + .read = jiffies_read, + .mask = CLOCKSOURCE_MASK(32), + .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ + .shift = JIFFIES_SHIFT, + .max_cycles = 10, +}; + +__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); + +#if (BITS_PER_LONG < 64) +u64 get_jiffies_64(void) +{ + unsigned long seq; + u64 ret; + + do { + seq = read_seqbegin(&jiffies_lock); + ret = jiffies_64; + } while (read_seqretry(&jiffies_lock, seq)); + return ret; +} +EXPORT_SYMBOL(get_jiffies_64); +#endif + +EXPORT_SYMBOL(jiffies); + +static int __init init_jiffies_clocksource(void) +{ + return __clocksource_register(&clocksource_jiffies); +} + +core_initcall(init_jiffies_clocksource); + +struct clocksource * __init __weak clocksource_default_clock(void) +{ + return &clocksource_jiffies; +} + +struct clocksource refined_jiffies; + +int register_refined_jiffies(long cycles_per_second) +{ + u64 nsec_per_tick, shift_hz; + long cycles_per_tick; + + + + refined_jiffies = clocksource_jiffies; + refined_jiffies.name = "refined-jiffies"; + refined_jiffies.rating++; + + /* Calc cycles per tick */ + cycles_per_tick = (cycles_per_second + HZ/2)/HZ; + /* shift_hz stores hz<<8 for extra accuracy */ + shift_hz = (u64)cycles_per_second << 8; + shift_hz += cycles_per_tick/2; + do_div(shift_hz, cycles_per_tick); + /* Calculate nsec_per_tick using shift_hz */ + nsec_per_tick = (u64)NSEC_PER_SEC << 8; + nsec_per_tick += (u32)shift_hz/2; + do_div(nsec_per_tick, (u32)shift_hz); + + refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; + + __clocksource_register(&refined_jiffies); + return 0; +} diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c new file mode 100644 index 000000000..e1110a7bd --- /dev/null +++ b/kernel/time/ntp.c @@ -0,0 +1,1038 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NTP state machine interfaces and logic. + * + * This code was mainly moved from kernel/timer.c and kernel/time.c + * Please see those files for relevant copyright info and historical + * changelogs. + */ +#include <linux/capability.h> +#include <linux/clocksource.h> +#include <linux/workqueue.h> +#include <linux/hrtimer.h> +#include <linux/jiffies.h> +#include <linux/math64.h> +#include <linux/timex.h> +#include <linux/time.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/rtc.h> +#include <linux/math64.h> + +#include "ntp_internal.h" +#include "timekeeping_internal.h" + + +/* + * NTP timekeeping variables: + * + * Note: All of the NTP state is protected by the timekeeping locks. + */ + + +/* USER_HZ period (usecs): */ +unsigned long tick_usec = USER_TICK_USEC; + +/* SHIFTED_HZ period (nsecs): */ +unsigned long tick_nsec; + +static u64 tick_length; +static u64 tick_length_base; + +#define SECS_PER_DAY 86400 +#define MAX_TICKADJ 500LL /* usecs */ +#define MAX_TICKADJ_SCALED \ + (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) +#define MAX_TAI_OFFSET 100000 + +/* + * phase-lock loop variables + */ + +/* + * clock synchronization status + * + * (TIME_ERROR prevents overwriting the CMOS clock) + */ +static int time_state = TIME_OK; + +/* clock status bits: */ +static int time_status = STA_UNSYNC; + +/* time adjustment (nsecs): */ +static s64 time_offset; + +/* pll time constant: */ +static long time_constant = 2; + +/* maximum error (usecs): */ +static long time_maxerror = NTP_PHASE_LIMIT; + +/* estimated error (usecs): */ +static long time_esterror = NTP_PHASE_LIMIT; + +/* frequency offset (scaled nsecs/secs): */ +static s64 time_freq; + +/* time at last adjustment (secs): */ +static time64_t time_reftime; + +static long time_adjust; + +/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ +static s64 ntp_tick_adj; + +/* second value of the next pending leapsecond, or TIME64_MAX if no leap */ +static time64_t ntp_next_leap_sec = TIME64_MAX; + +#ifdef CONFIG_NTP_PPS + +/* + * The following variables are used when a pulse-per-second (PPS) signal + * is available. They establish the engineering parameters of the clock + * discipline loop when controlled by the PPS signal. + */ +#define PPS_VALID 10 /* PPS signal watchdog max (s) */ +#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */ +#define PPS_INTMIN 2 /* min freq interval (s) (shift) */ +#define PPS_INTMAX 8 /* max freq interval (s) (shift) */ +#define PPS_INTCOUNT 4 /* number of consecutive good intervals to + increase pps_shift or consecutive bad + intervals to decrease it */ +#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ + +static int pps_valid; /* signal watchdog counter */ +static long pps_tf[3]; /* phase median filter */ +static long pps_jitter; /* current jitter (ns) */ +static struct timespec64 pps_fbase; /* beginning of the last freq interval */ +static int pps_shift; /* current interval duration (s) (shift) */ +static int pps_intcnt; /* interval counter */ +static s64 pps_freq; /* frequency offset (scaled ns/s) */ +static long pps_stabil; /* current stability (scaled ns/s) */ + +/* + * PPS signal quality monitors + */ +static long pps_calcnt; /* calibration intervals */ +static long pps_jitcnt; /* jitter limit exceeded */ +static long pps_stbcnt; /* stability limit exceeded */ +static long pps_errcnt; /* calibration errors */ + + +/* PPS kernel consumer compensates the whole phase error immediately. + * Otherwise, reduce the offset by a fixed factor times the time constant. + */ +static inline s64 ntp_offset_chunk(s64 offset) +{ + if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + return offset; + else + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) +{ + /* the PPS calibration interval may end + surprisingly early */ + pps_shift = PPS_INTMIN; + pps_intcnt = 0; +} + +/** + * pps_clear - Clears the PPS state variables + */ +static inline void pps_clear(void) +{ + pps_reset_freq_interval(); + pps_tf[0] = 0; + pps_tf[1] = 0; + pps_tf[2] = 0; + pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; + pps_freq = 0; +} + +/* Decrease pps_valid to indicate that another second has passed since + * the last PPS signal. When it reaches 0, indicate that PPS signal is + * missing. + */ +static inline void pps_dec_valid(void) +{ + if (pps_valid > 0) + pps_valid--; + else { + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + pps_clear(); + } +} + +static inline void pps_set_freq(s64 freq) +{ + pps_freq = freq; +} + +static inline int is_error_status(int status) +{ + return (status & (STA_UNSYNC|STA_CLOCKERR)) + /* PPS signal lost when either PPS time or + * PPS frequency synchronization requested + */ + || ((status & (STA_PPSFREQ|STA_PPSTIME)) + && !(status & STA_PPSSIGNAL)) + /* PPS jitter exceeded when + * PPS time synchronization requested */ + || ((status & (STA_PPSTIME|STA_PPSJITTER)) + == (STA_PPSTIME|STA_PPSJITTER)) + /* PPS wander exceeded or calibration error when + * PPS frequency synchronization requested + */ + || ((status & STA_PPSFREQ) + && (status & (STA_PPSWANDER|STA_PPSERROR))); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * + PPM_SCALE_INV, NTP_SCALE_SHIFT); + txc->jitter = pps_jitter; + if (!(time_status & STA_NANO)) + txc->jitter /= NSEC_PER_USEC; + txc->shift = pps_shift; + txc->stabil = pps_stabil; + txc->jitcnt = pps_jitcnt; + txc->calcnt = pps_calcnt; + txc->errcnt = pps_errcnt; + txc->stbcnt = pps_stbcnt; +} + +#else /* !CONFIG_NTP_PPS */ + +static inline s64 ntp_offset_chunk(s64 offset) +{ + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) {} +static inline void pps_clear(void) {} +static inline void pps_dec_valid(void) {} +static inline void pps_set_freq(s64 freq) {} + +static inline int is_error_status(int status) +{ + return status & (STA_UNSYNC|STA_CLOCKERR); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + /* PPS is not implemented, so these are zero */ + txc->ppsfreq = 0; + txc->jitter = 0; + txc->shift = 0; + txc->stabil = 0; + txc->jitcnt = 0; + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; +} + +#endif /* CONFIG_NTP_PPS */ + + +/** + * ntp_synced - Returns 1 if the NTP status is not UNSYNC + * + */ +static inline int ntp_synced(void) +{ + return !(time_status & STA_UNSYNC); +} + + +/* + * NTP methods: + */ + +/* + * Update (tick_length, tick_length_base, tick_nsec), based + * on (tick_usec, ntp_tick_adj, time_freq): + */ +static void ntp_update_frequency(void) +{ + u64 second_length; + u64 new_base; + + second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) + << NTP_SCALE_SHIFT; + + second_length += ntp_tick_adj; + second_length += time_freq; + + tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; + new_base = div_u64(second_length, NTP_INTERVAL_FREQ); + + /* + * Don't wait for the next second_overflow, apply + * the change to the tick length immediately: + */ + tick_length += new_base - tick_length_base; + tick_length_base = new_base; +} + +static inline s64 ntp_update_offset_fll(s64 offset64, long secs) +{ + time_status &= ~STA_MODE; + + if (secs < MINSEC) + return 0; + + if (!(time_status & STA_FLL) && (secs <= MAXSEC)) + return 0; + + time_status |= STA_MODE; + + return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); +} + +static void ntp_update_offset(long offset) +{ + s64 freq_adj; + s64 offset64; + long secs; + + if (!(time_status & STA_PLL)) + return; + + if (!(time_status & STA_NANO)) { + /* Make sure the multiplication below won't overflow */ + offset = clamp(offset, -USEC_PER_SEC, USEC_PER_SEC); + offset *= NSEC_PER_USEC; + } + + /* + * Scale the phase adjustment and + * clamp to the operating range. + */ + offset = clamp(offset, -MAXPHASE, MAXPHASE); + + /* + * Select how the frequency is to be controlled + * and in which mode (PLL or FLL). + */ + secs = (long)(__ktime_get_real_seconds() - time_reftime); + if (unlikely(time_status & STA_FREQHOLD)) + secs = 0; + + time_reftime = __ktime_get_real_seconds(); + + offset64 = offset; + freq_adj = ntp_update_offset_fll(offset64, secs); + + /* + * Clamp update interval to reduce PLL gain with low + * sampling rate (e.g. intermittent network connection) + * to avoid instability. + */ + if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) + secs = 1 << (SHIFT_PLL + 1 + time_constant); + + freq_adj += (offset64 * secs) << + (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); + + freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); + + time_freq = max(freq_adj, -MAXFREQ_SCALED); + + time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); +} + +/** + * ntp_clear - Clears the NTP state variables + */ +void ntp_clear(void) +{ + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; + + ntp_update_frequency(); + + tick_length = tick_length_base; + time_offset = 0; + + ntp_next_leap_sec = TIME64_MAX; + /* Clear PPS state variables */ + pps_clear(); +} + + +u64 ntp_tick_length(void) +{ + return tick_length; +} + +/** + * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t + * + * Provides the time of the next leapsecond against CLOCK_REALTIME in + * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending. + */ +ktime_t ntp_get_next_leap(void) +{ + ktime_t ret; + + if ((time_state == TIME_INS) && (time_status & STA_INS)) + return ktime_set(ntp_next_leap_sec, 0); + ret = KTIME_MAX; + return ret; +} + +/* + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + * + * Also handles leap second processing, and returns leap offset + */ +int second_overflow(time64_t secs) +{ + s64 delta; + int leap = 0; + s32 rem; + + /* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. + */ + switch (time_state) { + case TIME_OK: + if (time_status & STA_INS) { + time_state = TIME_INS; + div_s64_rem(secs, SECS_PER_DAY, &rem); + ntp_next_leap_sec = secs + SECS_PER_DAY - rem; + } else if (time_status & STA_DEL) { + time_state = TIME_DEL; + div_s64_rem(secs + 1, SECS_PER_DAY, &rem); + ntp_next_leap_sec = secs + SECS_PER_DAY - rem; + } + break; + case TIME_INS: + if (!(time_status & STA_INS)) { + ntp_next_leap_sec = TIME64_MAX; + time_state = TIME_OK; + } else if (secs == ntp_next_leap_sec) { + leap = -1; + time_state = TIME_OOP; + printk(KERN_NOTICE + "Clock: inserting leap second 23:59:60 UTC\n"); + } + break; + case TIME_DEL: + if (!(time_status & STA_DEL)) { + ntp_next_leap_sec = TIME64_MAX; + time_state = TIME_OK; + } else if (secs == ntp_next_leap_sec) { + leap = 1; + ntp_next_leap_sec = TIME64_MAX; + time_state = TIME_WAIT; + printk(KERN_NOTICE + "Clock: deleting leap second 23:59:59 UTC\n"); + } + break; + case TIME_OOP: + ntp_next_leap_sec = TIME64_MAX; + time_state = TIME_WAIT; + break; + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + break; + } + + + /* Bump the maxerror field */ + time_maxerror += MAXFREQ / NSEC_PER_USEC; + if (time_maxerror > NTP_PHASE_LIMIT) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; + } + + /* Compute the phase adjustment for the next second */ + tick_length = tick_length_base; + + delta = ntp_offset_chunk(time_offset); + time_offset -= delta; + tick_length += delta; + + /* Check PPS signal */ + pps_dec_valid(); + + if (!time_adjust) + goto out; + + if (time_adjust > MAX_TICKADJ) { + time_adjust -= MAX_TICKADJ; + tick_length += MAX_TICKADJ_SCALED; + goto out; + } + + if (time_adjust < -MAX_TICKADJ) { + time_adjust += MAX_TICKADJ; + tick_length -= MAX_TICKADJ_SCALED; + goto out; + } + + tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) + << NTP_SCALE_SHIFT; + time_adjust = 0; + +out: + return leap; +} + +static void sync_hw_clock(struct work_struct *work); +static DECLARE_DELAYED_WORK(sync_work, sync_hw_clock); + +static void sched_sync_hw_clock(struct timespec64 now, + unsigned long target_nsec, bool fail) + +{ + struct timespec64 next; + + ktime_get_real_ts64(&next); + if (!fail) + next.tv_sec = 659; + else { + /* + * Try again as soon as possible. Delaying long periods + * decreases the accuracy of the work queue timer. Due to this + * the algorithm is very likely to require a short-sleep retry + * after the above long sleep to synchronize ts_nsec. + */ + next.tv_sec = 0; + } + + /* Compute the needed delay that will get to tv_nsec == target_nsec */ + next.tv_nsec = target_nsec - next.tv_nsec; + if (next.tv_nsec <= 0) + next.tv_nsec += NSEC_PER_SEC; + if (next.tv_nsec >= NSEC_PER_SEC) { + next.tv_sec++; + next.tv_nsec -= NSEC_PER_SEC; + } + + queue_delayed_work(system_power_efficient_wq, &sync_work, + timespec64_to_jiffies(&next)); +} + +static void sync_rtc_clock(void) +{ + unsigned long target_nsec; + struct timespec64 adjust, now; + int rc; + + if (!IS_ENABLED(CONFIG_RTC_SYSTOHC)) + return; + + ktime_get_real_ts64(&now); + + adjust = now; + if (persistent_clock_is_local) + adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); + + /* + * The current RTC in use will provide the target_nsec it wants to be + * called at, and does rtc_tv_nsec_ok internally. + */ + rc = rtc_set_ntp_time(adjust, &target_nsec); + if (rc == -ENODEV) + return; + + sched_sync_hw_clock(now, target_nsec, rc); +} + +#ifdef CONFIG_GENERIC_CMOS_UPDATE +int __weak update_persistent_clock(struct timespec now) +{ + return -ENODEV; +} + +int __weak update_persistent_clock64(struct timespec64 now64) +{ + struct timespec now; + + now = timespec64_to_timespec(now64); + return update_persistent_clock(now); +} +#endif + +static bool sync_cmos_clock(void) +{ + static bool no_cmos; + struct timespec64 now; + struct timespec64 adjust; + int rc = -EPROTO; + long target_nsec = NSEC_PER_SEC / 2; + + if (!IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE)) + return false; + + if (no_cmos) + return false; + + /* + * Historically update_persistent_clock64() has followed x86 + * semantics, which match the MC146818A/etc RTC. This RTC will store + * 'adjust' and then in .5s it will advance once second. + * + * Architectures are strongly encouraged to use rtclib and not + * implement this legacy API. + */ + ktime_get_real_ts64(&now); + if (rtc_tv_nsec_ok(-1 * target_nsec, &adjust, &now)) { + if (persistent_clock_is_local) + adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); + rc = update_persistent_clock64(adjust); + /* + * The machine does not support update_persistent_clock64 even + * though it defines CONFIG_GENERIC_CMOS_UPDATE. + */ + if (rc == -ENODEV) { + no_cmos = true; + return false; + } + } + + sched_sync_hw_clock(now, target_nsec, rc); + return true; +} + +/* + * If we have an externally synchronized Linux clock, then update RTC clock + * accordingly every ~11 minutes. Generally RTCs can only store second + * precision, but many RTCs will adjust the phase of their second tick to + * match the moment of update. This infrastructure arranges to call to the RTC + * set at the correct moment to phase synchronize the RTC second tick over + * with the kernel clock. + */ +static void sync_hw_clock(struct work_struct *work) +{ + if (!ntp_synced()) + return; + + if (sync_cmos_clock()) + return; + + sync_rtc_clock(); +} + +void ntp_notify_cmos_timer(void) +{ + if (!ntp_synced()) + return; + + if (IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE) || + IS_ENABLED(CONFIG_RTC_SYSTOHC)) + queue_delayed_work(system_power_efficient_wq, &sync_work, 0); +} + +/* + * Propagate a new txc->status value into the NTP state: + */ +static inline void process_adj_status(const struct timex *txc) +{ + if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { + time_state = TIME_OK; + time_status = STA_UNSYNC; + ntp_next_leap_sec = TIME64_MAX; + /* restart PPS frequency calibration */ + pps_reset_freq_interval(); + } + + /* + * If we turn on PLL adjustments then reset the + * reference time to current time. + */ + if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) + time_reftime = __ktime_get_real_seconds(); + + /* only set allowed bits */ + time_status &= STA_RONLY; + time_status |= txc->status & ~STA_RONLY; +} + + +static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai) +{ + if (txc->modes & ADJ_STATUS) + process_adj_status(txc); + + if (txc->modes & ADJ_NANO) + time_status |= STA_NANO; + + if (txc->modes & ADJ_MICRO) + time_status &= ~STA_NANO; + + if (txc->modes & ADJ_FREQUENCY) { + time_freq = txc->freq * PPM_SCALE; + time_freq = min(time_freq, MAXFREQ_SCALED); + time_freq = max(time_freq, -MAXFREQ_SCALED); + /* update pps_freq */ + pps_set_freq(time_freq); + } + + if (txc->modes & ADJ_MAXERROR) + time_maxerror = txc->maxerror; + + if (txc->modes & ADJ_ESTERROR) + time_esterror = txc->esterror; + + if (txc->modes & ADJ_TIMECONST) { + time_constant = txc->constant; + if (!(time_status & STA_NANO)) + time_constant += 4; + time_constant = min(time_constant, (long)MAXTC); + time_constant = max(time_constant, 0l); + } + + if (txc->modes & ADJ_TAI && + txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) + *time_tai = txc->constant; + + if (txc->modes & ADJ_OFFSET) + ntp_update_offset(txc->offset); + + if (txc->modes & ADJ_TICK) + tick_usec = txc->tick; + + if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) + ntp_update_frequency(); +} + + +/* + * adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. + */ +int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai) +{ + int result; + + if (txc->modes & ADJ_ADJTIME) { + long save_adjust = time_adjust; + + if (!(txc->modes & ADJ_OFFSET_READONLY)) { + /* adjtime() is independent from ntp_adjtime() */ + time_adjust = txc->offset; + ntp_update_frequency(); + } + txc->offset = save_adjust; + } else { + + /* If there are input parameters, then process them: */ + if (txc->modes) + process_adjtimex_modes(txc, time_tai); + + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, + NTP_SCALE_SHIFT); + if (!(time_status & STA_NANO)) + txc->offset /= NSEC_PER_USEC; + } + + result = time_state; /* mostly `TIME_OK' */ + /* check for errors */ + if (is_error_status(time_status)) + result = TIME_ERROR; + + txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * + PPM_SCALE_INV, NTP_SCALE_SHIFT); + txc->maxerror = time_maxerror; + txc->esterror = time_esterror; + txc->status = time_status; + txc->constant = time_constant; + txc->precision = 1; + txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; + txc->tick = tick_usec; + txc->tai = *time_tai; + + /* fill PPS status fields */ + pps_fill_timex(txc); + + txc->time.tv_sec = (time_t)ts->tv_sec; + txc->time.tv_usec = ts->tv_nsec; + if (!(time_status & STA_NANO)) + txc->time.tv_usec /= NSEC_PER_USEC; + + /* Handle leapsec adjustments */ + if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { + if ((time_state == TIME_INS) && (time_status & STA_INS)) { + result = TIME_OOP; + txc->tai++; + txc->time.tv_sec--; + } + if ((time_state == TIME_DEL) && (time_status & STA_DEL)) { + result = TIME_WAIT; + txc->tai--; + txc->time.tv_sec++; + } + if ((time_state == TIME_OOP) && + (ts->tv_sec == ntp_next_leap_sec)) { + result = TIME_WAIT; + } + } + + return result; +} + +#ifdef CONFIG_NTP_PPS + +/* actually struct pps_normtime is good old struct timespec, but it is + * semantically different (and it is the reason why it was invented): + * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] + * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ +struct pps_normtime { + s64 sec; /* seconds */ + long nsec; /* nanoseconds */ +}; + +/* normalize the timestamp so that nsec is in the + ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ +static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts) +{ + struct pps_normtime norm = { + .sec = ts.tv_sec, + .nsec = ts.tv_nsec + }; + + if (norm.nsec > (NSEC_PER_SEC >> 1)) { + norm.nsec -= NSEC_PER_SEC; + norm.sec++; + } + + return norm; +} + +/* get current phase correction and jitter */ +static inline long pps_phase_filter_get(long *jitter) +{ + *jitter = pps_tf[0] - pps_tf[1]; + if (*jitter < 0) + *jitter = -*jitter; + + /* TODO: test various filters */ + return pps_tf[0]; +} + +/* add the sample to the phase filter */ +static inline void pps_phase_filter_add(long err) +{ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = err; +} + +/* decrease frequency calibration interval length. + * It is halved after four consecutive unstable intervals. + */ +static inline void pps_dec_freq_interval(void) +{ + if (--pps_intcnt <= -PPS_INTCOUNT) { + pps_intcnt = -PPS_INTCOUNT; + if (pps_shift > PPS_INTMIN) { + pps_shift--; + pps_intcnt = 0; + } + } +} + +/* increase frequency calibration interval length. + * It is doubled after four consecutive stable intervals. + */ +static inline void pps_inc_freq_interval(void) +{ + if (++pps_intcnt >= PPS_INTCOUNT) { + pps_intcnt = PPS_INTCOUNT; + if (pps_shift < PPS_INTMAX) { + pps_shift++; + pps_intcnt = 0; + } + } +} + +/* update clock frequency based on MONOTONIC_RAW clock PPS signal + * timestamps + * + * At the end of the calibration interval the difference between the + * first and last MONOTONIC_RAW clock timestamps divided by the length + * of the interval becomes the frequency update. If the interval was + * too long, the data are discarded. + * Returns the difference between old and new frequency values. + */ +static long hardpps_update_freq(struct pps_normtime freq_norm) +{ + long delta, delta_mod; + s64 ftemp; + + /* check if the frequency interval was too long */ + if (freq_norm.sec > (2 << pps_shift)) { + time_status |= STA_PPSERROR; + pps_errcnt++; + pps_dec_freq_interval(); + printk_deferred(KERN_ERR + "hardpps: PPSERROR: interval too long - %lld s\n", + freq_norm.sec); + return 0; + } + + /* here the raw frequency offset and wander (stability) is + * calculated. If the wander is less than the wander threshold + * the interval is increased; otherwise it is decreased. + */ + ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, + freq_norm.sec); + delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); + pps_freq = ftemp; + if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { + printk_deferred(KERN_WARNING + "hardpps: PPSWANDER: change=%ld\n", delta); + time_status |= STA_PPSWANDER; + pps_stbcnt++; + pps_dec_freq_interval(); + } else { /* good sample */ + pps_inc_freq_interval(); + } + + /* the stability metric is calculated as the average of recent + * frequency changes, but is used only for performance + * monitoring + */ + delta_mod = delta; + if (delta_mod < 0) + delta_mod = -delta_mod; + pps_stabil += (div_s64(((s64)delta_mod) << + (NTP_SCALE_SHIFT - SHIFT_USEC), + NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; + + /* if enabled, the system clock frequency is updated */ + if ((time_status & STA_PPSFREQ) != 0 && + (time_status & STA_FREQHOLD) == 0) { + time_freq = pps_freq; + ntp_update_frequency(); + } + + return delta; +} + +/* correct REALTIME clock phase error against PPS signal */ +static void hardpps_update_phase(long error) +{ + long correction = -error; + long jitter; + + /* add the sample to the median filter */ + pps_phase_filter_add(correction); + correction = pps_phase_filter_get(&jitter); + + /* Nominal jitter is due to PPS signal noise. If it exceeds the + * threshold, the sample is discarded; otherwise, if so enabled, + * the time offset is updated. + */ + if (jitter > (pps_jitter << PPS_POPCORN)) { + printk_deferred(KERN_WARNING + "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (pps_jitter << PPS_POPCORN)); + time_status |= STA_PPSJITTER; + pps_jitcnt++; + } else if (time_status & STA_PPSTIME) { + /* correct the time using the phase offset */ + time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, + NTP_INTERVAL_FREQ); + /* cancel running adjtime() */ + time_adjust = 0; + } + /* update jitter */ + pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; +} + +/* + * __hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS signal arrival in order to + * discipline the CPU clock oscillator to the PPS signal. It takes two + * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former + * is used to correct clock phase error and the latter is used to + * correct the frequency. + * + * This code is based on David Mills's reference nanokernel + * implementation. It was mostly rewritten but keeps the same idea. + */ +void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) +{ + struct pps_normtime pts_norm, freq_norm; + + pts_norm = pps_normalize_ts(*phase_ts); + + /* clear the error bits, they will be set again if needed */ + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + + /* indicate signal presence */ + time_status |= STA_PPSSIGNAL; + pps_valid = PPS_VALID; + + /* when called for the first time, + * just start the frequency interval */ + if (unlikely(pps_fbase.tv_sec == 0)) { + pps_fbase = *raw_ts; + return; + } + + /* ok, now we have a base for frequency calculation */ + freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase)); + + /* check that the signal is in the range + * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ + if ((freq_norm.sec == 0) || + (freq_norm.nsec > MAXFREQ * freq_norm.sec) || + (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { + time_status |= STA_PPSJITTER; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); + return; + } + + /* signal is ok */ + + /* check if the current frequency interval is finished */ + if (freq_norm.sec >= (1 << pps_shift)) { + pps_calcnt++; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + hardpps_update_freq(freq_norm); + } + + hardpps_update_phase(pts_norm.nsec); + +} +#endif /* CONFIG_NTP_PPS */ + +static int __init ntp_tick_adj_setup(char *str) +{ + int rc = kstrtos64(str, 0, &ntp_tick_adj); + if (rc) + return rc; + + ntp_tick_adj <<= NTP_SCALE_SHIFT; + return 1; +} + +__setup("ntp_tick_adj=", ntp_tick_adj_setup); + +void __init ntp_init(void) +{ + ntp_clear(); +} diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h new file mode 100644 index 000000000..c24b0e13f --- /dev/null +++ b/kernel/time/ntp_internal.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NTP_INTERNAL_H +#define _LINUX_NTP_INTERNAL_H + +extern void ntp_init(void); +extern void ntp_clear(void); +/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ +extern u64 ntp_tick_length(void); +extern ktime_t ntp_get_next_leap(void); +extern int second_overflow(time64_t secs); +extern int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai); +extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); +#endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c new file mode 100644 index 000000000..c8a8501fa --- /dev/null +++ b/kernel/time/posix-clock.c @@ -0,0 +1,330 @@ +/* + * posix-clock.c - support for dynamic clock devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/device.h> +#include <linux/export.h> +#include <linux/file.h> +#include <linux/posix-clock.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/uaccess.h> + +#include "posix-timers.h" + +/* + * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. + */ +static struct posix_clock *get_posix_clock(struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + + down_read(&clk->rwsem); + + if (!clk->zombie) + return clk; + + up_read(&clk->rwsem); + + return NULL; +} + +static void put_posix_clock(struct posix_clock *clk) +{ + up_read(&clk->rwsem); +} + +static ssize_t posix_clock_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -EINVAL; + + if (!clk) + return -ENODEV; + + if (clk->ops.read) + err = clk->ops.read(clk, fp->f_flags, buf, count); + + put_posix_clock(clk); + + return err; +} + +static __poll_t posix_clock_poll(struct file *fp, poll_table *wait) +{ + struct posix_clock *clk = get_posix_clock(fp); + __poll_t result = 0; + + if (!clk) + return EPOLLERR; + + if (clk->ops.poll) + result = clk->ops.poll(clk, fp, wait); + + put_posix_clock(clk); + + return result; +} + +static long posix_clock_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} + +#ifdef CONFIG_COMPAT +static long posix_clock_compat_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} +#endif + +static int posix_clock_open(struct inode *inode, struct file *fp) +{ + int err; + struct posix_clock *clk = + container_of(inode->i_cdev, struct posix_clock, cdev); + + down_read(&clk->rwsem); + + if (clk->zombie) { + err = -ENODEV; + goto out; + } + if (clk->ops.open) + err = clk->ops.open(clk, fp->f_mode); + else + err = 0; + + if (!err) { + get_device(clk->dev); + fp->private_data = clk; + } +out: + up_read(&clk->rwsem); + return err; +} + +static int posix_clock_release(struct inode *inode, struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + int err = 0; + + if (clk->ops.release) + err = clk->ops.release(clk); + + put_device(clk->dev); + + fp->private_data = NULL; + + return err; +} + +static const struct file_operations posix_clock_file_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = posix_clock_read, + .poll = posix_clock_poll, + .unlocked_ioctl = posix_clock_ioctl, + .open = posix_clock_open, + .release = posix_clock_release, +#ifdef CONFIG_COMPAT + .compat_ioctl = posix_clock_compat_ioctl, +#endif +}; + +int posix_clock_register(struct posix_clock *clk, struct device *dev) +{ + int err; + + init_rwsem(&clk->rwsem); + + cdev_init(&clk->cdev, &posix_clock_file_operations); + err = cdev_device_add(&clk->cdev, dev); + if (err) { + pr_err("%s unable to add device %d:%d\n", + dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt)); + return err; + } + clk->cdev.owner = clk->ops.owner; + clk->dev = dev; + + return 0; +} +EXPORT_SYMBOL_GPL(posix_clock_register); + +void posix_clock_unregister(struct posix_clock *clk) +{ + cdev_device_del(&clk->cdev, clk->dev); + + down_write(&clk->rwsem); + clk->zombie = true; + up_write(&clk->rwsem); + + put_device(clk->dev); +} +EXPORT_SYMBOL_GPL(posix_clock_unregister); + +struct posix_clock_desc { + struct file *fp; + struct posix_clock *clk; +}; + +static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) +{ + struct file *fp = fget(clockid_to_fd(id)); + int err = -EINVAL; + + if (!fp) + return err; + + if (fp->f_op->open != posix_clock_open || !fp->private_data) + goto out; + + cd->fp = fp; + cd->clk = get_posix_clock(fp); + + err = cd->clk ? 0 : -ENODEV; +out: + if (err) + fput(fp); + return err; +} + +static void put_clock_desc(struct posix_clock_desc *cd) +{ + put_posix_clock(cd->clk); + fput(cd->fp); +} + +static int pc_clock_adjtime(clockid_t id, struct timex *tx) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + + if (cd.clk->ops.clock_adjtime) + err = cd.clk->ops.clock_adjtime(cd.clk, tx); + else + err = -EOPNOTSUPP; +out: + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_gettime(clockid_t id, struct timespec64 *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_gettime) + err = cd.clk->ops.clock_gettime(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_getres(clockid_t id, struct timespec64 *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_getres) + err = cd.clk->ops.clock_getres(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + + if (cd.clk->ops.clock_settime) + err = cd.clk->ops.clock_settime(cd.clk, ts); + else + err = -EOPNOTSUPP; +out: + put_clock_desc(&cd); + + return err; +} + +const struct k_clock clock_posix_dynamic = { + .clock_getres = pc_clock_getres, + .clock_set = pc_clock_settime, + .clock_get = pc_clock_gettime, + .clock_adj = pc_clock_adjtime, +}; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c new file mode 100644 index 000000000..bfaa44a80 --- /dev/null +++ b/kernel/time/posix-cpu-timers.c @@ -0,0 +1,1452 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement CPU time clocks for the POSIX clock interface. + */ + +#include <linux/sched/signal.h> +#include <linux/sched/cputime.h> +#include <linux/posix-timers.h> +#include <linux/errno.h> +#include <linux/math64.h> +#include <linux/uaccess.h> +#include <linux/kernel_stat.h> +#include <trace/events/timer.h> +#include <linux/tick.h> +#include <linux/workqueue.h> +#include <linux/compat.h> +#include <linux/sched/deadline.h> + +#include "posix-timers.h" + +static void posix_cpu_timer_rearm(struct k_itimer *timer); + +/* + * Called after updating RLIMIT_CPU to run cpu timer and update + * tsk->signal->cputime_expires expiration cache if necessary. Needs + * siglock protection since other code may update expiration cache as + * well. + */ +void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) +{ + u64 nsecs = rlim_new * NSEC_PER_SEC; + + spin_lock_irq(&task->sighand->siglock); + set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL); + spin_unlock_irq(&task->sighand->siglock); +} + +static int check_clock(const clockid_t which_clock) +{ + int error = 0; + struct task_struct *p; + const pid_t pid = CPUCLOCK_PID(which_clock); + + if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX) + return -EINVAL; + + if (pid == 0) + return 0; + + rcu_read_lock(); + p = find_task_by_vpid(pid); + if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? + same_thread_group(p, current) : has_group_leader_pid(p))) { + error = -EINVAL; + } + rcu_read_unlock(); + + return error; +} + +/* + * Update expiry time from increment, and increase overrun count, + * given the current clock sample. + */ +static void bump_cpu_timer(struct k_itimer *timer, u64 now) +{ + int i; + u64 delta, incr; + + if (timer->it.cpu.incr == 0) + return; + + if (now < timer->it.cpu.expires) + return; + + incr = timer->it.cpu.incr; + delta = now + incr - timer->it.cpu.expires; + + /* Don't use (incr*2 < delta), incr*2 might overflow. */ + for (i = 0; incr < delta - incr; i++) + incr = incr << 1; + + for (; i >= 0; incr >>= 1, i--) { + if (delta < incr) + continue; + + timer->it.cpu.expires += incr; + timer->it_overrun += 1LL << i; + delta -= incr; + } +} + +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) + return 1; + return 0; +} + +static inline u64 prof_ticks(struct task_struct *p) +{ + u64 utime, stime; + + task_cputime(p, &utime, &stime); + + return utime + stime; +} +static inline u64 virt_ticks(struct task_struct *p) +{ + u64 utime, stime; + + task_cputime(p, &utime, &stime); + + return utime; +} + +static int +posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) +{ + int error = check_clock(which_clock); + if (!error) { + tp->tv_sec = 0; + tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + /* + * If sched_clock is using a cycle counter, we + * don't have any idea of its true resolution + * exported, but it is much more than 1s/HZ. + */ + tp->tv_nsec = 1; + } + } + return error; +} + +static int +posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp) +{ + /* + * You can never reset a CPU clock, but we check for other errors + * in the call before failing with EPERM. + */ + int error = check_clock(which_clock); + if (error == 0) { + error = -EPERM; + } + return error; +} + + +/* + * Sample a per-thread clock for the given task. + */ +static int cpu_clock_sample(const clockid_t which_clock, + struct task_struct *p, u64 *sample) +{ + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + *sample = prof_ticks(p); + break; + case CPUCLOCK_VIRT: + *sample = virt_ticks(p); + break; + case CPUCLOCK_SCHED: + *sample = task_sched_runtime(p); + break; + } + return 0; +} + +/* + * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg + * to avoid race conditions with concurrent updates to cputime. + */ +static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime) +{ + u64 curr_cputime; +retry: + curr_cputime = atomic64_read(cputime); + if (sum_cputime > curr_cputime) { + if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime) + goto retry; + } +} + +static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum) +{ + __update_gt_cputime(&cputime_atomic->utime, sum->utime); + __update_gt_cputime(&cputime_atomic->stime, sum->stime); + __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime); +} + +/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */ +static inline void sample_cputime_atomic(struct task_cputime *times, + struct task_cputime_atomic *atomic_times) +{ + times->utime = atomic64_read(&atomic_times->utime); + times->stime = atomic64_read(&atomic_times->stime); + times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime); +} + +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct task_cputime sum; + + /* Check if cputimer isn't running. This is accessed without locking. */ + if (!READ_ONCE(cputimer->running)) { + /* + * The POSIX timer interface allows for absolute time expiry + * values through the TIMER_ABSTIME flag, therefore we have + * to synchronize the timer to the clock every time we start it. + */ + thread_group_cputime(tsk, &sum); + update_gt_cputime(&cputimer->cputime_atomic, &sum); + + /* + * We're setting cputimer->running without a lock. Ensure + * this only gets written to in one operation. We set + * running after update_gt_cputime() as a small optimization, + * but barriers are not required because update_gt_cputime() + * can handle concurrent updates. + */ + WRITE_ONCE(cputimer->running, true); + } + sample_cputime_atomic(times, &cputimer->cputime_atomic); +} + +/* + * Sample a process (thread group) clock for the given group_leader task. + * Must be called with task sighand lock held for safe while_each_thread() + * traversal. + */ +static int cpu_clock_sample_group(const clockid_t which_clock, + struct task_struct *p, + u64 *sample) +{ + struct task_cputime cputime; + + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + thread_group_cputime(p, &cputime); + *sample = cputime.utime + cputime.stime; + break; + case CPUCLOCK_VIRT: + thread_group_cputime(p, &cputime); + *sample = cputime.utime; + break; + case CPUCLOCK_SCHED: + thread_group_cputime(p, &cputime); + *sample = cputime.sum_exec_runtime; + break; + } + return 0; +} + +static int posix_cpu_clock_get_task(struct task_struct *tsk, + const clockid_t which_clock, + struct timespec64 *tp) +{ + int err = -EINVAL; + u64 rtn; + + if (CPUCLOCK_PERTHREAD(which_clock)) { + if (same_thread_group(tsk, current)) + err = cpu_clock_sample(which_clock, tsk, &rtn); + } else { + if (tsk == current || thread_group_leader(tsk)) + err = cpu_clock_sample_group(which_clock, tsk, &rtn); + } + + if (!err) + *tp = ns_to_timespec64(rtn); + + return err; +} + + +static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp) +{ + const pid_t pid = CPUCLOCK_PID(which_clock); + int err = -EINVAL; + + if (pid == 0) { + /* + * Special case constant value for our own clocks. + * We don't have to do any lookup to find ourselves. + */ + err = posix_cpu_clock_get_task(current, which_clock, tp); + } else { + /* + * Find the given PID, and validate that the caller + * should be able to see it. + */ + struct task_struct *p; + rcu_read_lock(); + p = find_task_by_vpid(pid); + if (p) + err = posix_cpu_clock_get_task(p, which_clock, tp); + rcu_read_unlock(); + } + + return err; +} + +/* + * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. + * This is called from sys_timer_create() and do_cpu_nanosleep() with the + * new timer already all-zeros initialized. + */ +static int posix_cpu_timer_create(struct k_itimer *new_timer) +{ + int ret = 0; + const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); + struct task_struct *p; + + if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) + return -EINVAL; + + new_timer->kclock = &clock_posix_cpu; + + INIT_LIST_HEAD(&new_timer->it.cpu.entry); + + rcu_read_lock(); + if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { + if (pid == 0) { + p = current; + } else { + p = find_task_by_vpid(pid); + if (p && !same_thread_group(p, current)) + p = NULL; + } + } else { + if (pid == 0) { + p = current->group_leader; + } else { + p = find_task_by_vpid(pid); + if (p && !has_group_leader_pid(p)) + p = NULL; + } + } + new_timer->it.cpu.task = p; + if (p) { + get_task_struct(p); + } else { + ret = -EINVAL; + } + rcu_read_unlock(); + + return ret; +} + +/* + * Clean up a CPU-clock timer that is about to be destroyed. + * This is called from timer deletion with the timer already locked. + * If we return TIMER_RETRY, it's necessary to release the timer's lock + * and try again. (This happens when the timer is in the middle of firing.) + */ +static int posix_cpu_timer_del(struct k_itimer *timer) +{ + int ret = 0; + unsigned long flags; + struct sighand_struct *sighand; + struct task_struct *p = timer->it.cpu.task; + + if (WARN_ON_ONCE(!p)) + return -EINVAL; + + /* + * Protect against sighand release/switch in exit/exec and process/ + * thread timer list entry concurrent read/writes. + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { + /* + * We raced with the reaping of the task. + * The deletion should have cleared us off the list. + */ + WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry)); + } else { + if (timer->it.cpu.firing) + ret = TIMER_RETRY; + else + list_del(&timer->it.cpu.entry); + + unlock_task_sighand(p, &flags); + } + + if (!ret) + put_task_struct(p); + + return ret; +} + +static void cleanup_timers_list(struct list_head *head) +{ + struct cpu_timer_list *timer, *next; + + list_for_each_entry_safe(timer, next, head, entry) + list_del_init(&timer->entry); +} + +/* + * Clean out CPU timers still ticking when a thread exited. The task + * pointer is cleared, and the expiry time is replaced with the residual + * time for later timer_gettime calls to return. + * This must be called with the siglock held. + */ +static void cleanup_timers(struct list_head *head) +{ + cleanup_timers_list(head); + cleanup_timers_list(++head); + cleanup_timers_list(++head); +} + +/* + * These are both called with the siglock held, when the current thread + * is being reaped. When the final (leader) thread in the group is reaped, + * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit. + */ +void posix_cpu_timers_exit(struct task_struct *tsk) +{ + cleanup_timers(tsk->cpu_timers); +} +void posix_cpu_timers_exit_group(struct task_struct *tsk) +{ + cleanup_timers(tsk->signal->cpu_timers); +} + +static inline int expires_gt(u64 expires, u64 new_exp) +{ + return expires == 0 || expires > new_exp; +} + +/* + * Insert the timer on the appropriate list before any timers that + * expire later. This must be called with the sighand lock held. + */ +static void arm_timer(struct k_itimer *timer) +{ + struct task_struct *p = timer->it.cpu.task; + struct list_head *head, *listpos; + struct task_cputime *cputime_expires; + struct cpu_timer_list *const nt = &timer->it.cpu; + struct cpu_timer_list *next; + + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + head = p->cpu_timers; + cputime_expires = &p->cputime_expires; + } else { + head = p->signal->cpu_timers; + cputime_expires = &p->signal->cputime_expires; + } + head += CPUCLOCK_WHICH(timer->it_clock); + + listpos = head; + list_for_each_entry(next, head, entry) { + if (nt->expires < next->expires) + break; + listpos = &next->entry; + } + list_add(&nt->entry, listpos); + + if (listpos == head) { + u64 exp = nt->expires; + + /* + * We are the new earliest-expiring POSIX 1.b timer, hence + * need to update expiration cache. Take into account that + * for process timers we share expiration cache with itimers + * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. + */ + + switch (CPUCLOCK_WHICH(timer->it_clock)) { + case CPUCLOCK_PROF: + if (expires_gt(cputime_expires->prof_exp, exp)) + cputime_expires->prof_exp = exp; + break; + case CPUCLOCK_VIRT: + if (expires_gt(cputime_expires->virt_exp, exp)) + cputime_expires->virt_exp = exp; + break; + case CPUCLOCK_SCHED: + if (expires_gt(cputime_expires->sched_exp, exp)) + cputime_expires->sched_exp = exp; + break; + } + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); + else + tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); + } +} + +/* + * The timer is locked, fire it and arrange for its reload. + */ +static void cpu_timer_fire(struct k_itimer *timer) +{ + if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { + /* + * User don't want any signal. + */ + timer->it.cpu.expires = 0; + } else if (unlikely(timer->sigq == NULL)) { + /* + * This a special case for clock_nanosleep, + * not a normal timer from sys_timer_create. + */ + wake_up_process(timer->it_process); + timer->it.cpu.expires = 0; + } else if (timer->it.cpu.incr == 0) { + /* + * One-shot timer. Clear it as soon as it's fired. + */ + posix_timer_event(timer, 0); + timer->it.cpu.expires = 0; + } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { + /* + * The signal did not get queued because the signal + * was ignored, so we won't get any callback to + * reload the timer. But we need to keep it + * ticking in case the signal is deliverable next time. + */ + posix_cpu_timer_rearm(timer); + ++timer->it_requeue_pending; + } +} + +/* + * Sample a process (thread group) timer for the given group_leader task. + * Must be called with task sighand lock held for safe while_each_thread() + * traversal. + */ +static int cpu_timer_sample_group(const clockid_t which_clock, + struct task_struct *p, u64 *sample) +{ + struct task_cputime cputime; + + thread_group_cputimer(p, &cputime); + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + *sample = cputime.utime + cputime.stime; + break; + case CPUCLOCK_VIRT: + *sample = cputime.utime; + break; + case CPUCLOCK_SCHED: + *sample = cputime.sum_exec_runtime; + break; + } + return 0; +} + +/* + * Guts of sys_timer_settime for CPU timers. + * This is called with the timer locked and interrupts disabled. + * If we return TIMER_RETRY, it's necessary to release the timer's lock + * and try again. (This happens when the timer is in the middle of firing.) + */ +static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, + struct itimerspec64 *new, struct itimerspec64 *old) +{ + unsigned long flags; + struct sighand_struct *sighand; + struct task_struct *p = timer->it.cpu.task; + u64 old_expires, new_expires, old_incr, val; + int ret; + + if (WARN_ON_ONCE(!p)) + return -EINVAL; + + /* + * Use the to_ktime conversion because that clamps the maximum + * value to KTIME_MAX and avoid multiplication overflows. + */ + new_expires = ktime_to_ns(timespec64_to_ktime(new->it_value)); + + /* + * Protect against sighand release/switch in exit/exec and p->cpu_timers + * and p->signal->cpu_timers read/write in arm_timer() + */ + sighand = lock_task_sighand(p, &flags); + /* + * If p has just been reaped, we can no + * longer get any information about it at all. + */ + if (unlikely(sighand == NULL)) { + return -ESRCH; + } + + /* + * Disarm any old timer after extracting its expiry time. + */ + + ret = 0; + old_incr = timer->it.cpu.incr; + old_expires = timer->it.cpu.expires; + if (unlikely(timer->it.cpu.firing)) { + timer->it.cpu.firing = -1; + ret = TIMER_RETRY; + } else + list_del_init(&timer->it.cpu.entry); + + /* + * We need to sample the current value to convert the new + * value from to relative and absolute, and to convert the + * old value from absolute to relative. To set a process + * timer, we need a sample to balance the thread expiry + * times (in arm_timer). With an absolute time, we must + * check if it's already passed. In short, we need a sample. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &val); + } else { + cpu_timer_sample_group(timer->it_clock, p, &val); + } + + if (old) { + if (old_expires == 0) { + old->it_value.tv_sec = 0; + old->it_value.tv_nsec = 0; + } else { + /* + * Update the timer in case it has + * overrun already. If it has, + * we'll report it as having overrun + * and with the next reloaded timer + * already ticking, though we are + * swallowing that pending + * notification here to install the + * new setting. + */ + bump_cpu_timer(timer, val); + if (val < timer->it.cpu.expires) { + old_expires = timer->it.cpu.expires - val; + old->it_value = ns_to_timespec64(old_expires); + } else { + old->it_value.tv_nsec = 1; + old->it_value.tv_sec = 0; + } + } + } + + if (unlikely(ret)) { + /* + * We are colliding with the timer actually firing. + * Punt after filling in the timer's old value, and + * disable this firing since we are already reporting + * it as an overrun (thanks to bump_cpu_timer above). + */ + unlock_task_sighand(p, &flags); + goto out; + } + + if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) { + new_expires += val; + } + + /* + * Install the new expiry time (or zero). + * For a timer with no notification action, we don't actually + * arm the timer (we'll just fake it for timer_gettime). + */ + timer->it.cpu.expires = new_expires; + if (new_expires != 0 && val < new_expires) { + arm_timer(timer); + } + + unlock_task_sighand(p, &flags); + /* + * Install the new reload setting, and + * set up the signal and overrun bookkeeping. + */ + timer->it.cpu.incr = timespec64_to_ns(&new->it_interval); + timer->it_interval = ns_to_ktime(timer->it.cpu.incr); + + /* + * This acts as a modification timestamp for the timer, + * so any automatic reload attempt will punt on seeing + * that we have reset the timer manually. + */ + timer->it_requeue_pending = (timer->it_requeue_pending + 2) & + ~REQUEUE_PENDING; + timer->it_overrun_last = 0; + timer->it_overrun = -1; + + if (new_expires != 0 && !(val < new_expires)) { + /* + * The designated time already passed, so we notify + * immediately, even if the thread never runs to + * accumulate more time on this clock. + */ + cpu_timer_fire(timer); + } + + ret = 0; + out: + if (old) + old->it_interval = ns_to_timespec64(old_incr); + + return ret; +} + +static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) +{ + struct task_struct *p = timer->it.cpu.task; + u64 now; + + if (WARN_ON_ONCE(!p)) + return; + + /* + * Easy part: convert the reload time. + */ + itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); + + if (!timer->it.cpu.expires) + return; + + /* + * Sample the clock to take the difference with the expiry time. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &now); + } else { + struct sighand_struct *sighand; + unsigned long flags; + + /* + * Protect against sighand release/switch in exit/exec and + * also make timer sampling safe if it ends up calling + * thread_group_cputime(). + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { + /* + * The process has been reaped. + * We can't even collect a sample any more. + * Call the timer disarmed, nothing else to do. + */ + timer->it.cpu.expires = 0; + return; + } else { + cpu_timer_sample_group(timer->it_clock, p, &now); + unlock_task_sighand(p, &flags); + } + } + + if (now < timer->it.cpu.expires) { + itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now); + } else { + /* + * The timer should have expired already, but the firing + * hasn't taken place yet. Say it's just about to expire. + */ + itp->it_value.tv_nsec = 1; + itp->it_value.tv_sec = 0; + } +} + +static unsigned long long +check_timers_list(struct list_head *timers, + struct list_head *firing, + unsigned long long curr) +{ + int maxfire = 20; + + while (!list_empty(timers)) { + struct cpu_timer_list *t; + + t = list_first_entry(timers, struct cpu_timer_list, entry); + + if (!--maxfire || curr < t->expires) + return t->expires; + + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + return 0; +} + +static inline void check_dl_overrun(struct task_struct *tsk) +{ + if (tsk->dl.dl_overrun) { + tsk->dl.dl_overrun = 0; + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } +} + +/* + * Check for any per-thread CPU timers that have fired and move them off + * the tsk->cpu_timers[N] list onto the firing list. Here we update the + * tsk->it_*_expires values to reflect the remaining thread CPU timers. + */ +static void check_thread_timers(struct task_struct *tsk, + struct list_head *firing) +{ + struct list_head *timers = tsk->cpu_timers; + struct task_cputime *tsk_expires = &tsk->cputime_expires; + u64 expires; + unsigned long soft; + + if (dl_task(tsk)) + check_dl_overrun(tsk); + + /* + * If cputime_expires is zero, then there are no active + * per thread CPU timers. + */ + if (task_cputime_zero(&tsk->cputime_expires)) + return; + + expires = check_timers_list(timers, firing, prof_ticks(tsk)); + tsk_expires->prof_exp = expires; + + expires = check_timers_list(++timers, firing, virt_ticks(tsk)); + tsk_expires->virt_exp = expires; + + tsk_expires->sched_exp = check_timers_list(++timers, firing, + tsk->se.sum_exec_runtime); + + /* + * Check for the special case thread timers. + */ + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + if (hard != RLIM_INFINITY && + tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + /* + * At the hard limit, we just die. + * No need to calculate anything else now. + */ + if (print_fatal_signals) { + pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } + if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { + /* + * At the soft limit, send a SIGXCPU every second. + */ + if (soft < hard) { + soft += USEC_PER_SEC; + tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = + soft; + } + if (print_fatal_signals) { + pr_info("RT Watchdog Timeout (soft): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } + if (task_cputime_zero(tsk_expires)) + tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); +} + +static inline void stop_process_timers(struct signal_struct *sig) +{ + struct thread_group_cputimer *cputimer = &sig->cputimer; + + /* Turn off cputimer->running. This is done without locking. */ + WRITE_ONCE(cputimer->running, false); + tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); +} + +static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, + u64 *expires, u64 cur_time, int signo) +{ + if (!it->expires) + return; + + if (cur_time >= it->expires) { + if (it->incr) + it->expires += it->incr; + else + it->expires = 0; + + trace_itimer_expire(signo == SIGPROF ? + ITIMER_PROF : ITIMER_VIRTUAL, + task_tgid(tsk), cur_time); + __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); + } + + if (it->expires && (!*expires || it->expires < *expires)) + *expires = it->expires; +} + +/* + * Check for any per-thread CPU timers that have fired and move them + * off the tsk->*_timers list onto the firing list. Per-thread timers + * have already been taken off. + */ +static void check_process_timers(struct task_struct *tsk, + struct list_head *firing) +{ + struct signal_struct *const sig = tsk->signal; + u64 utime, ptime, virt_expires, prof_expires; + u64 sum_sched_runtime, sched_expires; + struct list_head *timers = sig->cpu_timers; + struct task_cputime cputime; + unsigned long soft; + + if (dl_task(tsk)) + check_dl_overrun(tsk); + + /* + * If cputimer is not running, then there are no active + * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). + */ + if (!READ_ONCE(tsk->signal->cputimer.running)) + return; + + /* + * Signify that a thread is checking for process timers. + * Write access to this field is protected by the sighand lock. + */ + sig->cputimer.checking_timer = true; + + /* + * Collect the current process totals. + */ + thread_group_cputimer(tsk, &cputime); + utime = cputime.utime; + ptime = utime + cputime.stime; + sum_sched_runtime = cputime.sum_exec_runtime; + + prof_expires = check_timers_list(timers, firing, ptime); + virt_expires = check_timers_list(++timers, firing, utime); + sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); + + /* + * Check for the special case process timers. + */ + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, + SIGPROF); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, + SIGVTALRM); + soft = task_rlimit(tsk, RLIMIT_CPU); + if (soft != RLIM_INFINITY) { + unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU); + u64 x; + if (psecs >= hard) { + /* + * At the hard limit, we just die. + * No need to calculate anything else now. + */ + if (print_fatal_signals) { + pr_info("RT Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } + if (psecs >= soft) { + /* + * At the soft limit, send a SIGXCPU every second. + */ + if (print_fatal_signals) { + pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + if (soft < hard) { + soft++; + sig->rlim[RLIMIT_CPU].rlim_cur = soft; + } + } + x = soft * NSEC_PER_SEC; + if (!prof_expires || x < prof_expires) + prof_expires = x; + } + + sig->cputime_expires.prof_exp = prof_expires; + sig->cputime_expires.virt_exp = virt_expires; + sig->cputime_expires.sched_exp = sched_expires; + if (task_cputime_zero(&sig->cputime_expires)) + stop_process_timers(sig); + + sig->cputimer.checking_timer = false; +} + +/* + * This is called from the signal code (via posixtimer_rearm) + * when the last timer signal was delivered and we have to reload the timer. + */ +static void posix_cpu_timer_rearm(struct k_itimer *timer) +{ + struct task_struct *p = timer->it.cpu.task; + struct sighand_struct *sighand; + unsigned long flags; + u64 now; + + if (WARN_ON_ONCE(!p)) + return; + + /* + * Fetch the current sample and update the timer's expiry time. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &now); + bump_cpu_timer(timer, now); + if (unlikely(p->exit_state)) + return; + + /* Protect timer list r/w in arm_timer() */ + sighand = lock_task_sighand(p, &flags); + if (!sighand) + return; + } else { + /* + * Protect arm_timer() and timer sampling in case of call to + * thread_group_cputime(). + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { + /* + * The process has been reaped. + * We can't even collect a sample any more. + */ + timer->it.cpu.expires = 0; + return; + } else if (unlikely(p->exit_state) && thread_group_empty(p)) { + /* If the process is dying, no need to rearm */ + goto unlock; + } + cpu_timer_sample_group(timer->it_clock, p, &now); + bump_cpu_timer(timer, now); + /* Leave the sighand locked for the call below. */ + } + + /* + * Now re-arm for the new expiry time. + */ + arm_timer(timer); +unlock: + unlock_task_sighand(p, &flags); +} + +/** + * task_cputime_expired - Compare two task_cputime entities. + * + * @sample: The task_cputime structure to be checked for expiration. + * @expires: Expiration times, against which @sample will be checked. + * + * Checks @sample against @expires to see if any field of @sample has expired. + * Returns true if any field of the former is greater than the corresponding + * field of the latter if the latter field is set. Otherwise returns false. + */ +static inline int task_cputime_expired(const struct task_cputime *sample, + const struct task_cputime *expires) +{ + if (expires->utime && sample->utime >= expires->utime) + return 1; + if (expires->stime && sample->utime + sample->stime >= expires->stime) + return 1; + if (expires->sum_exec_runtime != 0 && + sample->sum_exec_runtime >= expires->sum_exec_runtime) + return 1; + return 0; +} + +/** + * fastpath_timer_check - POSIX CPU timers fast path. + * + * @tsk: The task (thread) being checked. + * + * Check the task and thread group timers. If both are zero (there are no + * timers set) return false. Otherwise snapshot the task and thread group + * timers and compare them with the corresponding expiration times. Return + * true if a timer has expired, else return false. + */ +static inline int fastpath_timer_check(struct task_struct *tsk) +{ + struct signal_struct *sig; + + if (!task_cputime_zero(&tsk->cputime_expires)) { + struct task_cputime task_sample; + + task_cputime(tsk, &task_sample.utime, &task_sample.stime); + task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + return 1; + } + + sig = tsk->signal; + /* + * Check if thread group timers expired when the cputimer is + * running and no other thread in the group is already checking + * for thread group cputimers. These fields are read without the + * sighand lock. However, this is fine because this is meant to + * be a fastpath heuristic to determine whether we should try to + * acquire the sighand lock to check/handle timers. + * + * In the worst case scenario, if 'running' or 'checking_timer' gets + * set but the current thread doesn't see the change yet, we'll wait + * until the next thread in the group gets a scheduler interrupt to + * handle the timer. This isn't an issue in practice because these + * types of delays with signals actually getting sent are expected. + */ + if (READ_ONCE(sig->cputimer.running) && + !READ_ONCE(sig->cputimer.checking_timer)) { + struct task_cputime group_sample; + + sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); + + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) + return 1; + } + + if (dl_task(tsk) && tsk->dl.dl_overrun) + return 1; + + return 0; +} + +/* + * This is called from the timer interrupt handler. The irq handler has + * already updated our counts. We need to check if any timers fire now. + * Interrupts are disabled. + */ +void run_posix_cpu_timers(struct task_struct *tsk) +{ + LIST_HEAD(firing); + struct k_itimer *timer, *next; + unsigned long flags; + + lockdep_assert_irqs_disabled(); + + /* + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. + */ + if (!fastpath_timer_check(tsk)) + return; + + if (!lock_task_sighand(tsk, &flags)) + return; + /* + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. + */ + check_thread_timers(tsk, &firing); + + check_process_timers(tsk, &firing); + + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + unlock_task_sighand(tsk, &flags); + + /* + * Now that all the timers on our list have the firing flag, + * no one will touch their list entries but us. We'll take + * each timer's lock before clearing its firing flag, so no + * timer call will interfere. + */ + list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { + int cpu_firing; + + spin_lock(&timer->it_lock); + list_del_init(&timer->it.cpu.entry); + cpu_firing = timer->it.cpu.firing; + timer->it.cpu.firing = 0; + /* + * The firing flag is -1 if we collided with a reset + * of the timer, which already reported this + * almost-firing as an overrun. So don't generate an event. + */ + if (likely(cpu_firing >= 0)) + cpu_timer_fire(timer); + spin_unlock(&timer->it_lock); + } +} + +/* + * Set one of the process-wide special case CPU timers or RLIMIT_CPU. + * The tsk->sighand->siglock must be held by the caller. + */ +void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, + u64 *newval, u64 *oldval) +{ + u64 now; + int ret; + + if (WARN_ON_ONCE(clock_idx >= CPUCLOCK_SCHED)) + return; + + ret = cpu_timer_sample_group(clock_idx, tsk, &now); + + if (oldval && ret != -EINVAL) { + /* + * We are setting itimer. The *oldval is absolute and we update + * it to be relative, *newval argument is relative and we update + * it to be absolute. + */ + if (*oldval) { + if (*oldval <= now) { + /* Just about to fire. */ + *oldval = TICK_NSEC; + } else { + *oldval -= now; + } + } + + if (!*newval) + return; + *newval += now; + } + + /* + * Update expiration cache if we are the earliest timer, or eventually + * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. + */ + switch (clock_idx) { + case CPUCLOCK_PROF: + if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) + tsk->signal->cputime_expires.prof_exp = *newval; + break; + case CPUCLOCK_VIRT: + if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) + tsk->signal->cputime_expires.virt_exp = *newval; + break; + } + + tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); +} + +static int do_cpu_nanosleep(const clockid_t which_clock, int flags, + const struct timespec64 *rqtp) +{ + struct itimerspec64 it; + struct k_itimer timer; + u64 expires; + int error; + + /* + * Set up a temporary timer and then wait for it to go off. + */ + memset(&timer, 0, sizeof timer); + spin_lock_init(&timer.it_lock); + timer.it_clock = which_clock; + timer.it_overrun = -1; + error = posix_cpu_timer_create(&timer); + timer.it_process = current; + if (!error) { + static struct itimerspec64 zero_it; + struct restart_block *restart; + + memset(&it, 0, sizeof(it)); + it.it_value = *rqtp; + + spin_lock_irq(&timer.it_lock); + error = posix_cpu_timer_set(&timer, flags, &it, NULL); + if (error) { + spin_unlock_irq(&timer.it_lock); + return error; + } + + while (!signal_pending(current)) { + if (timer.it.cpu.expires == 0) { + /* + * Our timer fired and was reset, below + * deletion can not fail. + */ + posix_cpu_timer_del(&timer); + spin_unlock_irq(&timer.it_lock); + return 0; + } + + /* + * Block until cpu_timer_fire (or a signal) wakes us. + */ + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&timer.it_lock); + schedule(); + spin_lock_irq(&timer.it_lock); + } + + /* + * We were interrupted by a signal. + */ + expires = timer.it.cpu.expires; + error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); + if (!error) { + /* + * Timer is now unarmed, deletion can not fail. + */ + posix_cpu_timer_del(&timer); + } + spin_unlock_irq(&timer.it_lock); + + while (error == TIMER_RETRY) { + /* + * We need to handle case when timer was or is in the + * middle of firing. In other cases we already freed + * resources. + */ + spin_lock_irq(&timer.it_lock); + error = posix_cpu_timer_del(&timer); + spin_unlock_irq(&timer.it_lock); + } + + if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { + /* + * It actually did fire already. + */ + return 0; + } + + error = -ERESTART_RESTARTBLOCK; + /* + * Report back to the user the time still remaining. + */ + restart = ¤t->restart_block; + restart->nanosleep.expires = expires; + if (restart->nanosleep.type != TT_NONE) + error = nanosleep_copyout(restart, &it.it_value); + } + + return error; +} + +static long posix_cpu_nsleep_restart(struct restart_block *restart_block); + +static int posix_cpu_nsleep(const clockid_t which_clock, int flags, + const struct timespec64 *rqtp) +{ + struct restart_block *restart_block = ¤t->restart_block; + int error; + + /* + * Diagnose required errors first. + */ + if (CPUCLOCK_PERTHREAD(which_clock) && + (CPUCLOCK_PID(which_clock) == 0 || + CPUCLOCK_PID(which_clock) == task_pid_vnr(current))) + return -EINVAL; + + error = do_cpu_nanosleep(which_clock, flags, rqtp); + + if (error == -ERESTART_RESTARTBLOCK) { + + if (flags & TIMER_ABSTIME) + return -ERESTARTNOHAND; + + restart_block->nanosleep.clockid = which_clock; + set_restart_fn(restart_block, posix_cpu_nsleep_restart); + } + return error; +} + +static long posix_cpu_nsleep_restart(struct restart_block *restart_block) +{ + clockid_t which_clock = restart_block->nanosleep.clockid; + struct timespec64 t; + + t = ns_to_timespec64(restart_block->nanosleep.expires); + + return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); +} + +#define PROCESS_CLOCK make_process_cpuclock(0, CPUCLOCK_SCHED) +#define THREAD_CLOCK make_thread_cpuclock(0, CPUCLOCK_SCHED) + +static int process_cpu_clock_getres(const clockid_t which_clock, + struct timespec64 *tp) +{ + return posix_cpu_clock_getres(PROCESS_CLOCK, tp); +} +static int process_cpu_clock_get(const clockid_t which_clock, + struct timespec64 *tp) +{ + return posix_cpu_clock_get(PROCESS_CLOCK, tp); +} +static int process_cpu_timer_create(struct k_itimer *timer) +{ + timer->it_clock = PROCESS_CLOCK; + return posix_cpu_timer_create(timer); +} +static int process_cpu_nsleep(const clockid_t which_clock, int flags, + const struct timespec64 *rqtp) +{ + return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); +} +static int thread_cpu_clock_getres(const clockid_t which_clock, + struct timespec64 *tp) +{ + return posix_cpu_clock_getres(THREAD_CLOCK, tp); +} +static int thread_cpu_clock_get(const clockid_t which_clock, + struct timespec64 *tp) +{ + return posix_cpu_clock_get(THREAD_CLOCK, tp); +} +static int thread_cpu_timer_create(struct k_itimer *timer) +{ + timer->it_clock = THREAD_CLOCK; + return posix_cpu_timer_create(timer); +} + +const struct k_clock clock_posix_cpu = { + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, + .timer_rearm = posix_cpu_timer_rearm, +}; + +const struct k_clock clock_process = { + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, +}; + +const struct k_clock clock_thread = { + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, +}; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c new file mode 100644 index 000000000..2c6847d5d --- /dev/null +++ b/kernel/time/posix-stubs.c @@ -0,0 +1,239 @@ +/* + * Dummy stubs used when CONFIG_POSIX_TIMERS=n + * + * Created by: Nicolas Pitre, July 2016 + * Copyright: (C) 2016 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/syscalls.h> +#include <linux/ktime.h> +#include <linux/timekeeping.h> +#include <linux/posix-timers.h> +#include <linux/compat.h> + +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +/* Architectures may override SYS_NI and COMPAT_SYS_NI */ +#include <asm/syscall_wrapper.h> +#endif + +asmlinkage long sys_ni_posix_timers(void) +{ + pr_err_once("process %d (%s) attempted a POSIX timer syscall " + "while CONFIG_POSIX_TIMERS is not set\n", + current->pid, current->comm); + return -ENOSYS; +} + +#ifndef SYS_NI +#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) +#endif + +#ifndef COMPAT_SYS_NI +#define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers) +#endif + +SYS_NI(timer_create); +SYS_NI(timer_gettime); +SYS_NI(timer_getoverrun); +SYS_NI(timer_settime); +SYS_NI(timer_delete); +SYS_NI(clock_adjtime); +SYS_NI(getitimer); +SYS_NI(setitimer); +#ifdef __ARCH_WANT_SYS_ALARM +SYS_NI(alarm); +#endif + +/* + * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC + * as it is easy to remain compatible with little code. CLOCK_BOOTTIME + * is also included for convenience as at least systemd uses it. + */ + +SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + const struct __kernel_timespec __user *, tp) +{ + struct timespec64 new_tp; + + if (which_clock != CLOCK_REALTIME) + return -EINVAL; + if (get_timespec64(&new_tp, tp)) + return -EFAULT; + + return do_sys_settimeofday64(&new_tp, NULL); +} + +int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) +{ + switch (which_clock) { + case CLOCK_REALTIME: + ktime_get_real_ts64(tp); + break; + case CLOCK_MONOTONIC: + ktime_get_ts64(tp); + break; + case CLOCK_BOOTTIME: + ktime_get_boottime_ts64(tp); + break; + default: + return -EINVAL; + } + + return 0; +} +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct __kernel_timespec __user *, tp) +{ + int ret; + struct timespec64 kernel_tp; + + ret = do_clock_gettime(which_clock, &kernel_tp); + if (ret) + return ret; + + if (put_timespec64(&kernel_tp, tp)) + return -EFAULT; + return 0; +} + +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) +{ + struct timespec64 rtn_tp = { + .tv_sec = 0, + .tv_nsec = hrtimer_resolution, + }; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + if (put_timespec64(&rtn_tp, tp)) + return -EFAULT; + return 0; + default: + return -EINVAL; + } +} + +SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, + const struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) +{ + struct timespec64 t; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + break; + default: + return -EINVAL; + } + + if (get_timespec64(&t, rqtp)) + return -EFAULT; + if (!timespec64_valid(&t)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; + current->restart_block.nanosleep.rmtp = rmtp; + return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYS_NI(timer_create); +COMPAT_SYS_NI(clock_adjtime); +COMPAT_SYS_NI(timer_settime); +COMPAT_SYS_NI(timer_gettime); +COMPAT_SYS_NI(getitimer); +COMPAT_SYS_NI(setitimer); +#endif + +#ifdef CONFIG_COMPAT_32BIT_TIME +COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + struct timespec64 new_tp; + + if (which_clock != CLOCK_REALTIME) + return -EINVAL; + if (compat_get_timespec64(&new_tp, tp)) + return -EFAULT; + + return do_sys_settimeofday64(&new_tp, NULL); +} + +COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + int ret; + struct timespec64 kernel_tp; + + ret = do_clock_gettime(which_clock, &kernel_tp); + if (ret) + return ret; + + if (compat_put_timespec64(&kernel_tp, tp)) + return -EFAULT; + return 0; +} + +COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + struct timespec64 rtn_tp = { + .tv_sec = 0, + .tv_nsec = hrtimer_resolution, + }; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + if (compat_put_timespec64(&rtn_tp, tp)) + return -EFAULT; + return 0; + default: + return -EINVAL; + } +} + +COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, + struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) +{ + struct timespec64 t; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + break; + default: + return -EINVAL; + } + + if (compat_get_timespec64(&t, rqtp)) + return -EFAULT; + if (!timespec64_valid(&t)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.nanosleep.compat_rmtp = rmtp; + return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); +} +#endif diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c new file mode 100644 index 000000000..48758108e --- /dev/null +++ b/kernel/time/posix-timers.c @@ -0,0 +1,1367 @@ +/* + * linux/kernel/posix-timers.c + * + * + * 2002-10-15 Posix Clocks & timers + * by George Anzinger george@mvista.com + * + * Copyright (C) 2002 2003 by MontaVista Software. + * + * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug. + * Copyright (C) 2004 Boris Hu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA + */ + +/* These are all the functions necessary to implement + * POSIX clocks & timers + */ +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/mutex.h> +#include <linux/sched/task.h> + +#include <linux/uaccess.h> +#include <linux/list.h> +#include <linux/init.h> +#include <linux/compiler.h> +#include <linux/hash.h> +#include <linux/posix-clock.h> +#include <linux/posix-timers.h> +#include <linux/syscalls.h> +#include <linux/wait.h> +#include <linux/workqueue.h> +#include <linux/export.h> +#include <linux/hashtable.h> +#include <linux/compat.h> +#include <linux/nospec.h> + +#include "timekeeping.h" +#include "posix-timers.h" + +/* + * Management arrays for POSIX timers. Timers are now kept in static hash table + * with 512 entries. + * Timer ids are allocated by local routine, which selects proper hash head by + * key, constructed from current->signal address and per signal struct counter. + * This keeps timer ids unique per process, but now they can intersect between + * processes. + */ + +/* + * Lets keep our timers in a slab cache :-) + */ +static struct kmem_cache *posix_timers_cache; + +static DEFINE_HASHTABLE(posix_timers_hashtable, 9); +static DEFINE_SPINLOCK(hash_lock); + +static const struct k_clock * const posix_clocks[]; +static const struct k_clock *clockid_to_kclock(const clockid_t id); +static const struct k_clock clock_realtime, clock_monotonic; + +/* + * we assume that the new SIGEV_THREAD_ID shares no bits with the other + * SIGEV values. Here we put out an error if this assumption fails. + */ +#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ + ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) +#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" +#endif + +/* + * The timer ID is turned into a timer address by idr_find(). + * Verifying a valid ID consists of: + * + * a) checking that idr_find() returns other than -1. + * b) checking that the timer id matches the one in the timer itself. + * c) that the timer owner is in the callers thread group. + */ + +/* + * CLOCKs: The POSIX standard calls for a couple of clocks and allows us + * to implement others. This structure defines the various + * clocks. + * + * RESOLUTION: Clock resolution is used to round up timer and interval + * times, NOT to report clock times, which are reported with as + * much resolution as the system can muster. In some cases this + * resolution may depend on the underlying clock hardware and + * may not be quantifiable until run time, and only then is the + * necessary code is written. The standard says we should say + * something about this issue in the documentation... + * + * FUNCTIONS: The CLOCKs structure defines possible functions to + * handle various clock functions. + * + * The standard POSIX timer management code assumes the + * following: 1.) The k_itimer struct (sched.h) is used for + * the timer. 2.) The list, it_lock, it_clock, it_id and + * it_pid fields are not modified by timer code. + * + * Permissions: It is assumed that the clock_settime() function defined + * for each clock will take care of permission checks. Some + * clocks may be set able by any user (i.e. local process + * clocks) others not. Currently the only set able clock we + * have is CLOCK_REALTIME and its high res counter part, both of + * which we beg off on and pass to do_sys_settimeofday(). + */ +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); + +#define lock_timer(tid, flags) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ + __timr; \ +}) + +static int hash(struct signal_struct *sig, unsigned int nr) +{ + return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); +} + +static struct k_itimer *__posix_timers_find(struct hlist_head *head, + struct signal_struct *sig, + timer_t id) +{ + struct k_itimer *timer; + + hlist_for_each_entry_rcu(timer, head, t_hash) { + if ((timer->it_signal == sig) && (timer->it_id == id)) + return timer; + } + return NULL; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + + return __posix_timers_find(head, sig, id); +} + +static int posix_timer_add(struct k_itimer *timer) +{ + struct signal_struct *sig = current->signal; + int first_free_id = sig->posix_timer_id; + struct hlist_head *head; + int ret = -ENOENT; + + do { + spin_lock(&hash_lock); + head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; + if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { + hlist_add_head_rcu(&timer->t_hash, head); + ret = sig->posix_timer_id; + } + if (++sig->posix_timer_id < 0) + sig->posix_timer_id = 0; + if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) + /* Loop over all possible ids completed */ + ret = -EAGAIN; + spin_unlock(&hash_lock); + } while (ret == -ENOENT); + return ret; +} + +static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) +{ + spin_unlock_irqrestore(&timr->it_lock, flags); +} + +/* Get clock_realtime */ +static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp) +{ + ktime_get_real_ts64(tp); + return 0; +} + +/* Set clock_realtime */ +static int posix_clock_realtime_set(const clockid_t which_clock, + const struct timespec64 *tp) +{ + return do_sys_settimeofday64(tp, NULL); +} + +static int posix_clock_realtime_adj(const clockid_t which_clock, + struct timex *t) +{ + return do_adjtimex(t); +} + +/* + * Get monotonic time for posix timers + */ +static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp) +{ + ktime_get_ts64(tp); + return 0; +} + +/* + * Get monotonic-raw time for posix timers + */ +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) +{ + ktime_get_raw_ts64(tp); + return 0; +} + + +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp) +{ + ktime_get_coarse_real_ts64(tp); + return 0; +} + +static int posix_get_monotonic_coarse(clockid_t which_clock, + struct timespec64 *tp) +{ + ktime_get_coarse_ts64(tp); + return 0; +} + +static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp) +{ + *tp = ktime_to_timespec64(KTIME_LOW_RES); + return 0; +} + +static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) +{ + ktime_get_boottime_ts64(tp); + return 0; +} + +static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) +{ + ktime_get_clocktai_ts64(tp); + return 0; +} + +static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) +{ + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; + return 0; +} + +/* + * Initialize everything, well, just everything in Posix clocks/timers ;) + */ +static __init int init_posix_timers(void) +{ + posix_timers_cache = kmem_cache_create("posix_timers_cache", + sizeof (struct k_itimer), 0, SLAB_PANIC, + NULL); + return 0; +} +__initcall(init_posix_timers); + +/* + * The siginfo si_overrun field and the return value of timer_getoverrun(2) + * are of type int. Clamp the overrun value to INT_MAX + */ +static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval) +{ + s64 sum = timr->it_overrun_last + (s64)baseval; + + return sum > (s64)INT_MAX ? INT_MAX : (int)sum; +} + +static void common_hrtimer_rearm(struct k_itimer *timr) +{ + struct hrtimer *timer = &timr->it.real.timer; + + timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), + timr->it_interval); + hrtimer_restart(timer); +} + +/* + * This function is exported for use by the signal deliver code. It is + * called just prior to the info block being released and passes that + * block to us. It's function is to update the overrun entry AND to + * restart the timer. It should only be called if the timer is to be + * restarted (i.e. we have flagged this in the sys_private entry of the + * info block). + * + * To protect against the timer going away while the interrupt is queued, + * we require that the it_requeue_pending flag be set. + */ +void posixtimer_rearm(struct siginfo *info) +{ + struct k_itimer *timr; + unsigned long flags; + + timr = lock_timer(info->si_tid, &flags); + if (!timr) + return; + + if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) { + timr->kclock->timer_rearm(timr); + + timr->it_active = 1; + timr->it_overrun_last = timr->it_overrun; + timr->it_overrun = -1LL; + ++timr->it_requeue_pending; + + info->si_overrun = timer_overrun_to_int(timr, info->si_overrun); + } + + unlock_timer(timr, flags); +} + +int posix_timer_event(struct k_itimer *timr, int si_private) +{ + enum pid_type type; + int ret = -1; + /* + * FIXME: if ->sigq is queued we can race with + * dequeue_signal()->posixtimer_rearm(). + * + * If dequeue_signal() sees the "right" value of + * si_sys_private it calls posixtimer_rearm(). + * We re-queue ->sigq and drop ->it_lock(). + * posixtimer_rearm() locks the timer + * and re-schedules it while ->sigq is pending. + * Not really bad, but not that we want. + */ + timr->sigq->info.si_sys_private = si_private; + + type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; + ret = send_sigqueue(timr->sigq, timr->it_pid, type); + /* If we failed to send the signal the timer stops. */ + return ret > 0; +} + +/* + * This function gets called when a POSIX.1b interval timer expires. It + * is used as a callback from the kernel internal timer. The + * run_timer_list code ALWAYS calls with interrupts on. + + * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. + */ +static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) +{ + struct k_itimer *timr; + unsigned long flags; + int si_private = 0; + enum hrtimer_restart ret = HRTIMER_NORESTART; + + timr = container_of(timer, struct k_itimer, it.real.timer); + spin_lock_irqsave(&timr->it_lock, flags); + + timr->it_active = 0; + if (timr->it_interval != 0) + si_private = ++timr->it_requeue_pending; + + if (posix_timer_event(timr, si_private)) { + /* + * signal was not sent because of sig_ignor + * we will not get a call back to restart it AND + * it should be restarted. + */ + if (timr->it_interval != 0) { + ktime_t now = hrtimer_cb_get_time(timer); + + /* + * FIXME: What we really want, is to stop this + * timer completely and restart it in case the + * SIG_IGN is removed. This is a non trivial + * change which involves sighand locking + * (sigh !), which we don't want to do late in + * the release cycle. + * + * For now we just let timers with an interval + * less than a jiffie expire every jiffie to + * avoid softirq starvation in case of SIG_IGN + * and a very small interval, which would put + * the timer right back on the softirq pending + * list. By moving now ahead of time we trick + * hrtimer_forward() to expire the timer + * later, while we still maintain the overrun + * accuracy, but have some inconsistency in + * the timer_gettime() case. This is at least + * better than a starved softirq. A more + * complex fix which solves also another related + * inconsistency is already in the pipeline. + */ +#ifdef CONFIG_HIGH_RES_TIMERS + { + ktime_t kj = NSEC_PER_SEC / HZ; + + if (timr->it_interval < kj) + now = ktime_add(now, kj); + } +#endif + timr->it_overrun += hrtimer_forward(timer, now, + timr->it_interval); + ret = HRTIMER_RESTART; + ++timr->it_requeue_pending; + timr->it_active = 1; + } + } + + unlock_timer(timr, flags); + return ret; +} + +static struct pid *good_sigevent(sigevent_t * event) +{ + struct pid *pid = task_tgid(current); + struct task_struct *rtn; + + switch (event->sigev_notify) { + case SIGEV_SIGNAL | SIGEV_THREAD_ID: + pid = find_vpid(event->sigev_notify_thread_id); + rtn = pid_task(pid, PIDTYPE_PID); + if (!rtn || !same_thread_group(rtn, current)) + return NULL; + /* FALLTHRU */ + case SIGEV_SIGNAL: + case SIGEV_THREAD: + if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) + return NULL; + /* FALLTHRU */ + case SIGEV_NONE: + return pid; + default: + return NULL; + } +} + +static struct k_itimer * alloc_posix_timer(void) +{ + struct k_itimer *tmr; + tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + if (!tmr) + return tmr; + if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { + kmem_cache_free(posix_timers_cache, tmr); + return NULL; + } + clear_siginfo(&tmr->sigq->info); + return tmr; +} + +static void k_itimer_rcu_free(struct rcu_head *head) +{ + struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); + + kmem_cache_free(posix_timers_cache, tmr); +} + +#define IT_ID_SET 1 +#define IT_ID_NOT_SET 0 +static void release_posix_timer(struct k_itimer *tmr, int it_id_set) +{ + if (it_id_set) { + unsigned long flags; + spin_lock_irqsave(&hash_lock, flags); + hlist_del_rcu(&tmr->t_hash); + spin_unlock_irqrestore(&hash_lock, flags); + } + put_pid(tmr->it_pid); + sigqueue_free(tmr->sigq); + call_rcu(&tmr->it.rcu, k_itimer_rcu_free); +} + +static int common_timer_create(struct k_itimer *new_timer) +{ + hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + return 0; +} + +/* Create a POSIX.1b interval timer. */ +static int do_timer_create(clockid_t which_clock, struct sigevent *event, + timer_t __user *created_timer_id) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct k_itimer *new_timer; + int error, new_timer_id; + int it_id_set = IT_ID_NOT_SET; + + if (!kc) + return -EINVAL; + if (!kc->timer_create) + return -EOPNOTSUPP; + + new_timer = alloc_posix_timer(); + if (unlikely(!new_timer)) + return -EAGAIN; + + spin_lock_init(&new_timer->it_lock); + new_timer_id = posix_timer_add(new_timer); + if (new_timer_id < 0) { + error = new_timer_id; + goto out; + } + + it_id_set = IT_ID_SET; + new_timer->it_id = (timer_t) new_timer_id; + new_timer->it_clock = which_clock; + new_timer->kclock = kc; + new_timer->it_overrun = -1LL; + + if (event) { + rcu_read_lock(); + new_timer->it_pid = get_pid(good_sigevent(event)); + rcu_read_unlock(); + if (!new_timer->it_pid) { + error = -EINVAL; + goto out; + } + new_timer->it_sigev_notify = event->sigev_notify; + new_timer->sigq->info.si_signo = event->sigev_signo; + new_timer->sigq->info.si_value = event->sigev_value; + } else { + new_timer->it_sigev_notify = SIGEV_SIGNAL; + new_timer->sigq->info.si_signo = SIGALRM; + memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t)); + new_timer->sigq->info.si_value.sival_int = new_timer->it_id; + new_timer->it_pid = get_pid(task_tgid(current)); + } + + new_timer->sigq->info.si_tid = new_timer->it_id; + new_timer->sigq->info.si_code = SI_TIMER; + + if (copy_to_user(created_timer_id, + &new_timer_id, sizeof (new_timer_id))) { + error = -EFAULT; + goto out; + } + + error = kc->timer_create(new_timer); + if (error) + goto out; + + spin_lock_irq(¤t->sighand->siglock); + new_timer->it_signal = current->signal; + list_add(&new_timer->list, ¤t->signal->posix_timers); + spin_unlock_irq(¤t->sighand->siglock); + + return 0; + /* + * In the case of the timer belonging to another task, after + * the task is unlocked, the timer is owned by the other task + * and may cease to exist at any time. Don't use or modify + * new_timer after the unlock call. + */ +out: + release_posix_timer(new_timer, it_id_set); + return error; +} + +SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, + struct sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) +{ + if (timer_event_spec) { + sigevent_t event; + + if (copy_from_user(&event, timer_event_spec, sizeof (event))) + return -EFAULT; + return do_timer_create(which_clock, &event, created_timer_id); + } + return do_timer_create(which_clock, NULL, created_timer_id); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, + struct compat_sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) +{ + if (timer_event_spec) { + sigevent_t event; + + if (get_compat_sigevent(&event, timer_event_spec)) + return -EFAULT; + return do_timer_create(which_clock, &event, created_timer_id); + } + return do_timer_create(which_clock, NULL, created_timer_id); +} +#endif + +/* + * Locking issues: We need to protect the result of the id look up until + * we get the timer locked down so it is not deleted under us. The + * removal is done under the idr spinlock so we use that here to bridge + * the find to the timer lock. To avoid a dead lock, the timer id MUST + * be release with out holding the timer lock. + */ +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) +{ + struct k_itimer *timr; + + /* + * timer_t could be any type >= int and we want to make sure any + * @timer_id outside positive int range fails lookup. + */ + if ((unsigned long long)timer_id > INT_MAX) + return NULL; + + rcu_read_lock(); + timr = posix_timer_by_id(timer_id); + if (timr) { + spin_lock_irqsave(&timr->it_lock, *flags); + if (timr->it_signal == current->signal) { + rcu_read_unlock(); + return timr; + } + spin_unlock_irqrestore(&timr->it_lock, *flags); + } + rcu_read_unlock(); + + return NULL; +} + +static ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now) +{ + struct hrtimer *timer = &timr->it.real.timer; + + return __hrtimer_expires_remaining_adjusted(timer, now); +} + +static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now) +{ + struct hrtimer *timer = &timr->it.real.timer; + + return hrtimer_forward(timer, now, timr->it_interval); +} + +/* + * Get the time remaining on a POSIX.1b interval timer. This function + * is ALWAYS called with spin_lock_irq on the timer, thus it must not + * mess with irq. + * + * We have a couple of messes to clean up here. First there is the case + * of a timer that has a requeue pending. These timers should appear to + * be in the timer list with an expiry as if we were to requeue them + * now. + * + * The second issue is the SIGEV_NONE timer which may be active but is + * not really ever put in the timer list (to save system resources). + * This timer may be expired, and if so, we will do it here. Otherwise + * it is the same as a requeue pending timer WRT to what we should + * report. + */ +void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) +{ + const struct k_clock *kc = timr->kclock; + ktime_t now, remaining, iv; + struct timespec64 ts64; + bool sig_none; + + sig_none = timr->it_sigev_notify == SIGEV_NONE; + iv = timr->it_interval; + + /* interval timer ? */ + if (iv) { + cur_setting->it_interval = ktime_to_timespec64(iv); + } else if (!timr->it_active) { + /* + * SIGEV_NONE oneshot timers are never queued. Check them + * below. + */ + if (!sig_none) + return; + } + + /* + * The timespec64 based conversion is suboptimal, but it's not + * worth to implement yet another callback. + */ + kc->clock_get(timr->it_clock, &ts64); + now = timespec64_to_ktime(ts64); + + /* + * When a requeue is pending or this is a SIGEV_NONE timer move the + * expiry time forward by intervals, so expiry is > now. + */ + if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) + timr->it_overrun += kc->timer_forward(timr, now); + + remaining = kc->timer_remaining(timr, now); + /* Return 0 only, when the timer is expired and not pending */ + if (remaining <= 0) { + /* + * A single shot SIGEV_NONE timer must return 0, when + * it is expired ! + */ + if (!sig_none) + cur_setting->it_value.tv_nsec = 1; + } else { + cur_setting->it_value = ktime_to_timespec64(remaining); + } +} + +/* Get the time remaining on a POSIX.1b interval timer. */ +static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) +{ + struct k_itimer *timr; + const struct k_clock *kc; + unsigned long flags; + int ret = 0; + + timr = lock_timer(timer_id, &flags); + if (!timr) + return -EINVAL; + + memset(setting, 0, sizeof(*setting)); + kc = timr->kclock; + if (WARN_ON_ONCE(!kc || !kc->timer_get)) + ret = -EINVAL; + else + kc->timer_get(timr, setting); + + unlock_timer(timr, flags); + return ret; +} + +/* Get the time remaining on a POSIX.1b interval timer. */ +SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct __kernel_itimerspec __user *, setting) +{ + struct itimerspec64 cur_setting; + + int ret = do_timer_gettime(timer_id, &cur_setting); + if (!ret) { + if (put_itimerspec64(&cur_setting, setting)) + ret = -EFAULT; + } + return ret; +} + +#ifdef CONFIG_COMPAT_32BIT_TIME + +COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct compat_itimerspec __user *, setting) +{ + struct itimerspec64 cur_setting; + + int ret = do_timer_gettime(timer_id, &cur_setting); + if (!ret) { + if (put_compat_itimerspec64(&cur_setting, setting)) + ret = -EFAULT; + } + return ret; +} + +#endif + +/* + * Get the number of overruns of a POSIX.1b interval timer. This is to + * be the overrun of the timer last delivered. At the same time we are + * accumulating overruns on the next timer. The overrun is frozen when + * the signal is delivered, either at the notify time (if the info block + * is not queued) or at the actual delivery time (as we are informed by + * the call back to posixtimer_rearm(). So all we need to do is + * to pick up the frozen overrun. + */ +SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) +{ + struct k_itimer *timr; + int overrun; + unsigned long flags; + + timr = lock_timer(timer_id, &flags); + if (!timr) + return -EINVAL; + + overrun = timer_overrun_to_int(timr, 0); + unlock_timer(timr, flags); + + return overrun; +} + +static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, + bool absolute, bool sigev_none) +{ + struct hrtimer *timer = &timr->it.real.timer; + enum hrtimer_mode mode; + + mode = absolute ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; + /* + * Posix magic: Relative CLOCK_REALTIME timers are not affected by + * clock modifications, so they become CLOCK_MONOTONIC based under the + * hood. See hrtimer_init(). Update timr->kclock, so the generic + * functions which use timr->kclock->clock_get() work. + * + * Note: it_clock stays unmodified, because the next timer_set() might + * use ABSTIME, so it needs to switch back. + */ + if (timr->it_clock == CLOCK_REALTIME) + timr->kclock = absolute ? &clock_realtime : &clock_monotonic; + + hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); + timr->it.real.timer.function = posix_timer_fn; + + if (!absolute) + expires = ktime_add_safe(expires, timer->base->get_time()); + hrtimer_set_expires(timer, expires); + + if (!sigev_none) + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); +} + +static int common_hrtimer_try_to_cancel(struct k_itimer *timr) +{ + return hrtimer_try_to_cancel(&timr->it.real.timer); +} + +/* Set a POSIX.1b interval timer. */ +int common_timer_set(struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting) +{ + const struct k_clock *kc = timr->kclock; + bool sigev_none; + ktime_t expires; + + if (old_setting) + common_timer_get(timr, old_setting); + + /* Prevent rearming by clearing the interval */ + timr->it_interval = 0; + /* + * Careful here. On SMP systems the timer expiry function could be + * active and spinning on timr->it_lock. + */ + if (kc->timer_try_to_cancel(timr) < 0) + return TIMER_RETRY; + + timr->it_active = 0; + timr->it_requeue_pending = (timr->it_requeue_pending + 2) & + ~REQUEUE_PENDING; + timr->it_overrun_last = 0; + + /* Switch off the timer when it_value is zero */ + if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) + return 0; + + timr->it_interval = timespec64_to_ktime(new_setting->it_interval); + expires = timespec64_to_ktime(new_setting->it_value); + sigev_none = timr->it_sigev_notify == SIGEV_NONE; + + kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); + timr->it_active = !sigev_none; + return 0; +} + +static int do_timer_settime(timer_t timer_id, int flags, + struct itimerspec64 *new_spec64, + struct itimerspec64 *old_spec64) +{ + const struct k_clock *kc; + struct k_itimer *timr; + unsigned long flag; + int error = 0; + + if (!timespec64_valid(&new_spec64->it_interval) || + !timespec64_valid(&new_spec64->it_value)) + return -EINVAL; + + if (old_spec64) + memset(old_spec64, 0, sizeof(*old_spec64)); +retry: + timr = lock_timer(timer_id, &flag); + if (!timr) + return -EINVAL; + + kc = timr->kclock; + if (WARN_ON_ONCE(!kc || !kc->timer_set)) + error = -EINVAL; + else + error = kc->timer_set(timr, flags, new_spec64, old_spec64); + + unlock_timer(timr, flag); + if (error == TIMER_RETRY) { + old_spec64 = NULL; // We already got the old time... + goto retry; + } + + return error; +} + +/* Set a POSIX.1b interval timer */ +SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + const struct __kernel_itimerspec __user *, new_setting, + struct __kernel_itimerspec __user *, old_setting) +{ + struct itimerspec64 new_spec, old_spec; + struct itimerspec64 *rtn = old_setting ? &old_spec : NULL; + int error = 0; + + if (!new_setting) + return -EINVAL; + + if (get_itimerspec64(&new_spec, new_setting)) + return -EFAULT; + + error = do_timer_settime(timer_id, flags, &new_spec, rtn); + if (!error && old_setting) { + if (put_itimerspec64(&old_spec, old_setting)) + error = -EFAULT; + } + return error; +} + +#ifdef CONFIG_COMPAT_32BIT_TIME +COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + struct compat_itimerspec __user *, new, + struct compat_itimerspec __user *, old) +{ + struct itimerspec64 new_spec, old_spec; + struct itimerspec64 *rtn = old ? &old_spec : NULL; + int error = 0; + + if (!new) + return -EINVAL; + if (get_compat_itimerspec64(&new_spec, new)) + return -EFAULT; + + error = do_timer_settime(timer_id, flags, &new_spec, rtn); + if (!error && old) { + if (put_compat_itimerspec64(&old_spec, old)) + error = -EFAULT; + } + return error; +} +#endif + +int common_timer_del(struct k_itimer *timer) +{ + const struct k_clock *kc = timer->kclock; + + timer->it_interval = 0; + if (kc->timer_try_to_cancel(timer) < 0) + return TIMER_RETRY; + timer->it_active = 0; + return 0; +} + +static inline int timer_delete_hook(struct k_itimer *timer) +{ + const struct k_clock *kc = timer->kclock; + + if (WARN_ON_ONCE(!kc || !kc->timer_del)) + return -EINVAL; + return kc->timer_del(timer); +} + +/* Delete a POSIX.1b interval timer. */ +SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) +{ + struct k_itimer *timer; + unsigned long flags; + +retry_delete: + timer = lock_timer(timer_id, &flags); + if (!timer) + return -EINVAL; + + if (timer_delete_hook(timer) == TIMER_RETRY) { + unlock_timer(timer, flags); + goto retry_delete; + } + + spin_lock(¤t->sighand->siglock); + list_del(&timer->list); + spin_unlock(¤t->sighand->siglock); + /* + * This keeps any tasks waiting on the spin lock from thinking + * they got something (see the lock code above). + */ + timer->it_signal = NULL; + + unlock_timer(timer, flags); + release_posix_timer(timer, IT_ID_SET); + return 0; +} + +/* + * return timer owned by the process, used by exit_itimers + */ +static void itimer_delete(struct k_itimer *timer) +{ + unsigned long flags; + +retry_delete: + spin_lock_irqsave(&timer->it_lock, flags); + + if (timer_delete_hook(timer) == TIMER_RETRY) { + unlock_timer(timer, flags); + goto retry_delete; + } + list_del(&timer->list); + /* + * This keeps any tasks waiting on the spin lock from thinking + * they got something (see the lock code above). + */ + timer->it_signal = NULL; + + unlock_timer(timer, flags); + release_posix_timer(timer, IT_ID_SET); +} + +/* + * This is called by do_exit or de_thread, only when there are no more + * references to the shared signal_struct. + */ +void exit_itimers(struct signal_struct *sig) +{ + struct k_itimer *tmr; + + while (!list_empty(&sig->posix_timers)) { + tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); + itimer_delete(tmr); + } +} + +SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + const struct __kernel_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 new_tp; + + if (!kc || !kc->clock_set) + return -EINVAL; + + if (get_timespec64(&new_tp, tp)) + return -EFAULT; + + return kc->clock_set(which_clock, &new_tp); +} + +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct __kernel_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 kernel_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_get(which_clock, &kernel_tp); + + if (!error && put_timespec64(&kernel_tp, tp)) + error = -EFAULT; + + return error; +} + +SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, + struct timex __user *, utx) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timex ktx; + int err; + + if (!kc) + return -EINVAL; + if (!kc->clock_adj) + return -EOPNOTSUPP; + + if (copy_from_user(&ktx, utx, sizeof(ktx))) + return -EFAULT; + + err = kc->clock_adj(which_clock, &ktx); + + if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx))) + return -EFAULT; + + return err; +} + +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, + struct __kernel_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 rtn_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_getres(which_clock, &rtn_tp); + + if (!error && tp && put_timespec64(&rtn_tp, tp)) + error = -EFAULT; + + return error; +} + +#ifdef CONFIG_COMPAT_32BIT_TIME + +COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 ts; + + if (!kc || !kc->clock_set) + return -EINVAL; + + if (compat_get_timespec64(&ts, tp)) + return -EFAULT; + + return kc->clock_set(which_clock, &ts); +} + +COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 ts; + int err; + + if (!kc) + return -EINVAL; + + err = kc->clock_get(which_clock, &ts); + + if (!err && compat_put_timespec64(&ts, tp)) + err = -EFAULT; + + return err; +} + +#endif + +#ifdef CONFIG_COMPAT + +COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, + struct compat_timex __user *, utp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timex ktx; + int err; + + if (!kc) + return -EINVAL; + if (!kc->clock_adj) + return -EOPNOTSUPP; + + err = compat_get_timex(&ktx, utp); + if (err) + return err; + + err = kc->clock_adj(which_clock, &ktx); + + if (err >= 0 && compat_put_timex(utp, &ktx)) + return -EFAULT; + + return err; +} + +#endif + +#ifdef CONFIG_COMPAT_32BIT_TIME + +COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 ts; + int err; + + if (!kc) + return -EINVAL; + + err = kc->clock_getres(which_clock, &ts); + if (!err && tp && compat_put_timespec64(&ts, tp)) + return -EFAULT; + + return err; +} + +#endif + +/* + * nanosleep for monotonic and realtime clocks + */ +static int common_nsleep(const clockid_t which_clock, int flags, + const struct timespec64 *rqtp) +{ + return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); +} + +SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, + const struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 t; + + if (!kc) + return -EINVAL; + if (!kc->nsleep) + return -EOPNOTSUPP; + + if (get_timespec64(&t, rqtp)) + return -EFAULT; + + if (!timespec64_valid(&t)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; + current->restart_block.nanosleep.rmtp = rmtp; + + return kc->nsleep(which_clock, flags, &t); +} + +#ifdef CONFIG_COMPAT_32BIT_TIME + +COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, + struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 t; + + if (!kc) + return -EINVAL; + if (!kc->nsleep) + return -EOPNOTSUPP; + + if (compat_get_timespec64(&t, rqtp)) + return -EFAULT; + + if (!timespec64_valid(&t)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.nanosleep.compat_rmtp = rmtp; + + return kc->nsleep(which_clock, flags, &t); +} + +#endif + +static const struct k_clock clock_realtime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_clock_realtime_get, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = common_nsleep, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, +}; + +static const struct k_clock clock_monotonic = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_ktime_get_ts, + .nsleep = common_nsleep, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, +}; + +static const struct k_clock clock_monotonic_raw = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_monotonic_raw, +}; + +static const struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, +}; + +static const struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, +}; + +static const struct k_clock clock_tai = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, +}; + +static const struct k_clock clock_boottime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, +}; + +static const struct k_clock * const posix_clocks[] = { + [CLOCK_REALTIME] = &clock_realtime, + [CLOCK_MONOTONIC] = &clock_monotonic, + [CLOCK_PROCESS_CPUTIME_ID] = &clock_process, + [CLOCK_THREAD_CPUTIME_ID] = &clock_thread, + [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, + [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, + [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, + [CLOCK_BOOTTIME] = &clock_boottime, + [CLOCK_REALTIME_ALARM] = &alarm_clock, + [CLOCK_BOOTTIME_ALARM] = &alarm_clock, + [CLOCK_TAI] = &clock_tai, +}; + +static const struct k_clock *clockid_to_kclock(const clockid_t id) +{ + clockid_t idx = id; + + if (id < 0) { + return (id & CLOCKFD_MASK) == CLOCKFD ? + &clock_posix_dynamic : &clock_posix_cpu; + } + + if (id >= ARRAY_SIZE(posix_clocks)) + return NULL; + + return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; +} diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h new file mode 100644 index 000000000..ddb211452 --- /dev/null +++ b/kernel/time/posix-timers.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#define TIMER_RETRY 1 + +struct k_clock { + int (*clock_getres)(const clockid_t which_clock, + struct timespec64 *tp); + int (*clock_set)(const clockid_t which_clock, + const struct timespec64 *tp); + int (*clock_get)(const clockid_t which_clock, + struct timespec64 *tp); + int (*clock_adj)(const clockid_t which_clock, struct timex *tx); + int (*timer_create)(struct k_itimer *timer); + int (*nsleep)(const clockid_t which_clock, int flags, + const struct timespec64 *); + int (*timer_set)(struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting); + int (*timer_del)(struct k_itimer *timr); + void (*timer_get)(struct k_itimer *timr, + struct itimerspec64 *cur_setting); + void (*timer_rearm)(struct k_itimer *timr); + s64 (*timer_forward)(struct k_itimer *timr, ktime_t now); + ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now); + int (*timer_try_to_cancel)(struct k_itimer *timr); + void (*timer_arm)(struct k_itimer *timr, ktime_t expires, + bool absolute, bool sigev_none); +}; + +extern const struct k_clock clock_posix_cpu; +extern const struct k_clock clock_posix_dynamic; +extern const struct k_clock clock_process; +extern const struct k_clock clock_thread; +extern const struct k_clock alarm_clock; + +int posix_timer_event(struct k_itimer *timr, int si_private); + +void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting); +int common_timer_set(struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting); +int common_timer_del(struct k_itimer *timer); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c new file mode 100644 index 000000000..78eb05aa8 --- /dev/null +++ b/kernel/time/sched_clock.c @@ -0,0 +1,309 @@ +/* + * sched_clock.c: Generic sched_clock() support, to extend low level + * hardware time counters to full 64-bit ns values. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/clocksource.h> +#include <linux/init.h> +#include <linux/jiffies.h> +#include <linux/ktime.h> +#include <linux/kernel.h> +#include <linux/moduleparam.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/syscore_ops.h> +#include <linux/hrtimer.h> +#include <linux/sched_clock.h> +#include <linux/seqlock.h> +#include <linux/bitops.h> + +/** + * struct clock_read_data - data required to read from sched_clock() + * + * @epoch_ns: sched_clock() value at last update + * @epoch_cyc: Clock cycle value at last update. + * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit + * clocks. + * @read_sched_clock: Current clock source (or dummy source when suspended). + * @mult: Multipler for scaled math conversion. + * @shift: Shift value for scaled math conversion. + * + * Care must be taken when updating this structure; it is read by + * some very hot code paths. It occupies <=40 bytes and, when combined + * with the seqcount used to synchronize access, comfortably fits into + * a 64 byte cache line. + */ +struct clock_read_data { + u64 epoch_ns; + u64 epoch_cyc; + u64 sched_clock_mask; + u64 (*read_sched_clock)(void); + u32 mult; + u32 shift; +}; + +/** + * struct clock_data - all data needed for sched_clock() (including + * registration of a new clock source) + * + * @seq: Sequence counter for protecting updates. The lowest + * bit is the index for @read_data. + * @read_data: Data required to read from sched_clock. + * @wrap_kt: Duration for which clock can run before wrapping. + * @rate: Tick rate of the registered clock. + * @actual_read_sched_clock: Registered hardware level clock read function. + * + * The ordering of this structure has been chosen to optimize cache + * performance. In particular 'seq' and 'read_data[0]' (combined) should fit + * into a single 64-byte cache line. + */ +struct clock_data { + seqcount_t seq; + struct clock_read_data read_data[2]; + ktime_t wrap_kt; + unsigned long rate; + + u64 (*actual_read_sched_clock)(void); +}; + +static struct hrtimer sched_clock_timer; +static int irqtime = -1; + +core_param(irqtime, irqtime, int, 0400); + +static u64 notrace jiffy_sched_clock_read(void) +{ + /* + * We don't need to use get_jiffies_64 on 32-bit arches here + * because we register with BITS_PER_LONG + */ + return (u64)(jiffies - INITIAL_JIFFIES); +} + +static struct clock_data cd ____cacheline_aligned = { + .read_data[0] = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, + .actual_read_sched_clock = jiffy_sched_clock_read, +}; + +static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) +{ + return (cyc * mult) >> shift; +} + +unsigned long long notrace sched_clock(void) +{ + u64 cyc, res; + unsigned long seq; + struct clock_read_data *rd; + + do { + seq = raw_read_seqcount(&cd.seq); + rd = cd.read_data + (seq & 1); + + cyc = (rd->read_sched_clock() - rd->epoch_cyc) & + rd->sched_clock_mask; + res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); + } while (read_seqcount_retry(&cd.seq, seq)); + + return res; +} + +/* + * Updating the data required to read the clock. + * + * sched_clock() will never observe mis-matched data even if called from + * an NMI. We do this by maintaining an odd/even copy of the data and + * steering sched_clock() to one or the other using a sequence counter. + * In order to preserve the data cache profile of sched_clock() as much + * as possible the system reverts back to the even copy when the update + * completes; the odd copy is used *only* during an update. + */ +static void update_clock_read_data(struct clock_read_data *rd) +{ + /* update the backup (odd) copy with the new data */ + cd.read_data[1] = *rd; + + /* steer readers towards the odd copy */ + raw_write_seqcount_latch(&cd.seq); + + /* now its safe for us to update the normal (even) copy */ + cd.read_data[0] = *rd; + + /* switch readers back to the even copy */ + raw_write_seqcount_latch(&cd.seq); +} + +/* + * Atomically update the sched_clock() epoch. + */ +static void update_sched_clock(void) +{ + u64 cyc; + u64 ns; + struct clock_read_data rd; + + rd = cd.read_data[0]; + + cyc = cd.actual_read_sched_clock(); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); + + rd.epoch_ns = ns; + rd.epoch_cyc = cyc; + + update_clock_read_data(&rd); +} + +static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) +{ + update_sched_clock(); + hrtimer_forward_now(hrt, cd.wrap_kt); + + return HRTIMER_RESTART; +} + +void __init +sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) +{ + u64 res, wrap, new_mask, new_epoch, cyc, ns; + u32 new_mult, new_shift; + unsigned long r; + char r_unit; + struct clock_read_data rd; + + if (cd.rate > rate) + return; + + WARN_ON(!irqs_disabled()); + + /* Calculate the mult/shift to convert counter ticks to ns. */ + clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); + + new_mask = CLOCKSOURCE_MASK(bits); + cd.rate = rate; + + /* Calculate how many nanosecs until we risk wrapping */ + wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); + cd.wrap_kt = ns_to_ktime(wrap); + + rd = cd.read_data[0]; + + /* Update epoch for new counter and update 'epoch_ns' from old counter*/ + new_epoch = read(); + cyc = cd.actual_read_sched_clock(); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); + cd.actual_read_sched_clock = read; + + rd.read_sched_clock = read; + rd.sched_clock_mask = new_mask; + rd.mult = new_mult; + rd.shift = new_shift; + rd.epoch_cyc = new_epoch; + rd.epoch_ns = ns; + + update_clock_read_data(&rd); + + if (sched_clock_timer.function != NULL) { + /* update timeout for clock wrap */ + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + } + + r = rate; + if (r >= 4000000) { + r /= 1000000; + r_unit = 'M'; + } else { + if (r >= 1000) { + r /= 1000; + r_unit = 'k'; + } else { + r_unit = ' '; + } + } + + /* Calculate the ns resolution of this counter */ + res = cyc_to_ns(1ULL, new_mult, new_shift); + + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", + bits, r, r_unit, res, wrap); + + /* Enable IRQ time accounting if we have a fast enough sched_clock() */ + if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) + enable_sched_clock_irqtime(); + + pr_debug("Registered %pF as sched_clock source\n", read); +} + +void __init generic_sched_clock_init(void) +{ + /* + * If no sched_clock() function has been provided at that point, + * make it the final one one. + */ + if (cd.actual_read_sched_clock == jiffy_sched_clock_read) + sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); + + update_sched_clock(); + + /* + * Start the timer to keep sched_clock() properly updated and + * sets the initial epoch. + */ + hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + sched_clock_timer.function = sched_clock_poll; + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); +} + +/* + * Clock read function for use when the clock is suspended. + * + * This function makes it appear to sched_clock() as if the clock + * stopped counting at its last update. + * + * This function must only be called from the critical + * section in sched_clock(). It relies on the read_seqcount_retry() + * at the end of the critical section to be sure we observe the + * correct copy of 'epoch_cyc'. + */ +static u64 notrace suspended_sched_clock_read(void) +{ + unsigned long seq = raw_read_seqcount(&cd.seq); + + return cd.read_data[seq & 1].epoch_cyc; +} + +int sched_clock_suspend(void) +{ + struct clock_read_data *rd = &cd.read_data[0]; + + update_sched_clock(); + hrtimer_cancel(&sched_clock_timer); + rd->read_sched_clock = suspended_sched_clock_read; + + return 0; +} + +void sched_clock_resume(void) +{ + struct clock_read_data *rd = &cd.read_data[0]; + + rd->epoch_cyc = cd.actual_read_sched_clock(); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + rd->read_sched_clock = cd.actual_read_sched_clock; +} + +static struct syscore_ops sched_clock_ops = { + .suspend = sched_clock_suspend, + .resume = sched_clock_resume, +}; + +static int __init sched_clock_syscore_init(void) +{ + register_syscore_ops(&sched_clock_ops); + + return 0; +} +device_initcall(sched_clock_syscore_init); diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c new file mode 100644 index 000000000..b0928ab32 --- /dev/null +++ b/kernel/time/test_udelay.c @@ -0,0 +1,168 @@ +/* + * udelay() test kernel module + * + * Test is executed by writing and reading to /sys/kernel/debug/udelay_test + * Tests are configured by writing: USECS ITERATIONS + * Tests are executed by reading from the same file. + * Specifying usecs of 0 or negative values will run multiples tests. + * + * Copyright (C) 2014 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/ktime.h> +#include <linux/module.h> +#include <linux/uaccess.h> + +#define DEFAULT_ITERATIONS 100 + +#define DEBUGFS_FILENAME "udelay_test" + +static DEFINE_MUTEX(udelay_test_lock); +static struct dentry *udelay_test_debugfs_file; +static int udelay_test_usecs; +static int udelay_test_iterations = DEFAULT_ITERATIONS; + +static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters) +{ + int min = 0, max = 0, fail_count = 0; + uint64_t sum = 0; + uint64_t avg; + int i; + /* Allow udelay to be up to 0.5% fast */ + int allowed_error_ns = usecs * 5; + + for (i = 0; i < iters; ++i) { + s64 kt1, kt2; + int time_passed; + + kt1 = ktime_get_ns(); + udelay(usecs); + kt2 = ktime_get_ns(); + time_passed = kt2 - kt1; + + if (i == 0 || time_passed < min) + min = time_passed; + if (i == 0 || time_passed > max) + max = time_passed; + if ((time_passed + allowed_error_ns) / 1000 < usecs) + ++fail_count; + WARN_ON(time_passed < 0); + sum += time_passed; + } + + avg = sum; + do_div(avg, iters); + seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d", + usecs, iters, usecs * 1000, + (usecs * 1000) - allowed_error_ns, min, avg, max); + if (fail_count) + seq_printf(s, " FAIL=%d", fail_count); + seq_puts(s, "\n"); + + return 0; +} + +static int udelay_test_show(struct seq_file *s, void *v) +{ + int usecs; + int iters; + int ret = 0; + + mutex_lock(&udelay_test_lock); + usecs = udelay_test_usecs; + iters = udelay_test_iterations; + mutex_unlock(&udelay_test_lock); + + if (usecs > 0 && iters > 0) { + return udelay_test_single(s, usecs, iters); + } else if (usecs == 0) { + struct timespec64 ts; + + ktime_get_ts64(&ts); + seq_printf(s, "udelay() test (lpj=%ld kt=%lld.%09ld)\n", + loops_per_jiffy, (s64)ts.tv_sec, ts.tv_nsec); + seq_puts(s, "usage:\n"); + seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); + seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); + } + + return ret; +} + +static int udelay_test_open(struct inode *inode, struct file *file) +{ + return single_open(file, udelay_test_show, inode->i_private); +} + +static ssize_t udelay_test_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + char lbuf[32]; + int ret; + int usecs; + int iters; + + if (count >= sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + lbuf[count] = '\0'; + + ret = sscanf(lbuf, "%d %d", &usecs, &iters); + if (ret < 1) + return -EINVAL; + else if (ret < 2) + iters = DEFAULT_ITERATIONS; + + mutex_lock(&udelay_test_lock); + udelay_test_usecs = usecs; + udelay_test_iterations = iters; + mutex_unlock(&udelay_test_lock); + + return count; +} + +static const struct file_operations udelay_test_debugfs_ops = { + .owner = THIS_MODULE, + .open = udelay_test_open, + .read = seq_read, + .write = udelay_test_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init udelay_test_init(void) +{ + mutex_lock(&udelay_test_lock); + udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME, + S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops); + mutex_unlock(&udelay_test_lock); + + return 0; +} + +module_init(udelay_test_init); + +static void __exit udelay_test_exit(void) +{ + mutex_lock(&udelay_test_lock); + debugfs_remove(udelay_test_debugfs_file); + mutex_unlock(&udelay_test_lock); +} + +module_exit(udelay_test_exit); + +MODULE_AUTHOR("David Riley <davidriley@chromium.org>"); +MODULE_LICENSE("GPL"); diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c new file mode 100644 index 000000000..a836efd34 --- /dev/null +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/kernel/time/tick-broadcast-hrtimer.c + * This file emulates a local clock event device + * via a pseudo clock device. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/clockchips.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/module.h> + +#include "tick-internal.h" + +static struct hrtimer bctimer; + +static int bc_shutdown(struct clock_event_device *evt) +{ + /* + * Note, we cannot cancel the timer here as we might + * run into the following live lock scenario: + * + * cpu 0 cpu1 + * lock(broadcast_lock); + * hrtimer_interrupt() + * bc_handler() + * tick_handle_oneshot_broadcast(); + * lock(broadcast_lock); + * hrtimer_cancel() + * wait_for_callback() + */ + hrtimer_try_to_cancel(&bctimer); + return 0; +} + +/* + * This is called from the guts of the broadcast code when the cpu + * which is about to enter idle has the earliest broadcast timer event. + */ +static int bc_set_next(ktime_t expires, struct clock_event_device *bc) +{ + /* + * This is called either from enter/exit idle code or from the + * broadcast handler. In all cases tick_broadcast_lock is held. + * + * hrtimer_cancel() cannot be called here neither from the + * broadcast handler nor from the enter/exit idle code. The idle + * code can run into the problem described in bc_shutdown() and the + * broadcast handler cannot wait for itself to complete for obvious + * reasons. + * + * Each caller tries to arm the hrtimer on its own CPU, but if the + * hrtimer callbback function is currently running, then + * hrtimer_start() cannot move it and the timer stays on the CPU on + * which it is assigned at the moment. + * + * As this can be called from idle code, the hrtimer_start() + * invocation has to be wrapped with RCU_NONIDLE() as + * hrtimer_start() can call into tracing. + */ + RCU_NONIDLE( { + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); + /* + * The core tick broadcast mode expects bc->bound_on to be set + * correctly to prevent a CPU which has the broadcast hrtimer + * armed from going deep idle. + * + * As tick_broadcast_lock is held, nothing can change the cpu + * base which was just established in hrtimer_start() above. So + * the below access is safe even without holding the hrtimer + * base lock. + */ + bc->bound_on = bctimer.base->cpu_base->cpu; + } ); + return 0; +} + +static struct clock_event_device ce_broadcast_hrtimer = { + .name = "bc_hrtimer", + .set_state_shutdown = bc_shutdown, + .set_next_ktime = bc_set_next, + .features = CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_KTIME | + CLOCK_EVT_FEAT_HRTIMER, + .rating = 0, + .bound_on = -1, + .min_delta_ns = 1, + .max_delta_ns = KTIME_MAX, + .min_delta_ticks = 1, + .max_delta_ticks = ULONG_MAX, + .mult = 1, + .shift = 0, + .cpumask = cpu_possible_mask, +}; + +static enum hrtimer_restart bc_handler(struct hrtimer *t) +{ + ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); + + return HRTIMER_NORESTART; +} + +void tick_setup_hrtimer_broadcast(void) +{ + hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + bctimer.function = bc_handler; + clockevents_register_device(&ce_broadcast_hrtimer); +} diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c new file mode 100644 index 000000000..aa2094d5d --- /dev/null +++ b/kernel/time/tick-broadcast.c @@ -0,0 +1,1018 @@ +/* + * linux/kernel/time/tick-broadcast.c + * + * This file contains functions which emulate a local clock-event + * device via a broadcast event source. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/module.h> + +#include "tick-internal.h" + +/* + * Broadcast support for broken x86 hardware, where the local apic + * timer stops in C3 state. + */ + +static struct tick_device tick_broadcast_device; +static cpumask_var_t tick_broadcast_mask __cpumask_var_read_mostly; +static cpumask_var_t tick_broadcast_on __cpumask_var_read_mostly; +static cpumask_var_t tmpmask __cpumask_var_read_mostly; +static int tick_broadcast_forced; + +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); + +#ifdef CONFIG_TICK_ONESHOT +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); +static void tick_broadcast_clear_oneshot(int cpu); +static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); +#else +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } +static inline void tick_broadcast_clear_oneshot(int cpu) { } +static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } +#endif + +/* + * Debugging: see timer_list.c + */ +struct tick_device *tick_get_broadcast_device(void) +{ + return &tick_broadcast_device; +} + +struct cpumask *tick_get_broadcast_mask(void) +{ + return tick_broadcast_mask; +} + +/* + * Start the device in periodic mode + */ +static void tick_broadcast_start_periodic(struct clock_event_device *bc) +{ + if (bc) + tick_setup_periodic(bc, 1); +} + +/* + * Check, if the device can be utilized as broadcast device: + */ +static bool tick_check_broadcast_device(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || + (newdev->features & CLOCK_EVT_FEAT_PERCPU) || + (newdev->features & CLOCK_EVT_FEAT_C3STOP)) + return false; + + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT && + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + + return !curdev || newdev->rating > curdev->rating; +} + +/* + * Conditionally install/replace broadcast device + */ +void tick_install_broadcast_device(struct clock_event_device *dev) +{ + struct clock_event_device *cur = tick_broadcast_device.evtdev; + + if (!tick_check_broadcast_device(cur, dev)) + return; + + if (!try_module_get(dev->owner)) + return; + + clockevents_exchange_device(cur, dev); + if (cur) + cur->event_handler = clockevents_handle_noop; + tick_broadcast_device.evtdev = dev; + if (!cpumask_empty(tick_broadcast_mask)) + tick_broadcast_start_periodic(dev); + /* + * Inform all cpus about this. We might be in a situation + * where we did not switch to oneshot mode because the per cpu + * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack + * of a oneshot capable broadcast device. Without that + * notification the systems stays stuck in periodic mode + * forever. + */ + if (dev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_clock_notify(); +} + +/* + * Check, if the device is the broadcast device + */ +int tick_is_broadcast_device(struct clock_event_device *dev) +{ + return (dev && tick_broadcast_device.evtdev == dev); +} + +int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) +{ + int ret = -ENODEV; + + if (tick_is_broadcast_device(dev)) { + raw_spin_lock(&tick_broadcast_lock); + ret = __clockevents_update_freq(dev, freq); + raw_spin_unlock(&tick_broadcast_lock); + } + return ret; +} + + +static void err_broadcast(const struct cpumask *mask) +{ + pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); +} + +static void tick_device_setup_broadcast_func(struct clock_event_device *dev) +{ + if (!dev->broadcast) + dev->broadcast = tick_broadcast; + if (!dev->broadcast) { + pr_warn_once("%s depends on broadcast, but no broadcast function available\n", + dev->name); + dev->broadcast = err_broadcast; + } +} + +/* + * Check, if the device is disfunctional and a place holder, which + * needs to be handled by the broadcast device. + */ +int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + unsigned long flags; + int ret = 0; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + /* + * Devices might be registered with both periodic and oneshot + * mode disabled. This signals, that the device needs to be + * operated from the broadcast device and is a placeholder for + * the cpu local device. + */ + if (!tick_device_is_functional(dev)) { + dev->event_handler = tick_handle_periodic; + tick_device_setup_broadcast_func(dev); + cpumask_set_cpu(cpu, tick_broadcast_mask); + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); + ret = 1; + } else { + /* + * Clear the broadcast bit for this cpu if the + * device is not power state affected. + */ + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + cpumask_clear_cpu(cpu, tick_broadcast_mask); + else + tick_device_setup_broadcast_func(dev); + + /* + * Clear the broadcast bit if the CPU is not in + * periodic broadcast on state. + */ + if (!cpumask_test_cpu(cpu, tick_broadcast_on)) + cpumask_clear_cpu(cpu, tick_broadcast_mask); + + switch (tick_broadcast_device.mode) { + case TICKDEV_MODE_ONESHOT: + /* + * If the system is in oneshot mode we can + * unconditionally clear the oneshot mask bit, + * because the CPU is running and therefore + * not in an idle state which causes the power + * state affected device to stop. Let the + * caller initialize the device. + */ + tick_broadcast_clear_oneshot(cpu); + ret = 0; + break; + + case TICKDEV_MODE_PERIODIC: + /* + * If the system is in periodic mode, check + * whether the broadcast device can be + * switched off now. + */ + if (cpumask_empty(tick_broadcast_mask) && bc) + clockevents_shutdown(bc); + /* + * If we kept the cpu in the broadcast mask, + * tell the caller to leave the per cpu device + * in shutdown state. The periodic interrupt + * is delivered by the broadcast device, if + * the broadcast device exists and is not + * hrtimer based. + */ + if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + ret = cpumask_test_cpu(cpu, tick_broadcast_mask); + break; + default: + break; + } + } + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); + return ret; +} + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +int tick_receive_broadcast(void) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + struct clock_event_device *evt = td->evtdev; + + if (!evt) + return -ENODEV; + + if (!evt->event_handler) + return -EINVAL; + + evt->event_handler(evt); + return 0; +} +#endif + +/* + * Broadcast the event to the cpus, which are set in the mask (mangled). + */ +static bool tick_do_broadcast(struct cpumask *mask) +{ + int cpu = smp_processor_id(); + struct tick_device *td; + bool local = false; + + /* + * Check, if the current cpu is in the mask + */ + if (cpumask_test_cpu(cpu, mask)) { + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + cpumask_clear_cpu(cpu, mask); + /* + * We only run the local handler, if the broadcast + * device is not hrtimer based. Otherwise we run into + * a hrtimer recursion. + * + * local timer_interrupt() + * local_handler() + * expire_hrtimers() + * bc_handler() + * local_handler() + * expire_hrtimers() + */ + local = !(bc->features & CLOCK_EVT_FEAT_HRTIMER); + } + + if (!cpumask_empty(mask)) { + /* + * It might be necessary to actually check whether the devices + * have different broadcast functions. For now, just use the + * one of the first device. This works as long as we have this + * misfeature only on x86 (lapic) + */ + td = &per_cpu(tick_cpu_device, cpumask_first(mask)); + td->evtdev->broadcast(mask); + } + return local; +} + +/* + * Periodic broadcast: + * - invoke the broadcast handlers + */ +static bool tick_do_periodic_broadcast(void) +{ + cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); + return tick_do_broadcast(tmpmask); +} + +/* + * Event handler for periodic broadcast ticks + */ +static void tick_handle_periodic_broadcast(struct clock_event_device *dev) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + bool bc_local; + + raw_spin_lock(&tick_broadcast_lock); + + /* Handle spurious interrupts gracefully */ + if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) { + raw_spin_unlock(&tick_broadcast_lock); + return; + } + + bc_local = tick_do_periodic_broadcast(); + + if (clockevent_state_oneshot(dev)) { + ktime_t next = ktime_add(dev->next_event, tick_period); + + clockevents_program_event(dev, next, true); + } + raw_spin_unlock(&tick_broadcast_lock); + + /* + * We run the handler of the local cpu after dropping + * tick_broadcast_lock because the handler might deadlock when + * trying to switch to oneshot mode. + */ + if (bc_local) + td->evtdev->event_handler(td->evtdev); +} + +/** + * tick_broadcast_control - Enable/disable or force broadcast mode + * @mode: The selected broadcast mode + * + * Called when the system enters a state where affected tick devices + * might stop. Note: TICK_BROADCAST_FORCE cannot be undone. + */ +void tick_broadcast_control(enum tick_broadcast_mode mode) +{ + struct clock_event_device *bc, *dev; + struct tick_device *td; + int cpu, bc_stopped; + unsigned long flags; + + /* Protects also the local clockevent device. */ + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + td = this_cpu_ptr(&tick_cpu_device); + dev = td->evtdev; + + /* + * Is the device not affected by the powerstate ? + */ + if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) + goto out; + + if (!tick_device_is_functional(dev)) + goto out; + + cpu = smp_processor_id(); + bc = tick_broadcast_device.evtdev; + bc_stopped = cpumask_empty(tick_broadcast_mask); + + switch (mode) { + case TICK_BROADCAST_FORCE: + tick_broadcast_forced = 1; + case TICK_BROADCAST_ON: + cpumask_set_cpu(cpu, tick_broadcast_on); + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { + /* + * Only shutdown the cpu local device, if: + * + * - the broadcast device exists + * - the broadcast device is not a hrtimer based one + * - the broadcast device is in periodic mode to + * avoid a hickup during switch to oneshot mode + */ + if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) && + tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + clockevents_shutdown(dev); + } + break; + + case TICK_BROADCAST_OFF: + if (tick_broadcast_forced) + break; + cpumask_clear_cpu(cpu, tick_broadcast_on); + if (!tick_device_is_functional(dev)) + break; + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) + tick_setup_periodic(dev, 0); + } + break; + } + + if (bc) { + if (cpumask_empty(tick_broadcast_mask)) { + if (!bc_stopped) + clockevents_shutdown(bc); + } else if (bc_stopped) { + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); + } + } +out: + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} +EXPORT_SYMBOL_GPL(tick_broadcast_control); + +/* + * Set the periodic handler depending on broadcast on/off + */ +void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) +{ + if (!broadcast) + dev->event_handler = tick_handle_periodic; + else + dev->event_handler = tick_handle_periodic_broadcast; +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Remove a CPU from broadcasting + */ +void tick_shutdown_broadcast(unsigned int cpu) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + cpumask_clear_cpu(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_broadcast_on); + + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { + if (bc && cpumask_empty(tick_broadcast_mask)) + clockevents_shutdown(bc); + } + + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} +#endif + +void tick_suspend_broadcast(void) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + if (bc) + clockevents_shutdown(bc); + + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/* + * This is called from tick_resume_local() on a resuming CPU. That's + * called from the core resume function, tick_unfreeze() and the magic XEN + * resume hackery. + * + * In none of these cases the broadcast device mode can change and the + * bit of the resuming CPU in the broadcast mask is safe as well. + */ +bool tick_resume_check_broadcast(void) +{ + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) + return false; + else + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask); +} + +void tick_resume_broadcast(void) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + + if (bc) { + clockevents_tick_resume(bc); + + switch (tick_broadcast_device.mode) { + case TICKDEV_MODE_PERIODIC: + if (!cpumask_empty(tick_broadcast_mask)) + tick_broadcast_start_periodic(bc); + break; + case TICKDEV_MODE_ONESHOT: + if (!cpumask_empty(tick_broadcast_mask)) + tick_resume_broadcast_oneshot(bc); + break; + } + } + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +#ifdef CONFIG_TICK_ONESHOT + +static cpumask_var_t tick_broadcast_oneshot_mask __cpumask_var_read_mostly; +static cpumask_var_t tick_broadcast_pending_mask __cpumask_var_read_mostly; +static cpumask_var_t tick_broadcast_force_mask __cpumask_var_read_mostly; + +/* + * Exposed for debugging: see timer_list.c + */ +struct cpumask *tick_get_broadcast_oneshot_mask(void) +{ + return tick_broadcast_oneshot_mask; +} + +/* + * Called before going idle with interrupts disabled. Checks whether a + * broadcast event from the other core is about to happen. We detected + * that in tick_broadcast_oneshot_control(). The callsite can use this + * to avoid a deep idle transition as we are about to get the + * broadcast IPI right away. + */ +int tick_check_broadcast_expired(void) +{ + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +} + +/* + * Set broadcast interrupt affinity + */ +static void tick_broadcast_set_affinity(struct clock_event_device *bc, + const struct cpumask *cpumask) +{ + if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ)) + return; + + if (cpumask_equal(bc->cpumask, cpumask)) + return; + + bc->cpumask = cpumask; + irq_set_affinity(bc->irq, bc->cpumask); +} + +static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu, + ktime_t expires) +{ + if (!clockevent_state_oneshot(bc)) + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); + + clockevents_program_event(bc, expires, 1); + tick_broadcast_set_affinity(bc, cpumask_of(cpu)); +} + +static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) +{ + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); +} + +/* + * Called from irq_enter() when idle was interrupted to reenable the + * per cpu device. + */ +void tick_check_oneshot_broadcast_this_cpu(void) +{ + if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + + /* + * We might be in the middle of switching over from + * periodic to oneshot. If the CPU has not yet + * switched over, leave the device alone. + */ + if (td->mode == TICKDEV_MODE_ONESHOT) { + clockevents_switch_state(td->evtdev, + CLOCK_EVT_STATE_ONESHOT); + } + } +} + +/* + * Handle oneshot mode broadcasting + */ +static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) +{ + struct tick_device *td; + ktime_t now, next_event; + int cpu, next_cpu = 0; + bool bc_local; + + raw_spin_lock(&tick_broadcast_lock); + dev->next_event = KTIME_MAX; + next_event = KTIME_MAX; + cpumask_clear(tmpmask); + now = ktime_get(); + /* Find all expired events */ + for_each_cpu(cpu, tick_broadcast_oneshot_mask) { + /* + * Required for !SMP because for_each_cpu() reports + * unconditionally CPU0 as set on UP kernels. + */ + if (!IS_ENABLED(CONFIG_SMP) && + cpumask_empty(tick_broadcast_oneshot_mask)) + break; + + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev->next_event <= now) { + cpumask_set_cpu(cpu, tmpmask); + /* + * Mark the remote cpu in the pending mask, so + * it can avoid reprogramming the cpu local + * timer in tick_broadcast_oneshot_control(). + */ + cpumask_set_cpu(cpu, tick_broadcast_pending_mask); + } else if (td->evtdev->next_event < next_event) { + next_event = td->evtdev->next_event; + next_cpu = cpu; + } + } + + /* + * Remove the current cpu from the pending mask. The event is + * delivered immediately in tick_do_broadcast() ! + */ + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask); + + /* Take care of enforced broadcast requests */ + cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); + cpumask_clear(tick_broadcast_force_mask); + + /* + * Sanity check. Catch the case where we try to broadcast to + * offline cpus. + */ + if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask))) + cpumask_and(tmpmask, tmpmask, cpu_online_mask); + + /* + * Wakeup the cpus which have an expired event. + */ + bc_local = tick_do_broadcast(tmpmask); + + /* + * Two reasons for reprogram: + * + * - The global event did not expire any CPU local + * events. This happens in dyntick mode, as the maximum PIT + * delta is quite small. + * + * - There are pending events on sleeping CPUs which were not + * in the event mask + */ + if (next_event != KTIME_MAX) + tick_broadcast_set_event(dev, next_cpu, next_event); + + raw_spin_unlock(&tick_broadcast_lock); + + if (bc_local) { + td = this_cpu_ptr(&tick_cpu_device); + td->evtdev->event_handler(td->evtdev); + } +} + +static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) +{ + if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return 0; + if (bc->next_event == KTIME_MAX) + return 0; + return bc->bound_on == cpu ? -EBUSY : 0; +} + +static void broadcast_shutdown_local(struct clock_event_device *bc, + struct clock_event_device *dev) +{ + /* + * For hrtimer based broadcasting we cannot shutdown the cpu + * local device if our own event is the first one to expire or + * if we own the broadcast timer. + */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) { + if (broadcast_needs_cpu(bc, smp_processor_id())) + return; + if (dev->next_event < bc->next_event) + return; + } + clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); +} + +int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct clock_event_device *bc, *dev; + int cpu, ret = 0; + ktime_t now; + + /* + * If there is no broadcast device, tell the caller not to go + * into deep idle. + */ + if (!tick_broadcast_device.evtdev) + return -EBUSY; + + dev = this_cpu_ptr(&tick_cpu_device)->evtdev; + + raw_spin_lock(&tick_broadcast_lock); + bc = tick_broadcast_device.evtdev; + cpu = smp_processor_id(); + + if (state == TICK_BROADCAST_ENTER) { + /* + * If the current CPU owns the hrtimer broadcast + * mechanism, it cannot go deep idle and we do not add + * the CPU to the broadcast mask. We don't have to go + * through the EXIT path as the local timer is not + * shutdown. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) + goto out; + + /* + * If the broadcast device is in periodic mode, we + * return. + */ + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { + /* If it is a hrtimer based broadcast, return busy */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) + ret = -EBUSY; + goto out; + } + + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { + WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); + + /* Conditionally shut down the local timer. */ + broadcast_shutdown_local(bc, dev); + + /* + * We only reprogram the broadcast timer if we + * did not mark ourself in the force mask and + * if the cpu local event is earlier than the + * broadcast event. If the current CPU is in + * the force mask, then we are going to be + * woken by the IPI right away; we return + * busy, so the CPU does not try to go deep + * idle. + */ + if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) { + ret = -EBUSY; + } else if (dev->next_event < bc->next_event) { + tick_broadcast_set_event(bc, cpu, dev->next_event); + /* + * In case of hrtimer broadcasts the + * programming might have moved the + * timer to this cpu. If yes, remove + * us from the broadcast mask and + * return busy. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) { + cpumask_clear_cpu(cpu, + tick_broadcast_oneshot_mask); + } + } + } + } else { + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); + /* + * The cpu which was handling the broadcast + * timer marked this cpu in the broadcast + * pending mask and fired the broadcast + * IPI. So we are going to handle the expired + * event anyway via the broadcast IPI + * handler. No need to reprogram the timer + * with an already expired event. + */ + if (cpumask_test_and_clear_cpu(cpu, + tick_broadcast_pending_mask)) + goto out; + + /* + * Bail out if there is no next event. + */ + if (dev->next_event == KTIME_MAX) + goto out; + /* + * If the pending bit is not set, then we are + * either the CPU handling the broadcast + * interrupt or we got woken by something else. + * + * We are not longer in the broadcast mask, so + * if the cpu local expiry time is already + * reached, we would reprogram the cpu local + * timer with an already expired event. + * + * This can lead to a ping-pong when we return + * to idle and therefor rearm the broadcast + * timer before the cpu local timer was able + * to fire. This happens because the forced + * reprogramming makes sure that the event + * will happen in the future and depending on + * the min_delta setting this might be far + * enough out that the ping-pong starts. + * + * If the cpu local next_event has expired + * then we know that the broadcast timer + * next_event has expired as well and + * broadcast is about to be handled. So we + * avoid reprogramming and enforce that the + * broadcast handler, which did not run yet, + * will invoke the cpu local handler. + * + * We cannot call the handler directly from + * here, because we might be in a NOHZ phase + * and we did not go through the irq_enter() + * nohz fixups. + */ + now = ktime_get(); + if (dev->next_event <= now) { + cpumask_set_cpu(cpu, tick_broadcast_force_mask); + goto out; + } + /* + * We got woken by something else. Reprogram + * the cpu local timer device. + */ + tick_program_event(dev->next_event, 1); + } + } +out: + raw_spin_unlock(&tick_broadcast_lock); + return ret; +} + +/* + * Reset the one shot broadcast for a cpu + * + * Called with tick_broadcast_lock held + */ +static void tick_broadcast_clear_oneshot(int cpu) +{ + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); +} + +static void tick_broadcast_init_next_event(struct cpumask *mask, + ktime_t expires) +{ + struct tick_device *td; + int cpu; + + for_each_cpu(cpu, mask) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev) + td->evtdev->next_event = expires; + } +} + +/** + * tick_broadcast_setup_oneshot - setup the broadcast device + */ +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + int cpu = smp_processor_id(); + + if (!bc) + return; + + /* Set it up only once ! */ + if (bc->event_handler != tick_handle_oneshot_broadcast) { + int was_periodic = clockevent_state_periodic(bc); + + bc->event_handler = tick_handle_oneshot_broadcast; + + /* + * We must be careful here. There might be other CPUs + * waiting for periodic broadcast. We need to set the + * oneshot_mask bits for those and program the + * broadcast device to fire. + */ + cpumask_copy(tmpmask, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tmpmask); + cpumask_or(tick_broadcast_oneshot_mask, + tick_broadcast_oneshot_mask, tmpmask); + + if (was_periodic && !cpumask_empty(tmpmask)) { + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); + tick_broadcast_init_next_event(tmpmask, + tick_next_period); + tick_broadcast_set_event(bc, cpu, tick_next_period); + } else + bc->next_event = KTIME_MAX; + } else { + /* + * The first cpu which switches to oneshot mode sets + * the bit for all other cpus which are in the general + * (periodic) broadcast mask. So the bit is set and + * would prevent the first broadcast enter after this + * to program the bc device. + */ + tick_broadcast_clear_oneshot(cpu); + } +} + +/* + * Select oneshot operating mode for the broadcast device + */ +void tick_broadcast_switch_to_oneshot(void) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; + bc = tick_broadcast_device.evtdev; + if (bc) + tick_broadcast_setup_oneshot(bc); + + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +#ifdef CONFIG_HOTPLUG_CPU +void hotplug_cpu__broadcast_tick_pull(int deadcpu) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + bc = tick_broadcast_device.evtdev; + + if (bc && broadcast_needs_cpu(bc, deadcpu)) { + /* This moves the broadcast assignment to this CPU: */ + clockevents_program_event(bc, bc->next_event, 1); + } + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/* + * Remove a dead CPU from broadcasting + */ +void tick_shutdown_broadcast_oneshot(unsigned int cpu) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + /* + * Clear the broadcast masks for the dead cpu, but do not stop + * the broadcast device! + */ + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); + cpumask_clear_cpu(cpu, tick_broadcast_force_mask); + + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} +#endif + +/* + * Check, whether the broadcast device is in one shot mode + */ +int tick_broadcast_oneshot_active(void) +{ + return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; +} + +/* + * Check whether the broadcast device supports oneshot. + */ +bool tick_broadcast_oneshot_available(void) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; +} + +#else +int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return -EBUSY; + + return 0; +} +#endif + +void __init tick_broadcast_init(void) +{ + zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT); + zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); +#ifdef CONFIG_TICK_ONESHOT + zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); +#endif +} diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c new file mode 100644 index 000000000..0a3cc37e4 --- /dev/null +++ b/kernel/time/tick-common.c @@ -0,0 +1,541 @@ +/* + * linux/kernel/time/tick-common.c + * + * This file contains the base functions to manage periodic tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/nmi.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <trace/events/power.h> + +#include <asm/irq_regs.h> + +#include "tick-internal.h" + +/* + * Tick devices + */ +DEFINE_PER_CPU(struct tick_device, tick_cpu_device); +/* + * Tick next event: keeps track of the tick time + */ +ktime_t tick_next_period; +ktime_t tick_period; + +/* + * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR + * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This + * variable has two functions: + * + * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the + * timekeeping lock all at once. Only the CPU which is assigned to do the + * update is handling it. + * + * 2) Hand off the duty in the NOHZ idle case by setting the value to + * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks + * at it will take over and keep the time keeping alive. The handover + * procedure also covers cpu hotplug. + */ +int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; + +/* + * Debugging: see timer_list.c + */ +struct tick_device *tick_get_device(int cpu) +{ + return &per_cpu(tick_cpu_device, cpu); +} + +/** + * tick_is_oneshot_available - check for a oneshot capable event device + */ +int tick_is_oneshot_available(void) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return 0; + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + return 1; + return tick_broadcast_oneshot_available(); +} + +/* + * Periodic tick + */ +static void tick_periodic(int cpu) +{ + if (tick_do_timer_cpu == cpu) { + write_seqlock(&jiffies_lock); + + /* Keep track of the next tick event */ + tick_next_period = ktime_add(tick_next_period, tick_period); + + do_timer(1); + write_sequnlock(&jiffies_lock); + update_wall_time(); + } + + update_process_times(user_mode(get_irq_regs())); + profile_tick(CPU_PROFILING); +} + +/* + * Event handler for periodic ticks + */ +void tick_handle_periodic(struct clock_event_device *dev) +{ + int cpu = smp_processor_id(); + ktime_t next = dev->next_event; + + tick_periodic(cpu); + +#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON) + /* + * The cpu might have transitioned to HIGHRES or NOHZ mode via + * update_process_times() -> run_local_timers() -> + * hrtimer_run_queues(). + */ + if (dev->event_handler != tick_handle_periodic) + return; +#endif + + if (!clockevent_state_oneshot(dev)) + return; + for (;;) { + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + next = ktime_add(next, tick_period); + + if (!clockevents_program_event(dev, next, false)) + return; + /* + * Have to be careful here. If we're in oneshot mode, + * before we call tick_periodic() in a loop, we need + * to be sure we're using a real hardware clocksource. + * Otherwise we could get trapped in an infinite + * loop, as the tick_periodic() increments jiffies, + * which then will increment time, possibly causing + * the loop to trigger again and again. + */ + if (timekeeping_valid_for_hres()) + tick_periodic(cpu); + } +} + +/* + * Setup the device for a periodic tick + */ +void tick_setup_periodic(struct clock_event_device *dev, int broadcast) +{ + tick_set_periodic_handler(dev, broadcast); + + /* Broadcast setup ? */ + if (!tick_device_is_functional(dev)) + return; + + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !tick_broadcast_oneshot_active()) { + clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); + } else { + unsigned long seq; + ktime_t next; + + do { + seq = read_seqbegin(&jiffies_lock); + next = tick_next_period; + } while (read_seqretry(&jiffies_lock, seq)); + + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); + + for (;;) { + if (!clockevents_program_event(dev, next, false)) + return; + next = ktime_add(next, tick_period); + } + } +} + +/* + * Setup the tick device + */ +static void tick_setup_device(struct tick_device *td, + struct clock_event_device *newdev, int cpu, + const struct cpumask *cpumask) +{ + void (*handler)(struct clock_event_device *) = NULL; + ktime_t next_event = 0; + + /* + * First device setup ? + */ + if (!td->evtdev) { + /* + * If no cpu took the do_timer update, assign it to + * this cpu: + */ + if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { + if (!tick_nohz_full_cpu(cpu)) + tick_do_timer_cpu = cpu; + else + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + tick_next_period = ktime_get(); + tick_period = NSEC_PER_SEC / HZ; + } + + /* + * Startup in periodic mode first. + */ + td->mode = TICKDEV_MODE_PERIODIC; + } else { + handler = td->evtdev->event_handler; + next_event = td->evtdev->next_event; + td->evtdev->event_handler = clockevents_handle_noop; + } + + td->evtdev = newdev; + + /* + * When the device is not per cpu, pin the interrupt to the + * current cpu: + */ + if (!cpumask_equal(newdev->cpumask, cpumask)) + irq_set_affinity(newdev->irq, cpumask); + + /* + * When global broadcasting is active, check if the current + * device is registered as a placeholder for broadcast mode. + * This allows us to handle this x86 misfeature in a generic + * way. This function also returns !=0 when we keep the + * current active broadcast state for this CPU. + */ + if (tick_device_uses_broadcast(newdev, cpu)) + return; + + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(newdev, 0); + else + tick_setup_oneshot(newdev, handler, next_event); +} + +void tick_install_replacement(struct clock_event_device *newdev) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + int cpu = smp_processor_id(); + + clockevents_exchange_device(td->evtdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); +} + +static bool tick_check_percpu(struct clock_event_device *curdev, + struct clock_event_device *newdev, int cpu) +{ + if (!cpumask_test_cpu(cpu, newdev->cpumask)) + return false; + if (cpumask_equal(newdev->cpumask, cpumask_of(cpu))) + return true; + /* Check if irq affinity can be set */ + if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq)) + return false; + /* Prefer an existing cpu local device */ + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) + return false; + return true; +} + +static bool tick_check_preferred(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + /* Prefer oneshot capable device */ + if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) { + if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + if (tick_oneshot_mode_active()) + return false; + } + + /* + * Use the higher rated one, but prefer a CPU local device with a lower + * rating than a non-CPU local device + */ + return !curdev || + newdev->rating > curdev->rating || + !cpumask_equal(curdev->cpumask, newdev->cpumask); +} + +/* + * Check whether the new device is a better fit than curdev. curdev + * can be NULL ! + */ +bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if (!tick_check_percpu(curdev, newdev, smp_processor_id())) + return false; + + return tick_check_preferred(curdev, newdev); +} + +/* + * Check, if the new registered device should be used. Called with + * clockevents_lock held and interrupts disabled. + */ +void tick_check_new_device(struct clock_event_device *newdev) +{ + struct clock_event_device *curdev; + struct tick_device *td; + int cpu; + + cpu = smp_processor_id(); + td = &per_cpu(tick_cpu_device, cpu); + curdev = td->evtdev; + + /* cpu local device ? */ + if (!tick_check_percpu(curdev, newdev, cpu)) + goto out_bc; + + /* Preference decision */ + if (!tick_check_preferred(curdev, newdev)) + goto out_bc; + + if (!try_module_get(newdev->owner)) + return; + + /* + * Replace the eventually existing device by the new + * device. If the current device is the broadcast device, do + * not give it back to the clockevents layer ! + */ + if (tick_is_broadcast_device(curdev)) { + clockevents_shutdown(curdev); + curdev = NULL; + } + clockevents_exchange_device(curdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); + return; + +out_bc: + /* + * Can the new device be used as a broadcast device ? + */ + tick_install_broadcast_device(newdev); +} + +/** + * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode + * @state: The target state (enter/exit) + * + * The system enters/leaves a state, where affected devices might stop + * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. + * + * Called with interrupts disabled, so clockevents_lock is not + * required here because the local clock event device cannot go away + * under us. + */ +int tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + + if (!(td->evtdev->features & CLOCK_EVT_FEAT_C3STOP)) + return 0; + + return __tick_broadcast_oneshot_control(state); +} +EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Transfer the do_timer job away from a dying cpu. + * + * Called with interrupts disabled. Not locking required. If + * tick_do_timer_cpu is owned by this cpu, nothing can change it. + */ +void tick_handover_do_timer(void) +{ + if (tick_do_timer_cpu == smp_processor_id()) { + int cpu = cpumask_first(cpu_online_mask); + + tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : + TICK_DO_TIMER_NONE; + } +} + +/* + * Shutdown an event device on a given cpu: + * + * This is called on a life CPU, when a CPU is dead. So we cannot + * access the hardware device itself. + * We just set the mode and remove it from the lists. + */ +void tick_shutdown(unsigned int cpu) +{ + struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + struct clock_event_device *dev = td->evtdev; + + td->mode = TICKDEV_MODE_PERIODIC; + if (dev) { + /* + * Prevent that the clock events layer tries to call + * the set mode function! + */ + clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); + clockevents_exchange_device(dev, NULL); + dev->event_handler = clockevents_handle_noop; + td->evtdev = NULL; + } +} +#endif + +/** + * tick_suspend_local - Suspend the local tick device + * + * Called from the local cpu for freeze with interrupts disabled. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_suspend_local(void) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + + clockevents_shutdown(td->evtdev); +} + +/** + * tick_resume_local - Resume the local tick device + * + * Called from the local CPU for unfreeze or XEN resume magic. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_resume_local(void) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + bool broadcast = tick_resume_check_broadcast(); + + clockevents_tick_resume(td->evtdev); + if (!broadcast) { + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(td->evtdev, 0); + else + tick_resume_oneshot(); + } +} + +/** + * tick_suspend - Suspend the tick and the broadcast device + * + * Called from syscore_suspend() via timekeeping_suspend with only one + * CPU online and interrupts disabled or from tick_unfreeze() under + * tick_freeze_lock. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_suspend(void) +{ + tick_suspend_local(); + tick_suspend_broadcast(); +} + +/** + * tick_resume - Resume the tick and the broadcast device + * + * Called from syscore_resume() via timekeeping_resume with only one + * CPU online and interrupts disabled. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_resume(void) +{ + tick_resume_broadcast(); + tick_resume_local(); +} + +#ifdef CONFIG_SUSPEND +static DEFINE_RAW_SPINLOCK(tick_freeze_lock); +static unsigned int tick_freeze_depth; + +/** + * tick_freeze - Suspend the local tick and (possibly) timekeeping. + * + * Check if this is the last online CPU executing the function and if so, + * suspend timekeeping. Otherwise suspend the local tick. + * + * Call with interrupts disabled. Must be balanced with %tick_unfreeze(). + * Interrupts must not be enabled before the subsequent %tick_unfreeze(). + */ +void tick_freeze(void) +{ + raw_spin_lock(&tick_freeze_lock); + + tick_freeze_depth++; + if (tick_freeze_depth == num_online_cpus()) { + trace_suspend_resume(TPS("timekeeping_freeze"), + smp_processor_id(), true); + system_state = SYSTEM_SUSPEND; + sched_clock_suspend(); + timekeeping_suspend(); + } else { + tick_suspend_local(); + } + + raw_spin_unlock(&tick_freeze_lock); +} + +/** + * tick_unfreeze - Resume the local tick and (possibly) timekeeping. + * + * Check if this is the first CPU executing the function and if so, resume + * timekeeping. Otherwise resume the local tick. + * + * Call with interrupts disabled. Must be balanced with %tick_freeze(). + * Interrupts must not be enabled after the preceding %tick_freeze(). + */ +void tick_unfreeze(void) +{ + raw_spin_lock(&tick_freeze_lock); + + if (tick_freeze_depth == num_online_cpus()) { + timekeeping_resume(); + sched_clock_resume(); + system_state = SYSTEM_RUNNING; + trace_suspend_resume(TPS("timekeeping_freeze"), + smp_processor_id(), false); + } else { + touch_softlockup_watchdog(); + tick_resume_local(); + } + + tick_freeze_depth--; + + raw_spin_unlock(&tick_freeze_lock); +} +#endif /* CONFIG_SUSPEND */ + +/** + * tick_init - initialize the tick control + */ +void __init tick_init(void) +{ + tick_broadcast_init(); + tick_nohz_init(); +} diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h new file mode 100644 index 000000000..e277284c2 --- /dev/null +++ b/kernel/time/tick-internal.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * tick internal variable and functions used by low/high res code + */ +#include <linux/hrtimer.h> +#include <linux/tick.h> + +#include "timekeeping.h" +#include "tick-sched.h" + +#ifdef CONFIG_GENERIC_CLOCKEVENTS + +# define TICK_DO_TIMER_NONE -1 +# define TICK_DO_TIMER_BOOT -2 + +DECLARE_PER_CPU(struct tick_device, tick_cpu_device); +extern ktime_t tick_next_period; +extern ktime_t tick_period; +extern int tick_do_timer_cpu __read_mostly; + +extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); +extern void tick_handle_periodic(struct clock_event_device *dev); +extern void tick_check_new_device(struct clock_event_device *dev); +extern void tick_shutdown(unsigned int cpu); +extern void tick_suspend(void); +extern void tick_resume(void); +extern bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev); +extern void tick_install_replacement(struct clock_event_device *dev); +extern int tick_is_oneshot_available(void); +extern struct tick_device *tick_get_device(int cpu); + +extern int clockevents_tick_resume(struct clock_event_device *dev); +/* Check, if the device is functional or a dummy for broadcast */ +static inline int tick_device_is_functional(struct clock_event_device *dev) +{ + return !(dev->features & CLOCK_EVT_FEAT_DUMMY); +} + +static inline enum clock_event_state clockevent_get_state(struct clock_event_device *dev) +{ + return dev->state_use_accessors; +} + +static inline void clockevent_set_state(struct clock_event_device *dev, + enum clock_event_state state) +{ + dev->state_use_accessors = state; +} + +extern void clockevents_shutdown(struct clock_event_device *dev); +extern void clockevents_exchange_device(struct clock_event_device *old, + struct clock_event_device *new); +extern void clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state); +extern int clockevents_program_event(struct clock_event_device *dev, + ktime_t expires, bool force); +extern void clockevents_handle_noop(struct clock_event_device *dev); +extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); +extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); + +/* Broadcasting support */ +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); +extern void tick_install_broadcast_device(struct clock_event_device *dev); +extern int tick_is_broadcast_device(struct clock_event_device *dev); +extern void tick_shutdown_broadcast(unsigned int cpu); +extern void tick_suspend_broadcast(void); +extern void tick_resume_broadcast(void); +extern bool tick_resume_check_broadcast(void); +extern void tick_broadcast_init(void); +extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); +extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); +extern struct tick_device *tick_get_broadcast_device(void); +extern struct cpumask *tick_get_broadcast_mask(void); +# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */ +static inline void tick_install_broadcast_device(struct clock_event_device *dev) { } +static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } +static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } +static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } +static inline void tick_shutdown_broadcast(unsigned int cpu) { } +static inline void tick_suspend_broadcast(void) { } +static inline void tick_resume_broadcast(void) { } +static inline bool tick_resume_check_broadcast(void) { return false; } +static inline void tick_broadcast_init(void) { } +static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; } + +/* Set the periodic handler in non broadcast mode */ +static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) +{ + dev->event_handler = tick_handle_periodic; +} +# endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */ + +#else /* !GENERIC_CLOCKEVENTS: */ +static inline void tick_suspend(void) { } +static inline void tick_resume(void) { } +#endif /* !GENERIC_CLOCKEVENTS */ + +/* Oneshot related functions */ +#ifdef CONFIG_TICK_ONESHOT +extern void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt); +extern int tick_program_event(ktime_t expires, int force); +extern void tick_oneshot_notify(void); +extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); +extern void tick_resume_oneshot(void); +static inline bool tick_oneshot_possible(void) { return true; } +extern int tick_oneshot_mode_active(void); +extern void tick_clock_notify(void); +extern int tick_check_oneshot_change(int allow_nohz); +extern int tick_init_highres(void); +#else /* !CONFIG_TICK_ONESHOT: */ +static inline +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt) { BUG(); } +static inline void tick_resume_oneshot(void) { BUG(); } +static inline int tick_program_event(ktime_t expires, int force) { return 0; } +static inline void tick_oneshot_notify(void) { } +static inline bool tick_oneshot_possible(void) { return false; } +static inline int tick_oneshot_mode_active(void) { return 0; } +static inline void tick_clock_notify(void) { } +static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +#endif /* !CONFIG_TICK_ONESHOT */ + +/* Functions related to oneshot broadcasting */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) +extern void tick_broadcast_switch_to_oneshot(void); +extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); +extern int tick_broadcast_oneshot_active(void); +extern void tick_check_oneshot_broadcast_this_cpu(void); +bool tick_broadcast_oneshot_available(void); +extern struct cpumask *tick_get_broadcast_oneshot_mask(void); +#else /* !(BROADCAST && ONESHOT): */ +static inline void tick_broadcast_switch_to_oneshot(void) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } +static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline void tick_check_oneshot_broadcast_this_cpu(void) { } +static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } +#endif /* !(BROADCAST && ONESHOT) */ + +/* NO_HZ_FULL internal */ +#ifdef CONFIG_NO_HZ_FULL +extern void tick_nohz_init(void); +# else +static inline void tick_nohz_init(void) { } +#endif + +#ifdef CONFIG_NO_HZ_COMMON +extern unsigned long tick_nohz_active; +extern void timers_update_nohz(void); +# ifdef CONFIG_SMP +extern struct static_key_false timers_migration_enabled; +# endif +#else /* CONFIG_NO_HZ_COMMON */ +static inline void timers_update_nohz(void) { } +#define tick_nohz_active (0) +#endif + +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + +extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); +void timer_clear_idle(void); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c new file mode 100644 index 000000000..6fe615d57 --- /dev/null +++ b/kernel/time/tick-oneshot.c @@ -0,0 +1,132 @@ +/* + * linux/kernel/time/tick-oneshot.c + * + * This file contains functions which manage high resolution tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> + +#include "tick-internal.h" + +/** + * tick_program_event + */ +int tick_program_event(ktime_t expires, int force) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + + if (unlikely(expires == KTIME_MAX)) { + /* + * We don't need the clock event device any more, stop it. + */ + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); + dev->next_event = KTIME_MAX; + return 0; + } + + if (unlikely(clockevent_state_oneshot_stopped(dev))) { + /* + * We need the clock event again, configure it in ONESHOT mode + * before using it. + */ + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); + } + + return clockevents_program_event(dev, expires, force); +} + +/** + * tick_resume_onshot - resume oneshot mode + */ +void tick_resume_oneshot(void) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_program_event(dev, ktime_get(), true); +} + +/** + * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) + */ +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t next_event) +{ + newdev->event_handler = handler; + clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT); + clockevents_program_event(newdev, next_event, true); +} + +/** + * tick_switch_to_oneshot - switch to oneshot mode + */ +int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + struct clock_event_device *dev = td->evtdev; + + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || + !tick_device_is_functional(dev)) { + + pr_info("Clockevents: could not switch to one-shot mode:"); + if (!dev) { + pr_cont(" no tick device\n"); + } else { + if (!tick_device_is_functional(dev)) + pr_cont(" %s is not functional.\n", dev->name); + else + pr_cont(" %s does not support one-shot mode.\n", + dev->name); + } + return -EINVAL; + } + + td->mode = TICKDEV_MODE_ONESHOT; + dev->event_handler = handler; + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); + tick_broadcast_switch_to_oneshot(); + return 0; +} + +/** + * tick_check_oneshot_mode - check whether the system is in oneshot mode + * + * returns 1 when either nohz or highres are enabled. otherwise 0. + */ +int tick_oneshot_mode_active(void) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT; + local_irq_restore(flags); + + return ret; +} + +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * tick_init_highres - switch to high resolution mode + * + * Called with interrupts disabled. + */ +int tick_init_highres(void) +{ + return tick_switch_to_oneshot(hrtimer_interrupt); +} +#endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c new file mode 100644 index 000000000..48403fb65 --- /dev/null +++ b/kernel/time/tick-sched.c @@ -0,0 +1,1393 @@ +/* + * linux/kernel/time/tick-sched.c + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner + * + * No idle tick implementation for low and high resolution timers + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * Distribute under GPLv2. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/nmi.h> +#include <linux/profile.h> +#include <linux/sched/signal.h> +#include <linux/sched/clock.h> +#include <linux/sched/stat.h> +#include <linux/sched/nohz.h> +#include <linux/module.h> +#include <linux/irq_work.h> +#include <linux/posix-timers.h> +#include <linux/context_tracking.h> +#include <linux/mm.h> + +#include <asm/irq_regs.h> + +#include "tick-internal.h" + +#include <trace/events/timer.h> + +/* + * Per-CPU nohz control structure + */ +static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); + +struct tick_sched *tick_get_tick_sched(int cpu) +{ + return &per_cpu(tick_cpu_sched, cpu); +} + +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +/* + * The time, when the last jiffy update happened. Protected by jiffies_lock. + */ +static ktime_t last_jiffies_update; + +/* + * Must be called with interrupts disabled ! + */ +static void tick_do_update_jiffies64(ktime_t now) +{ + unsigned long ticks = 0; + ktime_t delta; + + /* + * Do a quick check without holding jiffies_lock: + * The READ_ONCE() pairs with two updates done later in this function. + */ + delta = ktime_sub(now, READ_ONCE(last_jiffies_update)); + if (delta < tick_period) + return; + + /* Reevaluate with jiffies_lock held */ + write_seqlock(&jiffies_lock); + + delta = ktime_sub(now, last_jiffies_update); + if (delta >= tick_period) { + + delta = ktime_sub(delta, tick_period); + /* Pairs with the lockless read in this function. */ + WRITE_ONCE(last_jiffies_update, + ktime_add(last_jiffies_update, tick_period)); + + /* Slow path for long timeouts */ + if (unlikely(delta >= tick_period)) { + s64 incr = ktime_to_ns(tick_period); + + ticks = ktime_divns(delta, incr); + + /* Pairs with the lockless read in this function. */ + WRITE_ONCE(last_jiffies_update, + ktime_add_ns(last_jiffies_update, + incr * ticks)); + } + do_timer(++ticks); + + /* Keep the tick_next_period variable up to date */ + tick_next_period = ktime_add(last_jiffies_update, tick_period); + } else { + write_sequnlock(&jiffies_lock); + return; + } + write_sequnlock(&jiffies_lock); + update_wall_time(); +} + +/* + * Initialize and return retrieve the jiffies update. + */ +static ktime_t tick_init_jiffy_update(void) +{ + ktime_t period; + + write_seqlock(&jiffies_lock); + /* Did we start the jiffies update yet ? */ + if (last_jiffies_update == 0) + last_jiffies_update = tick_next_period; + period = last_jiffies_update; + write_sequnlock(&jiffies_lock); + return period; +} + +static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) +{ + int cpu = smp_processor_id(); + +#ifdef CONFIG_NO_HZ_COMMON + /* + * Check if the do_timer duty was dropped. We don't care about + * concurrency: This happens only when the CPU in charge went + * into a long sleep. If two CPUs happen to assign themselves to + * this duty, then the jiffies update is still serialized by + * jiffies_lock. + */ + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) + && !tick_nohz_full_cpu(cpu)) + tick_do_timer_cpu = cpu; +#endif + + /* Check, if the jiffies need an update */ + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); + + if (ts->inidle) + ts->got_idle_tick = 1; +} + +static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) +{ +#ifdef CONFIG_NO_HZ_COMMON + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on complete idle SMP systems while + * waiting on the login prompt. We also increment the "start of + * idle" jiffy stamp so the idle accounting adjustment we do + * when we go busy again does not account too much ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog_sched(); + if (is_idle_task(current)) + ts->idle_jiffies++; + /* + * In case the current tick fired too early past its expected + * expiration, make sure we don't bypass the next clock reprogramming + * to the same deadline. + */ + ts->next_tick = 0; + } +#endif + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); +} +#endif + +#ifdef CONFIG_NO_HZ_FULL +cpumask_var_t tick_nohz_full_mask; +bool tick_nohz_full_running; +static atomic_t tick_dep_mask; + +static bool check_tick_dependency(atomic_t *dep) +{ + int val = atomic_read(dep); + + if (val & TICK_DEP_MASK_POSIX_TIMER) { + trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); + return true; + } + + if (val & TICK_DEP_MASK_PERF_EVENTS) { + trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); + return true; + } + + if (val & TICK_DEP_MASK_SCHED) { + trace_tick_stop(0, TICK_DEP_MASK_SCHED); + return true; + } + + if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { + trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); + return true; + } + + return false; +} + +static bool can_stop_full_tick(int cpu, struct tick_sched *ts) +{ + lockdep_assert_irqs_disabled(); + + if (unlikely(!cpu_online(cpu))) + return false; + + if (check_tick_dependency(&tick_dep_mask)) + return false; + + if (check_tick_dependency(&ts->tick_dep_mask)) + return false; + + if (check_tick_dependency(¤t->tick_dep_mask)) + return false; + + if (check_tick_dependency(¤t->signal->tick_dep_mask)) + return false; + + return true; +} + +static void nohz_full_kick_func(struct irq_work *work) +{ + /* Empty, the tick restart happens on tick_nohz_irq_exit() */ +} + +static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { + .func = nohz_full_kick_func, +}; + +/* + * Kick this CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), + * is NMI safe. + */ +static void tick_nohz_full_kick(void) +{ + if (!tick_nohz_full_cpu(smp_processor_id())) + return; + + irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); +} + +/* + * Kick the CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick_cpu(int cpu) +{ + if (!tick_nohz_full_cpu(cpu)) + return; + + irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); +} + +/* + * Kick all full dynticks CPUs in order to force these to re-evaluate + * their dependency on the tick and restart it if necessary. + */ +static void tick_nohz_full_kick_all(void) +{ + int cpu; + + if (!tick_nohz_full_running) + return; + + preempt_disable(); + for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) + tick_nohz_full_kick_cpu(cpu); + preempt_enable(); +} + +static void tick_nohz_dep_set_all(atomic_t *dep, + enum tick_dep_bits bit) +{ + int prev; + + prev = atomic_fetch_or(BIT(bit), dep); + if (!prev) + tick_nohz_full_kick_all(); +} + +/* + * Set a global tick dependency. Used by perf events that rely on freq and + * by unstable clock. + */ +void tick_nohz_dep_set(enum tick_dep_bits bit) +{ + tick_nohz_dep_set_all(&tick_dep_mask, bit); +} + +void tick_nohz_dep_clear(enum tick_dep_bits bit) +{ + atomic_andnot(BIT(bit), &tick_dep_mask); +} + +/* + * Set per-CPU tick dependency. Used by scheduler and perf events in order to + * manage events throttling. + */ +void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) +{ + int prev; + struct tick_sched *ts; + + ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); + if (!prev) { + preempt_disable(); + /* Perf needs local kick that is NMI safe */ + if (cpu == smp_processor_id()) { + tick_nohz_full_kick(); + } else { + /* Remote irq work not NMI-safe */ + if (!WARN_ON_ONCE(in_nmi())) + tick_nohz_full_kick_cpu(cpu); + } + preempt_enable(); + } +} + +void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) +{ + struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + atomic_andnot(BIT(bit), &ts->tick_dep_mask); +} + +/* + * Set a per-task tick dependency. Posix CPU timers need this in order to elapse + * per task timers. + */ +void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + /* + * We could optimize this with just kicking the target running the task + * if that noise matters for nohz full users. + */ + tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); +} + +void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + atomic_andnot(BIT(bit), &tsk->tick_dep_mask); +} + +/* + * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse + * per process timers. + */ +void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit) +{ + tick_nohz_dep_set_all(&sig->tick_dep_mask, bit); +} + +void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) +{ + atomic_andnot(BIT(bit), &sig->tick_dep_mask); +} + +/* + * Re-evaluate the need for the tick as we switch the current task. + * It might need the tick due to per task/process properties: + * perf events, posix CPU timers, ... + */ +void __tick_nohz_task_switch(void) +{ + unsigned long flags; + struct tick_sched *ts; + + local_irq_save(flags); + + if (!tick_nohz_full_cpu(smp_processor_id())) + goto out; + + ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->tick_stopped) { + if (atomic_read(¤t->tick_dep_mask) || + atomic_read(¤t->signal->tick_dep_mask)) + tick_nohz_full_kick(); + } +out: + local_irq_restore(flags); +} + +/* Get the boot-time nohz CPU list from the kernel parameters. */ +void __init tick_nohz_full_setup(cpumask_var_t cpumask) +{ + alloc_bootmem_cpumask_var(&tick_nohz_full_mask); + cpumask_copy(tick_nohz_full_mask, cpumask); + tick_nohz_full_running = true; +} + +static int tick_nohz_cpu_down(unsigned int cpu) +{ + /* + * The boot CPU handles housekeeping duty (unbound timers, + * workqueues, timekeeping, ...) on behalf of full dynticks + * CPUs. It must remain online when nohz full is enabled. + */ + if (tick_nohz_full_running && tick_do_timer_cpu == cpu) + return -EBUSY; + return 0; +} + +void __init tick_nohz_init(void) +{ + int cpu, ret; + + if (!tick_nohz_full_running) + return; + + /* + * Full dynticks uses irq work to drive the tick rescheduling on safe + * locking contexts. But then we need irq work to raise its own + * interrupts to avoid circular dependency on the tick + */ + if (!arch_irq_work_has_interrupt()) { + pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); + cpumask_clear(tick_nohz_full_mask); + tick_nohz_full_running = false; + return; + } + + cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { + pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", + cpu); + cpumask_clear_cpu(cpu, tick_nohz_full_mask); + } + + for_each_cpu(cpu, tick_nohz_full_mask) + context_tracking_cpu_set(cpu); + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "kernel/nohz:predown", NULL, + tick_nohz_cpu_down); + WARN_ON(ret < 0); + pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", + cpumask_pr_args(tick_nohz_full_mask)); +} +#endif + +/* + * NOHZ - aka dynamic tick functionality + */ +#ifdef CONFIG_NO_HZ_COMMON +/* + * NO HZ enabled ? + */ +bool tick_nohz_enabled __read_mostly = true; +unsigned long tick_nohz_active __read_mostly; +/* + * Enable / Disable tickless mode + */ +static int __init setup_tick_nohz(char *str) +{ + return (kstrtobool(str, &tick_nohz_enabled) == 0); +} + +__setup("nohz=", setup_tick_nohz); + +bool tick_nohz_tick_stopped(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->tick_stopped; +} + +bool tick_nohz_tick_stopped_cpu(int cpu) +{ + struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + return ts->tick_stopped; +} + +/** + * tick_nohz_update_jiffies - update jiffies when idle was interrupted + * + * Called from interrupt entry when the CPU was idle + * + * In case the sched_tick was stopped on this CPU, we have to check if jiffies + * must be updated. Otherwise an interrupt handler could use a stale jiffy + * value. We do this unconditionally on any CPU, as we don't know whether the + * CPU, which has the update task assigned is in a long sleep. + */ +static void tick_nohz_update_jiffies(ktime_t now) +{ + unsigned long flags; + + __this_cpu_write(tick_cpu_sched.idle_waketime, now); + + local_irq_save(flags); + tick_do_update_jiffies64(now); + local_irq_restore(flags); + + touch_softlockup_watchdog_sched(); +} + +/* + * Updates the per-CPU time idle statistics counters + */ +static void +update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) +{ + ktime_t delta; + + if (ts->idle_active) { + delta = ktime_sub(now, ts->idle_entrytime); + if (nr_iowait_cpu(cpu) > 0) + ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); + else + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + ts->idle_entrytime = now; + } + + if (last_update_time) + *last_update_time = ktime_to_us(now); + +} + +static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) +{ + update_ts_time_stats(smp_processor_id(), ts, now, NULL); + ts->idle_active = 0; + + sched_clock_idle_wakeup_event(); +} + +static void tick_nohz_start_idle(struct tick_sched *ts) +{ + ts->idle_entrytime = ktime_get(); + ts->idle_active = 1; + sched_clock_idle_sleep_event(); +} + +/** + * get_cpu_idle_time_us - get the total idle time of a CPU + * @cpu: CPU number to query + * @last_update_time: variable to store update time in. Do not update + * counters if NULL. + * + * Return the cumulative idle time (since boot) for a given + * CPU, in microseconds. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. + */ +u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now, idle; + + if (!tick_nohz_active) + return -1; + + now = ktime_get(); + if (last_update_time) { + update_ts_time_stats(cpu, ts, now, last_update_time); + idle = ts->idle_sleeptime; + } else { + if (ts->idle_active && !nr_iowait_cpu(cpu)) { + ktime_t delta = ktime_sub(now, ts->idle_entrytime); + + idle = ktime_add(ts->idle_sleeptime, delta); + } else { + idle = ts->idle_sleeptime; + } + } + + return ktime_to_us(idle); + +} +EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); + +/** + * get_cpu_iowait_time_us - get the total iowait time of a CPU + * @cpu: CPU number to query + * @last_update_time: variable to store update time in. Do not update + * counters if NULL. + * + * Return the cumulative iowait time (since boot) for a given + * CPU, in microseconds. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. + */ +u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now, iowait; + + if (!tick_nohz_active) + return -1; + + now = ktime_get(); + if (last_update_time) { + update_ts_time_stats(cpu, ts, now, last_update_time); + iowait = ts->iowait_sleeptime; + } else { + if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { + ktime_t delta = ktime_sub(now, ts->idle_entrytime); + + iowait = ktime_add(ts->iowait_sleeptime, delta); + } else { + iowait = ts->iowait_sleeptime; + } + } + + return ktime_to_us(iowait); +} +EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); + +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ + hrtimer_cancel(&ts->sched_timer); + hrtimer_set_expires(&ts->sched_timer, ts->last_tick); + + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); + else + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + + /* + * Reset to make sure next tick stop doesn't get fooled by past + * cached clock deadline. + */ + ts->next_tick = 0; +} + +static inline bool local_timer_softirq_pending(void) +{ + return local_softirq_pending() & BIT(TIMER_SOFTIRQ); +} + +static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) +{ + u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; + unsigned long seq, basejiff; + + /* Read jiffies and the time when jiffies were updated last */ + do { + seq = read_seqbegin(&jiffies_lock); + basemono = last_jiffies_update; + basejiff = jiffies; + } while (read_seqretry(&jiffies_lock, seq)); + ts->last_jiffies = basejiff; + ts->timer_expires_base = basemono; + + /* + * Keep the periodic tick, when RCU, architecture or irq_work + * requests it. + * Aside of that check whether the local timer softirq is + * pending. If so its a bad idea to call get_next_timer_interrupt() + * because there is an already expired timer, so it will request + * immeditate expiry, which rearms the hardware timer with a + * minimal delta which brings us back to this place + * immediately. Lather, rinse and repeat... + */ + if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || + irq_work_needs_cpu() || local_timer_softirq_pending()) { + next_tick = basemono + TICK_NSEC; + } else { + /* + * Get the next pending timer. If high resolution + * timers are enabled this only takes the timer wheel + * timers into account. If high resolution timers are + * disabled this also looks at the next expiring + * hrtimer. + */ + next_tmr = get_next_timer_interrupt(basejiff, basemono); + ts->next_timer = next_tmr; + /* Take the next rcu event into account */ + next_tick = next_rcu < next_tmr ? next_rcu : next_tmr; + } + + /* + * If the tick is due in the next period, keep it ticking or + * force prod the timer. + */ + delta = next_tick - basemono; + if (delta <= (u64)TICK_NSEC) { + /* + * Tell the timer code that the base is not idle, i.e. undo + * the effect of get_next_timer_interrupt(): + */ + timer_clear_idle(); + /* + * We've not stopped the tick yet, and there's a timer in the + * next period, so no point in stopping it either, bail. + */ + if (!ts->tick_stopped) { + ts->timer_expires = 0; + goto out; + } + } + + /* + * If this CPU is the one which had the do_timer() duty last, we limit + * the sleep time to the timekeeping max_deferment value. + * Otherwise we can sleep as long as we want. + */ + delta = timekeeping_max_deferment(); + if (cpu != tick_do_timer_cpu && + (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last)) + delta = KTIME_MAX; + + /* Calculate the next expiry time */ + if (delta < (KTIME_MAX - basemono)) + expires = basemono + delta; + else + expires = KTIME_MAX; + + ts->timer_expires = min_t(u64, expires, next_tick); + +out: + return ts->timer_expires; +} + +static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + u64 basemono = ts->timer_expires_base; + u64 expires = ts->timer_expires; + ktime_t tick = expires; + + /* Make sure we won't be trying to stop it twice in a row. */ + ts->timer_expires_base = 0; + + /* + * If this CPU is the one which updates jiffies, then give up + * the assignment and let it be taken by the CPU which runs + * the tick timer next, which might be this CPU as well. If we + * don't drop this here the jiffies might be stale and + * do_timer() never invoked. Keep track of the fact that it + * was the one which had the do_timer() duty last. + */ + if (cpu == tick_do_timer_cpu) { + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + ts->do_timer_last = 1; + } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { + ts->do_timer_last = 0; + } + + /* Skip reprogram of event if its not changed */ + if (ts->tick_stopped && (expires == ts->next_tick)) { + /* Sanity check: make sure clockevent is actually programmed */ + if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) + return; + + WARN_ON_ONCE(1); + printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", + basemono, ts->next_tick, dev->next_event, + hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); + } + + /* + * nohz_stop_sched_tick can be called several times before + * the nohz_restart_sched_tick is called. This happens when + * interrupts arrive which do not cause a reschedule. In the + * first call we save the current tick time, so we can restart + * the scheduler tick in nohz_restart_sched_tick. + */ + if (!ts->tick_stopped) { + calc_load_nohz_start(); + cpu_load_update_nohz_start(); + quiet_vmstat(); + + ts->last_tick = hrtimer_get_expires(&ts->sched_timer); + ts->tick_stopped = 1; + trace_tick_stop(1, TICK_DEP_MASK_NONE); + } + + ts->next_tick = tick; + + /* + * If the expiration time == KTIME_MAX, then we simply stop + * the tick timer. + */ + if (unlikely(expires == KTIME_MAX)) { + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_cancel(&ts->sched_timer); + return; + } + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); + } else { + hrtimer_set_expires(&ts->sched_timer, tick); + tick_program_event(tick, 1); + } +} + +static void tick_nohz_retain_tick(struct tick_sched *ts) +{ + ts->timer_expires_base = 0; +} + +#ifdef CONFIG_NO_HZ_FULL +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu) +{ + if (tick_nohz_next_event(ts, cpu)) + tick_nohz_stop_tick(ts, cpu); + else + tick_nohz_retain_tick(ts); +} +#endif /* CONFIG_NO_HZ_FULL */ + +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) +{ + /* Update jiffies first */ + tick_do_update_jiffies64(now); + cpu_load_update_nohz_stop(); + + /* + * Clear the timer idle flag, so we avoid IPIs on remote queueing and + * the clock forward checks in the enqueue path: + */ + timer_clear_idle(); + + calc_load_nohz_stop(); + touch_softlockup_watchdog_sched(); + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + ts->idle_exittime = now; + + tick_nohz_restart(ts, now); +} + +static void tick_nohz_full_update_tick(struct tick_sched *ts) +{ +#ifdef CONFIG_NO_HZ_FULL + int cpu = smp_processor_id(); + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) + return; + + if (can_stop_full_tick(cpu, ts)) + tick_nohz_stop_sched_tick(ts, cpu); + else if (ts->tick_stopped) + tick_nohz_restart_sched_tick(ts, ktime_get()); +#endif +} + +static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) +{ + /* + * If this CPU is offline and it is the one which updates + * jiffies, then give up the assignment and let it be taken by + * the CPU which runs the tick timer next. If we don't drop + * this here the jiffies might be stale and do_timer() never + * invoked. + */ + if (unlikely(!cpu_online(cpu))) { + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + /* + * Make sure the CPU doesn't get fooled by obsolete tick + * deadline if it comes back online later. + */ + ts->next_tick = 0; + return false; + } + + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + return false; + + if (need_resched()) + return false; + + if (unlikely(local_softirq_pending() && cpu_online(cpu))) { + static int ratelimit; + + if (ratelimit < 10 && + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { + pr_warn("NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); + ratelimit++; + } + return false; + } + + if (tick_nohz_full_enabled()) { + /* + * Keep the tick alive to guarantee timekeeping progression + * if there are full dynticks CPUs around + */ + if (tick_do_timer_cpu == cpu) + return false; + /* + * Boot safety: make sure the timekeeping duty has been + * assigned before entering dyntick-idle mode, + */ + if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) + return false; + } + + return true; +} + +static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) +{ + ktime_t expires; + int cpu = smp_processor_id(); + + /* + * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the + * tick timer expiration time is known already. + */ + if (ts->timer_expires_base) + expires = ts->timer_expires; + else if (can_stop_idle_tick(cpu, ts)) + expires = tick_nohz_next_event(ts, cpu); + else + return; + + ts->idle_calls++; + + if (expires > 0LL) { + int was_stopped = ts->tick_stopped; + + tick_nohz_stop_tick(ts, cpu); + + ts->idle_sleeps++; + ts->idle_expires = expires; + + if (!was_stopped && ts->tick_stopped) { + ts->idle_jiffies = ts->last_jiffies; + nohz_balance_enter_idle(cpu); + } + } else { + tick_nohz_retain_tick(ts); + } +} + +/** + * tick_nohz_idle_stop_tick - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + */ +void tick_nohz_idle_stop_tick(void) +{ + __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched)); +} + +void tick_nohz_idle_retain_tick(void) +{ + tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); + /* + * Undo the effect of get_next_timer_interrupt() called from + * tick_nohz_next_event(). + */ + timer_clear_idle(); +} + +/** + * tick_nohz_idle_enter - prepare for entering idle on the current CPU + * + * Called when we start the idle loop. + */ +void tick_nohz_idle_enter(void) +{ + struct tick_sched *ts; + + lockdep_assert_irqs_enabled(); + + local_irq_disable(); + + ts = this_cpu_ptr(&tick_cpu_sched); + + WARN_ON_ONCE(ts->timer_expires_base); + + ts->inidle = 1; + tick_nohz_start_idle(ts); + + local_irq_enable(); +} + +/** + * tick_nohz_irq_exit - update next tick event from interrupt exit + * + * When an interrupt fires while we are idle and it doesn't cause + * a reschedule, it may still add, modify or delete a timer, enqueue + * an RCU callback, etc... + * So we need to re-calculate and reprogram the next tick event. + */ +void tick_nohz_irq_exit(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->inidle) + tick_nohz_start_idle(ts); + else + tick_nohz_full_update_tick(ts); +} + +/** + * tick_nohz_idle_got_tick - Check whether or not the tick handler has run + */ +bool tick_nohz_idle_got_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->got_idle_tick) { + ts->got_idle_tick = 0; + return true; + } + return false; +} + +/** + * tick_nohz_get_sleep_length - return the expected length of the current sleep + * @delta_next: duration until the next event if the tick cannot be stopped + * + * Called from power state control code with interrupts disabled + */ +ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + int cpu = smp_processor_id(); + /* + * The idle entry time is expected to be a sufficient approximation of + * the current time at this point. + */ + ktime_t now = ts->idle_entrytime; + ktime_t next_event; + + WARN_ON_ONCE(!ts->inidle); + + *delta_next = ktime_sub(dev->next_event, now); + + if (!can_stop_idle_tick(cpu, ts)) + return *delta_next; + + next_event = tick_nohz_next_event(ts, cpu); + if (!next_event) + return *delta_next; + + /* + * If the next highres timer to expire is earlier than next_event, the + * idle governor needs to know that. + */ + next_event = min_t(u64, next_event, + hrtimer_next_event_without(&ts->sched_timer)); + + return ktime_sub(next_event, now); +} + +/** + * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value + * for a particular CPU. + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls_cpu(int cpu) +{ + struct tick_sched *ts = tick_get_tick_sched(cpu); + + return ts->idle_calls; +} + +/** + * tick_nohz_get_idle_calls - return the current idle calls counter value + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->idle_calls; +} + +static void tick_nohz_account_idle_ticks(struct tick_sched *ts) +{ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + unsigned long ticks; + + if (vtime_accounting_cpu_enabled()) + return; + /* + * We stopped the tick in idle. Update process times would miss the + * time we slept as update_process_times does only a 1 tick + * accounting. Enforce that this is accounted to idle ! + */ + ticks = jiffies - ts->idle_jiffies; + /* + * We might be one off. Do not randomly account a huge number of ticks! + */ + if (ticks && ticks < LONG_MAX) + account_idle_ticks(ticks); +#endif +} + +static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now) +{ + tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_ticks(ts); +} + +void tick_nohz_idle_restart_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->tick_stopped) + __tick_nohz_idle_restart_tick(ts, ktime_get()); +} + +/** + * tick_nohz_idle_exit - restart the idle tick from the idle task + * + * Restart the idle tick when the CPU is woken up from idle + * This also exit the RCU extended quiescent state. The CPU + * can use RCU again after this function is called. + */ +void tick_nohz_idle_exit(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + bool idle_active, tick_stopped; + ktime_t now; + + local_irq_disable(); + + WARN_ON_ONCE(!ts->inidle); + WARN_ON_ONCE(ts->timer_expires_base); + + ts->inidle = 0; + idle_active = ts->idle_active; + tick_stopped = ts->tick_stopped; + + if (idle_active || tick_stopped) + now = ktime_get(); + + if (idle_active) + tick_nohz_stop_idle(ts, now); + + if (tick_stopped) + __tick_nohz_idle_restart_tick(ts, now); + + local_irq_enable(); +} + +/* + * The nohz low res interrupt handler + */ +static void tick_nohz_handler(struct clock_event_device *dev) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + + dev->next_event = KTIME_MAX; + + tick_sched_do_timer(ts, now); + tick_sched_handle(ts, regs); + + /* No need to reprogram if we are running tickless */ + if (unlikely(ts->tick_stopped)) + return; + + hrtimer_forward(&ts->sched_timer, now, tick_period); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); +} + +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) +{ + if (!tick_nohz_enabled) + return; + ts->nohz_mode = mode; + /* One update is enough */ + if (!test_and_set_bit(0, &tick_nohz_active)) + timers_update_nohz(); +} + +/** + * tick_nohz_switch_to_nohz - switch to nohz mode + */ +static void tick_nohz_switch_to_nohz(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + ktime_t next; + + if (!tick_nohz_enabled) + return; + + if (tick_switch_to_oneshot(tick_nohz_handler)) + return; + + /* + * Recycle the hrtimer in ts, so we can share the + * hrtimer_forward with the highres code. + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + /* Get the next period */ + next = tick_init_jiffy_update(); + + hrtimer_set_expires(&ts->sched_timer, next); + hrtimer_forward_now(&ts->sched_timer, tick_period); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + tick_nohz_activate(ts, NOHZ_MODE_LOWRES); +} + +static inline void tick_nohz_irq_enter(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + ktime_t now; + + if (!ts->idle_active && !ts->tick_stopped) + return; + now = ktime_get(); + if (ts->idle_active) + tick_nohz_stop_idle(ts, now); + if (ts->tick_stopped) + tick_nohz_update_jiffies(now); +} + +#else + +static inline void tick_nohz_switch_to_nohz(void) { } +static inline void tick_nohz_irq_enter(void) { } +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } + +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * Called from irq_enter to notify about the possible interruption of idle() + */ +void tick_irq_enter(void) +{ + tick_check_oneshot_broadcast_this_cpu(); + tick_nohz_irq_enter(); +} + +/* + * High resolution timer specific code + */ +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * We rearm the timer until we get disabled by the idle code. + * Called with interrupts disabled. + */ +static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) +{ + struct tick_sched *ts = + container_of(timer, struct tick_sched, sched_timer); + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + + tick_sched_do_timer(ts, now); + + /* + * Do not call, when we are not in irq context and have + * no valid regs pointer + */ + if (regs) + tick_sched_handle(ts, regs); + else + ts->next_tick = 0; + + /* No need to reprogram if we are in idle or full dynticks mode */ + if (unlikely(ts->tick_stopped)) + return HRTIMER_NORESTART; + + hrtimer_forward(timer, now, tick_period); + + return HRTIMER_RESTART; +} + +static int sched_skew_tick; + +static int __init skew_tick(char *str) +{ + get_option(&str, &sched_skew_tick); + + return 0; +} +early_param("skew_tick", skew_tick); + +/** + * tick_setup_sched_timer - setup the tick emulation timer + */ +void tick_setup_sched_timer(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + ktime_t now = ktime_get(); + + /* + * Emulate tick processing via per-CPU hrtimers: + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + ts->sched_timer.function = tick_sched_timer; + + /* Get the next period (per-CPU) */ + hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); + + /* Offset the tick to avert jiffies_lock contention. */ + if (sched_skew_tick) { + u64 offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, num_possible_cpus()); + offset *= smp_processor_id(); + hrtimer_add_expires_ns(&ts->sched_timer, offset); + } + + hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); + tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); +} +#endif /* HIGH_RES_TIMERS */ + +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS +void tick_cancel_sched_timer(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + +# ifdef CONFIG_HIGH_RES_TIMERS + if (ts->sched_timer.base) + hrtimer_cancel(&ts->sched_timer); +# endif + + memset(ts, 0, sizeof(*ts)); +} +#endif + +/** + * Async notification about clocksource changes + */ +void tick_clock_notify(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); +} + +/* + * Async notification about clock event changes + */ +void tick_oneshot_notify(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + set_bit(0, &ts->check_clocks); +} + +/** + * Check, if a change happened, which makes oneshot possible. + * + * Called cyclic from the hrtimer softirq (driven by the timer + * softirq) allow_nohz signals, that we can switch into low-res nohz + * mode, because high resolution timers are disabled (either compile + * or runtime). Called with interrupts disabled. + */ +int tick_check_oneshot_change(int allow_nohz) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (!test_and_clear_bit(0, &ts->check_clocks)) + return 0; + + if (ts->nohz_mode != NOHZ_MODE_INACTIVE) + return 0; + + if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) + return 0; + + if (!allow_nohz) + return 1; + + tick_nohz_switch_to_nohz(); + return 0; +} diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h new file mode 100644 index 000000000..6de959a85 --- /dev/null +++ b/kernel/time/tick-sched.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TICK_SCHED_H +#define _TICK_SCHED_H + +#include <linux/hrtimer.h> + +enum tick_device_mode { + TICKDEV_MODE_PERIODIC, + TICKDEV_MODE_ONESHOT, +}; + +struct tick_device { + struct clock_event_device *evtdev; + enum tick_device_mode mode; +}; + +enum tick_nohz_mode { + NOHZ_MODE_INACTIVE, + NOHZ_MODE_LOWRES, + NOHZ_MODE_HIGHRES, +}; + +/** + * struct tick_sched - sched tick emulation and no idle tick control/stats + * @sched_timer: hrtimer to schedule the periodic tick in high + * resolution mode + * @last_tick: Store the last tick expiry time when the tick + * timer is modified for nohz sleeps. This is necessary + * to resume the tick timer operation in the timeline + * when the CPU returns from nohz sleep. + * @next_tick: Next tick to be fired when in dynticks mode. + * @tick_stopped: Indicator that the idle tick has been stopped + * @idle_jiffies: jiffies at the entry to idle for idle time accounting + * @idle_calls: Total number of idle calls + * @idle_sleeps: Number of idle calls, where the sched tick was stopped + * @idle_entrytime: Time when the idle call was entered + * @idle_waketime: Time when the idle was interrupted + * @idle_exittime: Time when the idle state was left + * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped + * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding + * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) + * @timer_expires_base: Base time clock monotonic for @timer_expires + * @do_timer_lst: CPU was the last one doing do_timer before going idle + * @got_idle_tick: Tick timer function has run with @inidle set + */ +struct tick_sched { + struct hrtimer sched_timer; + unsigned long check_clocks; + enum tick_nohz_mode nohz_mode; + + unsigned int inidle : 1; + unsigned int tick_stopped : 1; + unsigned int idle_active : 1; + unsigned int do_timer_last : 1; + unsigned int got_idle_tick : 1; + + ktime_t last_tick; + ktime_t next_tick; + unsigned long idle_jiffies; + unsigned long idle_calls; + unsigned long idle_sleeps; + ktime_t idle_entrytime; + ktime_t idle_waketime; + ktime_t idle_exittime; + ktime_t idle_sleeptime; + ktime_t iowait_sleeptime; + unsigned long last_jiffies; + u64 timer_expires; + u64 timer_expires_base; + u64 next_timer; + ktime_t idle_expires; + atomic_t tick_dep_mask; +}; + +extern struct tick_sched *tick_get_tick_sched(int cpu); + +extern void tick_setup_sched_timer(void); +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS +extern void tick_cancel_sched_timer(int cpu); +#else +static inline void tick_cancel_sched_timer(int cpu) { } +#endif + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int __tick_broadcast_oneshot_control(enum tick_broadcast_state state); +#else +static inline int +__tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + return -EBUSY; +} +#endif + +#endif diff --git a/kernel/time/time.c b/kernel/time/time.c new file mode 100644 index 000000000..f7d4fa5dd --- /dev/null +++ b/kernel/time/time.c @@ -0,0 +1,983 @@ +/* + * linux/kernel/time.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file contains the interface functions for the various + * time related system calls: time, stime, gettimeofday, settimeofday, + * adjtime + */ +/* + * Modification history kernel/time.c + * + * 1993-09-02 Philip Gladstone + * Created file with time related functions from sched/core.c and adjtimex() + * 1993-10-08 Torsten Duwe + * adjtime interface update and CMOS clock write code + * 1995-08-13 Torsten Duwe + * kernel PLL updated to 1994-12-13 specs (rfc-1589) + * 1999-01-16 Ulrich Windl + * Introduced error checking for many cases in adjtimex(). + * Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) + * (Even though the technical memorandum forbids it) + * 2004-07-14 Christoph Lameter + * Added getnstimeofday to allow the posix timer functions to return + * with nanosecond accuracy + */ + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/timex.h> +#include <linux/capability.h> +#include <linux/timekeeper_internal.h> +#include <linux/errno.h> +#include <linux/syscalls.h> +#include <linux/security.h> +#include <linux/fs.h> +#include <linux/math64.h> +#include <linux/ptrace.h> + +#include <linux/uaccess.h> +#include <linux/compat.h> +#include <asm/unistd.h> + +#include <generated/timeconst.h> +#include "timekeeping.h" + +/* + * The timezone where the local system is located. Used as a default by some + * programs who obtain this value by using gettimeofday. + */ +struct timezone sys_tz; + +EXPORT_SYMBOL(sys_tz); + +#ifdef __ARCH_WANT_SYS_TIME + +/* + * sys_time() can be implemented in user-level using + * sys_gettimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ +SYSCALL_DEFINE1(time, time_t __user *, tloc) +{ + time_t i = (time_t)ktime_get_real_seconds(); + + if (tloc) { + if (put_user(i,tloc)) + return -EFAULT; + } + force_successful_syscall_return(); + return i; +} + +/* + * sys_stime() can be implemented in user-level using + * sys_settimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ + +SYSCALL_DEFINE1(stime, time_t __user *, tptr) +{ + struct timespec64 tv; + int err; + + if (get_user(tv.tv_sec, tptr)) + return -EFAULT; + + tv.tv_nsec = 0; + + err = security_settime64(&tv, NULL); + if (err) + return err; + + do_settimeofday64(&tv); + return 0; +} + +#endif /* __ARCH_WANT_SYS_TIME */ + +#ifdef CONFIG_COMPAT +#ifdef __ARCH_WANT_COMPAT_SYS_TIME + +/* compat_time_t is a 32 bit "long" and needs to get converted. */ +COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) +{ + compat_time_t i; + + i = (compat_time_t)ktime_get_real_seconds(); + + if (tloc) { + if (put_user(i,tloc)) + return -EFAULT; + } + force_successful_syscall_return(); + return i; +} + +COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) +{ + struct timespec64 tv; + int err; + + if (get_user(tv.tv_sec, tptr)) + return -EFAULT; + + tv.tv_nsec = 0; + + err = security_settime64(&tv, NULL); + if (err) + return err; + + do_settimeofday64(&tv); + return 0; +} + +#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ +#endif + +SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) +{ + if (likely(tv != NULL)) { + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + if (put_user(ts.tv_sec, &tv->tv_sec) || + put_user(ts.tv_nsec / 1000, &tv->tv_usec)) + return -EFAULT; + } + if (unlikely(tz != NULL)) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + return 0; +} + +/* + * In case for some reason the CMOS clock has not already been running + * in UTC, but in some local time: The first time we set the timezone, + * we will warp the clock so that it is ticking UTC time instead of + * local time. Presumably, if someone is setting the timezone then we + * are running in an environment where the programs understand about + * timezones. This should be done at boot time in the /etc/rc script, + * as soon as possible, so that the clock can be set right. Otherwise, + * various programs will get confused when the clock gets warped. + */ + +int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz) +{ + static int firsttime = 1; + int error = 0; + + if (tv && !timespec64_valid_settod(tv)) + return -EINVAL; + + error = security_settime64(tv, tz); + if (error) + return error; + + if (tz) { + /* Verify we're witin the +-15 hrs range */ + if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) + return -EINVAL; + + sys_tz = *tz; + update_vsyscall_tz(); + if (firsttime) { + firsttime = 0; + if (!tv) + timekeeping_warp_clock(); + } + } + if (tv) + return do_settimeofday64(tv); + return 0; +} + +SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) +{ + struct timespec64 new_ts; + struct timeval user_tv; + struct timezone new_tz; + + if (tv) { + if (copy_from_user(&user_tv, tv, sizeof(*tv))) + return -EFAULT; + + if (!timeval_valid(&user_tv)) + return -EINVAL; + + new_ts.tv_sec = user_tv.tv_sec; + new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + } + if (tz) { + if (copy_from_user(&new_tz, tz, sizeof(*tz))) + return -EFAULT; + } + + return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) +{ + if (tv) { + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + if (put_user(ts.tv_sec, &tv->tv_sec) || + put_user(ts.tv_nsec / 1000, &tv->tv_usec)) + return -EFAULT; + } + if (tz) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + + return 0; +} + +COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) +{ + struct timespec64 new_ts; + struct timeval user_tv; + struct timezone new_tz; + + if (tv) { + if (compat_get_timeval(&user_tv, tv)) + return -EFAULT; + new_ts.tv_sec = user_tv.tv_sec; + new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + } + if (tz) { + if (copy_from_user(&new_tz, tz, sizeof(*tz))) + return -EFAULT; + } + + return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); +} +#endif + +SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) +{ + struct timex txc; /* Local copy of parameter */ + int ret; + + /* Copy the user data space into the kernel copy + * structure. But bear in mind that the structures + * may change + */ + if (copy_from_user(&txc, txc_p, sizeof(struct timex))) + return -EFAULT; + ret = do_adjtimex(&txc); + return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; +} + +#ifdef CONFIG_COMPAT + +COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) +{ + struct timex txc; + int err, ret; + + err = compat_get_timex(&txc, utp); + if (err) + return err; + + ret = do_adjtimex(&txc); + + err = compat_put_timex(utp, &txc); + if (err) + return err; + + return ret; +} +#endif + +/* + * Convert jiffies to milliseconds and back. + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else +# if BITS_PER_LONG == 32 + return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >> + HZ_TO_MSEC_SHR32; +# else + return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); +# endif +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int jiffies_to_usecs(const unsigned long j) +{ + /* + * Hz usually doesn't go much further MSEC_PER_SEC. + * jiffies_to_usecs() and usecs_to_jiffies() depend on that. + */ + BUILD_BUG_ON(HZ > USEC_PER_SEC); + +#if !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#else +# if BITS_PER_LONG == 32 + return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; +# else + return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; +# endif +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); + +/** + * timespec_trunc - Truncate timespec to a granularity + * @t: Timespec + * @gran: Granularity in ns. + * + * Truncate a timespec to a granularity. Always rounds down. gran must + * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). + */ +struct timespec timespec_trunc(struct timespec t, unsigned gran) +{ + /* Avoid division in the common cases 1 ns and 1 s. */ + if (gran == 1) { + /* nothing */ + } else if (gran == NSEC_PER_SEC) { + t.tv_nsec = 0; + } else if (gran > 1 && gran < NSEC_PER_SEC) { + t.tv_nsec -= t.tv_nsec % gran; + } else { + WARN(1, "illegal file time granularity: %u", gran); + } + return t; +} +EXPORT_SYMBOL(timespec_trunc); + +/* + * mktime64 - Converts date to seconds. + * Converts Gregorian date to seconds since 1970-01-01 00:00:00. + * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 + * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. + * + * [For the Julian calendar (which was used in Russia before 1917, + * Britain & colonies before 1752, anywhere else before 1582, + * and is still in use by some communities) leave out the + * -year/100+year/400 terms, and add 10.] + * + * This algorithm was first published by Gauss (I think). + * + * A leap second can be indicated by calling this function with sec as + * 60 (allowable under ISO 8601). The leap second is treated the same + * as the following second since they don't exist in UNIX time. + * + * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight + * tomorrow - (allowable under ISO 8601) is supported. + */ +time64_t mktime64(const unsigned int year0, const unsigned int mon0, + const unsigned int day, const unsigned int hour, + const unsigned int min, const unsigned int sec) +{ + unsigned int mon = mon0, year = year0; + + /* 1..12 -> 11,12,1..10 */ + if (0 >= (int) (mon -= 2)) { + mon += 12; /* Puts Feb last since it has leap day */ + year -= 1; + } + + return ((((time64_t) + (year/4 - year/100 + year/400 + 367*mon/12 + day) + + year*365 - 719499 + )*24 + hour /* now have hours - midnight tomorrow handled here */ + )*60 + min /* now have minutes */ + )*60 + sec; /* finally seconds */ +} +EXPORT_SYMBOL(mktime64); + +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! + */ +void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec) +{ + while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + asm("" : "+rm"(nsec)); + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} +EXPORT_SYMBOL(set_normalized_timespec); + +/** + * ns_to_timespec - Convert nanoseconds to timespec + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec representation of the nsec parameter. + */ +struct timespec ns_to_timespec(const s64 nsec) +{ + struct timespec ts; + s32 rem; + + if (!nsec) + return (struct timespec) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} +EXPORT_SYMBOL(ns_to_timespec); + +/** + * ns_to_timeval - Convert nanoseconds to timeval + * @nsec: the nanoseconds value to be converted + * + * Returns the timeval representation of the nsec parameter. + */ +struct timeval ns_to_timeval(const s64 nsec) +{ + struct timespec ts = ns_to_timespec(nsec); + struct timeval tv; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; + + return tv; +} +EXPORT_SYMBOL(ns_to_timeval); + +struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) +{ + struct timespec64 ts = ns_to_timespec64(nsec); + struct __kernel_old_timeval tv; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = (suseconds_t)ts.tv_nsec / 1000; + + return tv; +} +EXPORT_SYMBOL(ns_to_kernel_old_timeval); + +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! + */ +void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) +{ + while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + asm("" : "+rm"(nsec)); + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} +EXPORT_SYMBOL(set_normalized_timespec64); + +/** + * ns_to_timespec64 - Convert nanoseconds to timespec64 + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec64 representation of the nsec parameter. + */ +struct timespec64 ns_to_timespec64(const s64 nsec) +{ + struct timespec64 ts; + s32 rem; + + if (!nsec) + return (struct timespec64) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} +EXPORT_SYMBOL(ns_to_timespec64); + +/** + * msecs_to_jiffies: - convert milliseconds to jiffies + * @m: time in milliseconds + * + * conversion is done as follows: + * + * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor and + * handling any 32-bit overflows. + * for the details see __msecs_to_jiffies() + * + * msecs_to_jiffies() checks for the passed in value being a constant + * via __builtin_constant_p() allowing gcc to eliminate most of the + * code, __msecs_to_jiffies() is called if the value passed does not + * allow constant folding and the actual conversion must be done at + * runtime. + * the _msecs_to_jiffies helpers are the HZ dependent conversion + * routines found in include/linux/jiffies.h + */ +unsigned long __msecs_to_jiffies(const unsigned int m) +{ + /* + * Negative value, means infinite timeout: + */ + if ((int)m < 0) + return MAX_JIFFY_OFFSET; + return _msecs_to_jiffies(m); +} +EXPORT_SYMBOL(__msecs_to_jiffies); + +unsigned long __usecs_to_jiffies(const unsigned int u) +{ + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + return _usecs_to_jiffies(u); +} +EXPORT_SYMBOL(__usecs_to_jiffies); + +/* + * The TICK_NSEC - 1 rounds up the value to the next resolution. Note + * that a remainder subtract here would not do the right thing as the + * resolution values don't fall on second boundries. I.e. the line: + * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * Note that due to the small error in the multiplier here, this + * rounding is incorrect for sufficiently large values of tv_nsec, but + * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're + * OK. + * + * Rather, we just shift the bits off the right. + * + * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec + * value to a scaled second value. + */ +static unsigned long +__timespec64_to_jiffies(u64 sec, long nsec) +{ + nsec = nsec + TICK_NSEC - 1; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + nsec = 0; + } + return ((sec * SEC_CONVERSION) + + (((u64)nsec * NSEC_CONVERSION) >> + (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + +} + +static unsigned long +__timespec_to_jiffies(unsigned long sec, long nsec) +{ + return __timespec64_to_jiffies((u64)sec, nsec); +} + +unsigned long +timespec64_to_jiffies(const struct timespec64 *value) +{ + return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec); +} +EXPORT_SYMBOL(timespec64_to_jiffies); + +void +jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u32 rem; + value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_nsec = rem; +} +EXPORT_SYMBOL(jiffies_to_timespec64); + +/* + * We could use a similar algorithm to timespec_to_jiffies (with a + * different multiplier for usec instead of nsec). But this has a + * problem with rounding: we can't exactly add TICK_NSEC - 1 to the + * usec value, since it's not necessarily integral. + * + * We could instead round in the intermediate scaled representation + * (i.e. in units of 1/2^(large scale) jiffies) but that's also + * perilous: the scaling introduces a small positive error, which + * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1 + * units to the intermediate before shifting) leads to accidental + * overflow and overestimates. + * + * At the cost of one additional multiplication by a constant, just + * use the timespec implementation. + */ +unsigned long +timeval_to_jiffies(const struct timeval *value) +{ + return __timespec_to_jiffies(value->tv_sec, + value->tv_usec * NSEC_PER_USEC); +} +EXPORT_SYMBOL(timeval_to_jiffies); + +void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u32 rem; + + value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_usec = rem / NSEC_PER_USEC; +} +EXPORT_SYMBOL(jiffies_to_timeval); + +/* + * Convert jiffies/jiffies_64 to clock_t and back. + */ +clock_t jiffies_to_clock_t(unsigned long x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + return x * (USER_HZ / HZ); +# else + return x / (HZ / USER_HZ); +# endif +#else + return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); +#endif +} +EXPORT_SYMBOL(jiffies_to_clock_t); + +unsigned long clock_t_to_jiffies(unsigned long x) +{ +#if (HZ % USER_HZ)==0 + if (x >= ~0UL / (HZ / USER_HZ)) + return ~0UL; + return x * (HZ / USER_HZ); +#else + /* Don't worry about loss of precision here .. */ + if (x >= ~0UL / HZ * USER_HZ) + return ~0UL; + + /* .. but do try to contain it here */ + return div_u64((u64)x * HZ, USER_HZ); +#endif +} +EXPORT_SYMBOL(clock_t_to_jiffies); + +u64 jiffies_64_to_clock_t(u64 x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + x = div_u64(x * USER_HZ, HZ); +# elif HZ > USER_HZ + x = div_u64(x, HZ / USER_HZ); +# else + /* Nothing to do */ +# endif +#else + /* + * There are better ways that don't overflow early, + * but even this doesn't overflow in hundreds of years + * in 64 bits, so.. + */ + x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); +#endif + return x; +} +EXPORT_SYMBOL(jiffies_64_to_clock_t); + +u64 nsec_to_clock_t(u64 x) +{ +#if (NSEC_PER_SEC % USER_HZ) == 0 + return div_u64(x, NSEC_PER_SEC / USER_HZ); +#elif (USER_HZ % 512) == 0 + return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, + * overflow after 64.99 years. + * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... + */ + return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); +#endif +} + +u64 jiffies64_to_nsecs(u64 j) +{ +#if !(NSEC_PER_SEC % HZ) + return (NSEC_PER_SEC / HZ) * j; +# else + return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN); +#endif +} +EXPORT_SYMBOL(jiffies64_to_nsecs); + +/** + * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +u64 nsecs_to_jiffies64(u64 n) +{ +#if (NSEC_PER_SEC % HZ) == 0 + /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ + return div_u64(n, NSEC_PER_SEC / HZ); +#elif (HZ % 512) == 0 + /* overflow after 292 years if HZ = 1024 */ + return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * Generic case - optimized for cases where HZ is a multiple of 3. + * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. + */ + return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); +#endif +} +EXPORT_SYMBOL(nsecs_to_jiffies64); + +/** + * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +unsigned long nsecs_to_jiffies(u64 n) +{ + return (unsigned long)nsecs_to_jiffies64(n); +} +EXPORT_SYMBOL_GPL(nsecs_to_jiffies); + +/* + * Add two timespec64 values and do a safety check for overflow. + * It's assumed that both values are valid (>= 0). + * And, each timespec64 is in normalized form. + */ +struct timespec64 timespec64_add_safe(const struct timespec64 lhs, + const struct timespec64 rhs) +{ + struct timespec64 res; + + set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { + res.tv_sec = TIME64_MAX; + res.tv_nsec = 0; + } + + return res; +} + +int get_timespec64(struct timespec64 *ts, + const struct __kernel_timespec __user *uts) +{ + struct __kernel_timespec kts; + int ret; + + ret = copy_from_user(&kts, uts, sizeof(kts)); + if (ret) + return -EFAULT; + + ts->tv_sec = kts.tv_sec; + + /* Zero out the padding for 32 bit systems or in compat mode */ + if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())) + kts.tv_nsec &= 0xFFFFFFFFUL; + + ts->tv_nsec = kts.tv_nsec; + + return 0; +} +EXPORT_SYMBOL_GPL(get_timespec64); + +int put_timespec64(const struct timespec64 *ts, + struct __kernel_timespec __user *uts) +{ + struct __kernel_timespec kts = { + .tv_sec = ts->tv_sec, + .tv_nsec = ts->tv_nsec + }; + + return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(put_timespec64); + +int __compat_get_timespec64(struct timespec64 *ts64, + const struct compat_timespec __user *cts) +{ + struct compat_timespec ts; + int ret; + + ret = copy_from_user(&ts, cts, sizeof(ts)); + if (ret) + return -EFAULT; + + ts64->tv_sec = ts.tv_sec; + ts64->tv_nsec = ts.tv_nsec; + + return 0; +} + +int __compat_put_timespec64(const struct timespec64 *ts64, + struct compat_timespec __user *cts) +{ + struct compat_timespec ts = { + .tv_sec = ts64->tv_sec, + .tv_nsec = ts64->tv_nsec + }; + return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; +} + +int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; + else + return __compat_get_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_get_timespec64); + +int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; + else + return __compat_put_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_put_timespec64); + +int get_itimerspec64(struct itimerspec64 *it, + const struct __kernel_itimerspec __user *uit) +{ + int ret; + + ret = get_timespec64(&it->it_interval, &uit->it_interval); + if (ret) + return ret; + + ret = get_timespec64(&it->it_value, &uit->it_value); + + return ret; +} +EXPORT_SYMBOL_GPL(get_itimerspec64); + +int put_itimerspec64(const struct itimerspec64 *it, + struct __kernel_itimerspec __user *uit) +{ + int ret; + + ret = put_timespec64(&it->it_interval, &uit->it_interval); + if (ret) + return ret; + + ret = put_timespec64(&it->it_value, &uit->it_value); + + return ret; +} +EXPORT_SYMBOL_GPL(put_itimerspec64); + +int get_compat_itimerspec64(struct itimerspec64 *its, + const struct compat_itimerspec __user *uits) +{ + + if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) || + __compat_get_timespec64(&its->it_value, &uits->it_value)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(get_compat_itimerspec64); + +int put_compat_itimerspec64(const struct itimerspec64 *its, + struct compat_itimerspec __user *uits) +{ + if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) || + __compat_put_timespec64(&its->it_value, &uits->it_value)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(put_compat_itimerspec64); diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc new file mode 100644 index 000000000..f83bbb816 --- /dev/null +++ b/kernel/time/timeconst.bc @@ -0,0 +1,115 @@ +scale=0 + +define gcd(a,b) { + auto t; + while (b) { + t = b; + b = a % b; + a = t; + } + return a; +} + +/* Division by reciprocal multiplication. */ +define fmul(b,n,d) { + return (2^b*n+d-1)/d; +} + +/* Adjustment factor when a ceiling value is used. Use as: + (imul * n) + (fmulxx * n + fadjxx) >> xx) */ +define fadj(b,n,d) { + auto v; + d = d/gcd(n,d); + v = 2^b*(d-1)/d; + return v; +} + +/* Compute the appropriate mul/adj values as well as a shift count, + which brings the mul value into the range 2^b-1 <= x < 2^b. Such + a shift value will be correct in the signed integer range and off + by at most one in the upper half of the unsigned range. */ +define fmuls(b,n,d) { + auto s, m; + for (s = 0; 1; s++) { + m = fmul(s,n,d); + if (m >= 2^(b-1)) + return s; + } + return 0; +} + +define timeconst(hz) { + print "/* Automatically generated by kernel/time/timeconst.bc */\n" + print "/* Time conversion constants for HZ == ", hz, " */\n" + print "\n" + + print "#ifndef KERNEL_TIMECONST_H\n" + print "#define KERNEL_TIMECONST_H\n\n" + + print "#include <linux/param.h>\n" + print "#include <linux/types.h>\n\n" + + print "#if HZ != ", hz, "\n" + print "#error \qinclude/generated/timeconst.h has the wrong HZ value!\q\n" + print "#endif\n\n" + + if (hz < 2) { + print "#error Totally bogus HZ value!\n" + } else { + s=fmuls(32,1000,hz) + obase=16 + print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n" + print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n" + obase=10 + print "#define HZ_TO_MSEC_SHR32\t", s, "\n" + + s=fmuls(32,hz,1000) + obase=16 + print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n" + print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n" + obase=10 + print "#define MSEC_TO_HZ_SHR32\t", s, "\n" + + obase=10 + cd=gcd(hz,1000) + print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n" + print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n" + print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n" + print "\n" + + s=fmuls(32,1000000,hz) + obase=16 + print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n" + print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n" + obase=10 + print "#define HZ_TO_USEC_SHR32\t", s, "\n" + + s=fmuls(32,hz,1000000) + obase=16 + print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n" + print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n" + obase=10 + print "#define USEC_TO_HZ_SHR32\t", s, "\n" + + obase=10 + cd=gcd(hz,1000000) + print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n" + print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n" + print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n" + + cd=gcd(hz,1000000000) + print "#define HZ_TO_NSEC_NUM\t\t", 1000000000/cd, "\n" + print "#define HZ_TO_NSEC_DEN\t\t", hz/cd, "\n" + print "#define NSEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define NSEC_TO_HZ_DEN\t\t", 1000000000/cd, "\n" + print "\n" + + print "#endif /* KERNEL_TIMECONST_H */\n" + } + halt +} + +hz = read(); +timeconst(hz) diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c new file mode 100644 index 000000000..7142580ad --- /dev/null +++ b/kernel/time/timeconv.c @@ -0,0 +1,128 @@ +/* + * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + * This file is part of the GNU C Library. + * Contributed by Paul Eggert (eggert@twinsun.com). + * + * The GNU C Library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * The GNU C Library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the GNU C Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Converts the calendar time to broken-down time representation + * Based on code from glibc-2.6 + * + * 2009-7-14: + * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com> + */ + +#include <linux/time.h> +#include <linux/module.h> + +/* + * Nonzero if YEAR is a leap year (every 4 years, + * except every 100th isn't, and every 400th is). + */ +static int __isleap(long year) +{ + return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0); +} + +/* do a mathdiv for long type */ +static long math_div(long a, long b) +{ + return a / b - (a % b < 0); +} + +/* How many leap years between y1 and y2, y1 must less or equal to y2 */ +static long leaps_between(long y1, long y2) +{ + long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100) + + math_div(y1 - 1, 400); + long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100) + + math_div(y2 - 1, 400); + return leaps2 - leaps1; +} + +/* How many days come before each month (0-12). */ +static const unsigned short __mon_yday[2][13] = { + /* Normal years. */ + {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, + /* Leap years. */ + {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} +}; + +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) + +/** + * time64_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset offset seconds adding to totalsecs. + * @result pointer to struct tm variable to receive broken-down time + */ +void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) +{ + long days, rem, y; + int remainder; + const unsigned short *ip; + + days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder); + rem = remainder; + rem += offset; + while (rem < 0) { + rem += SECS_PER_DAY; + --days; + } + while (rem >= SECS_PER_DAY) { + rem -= SECS_PER_DAY; + ++days; + } + + result->tm_hour = rem / SECS_PER_HOUR; + rem %= SECS_PER_HOUR; + result->tm_min = rem / 60; + result->tm_sec = rem % 60; + + /* January 1, 1970 was a Thursday. */ + result->tm_wday = (4 + days) % 7; + if (result->tm_wday < 0) + result->tm_wday += 7; + + y = 1970; + + while (days < 0 || days >= (__isleap(y) ? 366 : 365)) { + /* Guess a corrected year, assuming 365 days per year. */ + long yg = y + math_div(days, 365); + + /* Adjust DAYS and Y to match the guessed year. */ + days -= (yg - y) * 365 + leaps_between(y, yg); + y = yg; + } + + result->tm_year = y - 1900; + + result->tm_yday = days; + + ip = __mon_yday[__isleap(y)]; + for (y = 11; days < ip[y]; y--) + continue; + days -= ip[y]; + + result->tm_mon = y; + result->tm_mday = days + 1; +} +EXPORT_SYMBOL(time64_to_tm); diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c new file mode 100644 index 000000000..8afd78932 --- /dev/null +++ b/kernel/time/timecounter.c @@ -0,0 +1,112 @@ +/* + * linux/kernel/time/timecounter.c + * + * based on code that migrated away from + * linux/kernel/time/clocksource.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/export.h> +#include <linux/timecounter.h> + +void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp) +{ + tc->cc = cc; + tc->cycle_last = cc->read(cc); + tc->nsec = start_tstamp; + tc->mask = (1ULL << cc->shift) - 1; + tc->frac = 0; +} +EXPORT_SYMBOL_GPL(timecounter_init); + +/** + * timecounter_read_delta - get nanoseconds since last call of this function + * @tc: Pointer to time counter + * + * When the underlying cycle counter runs over, this will be handled + * correctly as long as it does not run over more than once between + * calls. + * + * The first call to this function for a new time counter initializes + * the time tracking and returns an undefined result. + */ +static u64 timecounter_read_delta(struct timecounter *tc) +{ + u64 cycle_now, cycle_delta; + u64 ns_offset; + + /* read cycle counter: */ + cycle_now = tc->cc->read(tc->cc); + + /* calculate the delta since the last timecounter_read_delta(): */ + cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; + + /* convert to nanoseconds: */ + ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta, + tc->mask, &tc->frac); + + /* update time stamp of timecounter_read_delta() call: */ + tc->cycle_last = cycle_now; + + return ns_offset; +} + +u64 timecounter_read(struct timecounter *tc) +{ + u64 nsec; + + /* increment time by nanoseconds since last call */ + nsec = timecounter_read_delta(tc); + nsec += tc->nsec; + tc->nsec = nsec; + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_read); + +/* + * This is like cyclecounter_cyc2ns(), but it is used for computing a + * time previous to the time stored in the cycle counter. + */ +static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc, + u64 cycles, u64 mask, u64 frac) +{ + u64 ns = (u64) cycles; + + ns = ((ns * cc->mult) - frac) >> cc->shift; + + return ns; +} + +u64 timecounter_cyc2time(struct timecounter *tc, + u64 cycle_tstamp) +{ + u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; + u64 nsec = tc->nsec, frac = tc->frac; + + /* + * Instead of always treating cycle_tstamp as more recent + * than tc->cycle_last, detect when it is too far in the + * future and treat it as old time stamp instead. + */ + if (delta > tc->cc->mask / 2) { + delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; + nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac); + } else { + nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac); + } + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_cyc2time); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c new file mode 100644 index 000000000..087f71183 --- /dev/null +++ b/kernel/time/timekeeping.c @@ -0,0 +1,2414 @@ +/* + * linux/kernel/time/timekeeping.c + * + * Kernel timekeeping code and accessor functions + * + * This code was moved from linux/kernel/timer.c. + * Please see that file for copyright and history logs. + * + */ + +#include <linux/timekeeper_internal.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/nmi.h> +#include <linux/sched.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/clock.h> +#include <linux/syscore_ops.h> +#include <linux/clocksource.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/timex.h> +#include <linux/tick.h> +#include <linux/stop_machine.h> +#include <linux/pvclock_gtod.h> +#include <linux/compiler.h> + +#include "tick-internal.h" +#include "ntp_internal.h" +#include "timekeeping_internal.h" + +#define TK_CLEAR_NTP (1 << 0) +#define TK_MIRROR (1 << 1) +#define TK_CLOCK_WAS_SET (1 << 2) + +enum timekeeping_adv_mode { + /* Update timekeeper when a tick has passed */ + TK_ADV_TICK, + + /* Update timekeeper on a direct frequency change */ + TK_ADV_FREQ +}; + +/* + * The most important data for readout fits into a single 64 byte + * cache line. + */ +static struct { + seqcount_t seq; + struct timekeeper timekeeper; +} tk_core ____cacheline_aligned = { + .seq = SEQCNT_ZERO(tk_core.seq), +}; + +static DEFINE_RAW_SPINLOCK(timekeeper_lock); +static struct timekeeper shadow_timekeeper; + +/** + * struct tk_fast - NMI safe timekeeper + * @seq: Sequence counter for protecting updates. The lowest bit + * is the index for the tk_read_base array + * @base: tk_read_base array. Access is indexed by the lowest bit of + * @seq. + * + * See @update_fast_timekeeper() below. + */ +struct tk_fast { + seqcount_t seq; + struct tk_read_base base[2]; +}; + +/* Suspend-time cycles value for halted fast timekeeper. */ +static u64 cycles_at_suspend; + +static u64 dummy_clock_read(struct clocksource *cs) +{ + return cycles_at_suspend; +} + +static struct clocksource dummy_clock = { + .read = dummy_clock_read, +}; + +static struct tk_fast tk_fast_mono ____cacheline_aligned = { + .base[0] = { .clock = &dummy_clock, }, + .base[1] = { .clock = &dummy_clock, }, +}; + +static struct tk_fast tk_fast_raw ____cacheline_aligned = { + .base[0] = { .clock = &dummy_clock, }, + .base[1] = { .clock = &dummy_clock, }, +}; + +/* flag for if timekeeping is suspended */ +int __read_mostly timekeeping_suspended; + +static inline void tk_normalize_xtime(struct timekeeper *tk) +{ + while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { + tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; + tk->xtime_sec++; + } + while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) { + tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift; + tk->raw_sec++; + } +} + +static inline struct timespec64 tk_xtime(const struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + return ts; +} + +static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) +{ + tk->xtime_sec = ts->tv_sec; + tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; +} + +static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) +{ + tk->xtime_sec += ts->tv_sec; + tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; + tk_normalize_xtime(tk); +} + +static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) +{ + struct timespec64 tmp; + + /* + * Verify consistency of: offset_real = -wall_to_monotonic + * before modifying anything + */ + set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec, + -tk->wall_to_monotonic.tv_nsec); + WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp)); + tk->wall_to_monotonic = wtm; + set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); + tk->offs_real = timespec64_to_ktime(tmp); + tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); +} + +static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) +{ + tk->offs_boot = ktime_add(tk->offs_boot, delta); +} + +/* + * tk_clock_read - atomic clocksource read() helper + * + * This helper is necessary to use in the read paths because, while the + * seqlock ensures we don't return a bad value while structures are updated, + * it doesn't protect from potential crashes. There is the possibility that + * the tkr's clocksource may change between the read reference, and the + * clock reference passed to the read function. This can cause crashes if + * the wrong clocksource is passed to the wrong read function. + * This isn't necessary to use when holding the timekeeper_lock or doing + * a read of the fast-timekeeper tkrs (which is protected by its own locking + * and update logic). + */ +static inline u64 tk_clock_read(const struct tk_read_base *tkr) +{ + struct clocksource *clock = READ_ONCE(tkr->clock); + + return clock->read(clock); +} + +#ifdef CONFIG_DEBUG_TIMEKEEPING +#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ + +static void timekeeping_check_update(struct timekeeper *tk, u64 offset) +{ + + u64 max_cycles = tk->tkr_mono.clock->max_cycles; + const char *name = tk->tkr_mono.clock->name; + + if (offset > max_cycles) { + printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", + offset, name, max_cycles); + printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); + } else { + if (offset > (max_cycles >> 1)) { + printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", + offset, name, max_cycles >> 1); + printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); + } + } + + if (tk->underflow_seen) { + if (jiffies - tk->last_warning > WARNING_FREQ) { + printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); + printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); + printk_deferred(" Your kernel is probably still fine.\n"); + tk->last_warning = jiffies; + } + tk->underflow_seen = 0; + } + + if (tk->overflow_seen) { + if (jiffies - tk->last_warning > WARNING_FREQ) { + printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); + printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); + printk_deferred(" Your kernel is probably still fine.\n"); + tk->last_warning = jiffies; + } + tk->overflow_seen = 0; + } +} + +static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 now, last, mask, max, delta; + unsigned int seq; + + /* + * Since we're called holding a seqlock, the data may shift + * under us while we're doing the calculation. This can cause + * false positives, since we'd note a problem but throw the + * results away. So nest another seqlock here to atomically + * grab the points we are checking with. + */ + do { + seq = read_seqcount_begin(&tk_core.seq); + now = tk_clock_read(tkr); + last = tkr->cycle_last; + mask = tkr->mask; + max = tkr->clock->max_cycles; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + delta = clocksource_delta(now, last, mask); + + /* + * Try to catch underflows by checking if we are seeing small + * mask-relative negative values. + */ + if (unlikely((~delta & mask) < (mask >> 3))) { + tk->underflow_seen = 1; + delta = 0; + } + + /* Cap delta value to the max_cycles values to avoid mult overflows */ + if (unlikely(delta > max)) { + tk->overflow_seen = 1; + delta = tkr->clock->max_cycles; + } + + return delta; +} +#else +static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset) +{ +} +static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) +{ + u64 cycle_now, delta; + + /* read clocksource */ + cycle_now = tk_clock_read(tkr); + + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + + return delta; +} +#endif + +/** + * tk_setup_internals - Set up internals to use clocksource clock. + * + * @tk: The target timekeeper to setup. + * @clock: Pointer to clocksource. + * + * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment + * pair and interval request. + * + * Unless you're the timekeeping code, you should not be using this! + */ +static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) +{ + u64 interval; + u64 tmp, ntpinterval; + struct clocksource *old_clock; + + ++tk->cs_was_changed_seq; + old_clock = tk->tkr_mono.clock; + tk->tkr_mono.clock = clock; + tk->tkr_mono.mask = clock->mask; + tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono); + + tk->tkr_raw.clock = clock; + tk->tkr_raw.mask = clock->mask; + tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; + + /* Do the ns -> cycle conversion first, using original mult */ + tmp = NTP_INTERVAL_LENGTH; + tmp <<= clock->shift; + ntpinterval = tmp; + tmp += clock->mult/2; + do_div(tmp, clock->mult); + if (tmp == 0) + tmp = 1; + + interval = (u64) tmp; + tk->cycle_interval = interval; + + /* Go back from cycles -> shifted ns */ + tk->xtime_interval = interval * clock->mult; + tk->xtime_remainder = ntpinterval - tk->xtime_interval; + tk->raw_interval = interval * clock->mult; + + /* if changing clocks, convert xtime_nsec shift units */ + if (old_clock) { + int shift_change = clock->shift - old_clock->shift; + if (shift_change < 0) { + tk->tkr_mono.xtime_nsec >>= -shift_change; + tk->tkr_raw.xtime_nsec >>= -shift_change; + } else { + tk->tkr_mono.xtime_nsec <<= shift_change; + tk->tkr_raw.xtime_nsec <<= shift_change; + } + } + + tk->tkr_mono.shift = clock->shift; + tk->tkr_raw.shift = clock->shift; + + tk->ntp_error = 0; + tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + tk->ntp_tick = ntpinterval << tk->ntp_error_shift; + + /* + * The timekeeper keeps its own mult values for the currently + * active clocksource. These value will be adjusted via NTP + * to counteract clock drifting. + */ + tk->tkr_mono.mult = clock->mult; + tk->tkr_raw.mult = clock->mult; + tk->ntp_err_mult = 0; + tk->skip_second_overflow = 0; +} + +/* Timekeeper helper functions. */ + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET +static u32 default_arch_gettimeoffset(void) { return 0; } +u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; +#else +static inline u32 arch_gettimeoffset(void) { return 0; } +#endif + +static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta) +{ + u64 nsec; + + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; + + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + arch_gettimeoffset(); +} + +static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) +{ + u64 delta; + + delta = timekeeping_get_delta(tkr); + return timekeeping_delta_to_ns(tkr, delta); +} + +static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) +{ + u64 delta; + + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); +} + +/** + * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. + * @tkr: Timekeeping readout base from which we take the update + * + * We want to use this from any context including NMI and tracing / + * instrumenting the timekeeping code itself. + * + * Employ the latch technique; see @raw_write_seqcount_latch. + * + * So if a NMI hits the update of base[0] then it will use base[1] + * which is still consistent. In the worst case this can result is a + * slightly wrong timestamp (a few nanoseconds). See + * @ktime_get_mono_fast_ns. + */ +static void update_fast_timekeeper(const struct tk_read_base *tkr, + struct tk_fast *tkf) +{ + struct tk_read_base *base = tkf->base; + + /* Force readers off to base[1] */ + raw_write_seqcount_latch(&tkf->seq); + + /* Update base[0] */ + memcpy(base, tkr, sizeof(*base)); + + /* Force readers back to base[0] */ + raw_write_seqcount_latch(&tkf->seq); + + /* Update base[1] */ + memcpy(base + 1, base, sizeof(*base)); +} + +/** + * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic + * + * This timestamp is not guaranteed to be monotonic across an update. + * The timestamp is calculated by: + * + * now = base_mono + clock_delta * slope + * + * So if the update lowers the slope, readers who are forced to the + * not yet updated second array are still using the old steeper slope. + * + * tmono + * ^ + * | o n + * | o n + * | u + * | o + * |o + * |12345678---> reader order + * + * o = old slope + * u = update + * n = new slope + * + * So reader 6 will observe time going backwards versus reader 5. + * + * While other CPUs are likely to be able observe that, the only way + * for a CPU local observation is when an NMI hits in the middle of + * the update. Timestamps taken from that NMI context might be ahead + * of the following timestamps. Callers need to be aware of that and + * deal with it. + */ +static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) +{ + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount_latch(&tkf->seq); + tkr = tkf->base + (seq & 0x01); + now = ktime_to_ns(tkr->base); + + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tk_clock_read(tkr), + tkr->cycle_last, + tkr->mask)); + } while (read_seqcount_retry(&tkf->seq, seq)); + + return now; +} + +u64 ktime_get_mono_fast_ns(void) +{ + return __ktime_get_fast_ns(&tk_fast_mono); +} +EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); + +u64 ktime_get_raw_fast_ns(void) +{ + return __ktime_get_fast_ns(&tk_fast_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); + +/** + * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. + * + * To keep it NMI safe since we're accessing from tracing, we're not using a + * separate timekeeper with updates to monotonic clock and boot offset + * protected with seqlocks. This has the following minor side effects: + * + * (1) Its possible that a timestamp be taken after the boot offset is updated + * but before the timekeeper is updated. If this happens, the new boot offset + * is added to the old timekeeping making the clock appear to update slightly + * earlier: + * CPU 0 CPU 1 + * timekeeping_inject_sleeptime64() + * __timekeeping_inject_sleeptime(tk, delta); + * timestamp(); + * timekeeping_update(tk, TK_CLEAR_NTP...); + * + * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be + * partially updated. Since the tk->offs_boot update is a rare event, this + * should be a rare occurrence which postprocessing should be able to handle. + */ +u64 notrace ktime_get_boot_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); +} +EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + + +/* + * See comment for __ktime_get_fast_ns() vs. timestamp ordering + */ +static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) +{ + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount_latch(&tkf->seq); + tkr = tkf->base + (seq & 0x01); + now = ktime_to_ns(tkr->base_real); + + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tk_clock_read(tkr), + tkr->cycle_last, + tkr->mask)); + } while (read_seqcount_retry(&tkf->seq, seq)); + + return now; +} + +/** + * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + */ +u64 ktime_get_real_fast_ns(void) +{ + return __ktime_get_real_fast_ns(&tk_fast_mono); +} +EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); + +/** + * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. + * @tk: Timekeeper to snapshot. + * + * It generally is unsafe to access the clocksource after timekeeping has been + * suspended, so take a snapshot of the readout base of @tk and use it as the + * fast timekeeper's readout base while suspended. It will return the same + * number of cycles every time until timekeeping is resumed at which time the + * proper readout base for the fast timekeeper will be restored automatically. + */ +static void halt_fast_timekeeper(const struct timekeeper *tk) +{ + static struct tk_read_base tkr_dummy; + const struct tk_read_base *tkr = &tk->tkr_mono; + + memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); + cycles_at_suspend = tk_clock_read(tkr); + tkr_dummy.clock = &dummy_clock; + tkr_dummy.base_real = tkr->base + tk->offs_real; + update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); + + tkr = &tk->tkr_raw; + memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); + tkr_dummy.clock = &dummy_clock; + update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); +} + +static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); + +static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) +{ + raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk); +} + +/** + * pvclock_gtod_register_notifier - register a pvclock timedata update listener + */ +int pvclock_gtod_register_notifier(struct notifier_block *nb) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); + update_pvclock_gtod(tk, true); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); + +/** + * pvclock_gtod_unregister_notifier - unregister a pvclock + * timedata update listener + */ +int pvclock_gtod_unregister_notifier(struct notifier_block *nb) +{ + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); + +/* + * tk_update_leap_state - helper to update the next_leap_ktime + */ +static inline void tk_update_leap_state(struct timekeeper *tk) +{ + tk->next_leap_ktime = ntp_get_next_leap(); + if (tk->next_leap_ktime != KTIME_MAX) + /* Convert to monotonic time */ + tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); +} + +/* + * Update the ktime_t based scalar nsec members of the timekeeper + */ +static inline void tk_update_ktime_data(struct timekeeper *tk) +{ + u64 seconds; + u32 nsec; + + /* + * The xtime based monotonic readout is: + * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); + * The ktime based monotonic readout is: + * nsec = base_mono + now(); + * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + */ + seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); + nsec = (u32) tk->wall_to_monotonic.tv_nsec; + tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); + + /* + * The sum of the nanoseconds portions of xtime and + * wall_to_monotonic can be greater/equal one second. Take + * this into account before updating tk->ktime_sec. + */ + nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + if (nsec >= NSEC_PER_SEC) + seconds++; + tk->ktime_sec = seconds; + + /* Update the monotonic raw base */ + tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); +} + +/* must hold timekeeper_lock */ +static void timekeeping_update(struct timekeeper *tk, unsigned int action) +{ + if (action & TK_CLEAR_NTP) { + tk->ntp_error = 0; + ntp_clear(); + } + + tk_update_leap_state(tk); + tk_update_ktime_data(tk); + + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + + tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + + if (action & TK_CLOCK_WAS_SET) + tk->clock_was_set_seq++; + /* + * The mirroring of the data to the shadow-timekeeper needs + * to happen last here to ensure we don't over-write the + * timekeeper structure on the next update with stale data + */ + if (action & TK_MIRROR) + memcpy(&shadow_timekeeper, &tk_core.timekeeper, + sizeof(tk_core.timekeeper)); +} + +/** + * timekeeping_forward_now - update clock to the current time + * + * Forward the current clock to update its state since the last call to + * update_wall_time(). This is useful before significant clock changes, + * as it avoids having to deal with this time offset explicitly. + */ +static void timekeeping_forward_now(struct timekeeper *tk) +{ + u64 cycle_now, delta; + + cycle_now = tk_clock_read(&tk->tkr_mono); + delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + tk->tkr_mono.cycle_last = cycle_now; + tk->tkr_raw.cycle_last = cycle_now; + + tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; + + /* If arch requires, add in get_arch_timeoffset() */ + tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; + + + tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult; + + /* If arch requires, add in get_arch_timeoffset() */ + tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift; + + tk_normalize_xtime(tk); +} + +/** + * ktime_get_real_ts64 - Returns the time of day in a timespec64. + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec64 (WARN if suspended). + */ +void ktime_get_real_ts64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + + ts->tv_sec = tk->xtime_sec; + nsecs = timekeeping_get_ns(&tk->tkr_mono); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); +} +EXPORT_SYMBOL(ktime_get_real_ts64); + +ktime_t ktime_get(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get); + +u32 ktime_get_resolution_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + u32 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return nsecs; +} +EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); + +static ktime_t *offsets[TK_OFFS_MAX] = { + [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, + [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, + [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, +}; + +ktime_t ktime_get_with_offset(enum tk_offsets offs) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base, *offset = offsets[offs]; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = ktime_add(tk->tkr_mono.base, *offset); + nsecs = timekeeping_get_ns(&tk->tkr_mono); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); + +} +EXPORT_SYMBOL_GPL(ktime_get_with_offset); + +ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base, *offset = offsets[offs]; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = ktime_add(tk->tkr_mono.base, *offset); + nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); + +/** + * ktime_mono_to_any() - convert mononotic time to any other time + * @tmono: time to convert. + * @offs: which offset to use + */ +ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) +{ + ktime_t *offset = offsets[offs]; + unsigned long seq; + ktime_t tconv; + + do { + seq = read_seqcount_begin(&tk_core.seq); + tconv = ktime_add(tmono, *offset); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return tconv; +} +EXPORT_SYMBOL_GPL(ktime_mono_to_any); + +/** + * ktime_get_raw - Returns the raw monotonic time in ktime_t format + */ +ktime_t ktime_get_raw(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + u64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = tk->tkr_raw.base; + nsecs = timekeeping_get_ns(&tk->tkr_raw); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_raw); + +/** + * ktime_get_ts64 - get the monotonic clock in timespec64 format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec64 format in the variable pointed to by @ts. + */ +void ktime_get_ts64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 tomono; + unsigned int seq; + u64 nsec; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + ts->tv_sec = tk->xtime_sec; + nsec = timekeeping_get_ns(&tk->tkr_mono); + tomono = tk->wall_to_monotonic; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + ts->tv_sec += tomono.tv_sec; + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsec + tomono.tv_nsec); +} +EXPORT_SYMBOL_GPL(ktime_get_ts64); + +/** + * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC + * + * Returns the seconds portion of CLOCK_MONOTONIC with a single non + * serialized read. tk->ktime_sec is of type 'unsigned long' so this + * works on both 32 and 64 bit systems. On 32 bit systems the readout + * covers ~136 years of uptime which should be enough to prevent + * premature wrap arounds. + */ +time64_t ktime_get_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + WARN_ON(timekeeping_suspended); + return tk->ktime_sec; +} +EXPORT_SYMBOL_GPL(ktime_get_seconds); + +/** + * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME + * + * Returns the wall clock seconds since 1970. This replaces the + * get_seconds() interface which is not y2038 safe on 32bit systems. + * + * For 64bit systems the fast access to tk->xtime_sec is preserved. On + * 32bit systems the access must be protected with the sequence + * counter to provide "atomic" access to the 64bit tk->xtime_sec + * value. + */ +time64_t ktime_get_real_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + time64_t seconds; + unsigned int seq; + + if (IS_ENABLED(CONFIG_64BIT)) + return tk->xtime_sec; + + do { + seq = read_seqcount_begin(&tk_core.seq); + seconds = tk->xtime_sec; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return seconds; +} +EXPORT_SYMBOL_GPL(ktime_get_real_seconds); + +/** + * __ktime_get_real_seconds - The same as ktime_get_real_seconds + * but without the sequence counter protect. This internal function + * is called just when timekeeping lock is already held. + */ +time64_t __ktime_get_real_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return tk->xtime_sec; +} + +/** + * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter + * @systime_snapshot: pointer to struct receiving the system time snapshot + */ +void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + ktime_t base_raw; + ktime_t base_real; + u64 nsec_raw; + u64 nsec_real; + u64 now; + + WARN_ON_ONCE(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + now = tk_clock_read(&tk->tkr_mono); + systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; + systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + systime_snapshot->cycles = now; + systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_snapshot); + +/* Scale base by mult/div checking for overflow */ +static int scale64_check_overflow(u64 mult, u64 div, u64 *base) +{ + u64 tmp, rem; + + tmp = div64_u64_rem(*base, div, &rem); + + if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || + ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) + return -EOVERFLOW; + tmp *= mult; + + rem = div64_u64(rem * mult, div); + *base = tmp + rem; + return 0; +} + +/** + * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval + * @history: Snapshot representing start of history + * @partial_history_cycles: Cycle offset into history (fractional part) + * @total_history_cycles: Total history length in cycles + * @discontinuity: True indicates clock was set on history period + * @ts: Cross timestamp that should be adjusted using + * partial/total ratio + * + * Helper function used by get_device_system_crosststamp() to correct the + * crosstimestamp corresponding to the start of the current interval to the + * system counter value (timestamp point) provided by the driver. The + * total_history_* quantities are the total history starting at the provided + * reference point and ending at the start of the current interval. The cycle + * count between the driver timestamp point and the start of the current + * interval is partial_history_cycles. + */ +static int adjust_historical_crosststamp(struct system_time_snapshot *history, + u64 partial_history_cycles, + u64 total_history_cycles, + bool discontinuity, + struct system_device_crosststamp *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 corr_raw, corr_real; + bool interp_forward; + int ret; + + if (total_history_cycles == 0 || partial_history_cycles == 0) + return 0; + + /* Interpolate shortest distance from beginning or end of history */ + interp_forward = partial_history_cycles > total_history_cycles / 2; + partial_history_cycles = interp_forward ? + total_history_cycles - partial_history_cycles : + partial_history_cycles; + + /* + * Scale the monotonic raw time delta by: + * partial_history_cycles / total_history_cycles + */ + corr_raw = (u64)ktime_to_ns( + ktime_sub(ts->sys_monoraw, history->raw)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_raw); + if (ret) + return ret; + + /* + * If there is a discontinuity in the history, scale monotonic raw + * correction by: + * mult(real)/mult(raw) yielding the realtime correction + * Otherwise, calculate the realtime correction similar to monotonic + * raw calculation + */ + if (discontinuity) { + corr_real = mul_u64_u32_div + (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); + } else { + corr_real = (u64)ktime_to_ns( + ktime_sub(ts->sys_realtime, history->real)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_real); + if (ret) + return ret; + } + + /* Fixup monotonic raw and real time time values */ + if (interp_forward) { + ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); + ts->sys_realtime = ktime_add_ns(history->real, corr_real); + } else { + ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); + ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); + } + + return 0; +} + +/* + * cycle_between - true if test occurs chronologically between before and after + */ +static bool cycle_between(u64 before, u64 test, u64 after) +{ + if (test > before && test < after) + return true; + if (test < before && before > after) + return true; + return false; +} + +/** + * get_device_system_crosststamp - Synchronously capture system/device timestamp + * @get_time_fn: Callback to get simultaneous device time and + * system counter from the device driver + * @ctx: Context passed to get_time_fn() + * @history_begin: Historical reference point used to interpolate system + * time when counter provided by the driver is before the current interval + * @xtstamp: Receives simultaneously captured system and device time + * + * Reads a timestamp from a device and correlates it to system time + */ +int get_device_system_crosststamp(int (*get_time_fn) + (ktime_t *device_time, + struct system_counterval_t *sys_counterval, + void *ctx), + void *ctx, + struct system_time_snapshot *history_begin, + struct system_device_crosststamp *xtstamp) +{ + struct system_counterval_t system_counterval; + struct timekeeper *tk = &tk_core.timekeeper; + u64 cycles, now, interval_start; + unsigned int clock_was_set_seq = 0; + ktime_t base_real, base_raw; + u64 nsec_real, nsec_raw; + u8 cs_was_changed_seq; + unsigned long seq; + bool do_interp; + int ret; + + do { + seq = read_seqcount_begin(&tk_core.seq); + /* + * Try to synchronously capture device time and a system + * counter value calling back into the device driver + */ + ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); + if (ret) + return ret; + + /* + * Verify that the clocksource associated with the captured + * system counter value is the same as the currently installed + * timekeeper clocksource + */ + if (tk->tkr_mono.clock != system_counterval.cs) + return -ENODEV; + cycles = system_counterval.cycles; + + /* + * Check whether the system counter value provided by the + * device driver is on the current timekeeping interval. + */ + now = tk_clock_read(&tk->tkr_mono); + interval_start = tk->tkr_mono.cycle_last; + if (!cycle_between(interval_start, cycles, now)) { + clock_was_set_seq = tk->clock_was_set_seq; + cs_was_changed_seq = tk->cs_was_changed_seq; + cycles = interval_start; + do_interp = true; + } else { + do_interp = false; + } + + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, + system_counterval.cycles); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, + system_counterval.cycles); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); + xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); + + /* + * Interpolate if necessary, adjusting back from the start of the + * current interval + */ + if (do_interp) { + u64 partial_history_cycles, total_history_cycles; + bool discontinuity; + + /* + * Check that the counter value occurs after the provided + * history reference and that the history doesn't cross a + * clocksource change + */ + if (!history_begin || + !cycle_between(history_begin->cycles, + system_counterval.cycles, cycles) || + history_begin->cs_was_changed_seq != cs_was_changed_seq) + return -EINVAL; + partial_history_cycles = cycles - system_counterval.cycles; + total_history_cycles = cycles - history_begin->cycles; + discontinuity = + history_begin->clock_was_set_seq != clock_was_set_seq; + + ret = adjust_historical_crosststamp(history_begin, + partial_history_cycles, + total_history_cycles, + discontinuity, xtstamp); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(get_device_system_crosststamp); + +/** + * do_settimeofday64 - Sets the time of day. + * @ts: pointer to the timespec64 variable containing the new time + * + * Sets the time of day to the new time and update NTP and notify hrtimers + */ +int do_settimeofday64(const struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 ts_delta, xt; + unsigned long flags; + int ret = 0; + + if (!timespec64_valid_settod(ts)) + return -EINVAL; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + timekeeping_forward_now(tk); + + xt = tk_xtime(tk); + ts_delta = timespec64_sub(*ts, xt); + + if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) { + ret = -EINVAL; + goto out; + } + + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); + + tk_set_xtime(tk, ts); +out: + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return ret; +} +EXPORT_SYMBOL(do_settimeofday64); + +/** + * timekeeping_inject_offset - Adds or subtracts from the current time. + * @tv: pointer to the timespec variable containing the offset + * + * Adds or subtracts an offset value from the current time. + */ +static int timekeeping_inject_offset(const struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long flags; + struct timespec64 tmp; + int ret = 0; + + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + timekeeping_forward_now(tk); + + /* Make sure the proposed value is valid */ + tmp = timespec64_add(tk_xtime(tk), *ts); + if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || + !timespec64_valid_settod(&tmp)) { + ret = -EINVAL; + goto error; + } + + tk_xtime_add(tk, ts); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts)); + +error: /* even if we error out, we forwarded the time, so call update */ + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return ret; +} + +/* + * Indicates if there is an offset between the system clock and the hardware + * clock/persistent clock/rtc. + */ +int persistent_clock_is_local; + +/* + * Adjust the time obtained from the CMOS to be UTC time instead of + * local time. + * + * This is ugly, but preferable to the alternatives. Otherwise we + * would either need to write a program to do it in /etc/rc (and risk + * confusion if the program gets run more than once; it would also be + * hard to make the program warp the clock precisely n hours) or + * compile in the timezone information into the kernel. Bad, bad.... + * + * - TYT, 1992-01-01 + * + * The best thing to do is to keep the CMOS clock in universal time (UTC) + * as real UNIX machines always do it. This avoids all headaches about + * daylight saving times and warping kernel clocks. + */ +void timekeeping_warp_clock(void) +{ + if (sys_tz.tz_minuteswest != 0) { + struct timespec64 adjust; + + persistent_clock_is_local = 1; + adjust.tv_sec = sys_tz.tz_minuteswest * 60; + adjust.tv_nsec = 0; + timekeeping_inject_offset(&adjust); + } +} + +/** + * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic + * + */ +static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) +{ + tk->tai_offset = tai_offset; + tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); +} + +/** + * change_clocksource - Swaps clocksources if a new one is available + * + * Accumulates current time interval and initializes new clocksource + */ +static int change_clocksource(void *data) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct clocksource *new, *old; + unsigned long flags; + + new = (struct clocksource *) data; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + timekeeping_forward_now(tk); + /* + * If the cs is in module, get a module reference. Succeeds + * for built-in code (owner == NULL) as well. + */ + if (try_module_get(new->owner)) { + if (!new->enable || new->enable(new) == 0) { + old = tk->tkr_mono.clock; + tk_setup_internals(tk, new); + if (old->disable) + old->disable(old); + module_put(old->owner); + } else { + module_put(new->owner); + } + } + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + return 0; +} + +/** + * timekeeping_notify - Install a new clock source + * @clock: pointer to the clock source + * + * This function is called from clocksource.c after a new, better clock + * source has been registered. The caller holds the clocksource_mutex. + */ +int timekeeping_notify(struct clocksource *clock) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + if (tk->tkr_mono.clock == clock) + return 0; + stop_machine(change_clocksource, clock, NULL); + tick_clock_notify(); + return tk->tkr_mono.clock == clock ? 0 : -1; +} + +/** + * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec + * @ts: pointer to the timespec64 to be set + * + * Returns the raw monotonic time (completely un-modified by ntp) + */ +void ktime_get_raw_ts64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + u64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + ts->tv_sec = tk->raw_sec; + nsecs = timekeeping_get_ns(&tk->tkr_raw); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); +} +EXPORT_SYMBOL(ktime_get_raw_ts64); + + +/** + * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres + */ +int timekeeping_valid_for_hres(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + int ret; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ret; +} + +/** + * timekeeping_max_deferment - Returns max time the clocksource can be deferred + */ +u64 timekeeping_max_deferment(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + u64 ret; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + ret = tk->tkr_mono.clock->max_idle_ns; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ret; +} + +/** + * read_persistent_clock - Return time from the persistent clock. + * + * Weak dummy function for arches that do not yet support it. + * Reads the time from the battery backed persistent clock. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. + */ +void __weak read_persistent_clock(struct timespec *ts) +{ + ts->tv_sec = 0; + ts->tv_nsec = 0; +} + +void __weak read_persistent_clock64(struct timespec64 *ts64) +{ + struct timespec ts; + + read_persistent_clock(&ts); + *ts64 = timespec_to_timespec64(ts); +} + +/** + * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset + * from the boot. + * + * Weak dummy function for arches that do not yet support it. + * wall_time - current time as returned by persistent clock + * boot_offset - offset that is defined as wall_time - boot_time + * The default function calculates offset based on the current value of + * local_clock(). This way architectures that support sched_clock() but don't + * support dedicated boot time clock will provide the best estimate of the + * boot time. + */ +void __weak __init +read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, + struct timespec64 *boot_offset) +{ + read_persistent_clock64(wall_time); + *boot_offset = ns_to_timespec64(local_clock()); +} + +/* + * Flag reflecting whether timekeeping_resume() has injected sleeptime. + * + * The flag starts of false and is only set when a suspend reaches + * timekeeping_suspend(), timekeeping_resume() sets it to false when the + * timekeeper clocksource is not stopping across suspend and has been + * used to update sleep time. If the timekeeper clocksource has stopped + * then the flag stays true and is used by the RTC resume code to decide + * whether sleeptime must be injected and if so the flag gets false then. + * + * If a suspend fails before reaching timekeeping_resume() then the flag + * stays false and prevents erroneous sleeptime injection. + */ +static bool suspend_timing_needed; + +/* Flag for if there is a persistent clock on this platform */ +static bool persistent_clock_exists; + +/* + * timekeeping_init - Initializes the clocksource and common timekeeping values + */ +void __init timekeeping_init(void) +{ + struct timespec64 wall_time, boot_offset, wall_to_mono; + struct timekeeper *tk = &tk_core.timekeeper; + struct clocksource *clock; + unsigned long flags; + + read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); + if (timespec64_valid_settod(&wall_time) && + timespec64_to_ns(&wall_time) > 0) { + persistent_clock_exists = true; + } else if (timespec64_to_ns(&wall_time) != 0) { + pr_warn("Persistent clock returned invalid value"); + wall_time = (struct timespec64){0}; + } + + if (timespec64_compare(&wall_time, &boot_offset) < 0) + boot_offset = (struct timespec64){0}; + + /* + * We want set wall_to_mono, so the following is true: + * wall time + wall_to_mono = boot time + */ + wall_to_mono = timespec64_sub(boot_offset, wall_time); + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + ntp_init(); + + clock = clocksource_default_clock(); + if (clock->enable) + clock->enable(clock); + tk_setup_internals(tk, clock); + + tk_set_xtime(tk, &wall_time); + tk->raw_sec = 0; + + tk_set_wall_to_mono(tk, wall_to_mono); + + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} + +/* time in seconds when suspend began for persistent clock */ +static struct timespec64 timekeeping_suspend_time; + +/** + * __timekeeping_inject_sleeptime - Internal function to add sleep interval + * @delta: pointer to a timespec delta value + * + * Takes a timespec offset measuring a suspend interval and properly + * adds the sleep offset to the timekeeping variables. + */ +static void __timekeeping_inject_sleeptime(struct timekeeper *tk, + const struct timespec64 *delta) +{ + if (!timespec64_valid_strict(delta)) { + printk_deferred(KERN_WARNING + "__timekeeping_inject_sleeptime: Invalid " + "sleep delta value!\n"); + return; + } + tk_xtime_add(tk, delta); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); + tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); + tk_debug_account_sleep_time(delta); +} + +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) +/** + * We have three kinds of time sources to use for sleep time + * injection, the preference order is: + * 1) non-stop clocksource + * 2) persistent clock (ie: RTC accessible when irqs are off) + * 3) RTC + * + * 1) and 2) are used by timekeeping, 3) by RTC subsystem. + * If system has neither 1) nor 2), 3) will be used finally. + * + * + * If timekeeping has injected sleeptime via either 1) or 2), + * 3) becomes needless, so in this case we don't need to call + * rtc_resume(), and this is what timekeeping_rtc_skipresume() + * means. + */ +bool timekeeping_rtc_skipresume(void) +{ + return !suspend_timing_needed; +} + +/** + * 1) can be determined whether to use or not only when doing + * timekeeping_resume() which is invoked after rtc_suspend(), + * so we can't skip rtc_suspend() surely if system has 1). + * + * But if system has 2), 2) will definitely be used, so in this + * case we don't need to call rtc_suspend(), and this is what + * timekeeping_rtc_skipsuspend() means. + */ +bool timekeeping_rtc_skipsuspend(void) +{ + return persistent_clock_exists; +} + +/** + * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values + * @delta: pointer to a timespec64 delta value + * + * This hook is for architectures that cannot support read_persistent_clock64 + * because their RTC/persistent clock is only accessible when irqs are enabled. + * and also don't have an effective nonstop clocksource. + * + * This function should only be called by rtc_resume(), and allows + * a suspend offset to be injected into the timekeeping values. + */ +void timekeeping_inject_sleeptime64(const struct timespec64 *delta) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + suspend_timing_needed = false; + + timekeeping_forward_now(tk); + + __timekeeping_inject_sleeptime(tk, delta); + + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); +} +#endif + +/** + * timekeeping_resume - Resumes the generic timekeeping subsystem. + */ +void timekeeping_resume(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct clocksource *clock = tk->tkr_mono.clock; + unsigned long flags; + struct timespec64 ts_new, ts_delta; + u64 cycle_now, nsec; + bool inject_sleeptime = false; + + read_persistent_clock64(&ts_new); + + clockevents_resume(); + clocksource_resume(); + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + /* + * After system resumes, we need to calculate the suspended time and + * compensate it for the OS time. There are 3 sources that could be + * used: Nonstop clocksource during suspend, persistent clock and rtc + * device. + * + * One specific platform may have 1 or 2 or all of them, and the + * preference will be: + * suspend-nonstop clocksource -> persistent clock -> rtc + * The less preferred source will only be tried if there is no better + * usable source. The rtc part is handled separately in rtc core code. + */ + cycle_now = tk_clock_read(&tk->tkr_mono); + nsec = clocksource_stop_suspend_timing(clock, cycle_now); + if (nsec > 0) { + ts_delta = ns_to_timespec64(nsec); + inject_sleeptime = true; + } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { + ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); + inject_sleeptime = true; + } + + if (inject_sleeptime) { + suspend_timing_needed = false; + __timekeeping_inject_sleeptime(tk, &ts_delta); + } + + /* Re-base the last cycle value */ + tk->tkr_mono.cycle_last = cycle_now; + tk->tkr_raw.cycle_last = cycle_now; + + tk->ntp_error = 0; + timekeeping_suspended = 0; + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + touch_softlockup_watchdog(); + + tick_resume(); + hrtimers_resume(); +} + +int timekeeping_suspend(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long flags; + struct timespec64 delta, delta_delta; + static struct timespec64 old_delta; + struct clocksource *curr_clock; + u64 cycle_now; + + read_persistent_clock64(&timekeeping_suspend_time); + + /* + * On some systems the persistent_clock can not be detected at + * timekeeping_init by its return value, so if we see a valid + * value returned, update the persistent_clock_exists flag. + */ + if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) + persistent_clock_exists = true; + + suspend_timing_needed = true; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + timekeeping_forward_now(tk); + timekeeping_suspended = 1; + + /* + * Since we've called forward_now, cycle_last stores the value + * just read from the current clocksource. Save this to potentially + * use in suspend timing. + */ + curr_clock = tk->tkr_mono.clock; + cycle_now = tk->tkr_mono.cycle_last; + clocksource_start_suspend_timing(curr_clock, cycle_now); + + if (persistent_clock_exists) { + /* + * To avoid drift caused by repeated suspend/resumes, + * which each can add ~1 second drift error, + * try to compensate so the difference in system time + * and persistent_clock time stays close to constant. + */ + delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); + delta_delta = timespec64_sub(delta, old_delta); + if (abs(delta_delta.tv_sec) >= 2) { + /* + * if delta_delta is too large, assume time correction + * has occurred and set old_delta to the current delta. + */ + old_delta = delta; + } else { + /* Otherwise try to adjust old_system to compensate */ + timekeeping_suspend_time = + timespec64_add(timekeeping_suspend_time, delta_delta); + } + } + + timekeeping_update(tk, TK_MIRROR); + halt_fast_timekeeper(tk); + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + tick_suspend(); + clocksource_suspend(); + clockevents_suspend(); + + return 0; +} + +/* sysfs resume/suspend bits for timekeeping */ +static struct syscore_ops timekeeping_syscore_ops = { + .resume = timekeeping_resume, + .suspend = timekeeping_suspend, +}; + +static int __init timekeeping_init_ops(void) +{ + register_syscore_ops(&timekeeping_syscore_ops); + return 0; +} +device_initcall(timekeeping_init_ops); + +/* + * Apply a multiplier adjustment to the timekeeper + */ +static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, + s64 offset, + s32 mult_adj) +{ + s64 interval = tk->cycle_interval; + + if (mult_adj == 0) { + return; + } else if (mult_adj == -1) { + interval = -interval; + offset = -offset; + } else if (mult_adj != 1) { + interval *= mult_adj; + offset *= mult_adj; + } + + /* + * So the following can be confusing. + * + * To keep things simple, lets assume mult_adj == 1 for now. + * + * When mult_adj != 1, remember that the interval and offset values + * have been appropriately scaled so the math is the same. + * + * The basic idea here is that we're increasing the multiplier + * by one, this causes the xtime_interval to be incremented by + * one cycle_interval. This is because: + * xtime_interval = cycle_interval * mult + * So if mult is being incremented by one: + * xtime_interval = cycle_interval * (mult + 1) + * Its the same as: + * xtime_interval = (cycle_interval * mult) + cycle_interval + * Which can be shortened to: + * xtime_interval += cycle_interval + * + * So offset stores the non-accumulated cycles. Thus the current + * time (in shifted nanoseconds) is: + * now = (offset * adj) + xtime_nsec + * Now, even though we're adjusting the clock frequency, we have + * to keep time consistent. In other words, we can't jump back + * in time, and we also want to avoid jumping forward in time. + * + * So given the same offset value, we need the time to be the same + * both before and after the freq adjustment. + * now = (offset * adj_1) + xtime_nsec_1 + * now = (offset * adj_2) + xtime_nsec_2 + * So: + * (offset * adj_1) + xtime_nsec_1 = + * (offset * adj_2) + xtime_nsec_2 + * And we know: + * adj_2 = adj_1 + 1 + * So: + * (offset * adj_1) + xtime_nsec_1 = + * (offset * (adj_1+1)) + xtime_nsec_2 + * (offset * adj_1) + xtime_nsec_1 = + * (offset * adj_1) + offset + xtime_nsec_2 + * Canceling the sides: + * xtime_nsec_1 = offset + xtime_nsec_2 + * Which gives us: + * xtime_nsec_2 = xtime_nsec_1 - offset + * Which simplfies to: + * xtime_nsec -= offset + */ + if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { + /* NTP adjustment caused clocksource mult overflow */ + WARN_ON_ONCE(1); + return; + } + + tk->tkr_mono.mult += mult_adj; + tk->xtime_interval += interval; + tk->tkr_mono.xtime_nsec -= offset; +} + +/* + * Adjust the timekeeper's multiplier to the correct frequency + * and also to reduce the accumulated error value. + */ +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) +{ + u32 mult; + + /* + * Determine the multiplier from the current NTP tick length. + * Avoid expensive division when the tick length doesn't change. + */ + if (likely(tk->ntp_tick == ntp_tick_length())) { + mult = tk->tkr_mono.mult - tk->ntp_err_mult; + } else { + tk->ntp_tick = ntp_tick_length(); + mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - + tk->xtime_remainder, tk->cycle_interval); + } + + /* + * If the clock is behind the NTP time, increase the multiplier by 1 + * to catch up with it. If it's ahead and there was a remainder in the + * tick division, the clock will slow down. Otherwise it will stay + * ahead until the tick length changes to a non-divisible value. + */ + tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0; + mult += tk->ntp_err_mult; + + timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult); + + if (unlikely(tk->tkr_mono.clock->maxadj && + (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) + > tk->tkr_mono.clock->maxadj))) { + printk_once(KERN_WARNING + "Adjusting %s more than 11%% (%ld vs %ld)\n", + tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, + (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); + } + + /* + * It may be possible that when we entered this function, xtime_nsec + * was very small. Further, if we're slightly speeding the clocksource + * in the code above, its possible the required corrective factor to + * xtime_nsec could cause it to underflow. + * + * Now, since we have already accumulated the second and the NTP + * subsystem has been notified via second_overflow(), we need to skip + * the next update. + */ + if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { + tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC << + tk->tkr_mono.shift; + tk->xtime_sec--; + tk->skip_second_overflow = 1; + } +} + +/** + * accumulate_nsecs_to_secs - Accumulates nsecs into secs + * + * Helper function that accumulates the nsecs greater than a second + * from the xtime_nsec field to the xtime_secs field. + * It also calls into the NTP code to handle leapsecond processing. + * + */ +static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) +{ + u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; + unsigned int clock_set = 0; + + while (tk->tkr_mono.xtime_nsec >= nsecps) { + int leap; + + tk->tkr_mono.xtime_nsec -= nsecps; + tk->xtime_sec++; + + /* + * Skip NTP update if this second was accumulated before, + * i.e. xtime_nsec underflowed in timekeeping_adjust() + */ + if (unlikely(tk->skip_second_overflow)) { + tk->skip_second_overflow = 0; + continue; + } + + /* Figure out if its a leap sec and apply if needed */ + leap = second_overflow(tk->xtime_sec); + if (unlikely(leap)) { + struct timespec64 ts; + + tk->xtime_sec += leap; + + ts.tv_sec = leap; + ts.tv_nsec = 0; + tk_set_wall_to_mono(tk, + timespec64_sub(tk->wall_to_monotonic, ts)); + + __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); + + clock_set = TK_CLOCK_WAS_SET; + } + } + return clock_set; +} + +/** + * logarithmic_accumulation - shifted accumulation of cycles + * + * This functions accumulates a shifted interval of cycles into + * into a shifted interval nanoseconds. Allows for O(log) accumulation + * loop. + * + * Returns the unconsumed cycles. + */ +static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, + u32 shift, unsigned int *clock_set) +{ + u64 interval = tk->cycle_interval << shift; + u64 snsec_per_sec; + + /* If the offset is smaller than a shifted interval, do nothing */ + if (offset < interval) + return offset; + + /* Accumulate one shifted interval */ + offset -= interval; + tk->tkr_mono.cycle_last += interval; + tk->tkr_raw.cycle_last += interval; + + tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; + *clock_set |= accumulate_nsecs_to_secs(tk); + + /* Accumulate raw time */ + tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; + snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; + while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { + tk->tkr_raw.xtime_nsec -= snsec_per_sec; + tk->raw_sec++; + } + + /* Accumulate error between NTP and clock interval */ + tk->ntp_error += tk->ntp_tick << shift; + tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << + (tk->ntp_error_shift + shift); + + return offset; +} + +/* + * timekeeping_advance - Updates the timekeeper to the current time and + * current NTP tick length + */ +static void timekeeping_advance(enum timekeeping_adv_mode mode) +{ + struct timekeeper *real_tk = &tk_core.timekeeper; + struct timekeeper *tk = &shadow_timekeeper; + u64 offset; + int shift = 0, maxshift; + unsigned int clock_set = 0; + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + + /* Make sure we're fully resumed: */ + if (unlikely(timekeeping_suspended)) + goto out; + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET + offset = real_tk->cycle_interval; + + if (mode != TK_ADV_TICK) + goto out; +#else + offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), + tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + + /* Check if there's really nothing to do */ + if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) + goto out; +#endif + + /* Do some additional sanity checking */ + timekeeping_check_update(tk, offset); + + /* + * With NO_HZ we may have to accumulate many cycle_intervals + * (think "ticks") worth of time at once. To do this efficiently, + * we calculate the largest doubling multiple of cycle_intervals + * that is smaller than the offset. We then accumulate that + * chunk in one go, and then try to consume the next smaller + * doubled multiple. + */ + shift = ilog2(offset) - ilog2(tk->cycle_interval); + shift = max(0, shift); + /* Bound shift to one less than what overflows tick_length */ + maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; + shift = min(shift, maxshift); + while (offset >= tk->cycle_interval) { + offset = logarithmic_accumulation(tk, offset, shift, + &clock_set); + if (offset < tk->cycle_interval<<shift) + shift--; + } + + /* Adjust the multiplier to correct NTP error */ + timekeeping_adjust(tk, offset); + + /* + * Finally, make sure that after the rounding + * xtime_nsec isn't larger than NSEC_PER_SEC + */ + clock_set |= accumulate_nsecs_to_secs(tk); + + write_seqcount_begin(&tk_core.seq); + /* + * Update the real timekeeper. + * + * We could avoid this memcpy by switching pointers, but that + * requires changes to all other timekeeper usage sites as + * well, i.e. move the timekeeper pointer getter into the + * spinlocked/seqcount protected sections. And we trade this + * memcpy under the tk_core.seq against one before we start + * updating. + */ + timekeeping_update(tk, clock_set); + memcpy(real_tk, tk, sizeof(*tk)); + /* The memcpy must come last. Do not put anything here! */ + write_seqcount_end(&tk_core.seq); +out: + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + if (clock_set) + /* Have to call _delayed version, since in irq context*/ + clock_was_set_delayed(); +} + +/** + * update_wall_time - Uses the current clocksource to increment the wall time + * + */ +void update_wall_time(void) +{ + timekeeping_advance(TK_ADV_TICK); +} + +/** + * getboottime64 - Return the real time of system boot. + * @ts: pointer to the timespec64 to be set + * + * Returns the wall-time of boot in a timespec64. + * + * This is based on the wall_to_monotonic offset and the total suspend + * time. Calls to settimeofday will affect the value returned (which + * basically means that however wrong your real time clock is at boot time, + * you get the right time here). + */ +void getboottime64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); + + *ts = ktime_to_timespec64(t); +} +EXPORT_SYMBOL_GPL(getboottime64); + +void ktime_get_coarse_real_ts64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + *ts = tk_xtime(tk); + } while (read_seqcount_retry(&tk_core.seq, seq)); +} +EXPORT_SYMBOL(ktime_get_coarse_real_ts64); + +void ktime_get_coarse_ts64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 now, mono; + unsigned long seq; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + now = tk_xtime(tk); + mono = tk->wall_to_monotonic; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, + now.tv_nsec + mono.tv_nsec); +} +EXPORT_SYMBOL(ktime_get_coarse_ts64); + +/* + * Must hold jiffies_lock + */ +void do_timer(unsigned long ticks) +{ + jiffies_64 += ticks; + calc_global_load(ticks); +} + +/** + * ktime_get_update_offsets_now - hrtimer helper + * @cwsseq: pointer to check and store the clock was set sequence number + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * @offs_tai: pointer to storage for monotonic -> clock tai offset + * + * Returns current monotonic time and updates the offsets if the + * sequence number in @cwsseq and timekeeper.clock_was_set_seq are + * different. + * + * Called from hrtimer_interrupt() or retrigger_next_event() + */ +ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, + ktime_t *offs_boot, ktime_t *offs_tai) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + u64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + base = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); + base = ktime_add_ns(base, nsecs); + + if (*cwsseq != tk->clock_was_set_seq) { + *cwsseq = tk->clock_was_set_seq; + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; + } + + /* Handle leapsecond insertion adjustments */ + if (unlikely(base >= tk->next_leap_ktime)) + *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0)); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return base; +} + +/** + * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex + */ +static int timekeeping_validate_timex(const struct timex *txc) +{ + if (txc->modes & ADJ_ADJTIME) { + /* singleshot must not be used with any other mode bits */ + if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) + return -EINVAL; + if (!(txc->modes & ADJ_OFFSET_READONLY) && + !capable(CAP_SYS_TIME)) + return -EPERM; + } else { + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + /* + * if the quartz is off by more than 10% then + * something is VERY wrong! + */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; + } + + if (txc->modes & ADJ_SETOFFSET) { + /* In order to inject time, you gotta be super-user! */ + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + /* + * Validate if a timespec/timeval used to inject a time + * offset is valid. Offsets can be postive or negative, so + * we don't check tv_sec. The value of the timeval/timespec + * is the sum of its fields,but *NOTE*: + * The field tv_usec/tv_nsec must always be non-negative and + * we can't have more nanoseconds/microseconds than a second. + */ + if (txc->time.tv_usec < 0) + return -EINVAL; + + if (txc->modes & ADJ_NANO) { + if (txc->time.tv_usec >= NSEC_PER_SEC) + return -EINVAL; + } else { + if (txc->time.tv_usec >= USEC_PER_SEC) + return -EINVAL; + } + } + + /* + * Check for potential multiplication overflows that can + * only happen on 64-bit systems: + */ + if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { + if (LLONG_MIN / PPM_SCALE > txc->freq) + return -EINVAL; + if (LLONG_MAX / PPM_SCALE < txc->freq) + return -EINVAL; + } + + return 0; +} + +/** + * random_get_entropy_fallback - Returns the raw clock source value, + * used by random.c for platforms with no valid random_get_entropy(). + */ +unsigned long random_get_entropy_fallback(void) +{ + struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; + struct clocksource *clock = READ_ONCE(tkr->clock); + + if (unlikely(timekeeping_suspended || !clock)) + return 0; + return clock->read(clock); +} +EXPORT_SYMBOL_GPL(random_get_entropy_fallback); + +/** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + */ +int do_adjtimex(struct timex *txc) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long flags; + struct timespec64 ts; + s32 orig_tai, tai; + int ret; + + /* Validate the data before disabling interrupts */ + ret = timekeeping_validate_timex(txc); + if (ret) + return ret; + + if (txc->modes & ADJ_SETOFFSET) { + struct timespec64 delta; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + ret = timekeeping_inject_offset(&delta); + if (ret) + return ret; + } + + ktime_get_real_ts64(&ts); + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + orig_tai = tai = tk->tai_offset; + ret = __do_adjtimex(txc, &ts, &tai); + + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tk, tai); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + } + tk_update_leap_state(tk); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + /* Update the multiplier immediately if frequency was set directly */ + if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) + timekeeping_advance(TK_ADV_FREQ); + + if (tai != orig_tai) + clock_was_set(); + + ntp_notify_cmos_timer(); + + return ret; +} + +#ifdef CONFIG_NTP_PPS +/** + * hardpps() - Accessor function to NTP __hardpps function + */ +void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + __hardpps(phase_ts, raw_ts); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} +EXPORT_SYMBOL(hardpps); +#endif /* CONFIG_NTP_PPS */ + +/** + * xtime_update() - advances the timekeeping infrastructure + * @ticks: number of ticks, that have elapsed since the last call. + * + * Must be called with interrupts disabled. + */ +void xtime_update(unsigned long ticks) +{ + write_seqlock(&jiffies_lock); + do_timer(ticks); + write_sequnlock(&jiffies_lock); + update_wall_time(); +} diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h new file mode 100644 index 000000000..141ab3ab0 --- /dev/null +++ b/kernel/time/timekeeping.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_TIME_TIMEKEEPING_H +#define _KERNEL_TIME_TIMEKEEPING_H +/* + * Internal interfaces for kernel/time/ + */ +extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, + ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); + +extern int timekeeping_valid_for_hres(void); +extern u64 timekeeping_max_deferment(void); +extern void timekeeping_warp_clock(void); +extern int timekeeping_suspend(void); +extern void timekeeping_resume(void); +#ifdef CONFIG_GENERIC_SCHED_CLOCK +extern int sched_clock_suspend(void); +extern void sched_clock_resume(void); +#else +static inline int sched_clock_suspend(void) { return 0; } +static inline void sched_clock_resume(void) { } +#endif + +extern void do_timer(unsigned long ticks); +extern void update_wall_time(void); + +extern seqlock_t jiffies_lock; + +#define CS_NAME_LEN 32 + +#endif diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c new file mode 100644 index 000000000..238e4be60 --- /dev/null +++ b/kernel/time/timekeeping_debug.c @@ -0,0 +1,82 @@ +/* + * debugfs file to track time spent in suspend + * + * Copyright (c) 2011, Google, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/debugfs.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> +#include <linux/suspend.h> +#include <linux/time.h> + +#include "timekeeping_internal.h" + +#define NUM_BINS 32 + +static unsigned int sleep_time_bin[NUM_BINS] = {0}; + +static int tk_debug_show_sleep_time(struct seq_file *s, void *data) +{ + unsigned int bin; + seq_puts(s, " time (secs) count\n"); + seq_puts(s, "------------------------------\n"); + for (bin = 0; bin < 32; bin++) { + if (sleep_time_bin[bin] == 0) + continue; + seq_printf(s, "%10u - %-10u %4u\n", + bin ? 1 << (bin - 1) : 0, 1 << bin, + sleep_time_bin[bin]); + } + return 0; +} + +static int tk_debug_sleep_time_open(struct inode *inode, struct file *file) +{ + return single_open(file, tk_debug_show_sleep_time, NULL); +} + +static const struct file_operations tk_debug_sleep_time_fops = { + .open = tk_debug_sleep_time_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init tk_debug_sleep_time_init(void) +{ + struct dentry *d; + + d = debugfs_create_file("sleep_time", 0444, NULL, NULL, + &tk_debug_sleep_time_fops); + if (!d) { + pr_err("Failed to create sleep_time debug file\n"); + return -ENOMEM; + } + + return 0; +} +late_initcall(tk_debug_sleep_time_init); + +void tk_debug_account_sleep_time(const struct timespec64 *t) +{ + /* Cap bin index so we don't overflow the array */ + int bin = min(fls(t->tv_sec), NUM_BINS-1); + + sleep_time_bin[bin]++; + pm_deferred_pr_dbg("Timekeeping suspended for %lld.%03lu seconds\n", + (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC); +} + diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h new file mode 100644 index 000000000..bcbb52db2 --- /dev/null +++ b/kernel/time/timekeeping_internal.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TIMEKEEPING_INTERNAL_H +#define _TIMEKEEPING_INTERNAL_H +/* + * timekeeping debug functions + */ +#include <linux/clocksource.h> +#include <linux/time.h> + +#ifdef CONFIG_DEBUG_FS +extern void tk_debug_account_sleep_time(const struct timespec64 *t); +#else +#define tk_debug_account_sleep_time(x) +#endif + +#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE +static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) +{ + u64 ret = (now - last) & mask; + + /* + * Prevent time going backwards by checking the MSB of mask in + * the result. If set, return 0. + */ + return ret & ~(mask >> 1) ? 0 : ret; +} +#else +static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) +{ + return (now - last) & mask; +} +#endif + +#endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c new file mode 100644 index 000000000..a6e88d9bb --- /dev/null +++ b/kernel/time/timer.c @@ -0,0 +1,2013 @@ +/* + * linux/kernel/timer.c + * + * Kernel internal timers + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. + * + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to + * serialize accesses to xtime/lost_ticks). + * Copyright (C) 1998 Andrea Arcangeli + * 1999-03-10 Improved NTP compatibility by Ulrich Windl + * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love + * 2000-10-05 Implemented scalable SMP per-CPU timer handling. + * Copyright (C) 2000, 2001, 2002 Ingo Molnar + * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar + */ + +#include <linux/kernel_stat.h> +#include <linux/export.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/pid_namespace.h> +#include <linux/notifier.h> +#include <linux/thread_info.h> +#include <linux/time.h> +#include <linux/jiffies.h> +#include <linux/posix-timers.h> +#include <linux/cpu.h> +#include <linux/syscalls.h> +#include <linux/delay.h> +#include <linux/tick.h> +#include <linux/kallsyms.h> +#include <linux/irq_work.h> +#include <linux/sched/signal.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/nohz.h> +#include <linux/sched/debug.h> +#include <linux/slab.h> +#include <linux/compat.h> +#include <linux/random.h> + +#include <linux/uaccess.h> +#include <asm/unistd.h> +#include <asm/div64.h> +#include <asm/timex.h> +#include <asm/io.h> + +#include "tick-internal.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/timer.h> + +__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + +/* + * The timer wheel has LVL_DEPTH array levels. Each level provides an array of + * LVL_SIZE buckets. Each level is driven by its own clock and therefor each + * level has a different granularity. + * + * The level granularity is: LVL_CLK_DIV ^ lvl + * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) + * + * The array level of a newly armed timer depends on the relative expiry + * time. The farther the expiry time is away the higher the array level and + * therefor the granularity becomes. + * + * Contrary to the original timer wheel implementation, which aims for 'exact' + * expiry of the timers, this implementation removes the need for recascading + * the timers into the lower array levels. The previous 'classic' timer wheel + * implementation of the kernel already violated the 'exact' expiry by adding + * slack to the expiry time to provide batched expiration. The granularity + * levels provide implicit batching. + * + * This is an optimization of the original timer wheel implementation for the + * majority of the timer wheel use cases: timeouts. The vast majority of + * timeout timers (networking, disk I/O ...) are canceled before expiry. If + * the timeout expires it indicates that normal operation is disturbed, so it + * does not matter much whether the timeout comes with a slight delay. + * + * The only exception to this are networking timers with a small expiry + * time. They rely on the granularity. Those fit into the first wheel level, + * which has HZ granularity. + * + * We don't have cascading anymore. timers with a expiry time above the + * capacity of the last wheel level are force expired at the maximum timeout + * value of the last wheel level. From data sampling we know that the maximum + * value observed is 5 days (network connection tracking), so this should not + * be an issue. + * + * The currently chosen array constants values are a good compromise between + * array size and granularity. + * + * This results in the following granularity and range levels: + * + * HZ 1000 steps + * Level Offset Granularity Range + * 0 0 1 ms 0 ms - 63 ms + * 1 64 8 ms 64 ms - 511 ms + * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s) + * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s) + * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m) + * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m) + * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h) + * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d) + * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d) + * + * HZ 300 + * Level Offset Granularity Range + * 0 0 3 ms 0 ms - 210 ms + * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s) + * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s) + * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m) + * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m) + * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h) + * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h) + * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d) + * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) + * + * HZ 250 + * Level Offset Granularity Range + * 0 0 4 ms 0 ms - 255 ms + * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s) + * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s) + * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m) + * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m) + * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h) + * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) + * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) + * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) + * + * HZ 100 + * Level Offset Granularity Range + * 0 0 10 ms 0 ms - 630 ms + * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s) + * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s) + * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m) + * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m) + * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h) + * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d) + * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d) + */ + +/* Clock divisor for the next level */ +#define LVL_CLK_SHIFT 3 +#define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT) +#define LVL_CLK_MASK (LVL_CLK_DIV - 1) +#define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT) +#define LVL_GRAN(n) (1UL << LVL_SHIFT(n)) + +/* + * The time start value for each level to select the bucket at enqueue + * time. + */ +#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) + +/* Size of each clock level */ +#define LVL_BITS 6 +#define LVL_SIZE (1UL << LVL_BITS) +#define LVL_MASK (LVL_SIZE - 1) +#define LVL_OFFS(n) ((n) * LVL_SIZE) + +/* Level depth */ +#if HZ > 100 +# define LVL_DEPTH 9 +# else +# define LVL_DEPTH 8 +#endif + +/* The cutoff (max. capacity of the wheel) */ +#define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH)) +#define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) + +/* + * The resulting wheel size. If NOHZ is configured we allocate two + * wheels so we have a separate storage for the deferrable timers. + */ +#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) + +#ifdef CONFIG_NO_HZ_COMMON +# define NR_BASES 2 +# define BASE_STD 0 +# define BASE_DEF 1 +#else +# define NR_BASES 1 +# define BASE_STD 0 +# define BASE_DEF 0 +#endif + +struct timer_base { + raw_spinlock_t lock; + struct timer_list *running_timer; + unsigned long clk; + unsigned long next_expiry; + unsigned int cpu; + bool is_idle; + bool must_forward_clk; + DECLARE_BITMAP(pending_map, WHEEL_SIZE); + struct hlist_head vectors[WHEEL_SIZE]; +} ____cacheline_aligned; + +static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); + +#ifdef CONFIG_NO_HZ_COMMON + +static DEFINE_STATIC_KEY_FALSE(timers_nohz_active); +static DEFINE_MUTEX(timer_keys_mutex); + +static void timer_update_keys(struct work_struct *work); +static DECLARE_WORK(timer_update_work, timer_update_keys); + +#ifdef CONFIG_SMP +unsigned int sysctl_timer_migration = 1; + +DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); + +static void timers_update_migration(void) +{ + if (sysctl_timer_migration && tick_nohz_active) + static_branch_enable(&timers_migration_enabled); + else + static_branch_disable(&timers_migration_enabled); +} +#else +static inline void timers_update_migration(void) { } +#endif /* !CONFIG_SMP */ + +static void timer_update_keys(struct work_struct *work) +{ + mutex_lock(&timer_keys_mutex); + timers_update_migration(); + static_branch_enable(&timers_nohz_active); + mutex_unlock(&timer_keys_mutex); +} + +void timers_update_nohz(void) +{ + schedule_work(&timer_update_work); +} + +int timer_migration_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + mutex_lock(&timer_keys_mutex); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) + timers_update_migration(); + mutex_unlock(&timer_keys_mutex); + return ret; +} + +static inline bool is_timers_nohz_active(void) +{ + return static_branch_unlikely(&timers_nohz_active); +} +#else +static inline bool is_timers_nohz_active(void) { return false; } +#endif /* NO_HZ_COMMON */ + +static unsigned long round_jiffies_common(unsigned long j, int cpu, + bool force_up) +{ + int rem; + unsigned long original = j; + + /* + * We don't want all cpus firing their timers at once hitting the + * same lock or cachelines, so we skew each extra cpu with an extra + * 3 jiffies. This 3 jiffies came originally from the mm/ code which + * already did this. + * The skew is done by adding 3*cpunr, then round, then subtract this + * extra offset again. + */ + j += cpu * 3; + + rem = j % HZ; + + /* + * If the target jiffie is just after a whole second (which can happen + * due to delays of the timer irq, long irq off times etc etc) then + * we should round down to the whole second, not up. Use 1/4th second + * as cutoff for this rounding as an extreme upper bound for this. + * But never round down if @force_up is set. + */ + if (rem < HZ/4 && !force_up) /* round down */ + j = j - rem; + else /* round up */ + j = j - rem + HZ; + + /* now that we have rounded, subtract the extra skew again */ + j -= cpu * 3; + + /* + * Make sure j is still in the future. Otherwise return the + * unmodified value. + */ + return time_is_after_jiffies(j) ? j : original; +} + +/** + * __round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, false); +} +EXPORT_SYMBOL_GPL(__round_jiffies); + +/** + * __round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, false) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_relative); + +/** + * round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long round_jiffies(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), false); +} +EXPORT_SYMBOL_GPL(round_jiffies); + +/** + * round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long round_jiffies_relative(unsigned long j) +{ + return __round_jiffies_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_relative); + +/** + * __round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, true); +} +EXPORT_SYMBOL_GPL(__round_jiffies_up); + +/** + * __round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, true) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); + +/** + * round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * This is the same as round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), true); +} +EXPORT_SYMBOL_GPL(round_jiffies_up); + +/** + * round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * This is the same as round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up_relative(unsigned long j) +{ + return __round_jiffies_up_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_up_relative); + + +static inline unsigned int timer_get_idx(struct timer_list *timer) +{ + return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT; +} + +static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) +{ + timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | + idx << TIMER_ARRAYSHIFT; +} + +/* + * Helper function to calculate the array index for a given expiry + * time. + */ +static inline unsigned calc_index(unsigned expires, unsigned lvl) +{ + expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); + return LVL_OFFS(lvl) + (expires & LVL_MASK); +} + +static int calc_wheel_index(unsigned long expires, unsigned long clk) +{ + unsigned long delta = expires - clk; + unsigned int idx; + + if (delta < LVL_START(1)) { + idx = calc_index(expires, 0); + } else if (delta < LVL_START(2)) { + idx = calc_index(expires, 1); + } else if (delta < LVL_START(3)) { + idx = calc_index(expires, 2); + } else if (delta < LVL_START(4)) { + idx = calc_index(expires, 3); + } else if (delta < LVL_START(5)) { + idx = calc_index(expires, 4); + } else if (delta < LVL_START(6)) { + idx = calc_index(expires, 5); + } else if (delta < LVL_START(7)) { + idx = calc_index(expires, 6); + } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { + idx = calc_index(expires, 7); + } else if ((long) delta < 0) { + idx = clk & LVL_MASK; + } else { + /* + * Force expire obscene large timeouts to expire at the + * capacity limit of the wheel. + */ + if (delta >= WHEEL_TIMEOUT_CUTOFF) + expires = clk + WHEEL_TIMEOUT_MAX; + + idx = calc_index(expires, LVL_DEPTH - 1); + } + return idx; +} + +/* + * Enqueue the timer into the hash bucket, mark it pending in + * the bitmap and store the index in the timer flags. + */ +static void enqueue_timer(struct timer_base *base, struct timer_list *timer, + unsigned int idx) +{ + hlist_add_head(&timer->entry, base->vectors + idx); + __set_bit(idx, base->pending_map); + timer_set_idx(timer, idx); +} + +static void +__internal_add_timer(struct timer_base *base, struct timer_list *timer) +{ + unsigned int idx; + + idx = calc_wheel_index(timer->expires, base->clk); + enqueue_timer(base, timer, idx); +} + +static void +trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) +{ + if (!is_timers_nohz_active()) + return; + + /* + * TODO: This wants some optimizing similar to the code below, but we + * will do that when we switch from push to pull for deferrable timers. + */ + if (timer->flags & TIMER_DEFERRABLE) { + if (tick_nohz_full_cpu(base->cpu)) + wake_up_nohz_cpu(base->cpu); + return; + } + + /* + * We might have to IPI the remote CPU if the base is idle and the + * timer is not deferrable. If the other CPU is on the way to idle + * then it can't set base->is_idle as we hold the base lock: + */ + if (!base->is_idle) + return; + + /* Check whether this is the new first expiring timer: */ + if (time_after_eq(timer->expires, base->next_expiry)) + return; + + /* + * Set the next expiry time and kick the CPU so it can reevaluate the + * wheel: + */ + if (time_before(timer->expires, base->clk)) { + /* + * Prevent from forward_timer_base() moving the base->clk + * backward + */ + base->next_expiry = base->clk; + } else { + base->next_expiry = timer->expires; + } + wake_up_nohz_cpu(base->cpu); +} + +static void +internal_add_timer(struct timer_base *base, struct timer_list *timer) +{ + __internal_add_timer(base, timer); + trigger_dyntick_cpu(base, timer); +} + +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS + +static struct debug_obj_descr timer_debug_descr; + +static void *timer_debug_hint(void *addr) +{ + return ((struct timer_list *) addr)->function; +} + +static bool timer_is_static_object(void *addr) +{ + struct timer_list *timer = addr; + + return (timer->entry.pprev == NULL && + timer->entry.next == TIMER_ENTRY_STATIC); +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static bool timer_fixup_init(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_init(timer, &timer_debug_descr); + return true; + default: + return false; + } +} + +/* Stub timer callback for improperly used timers. */ +static void stub_timer(struct timer_list *unused) +{ + WARN_ON(1); +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown non-static object is activated + */ +static bool timer_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_NOTAVAILABLE: + timer_setup(timer, stub_timer, 0); + return true; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return false; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static bool timer_fixup_free(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_free(timer, &timer_debug_descr); + return true; + default: + return false; + } +} + +/* + * fixup_assert_init is called when: + * - an untracked/uninit-ed object is found + */ +static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_NOTAVAILABLE: + timer_setup(timer, stub_timer, 0); + return true; + default: + return false; + } +} + +static struct debug_obj_descr timer_debug_descr = { + .name = "timer_list", + .debug_hint = timer_debug_hint, + .is_static_object = timer_is_static_object, + .fixup_init = timer_fixup_init, + .fixup_activate = timer_fixup_activate, + .fixup_free = timer_fixup_free, + .fixup_assert_init = timer_fixup_assert_init, +}; + +static inline void debug_timer_init(struct timer_list *timer) +{ + debug_object_init(timer, &timer_debug_descr); +} + +static inline void debug_timer_activate(struct timer_list *timer) +{ + debug_object_activate(timer, &timer_debug_descr); +} + +static inline void debug_timer_deactivate(struct timer_list *timer) +{ + debug_object_deactivate(timer, &timer_debug_descr); +} + +static inline void debug_timer_free(struct timer_list *timer) +{ + debug_object_free(timer, &timer_debug_descr); +} + +static inline void debug_timer_assert_init(struct timer_list *timer) +{ + debug_object_assert_init(timer, &timer_debug_descr); +} + +static void do_init_timer(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags, + const char *name, struct lock_class_key *key); + +void init_timer_on_stack_key(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags, + const char *name, struct lock_class_key *key) +{ + debug_object_init_on_stack(timer, &timer_debug_descr); + do_init_timer(timer, func, flags, name, key); +} +EXPORT_SYMBOL_GPL(init_timer_on_stack_key); + +void destroy_timer_on_stack(struct timer_list *timer) +{ + debug_object_free(timer, &timer_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_timer_on_stack); + +#else +static inline void debug_timer_init(struct timer_list *timer) { } +static inline void debug_timer_activate(struct timer_list *timer) { } +static inline void debug_timer_deactivate(struct timer_list *timer) { } +static inline void debug_timer_assert_init(struct timer_list *timer) { } +#endif + +static inline void debug_init(struct timer_list *timer) +{ + debug_timer_init(timer); + trace_timer_init(timer); +} + +static inline void +debug_activate(struct timer_list *timer, unsigned long expires) +{ + debug_timer_activate(timer); + trace_timer_start(timer, expires, timer->flags); +} + +static inline void debug_deactivate(struct timer_list *timer) +{ + debug_timer_deactivate(timer); + trace_timer_cancel(timer); +} + +static inline void debug_assert_init(struct timer_list *timer) +{ + debug_timer_assert_init(timer); +} + +static void do_init_timer(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags, + const char *name, struct lock_class_key *key) +{ + timer->entry.pprev = NULL; + timer->function = func; + timer->flags = flags | raw_smp_processor_id(); + lockdep_init_map(&timer->lockdep_map, name, key, 0); +} + +/** + * init_timer_key - initialize a timer + * @timer: the timer to be initialized + * @func: timer callback function + * @flags: timer flags + * @name: name of the timer + * @key: lockdep class key of the fake lock used for tracking timer + * sync lock dependencies + * + * init_timer_key() must be done to a timer prior calling *any* of the + * other timer functions. + */ +void init_timer_key(struct timer_list *timer, + void (*func)(struct timer_list *), unsigned int flags, + const char *name, struct lock_class_key *key) +{ + debug_init(timer); + do_init_timer(timer, func, flags, name, key); +} +EXPORT_SYMBOL(init_timer_key); + +static inline void detach_timer(struct timer_list *timer, bool clear_pending) +{ + struct hlist_node *entry = &timer->entry; + + debug_deactivate(timer); + + __hlist_del(entry); + if (clear_pending) + entry->pprev = NULL; + entry->next = LIST_POISON2; +} + +static int detach_if_pending(struct timer_list *timer, struct timer_base *base, + bool clear_pending) +{ + unsigned idx = timer_get_idx(timer); + + if (!timer_pending(timer)) + return 0; + + if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) + __clear_bit(idx, base->pending_map); + + detach_timer(timer, clear_pending); + return 1; +} + +static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) +{ + struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); + + /* + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. + */ + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) + base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); + return base; +} + +static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + /* + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. + */ + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) + base = this_cpu_ptr(&timer_bases[BASE_DEF]); + return base; +} + +static inline struct timer_base *get_timer_base(u32 tflags) +{ + return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); +} + +static inline struct timer_base * +get_target_base(struct timer_base *base, unsigned tflags) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + if (static_branch_likely(&timers_migration_enabled) && + !(tflags & TIMER_PINNED)) + return get_timer_cpu_base(tflags, get_nohz_timer_target()); +#endif + return get_timer_this_cpu_base(tflags); +} + +static inline void forward_timer_base(struct timer_base *base) +{ +#ifdef CONFIG_NO_HZ_COMMON + unsigned long jnow; + + /* + * We only forward the base when we are idle or have just come out of + * idle (must_forward_clk logic), and have a delta between base clock + * and jiffies. In the common case, run_timers will take care of it. + */ + if (likely(!base->must_forward_clk)) + return; + + jnow = READ_ONCE(jiffies); + base->must_forward_clk = base->is_idle; + if ((long)(jnow - base->clk) < 2) + return; + + /* + * If the next expiry value is > jiffies, then we fast forward to + * jiffies otherwise we forward to the next expiry value. + */ + if (time_after(base->next_expiry, jnow)) { + base->clk = jnow; + } else { + if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) + return; + base->clk = base->next_expiry; + } +#endif +} + + +/* + * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means + * that all timers which are tied to this base are locked, and the base itself + * is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found in the base->vectors array. + * + * When a timer is migrating then the TIMER_MIGRATING flag is set and we need + * to wait until the migration is done. + */ +static struct timer_base *lock_timer_base(struct timer_list *timer, + unsigned long *flags) + __acquires(timer->base->lock) +{ + for (;;) { + struct timer_base *base; + u32 tf; + + /* + * We need to use READ_ONCE() here, otherwise the compiler + * might re-read @tf between the check for TIMER_MIGRATING + * and spin_lock(). + */ + tf = READ_ONCE(timer->flags); + + if (!(tf & TIMER_MIGRATING)) { + base = get_timer_base(tf); + raw_spin_lock_irqsave(&base->lock, *flags); + if (timer->flags == tf) + return base; + raw_spin_unlock_irqrestore(&base->lock, *flags); + } + cpu_relax(); + } +} + +#define MOD_TIMER_PENDING_ONLY 0x01 +#define MOD_TIMER_REDUCE 0x02 + +static inline int +__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) +{ + struct timer_base *base, *new_base; + unsigned int idx = UINT_MAX; + unsigned long clk = 0, flags; + int ret = 0; + + BUG_ON(!timer->function); + + /* + * This is a common optimization triggered by the networking code - if + * the timer is re-modified to have the same timeout or ends up in the + * same array bucket then just return: + */ + if (timer_pending(timer)) { + /* + * The downside of this optimization is that it can result in + * larger granularity than you would get from adding a new + * timer with this expiry. + */ + long diff = timer->expires - expires; + + if (!diff) + return 1; + if (options & MOD_TIMER_REDUCE && diff <= 0) + return 1; + + /* + * We lock timer base and calculate the bucket index right + * here. If the timer ends up in the same bucket, then we + * just update the expiry time and avoid the whole + * dequeue/enqueue dance. + */ + base = lock_timer_base(timer, &flags); + forward_timer_base(base); + + if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) && + time_before_eq(timer->expires, expires)) { + ret = 1; + goto out_unlock; + } + + clk = base->clk; + idx = calc_wheel_index(expires, clk); + + /* + * Retrieve and compare the array index of the pending + * timer. If it matches set the expiry to the new value so a + * subsequent call will exit in the expires check above. + */ + if (idx == timer_get_idx(timer)) { + if (!(options & MOD_TIMER_REDUCE)) + timer->expires = expires; + else if (time_after(timer->expires, expires)) + timer->expires = expires; + ret = 1; + goto out_unlock; + } + } else { + base = lock_timer_base(timer, &flags); + forward_timer_base(base); + } + + ret = detach_if_pending(timer, base, false); + if (!ret && (options & MOD_TIMER_PENDING_ONLY)) + goto out_unlock; + + new_base = get_target_base(base, timer->flags); + + if (base != new_base) { + /* + * We are trying to schedule the timer on the new base. + * However we can't change timer's base while it is running, + * otherwise del_timer_sync() can't detect that the timer's + * handler yet has not finished. This also guarantees that the + * timer is serialized wrt itself. + */ + if (likely(base->running_timer != timer)) { + /* See the comment in lock_timer_base() */ + timer->flags |= TIMER_MIGRATING; + + raw_spin_unlock(&base->lock); + base = new_base; + raw_spin_lock(&base->lock); + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | base->cpu); + forward_timer_base(base); + } + } + + debug_activate(timer, expires); + + timer->expires = expires; + /* + * If 'idx' was calculated above and the base time did not advance + * between calculating 'idx' and possibly switching the base, only + * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise + * we need to (re)calculate the wheel index via + * internal_add_timer(). + */ + if (idx != UINT_MAX && clk == base->clk) { + enqueue_timer(base, timer, idx); + trigger_dyntick_cpu(base, timer); + } else { + internal_add_timer(base, timer); + } + +out_unlock: + raw_spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + +/** + * mod_timer_pending - modify a pending timer's timeout + * @timer: the pending timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer_pending() is the same for pending timers as mod_timer(), + * but will not re-activate and modify already deleted timers. + * + * It is useful for unserialized use of timers. + */ +int mod_timer_pending(struct timer_list *timer, unsigned long expires) +{ + return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY); +} +EXPORT_SYMBOL(mod_timer_pending); + +/** + * mod_timer - modify a timer's timeout + * @timer: the timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer() is a more efficient way to update the expire field of an + * active timer (if the timer is inactive it will be activated) + * + * mod_timer(timer, expires) is equivalent to: + * + * del_timer(timer); timer->expires = expires; add_timer(timer); + * + * Note that if there are multiple unserialized concurrent users of the + * same timer, then mod_timer() is the only safe way to modify the timeout, + * since add_timer() cannot modify an already running timer. + * + * The function returns whether it has modified a pending timer or not. + * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an + * active timer returns 1.) + */ +int mod_timer(struct timer_list *timer, unsigned long expires) +{ + return __mod_timer(timer, expires, 0); +} +EXPORT_SYMBOL(mod_timer); + +/** + * timer_reduce - Modify a timer's timeout if it would reduce the timeout + * @timer: The timer to be modified + * @expires: New timeout in jiffies + * + * timer_reduce() is very similar to mod_timer(), except that it will only + * modify a running timer if that would reduce the expiration time (it will + * start a timer that isn't running). + */ +int timer_reduce(struct timer_list *timer, unsigned long expires) +{ + return __mod_timer(timer, expires, MOD_TIMER_REDUCE); +} +EXPORT_SYMBOL(timer_reduce); + +/** + * add_timer - start a timer + * @timer: the timer to be added + * + * The kernel will do a ->function(@timer) callback from the + * timer interrupt at the ->expires point in the future. The + * current time is 'jiffies'. + * + * The timer's ->expires, ->function fields must be set prior calling this + * function. + * + * Timers with an ->expires field in the past will be executed in the next + * timer tick. + */ +void add_timer(struct timer_list *timer) +{ + BUG_ON(timer_pending(timer)); + mod_timer(timer, timer->expires); +} +EXPORT_SYMBOL(add_timer); + +/** + * add_timer_on - start a timer on a particular CPU + * @timer: the timer to be added + * @cpu: the CPU to start it on + * + * This is not very scalable on SMP. Double adds are not possible. + */ +void add_timer_on(struct timer_list *timer, int cpu) +{ + struct timer_base *new_base, *base; + unsigned long flags; + + BUG_ON(timer_pending(timer) || !timer->function); + + new_base = get_timer_cpu_base(timer->flags, cpu); + + /* + * If @timer was on a different CPU, it should be migrated with the + * old base locked to prevent other operations proceeding with the + * wrong base locked. See lock_timer_base(). + */ + base = lock_timer_base(timer, &flags); + if (base != new_base) { + timer->flags |= TIMER_MIGRATING; + + raw_spin_unlock(&base->lock); + base = new_base; + raw_spin_lock(&base->lock); + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | cpu); + } + forward_timer_base(base); + + debug_activate(timer, timer->expires); + internal_add_timer(base, timer); + raw_spin_unlock_irqrestore(&base->lock, flags); +} +EXPORT_SYMBOL_GPL(add_timer_on); + +/** + * del_timer - deactivate a timer. + * @timer: the timer to be deactivated + * + * del_timer() deactivates a timer - this works on both active and inactive + * timers. + * + * The function returns whether it has deactivated a pending timer or not. + * (ie. del_timer() of an inactive timer returns 0, del_timer() of an + * active timer returns 1.) + */ +int del_timer(struct timer_list *timer) +{ + struct timer_base *base; + unsigned long flags; + int ret = 0; + + debug_assert_init(timer); + + if (timer_pending(timer)) { + base = lock_timer_base(timer, &flags); + ret = detach_if_pending(timer, base, true); + raw_spin_unlock_irqrestore(&base->lock, flags); + } + + return ret; +} +EXPORT_SYMBOL(del_timer); + +/** + * try_to_del_timer_sync - Try to deactivate a timer + * @timer: timer to delete + * + * This function tries to deactivate a timer. Upon successful (ret >= 0) + * exit the timer is not queued and the handler is not running on any CPU. + */ +int try_to_del_timer_sync(struct timer_list *timer) +{ + struct timer_base *base; + unsigned long flags; + int ret = -1; + + debug_assert_init(timer); + + base = lock_timer_base(timer, &flags); + + if (base->running_timer != timer) + ret = detach_if_pending(timer, base, true); + + raw_spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} +EXPORT_SYMBOL(try_to_del_timer_sync); + +#ifdef CONFIG_SMP +/** + * del_timer_sync - deactivate a timer and wait for the handler to finish. + * @timer: the timer to be deactivated + * + * This function only differs from del_timer() on SMP: besides deactivating + * the timer it also makes sure the handler has finished executing on other + * CPUs. + * + * Synchronization rules: Callers must prevent restarting of the timer, + * otherwise this function is meaningless. It must not be called from + * interrupt contexts unless the timer is an irqsafe one. The caller must + * not hold locks which would prevent completion of the timer's + * handler. The timer's handler must not call add_timer_on(). Upon exit the + * timer is not queued and the handler is not running on any CPU. + * + * Note: For !irqsafe timers, you must not hold locks that are held in + * interrupt context while calling this function. Even if the lock has + * nothing to do with the timer in question. Here's why:: + * + * CPU0 CPU1 + * ---- ---- + * <SOFTIRQ> + * call_timer_fn(); + * base->running_timer = mytimer; + * spin_lock_irq(somelock); + * <IRQ> + * spin_lock(somelock); + * del_timer_sync(mytimer); + * while (base->running_timer == mytimer); + * + * Now del_timer_sync() will never return and never release somelock. + * The interrupt on the other CPU is waiting to grab somelock but + * it has interrupted the softirq that CPU0 is waiting to finish. + * + * The function returns whether it has deactivated a pending timer or not. + */ +int del_timer_sync(struct timer_list *timer) +{ +#ifdef CONFIG_LOCKDEP + unsigned long flags; + + /* + * If lockdep gives a backtrace here, please reference + * the synchronization rules above. + */ + local_irq_save(flags); + lock_map_acquire(&timer->lockdep_map); + lock_map_release(&timer->lockdep_map); + local_irq_restore(flags); +#endif + /* + * don't use it in hardirq context, because it + * could lead to deadlock. + */ + WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); + for (;;) { + int ret = try_to_del_timer_sync(timer); + if (ret >= 0) + return ret; + cpu_relax(); + } +} +EXPORT_SYMBOL(del_timer_sync); +#endif + +static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *)) +{ + int count = preempt_count(); + +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the timer from inside the + * function that is called from it, this we need to take into + * account for lockdep too. To avoid bogus "held lock freed" + * warnings as well as problems when looking into + * timer->lockdep_map, make a copy and use that here. + */ + struct lockdep_map lockdep_map; + + lockdep_copy_map(&lockdep_map, &timer->lockdep_map); +#endif + /* + * Couple the lock chain with the lock chain at + * del_timer_sync() by acquiring the lock_map around the fn() + * call here and in del_timer_sync(). + */ + lock_map_acquire(&lockdep_map); + + trace_timer_expire_entry(timer); + fn(timer); + trace_timer_expire_exit(timer); + + lock_map_release(&lockdep_map); + + if (count != preempt_count()) { + WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + fn, count, preempt_count()); + /* + * Restore the preempt count. That gives us a decent + * chance to survive and extract information. If the + * callback kept a lock held, bad luck, but not worse + * than the BUG() we had. + */ + preempt_count_set(count); + } +} + +static void expire_timers(struct timer_base *base, struct hlist_head *head) +{ + while (!hlist_empty(head)) { + struct timer_list *timer; + void (*fn)(struct timer_list *); + + timer = hlist_entry(head->first, struct timer_list, entry); + + base->running_timer = timer; + detach_timer(timer, true); + + fn = timer->function; + + if (timer->flags & TIMER_IRQSAFE) { + raw_spin_unlock(&base->lock); + call_timer_fn(timer, fn); + raw_spin_lock(&base->lock); + } else { + raw_spin_unlock_irq(&base->lock); + call_timer_fn(timer, fn); + raw_spin_lock_irq(&base->lock); + } + } +} + +static int __collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + unsigned long clk = base->clk; + struct hlist_head *vec; + int i, levels = 0; + unsigned int idx; + + for (i = 0; i < LVL_DEPTH; i++) { + idx = (clk & LVL_MASK) + i * LVL_SIZE; + + if (__test_and_clear_bit(idx, base->pending_map)) { + vec = base->vectors + idx; + hlist_move_list(vec, heads++); + levels++; + } + /* Is it time to look at the next level? */ + if (clk & LVL_CLK_MASK) + break; + /* Shift clock for the next level granularity */ + clk >>= LVL_CLK_SHIFT; + } + return levels; +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * Find the next pending bucket of a level. Search from level start (@offset) + * + @clk upwards and if nothing there, search from start of the level + * (@offset) up to @offset + clk. + */ +static int next_pending_bucket(struct timer_base *base, unsigned offset, + unsigned clk) +{ + unsigned pos, start = offset + clk; + unsigned end = offset + LVL_SIZE; + + pos = find_next_bit(base->pending_map, end, start); + if (pos < end) + return pos - start; + + pos = find_next_bit(base->pending_map, start, offset); + return pos < start ? pos + LVL_SIZE - start : -1; +} + +/* + * Search the first expiring timer in the various clock levels. Caller must + * hold base->lock. + */ +static unsigned long __next_timer_interrupt(struct timer_base *base) +{ + unsigned long clk, next, adj; + unsigned lvl, offset = 0; + + next = base->clk + NEXT_TIMER_MAX_DELTA; + clk = base->clk; + for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { + int pos = next_pending_bucket(base, offset, clk & LVL_MASK); + + if (pos >= 0) { + unsigned long tmp = clk + (unsigned long) pos; + + tmp <<= LVL_SHIFT(lvl); + if (time_before(tmp, next)) + next = tmp; + } + /* + * Clock for the next level. If the current level clock lower + * bits are zero, we look at the next level as is. If not we + * need to advance it by one because that's going to be the + * next expiring bucket in that level. base->clk is the next + * expiring jiffie. So in case of: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 0 0 + * + * we have to look at all levels @index 0. With + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 0 2 + * + * LVL0 has the next expiring bucket @index 2. The upper + * levels have the next expiring bucket @index 1. + * + * In case that the propagation wraps the next level the same + * rules apply: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 F 2 + * + * So after looking at LVL0 we get: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 + * 0 0 0 1 0 + * + * So no propagation from LVL1 to LVL2 because that happened + * with the add already, but then we need to propagate further + * from LVL2 to LVL3. + * + * So the simple check whether the lower bits of the current + * level are 0 or not is sufficient for all cases. + */ + adj = clk & LVL_CLK_MASK ? 1 : 0; + clk >>= LVL_CLK_SHIFT; + clk += adj; + } + return next; +} + +/* + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) +{ + u64 nextevt = hrtimer_get_next_event(); + + /* + * If high resolution timers are enabled + * hrtimer_get_next_event() returns KTIME_MAX. + */ + if (expires <= nextevt) + return expires; + + /* + * If the next timer is already expired, return the tick base + * time so the tick is fired immediately. + */ + if (nextevt <= basem) + return basem; + + /* + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to + * make sure that this tick really expires the timer to avoid + * a ping pong of the nohz stop code. + * + * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3 + */ + return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; +} + +/** + * get_next_timer_interrupt - return the time (clock mono) of the next timer + * @basej: base time jiffies + * @basem: base time clock monotonic + * + * Returns the tick aligned clock monotonic time of the next pending + * timer or KTIME_MAX if no timer is pending. + */ +u64 get_next_timer_interrupt(unsigned long basej, u64 basem) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + u64 expires = KTIME_MAX; + unsigned long nextevt; + bool is_max_delta; + + /* + * Pretend that there is no timer pending if the cpu is offline. + * Possible pending timers will be migrated later to an active cpu. + */ + if (cpu_is_offline(smp_processor_id())) + return expires; + + raw_spin_lock(&base->lock); + nextevt = __next_timer_interrupt(base); + is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); + base->next_expiry = nextevt; + /* + * We have a fresh next event. Check whether we can forward the + * base. We can only do that when @basej is past base->clk + * otherwise we might rewind base->clk. + */ + if (time_after(basej, base->clk)) { + if (time_after(nextevt, basej)) + base->clk = basej; + else if (time_after(nextevt, base->clk)) + base->clk = nextevt; + } + + if (time_before_eq(nextevt, basej)) { + expires = basem; + base->is_idle = false; + } else { + if (!is_max_delta) + expires = basem + (u64)(nextevt - basej) * TICK_NSEC; + /* + * If we expect to sleep more than a tick, mark the base idle. + * Also the tick is stopped so any added timer must forward + * the base clk itself to keep granularity small. This idle + * logic is only maintained for the BASE_STD base, deferrable + * timers may still see large granularity skew (by design). + */ + if ((expires - basem) > TICK_NSEC) { + base->must_forward_clk = true; + base->is_idle = true; + } + } + raw_spin_unlock(&base->lock); + + return cmp_next_hrtimer_event(basem, expires); +} + +/** + * timer_clear_idle - Clear the idle state of the timer base + * + * Called with interrupts disabled + */ +void timer_clear_idle(void) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + /* + * We do this unlocked. The worst outcome is a remote enqueue sending + * a pointless IPI, but taking the lock would just make the window for + * sending the IPI a few instructions smaller for the cost of taking + * the lock in the exit from idle path. + */ + base->is_idle = false; +} + +static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + unsigned long now = READ_ONCE(jiffies); + + /* + * NOHZ optimization. After a long idle sleep we need to forward the + * base to current jiffies. Avoid a loop by searching the bitfield for + * the next expiring timer. + */ + if ((long)(now - base->clk) > 2) { + unsigned long next = __next_timer_interrupt(base); + + /* + * If the next timer is ahead of time forward to current + * jiffies, otherwise forward to the next expiry time: + */ + if (time_after(next, now)) { + /* + * The call site will increment base->clk and then + * terminate the expiry loop immediately. + */ + base->clk = now; + return 0; + } + base->clk = next; + } + return __collect_expired_timers(base, heads); +} +#else +static inline int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + return __collect_expired_timers(base, heads); +} +#endif + +/* + * Called from the timer interrupt handler to charge one tick to the current + * process. user_tick is 1 if the tick is user time, 0 for system. + */ +void update_process_times(int user_tick) +{ + struct task_struct *p = current; + + /* Note: this timer irq context must be accounted for as well. */ + account_process_tick(p, user_tick); + run_local_timers(); + rcu_check_callbacks(user_tick); +#ifdef CONFIG_IRQ_WORK + if (in_irq()) + irq_work_tick(); +#endif + scheduler_tick(); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) + run_posix_cpu_timers(p); +} + +/** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. + */ +static inline void __run_timers(struct timer_base *base) +{ + struct hlist_head heads[LVL_DEPTH]; + int levels; + + if (!time_after_eq(jiffies, base->clk)) + return; + + raw_spin_lock_irq(&base->lock); + + /* + * timer_base::must_forward_clk must be cleared before running + * timers so that any timer functions that call mod_timer() will + * not try to forward the base. Idle tracking / clock forwarding + * logic is only used with BASE_STD timers. + * + * The must_forward_clk flag is cleared unconditionally also for + * the deferrable base. The deferrable base is not affected by idle + * tracking and never forwarded, so clearing the flag is a NOOP. + * + * The fact that the deferrable base is never forwarded can cause + * large variations in granularity for deferrable timers, but they + * can be deferred for long periods due to idle anyway. + */ + base->must_forward_clk = false; + + while (time_after_eq(jiffies, base->clk)) { + + levels = collect_expired_timers(base, heads); + base->clk++; + + while (levels--) + expire_timers(base, heads + levels); + } + base->running_timer = NULL; + raw_spin_unlock_irq(&base->lock); +} + +/* + * This function runs timers and the timer-tq in bottom half context. + */ +static __latent_entropy void run_timer_softirq(struct softirq_action *h) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + __run_timers(base); + if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) + __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); +} + +/* + * Called by the local, per-CPU timer interrupt on SMP. + */ +void run_local_timers(void) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + hrtimer_run_queues(); + /* Raise the softirq only if required. */ + if (time_before(jiffies, base->clk)) { + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) + return; + /* CPU is awake, so check the deferrable base. */ + base++; + if (time_before(jiffies, base->clk)) + return; + } + raise_softirq(TIMER_SOFTIRQ); +} + +/* + * Since schedule_timeout()'s timer is defined on the stack, it must store + * the target task on the stack as well. + */ +struct process_timer { + struct timer_list timer; + struct task_struct *task; +}; + +static void process_timeout(struct timer_list *t) +{ + struct process_timer *timeout = from_timer(timeout, t, timer); + + wake_up_process(timeout->task); +} + +/** + * schedule_timeout - sleep until timeout + * @timeout: timeout value in jiffies + * + * Make the current task sleep until @timeout jiffies have + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process())". + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task or the current task is explicitly woken + * up. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. + * + * Returns 0 when the timer has expired otherwise the remaining time in + * jiffies will be returned. In all cases the return value is guaranteed + * to be non-negative. + */ +signed long __sched schedule_timeout(signed long timeout) +{ + struct process_timer timer; + unsigned long expire; + + switch (timeout) + { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable + * in the caller. Nothing more. We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative value + * but I' d like to return a valid offset (>=0) to allow + * the caller to do everything it want with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. Note that the retval will be + * 0 since no piece of kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happens anyway). You just have the printk() + * that will tell you if something is gone wrong and where. + */ + if (timeout < 0) { + printk(KERN_ERR "schedule_timeout: wrong timeout " + "value %lx\n", timeout); + dump_stack(); + current->state = TASK_RUNNING; + goto out; + } + } + + expire = timeout + jiffies; + + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + __mod_timer(&timer.timer, expire, 0); + schedule(); + del_singleshot_timer_sync(&timer.timer); + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer.timer); + + timeout = expire - jiffies; + + out: + return timeout < 0 ? 0 : timeout; +} +EXPORT_SYMBOL(schedule_timeout); + +/* + * We can use __set_current_state() here because schedule_timeout() calls + * schedule() unconditionally. + */ +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); + +signed long __sched schedule_timeout_killable(signed long timeout) +{ + __set_current_state(TASK_KILLABLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_killable); + +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_uninterruptible); + +/* + * Like schedule_timeout_uninterruptible(), except this task will not contribute + * to load average. + */ +signed long __sched schedule_timeout_idle(signed long timeout) +{ + __set_current_state(TASK_IDLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_idle); + +#ifdef CONFIG_HOTPLUG_CPU +static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) +{ + struct timer_list *timer; + int cpu = new_base->cpu; + + while (!hlist_empty(head)) { + timer = hlist_entry(head->first, struct timer_list, entry); + detach_timer(timer, false); + timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; + internal_add_timer(new_base, timer); + } +} + +int timers_prepare_cpu(unsigned int cpu) +{ + struct timer_base *base; + int b; + + for (b = 0; b < NR_BASES; b++) { + base = per_cpu_ptr(&timer_bases[b], cpu); + base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->is_idle = false; + base->must_forward_clk = true; + } + return 0; +} + +int timers_dead_cpu(unsigned int cpu) +{ + struct timer_base *old_base; + struct timer_base *new_base; + int b, i; + + BUG_ON(cpu_online(cpu)); + + for (b = 0; b < NR_BASES; b++) { + old_base = per_cpu_ptr(&timer_bases[b], cpu); + new_base = get_cpu_ptr(&timer_bases[b]); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + raw_spin_lock_irq(&new_base->lock); + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + /* + * The current CPUs base clock might be stale. Update it + * before moving the timers over. + */ + forward_timer_base(new_base); + + BUG_ON(old_base->running_timer); + + for (i = 0; i < WHEEL_SIZE; i++) + migrate_timer_list(new_base, old_base->vectors + i); + + raw_spin_unlock(&old_base->lock); + raw_spin_unlock_irq(&new_base->lock); + put_cpu_ptr(&timer_bases); + } + return 0; +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +static void __init init_timer_cpu(int cpu) +{ + struct timer_base *base; + int i; + + for (i = 0; i < NR_BASES; i++) { + base = per_cpu_ptr(&timer_bases[i], cpu); + base->cpu = cpu; + raw_spin_lock_init(&base->lock); + base->clk = jiffies; + } +} + +static void __init init_timer_cpus(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + init_timer_cpu(cpu); +} + +void __init init_timers(void) +{ + init_timer_cpus(); + open_softirq(TIMER_SOFTIRQ, run_timer_softirq); +} + +/** + * msleep - sleep safely even with waitqueue interruptions + * @msecs: Time in milliseconds to sleep for + */ +void msleep(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +} + +EXPORT_SYMBOL(msleep); + +/** + * msleep_interruptible - sleep waiting for signals + * @msecs: Time in milliseconds to sleep for + */ +unsigned long msleep_interruptible(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); + return jiffies_to_msecs(timeout); +} + +EXPORT_SYMBOL(msleep_interruptible); + +/** + * usleep_range - Sleep for an approximate time + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + * + * In non-atomic context where the exact wakeup time is flexible, use + * usleep_range() instead of udelay(). The sleep improves responsiveness + * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces + * power usage by allowing hrtimers to take advantage of an already- + * scheduled interrupt instead of scheduling a new one just for this sleep. + */ +void __sched usleep_range(unsigned long min, unsigned long max) +{ + ktime_t exp = ktime_add_us(ktime_get(), min); + u64 delta = (u64)(max - min) * NSEC_PER_USEC; + + for (;;) { + __set_current_state(TASK_UNINTERRUPTIBLE); + /* Do not return before the requested sleep time has elapsed */ + if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) + break; + } +} +EXPORT_SYMBOL(usleep_range); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c new file mode 100644 index 000000000..07afcfe2a --- /dev/null +++ b/kernel/time/timer_list.c @@ -0,0 +1,385 @@ +/* + * kernel/time/timer_list.c + * + * List pending timers + * + * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/nmi.h> + +#include <linux/uaccess.h> + +#include "tick-internal.h" + +struct timer_list_iter { + int cpu; + bool second_pass; + u64 now; +}; + +/* + * This allows printing both to /proc/timer_list and + * to the console (on SysRq-Q): + */ +__printf(2, 3) +static void SEQ_printf(struct seq_file *m, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + + if (m) + seq_vprintf(m, fmt, args); + else + vprintk(fmt, args); + + va_end(args); +} + +static void print_name_offset(struct seq_file *m, void *sym) +{ + char symname[KSYM_NAME_LEN]; + + if (lookup_symbol_name((unsigned long)sym, symname) < 0) + SEQ_printf(m, "<%pK>", sym); + else + SEQ_printf(m, "%s", symname); +} + +static void +print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, + int idx, u64 now) +{ + SEQ_printf(m, " #%d: ", idx); + print_name_offset(m, taddr); + SEQ_printf(m, ", "); + print_name_offset(m, timer->function); + SEQ_printf(m, ", S:%02x", timer->state); + SEQ_printf(m, "\n"); + SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", + (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), + (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)), + (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now), + (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); +} + +static void +print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, + u64 now) +{ + struct hrtimer *timer, tmp; + unsigned long next = 0, i; + struct timerqueue_node *curr; + unsigned long flags; + +next_one: + i = 0; + + touch_nmi_watchdog(); + + raw_spin_lock_irqsave(&base->cpu_base->lock, flags); + + curr = timerqueue_getnext(&base->active); + /* + * Crude but we have to do this O(N*N) thing, because + * we have to unlock the base when printing: + */ + while (curr && i < next) { + curr = timerqueue_iterate_next(curr); + i++; + } + + if (curr) { + + timer = container_of(curr, struct hrtimer, node); + tmp = *timer; + raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); + + print_timer(m, timer, &tmp, i, now); + next++; + goto next_one; + } + raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); +} + +static void +print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) +{ + SEQ_printf(m, " .base: %pK\n", base); + SEQ_printf(m, " .index: %d\n", base->index); + + SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); + + SEQ_printf(m, " .get_time: "); + print_name_offset(m, base->get_time); + SEQ_printf(m, "\n"); +#ifdef CONFIG_HIGH_RES_TIMERS + SEQ_printf(m, " .offset: %Lu nsecs\n", + (unsigned long long) ktime_to_ns(base->offset)); +#endif + SEQ_printf(m, "active timers:\n"); + print_active_timers(m, base, now + ktime_to_ns(base->offset)); +} + +static void print_cpu(struct seq_file *m, int cpu, u64 now) +{ + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + int i; + + SEQ_printf(m, "cpu: %d\n", cpu); + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + SEQ_printf(m, " clock %d:\n", i); + print_base(m, cpu_base->clock_base + i, now); + } +#define P(x) \ + SEQ_printf(m, " .%-15s: %Lu\n", #x, \ + (unsigned long long)(cpu_base->x)) +#define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ + (unsigned long long)(ktime_to_ns(cpu_base->x))) + +#ifdef CONFIG_HIGH_RES_TIMERS + P_ns(expires_next); + P(hres_active); + P(nr_events); + P(nr_retries); + P(nr_hangs); + P(max_hang_time); +#endif +#undef P +#undef P_ns + +#ifdef CONFIG_TICK_ONESHOT +# define P(x) \ + SEQ_printf(m, " .%-15s: %Lu\n", #x, \ + (unsigned long long)(ts->x)) +# define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ + (unsigned long long)(ktime_to_ns(ts->x))) + { + struct tick_sched *ts = tick_get_tick_sched(cpu); + P(nohz_mode); + P_ns(last_tick); + P(tick_stopped); + P(idle_jiffies); + P(idle_calls); + P(idle_sleeps); + P_ns(idle_entrytime); + P_ns(idle_waketime); + P_ns(idle_exittime); + P_ns(idle_sleeptime); + P_ns(iowait_sleeptime); + P(last_jiffies); + P(next_timer); + P_ns(idle_expires); + SEQ_printf(m, "jiffies: %Lu\n", + (unsigned long long)jiffies); + } +#endif + +#undef P +#undef P_ns + SEQ_printf(m, "\n"); +} + +#ifdef CONFIG_GENERIC_CLOCKEVENTS +static void +print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) +{ + struct clock_event_device *dev = td->evtdev; + + touch_nmi_watchdog(); + + SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); + if (cpu < 0) + SEQ_printf(m, "Broadcast device\n"); + else + SEQ_printf(m, "Per CPU device: %d\n", cpu); + + SEQ_printf(m, "Clock Event Device: "); + if (!dev) { + SEQ_printf(m, "<NULL>\n"); + return; + } + SEQ_printf(m, "%s\n", dev->name); + SEQ_printf(m, " max_delta_ns: %llu\n", + (unsigned long long) dev->max_delta_ns); + SEQ_printf(m, " min_delta_ns: %llu\n", + (unsigned long long) dev->min_delta_ns); + SEQ_printf(m, " mult: %u\n", dev->mult); + SEQ_printf(m, " shift: %u\n", dev->shift); + SEQ_printf(m, " mode: %d\n", clockevent_get_state(dev)); + SEQ_printf(m, " next_event: %Ld nsecs\n", + (unsigned long long) ktime_to_ns(dev->next_event)); + + SEQ_printf(m, " set_next_event: "); + print_name_offset(m, dev->set_next_event); + SEQ_printf(m, "\n"); + + if (dev->set_state_shutdown) { + SEQ_printf(m, " shutdown: "); + print_name_offset(m, dev->set_state_shutdown); + SEQ_printf(m, "\n"); + } + + if (dev->set_state_periodic) { + SEQ_printf(m, " periodic: "); + print_name_offset(m, dev->set_state_periodic); + SEQ_printf(m, "\n"); + } + + if (dev->set_state_oneshot) { + SEQ_printf(m, " oneshot: "); + print_name_offset(m, dev->set_state_oneshot); + SEQ_printf(m, "\n"); + } + + if (dev->set_state_oneshot_stopped) { + SEQ_printf(m, " oneshot stopped: "); + print_name_offset(m, dev->set_state_oneshot_stopped); + SEQ_printf(m, "\n"); + } + + if (dev->tick_resume) { + SEQ_printf(m, " resume: "); + print_name_offset(m, dev->tick_resume); + SEQ_printf(m, "\n"); + } + + SEQ_printf(m, " event_handler: "); + print_name_offset(m, dev->event_handler); + SEQ_printf(m, "\n"); + SEQ_printf(m, " retries: %lu\n", dev->retries); + SEQ_printf(m, "\n"); +} + +static void timer_list_show_tickdevices_header(struct seq_file *m) +{ +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST + print_tickdevice(m, tick_get_broadcast_device(), -1); + SEQ_printf(m, "tick_broadcast_mask: %*pb\n", + cpumask_pr_args(tick_get_broadcast_mask())); +#ifdef CONFIG_TICK_ONESHOT + SEQ_printf(m, "tick_broadcast_oneshot_mask: %*pb\n", + cpumask_pr_args(tick_get_broadcast_oneshot_mask())); +#endif + SEQ_printf(m, "\n"); +#endif +} +#endif + +static inline void timer_list_header(struct seq_file *m, u64 now) +{ + SEQ_printf(m, "Timer List Version: v0.8\n"); + SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); + SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + SEQ_printf(m, "\n"); +} + +void sysrq_timer_list_show(void) +{ + u64 now = ktime_to_ns(ktime_get()); + int cpu; + + timer_list_header(NULL, now); + + for_each_online_cpu(cpu) + print_cpu(NULL, cpu, now); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS + timer_list_show_tickdevices_header(NULL); + for_each_online_cpu(cpu) + print_tickdevice(NULL, tick_get_device(cpu), cpu); +#endif + return; +} + +#ifdef CONFIG_PROC_FS +static int timer_list_show(struct seq_file *m, void *v) +{ + struct timer_list_iter *iter = v; + + if (iter->cpu == -1 && !iter->second_pass) + timer_list_header(m, iter->now); + else if (!iter->second_pass) + print_cpu(m, iter->cpu, iter->now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + else if (iter->cpu == -1 && iter->second_pass) + timer_list_show_tickdevices_header(m); + else + print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); +#endif + return 0; +} + +static void *move_iter(struct timer_list_iter *iter, loff_t offset) +{ + for (; offset; offset--) { + iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); + if (iter->cpu >= nr_cpu_ids) { +#ifdef CONFIG_GENERIC_CLOCKEVENTS + if (!iter->second_pass) { + iter->cpu = -1; + iter->second_pass = true; + } else + return NULL; +#else + return NULL; +#endif + } + } + return iter; +} + +static void *timer_list_start(struct seq_file *file, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + + if (!*offset) + iter->now = ktime_to_ns(ktime_get()); + iter->cpu = -1; + iter->second_pass = false; + return move_iter(iter, *offset); +} + +static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + ++*offset; + return move_iter(iter, 1); +} + +static void timer_list_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations timer_list_sops = { + .start = timer_list_start, + .next = timer_list_next, + .stop = timer_list_stop, + .show = timer_list_show, +}; + +static int __init init_timer_list_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = proc_create_seq_private("timer_list", 0400, NULL, &timer_list_sops, + sizeof(struct timer_list_iter), NULL); + if (!pe) + return -ENOMEM; + return 0; +} +__initcall(init_timer_list_procfs); +#endif |