hrtimer.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
hrtimer.c (67658B)
      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
      4 *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
      5 *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
      6 *
      7 *  High-resolution kernel timers
      8 *
      9 *  In contrast to the low-resolution timeout API, aka timer wheel,
     10 *  hrtimers provide finer resolution and accuracy depending on system
     11 *  configuration and capabilities.
     12 *
     13 *  Started by: Thomas Gleixner and Ingo Molnar
     14 *
     15 *  Credits:
     16 *	Based on the original timer wheel code
     17 *
     18 *	Help, testing, suggestions, bugfixes, improvements were
     19 *	provided by:
     20 *
     21 *	George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
     22 *	et al.
     23 */
     24
     25#include <linux/cpu.h>
     26#include <linux/export.h>
     27#include <linux/percpu.h>
     28#include <linux/hrtimer.h>
     29#include <linux/notifier.h>
     30#include <linux/syscalls.h>
     31#include <linux/interrupt.h>
     32#include <linux/tick.h>
     33#include <linux/err.h>
     34#include <linux/debugobjects.h>
     35#include <linux/sched/signal.h>
     36#include <linux/sched/sysctl.h>
     37#include <linux/sched/rt.h>
     38#include <linux/sched/deadline.h>
     39#include <linux/sched/nohz.h>
     40#include <linux/sched/debug.h>
     41#include <linux/timer.h>
     42#include <linux/freezer.h>
     43#include <linux/compat.h>
     44
     45#include <linux/uaccess.h>
     46
     47#include <trace/events/timer.h>
     48
     49#include "tick-internal.h"
     50
     51/*
     52 * Masks for selecting the soft and hard context timers from
     53 * cpu_base->active
     54 */
     55#define MASK_SHIFT		(HRTIMER_BASE_MONOTONIC_SOFT)
     56#define HRTIMER_ACTIVE_HARD	((1U << MASK_SHIFT) - 1)
     57#define HRTIMER_ACTIVE_SOFT	(HRTIMER_ACTIVE_HARD << MASK_SHIFT)
     58#define HRTIMER_ACTIVE_ALL	(HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
     59
     60/*
     61 * The timer bases:
     62 *
     63 * There are more clockids than hrtimer bases. Thus, we index
     64 * into the timer bases by the hrtimer_base_type enum. When trying
     65 * to reach a base using a clockid, hrtimer_clockid_to_base()
     66 * is used to convert from clockid to the proper hrtimer_base_type.
     67 */
     68DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
     69{
     70	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
     71	.clock_base =
     72	{
     73		{
     74			.index = HRTIMER_BASE_MONOTONIC,
     75			.clockid = CLOCK_MONOTONIC,
     76			.get_time = &ktime_get,
     77		},
     78		{
     79			.index = HRTIMER_BASE_REALTIME,
     80			.clockid = CLOCK_REALTIME,
     81			.get_time = &ktime_get_real,
     82		},
     83		{
     84			.index = HRTIMER_BASE_BOOTTIME,
     85			.clockid = CLOCK_BOOTTIME,
     86			.get_time = &ktime_get_boottime,
     87		},
     88		{
     89			.index = HRTIMER_BASE_TAI,
     90			.clockid = CLOCK_TAI,
     91			.get_time = &ktime_get_clocktai,
     92		},
     93		{
     94			.index = HRTIMER_BASE_MONOTONIC_SOFT,
     95			.clockid = CLOCK_MONOTONIC,
     96			.get_time = &ktime_get,
     97		},
     98		{
     99			.index = HRTIMER_BASE_REALTIME_SOFT,
    100			.clockid = CLOCK_REALTIME,
    101			.get_time = &ktime_get_real,
    102		},
    103		{
    104			.index = HRTIMER_BASE_BOOTTIME_SOFT,
    105			.clockid = CLOCK_BOOTTIME,
    106			.get_time = &ktime_get_boottime,
    107		},
    108		{
    109			.index = HRTIMER_BASE_TAI_SOFT,
    110			.clockid = CLOCK_TAI,
    111			.get_time = &ktime_get_clocktai,
    112		},
    113	}
    114};
    115
    116static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
    117	/* Make sure we catch unsupported clockids */
    118	[0 ... MAX_CLOCKS - 1]	= HRTIMER_MAX_CLOCK_BASES,
    119
    120	[CLOCK_REALTIME]	= HRTIMER_BASE_REALTIME,
    121	[CLOCK_MONOTONIC]	= HRTIMER_BASE_MONOTONIC,
    122	[CLOCK_BOOTTIME]	= HRTIMER_BASE_BOOTTIME,
    123	[CLOCK_TAI]		= HRTIMER_BASE_TAI,
    124};
    125
    126/*
    127 * Functions and macros which are different for UP/SMP systems are kept in a
    128 * single place
    129 */
    130#ifdef CONFIG_SMP
    131
    132/*
    133 * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
    134 * such that hrtimer_callback_running() can unconditionally dereference
    135 * timer->base->cpu_base
    136 */
    137static struct hrtimer_cpu_base migration_cpu_base = {
    138	.clock_base = { {
    139		.cpu_base = &migration_cpu_base,
    140		.seq      = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
    141						     &migration_cpu_base.lock),
    142	}, },
    143};
    144
    145#define migration_base	migration_cpu_base.clock_base[0]
    146
    147static inline bool is_migration_base(struct hrtimer_clock_base *base)
    148{
    149	return base == &migration_base;
    150}
    151
    152/*
    153 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
    154 * means that all timers which are tied to this base via timer->base are
    155 * locked, and the base itself is locked too.
    156 *
    157 * So __run_timers/migrate_timers can safely modify all timers which could
    158 * be found on the lists/queues.
    159 *
    160 * When the timer's base is locked, and the timer removed from list, it is
    161 * possible to set timer->base = &migration_base and drop the lock: the timer
    162 * remains locked.
    163 */
    164static
    165struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
    166					     unsigned long *flags)
    167{
    168	struct hrtimer_clock_base *base;
    169
    170	for (;;) {
    171		base = READ_ONCE(timer->base);
    172		if (likely(base != &migration_base)) {
    173			raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
    174			if (likely(base == timer->base))
    175				return base;
    176			/* The timer has migrated to another CPU: */
    177			raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
    178		}
    179		cpu_relax();
    180	}
    181}
    182
    183/*
    184 * We do not migrate the timer when it is expiring before the next
    185 * event on the target cpu. When high resolution is enabled, we cannot
    186 * reprogram the target cpu hardware and we would cause it to fire
    187 * late. To keep it simple, we handle the high resolution enabled and
    188 * disabled case similar.
    189 *
    190 * Called with cpu_base->lock of target cpu held.
    191 */
    192static int
    193hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
    194{
    195	ktime_t expires;
    196
    197	expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
    198	return expires < new_base->cpu_base->expires_next;
    199}
    200
    201static inline
    202struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
    203					 int pinned)
    204{
    205#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
    206	if (static_branch_likely(&timers_migration_enabled) && !pinned)
    207		return &per_cpu(hrtimer_bases, get_nohz_timer_target());
    208#endif
    209	return base;
    210}
    211
    212/*
    213 * We switch the timer base to a power-optimized selected CPU target,
    214 * if:
    215 *	- NO_HZ_COMMON is enabled
    216 *	- timer migration is enabled
    217 *	- the timer callback is not running
    218 *	- the timer is not the first expiring timer on the new target
    219 *
    220 * If one of the above requirements is not fulfilled we move the timer
    221 * to the current CPU or leave it on the previously assigned CPU if
    222 * the timer callback is currently running.
    223 */
    224static inline struct hrtimer_clock_base *
    225switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
    226		    int pinned)
    227{
    228	struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
    229	struct hrtimer_clock_base *new_base;
    230	int basenum = base->index;
    231
    232	this_cpu_base = this_cpu_ptr(&hrtimer_bases);
    233	new_cpu_base = get_target_base(this_cpu_base, pinned);
    234again:
    235	new_base = &new_cpu_base->clock_base[basenum];
    236
    237	if (base != new_base) {
    238		/*
    239		 * We are trying to move timer to new_base.
    240		 * However we can't change timer's base while it is running,
    241		 * so we keep it on the same CPU. No hassle vs. reprogramming
    242		 * the event source in the high resolution case. The softirq
    243		 * code will take care of this when the timer function has
    244		 * completed. There is no conflict as we hold the lock until
    245		 * the timer is enqueued.
    246		 */
    247		if (unlikely(hrtimer_callback_running(timer)))
    248			return base;
    249
    250		/* See the comment in lock_hrtimer_base() */
    251		WRITE_ONCE(timer->base, &migration_base);
    252		raw_spin_unlock(&base->cpu_base->lock);
    253		raw_spin_lock(&new_base->cpu_base->lock);
    254
    255		if (new_cpu_base != this_cpu_base &&
    256		    hrtimer_check_target(timer, new_base)) {
    257			raw_spin_unlock(&new_base->cpu_base->lock);
    258			raw_spin_lock(&base->cpu_base->lock);
    259			new_cpu_base = this_cpu_base;
    260			WRITE_ONCE(timer->base, base);
    261			goto again;
    262		}
    263		WRITE_ONCE(timer->base, new_base);
    264	} else {
    265		if (new_cpu_base != this_cpu_base &&
    266		    hrtimer_check_target(timer, new_base)) {
    267			new_cpu_base = this_cpu_base;
    268			goto again;
    269		}
    270	}
    271	return new_base;
    272}
    273
    274#else /* CONFIG_SMP */
    275
    276static inline bool is_migration_base(struct hrtimer_clock_base *base)
    277{
    278	return false;
    279}
    280
    281static inline struct hrtimer_clock_base *
    282lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
    283{
    284	struct hrtimer_clock_base *base = timer->base;
    285
    286	raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
    287
    288	return base;
    289}
    290
    291# define switch_hrtimer_base(t, b, p)	(b)
    292
    293#endif	/* !CONFIG_SMP */
    294
    295/*
    296 * Functions for the union type storage format of ktime_t which are
    297 * too large for inlining:
    298 */
    299#if BITS_PER_LONG < 64
    300/*
    301 * Divide a ktime value by a nanosecond value
    302 */
    303s64 __ktime_divns(const ktime_t kt, s64 div)
    304{
    305	int sft = 0;
    306	s64 dclc;
    307	u64 tmp;
    308
    309	dclc = ktime_to_ns(kt);
    310	tmp = dclc < 0 ? -dclc : dclc;
    311
    312	/* Make sure the divisor is less than 2^32: */
    313	while (div >> 32) {
    314		sft++;
    315		div >>= 1;
    316	}
    317	tmp >>= sft;
    318	do_div(tmp, (u32) div);
    319	return dclc < 0 ? -tmp : tmp;
    320}
    321EXPORT_SYMBOL_GPL(__ktime_divns);
    322#endif /* BITS_PER_LONG >= 64 */
    323
    324/*
    325 * Add two ktime values and do a safety check for overflow:
    326 */
    327ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
    328{
    329	ktime_t res = ktime_add_unsafe(lhs, rhs);
    330
    331	/*
    332	 * We use KTIME_SEC_MAX here, the maximum timeout which we can
    333	 * return to user space in a timespec:
    334	 */
    335	if (res < 0 || res < lhs || res < rhs)
    336		res = ktime_set(KTIME_SEC_MAX, 0);
    337
    338	return res;
    339}
    340
    341EXPORT_SYMBOL_GPL(ktime_add_safe);
    342
    343#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
    344
    345static const struct debug_obj_descr hrtimer_debug_descr;
    346
    347static void *hrtimer_debug_hint(void *addr)
    348{
    349	return ((struct hrtimer *) addr)->function;
    350}
    351
    352/*
    353 * fixup_init is called when:
    354 * - an active object is initialized
    355 */
    356static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
    357{
    358	struct hrtimer *timer = addr;
    359
    360	switch (state) {
    361	case ODEBUG_STATE_ACTIVE:
    362		hrtimer_cancel(timer);
    363		debug_object_init(timer, &hrtimer_debug_descr);
    364		return true;
    365	default:
    366		return false;
    367	}
    368}
    369
    370/*
    371 * fixup_activate is called when:
    372 * - an active object is activated
    373 * - an unknown non-static object is activated
    374 */
    375static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
    376{
    377	switch (state) {
    378	case ODEBUG_STATE_ACTIVE:
    379		WARN_ON(1);
    380		fallthrough;
    381	default:
    382		return false;
    383	}
    384}
    385
    386/*
    387 * fixup_free is called when:
    388 * - an active object is freed
    389 */
    390static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
    391{
    392	struct hrtimer *timer = addr;
    393
    394	switch (state) {
    395	case ODEBUG_STATE_ACTIVE:
    396		hrtimer_cancel(timer);
    397		debug_object_free(timer, &hrtimer_debug_descr);
    398		return true;
    399	default:
    400		return false;
    401	}
    402}
    403
    404static const struct debug_obj_descr hrtimer_debug_descr = {
    405	.name		= "hrtimer",
    406	.debug_hint	= hrtimer_debug_hint,
    407	.fixup_init	= hrtimer_fixup_init,
    408	.fixup_activate	= hrtimer_fixup_activate,
    409	.fixup_free	= hrtimer_fixup_free,
    410};
    411
    412static inline void debug_hrtimer_init(struct hrtimer *timer)
    413{
    414	debug_object_init(timer, &hrtimer_debug_descr);
    415}
    416
    417static inline void debug_hrtimer_activate(struct hrtimer *timer,
    418					  enum hrtimer_mode mode)
    419{
    420	debug_object_activate(timer, &hrtimer_debug_descr);
    421}
    422
    423static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
    424{
    425	debug_object_deactivate(timer, &hrtimer_debug_descr);
    426}
    427
    428static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
    429			   enum hrtimer_mode mode);
    430
    431void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
    432			   enum hrtimer_mode mode)
    433{
    434	debug_object_init_on_stack(timer, &hrtimer_debug_descr);
    435	__hrtimer_init(timer, clock_id, mode);
    436}
    437EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
    438
    439static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
    440				   clockid_t clock_id, enum hrtimer_mode mode);
    441
    442void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
    443				   clockid_t clock_id, enum hrtimer_mode mode)
    444{
    445	debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
    446	__hrtimer_init_sleeper(sl, clock_id, mode);
    447}
    448EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
    449
    450void destroy_hrtimer_on_stack(struct hrtimer *timer)
    451{
    452	debug_object_free(timer, &hrtimer_debug_descr);
    453}
    454EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
    455
    456#else
    457
    458static inline void debug_hrtimer_init(struct hrtimer *timer) { }
    459static inline void debug_hrtimer_activate(struct hrtimer *timer,
    460					  enum hrtimer_mode mode) { }
    461static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
    462#endif
    463
    464static inline void
    465debug_init(struct hrtimer *timer, clockid_t clockid,
    466	   enum hrtimer_mode mode)
    467{
    468	debug_hrtimer_init(timer);
    469	trace_hrtimer_init(timer, clockid, mode);
    470}
    471
    472static inline void debug_activate(struct hrtimer *timer,
    473				  enum hrtimer_mode mode)
    474{
    475	debug_hrtimer_activate(timer, mode);
    476	trace_hrtimer_start(timer, mode);
    477}
    478
    479static inline void debug_deactivate(struct hrtimer *timer)
    480{
    481	debug_hrtimer_deactivate(timer);
    482	trace_hrtimer_cancel(timer);
    483}
    484
    485static struct hrtimer_clock_base *
    486__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
    487{
    488	unsigned int idx;
    489
    490	if (!*active)
    491		return NULL;
    492
    493	idx = __ffs(*active);
    494	*active &= ~(1U << idx);
    495
    496	return &cpu_base->clock_base[idx];
    497}
    498
    499#define for_each_active_base(base, cpu_base, active)	\
    500	while ((base = __next_base((cpu_base), &(active))))
    501
    502static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
    503					 const struct hrtimer *exclude,
    504					 unsigned int active,
    505					 ktime_t expires_next)
    506{
    507	struct hrtimer_clock_base *base;
    508	ktime_t expires;
    509
    510	for_each_active_base(base, cpu_base, active) {
    511		struct timerqueue_node *next;
    512		struct hrtimer *timer;
    513
    514		next = timerqueue_getnext(&base->active);
    515		timer = container_of(next, struct hrtimer, node);
    516		if (timer == exclude) {
    517			/* Get to the next timer in the queue. */
    518			next = timerqueue_iterate_next(next);
    519			if (!next)
    520				continue;
    521
    522			timer = container_of(next, struct hrtimer, node);
    523		}
    524		expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
    525		if (expires < expires_next) {
    526			expires_next = expires;
    527
    528			/* Skip cpu_base update if a timer is being excluded. */
    529			if (exclude)
    530				continue;
    531
    532			if (timer->is_soft)
    533				cpu_base->softirq_next_timer = timer;
    534			else
    535				cpu_base->next_timer = timer;
    536		}
    537	}
    538	/*
    539	 * clock_was_set() might have changed base->offset of any of
    540	 * the clock bases so the result might be negative. Fix it up
    541	 * to prevent a false positive in clockevents_program_event().
    542	 */
    543	if (expires_next < 0)
    544		expires_next = 0;
    545	return expires_next;
    546}
    547
    548/*
    549 * Recomputes cpu_base::*next_timer and returns the earliest expires_next
    550 * but does not set cpu_base::*expires_next, that is done by
    551 * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
    552 * cpu_base::*expires_next right away, reprogramming logic would no longer
    553 * work.
    554 *
    555 * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
    556 * those timers will get run whenever the softirq gets handled, at the end of
    557 * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
    558 *
    559 * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
    560 * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
    561 * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
    562 *
    563 * @active_mask must be one of:
    564 *  - HRTIMER_ACTIVE_ALL,
    565 *  - HRTIMER_ACTIVE_SOFT, or
    566 *  - HRTIMER_ACTIVE_HARD.
    567 */
    568static ktime_t
    569__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
    570{
    571	unsigned int active;
    572	struct hrtimer *next_timer = NULL;
    573	ktime_t expires_next = KTIME_MAX;
    574
    575	if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
    576		active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
    577		cpu_base->softirq_next_timer = NULL;
    578		expires_next = __hrtimer_next_event_base(cpu_base, NULL,
    579							 active, KTIME_MAX);
    580
    581		next_timer = cpu_base->softirq_next_timer;
    582	}
    583
    584	if (active_mask & HRTIMER_ACTIVE_HARD) {
    585		active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
    586		cpu_base->next_timer = next_timer;
    587		expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
    588							 expires_next);
    589	}
    590
    591	return expires_next;
    592}
    593
    594static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
    595{
    596	ktime_t expires_next, soft = KTIME_MAX;
    597
    598	/*
    599	 * If the soft interrupt has already been activated, ignore the
    600	 * soft bases. They will be handled in the already raised soft
    601	 * interrupt.
    602	 */
    603	if (!cpu_base->softirq_activated) {
    604		soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
    605		/*
    606		 * Update the soft expiry time. clock_settime() might have
    607		 * affected it.
    608		 */
    609		cpu_base->softirq_expires_next = soft;
    610	}
    611
    612	expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
    613	/*
    614	 * If a softirq timer is expiring first, update cpu_base->next_timer
    615	 * and program the hardware with the soft expiry time.
    616	 */
    617	if (expires_next > soft) {
    618		cpu_base->next_timer = cpu_base->softirq_next_timer;
    619		expires_next = soft;
    620	}
    621
    622	return expires_next;
    623}
    624
    625static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
    626{
    627	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
    628	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
    629	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
    630
    631	ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
    632					    offs_real, offs_boot, offs_tai);
    633
    634	base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
    635	base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
    636	base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
    637
    638	return now;
    639}
    640
    641/*
    642 * Is the high resolution mode active ?
    643 */
    644static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
    645{
    646	return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
    647		cpu_base->hres_active : 0;
    648}
    649
    650static inline int hrtimer_hres_active(void)
    651{
    652	return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
    653}
    654
    655static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
    656				struct hrtimer *next_timer,
    657				ktime_t expires_next)
    658{
    659	cpu_base->expires_next = expires_next;
    660
    661	/*
    662	 * If hres is not active, hardware does not have to be
    663	 * reprogrammed yet.
    664	 *
    665	 * If a hang was detected in the last timer interrupt then we
    666	 * leave the hang delay active in the hardware. We want the
    667	 * system to make progress. That also prevents the following
    668	 * scenario:
    669	 * T1 expires 50ms from now
    670	 * T2 expires 5s from now
    671	 *
    672	 * T1 is removed, so this code is called and would reprogram
    673	 * the hardware to 5s from now. Any hrtimer_start after that
    674	 * will not reprogram the hardware due to hang_detected being
    675	 * set. So we'd effectively block all timers until the T2 event
    676	 * fires.
    677	 */
    678	if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
    679		return;
    680
    681	tick_program_event(expires_next, 1);
    682}
    683
    684/*
    685 * Reprogram the event source with checking both queues for the
    686 * next event
    687 * Called with interrupts disabled and base->lock held
    688 */
    689static void
    690hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
    691{
    692	ktime_t expires_next;
    693
    694	expires_next = hrtimer_update_next_event(cpu_base);
    695
    696	if (skip_equal && expires_next == cpu_base->expires_next)
    697		return;
    698
    699	__hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next);
    700}
    701
    702/* High resolution timer related functions */
    703#ifdef CONFIG_HIGH_RES_TIMERS
    704
    705/*
    706 * High resolution timer enabled ?
    707 */
    708static bool hrtimer_hres_enabled __read_mostly  = true;
    709unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
    710EXPORT_SYMBOL_GPL(hrtimer_resolution);
    711
    712/*
    713 * Enable / Disable high resolution mode
    714 */
    715static int __init setup_hrtimer_hres(char *str)
    716{
    717	return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
    718}
    719
    720__setup("highres=", setup_hrtimer_hres);
    721
    722/*
    723 * hrtimer_high_res_enabled - query, if the highres mode is enabled
    724 */
    725static inline int hrtimer_is_hres_enabled(void)
    726{
    727	return hrtimer_hres_enabled;
    728}
    729
    730static void retrigger_next_event(void *arg);
    731
    732/*
    733 * Switch to high resolution mode
    734 */
    735static void hrtimer_switch_to_hres(void)
    736{
    737	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
    738
    739	if (tick_init_highres()) {
    740		pr_warn("Could not switch to high resolution mode on CPU %u\n",
    741			base->cpu);
    742		return;
    743	}
    744	base->hres_active = 1;
    745	hrtimer_resolution = HIGH_RES_NSEC;
    746
    747	tick_setup_sched_timer();
    748	/* "Retrigger" the interrupt to get things going */
    749	retrigger_next_event(NULL);
    750}
    751
    752#else
    753
    754static inline int hrtimer_is_hres_enabled(void) { return 0; }
    755static inline void hrtimer_switch_to_hres(void) { }
    756
    757#endif /* CONFIG_HIGH_RES_TIMERS */
    758/*
    759 * Retrigger next event is called after clock was set with interrupts
    760 * disabled through an SMP function call or directly from low level
    761 * resume code.
    762 *
    763 * This is only invoked when:
    764 *	- CONFIG_HIGH_RES_TIMERS is enabled.
    765 *	- CONFIG_NOHZ_COMMON is enabled
    766 *
    767 * For the other cases this function is empty and because the call sites
    768 * are optimized out it vanishes as well, i.e. no need for lots of
    769 * #ifdeffery.
    770 */
    771static void retrigger_next_event(void *arg)
    772{
    773	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
    774
    775	/*
    776	 * When high resolution mode or nohz is active, then the offsets of
    777	 * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the
    778	 * next tick will take care of that.
    779	 *
    780	 * If high resolution mode is active then the next expiring timer
    781	 * must be reevaluated and the clock event device reprogrammed if
    782	 * necessary.
    783	 *
    784	 * In the NOHZ case the update of the offset and the reevaluation
    785	 * of the next expiring timer is enough. The return from the SMP
    786	 * function call will take care of the reprogramming in case the
    787	 * CPU was in a NOHZ idle sleep.
    788	 */
    789	if (!__hrtimer_hres_active(base) && !tick_nohz_active)
    790		return;
    791
    792	raw_spin_lock(&base->lock);
    793	hrtimer_update_base(base);
    794	if (__hrtimer_hres_active(base))
    795		hrtimer_force_reprogram(base, 0);
    796	else
    797		hrtimer_update_next_event(base);
    798	raw_spin_unlock(&base->lock);
    799}
    800
    801/*
    802 * When a timer is enqueued and expires earlier than the already enqueued
    803 * timers, we have to check, whether it expires earlier than the timer for
    804 * which the clock event device was armed.
    805 *
    806 * Called with interrupts disabled and base->cpu_base.lock held
    807 */
    808static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
    809{
    810	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
    811	struct hrtimer_clock_base *base = timer->base;
    812	ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
    813
    814	WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
    815
    816	/*
    817	 * CLOCK_REALTIME timer might be requested with an absolute
    818	 * expiry time which is less than base->offset. Set it to 0.
    819	 */
    820	if (expires < 0)
    821		expires = 0;
    822
    823	if (timer->is_soft) {
    824		/*
    825		 * soft hrtimer could be started on a remote CPU. In this
    826		 * case softirq_expires_next needs to be updated on the
    827		 * remote CPU. The soft hrtimer will not expire before the
    828		 * first hard hrtimer on the remote CPU -
    829		 * hrtimer_check_target() prevents this case.
    830		 */
    831		struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;
    832
    833		if (timer_cpu_base->softirq_activated)
    834			return;
    835
    836		if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
    837			return;
    838
    839		timer_cpu_base->softirq_next_timer = timer;
    840		timer_cpu_base->softirq_expires_next = expires;
    841
    842		if (!ktime_before(expires, timer_cpu_base->expires_next) ||
    843		    !reprogram)
    844			return;
    845	}
    846
    847	/*
    848	 * If the timer is not on the current cpu, we cannot reprogram
    849	 * the other cpus clock event device.
    850	 */
    851	if (base->cpu_base != cpu_base)
    852		return;
    853
    854	if (expires >= cpu_base->expires_next)
    855		return;
    856
    857	/*
    858	 * If the hrtimer interrupt is running, then it will reevaluate the
    859	 * clock bases and reprogram the clock event device.
    860	 */
    861	if (cpu_base->in_hrtirq)
    862		return;
    863
    864	cpu_base->next_timer = timer;
    865
    866	__hrtimer_reprogram(cpu_base, timer, expires);
    867}
    868
    869static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
    870			     unsigned int active)
    871{
    872	struct hrtimer_clock_base *base;
    873	unsigned int seq;
    874	ktime_t expires;
    875
    876	/*
    877	 * Update the base offsets unconditionally so the following
    878	 * checks whether the SMP function call is required works.
    879	 *
    880	 * The update is safe even when the remote CPU is in the hrtimer
    881	 * interrupt or the hrtimer soft interrupt and expiring affected
    882	 * bases. Either it will see the update before handling a base or
    883	 * it will see it when it finishes the processing and reevaluates
    884	 * the next expiring timer.
    885	 */
    886	seq = cpu_base->clock_was_set_seq;
    887	hrtimer_update_base(cpu_base);
    888
    889	/*
    890	 * If the sequence did not change over the update then the
    891	 * remote CPU already handled it.
    892	 */
    893	if (seq == cpu_base->clock_was_set_seq)
    894		return false;
    895
    896	/*
    897	 * If the remote CPU is currently handling an hrtimer interrupt, it
    898	 * will reevaluate the first expiring timer of all clock bases
    899	 * before reprogramming. Nothing to do here.
    900	 */
    901	if (cpu_base->in_hrtirq)
    902		return false;
    903
    904	/*
    905	 * Walk the affected clock bases and check whether the first expiring
    906	 * timer in a clock base is moving ahead of the first expiring timer of
    907	 * @cpu_base. If so, the IPI must be invoked because per CPU clock
    908	 * event devices cannot be remotely reprogrammed.
    909	 */
    910	active &= cpu_base->active_bases;
    911
    912	for_each_active_base(base, cpu_base, active) {
    913		struct timerqueue_node *next;
    914
    915		next = timerqueue_getnext(&base->active);
    916		expires = ktime_sub(next->expires, base->offset);
    917		if (expires < cpu_base->expires_next)
    918			return true;
    919
    920		/* Extra check for softirq clock bases */
    921		if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT)
    922			continue;
    923		if (cpu_base->softirq_activated)
    924			continue;
    925		if (expires < cpu_base->softirq_expires_next)
    926			return true;
    927	}
    928	return false;
    929}
    930
    931/*
    932 * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and
    933 * CLOCK_BOOTTIME (for late sleep time injection).
    934 *
    935 * This requires to update the offsets for these clocks
    936 * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this
    937 * also requires to eventually reprogram the per CPU clock event devices
    938 * when the change moves an affected timer ahead of the first expiring
    939 * timer on that CPU. Obviously remote per CPU clock event devices cannot
    940 * be reprogrammed. The other reason why an IPI has to be sent is when the
    941 * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets
    942 * in the tick, which obviously might be stopped, so this has to bring out
    943 * the remote CPU which might sleep in idle to get this sorted.
    944 */
    945void clock_was_set(unsigned int bases)
    946{
    947	struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases);
    948	cpumask_var_t mask;
    949	int cpu;
    950
    951	if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active)
    952		goto out_timerfd;
    953
    954	if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
    955		on_each_cpu(retrigger_next_event, NULL, 1);
    956		goto out_timerfd;
    957	}
    958
    959	/* Avoid interrupting CPUs if possible */
    960	cpus_read_lock();
    961	for_each_online_cpu(cpu) {
    962		unsigned long flags;
    963
    964		cpu_base = &per_cpu(hrtimer_bases, cpu);
    965		raw_spin_lock_irqsave(&cpu_base->lock, flags);
    966
    967		if (update_needs_ipi(cpu_base, bases))
    968			cpumask_set_cpu(cpu, mask);
    969
    970		raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
    971	}
    972
    973	preempt_disable();
    974	smp_call_function_many(mask, retrigger_next_event, NULL, 1);
    975	preempt_enable();
    976	cpus_read_unlock();
    977	free_cpumask_var(mask);
    978
    979out_timerfd:
    980	timerfd_clock_was_set();
    981}
    982
/*
 * Handler of the hrtimer_work work item: runs clock_was_set() for the
 * CLOCK_SET_WALL bases. @work is not used.
 */
static void clock_was_set_work(struct work_struct *work)
{
	clock_was_set(CLOCK_SET_WALL);
}

/* Work item which executes clock_was_set_work() once it has been queued */
static DECLARE_WORK(hrtimer_work, clock_was_set_work);
    989
/*
 * Called from timekeeping code to reprogram the hrtimer interrupt device
 * on all cpus and to notify timerfd.
 *
 * Nothing is done in the context of the caller: hrtimer_work is queued
 * via schedule_work() and its handler invokes clock_was_set().
 */
void clock_was_set_delayed(void)
{
	schedule_work(&hrtimer_work);
}
    998
/*
 * Called during resume either directly via timekeeping_resume()
 * or in the case of s2idle from tick_unfreeze() to ensure that the
 * hrtimers are up to date.
 *
 * Must be called with interrupts disabled, which is asserted via lockdep.
 */
void hrtimers_resume_local(void)
{
	lockdep_assert_irqs_disabled();
	/* Retrigger on the local CPU */
	retrigger_next_event(NULL);
}
   1010
/*
 * Counterpart to lock_hrtimer_base above:
 *
 * Releases the lock of the CPU base to which the clock base of @timer
 * belongs and restores the interrupt state stored in @flags.
 */
static inline
void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
{
	raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
}
   1019
   1020/**
   1021 * hrtimer_forward - forward the timer expiry
   1022 * @timer:	hrtimer to forward
   1023 * @now:	forward past this time
   1024 * @interval:	the interval to forward
   1025 *
   1026 * Forward the timer expiry so it will expire in the future.
   1027 * Returns the number of overruns.
   1028 *
   1029 * Can be safely called from the callback function of @timer. If
   1030 * called from other contexts @timer must neither be enqueued nor
   1031 * running the callback and the caller needs to take care of
   1032 * serialization.
   1033 *
   1034 * Note: This only updates the timer expiry value and does not requeue
   1035 * the timer.
   1036 */
   1037u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
   1038{
   1039	u64 orun = 1;
   1040	ktime_t delta;
   1041
   1042	delta = ktime_sub(now, hrtimer_get_expires(timer));
   1043
   1044	if (delta < 0)
   1045		return 0;
   1046
   1047	if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
   1048		return 0;
   1049
   1050	if (interval < hrtimer_resolution)
   1051		interval = hrtimer_resolution;
   1052
   1053	if (unlikely(delta >= interval)) {
   1054		s64 incr = ktime_to_ns(interval);
   1055
   1056		orun = ktime_divns(delta, incr);
   1057		hrtimer_add_expires_ns(timer, incr * orun);
   1058		if (hrtimer_get_expires_tv64(timer) > now)
   1059			return orun;
   1060		/*
   1061		 * This (and the ktime_add() below) is the
   1062		 * correction for exact:
   1063		 */
   1064		orun++;
   1065	}
   1066	hrtimer_add_expires(timer, interval);
   1067
   1068	return orun;
   1069}
   1070EXPORT_SYMBOL_GPL(hrtimer_forward);
   1071
   1072/*
   1073 * enqueue_hrtimer - internal function to (re)start a timer
   1074 *
   1075 * The timer is inserted in expiry order. Insertion into the
   1076 * red black tree is O(log(n)). Must hold the base lock.
   1077 *
   1078 * Returns 1 when the new timer is the leftmost timer in the tree.
   1079 */
   1080static int enqueue_hrtimer(struct hrtimer *timer,
   1081			   struct hrtimer_clock_base *base,
   1082			   enum hrtimer_mode mode)
   1083{
   1084	debug_activate(timer, mode);
   1085
   1086	base->cpu_base->active_bases |= 1 << base->index;
   1087
   1088	/* Pairs with the lockless read in hrtimer_is_queued() */
   1089	WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);
   1090
   1091	return timerqueue_add(&base->active, &timer->node);
   1092}
   1093
   1094/*
   1095 * __remove_hrtimer - internal function to remove a timer
   1096 *
   1097 * Caller must hold the base lock.
   1098 *
   1099 * High resolution timer mode reprograms the clock event device when the
   1100 * timer is the one which expires next. The caller can disable this by setting
   1101 * reprogram to zero. This is useful, when the context does a reprogramming
   1102 * anyway (e.g. timer interrupt)
   1103 */
   1104static void __remove_hrtimer(struct hrtimer *timer,
   1105			     struct hrtimer_clock_base *base,
   1106			     u8 newstate, int reprogram)
   1107{
   1108	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
   1109	u8 state = timer->state;
   1110
   1111	/* Pairs with the lockless read in hrtimer_is_queued() */
   1112	WRITE_ONCE(timer->state, newstate);
   1113	if (!(state & HRTIMER_STATE_ENQUEUED))
   1114		return;
   1115
   1116	if (!timerqueue_del(&base->active, &timer->node))
   1117		cpu_base->active_bases &= ~(1 << base->index);
   1118
   1119	/*
   1120	 * Note: If reprogram is false we do not update
   1121	 * cpu_base->next_timer. This happens when we remove the first
   1122	 * timer on a remote cpu. No harm as we never dereference
   1123	 * cpu_base->next_timer. So the worst thing what can happen is
   1124	 * an superfluous call to hrtimer_force_reprogram() on the
   1125	 * remote cpu later on if the same timer gets enqueued again.
   1126	 */
   1127	if (reprogram && timer == cpu_base->next_timer)
   1128		hrtimer_force_reprogram(cpu_base, 1);
   1129}
   1130
   1131/*
   1132 * remove hrtimer, called with base lock held
   1133 */
   1134static inline int
   1135remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
   1136	       bool restart, bool keep_local)
   1137{
   1138	u8 state = timer->state;
   1139
   1140	if (state & HRTIMER_STATE_ENQUEUED) {
   1141		bool reprogram;
   1142
   1143		/*
   1144		 * Remove the timer and force reprogramming when high
   1145		 * resolution mode is active and the timer is on the current
   1146		 * CPU. If we remove a timer on another CPU, reprogramming is
   1147		 * skipped. The interrupt event on this CPU is fired and
   1148		 * reprogramming happens in the interrupt handler. This is a
   1149		 * rare case and less expensive than a smp call.
   1150		 */
   1151		debug_deactivate(timer);
   1152		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
   1153
   1154		/*
   1155		 * If the timer is not restarted then reprogramming is
   1156		 * required if the timer is local. If it is local and about
   1157		 * to be restarted, avoid programming it twice (on removal
   1158		 * and a moment later when it's requeued).
   1159		 */
   1160		if (!restart)
   1161			state = HRTIMER_STATE_INACTIVE;
   1162		else
   1163			reprogram &= !keep_local;
   1164
   1165		__remove_hrtimer(timer, base, state, reprogram);
   1166		return 1;
   1167	}
   1168	return 0;
   1169}
   1170
   1171static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
   1172					    const enum hrtimer_mode mode)
   1173{
   1174#ifdef CONFIG_TIME_LOW_RES
   1175	/*
   1176	 * CONFIG_TIME_LOW_RES indicates that the system has no way to return
   1177	 * granular time values. For relative timers we add hrtimer_resolution
   1178	 * (i.e. one jiffie) to prevent short timeouts.
   1179	 */
   1180	timer->is_rel = mode & HRTIMER_MODE_REL;
   1181	if (timer->is_rel)
   1182		tim = ktime_add_safe(tim, hrtimer_resolution);
   1183#endif
   1184	return tim;
   1185}
   1186
   1187static void
   1188hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
   1189{
   1190	ktime_t expires;
   1191
   1192	/*
   1193	 * Find the next SOFT expiration.
   1194	 */
   1195	expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
   1196
   1197	/*
   1198	 * reprogramming needs to be triggered, even if the next soft
   1199	 * hrtimer expires at the same time than the next hard
   1200	 * hrtimer. cpu_base->softirq_expires_next needs to be updated!
   1201	 */
   1202	if (expires == KTIME_MAX)
   1203		return;
   1204
   1205	/*
   1206	 * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
   1207	 * cpu_base->*expires_next is only set by hrtimer_reprogram()
   1208	 */
   1209	hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
   1210}
   1211
   1212static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
   1213				    u64 delta_ns, const enum hrtimer_mode mode,
   1214				    struct hrtimer_clock_base *base)
   1215{
   1216	struct hrtimer_clock_base *new_base;
   1217	bool force_local, first;
   1218
   1219	/*
   1220	 * If the timer is on the local cpu base and is the first expiring
   1221	 * timer then this might end up reprogramming the hardware twice
   1222	 * (on removal and on enqueue). To avoid that by prevent the
   1223	 * reprogram on removal, keep the timer local to the current CPU
   1224	 * and enforce reprogramming after it is queued no matter whether
   1225	 * it is the new first expiring timer again or not.
   1226	 */
   1227	force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
   1228	force_local &= base->cpu_base->next_timer == timer;
   1229
   1230	/*
   1231	 * Remove an active timer from the queue. In case it is not queued
   1232	 * on the current CPU, make sure that remove_hrtimer() updates the
   1233	 * remote data correctly.
   1234	 *
   1235	 * If it's on the current CPU and the first expiring timer, then
   1236	 * skip reprogramming, keep the timer local and enforce
   1237	 * reprogramming later if it was the first expiring timer.  This
   1238	 * avoids programming the underlying clock event twice (once at
   1239	 * removal and once after enqueue).
   1240	 */
   1241	remove_hrtimer(timer, base, true, force_local);
   1242
   1243	if (mode & HRTIMER_MODE_REL)
   1244		tim = ktime_add_safe(tim, base->get_time());
   1245
   1246	tim = hrtimer_update_lowres(timer, tim, mode);
   1247
   1248	hrtimer_set_expires_range_ns(timer, tim, delta_ns);
   1249
   1250	/* Switch the timer base, if necessary: */
   1251	if (!force_local) {
   1252		new_base = switch_hrtimer_base(timer, base,
   1253					       mode & HRTIMER_MODE_PINNED);
   1254	} else {
   1255		new_base = base;
   1256	}
   1257
   1258	first = enqueue_hrtimer(timer, new_base, mode);
   1259	if (!force_local)
   1260		return first;
   1261
   1262	/*
   1263	 * Timer was forced to stay on the current CPU to avoid
   1264	 * reprogramming on removal and enqueue. Force reprogram the
   1265	 * hardware by evaluating the new first expiring timer.
   1266	 */
   1267	hrtimer_force_reprogram(new_base->cpu_base, 1);
   1268	return 0;
   1269}
   1270
   1271/**
   1272 * hrtimer_start_range_ns - (re)start an hrtimer
   1273 * @timer:	the timer to be added
   1274 * @tim:	expiry time
   1275 * @delta_ns:	"slack" range for the timer
   1276 * @mode:	timer mode: absolute (HRTIMER_MODE_ABS) or
   1277 *		relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
   1278 *		softirq based mode is considered for debug purpose only!
   1279 */
   1280void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
   1281			    u64 delta_ns, const enum hrtimer_mode mode)
   1282{
   1283	struct hrtimer_clock_base *base;
   1284	unsigned long flags;
   1285
   1286	/*
   1287	 * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
   1288	 * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
   1289	 * expiry mode because unmarked timers are moved to softirq expiry.
   1290	 */
   1291	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
   1292		WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
   1293	else
   1294		WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
   1295
   1296	base = lock_hrtimer_base(timer, &flags);
   1297
   1298	if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
   1299		hrtimer_reprogram(timer, true);
   1300
   1301	unlock_hrtimer_base(timer, &flags);
   1302}
   1303EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
   1304
   1305/**
   1306 * hrtimer_try_to_cancel - try to deactivate a timer
   1307 * @timer:	hrtimer to stop
   1308 *
   1309 * Returns:
   1310 *
   1311 *  *  0 when the timer was not active
   1312 *  *  1 when the timer was active
   1313 *  * -1 when the timer is currently executing the callback function and
   1314 *    cannot be stopped
   1315 */
   1316int hrtimer_try_to_cancel(struct hrtimer *timer)
   1317{
   1318	struct hrtimer_clock_base *base;
   1319	unsigned long flags;
   1320	int ret = -1;
   1321
   1322	/*
   1323	 * Check lockless first. If the timer is not active (neither
   1324	 * enqueued nor running the callback, nothing to do here.  The
   1325	 * base lock does not serialize against a concurrent enqueue,
   1326	 * so we can avoid taking it.
   1327	 */
   1328	if (!hrtimer_active(timer))
   1329		return 0;
   1330
   1331	base = lock_hrtimer_base(timer, &flags);
   1332
   1333	if (!hrtimer_callback_running(timer))
   1334		ret = remove_hrtimer(timer, base, false, false);
   1335
   1336	unlock_hrtimer_base(timer, &flags);
   1337
   1338	return ret;
   1339
   1340}
   1341EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
   1342
   1343#ifdef CONFIG_PREEMPT_RT
/* PREEMPT_RT only: initialize the softirq expiry lock of @base */
static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
{
	spin_lock_init(&base->softirq_expiry_lock);
}
   1348
/* PREEMPT_RT only: acquire the softirq expiry lock of @base */
static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
{
	spin_lock(&base->softirq_expiry_lock);
}
   1353
/* PREEMPT_RT only: release the softirq expiry lock of @base */
static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
{
	spin_unlock(&base->softirq_expiry_lock);
}
   1358
/*
 * The counterpart to hrtimer_cancel_wait_running().
 *
 * If there is a waiter for cpu_base->softirq_expiry_lock, then it was
 * waiting for the timer callback to finish. Drop the expiry lock and
 * reacquire it. That allows the waiter to acquire the lock and make
 * progress.
 *
 * Called with cpu_base->lock held. @flags is the interrupt state which is
 * restored while the locks are dropped.
 */
static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
				      unsigned long flags)
{
	/* timer_waiters is raised in hrtimer_cancel_wait_running() */
	if (atomic_read(&cpu_base->timer_waiters)) {
		/* Drop the base lock first, then the expiry lock */
		raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
		spin_unlock(&cpu_base->softirq_expiry_lock);
		/* Take both locks again in the reverse order */
		spin_lock(&cpu_base->softirq_expiry_lock);
		raw_spin_lock_irq(&cpu_base->lock);
	}
}
   1376
/*
 * This function is called on PREEMPT_RT kernels when the fast path
 * deletion of a timer failed because the timer callback function was
 * running.
 *
 * This prevents priority inversion: if the soft irq thread is preempted
 * in the middle of a timer callback, then calling hrtimer_cancel() can
 * lead to two issues:
 *
 *  - If the caller is on a remote CPU then it has to spin wait for the timer
 *    handler to complete. This can result in unbound priority inversion.
 *
 *  - If the caller originates from the task which preempted the timer
 *    handler on the same CPU, then spin waiting for the timer handler to
 *    complete is never going to end.
 */
void hrtimer_cancel_wait_running(const struct hrtimer *timer)
{
	/* Lockless read. Prevent the compiler from reloading it below */
	struct hrtimer_clock_base *base = READ_ONCE(timer->base);

	/*
	 * Just relax if the timer expires in hard interrupt context or if
	 * it is currently on the migration base.
	 */
	if (!timer->is_soft || is_migration_base(base)) {
		cpu_relax();
		return;
	}

	/*
	 * Mark the base as contended and grab the expiry lock, which is
	 * held by the softirq across the timer callback. Drop the lock
	 * immediately so the softirq can expire the next timer. In theory
	 * the timer could already be running again, but that's more than
	 * unlikely and just causes another wait loop.
	 *
	 * The timer_waiters count is what hrtimer_sync_wait_running()
	 * checks before it drops the expiry lock.
	 */
	atomic_inc(&base->cpu_base->timer_waiters);
	spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
	atomic_dec(&base->cpu_base->timer_waiters);
	spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
}
   1419#else
/*
 * !PREEMPT_RT: the expiry lock helpers and hrtimer_sync_wait_running()
 * are empty stubs.
 */
static inline void
hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
static inline void
hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
static inline void
hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base,
					     unsigned long flags) { }
   1428#endif
   1429
   1430/**
   1431 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
   1432 * @timer:	the timer to be cancelled
   1433 *
   1434 * Returns:
   1435 *  0 when the timer was not active
   1436 *  1 when the timer was active
   1437 */
   1438int hrtimer_cancel(struct hrtimer *timer)
   1439{
   1440	int ret;
   1441
   1442	do {
   1443		ret = hrtimer_try_to_cancel(timer);
   1444
   1445		if (ret < 0)
   1446			hrtimer_cancel_wait_running(timer);
   1447	} while (ret < 0);
   1448	return ret;
   1449}
   1450EXPORT_SYMBOL_GPL(hrtimer_cancel);
   1451
   1452/**
   1453 * __hrtimer_get_remaining - get remaining time for the timer
   1454 * @timer:	the timer to read
   1455 * @adjust:	adjust relative timers when CONFIG_TIME_LOW_RES=y
   1456 */
   1457ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
   1458{
   1459	unsigned long flags;
   1460	ktime_t rem;
   1461
   1462	lock_hrtimer_base(timer, &flags);
   1463	if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust)
   1464		rem = hrtimer_expires_remaining_adjusted(timer);
   1465	else
   1466		rem = hrtimer_expires_remaining(timer);
   1467	unlock_hrtimer_base(timer, &flags);
   1468
   1469	return rem;
   1470}
   1471EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);
   1472
   1473#ifdef CONFIG_NO_HZ_COMMON
   1474/**
   1475 * hrtimer_get_next_event - get the time until next expiry event
   1476 *
   1477 * Returns the next expiry time or KTIME_MAX if no timer is pending.
   1478 */
   1479u64 hrtimer_get_next_event(void)
   1480{
   1481	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
   1482	u64 expires = KTIME_MAX;
   1483	unsigned long flags;
   1484
   1485	raw_spin_lock_irqsave(&cpu_base->lock, flags);
   1486
   1487	if (!__hrtimer_hres_active(cpu_base))
   1488		expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
   1489
   1490	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
   1491
   1492	return expires;
   1493}
   1494
   1495/**
   1496 * hrtimer_next_event_without - time until next expiry event w/o one timer
   1497 * @exclude:	timer to exclude
   1498 *
   1499 * Returns the next expiry time over all timers except for the @exclude one or
   1500 * KTIME_MAX if none of them is pending.
   1501 */
   1502u64 hrtimer_next_event_without(const struct hrtimer *exclude)
   1503{
   1504	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
   1505	u64 expires = KTIME_MAX;
   1506	unsigned long flags;
   1507
   1508	raw_spin_lock_irqsave(&cpu_base->lock, flags);
   1509
   1510	if (__hrtimer_hres_active(cpu_base)) {
   1511		unsigned int active;
   1512
   1513		if (!cpu_base->softirq_activated) {
   1514			active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
   1515			expires = __hrtimer_next_event_base(cpu_base, exclude,
   1516							    active, KTIME_MAX);
   1517		}
   1518		active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
   1519		expires = __hrtimer_next_event_base(cpu_base, exclude, active,
   1520						    expires);
   1521	}
   1522
   1523	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
   1524
   1525	return expires;
   1526}
   1527#endif
   1528
   1529static inline int hrtimer_clockid_to_base(clockid_t clock_id)
   1530{
   1531	if (likely(clock_id < MAX_CLOCKS)) {
   1532		int base = hrtimer_clock_to_base_table[clock_id];
   1533
   1534		if (likely(base != HRTIMER_MAX_CLOCK_BASES))
   1535			return base;
   1536	}
   1537	WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
   1538	return HRTIMER_BASE_MONOTONIC;
   1539}
   1540
   1541static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
   1542			   enum hrtimer_mode mode)
   1543{
   1544	bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
   1545	struct hrtimer_cpu_base *cpu_base;
   1546	int base;
   1547
   1548	/*
   1549	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
   1550	 * marked for hard interrupt expiry mode are moved into soft
   1551	 * interrupt context for latency reasons and because the callbacks
   1552	 * can invoke functions which might sleep on RT, e.g. spin_lock().
   1553	 */
   1554	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD))
   1555		softtimer = true;
   1556
   1557	memset(timer, 0, sizeof(struct hrtimer));
   1558
   1559	cpu_base = raw_cpu_ptr(&hrtimer_bases);
   1560
   1561	/*
   1562	 * POSIX magic: Relative CLOCK_REALTIME timers are not affected by
   1563	 * clock modifications, so they needs to become CLOCK_MONOTONIC to
   1564	 * ensure POSIX compliance.
   1565	 */
   1566	if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
   1567		clock_id = CLOCK_MONOTONIC;
   1568
   1569	base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
   1570	base += hrtimer_clockid_to_base(clock_id);
   1571	timer->is_soft = softtimer;
   1572	timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
   1573	timer->base = &cpu_base->clock_base[base];
   1574	timerqueue_init(&timer->node);
   1575}
   1576
/**
 * hrtimer_init - initialize a timer to the given clock
 * @timer:	the timer to be initialized
 * @clock_id:	the clock to be used
 * @mode:	The modes which are relevant for initialization:
 *		HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
 *		HRTIMER_MODE_REL_SOFT
 *
 *		The PINNED variants of the above can be handed in,
 *		but the PINNED bit is ignored as pinning happens
 *		when the hrtimer is started
 *
 * Invokes the debug_init() hook first and then does the actual
 * initialization in __hrtimer_init().
 */
void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
		  enum hrtimer_mode mode)
{
	debug_init(timer, clock_id, mode);
	__hrtimer_init(timer, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init);
   1596
/*
 * A timer is active, when it is enqueued into the rbtree or the
 * callback function is running or it's in the state of being migrated
 * to another cpu.
 *
 * It is important for this function to not return a false negative.
 *
 * The check is lockless. It is repeated when either the sequence count of
 * the clock base or the clock base of the timer changed while looking.
 *
 * Returns true when @timer is active, false otherwise.
 */
bool hrtimer_active(const struct hrtimer *timer)
{
	struct hrtimer_clock_base *base;
	unsigned int seq;

	do {
		/* Snapshot the clock base and its sequence count */
		base = READ_ONCE(timer->base);
		seq = raw_read_seqcount_begin(&base->seq);

		/* Enqueued (or otherwise not inactive), or callback running */
		if (timer->state != HRTIMER_STATE_INACTIVE ||
		    base->running == timer)
			return true;

	} while (read_seqcount_retry(&base->seq, seq) ||
		 base != READ_ONCE(timer->base));

	return false;
}
EXPORT_SYMBOL_GPL(hrtimer_active);
   1623
/*
 * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
 * distinct sections:
 *
 *  - queued:	the timer is queued
 *  - callback:	the timer is being run
 *  - post:	the timer is inactive or (re)queued
 *
 * On the read side we ensure we observe timer->state and base->running
 * from the same section, if anything changed while we looked at it, we retry.
 * This includes timer->base changing because sequence numbers alone are
 * insufficient for that.
 *
 * The sequence numbers are required because otherwise we could still observe
 * a false negative if the read side got smeared over multiple consecutive
 * __run_hrtimer() invocations.
 */

/*
 * Run the callback of an expired @timer.
 *
 * Called with cpu_base->lock held. The lock is dropped around the callback
 * invocation and held again on return. @flags is the interrupt state which
 * is restored when the lock is dropped. @now is only handed to the
 * expire entry tracepoint.
 */
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
			  struct hrtimer_clock_base *base,
			  struct hrtimer *timer, ktime_t *now,
			  unsigned long flags) __must_hold(&cpu_base->lock)
{
	enum hrtimer_restart (*fn)(struct hrtimer *);
	bool expires_in_hardirq;
	int restart;

	lockdep_assert_held(&cpu_base->lock);

	debug_deactivate(timer);
	base->running = timer;

	/*
	 * Separate the ->running assignment from the ->state assignment.
	 *
	 * As with a regular write barrier, this ensures the read side in
	 * hrtimer_active() cannot observe base->running == NULL &&
	 * timer->state == INACTIVE.
	 */
	raw_write_seqcount_barrier(&base->seq);

	/* Dequeue without reprogramming and fetch the callback */
	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
	fn = timer->function;

	/*
	 * Clear the 'is relative' flag for the TIME_LOW_RES case. If the
	 * timer is restarted with a period then it becomes an absolute
	 * timer. If it's not restarted it does not matter.
	 */
	if (IS_ENABLED(CONFIG_TIME_LOW_RES))
		timer->is_rel = false;

	/*
	 * The timer is marked as running in the CPU base, so it is
	 * protected against migration to a different CPU even if the lock
	 * is dropped.
	 */
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
	trace_hrtimer_expire_entry(timer, now);
	expires_in_hardirq = lockdep_hrtimer_enter(timer);

	restart = fn(timer);

	lockdep_hrtimer_exit(expires_in_hardirq);
	trace_hrtimer_expire_exit(timer);
	raw_spin_lock_irq(&cpu_base->lock);

	/*
	 * Note: We clear the running state after enqueue_hrtimer and
	 * we do not reprogram the event hardware. Happens either in
	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
	 *
	 * Note: Because we dropped the cpu_base->lock above,
	 * hrtimer_start_range_ns() can have popped in and enqueued the timer
	 * for us already.
	 */
	if (restart != HRTIMER_NORESTART &&
	    !(timer->state & HRTIMER_STATE_ENQUEUED))
		enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);

	/*
	 * Separate the ->running assignment from the ->state assignment.
	 *
	 * As with a regular write barrier, this ensures the read side in
	 * hrtimer_active() cannot observe base->running == NULL &&
	 * timer->state == INACTIVE.
	 */
	raw_write_seqcount_barrier(&base->seq);

	WARN_ON_ONCE(base->running != timer);
	base->running = NULL;
}
   1716
   1717static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
   1718				 unsigned long flags, unsigned int active_mask)
   1719{
   1720	struct hrtimer_clock_base *base;
   1721	unsigned int active = cpu_base->active_bases & active_mask;
   1722
   1723	for_each_active_base(base, cpu_base, active) {
   1724		struct timerqueue_node *node;
   1725		ktime_t basenow;
   1726
   1727		basenow = ktime_add(now, base->offset);
   1728
   1729		while ((node = timerqueue_getnext(&base->active))) {
   1730			struct hrtimer *timer;
   1731
   1732			timer = container_of(node, struct hrtimer, node);
   1733
   1734			/*
   1735			 * The immediate goal for using the softexpires is
   1736			 * minimizing wakeups, not running timers at the
   1737			 * earliest interrupt after their soft expiration.
   1738			 * This allows us to avoid using a Priority Search
   1739			 * Tree, which can answer a stabbing query for
   1740			 * overlapping intervals and instead use the simple
   1741			 * BST we already have.
   1742			 * We don't add extra wakeups by delaying timers that
   1743			 * are right-of a not yet expired timer, because that
   1744			 * timer will have to trigger a wakeup anyway.
   1745			 */
   1746			if (basenow < hrtimer_get_softexpires_tv64(timer))
   1747				break;
   1748
   1749			__run_hrtimer(cpu_base, base, timer, &basenow, flags);
   1750			if (active_mask == HRTIMER_ACTIVE_SOFT)
   1751				hrtimer_sync_wait_running(cpu_base, flags);
   1752		}
   1753	}
   1754}
   1755
/*
 * Softirq handler which expires the soft hrtimers of the local CPU.
 *
 * The expiry lock is held across the whole run. It is taken before and
 * released after the base lock. @h is not used.
 */
static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	unsigned long flags;
	ktime_t now;

	hrtimer_cpu_base_lock_expiry(cpu_base);
	raw_spin_lock_irqsave(&cpu_base->lock, flags);

	/* Expire the soft clock bases against the updated base time */
	now = hrtimer_update_base(cpu_base);
	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);

	/* Softirq processing is done: reevaluate the next soft timer */
	cpu_base->softirq_activated = 0;
	hrtimer_update_softirq_timer(cpu_base, true);

	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
	hrtimer_cpu_base_unlock_expiry(cpu_base);
}
   1774
   1775#ifdef CONFIG_HIGH_RES_TIMERS
   1776
   1777/*
   1778 * High resolution timer interrupt
   1779 * Called with interrupts disabled
   1780 */
   1781void hrtimer_interrupt(struct clock_event_device *dev)
   1782{
   1783	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
   1784	ktime_t expires_next, now, entry_time, delta;
   1785	unsigned long flags;
   1786	int retries = 0;
   1787
   1788	BUG_ON(!cpu_base->hres_active);
   1789	cpu_base->nr_events++;
   1790	dev->next_event = KTIME_MAX;
   1791
   1792	raw_spin_lock_irqsave(&cpu_base->lock, flags);
   1793	entry_time = now = hrtimer_update_base(cpu_base);
   1794retry:
   1795	cpu_base->in_hrtirq = 1;
   1796	/*
   1797	 * We set expires_next to KTIME_MAX here with cpu_base->lock
   1798	 * held to prevent that a timer is enqueued in our queue via
   1799	 * the migration code. This does not affect enqueueing of
   1800	 * timers which run their callback and need to be requeued on
   1801	 * this CPU.
   1802	 */
   1803	cpu_base->expires_next = KTIME_MAX;
   1804
   1805	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
   1806		cpu_base->softirq_expires_next = KTIME_MAX;
   1807		cpu_base->softirq_activated = 1;
   1808		raise_softirq_irqoff(HRTIMER_SOFTIRQ);
   1809	}
   1810
   1811	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
   1812
   1813	/* Reevaluate the clock bases for the [soft] next expiry */
   1814	expires_next = hrtimer_update_next_event(cpu_base);
   1815	/*
   1816	 * Store the new expiry value so the migration code can verify
   1817	 * against it.
   1818	 */
   1819	cpu_base->expires_next = expires_next;
   1820	cpu_base->in_hrtirq = 0;
   1821	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
   1822
   1823	/* Reprogramming necessary ? */
   1824	if (!tick_program_event(expires_next, 0)) {
   1825		cpu_base->hang_detected = 0;
   1826		return;
   1827	}
   1828
   1829	/*
   1830	 * The next timer was already expired due to:
   1831	 * - tracing
   1832	 * - long lasting callbacks
   1833	 * - being scheduled away when running in a VM
   1834	 *
   1835	 * We need to prevent that we loop forever in the hrtimer
   1836	 * interrupt routine. We give it 3 attempts to avoid
   1837	 * overreacting on some spurious event.
   1838	 *
   1839	 * Acquire base lock for updating the offsets and retrieving
   1840	 * the current time.
   1841	 */
   1842	raw_spin_lock_irqsave(&cpu_base->lock, flags);
   1843	now = hrtimer_update_base(cpu_base);
   1844	cpu_base->nr_retries++;
   1845	if (++retries < 3)
   1846		goto retry;
   1847	/*
   1848	 * Give the system a chance to do something else than looping
   1849	 * here. We stored the entry time, so we know exactly how long
   1850	 * we spent here. We schedule the next event this amount of
   1851	 * time away.
   1852	 */
   1853	cpu_base->nr_hangs++;
   1854	cpu_base->hang_detected = 1;
   1855	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
   1856
   1857	delta = ktime_sub(now, entry_time);
   1858	if ((unsigned int)delta > cpu_base->max_hang_time)
   1859		cpu_base->max_hang_time = (unsigned int) delta;
   1860	/*
   1861	 * Limit it to a sensible value as we enforce a longer
   1862	 * delay. Give the CPU at least 100ms to catch up.
   1863	 */
   1864	if (delta > 100 * NSEC_PER_MSEC)
   1865		expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
   1866	else
   1867		expires_next = ktime_add(now, delta);
   1868	tick_program_event(expires_next, 1);
   1869	pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
   1870}
   1871
   1872/* called with interrupts disabled */
   1873static inline void __hrtimer_peek_ahead_timers(void)
   1874{
   1875	struct tick_device *td;
   1876
   1877	if (!hrtimer_hres_active())
   1878		return;
   1879
   1880	td = this_cpu_ptr(&tick_cpu_device);
   1881	if (td && td->evtdev)
   1882		hrtimer_interrupt(td->evtdev);
   1883}
   1884
   1885#else /* CONFIG_HIGH_RES_TIMERS */
   1886
   1887static inline void __hrtimer_peek_ahead_timers(void) { }
   1888
   1889#endif	/* !CONFIG_HIGH_RES_TIMERS */
   1890
   1891/*
   1892 * Called from run_local_timers in hardirq context every jiffy
   1893 */
   1894void hrtimer_run_queues(void)
   1895{
   1896	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
   1897	unsigned long flags;
   1898	ktime_t now;
   1899
   1900	if (__hrtimer_hres_active(cpu_base))
   1901		return;
   1902
   1903	/*
   1904	 * This _is_ ugly: We have to check periodically, whether we
   1905	 * can switch to highres and / or nohz mode. The clocksource
   1906	 * switch happens with xtime_lock held. Notification from
   1907	 * there only sets the check bit in the tick_oneshot code,
   1908	 * otherwise we might deadlock vs. xtime_lock.
   1909	 */
   1910	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
   1911		hrtimer_switch_to_hres();
   1912		return;
   1913	}
   1914
   1915	raw_spin_lock_irqsave(&cpu_base->lock, flags);
   1916	now = hrtimer_update_base(cpu_base);
   1917
   1918	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
   1919		cpu_base->softirq_expires_next = KTIME_MAX;
   1920		cpu_base->softirq_activated = 1;
   1921		raise_softirq_irqoff(HRTIMER_SOFTIRQ);
   1922	}
   1923
   1924	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
   1925	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
   1926}
   1927
   1928/*
   1929 * Sleep related functions:
   1930 */
   1931static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
   1932{
   1933	struct hrtimer_sleeper *t =
   1934		container_of(timer, struct hrtimer_sleeper, timer);
   1935	struct task_struct *task = t->task;
   1936
   1937	t->task = NULL;
   1938	if (task)
   1939		wake_up_process(task);
   1940
   1941	return HRTIMER_NORESTART;
   1942}
   1943
   1944/**
   1945 * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer
   1946 * @sl:		sleeper to be started
   1947 * @mode:	timer mode abs/rel
   1948 *
   1949 * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
   1950 * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
   1951 */
   1952void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
   1953				   enum hrtimer_mode mode)
   1954{
   1955	/*
   1956	 * Make the enqueue delivery mode check work on RT. If the sleeper
   1957	 * was initialized for hard interrupt delivery, force the mode bit.
   1958	 * This is a special case for hrtimer_sleepers because
   1959	 * hrtimer_init_sleeper() determines the delivery mode on RT so the
   1960	 * fiddling with this decision is avoided at the call sites.
   1961	 */
   1962	if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
   1963		mode |= HRTIMER_MODE_HARD;
   1964
   1965	hrtimer_start_expires(&sl->timer, mode);
   1966}
   1967EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
   1968
   1969static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
   1970				   clockid_t clock_id, enum hrtimer_mode mode)
   1971{
   1972	/*
   1973	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
   1974	 * marked for hard interrupt expiry mode are moved into soft
   1975	 * interrupt context either for latency reasons or because the
   1976	 * hrtimer callback takes regular spinlocks or invokes other
   1977	 * functions which are not suitable for hard interrupt context on
   1978	 * PREEMPT_RT.
   1979	 *
   1980	 * The hrtimer_sleeper callback is RT compatible in hard interrupt
   1981	 * context, but there is a latency concern: Untrusted userspace can
   1982	 * spawn many threads which arm timers for the same expiry time on
   1983	 * the same CPU. That causes a latency spike due to the wakeup of
   1984	 * a gazillion threads.
   1985	 *
   1986	 * OTOH, privileged real-time user space applications rely on the
   1987	 * low latency of hard interrupt wakeups. If the current task is in
   1988	 * a real-time scheduling class, mark the mode for hard interrupt
   1989	 * expiry.
   1990	 */
   1991	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
   1992		if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT))
   1993			mode |= HRTIMER_MODE_HARD;
   1994	}
   1995
   1996	__hrtimer_init(&sl->timer, clock_id, mode);
   1997	sl->timer.function = hrtimer_wakeup;
   1998	sl->task = current;
   1999}
   2000
   2001/**
   2002 * hrtimer_init_sleeper - initialize sleeper to the given clock
   2003 * @sl:		sleeper to be initialized
   2004 * @clock_id:	the clock to be used
   2005 * @mode:	timer mode abs/rel
   2006 */
   2007void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
   2008			  enum hrtimer_mode mode)
   2009{
   2010	debug_init(&sl->timer, clock_id, mode);
   2011	__hrtimer_init_sleeper(sl, clock_id, mode);
   2012
   2013}
   2014EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
   2015
   2016int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
   2017{
   2018	switch(restart->nanosleep.type) {
   2019#ifdef CONFIG_COMPAT_32BIT_TIME
   2020	case TT_COMPAT:
   2021		if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp))
   2022			return -EFAULT;
   2023		break;
   2024#endif
   2025	case TT_NATIVE:
   2026		if (put_timespec64(ts, restart->nanosleep.rmtp))
   2027			return -EFAULT;
   2028		break;
   2029	default:
   2030		BUG();
   2031	}
   2032	return -ERESTART_RESTARTBLOCK;
   2033}
   2034
   2035static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
   2036{
   2037	struct restart_block *restart;
   2038
   2039	do {
   2040		set_current_state(TASK_INTERRUPTIBLE);
   2041		hrtimer_sleeper_start_expires(t, mode);
   2042
   2043		if (likely(t->task))
   2044			freezable_schedule();
   2045
   2046		hrtimer_cancel(&t->timer);
   2047		mode = HRTIMER_MODE_ABS;
   2048
   2049	} while (t->task && !signal_pending(current));
   2050
   2051	__set_current_state(TASK_RUNNING);
   2052
   2053	if (!t->task)
   2054		return 0;
   2055
   2056	restart = &current->restart_block;
   2057	if (restart->nanosleep.type != TT_NONE) {
   2058		ktime_t rem = hrtimer_expires_remaining(&t->timer);
   2059		struct timespec64 rmt;
   2060
   2061		if (rem <= 0)
   2062			return 0;
   2063		rmt = ktime_to_timespec64(rem);
   2064
   2065		return nanosleep_copyout(restart, &rmt);
   2066	}
   2067	return -ERESTART_RESTARTBLOCK;
   2068}
   2069
   2070static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
   2071{
   2072	struct hrtimer_sleeper t;
   2073	int ret;
   2074
   2075	hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid,
   2076				      HRTIMER_MODE_ABS);
   2077	hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
   2078	ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
   2079	destroy_hrtimer_on_stack(&t.timer);
   2080	return ret;
   2081}
   2082
   2083long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
   2084		       const clockid_t clockid)
   2085{
   2086	struct restart_block *restart;
   2087	struct hrtimer_sleeper t;
   2088	int ret = 0;
   2089	u64 slack;
   2090
   2091	slack = current->timer_slack_ns;
   2092	if (dl_task(current) || rt_task(current))
   2093		slack = 0;
   2094
   2095	hrtimer_init_sleeper_on_stack(&t, clockid, mode);
   2096	hrtimer_set_expires_range_ns(&t.timer, rqtp, slack);
   2097	ret = do_nanosleep(&t, mode);
   2098	if (ret != -ERESTART_RESTARTBLOCK)
   2099		goto out;
   2100
   2101	/* Absolute timers do not update the rmtp value and restart: */
   2102	if (mode == HRTIMER_MODE_ABS) {
   2103		ret = -ERESTARTNOHAND;
   2104		goto out;
   2105	}
   2106
   2107	restart = &current->restart_block;
   2108	restart->nanosleep.clockid = t.timer.base->clockid;
   2109	restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
   2110	set_restart_fn(restart, hrtimer_nanosleep_restart);
   2111out:
   2112	destroy_hrtimer_on_stack(&t.timer);
   2113	return ret;
   2114}
   2115
   2116#ifdef CONFIG_64BIT
   2117
   2118SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
   2119		struct __kernel_timespec __user *, rmtp)
   2120{
   2121	struct timespec64 tu;
   2122
   2123	if (get_timespec64(&tu, rqtp))
   2124		return -EFAULT;
   2125
   2126	if (!timespec64_valid(&tu))
   2127		return -EINVAL;
   2128
   2129	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
   2130	current->restart_block.nanosleep.rmtp = rmtp;
   2131	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
   2132				 CLOCK_MONOTONIC);
   2133}
   2134
   2135#endif
   2136
   2137#ifdef CONFIG_COMPAT_32BIT_TIME
   2138
   2139SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
   2140		       struct old_timespec32 __user *, rmtp)
   2141{
   2142	struct timespec64 tu;
   2143
   2144	if (get_old_timespec32(&tu, rqtp))
   2145		return -EFAULT;
   2146
   2147	if (!timespec64_valid(&tu))
   2148		return -EINVAL;
   2149
   2150	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
   2151	current->restart_block.nanosleep.compat_rmtp = rmtp;
   2152	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
   2153				 CLOCK_MONOTONIC);
   2154}
   2155#endif
   2156
   2157/*
   2158 * Functions related to boot-time initialization:
   2159 */
   2160int hrtimers_prepare_cpu(unsigned int cpu)
   2161{
   2162	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
   2163	int i;
   2164
   2165	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
   2166		struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];
   2167
   2168		clock_b->cpu_base = cpu_base;
   2169		seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
   2170		timerqueue_init_head(&clock_b->active);
   2171	}
   2172
   2173	cpu_base->cpu = cpu;
   2174	cpu_base->active_bases = 0;
   2175	cpu_base->hres_active = 0;
   2176	cpu_base->hang_detected = 0;
   2177	cpu_base->next_timer = NULL;
   2178	cpu_base->softirq_next_timer = NULL;
   2179	cpu_base->expires_next = KTIME_MAX;
   2180	cpu_base->softirq_expires_next = KTIME_MAX;
   2181	hrtimer_cpu_base_init_expiry_lock(cpu_base);
   2182	return 0;
   2183}
   2184
   2185#ifdef CONFIG_HOTPLUG_CPU
   2186
   2187static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
   2188				struct hrtimer_clock_base *new_base)
   2189{
   2190	struct hrtimer *timer;
   2191	struct timerqueue_node *node;
   2192
   2193	while ((node = timerqueue_getnext(&old_base->active))) {
   2194		timer = container_of(node, struct hrtimer, node);
   2195		BUG_ON(hrtimer_callback_running(timer));
   2196		debug_deactivate(timer);
   2197
   2198		/*
   2199		 * Mark it as ENQUEUED not INACTIVE otherwise the
   2200		 * timer could be seen as !active and just vanish away
   2201		 * under us on another CPU
   2202		 */
   2203		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
   2204		timer->base = new_base;
   2205		/*
   2206		 * Enqueue the timers on the new cpu. This does not
   2207		 * reprogram the event device in case the timer
   2208		 * expires before the earliest on this CPU, but we run
   2209		 * hrtimer_interrupt after we migrated everything to
   2210		 * sort out already expired timers and reprogram the
   2211		 * event device.
   2212		 */
   2213		enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
   2214	}
   2215}
   2216
   2217int hrtimers_dead_cpu(unsigned int scpu)
   2218{
   2219	struct hrtimer_cpu_base *old_base, *new_base;
   2220	int i;
   2221
   2222	BUG_ON(cpu_online(scpu));
   2223	tick_cancel_sched_timer(scpu);
   2224
   2225	/*
   2226	 * this BH disable ensures that raise_softirq_irqoff() does
   2227	 * not wakeup ksoftirqd (and acquire the pi-lock) while
   2228	 * holding the cpu_base lock
   2229	 */
   2230	local_bh_disable();
   2231	local_irq_disable();
   2232	old_base = &per_cpu(hrtimer_bases, scpu);
   2233	new_base = this_cpu_ptr(&hrtimer_bases);
   2234	/*
   2235	 * The caller is globally serialized and nobody else
   2236	 * takes two locks at once, deadlock is not possible.
   2237	 */
   2238	raw_spin_lock(&new_base->lock);
   2239	raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
   2240
   2241	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
   2242		migrate_hrtimer_list(&old_base->clock_base[i],
   2243				     &new_base->clock_base[i]);
   2244	}
   2245
   2246	/*
   2247	 * The migration might have changed the first expiring softirq
   2248	 * timer on this CPU. Update it.
   2249	 */
   2250	hrtimer_update_softirq_timer(new_base, false);
   2251
   2252	raw_spin_unlock(&old_base->lock);
   2253	raw_spin_unlock(&new_base->lock);
   2254
   2255	/* Check, if we got expired work to do */
   2256	__hrtimer_peek_ahead_timers();
   2257	local_irq_enable();
   2258	local_bh_enable();
   2259	return 0;
   2260}
   2261
   2262#endif /* CONFIG_HOTPLUG_CPU */
   2263
   2264void __init hrtimers_init(void)
   2265{
   2266	hrtimers_prepare_cpu(smp_processor_id());
   2267	open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
   2268}
   2269
   2270/**
   2271 * schedule_hrtimeout_range_clock - sleep until timeout
   2272 * @expires:	timeout value (ktime_t)
   2273 * @delta:	slack in expires timeout (ktime_t)
   2274 * @mode:	timer mode
   2275 * @clock_id:	timer clock to be used
   2276 */
   2277int __sched
   2278schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
   2279			       const enum hrtimer_mode mode, clockid_t clock_id)
   2280{
   2281	struct hrtimer_sleeper t;
   2282
   2283	/*
   2284	 * Optimize when a zero timeout value is given. It does not
   2285	 * matter whether this is an absolute or a relative time.
   2286	 */
   2287	if (expires && *expires == 0) {
   2288		__set_current_state(TASK_RUNNING);
   2289		return 0;
   2290	}
   2291
   2292	/*
   2293	 * A NULL parameter means "infinite"
   2294	 */
   2295	if (!expires) {
   2296		schedule();
   2297		return -EINTR;
   2298	}
   2299
   2300	hrtimer_init_sleeper_on_stack(&t, clock_id, mode);
   2301	hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
   2302	hrtimer_sleeper_start_expires(&t, mode);
   2303
   2304	if (likely(t.task))
   2305		schedule();
   2306
   2307	hrtimer_cancel(&t.timer);
   2308	destroy_hrtimer_on_stack(&t.timer);
   2309
   2310	__set_current_state(TASK_RUNNING);
   2311
   2312	return !t.task ? 0 : -EINTR;
   2313}
   2314
   2315/**
   2316 * schedule_hrtimeout_range - sleep until timeout
   2317 * @expires:	timeout value (ktime_t)
   2318 * @delta:	slack in expires timeout (ktime_t)
   2319 * @mode:	timer mode
   2320 *
   2321 * Make the current task sleep until the given expiry time has
   2322 * elapsed. The routine will return immediately unless
   2323 * the current task state has been set (see set_current_state()).
   2324 *
   2325 * The @delta argument gives the kernel the freedom to schedule the
   2326 * actual wakeup to a time that is both power and performance friendly.
   2327 * The kernel give the normal best effort behavior for "@expires+@delta",
   2328 * but may decide to fire the timer earlier, but no earlier than @expires.
   2329 *
   2330 * You can set the task state as follows -
   2331 *
   2332 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
   2333 * pass before the routine returns unless the current task is explicitly
   2334 * woken up, (e.g. by wake_up_process()).
   2335 *
   2336 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
   2337 * delivered to the current task or the current task is explicitly woken
   2338 * up.
   2339 *
   2340 * The current task state is guaranteed to be TASK_RUNNING when this
   2341 * routine returns.
   2342 *
   2343 * Returns 0 when the timer has expired. If the task was woken before the
   2344 * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
   2345 * by an explicit wakeup, it returns -EINTR.
   2346 */
   2347int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
   2348				     const enum hrtimer_mode mode)
   2349{
   2350	return schedule_hrtimeout_range_clock(expires, delta, mode,
   2351					      CLOCK_MONOTONIC);
   2352}
   2353EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
   2354
   2355/**
   2356 * schedule_hrtimeout - sleep until timeout
   2357 * @expires:	timeout value (ktime_t)
   2358 * @mode:	timer mode
   2359 *
   2360 * Make the current task sleep until the given expiry time has
   2361 * elapsed. The routine will return immediately unless
   2362 * the current task state has been set (see set_current_state()).
   2363 *
   2364 * You can set the task state as follows -
   2365 *
   2366 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
   2367 * pass before the routine returns unless the current task is explicitly
   2368 * woken up, (e.g. by wake_up_process()).
   2369 *
   2370 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
   2371 * delivered to the current task or the current task is explicitly woken
   2372 * up.
   2373 *
   2374 * The current task state is guaranteed to be TASK_RUNNING when this
   2375 * routine returns.
   2376 *
   2377 * Returns 0 when the timer has expired. If the task was woken before the
   2378 * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
   2379 * by an explicit wakeup, it returns -EINTR.
   2380 */
   2381int __sched schedule_hrtimeout(ktime_t *expires,
   2382			       const enum hrtimer_mode mode)
   2383{
   2384	return schedule_hrtimeout_range(expires, 0, mode);
   2385}
   2386EXPORT_SYMBOL_GPL(schedule_hrtimeout);