cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

clocksource.c (42278B)


      1// SPDX-License-Identifier: GPL-2.0+
      2/*
      3 * This file contains the functions which manage clocksource drivers.
      4 *
      5 * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
      6 */
      7
      8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      9
     10#include <linux/device.h>
     11#include <linux/clocksource.h>
     12#include <linux/init.h>
     13#include <linux/module.h>
     14#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
     15#include <linux/tick.h>
     16#include <linux/kthread.h>
     17#include <linux/prandom.h>
     18#include <linux/cpu.h>
     19
     20#include "tick-internal.h"
     21#include "timekeeping_internal.h"
     22
     23/**
     24 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
     25 * @mult:	pointer to mult variable
     26 * @shift:	pointer to shift variable
     27 * @from:	frequency to convert from
     28 * @to:		frequency to convert to
     29 * @maxsec:	guaranteed runtime conversion range in seconds
     30 *
     31 * The function evaluates the shift/mult pair for the scaled math
     32 * operations of clocksources and clockevents.
     33 *
     34 * @to and @from are frequency values in HZ. For clock sources @to is
     35 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
      36 * event sources @to is the counter frequency and @from is NSEC_PER_SEC.
     37 *
     38 * The @maxsec conversion range argument controls the time frame in
     39 * seconds which must be covered by the runtime conversion with the
     40 * calculated mult and shift factors. This guarantees that no 64bit
     41 * overflow happens when the input value of the conversion is
     42 * multiplied with the calculated mult factor. Larger ranges may
     43 * reduce the conversion accuracy by choosing smaller mult and shift
     44 * factors.
     45 */
     46void
     47clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
     48{
     49	u64 tmp;
      50	u32 sft, sftacc = 32;
     51
     52	/*
     53	 * Calculate the shift factor which is limiting the conversion
     54	 * range:
     55	 */
     56	tmp = ((u64)maxsec * from) >> 32;
     57	while (tmp) {
      58		tmp >>= 1;
     59		sftacc--;
     60	}
     61
     62	/*
     63	 * Find the conversion shift/mult pair which has the best
     64	 * accuracy and fits the maxsec conversion range:
     65	 */
     66	for (sft = 32; sft > 0; sft--) {
     67		tmp = (u64) to << sft;
     68		tmp += from / 2;
     69		do_div(tmp, from);
     70		if ((tmp >> sftacc) == 0)
     71			break;
     72	}
     73	*mult = tmp;
     74	*shift = sft;
     75}
     76EXPORT_SYMBOL_GPL(clocks_calc_mult_shift);
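/*
 * Worked example (illustrative arithmetic, not part of the original file):
 * converting a hypothetical 10 MHz counter to nanoseconds with
 *
 *	clocks_calc_mult_shift(&mult, &shift, 10000000, NSEC_PER_SEC, 600);
 *
 * yields shift = 24 and mult = 1677721600 (== 100 << 24), so that
 *
 *	ns = ((u64)cycles * mult) >> shift == cycles * 100
 *
 * i.e. exactly 100ns per cycle, and up to 600 seconds worth of cycles can be
 * multiplied by mult without overflowing 64 bits.
 */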
     77
     78/*[Clocksource internal variables]---------
     79 * curr_clocksource:
     80 *	currently selected clocksource.
     81 * suspend_clocksource:
     82 *	used to calculate the suspend time.
     83 * clocksource_list:
     84 *	linked list with the registered clocksources
     85 * clocksource_mutex:
     86 *	protects manipulations to curr_clocksource and the clocksource_list
     87 * override_name:
     88 *	Name of the user-specified clocksource.
     89 */
     90static struct clocksource *curr_clocksource;
     91static struct clocksource *suspend_clocksource;
     92static LIST_HEAD(clocksource_list);
     93static DEFINE_MUTEX(clocksource_mutex);
     94static char override_name[CS_NAME_LEN];
     95static int finished_booting;
     96static u64 suspend_start;
     97
     98/*
     99 * Threshold: 0.0312s, when doubled: 0.0625s.
    100 * Also a default for cs->uncertainty_margin when registering clocks.
    101 */
    102#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
    103
    104/*
    105 * Maximum permissible delay between two readouts of the watchdog
    106 * clocksource surrounding a read of the clocksource being validated.
    107 * This delay could be due to SMIs, NMIs, or to VCPU preemptions.  Used as
    108 * a lower bound for cs->uncertainty_margin values when registering clocks.
    109 */
    110#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
    111#define MAX_SKEW_USEC	CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
    112#else
    113#define MAX_SKEW_USEC	100
    114#endif
    115
    116#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
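/*
 * For reference (added note, derived from the defaults above): with
 * MAX_SKEW_USEC = 100 this gives WATCHDOG_MAX_SKEW = 100 * NSEC_PER_USEC =
 * 100,000ns (100us), while WATCHDOG_THRESHOLD above is NSEC_PER_SEC >> 5 =
 * 31,250,000ns (~0.0312s).
 */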
    117
    118#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
    119static void clocksource_watchdog_work(struct work_struct *work);
    120static void clocksource_select(void);
    121
    122static LIST_HEAD(watchdog_list);
    123static struct clocksource *watchdog;
    124static struct timer_list watchdog_timer;
    125static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
    126static DEFINE_SPINLOCK(watchdog_lock);
    127static int watchdog_running;
    128static atomic_t watchdog_reset_pending;
    129
    130static inline void clocksource_watchdog_lock(unsigned long *flags)
    131{
    132	spin_lock_irqsave(&watchdog_lock, *flags);
    133}
    134
    135static inline void clocksource_watchdog_unlock(unsigned long *flags)
    136{
    137	spin_unlock_irqrestore(&watchdog_lock, *flags);
    138}
    139
    140static int clocksource_watchdog_kthread(void *data);
    141static void __clocksource_change_rating(struct clocksource *cs, int rating);
    142
    143/*
    144 * Interval: 0.5sec.
    145 */
    146#define WATCHDOG_INTERVAL (HZ >> 1)
    147
    148static void clocksource_watchdog_work(struct work_struct *work)
    149{
    150	/*
    151	 * We cannot directly run clocksource_watchdog_kthread() here, because
    152	 * clocksource_select() calls timekeeping_notify() which uses
     153	 * stop_machine(). One cannot use stop_machine() from a workqueue() due to
    154	 * lock inversions wrt CPU hotplug.
    155	 *
    156	 * Also, we only ever run this work once or twice during the lifetime
    157	 * of the kernel, so there is no point in creating a more permanent
    158	 * kthread for this.
    159	 *
     160	 * If kthread_run() fails, the next watchdog scan over the
    161	 * watchdog_list will find the unstable clock again.
    162	 */
    163	kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
    164}
    165
    166static void __clocksource_unstable(struct clocksource *cs)
    167{
    168	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
    169	cs->flags |= CLOCK_SOURCE_UNSTABLE;
    170
    171	/*
    172	 * If the clocksource is registered clocksource_watchdog_kthread() will
    173	 * re-rate and re-select.
    174	 */
    175	if (list_empty(&cs->list)) {
    176		cs->rating = 0;
    177		return;
    178	}
    179
    180	if (cs->mark_unstable)
    181		cs->mark_unstable(cs);
    182
    183	/* kick clocksource_watchdog_kthread() */
    184	if (finished_booting)
    185		schedule_work(&watchdog_work);
    186}
    187
    188/**
    189 * clocksource_mark_unstable - mark clocksource unstable via watchdog
    190 * @cs:		clocksource to be marked unstable
    191 *
    192 * This function is called by the x86 TSC code to mark clocksources as unstable;
    193 * it defers demotion and re-selection to a kthread.
    194 */
    195void clocksource_mark_unstable(struct clocksource *cs)
    196{
    197	unsigned long flags;
    198
    199	spin_lock_irqsave(&watchdog_lock, flags);
    200	if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
    201		if (!list_empty(&cs->list) && list_empty(&cs->wd_list))
    202			list_add(&cs->wd_list, &watchdog_list);
    203		__clocksource_unstable(cs);
    204	}
    205	spin_unlock_irqrestore(&watchdog_lock, flags);
    206}
    207
    208ulong max_cswd_read_retries = 2;
    209module_param(max_cswd_read_retries, ulong, 0644);
    210EXPORT_SYMBOL_GPL(max_cswd_read_retries);
    211static int verify_n_cpus = 8;
    212module_param(verify_n_cpus, int, 0644);
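/*
 * Usage note (added, not in the original source): since this code is built
 * into the kernel, these module parameters are set on the kernel command
 * line with a "clocksource." prefix when CONFIG_CLOCKSOURCE_WATCHDOG is
 * enabled, e.g.
 *
 *	clocksource.max_cswd_read_retries=3 clocksource.verify_n_cpus=-1
 *
 * verify_n_cpus = -1 checks all online CPUs and 0 disables the per-CPU
 * verification performed by clocksource_verify_percpu() below.
 */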
    213
    214enum wd_read_status {
    215	WD_READ_SUCCESS,
    216	WD_READ_UNSTABLE,
    217	WD_READ_SKIP
    218};
    219
    220static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
    221{
    222	unsigned int nretries;
    223	u64 wd_end, wd_end2, wd_delta;
    224	int64_t wd_delay, wd_seq_delay;
    225
    226	for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
    227		local_irq_disable();
    228		*wdnow = watchdog->read(watchdog);
    229		*csnow = cs->read(cs);
    230		wd_end = watchdog->read(watchdog);
    231		wd_end2 = watchdog->read(watchdog);
    232		local_irq_enable();
    233
    234		wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
    235		wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
    236					      watchdog->shift);
    237		if (wd_delay <= WATCHDOG_MAX_SKEW) {
    238			if (nretries > 1 || nretries >= max_cswd_read_retries) {
    239				pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
    240					smp_processor_id(), watchdog->name, nretries);
    241			}
    242			return WD_READ_SUCCESS;
    243		}
    244
    245		/*
     246		 * Now compute the delay between the two consecutive watchdog reads to
     247		 * see if there is too much external interference causing a significant
     248		 * delay in reading both the clocksource and the watchdog.
    249		 *
    250		 * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2,
    251		 * report system busy, reinit the watchdog and skip the current
    252		 * watchdog test.
    253		 */
    254		wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask);
    255		wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift);
    256		if (wd_seq_delay > WATCHDOG_MAX_SKEW/2)
    257			goto skip_test;
    258	}
    259
    260	pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
    261		smp_processor_id(), watchdog->name, wd_delay, nretries);
    262	return WD_READ_UNSTABLE;
    263
    264skip_test:
    265	pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n",
    266		smp_processor_id(), watchdog->name, wd_seq_delay);
    267	pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n",
    268		cs->name, wd_delay);
    269	return WD_READ_SKIP;
    270}
    271
    272static u64 csnow_mid;
    273static cpumask_t cpus_ahead;
    274static cpumask_t cpus_behind;
    275static cpumask_t cpus_chosen;
    276
    277static void clocksource_verify_choose_cpus(void)
    278{
    279	int cpu, i, n = verify_n_cpus;
    280
    281	if (n < 0) {
    282		/* Check all of the CPUs. */
    283		cpumask_copy(&cpus_chosen, cpu_online_mask);
    284		cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
    285		return;
    286	}
    287
    288	/* If no checking desired, or no other CPU to check, leave. */
    289	cpumask_clear(&cpus_chosen);
    290	if (n == 0 || num_online_cpus() <= 1)
    291		return;
    292
    293	/* Make sure to select at least one CPU other than the current CPU. */
    294	cpu = cpumask_first(cpu_online_mask);
    295	if (cpu == smp_processor_id())
    296		cpu = cpumask_next(cpu, cpu_online_mask);
    297	if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
    298		return;
    299	cpumask_set_cpu(cpu, &cpus_chosen);
    300
    301	/* Force a sane value for the boot parameter. */
    302	if (n > nr_cpu_ids)
    303		n = nr_cpu_ids;
    304
    305	/*
    306	 * Randomly select the specified number of CPUs.  If the same
    307	 * CPU is selected multiple times, that CPU is checked only once,
    308	 * and no replacement CPU is selected.  This gracefully handles
    309	 * situations where verify_n_cpus is greater than the number of
    310	 * CPUs that are currently online.
    311	 */
    312	for (i = 1; i < n; i++) {
    313		cpu = prandom_u32() % nr_cpu_ids;
    314		cpu = cpumask_next(cpu - 1, cpu_online_mask);
    315		if (cpu >= nr_cpu_ids)
    316			cpu = cpumask_first(cpu_online_mask);
    317		if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
    318			cpumask_set_cpu(cpu, &cpus_chosen);
    319	}
    320
    321	/* Don't verify ourselves. */
    322	cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
    323}
    324
    325static void clocksource_verify_one_cpu(void *csin)
    326{
    327	struct clocksource *cs = (struct clocksource *)csin;
    328
    329	csnow_mid = cs->read(cs);
    330}
    331
    332void clocksource_verify_percpu(struct clocksource *cs)
    333{
    334	int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
    335	u64 csnow_begin, csnow_end;
    336	int cpu, testcpu;
    337	s64 delta;
    338
    339	if (verify_n_cpus == 0)
    340		return;
    341	cpumask_clear(&cpus_ahead);
    342	cpumask_clear(&cpus_behind);
    343	cpus_read_lock();
    344	preempt_disable();
    345	clocksource_verify_choose_cpus();
    346	if (cpumask_empty(&cpus_chosen)) {
    347		preempt_enable();
    348		cpus_read_unlock();
    349		pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
    350		return;
    351	}
    352	testcpu = smp_processor_id();
    353	pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
    354	for_each_cpu(cpu, &cpus_chosen) {
    355		if (cpu == testcpu)
    356			continue;
    357		csnow_begin = cs->read(cs);
    358		smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1);
    359		csnow_end = cs->read(cs);
    360		delta = (s64)((csnow_mid - csnow_begin) & cs->mask);
    361		if (delta < 0)
    362			cpumask_set_cpu(cpu, &cpus_behind);
    363		delta = (csnow_end - csnow_mid) & cs->mask;
    364		if (delta < 0)
    365			cpumask_set_cpu(cpu, &cpus_ahead);
    366		delta = clocksource_delta(csnow_end, csnow_begin, cs->mask);
    367		cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
    368		if (cs_nsec > cs_nsec_max)
    369			cs_nsec_max = cs_nsec;
    370		if (cs_nsec < cs_nsec_min)
    371			cs_nsec_min = cs_nsec;
    372	}
    373	preempt_enable();
    374	cpus_read_unlock();
    375	if (!cpumask_empty(&cpus_ahead))
    376		pr_warn("        CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
    377			cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
    378	if (!cpumask_empty(&cpus_behind))
    379		pr_warn("        CPUs %*pbl behind CPU %d for clocksource %s.\n",
    380			cpumask_pr_args(&cpus_behind), testcpu, cs->name);
    381	if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind))
    382		pr_warn("        CPU %d check durations %lldns - %lldns for clocksource %s.\n",
    383			testcpu, cs_nsec_min, cs_nsec_max, cs->name);
    384}
    385EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
    386
    387static void clocksource_watchdog(struct timer_list *unused)
    388{
    389	u64 csnow, wdnow, cslast, wdlast, delta;
    390	int next_cpu, reset_pending;
    391	int64_t wd_nsec, cs_nsec;
    392	struct clocksource *cs;
    393	enum wd_read_status read_ret;
    394	u32 md;
    395
    396	spin_lock(&watchdog_lock);
    397	if (!watchdog_running)
    398		goto out;
    399
    400	reset_pending = atomic_read(&watchdog_reset_pending);
    401
    402	list_for_each_entry(cs, &watchdog_list, wd_list) {
    403
    404		/* Clocksource already marked unstable? */
    405		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
    406			if (finished_booting)
    407				schedule_work(&watchdog_work);
    408			continue;
    409		}
    410
    411		read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
    412
    413		if (read_ret != WD_READ_SUCCESS) {
    414			if (read_ret == WD_READ_UNSTABLE)
    415				/* Clock readout unreliable, so give it up. */
    416				__clocksource_unstable(cs);
    417			continue;
    418		}
    419
    420		/* Clocksource initialized ? */
    421		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
    422		    atomic_read(&watchdog_reset_pending)) {
    423			cs->flags |= CLOCK_SOURCE_WATCHDOG;
    424			cs->wd_last = wdnow;
    425			cs->cs_last = csnow;
    426			continue;
    427		}
    428
    429		delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
    430		wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
    431					     watchdog->shift);
    432
    433		delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
    434		cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
    435		wdlast = cs->wd_last; /* save these in case we print them */
    436		cslast = cs->cs_last;
    437		cs->cs_last = csnow;
    438		cs->wd_last = wdnow;
    439
    440		if (atomic_read(&watchdog_reset_pending))
    441			continue;
    442
    443		/* Check the deviation from the watchdog clocksource. */
    444		md = cs->uncertainty_margin + watchdog->uncertainty_margin;
    445		if (abs(cs_nsec - wd_nsec) > md) {
    446			pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
    447				smp_processor_id(), cs->name);
    448			pr_warn("                      '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
    449				watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
    450			pr_warn("                      '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
    451				cs->name, cs_nsec, csnow, cslast, cs->mask);
    452			if (curr_clocksource == cs)
    453				pr_warn("                      '%s' is current clocksource.\n", cs->name);
    454			else if (curr_clocksource)
    455				pr_warn("                      '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name);
    456			else
    457				pr_warn("                      No current clocksource.\n");
    458			__clocksource_unstable(cs);
    459			continue;
    460		}
    461
    462		if (cs == curr_clocksource && cs->tick_stable)
    463			cs->tick_stable(cs);
    464
    465		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
    466		    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
    467		    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
    468			/* Mark it valid for high-res. */
    469			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
    470
    471			/*
    472			 * clocksource_done_booting() will sort it if
    473			 * finished_booting is not set yet.
    474			 */
    475			if (!finished_booting)
    476				continue;
    477
    478			/*
    479			 * If this is not the current clocksource let
    480			 * the watchdog thread reselect it. Due to the
    481			 * change to high res this clocksource might
    482			 * be preferred now. If it is the current
    483			 * clocksource let the tick code know about
    484			 * that change.
    485			 */
    486			if (cs != curr_clocksource) {
    487				cs->flags |= CLOCK_SOURCE_RESELECT;
    488				schedule_work(&watchdog_work);
    489			} else {
    490				tick_clock_notify();
    491			}
    492		}
    493	}
    494
    495	/*
     496	 * Only clear watchdog_reset_pending when we have done a
    497	 * full cycle through all clocksources.
    498	 */
    499	if (reset_pending)
    500		atomic_dec(&watchdog_reset_pending);
    501
    502	/*
    503	 * Cycle through CPUs to check if the CPUs stay synchronized
    504	 * to each other.
    505	 */
    506	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
    507	if (next_cpu >= nr_cpu_ids)
    508		next_cpu = cpumask_first(cpu_online_mask);
    509
    510	/*
     511	 * Arm the timer if not already pending: this could race with a concurrent
     512	 * clocksource_stop_watchdog()/clocksource_start_watchdog() pair.
    513	 */
    514	if (!timer_pending(&watchdog_timer)) {
    515		watchdog_timer.expires += WATCHDOG_INTERVAL;
    516		add_timer_on(&watchdog_timer, next_cpu);
    517	}
    518out:
    519	spin_unlock(&watchdog_lock);
    520}
    521
    522static inline void clocksource_start_watchdog(void)
    523{
    524	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
    525		return;
    526	timer_setup(&watchdog_timer, clocksource_watchdog, 0);
    527	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
    528	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
    529	watchdog_running = 1;
    530}
    531
    532static inline void clocksource_stop_watchdog(void)
    533{
    534	if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
    535		return;
    536	del_timer(&watchdog_timer);
    537	watchdog_running = 0;
    538}
    539
    540static inline void clocksource_reset_watchdog(void)
    541{
    542	struct clocksource *cs;
    543
    544	list_for_each_entry(cs, &watchdog_list, wd_list)
    545		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
    546}
    547
    548static void clocksource_resume_watchdog(void)
    549{
    550	atomic_inc(&watchdog_reset_pending);
    551}
    552
    553static void clocksource_enqueue_watchdog(struct clocksource *cs)
    554{
    555	INIT_LIST_HEAD(&cs->wd_list);
    556
    557	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
    558		/* cs is a clocksource to be watched. */
    559		list_add(&cs->wd_list, &watchdog_list);
    560		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
    561	} else {
    562		/* cs is a watchdog. */
    563		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
    564			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
    565	}
    566}
    567
    568static void clocksource_select_watchdog(bool fallback)
    569{
    570	struct clocksource *cs, *old_wd;
    571	unsigned long flags;
    572
    573	spin_lock_irqsave(&watchdog_lock, flags);
    574	/* save current watchdog */
    575	old_wd = watchdog;
    576	if (fallback)
    577		watchdog = NULL;
    578
    579	list_for_each_entry(cs, &clocksource_list, list) {
    580		/* cs is a clocksource to be watched. */
    581		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
    582			continue;
    583
    584		/* Skip current if we were requested for a fallback. */
    585		if (fallback && cs == old_wd)
    586			continue;
    587
    588		/* Pick the best watchdog. */
    589		if (!watchdog || cs->rating > watchdog->rating)
    590			watchdog = cs;
    591	}
    592	/* If we failed to find a fallback restore the old one. */
    593	if (!watchdog)
    594		watchdog = old_wd;
    595
    596	/* If we changed the watchdog we need to reset cycles. */
    597	if (watchdog != old_wd)
    598		clocksource_reset_watchdog();
    599
    600	/* Check if the watchdog timer needs to be started. */
    601	clocksource_start_watchdog();
    602	spin_unlock_irqrestore(&watchdog_lock, flags);
    603}
    604
    605static void clocksource_dequeue_watchdog(struct clocksource *cs)
    606{
    607	if (cs != watchdog) {
    608		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
    609			/* cs is a watched clocksource. */
    610			list_del_init(&cs->wd_list);
    611			/* Check if the watchdog timer needs to be stopped. */
    612			clocksource_stop_watchdog();
    613		}
    614	}
    615}
    616
    617static int __clocksource_watchdog_kthread(void)
    618{
    619	struct clocksource *cs, *tmp;
    620	unsigned long flags;
    621	int select = 0;
    622
    623	/* Do any required per-CPU skew verification. */
    624	if (curr_clocksource &&
    625	    curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE &&
    626	    curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU)
    627		clocksource_verify_percpu(curr_clocksource);
    628
    629	spin_lock_irqsave(&watchdog_lock, flags);
    630	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
    631		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
    632			list_del_init(&cs->wd_list);
    633			__clocksource_change_rating(cs, 0);
    634			select = 1;
    635		}
    636		if (cs->flags & CLOCK_SOURCE_RESELECT) {
    637			cs->flags &= ~CLOCK_SOURCE_RESELECT;
    638			select = 1;
    639		}
    640	}
    641	/* Check if the watchdog timer needs to be stopped. */
    642	clocksource_stop_watchdog();
    643	spin_unlock_irqrestore(&watchdog_lock, flags);
    644
    645	return select;
    646}
    647
    648static int clocksource_watchdog_kthread(void *data)
    649{
    650	mutex_lock(&clocksource_mutex);
    651	if (__clocksource_watchdog_kthread())
    652		clocksource_select();
    653	mutex_unlock(&clocksource_mutex);
    654	return 0;
    655}
    656
    657static bool clocksource_is_watchdog(struct clocksource *cs)
    658{
    659	return cs == watchdog;
    660}
    661
    662#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
    663
    664static void clocksource_enqueue_watchdog(struct clocksource *cs)
    665{
    666	if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
    667		cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
    668}
    669
    670static void clocksource_select_watchdog(bool fallback) { }
    671static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
    672static inline void clocksource_resume_watchdog(void) { }
    673static inline int __clocksource_watchdog_kthread(void) { return 0; }
    674static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
    675void clocksource_mark_unstable(struct clocksource *cs) { }
    676
    677static inline void clocksource_watchdog_lock(unsigned long *flags) { }
    678static inline void clocksource_watchdog_unlock(unsigned long *flags) { }
    679
    680#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
    681
    682static bool clocksource_is_suspend(struct clocksource *cs)
    683{
    684	return cs == suspend_clocksource;
    685}
    686
    687static void __clocksource_suspend_select(struct clocksource *cs)
    688{
    689	/*
    690	 * Skip the clocksource which will be stopped in suspend state.
    691	 */
    692	if (!(cs->flags & CLOCK_SOURCE_SUSPEND_NONSTOP))
    693		return;
    694
    695	/*
    696	 * The nonstop clocksource can be selected as the suspend clocksource to
    697	 * calculate the suspend time, so it should not supply suspend/resume
    698	 * interfaces to suspend the nonstop clocksource when system suspends.
    699	 */
    700	if (cs->suspend || cs->resume) {
    701		pr_warn("Nonstop clocksource %s should not supply suspend/resume interfaces\n",
    702			cs->name);
    703	}
    704
    705	/* Pick the best rating. */
    706	if (!suspend_clocksource || cs->rating > suspend_clocksource->rating)
    707		suspend_clocksource = cs;
    708}
    709
    710/**
    711 * clocksource_suspend_select - Select the best clocksource for suspend timing
     712 * @fallback:	whether to select a fallback clocksource
    713 */
    714static void clocksource_suspend_select(bool fallback)
    715{
    716	struct clocksource *cs, *old_suspend;
    717
    718	old_suspend = suspend_clocksource;
    719	if (fallback)
    720		suspend_clocksource = NULL;
    721
    722	list_for_each_entry(cs, &clocksource_list, list) {
    723		/* Skip current if we were requested for a fallback. */
    724		if (fallback && cs == old_suspend)
    725			continue;
    726
    727		__clocksource_suspend_select(cs);
    728	}
    729}
    730
    731/**
    732 * clocksource_start_suspend_timing - Start measuring the suspend timing
    733 * @cs:			current clocksource from timekeeping
    734 * @start_cycles:	current cycles from timekeeping
    735 *
    736 * This function will save the start cycle values of suspend timer to calculate
    737 * the suspend time when resuming system.
    738 *
    739 * This function is called late in the suspend process from timekeeping_suspend(),
     740 * which means processes are frozen, non-boot CPUs and interrupts are disabled.
     741 * It is therefore possible to start the suspend timer without taking the
    742 * clocksource mutex.
    743 */
    744void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles)
    745{
    746	if (!suspend_clocksource)
    747		return;
    748
    749	/*
    750	 * If current clocksource is the suspend timer, we should use the
    751	 * tkr_mono.cycle_last value as suspend_start to avoid same reading
    752	 * from suspend timer.
    753	 */
    754	if (clocksource_is_suspend(cs)) {
    755		suspend_start = start_cycles;
    756		return;
    757	}
    758
    759	if (suspend_clocksource->enable &&
    760	    suspend_clocksource->enable(suspend_clocksource)) {
    761		pr_warn_once("Failed to enable the non-suspend-able clocksource.\n");
    762		return;
    763	}
    764
    765	suspend_start = suspend_clocksource->read(suspend_clocksource);
    766}
    767
    768/**
    769 * clocksource_stop_suspend_timing - Stop measuring the suspend timing
    770 * @cs:		current clocksource from timekeeping
    771 * @cycle_now:	current cycles from timekeeping
    772 *
    773 * This function will calculate the suspend time from suspend timer.
    774 *
    775 * Returns nanoseconds since suspend started, 0 if no usable suspend clocksource.
    776 *
    777 * This function is called early in the resume process from timekeeping_resume(),
     778 * which means there is only one CPU, no processes are running and interrupts
    779 * are disabled. It is therefore possible to stop the suspend timer without
    780 * taking the clocksource mutex.
    781 */
    782u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
    783{
    784	u64 now, delta, nsec = 0;
    785
    786	if (!suspend_clocksource)
    787		return 0;
    788
    789	/*
    790	 * If current clocksource is the suspend timer, we should use the
    791	 * tkr_mono.cycle_last value from timekeeping as current cycle to
    792	 * avoid same reading from suspend timer.
    793	 */
    794	if (clocksource_is_suspend(cs))
    795		now = cycle_now;
    796	else
    797		now = suspend_clocksource->read(suspend_clocksource);
    798
    799	if (now > suspend_start) {
    800		delta = clocksource_delta(now, suspend_start,
    801					  suspend_clocksource->mask);
    802		nsec = mul_u64_u32_shr(delta, suspend_clocksource->mult,
    803				       suspend_clocksource->shift);
    804	}
    805
    806	/*
    807	 * Disable the suspend timer to save power if current clocksource is
    808	 * not the suspend timer.
    809	 */
    810	if (!clocksource_is_suspend(cs) && suspend_clocksource->disable)
    811		suspend_clocksource->disable(suspend_clocksource);
    812
    813	return nsec;
    814}
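/*
 * Illustrative example (added for clarity, not in the original source):
 * assuming a 10 MHz suspend clocksource with mult = 1677721600 and
 * shift = 24 (see the clocks_calc_mult_shift() example above), a 30 second
 * suspend advances the counter by 300,000,000 cycles and
 *
 *	nsec = mul_u64_u32_shr(300000000, 1677721600, 24) = 30,000,000,000ns
 *
 * which the caller (timekeeping_resume()) accounts as time spent in suspend.
 */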
    815
    816/**
    817 * clocksource_suspend - suspend the clocksource(s)
    818 */
    819void clocksource_suspend(void)
    820{
    821	struct clocksource *cs;
    822
    823	list_for_each_entry_reverse(cs, &clocksource_list, list)
    824		if (cs->suspend)
    825			cs->suspend(cs);
    826}
    827
    828/**
    829 * clocksource_resume - resume the clocksource(s)
    830 */
    831void clocksource_resume(void)
    832{
    833	struct clocksource *cs;
    834
    835	list_for_each_entry(cs, &clocksource_list, list)
    836		if (cs->resume)
    837			cs->resume(cs);
    838
    839	clocksource_resume_watchdog();
    840}
    841
    842/**
    843 * clocksource_touch_watchdog - Update watchdog
    844 *
    845 * Update the watchdog after exception contexts such as kgdb so as not
    846 * to incorrectly trip the watchdog. This might fail when the kernel
    847 * was stopped in code which holds watchdog_lock.
    848 */
    849void clocksource_touch_watchdog(void)
    850{
    851	clocksource_resume_watchdog();
    852}
    853
    854/**
     855 * clocksource_max_adjustment - Returns max adjustment amount
    856 * @cs:         Pointer to clocksource
    857 *
    858 */
    859static u32 clocksource_max_adjustment(struct clocksource *cs)
    860{
    861	u64 ret;
    862	/*
     863	 * We won't try to correct for more than 11% adjustments (110,000 ppm).
    864	 */
    865	ret = (u64)cs->mult * 11;
     866	do_div(ret, 100);
    867	return (u32)ret;
    868}
    869
    870/**
    871 * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
    872 * @mult:	cycle to nanosecond multiplier
    873 * @shift:	cycle to nanosecond divisor (power of two)
    874 * @maxadj:	maximum adjustment value to mult (~11%)
    875 * @mask:	bitmask for two's complement subtraction of non 64 bit counters
    876 * @max_cyc:	maximum cycle value before potential overflow (does not include
    877 *		any safety margin)
    878 *
    879 * NOTE: This function includes a safety margin of 50%, in other words, we
    880 * return half the number of nanoseconds the hardware counter can technically
    881 * cover. This is done so that we can potentially detect problems caused by
    882 * delayed timers or bad hardware, which might result in time intervals that
    883 * are larger than what the math used can handle without overflows.
    884 */
    885u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
    886{
    887	u64 max_nsecs, max_cycles;
    888
    889	/*
    890	 * Calculate the maximum number of cycles that we can pass to the
    891	 * cyc2ns() function without overflowing a 64-bit result.
    892	 */
    893	max_cycles = ULLONG_MAX;
    894	do_div(max_cycles, mult+maxadj);
    895
    896	/*
    897	 * The actual maximum number of cycles we can defer the clocksource is
    898	 * determined by the minimum of max_cycles and mask.
    899	 * Note: Here we subtract the maxadj to make sure we don't sleep for
    900	 * too long if there's a large negative adjustment.
    901	 */
    902	max_cycles = min(max_cycles, mask);
    903	max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
    904
    905	/* return the max_cycles value as well if requested */
    906	if (max_cyc)
    907		*max_cyc = max_cycles;
    908
    909	/* Return 50% of the actual maximum, so we can detect bad values */
    910	max_nsecs >>= 1;
    911
    912	return max_nsecs;
    913}
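/*
 * Worked example (illustrative, not part of the original file): for the
 * 10 MHz clocksource used in the examples above with a 32-bit counter
 * (mask = 0xffffffff), mult = 1677721600, shift = 24 and
 * maxadj = mult * 11 / 100 = 184549376:
 *
 *	max_cycles = min(ULLONG_MAX / (mult + maxadj), mask) = 0xffffffff
 *	max_nsecs  = (0xffffffff * (mult - maxadj)) >> shift ~= 382 seconds
 *
 * and after the 50% safety margin the reported limit is roughly 191 seconds.
 */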
    914
    915/**
    916 * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
    917 * @cs:         Pointer to clocksource to be updated
    918 *
    919 */
    920static inline void clocksource_update_max_deferment(struct clocksource *cs)
    921{
    922	cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
    923						cs->maxadj, cs->mask,
    924						&cs->max_cycles);
    925}
    926
    927static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
    928{
    929	struct clocksource *cs;
    930
    931	if (!finished_booting || list_empty(&clocksource_list))
    932		return NULL;
    933
    934	/*
    935	 * We pick the clocksource with the highest rating. If oneshot
    936	 * mode is active, we pick the highres valid clocksource with
    937	 * the best rating.
    938	 */
    939	list_for_each_entry(cs, &clocksource_list, list) {
    940		if (skipcur && cs == curr_clocksource)
    941			continue;
    942		if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
    943			continue;
    944		return cs;
    945	}
    946	return NULL;
    947}
    948
    949static void __clocksource_select(bool skipcur)
    950{
    951	bool oneshot = tick_oneshot_mode_active();
    952	struct clocksource *best, *cs;
    953
    954	/* Find the best suitable clocksource */
    955	best = clocksource_find_best(oneshot, skipcur);
    956	if (!best)
    957		return;
    958
    959	if (!strlen(override_name))
    960		goto found;
    961
    962	/* Check for the override clocksource. */
    963	list_for_each_entry(cs, &clocksource_list, list) {
    964		if (skipcur && cs == curr_clocksource)
    965			continue;
    966		if (strcmp(cs->name, override_name) != 0)
    967			continue;
    968		/*
    969		 * Check to make sure we don't switch to a non-highres
    970		 * capable clocksource if the tick code is in oneshot
    971		 * mode (highres or nohz)
    972		 */
    973		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
    974			/* Override clocksource cannot be used. */
    975			if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
    976				pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
    977					cs->name);
    978				override_name[0] = 0;
    979			} else {
    980				/*
    981				 * The override cannot be currently verified.
    982				 * Deferring to let the watchdog check.
    983				 */
    984				pr_info("Override clocksource %s is not currently HRT compatible - deferring\n",
    985					cs->name);
    986			}
    987		} else
    988			/* Override clocksource can be used. */
    989			best = cs;
    990		break;
    991	}
    992
    993found:
    994	if (curr_clocksource != best && !timekeeping_notify(best)) {
    995		pr_info("Switched to clocksource %s\n", best->name);
    996		curr_clocksource = best;
    997	}
    998}
    999
   1000/**
   1001 * clocksource_select - Select the best clocksource available
   1002 *
   1003 * Private function. Must hold clocksource_mutex when called.
   1004 *
   1005 * Select the clocksource with the best rating, or the clocksource,
   1006 * which is selected by userspace override.
   1007 */
   1008static void clocksource_select(void)
   1009{
   1010	__clocksource_select(false);
   1011}
   1012
   1013static void clocksource_select_fallback(void)
   1014{
   1015	__clocksource_select(true);
   1016}
   1017
   1018/*
   1019 * clocksource_done_booting - Called near the end of core bootup
   1020 *
   1021 * Hack to avoid lots of clocksource churn at boot time.
   1022 * We use fs_initcall because we want this to start before
   1023 * device_initcall but after subsys_initcall.
   1024 */
   1025static int __init clocksource_done_booting(void)
   1026{
   1027	mutex_lock(&clocksource_mutex);
   1028	curr_clocksource = clocksource_default_clock();
   1029	finished_booting = 1;
   1030	/*
   1031	 * Run the watchdog first to eliminate unstable clock sources
   1032	 */
   1033	__clocksource_watchdog_kthread();
   1034	clocksource_select();
   1035	mutex_unlock(&clocksource_mutex);
   1036	return 0;
   1037}
   1038fs_initcall(clocksource_done_booting);
   1039
   1040/*
   1041 * Enqueue the clocksource sorted by rating
   1042 */
   1043static void clocksource_enqueue(struct clocksource *cs)
   1044{
   1045	struct list_head *entry = &clocksource_list;
   1046	struct clocksource *tmp;
   1047
   1048	list_for_each_entry(tmp, &clocksource_list, list) {
    1049		/* Keep track of the place where to insert */
   1050		if (tmp->rating < cs->rating)
   1051			break;
   1052		entry = &tmp->list;
   1053	}
   1054	list_add(&cs->list, entry);
   1055}
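/*
 * Note (added for context): the list is kept sorted by descending rating, so
 * clocksource_find_best() above can return the first entry that satisfies
 * its constraints. Rating ranges follow the guidance in
 * include/linux/clocksource.h: 1-99 unfit for real use, 100-199 base level,
 * 200-299 good, 300-399 desired, 400-499 perfect.
 */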
   1056
   1057/**
    1058 * __clocksource_update_freq_scale - Used to update clocksource with new freq
   1059 * @cs:		clocksource to be registered
   1060 * @scale:	Scale factor multiplied against freq to get clocksource hz
   1061 * @freq:	clocksource frequency (cycles per second) divided by scale
   1062 *
   1063 * This should only be called from the clocksource->enable() method.
   1064 *
   1065 * This *SHOULD NOT* be called directly! Please use the
   1066 * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
   1067 * functions.
   1068 */
   1069void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
   1070{
   1071	u64 sec;
   1072
   1073	/*
   1074	 * Default clocksources are *special* and self-define their mult/shift.
   1075	 * But, you're not special, so you should specify a freq value.
   1076	 */
   1077	if (freq) {
   1078		/*
   1079		 * Calc the maximum number of seconds which we can run before
   1080		 * wrapping around. For clocksources which have a mask > 32-bit
   1081		 * we need to limit the max sleep time to have a good
   1082		 * conversion precision. 10 minutes is still a reasonable
   1083		 * amount. That results in a shift value of 24 for a
   1084		 * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
   1085		 * ~ 0.06ppm granularity for NTP.
   1086		 */
   1087		sec = cs->mask;
   1088		do_div(sec, freq);
   1089		do_div(sec, scale);
   1090		if (!sec)
   1091			sec = 1;
   1092		else if (sec > 600 && cs->mask > UINT_MAX)
   1093			sec = 600;
   1094
   1095		clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
   1096				       NSEC_PER_SEC / scale, sec * scale);
   1097	}
   1098
   1099	/*
   1100	 * If the uncertainty margin is not specified, calculate it.
   1101	 * If both scale and freq are non-zero, calculate the clock
   1102	 * period, but bound below at 2*WATCHDOG_MAX_SKEW.  However,
   1103	 * if either of scale or freq is zero, be very conservative and
   1104	 * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the
   1105	 * uncertainty margin.  Allow stupidly small uncertainty margins
   1106	 * to be specified by the caller for testing purposes, but warn
   1107	 * to discourage production use of this capability.
   1108	 */
   1109	if (scale && freq && !cs->uncertainty_margin) {
   1110		cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
   1111		if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW)
   1112			cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW;
   1113	} else if (!cs->uncertainty_margin) {
   1114		cs->uncertainty_margin = WATCHDOG_THRESHOLD;
   1115	}
   1116	WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW);
   1117
   1118	/*
   1119	 * Ensure clocksources that have large 'mult' values don't overflow
   1120	 * when adjusted.
   1121	 */
   1122	cs->maxadj = clocksource_max_adjustment(cs);
   1123	while (freq && ((cs->mult + cs->maxadj < cs->mult)
   1124		|| (cs->mult - cs->maxadj > cs->mult))) {
   1125		cs->mult >>= 1;
   1126		cs->shift--;
   1127		cs->maxadj = clocksource_max_adjustment(cs);
   1128	}
   1129
   1130	/*
   1131	 * Only warn for *special* clocksources that self-define
   1132	 * their mult/shift values and don't specify a freq.
   1133	 */
   1134	WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
   1135		"timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
   1136		cs->name);
   1137
   1138	clocksource_update_max_deferment(cs);
   1139
   1140	pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
   1141		cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
   1142}
   1143EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
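/*
 * Worked example (illustrative, not from the original source): registering a
 * 10 MHz clocksource with scale = 1 and freq = 10000000 computes a raw
 * uncertainty margin of NSEC_PER_SEC / (scale * freq) = 100ns, which is then
 * raised to the 2 * WATCHDOG_MAX_SKEW = 200,000ns floor. With a 32-bit mask
 * the counter wraps after ~429 seconds, so mult/shift are calculated for that
 * range instead of the 600 second cap applied to wider counters.
 */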
   1144
   1145/**
   1146 * __clocksource_register_scale - Used to install new clocksources
   1147 * @cs:		clocksource to be registered
   1148 * @scale:	Scale factor multiplied against freq to get clocksource hz
   1149 * @freq:	clocksource frequency (cycles per second) divided by scale
   1150 *
   1151 * Returns -EBUSY if registration fails, zero otherwise.
   1152 *
   1153 * This *SHOULD NOT* be called directly! Please use the
    1154 * clocksource_register_hz() or clocksource_register_khz() helper functions.
   1155 */
   1156int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
   1157{
   1158	unsigned long flags;
   1159
   1160	clocksource_arch_init(cs);
   1161
   1162	if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
   1163		cs->id = CSID_GENERIC;
   1164	if (cs->vdso_clock_mode < 0 ||
   1165	    cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
   1166		pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
   1167			cs->name, cs->vdso_clock_mode);
   1168		cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE;
   1169	}
   1170
   1171	/* Initialize mult/shift and max_idle_ns */
   1172	__clocksource_update_freq_scale(cs, scale, freq);
   1173
   1174	/* Add clocksource to the clocksource list */
   1175	mutex_lock(&clocksource_mutex);
   1176
   1177	clocksource_watchdog_lock(&flags);
   1178	clocksource_enqueue(cs);
   1179	clocksource_enqueue_watchdog(cs);
   1180	clocksource_watchdog_unlock(&flags);
   1181
   1182	clocksource_select();
   1183	clocksource_select_watchdog(false);
   1184	__clocksource_suspend_select(cs);
   1185	mutex_unlock(&clocksource_mutex);
   1186	return 0;
   1187}
   1188EXPORT_SYMBOL_GPL(__clocksource_register_scale);
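/*
 * Example usage from a driver's point of view (a minimal sketch added for
 * illustration; example_cs, example_read() and example_counter_reg are
 * hypothetical and not part of this file). A driver fills in a struct
 * clocksource and registers it through the clocksource_register_hz()/
 * clocksource_register_khz() wrappers, which call
 * __clocksource_register_scale() with scale 1 or 1000:
 *
 *	static u64 example_read(struct clocksource *cs)
 *	{
 *		return readl(example_counter_reg);	// hypothetical MMIO counter
 *	}
 *
 *	static struct clocksource example_cs = {
 *		.name	= "example",
 *		.rating	= 200,
 *		.read	= example_read,
 *		.mask	= CLOCKSOURCE_MASK(32),
 *		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
 *	};
 *
 *	clocksource_register_hz(&example_cs, 10000000);	// 10 MHz counter
 */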
   1189
   1190static void __clocksource_change_rating(struct clocksource *cs, int rating)
   1191{
   1192	list_del(&cs->list);
   1193	cs->rating = rating;
   1194	clocksource_enqueue(cs);
   1195}
   1196
   1197/**
   1198 * clocksource_change_rating - Change the rating of a registered clocksource
   1199 * @cs:		clocksource to be changed
   1200 * @rating:	new rating
   1201 */
   1202void clocksource_change_rating(struct clocksource *cs, int rating)
   1203{
   1204	unsigned long flags;
   1205
   1206	mutex_lock(&clocksource_mutex);
   1207	clocksource_watchdog_lock(&flags);
   1208	__clocksource_change_rating(cs, rating);
   1209	clocksource_watchdog_unlock(&flags);
   1210
   1211	clocksource_select();
   1212	clocksource_select_watchdog(false);
   1213	clocksource_suspend_select(false);
   1214	mutex_unlock(&clocksource_mutex);
   1215}
   1216EXPORT_SYMBOL(clocksource_change_rating);
   1217
   1218/*
   1219 * Unbind clocksource @cs. Called with clocksource_mutex held
   1220 */
   1221static int clocksource_unbind(struct clocksource *cs)
   1222{
   1223	unsigned long flags;
   1224
   1225	if (clocksource_is_watchdog(cs)) {
   1226		/* Select and try to install a replacement watchdog. */
   1227		clocksource_select_watchdog(true);
   1228		if (clocksource_is_watchdog(cs))
   1229			return -EBUSY;
   1230	}
   1231
   1232	if (cs == curr_clocksource) {
   1233		/* Select and try to install a replacement clock source */
   1234		clocksource_select_fallback();
   1235		if (curr_clocksource == cs)
   1236			return -EBUSY;
   1237	}
   1238
   1239	if (clocksource_is_suspend(cs)) {
   1240		/*
   1241		 * Select and try to install a replacement suspend clocksource.
   1242		 * If no replacement suspend clocksource, we will just let the
   1243		 * clocksource go and have no suspend clocksource.
   1244		 */
   1245		clocksource_suspend_select(true);
   1246	}
   1247
   1248	clocksource_watchdog_lock(&flags);
   1249	clocksource_dequeue_watchdog(cs);
   1250	list_del_init(&cs->list);
   1251	clocksource_watchdog_unlock(&flags);
   1252
   1253	return 0;
   1254}
   1255
   1256/**
   1257 * clocksource_unregister - remove a registered clocksource
   1258 * @cs:	clocksource to be unregistered
   1259 */
   1260int clocksource_unregister(struct clocksource *cs)
   1261{
   1262	int ret = 0;
   1263
   1264	mutex_lock(&clocksource_mutex);
   1265	if (!list_empty(&cs->list))
   1266		ret = clocksource_unbind(cs);
   1267	mutex_unlock(&clocksource_mutex);
   1268	return ret;
   1269}
   1270EXPORT_SYMBOL(clocksource_unregister);
   1271
   1272#ifdef CONFIG_SYSFS
   1273/**
   1274 * current_clocksource_show - sysfs interface for current clocksource
   1275 * @dev:	unused
   1276 * @attr:	unused
    1277 * @buf:	char buffer to be filled with the name of the current clocksource
    1278 *
    1279 * Provides sysfs interface for showing the current clocksource.
   1280 */
   1281static ssize_t current_clocksource_show(struct device *dev,
   1282					struct device_attribute *attr,
   1283					char *buf)
   1284{
   1285	ssize_t count = 0;
   1286
   1287	mutex_lock(&clocksource_mutex);
   1288	count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
   1289	mutex_unlock(&clocksource_mutex);
   1290
   1291	return count;
   1292}
   1293
   1294ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
   1295{
   1296	size_t ret = cnt;
   1297
   1298	/* strings from sysfs write are not 0 terminated! */
   1299	if (!cnt || cnt >= CS_NAME_LEN)
   1300		return -EINVAL;
   1301
    1302	/* strip off the trailing \n: */
   1303	if (buf[cnt-1] == '\n')
   1304		cnt--;
   1305	if (cnt > 0)
   1306		memcpy(dst, buf, cnt);
   1307	dst[cnt] = 0;
   1308	return ret;
   1309}
   1310
   1311/**
   1312 * current_clocksource_store - interface for manually overriding clocksource
   1313 * @dev:	unused
   1314 * @attr:	unused
   1315 * @buf:	name of override clocksource
   1316 * @count:	length of buffer
   1317 *
   1318 * Takes input from sysfs interface for manually overriding the default
   1319 * clocksource selection.
   1320 */
   1321static ssize_t current_clocksource_store(struct device *dev,
   1322					 struct device_attribute *attr,
   1323					 const char *buf, size_t count)
   1324{
   1325	ssize_t ret;
   1326
   1327	mutex_lock(&clocksource_mutex);
   1328
   1329	ret = sysfs_get_uname(buf, override_name, count);
   1330	if (ret >= 0)
   1331		clocksource_select();
   1332
   1333	mutex_unlock(&clocksource_mutex);
   1334
   1335	return ret;
   1336}
   1337static DEVICE_ATTR_RW(current_clocksource);
   1338
   1339/**
   1340 * unbind_clocksource_store - interface for manually unbinding clocksource
   1341 * @dev:	unused
   1342 * @attr:	unused
   1343 * @buf:	unused
   1344 * @count:	length of buffer
   1345 *
   1346 * Takes input from sysfs interface for manually unbinding a clocksource.
   1347 */
   1348static ssize_t unbind_clocksource_store(struct device *dev,
   1349					struct device_attribute *attr,
   1350					const char *buf, size_t count)
   1351{
   1352	struct clocksource *cs;
   1353	char name[CS_NAME_LEN];
   1354	ssize_t ret;
   1355
   1356	ret = sysfs_get_uname(buf, name, count);
   1357	if (ret < 0)
   1358		return ret;
   1359
   1360	ret = -ENODEV;
   1361	mutex_lock(&clocksource_mutex);
   1362	list_for_each_entry(cs, &clocksource_list, list) {
   1363		if (strcmp(cs->name, name))
   1364			continue;
   1365		ret = clocksource_unbind(cs);
   1366		break;
   1367	}
   1368	mutex_unlock(&clocksource_mutex);
   1369
   1370	return ret ? ret : count;
   1371}
   1372static DEVICE_ATTR_WO(unbind_clocksource);
   1373
   1374/**
    1375 * available_clocksource_show - sysfs interface for listing clocksources
   1376 * @dev:	unused
   1377 * @attr:	unused
   1378 * @buf:	char buffer to be filled with clocksource list
   1379 *
   1380 * Provides sysfs interface for listing registered clocksources
   1381 */
   1382static ssize_t available_clocksource_show(struct device *dev,
   1383					  struct device_attribute *attr,
   1384					  char *buf)
   1385{
   1386	struct clocksource *src;
   1387	ssize_t count = 0;
   1388
   1389	mutex_lock(&clocksource_mutex);
   1390	list_for_each_entry(src, &clocksource_list, list) {
   1391		/*
   1392		 * Don't show non-HRES clocksource if the tick code is
   1393		 * in one shot mode (highres=on or nohz=on)
   1394		 */
   1395		if (!tick_oneshot_mode_active() ||
   1396		    (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
   1397			count += snprintf(buf + count,
   1398				  max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
   1399				  "%s ", src->name);
   1400	}
   1401	mutex_unlock(&clocksource_mutex);
   1402
   1403	count += snprintf(buf + count,
   1404			  max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
   1405
   1406	return count;
   1407}
   1408static DEVICE_ATTR_RO(available_clocksource);
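/*
 * Usage note (added for illustration): once init_clocksource_sysfs() below
 * has run, these attributes appear under
 * /sys/devices/system/clocksource/clocksource0/, e.g.
 *
 *	$ cat /sys/devices/system/clocksource/clocksource0/available_clocksource
 *	tsc hpet acpi_pm
 *	$ echo hpet > /sys/devices/system/clocksource/clocksource0/current_clocksource
 *
 * (the list of available clocksources is hardware dependent; the names shown
 * here are only an example).
 */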
   1409
   1410static struct attribute *clocksource_attrs[] = {
   1411	&dev_attr_current_clocksource.attr,
   1412	&dev_attr_unbind_clocksource.attr,
   1413	&dev_attr_available_clocksource.attr,
   1414	NULL
   1415};
   1416ATTRIBUTE_GROUPS(clocksource);
   1417
   1418static struct bus_type clocksource_subsys = {
   1419	.name = "clocksource",
   1420	.dev_name = "clocksource",
   1421};
   1422
   1423static struct device device_clocksource = {
   1424	.id	= 0,
   1425	.bus	= &clocksource_subsys,
   1426	.groups	= clocksource_groups,
   1427};
   1428
   1429static int __init init_clocksource_sysfs(void)
   1430{
   1431	int error = subsys_system_register(&clocksource_subsys, NULL);
   1432
   1433	if (!error)
   1434		error = device_register(&device_clocksource);
   1435
   1436	return error;
   1437}
   1438
   1439device_initcall(init_clocksource_sysfs);
   1440#endif /* CONFIG_SYSFS */
   1441
   1442/**
   1443 * boot_override_clocksource - boot clock override
   1444 * @str:	override name
   1445 *
   1446 * Takes a clocksource= boot argument and uses it
   1447 * as the clocksource override name.
   1448 */
   1449static int __init boot_override_clocksource(char* str)
   1450{
   1451	mutex_lock(&clocksource_mutex);
   1452	if (str)
   1453		strlcpy(override_name, str, sizeof(override_name));
   1454	mutex_unlock(&clocksource_mutex);
   1455	return 1;
   1456}
   1457
   1458__setup("clocksource=", boot_override_clocksource);
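/*
 * Example (added for illustration): booting with "clocksource=hpet" on the
 * kernel command line sets the override, so clocksource_select() will prefer
 * the hpet clocksource over higher rated ones, provided it is registered and
 * usable in the current tick mode.
 */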
   1459
   1460/**
   1461 * boot_override_clock - Compatibility layer for deprecated boot option
   1462 * @str:	override name
   1463 *
   1464 * DEPRECATED! Takes a clock= boot argument and uses it
   1465 * as the clocksource override name
   1466 */
   1467static int __init boot_override_clock(char* str)
   1468{
   1469	if (!strcmp(str, "pmtmr")) {
   1470		pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n");
   1471		return boot_override_clocksource("acpi_pm");
   1472	}
   1473	pr_warn("clock= boot option is deprecated - use clocksource=xyz\n");
   1474	return boot_override_clocksource(str);
   1475}
   1476
   1477__setup("clock=", boot_override_clock);