cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

cputime.c (26800B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Simple CPU accounting cgroup controller
      4 */
      5
      6#ifdef CONFIG_IRQ_TIME_ACCOUNTING
      7
      8/*
      9 * There are no locks covering percpu hardirq/softirq time.
      10 * They are only modified in vtime_account, on the corresponding CPU
      11 * with interrupts disabled, so writes are safe.
      12 * They are read and saved off onto struct rq in update_rq_clock().
      13 * This may result in another CPU reading this CPU's irq time, racing with
      14 * irq/vtime_account on this CPU. We would either get the old or the new
      15 * value, with the side effect of accounting a slice of irq time to the
      16 * wrong task when an irq is in progress while we read rq->clock. That is
      17 * a worthwhile compromise compared to taking locks on each irq in account_system_time.
     18 */
     19DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
     20
     21static int sched_clock_irqtime;
     22
     23void enable_sched_clock_irqtime(void)
     24{
     25	sched_clock_irqtime = 1;
     26}
     27
     28void disable_sched_clock_irqtime(void)
     29{
     30	sched_clock_irqtime = 0;
     31}
     32
     33static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
     34				  enum cpu_usage_stat idx)
     35{
     36	u64 *cpustat = kcpustat_this_cpu->cpustat;
     37
     38	u64_stats_update_begin(&irqtime->sync);
     39	cpustat[idx] += delta;
     40	irqtime->total += delta;
     41	irqtime->tick_delta += delta;
     42	u64_stats_update_end(&irqtime->sync);
     43}
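
For illustration only (not part of cputime.c): the u64_stats_update_begin()/
u64_stats_update_end() pair above is the writer half of the lockless scheme
described in the comment at the top of this block. A minimal reader-side
sketch follows; the in-tree reader, irq_time_read() in kernel/sched/sched.h,
uses the same retry pattern, and the helper name irqtime_total_read is made
up here.

/* Read one CPU's accumulated irq time without locks, retrying if the
 * writer on that CPU updated it concurrently (the seqcount only does real
 * work on 32-bit, where it guards against torn 64-bit reads). */
static u64 irqtime_total_read(int cpu)
{
	struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
	unsigned int seq;
	u64 total;

	do {
		seq = u64_stats_fetch_begin(&irqtime->sync);
		total = irqtime->total;
	} while (u64_stats_fetch_retry(&irqtime->sync, seq));

	return total;
}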
     44
     45/*
     46 * Called after incrementing preempt_count on {soft,}irq_enter
     47 * and before decrementing preempt_count on {soft,}irq_exit.
     48 */
     49void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
     50{
     51	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
     52	unsigned int pc;
     53	s64 delta;
     54	int cpu;
     55
     56	if (!sched_clock_irqtime)
     57		return;
     58
     59	cpu = smp_processor_id();
     60	delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
     61	irqtime->irq_start_time += delta;
     62	pc = irq_count() - offset;
     63
     64	/*
     65	 * We do not account for softirq time from ksoftirqd here.
      66	 * We want to keep accounting softirq time to the ksoftirqd thread
      67	 * in that case, so as not to confuse the scheduler with a special
      68	 * task that does not consume any time but still wants to run.
     69	 */
     70	if (pc & HARDIRQ_MASK)
     71		irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
     72	else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
     73		irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
     74}
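
To make the pc & HARDIRQ_MASK / pc & SOFTIRQ_OFFSET tests above concrete, the
standalone program below models the classification (illustration only, not
kernel code). The bit-layout constants mirror include/linux/preempt.h, it
assumes callers pass the offset they just added on entry and 0 on exit (as
the account_{hard,soft}irq_{enter,exit} helpers in include/linux/vtime.h do),
and the ksoftirqd special case is left out.

#include <stdio.h>

#define SOFTIRQ_SHIFT	8
#define HARDIRQ_SHIFT	16
#define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)		/* 0x00100 */
#define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)		/* 0x10000 */
#define HARDIRQ_MASK	(0xfUL << HARDIRQ_SHIFT)	/* 0xf0000 */

static const char *classify(unsigned long irq_count, unsigned long offset)
{
	/* Strip the count added by the transition being handled right now;
	 * what remains is the context the elapsed delta was spent in. */
	unsigned long pc = irq_count - offset;

	if (pc & HARDIRQ_MASK)
		return "CPUTIME_IRQ";
	if (pc & SOFTIRQ_OFFSET)
		return "CPUTIME_SOFTIRQ";
	return "task time (left for the tick to account)";
}

int main(void)
{
	/* hardirq entry from plain task context */
	printf("%s\n", classify(HARDIRQ_OFFSET, HARDIRQ_OFFSET));
	/* hardirq entry that interrupted a softirq */
	printf("%s\n", classify(HARDIRQ_OFFSET | SOFTIRQ_OFFSET, HARDIRQ_OFFSET));
	/* hardirq exit: the count still includes the hardirq being left */
	printf("%s\n", classify(HARDIRQ_OFFSET, 0));
	return 0;
}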
     75
     76static u64 irqtime_tick_accounted(u64 maxtime)
     77{
     78	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
     79	u64 delta;
     80
     81	delta = min(irqtime->tick_delta, maxtime);
     82	irqtime->tick_delta -= delta;
     83
     84	return delta;
     85}
     86
     87#else /* CONFIG_IRQ_TIME_ACCOUNTING */
     88
     89#define sched_clock_irqtime	(0)
     90
     91static u64 irqtime_tick_accounted(u64 dummy)
     92{
     93	return 0;
     94}
     95
     96#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
     97
     98static inline void task_group_account_field(struct task_struct *p, int index,
     99					    u64 tmp)
    100{
    101	/*
    102	 * Since all updates are sure to touch the root cgroup, we
     103	 * go ahead and touch it first. If the root cgroup
    104	 * is the only cgroup, then nothing else should be necessary.
    105	 *
    106	 */
    107	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
    108
    109	cgroup_account_cputime_field(p, index, tmp);
    110}
    111
    112/*
    113 * Account user CPU time to a process.
    114 * @p: the process that the CPU time gets accounted to
    115 * @cputime: the CPU time spent in user space since the last update
    116 */
    117void account_user_time(struct task_struct *p, u64 cputime)
    118{
    119	int index;
    120
    121	/* Add user time to process. */
    122	p->utime += cputime;
    123	account_group_user_time(p, cputime);
    124
    125	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
    126
    127	/* Add user time to cpustat. */
    128	task_group_account_field(p, index, cputime);
    129
    130	/* Account for user time used */
    131	acct_account_cputime(p);
    132}
    133
    134/*
    135 * Account guest CPU time to a process.
    136 * @p: the process that the CPU time gets accounted to
    137 * @cputime: the CPU time spent in virtual machine since the last update
    138 */
    139void account_guest_time(struct task_struct *p, u64 cputime)
    140{
    141	u64 *cpustat = kcpustat_this_cpu->cpustat;
    142
    143	/* Add guest time to process. */
    144	p->utime += cputime;
    145	account_group_user_time(p, cputime);
    146	p->gtime += cputime;
    147
    148	/* Add guest time to cpustat. */
    149	if (task_nice(p) > 0) {
    150		task_group_account_field(p, CPUTIME_NICE, cputime);
    151		cpustat[CPUTIME_GUEST_NICE] += cputime;
    152	} else {
    153		task_group_account_field(p, CPUTIME_USER, cputime);
    154		cpustat[CPUTIME_GUEST] += cputime;
    155	}
    156}
    157
    158/*
    159 * Account system CPU time to a process and desired cpustat field
    160 * @p: the process that the CPU time gets accounted to
    161 * @cputime: the CPU time spent in kernel space since the last update
     162 * @index: cpustat field that has to be updated
    163 */
    164void account_system_index_time(struct task_struct *p,
    165			       u64 cputime, enum cpu_usage_stat index)
    166{
    167	/* Add system time to process. */
    168	p->stime += cputime;
    169	account_group_system_time(p, cputime);
    170
    171	/* Add system time to cpustat. */
    172	task_group_account_field(p, index, cputime);
    173
    174	/* Account for system time used */
    175	acct_account_cputime(p);
    176}
    177
    178/*
    179 * Account system CPU time to a process.
    180 * @p: the process that the CPU time gets accounted to
    181 * @hardirq_offset: the offset to subtract from hardirq_count()
    182 * @cputime: the CPU time spent in kernel space since the last update
    183 */
    184void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
    185{
    186	int index;
    187
    188	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
    189		account_guest_time(p, cputime);
    190		return;
    191	}
    192
    193	if (hardirq_count() - hardirq_offset)
    194		index = CPUTIME_IRQ;
    195	else if (in_serving_softirq())
    196		index = CPUTIME_SOFTIRQ;
    197	else
    198		index = CPUTIME_SYSTEM;
    199
    200	account_system_index_time(p, cputime, index);
    201}
    202
    203/*
    204 * Account for involuntary wait time.
    205 * @cputime: the CPU time spent in involuntary wait
    206 */
    207void account_steal_time(u64 cputime)
    208{
    209	u64 *cpustat = kcpustat_this_cpu->cpustat;
    210
    211	cpustat[CPUTIME_STEAL] += cputime;
    212}
    213
    214/*
    215 * Account for idle time.
    216 * @cputime: the CPU time spent in idle wait
    217 */
    218void account_idle_time(u64 cputime)
    219{
    220	u64 *cpustat = kcpustat_this_cpu->cpustat;
    221	struct rq *rq = this_rq();
    222
    223	if (atomic_read(&rq->nr_iowait) > 0)
    224		cpustat[CPUTIME_IOWAIT] += cputime;
    225	else
    226		cpustat[CPUTIME_IDLE] += cputime;
    227}
    228
    229/*
    230 * When a guest is interrupted for a longer amount of time, missed clock
    231 * ticks are not redelivered later. Due to that, this function may on
    232 * occasion account more time than the calling functions think elapsed.
    233 */
    234static __always_inline u64 steal_account_process_time(u64 maxtime)
    235{
    236#ifdef CONFIG_PARAVIRT
    237	if (static_key_false(&paravirt_steal_enabled)) {
    238		u64 steal;
    239
    240		steal = paravirt_steal_clock(smp_processor_id());
    241		steal -= this_rq()->prev_steal_time;
    242		steal = min(steal, maxtime);
    243		account_steal_time(steal);
    244		this_rq()->prev_steal_time += steal;
    245
    246		return steal;
    247	}
    248#endif
    249	return 0;
    250}
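
steal_account_process_time() charges the increase of the hypervisor's steal
clock since the previous call, caps it at @maxtime, and advances
prev_steal_time only by the amount actually accounted, so any excess is
picked up by later calls. A standalone numeric model (illustration only, not
kernel code):

#include <stdio.h>
#include <stdint.h>

/* Model of rq->prev_steal_time: the portion of the hypervisor's "stolen ns"
 * counter that has already been accounted. */
static uint64_t prev_steal_time;

static uint64_t steal_account(uint64_t steal_clock, uint64_t maxtime)
{
	uint64_t steal = steal_clock - prev_steal_time;

	if (steal > maxtime)
		steal = maxtime;
	prev_steal_time += steal;	/* only advance by what was accounted */
	return steal;
}

int main(void)
{
	/* The hypervisor reports 3 ms stolen in total, but each call may only
	 * hand out a 1 ms tick: the remainder is drained by later calls. */
	for (int i = 0; i < 3; i++)
		printf("accounted %llu ns\n",
		       (unsigned long long)steal_account(3000000, 1000000));
	return 0;
}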
    251
    252/*
    253 * Account how much elapsed time was spent in steal, irq, or softirq time.
    254 */
    255static inline u64 account_other_time(u64 max)
    256{
    257	u64 accounted;
    258
    259	lockdep_assert_irqs_disabled();
    260
    261	accounted = steal_account_process_time(max);
    262
    263	if (accounted < max)
    264		accounted += irqtime_tick_accounted(max - accounted);
    265
    266	return accounted;
    267}
    268
    269#ifdef CONFIG_64BIT
    270static inline u64 read_sum_exec_runtime(struct task_struct *t)
    271{
    272	return t->se.sum_exec_runtime;
    273}
    274#else
    275static u64 read_sum_exec_runtime(struct task_struct *t)
    276{
    277	u64 ns;
    278	struct rq_flags rf;
    279	struct rq *rq;
    280
    281	rq = task_rq_lock(t, &rf);
    282	ns = t->se.sum_exec_runtime;
    283	task_rq_unlock(rq, t, &rf);
    284
    285	return ns;
    286}
    287#endif
    288
    289/*
    290 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
    291 * tasks (sum on group iteration) belonging to @tsk's group.
    292 */
    293void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
    294{
    295	struct signal_struct *sig = tsk->signal;
    296	u64 utime, stime;
    297	struct task_struct *t;
    298	unsigned int seq, nextseq;
    299	unsigned long flags;
    300
    301	/*
    302	 * Update current task runtime to account pending time since last
    303	 * scheduler action or thread_group_cputime() call. This thread group
    304	 * might have other running tasks on different CPUs, but updating
     305	 * their runtime can affect syscall performance, so we skip accounting
    306	 * those pending times and rely only on values updated on tick or
    307	 * other scheduler action.
    308	 */
    309	if (same_thread_group(current, tsk))
    310		(void) task_sched_runtime(current);
    311
    312	rcu_read_lock();
    313	/* Attempt a lockless read on the first round. */
    314	nextseq = 0;
    315	do {
    316		seq = nextseq;
    317		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
    318		times->utime = sig->utime;
    319		times->stime = sig->stime;
    320		times->sum_exec_runtime = sig->sum_sched_runtime;
    321
    322		for_each_thread(tsk, t) {
    323			task_cputime(t, &utime, &stime);
    324			times->utime += utime;
    325			times->stime += stime;
    326			times->sum_exec_runtime += read_sum_exec_runtime(t);
    327		}
    328		/* If lockless access failed, take the lock. */
    329		nextseq = 1;
    330	} while (need_seqretry(&sig->stats_lock, seq));
    331	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
    332	rcu_read_unlock();
    333}
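
A hypothetical wrapper, to show how the accumulated totals are meant to be
consumed (the helper below is made up for illustration; the in-tree consumer
in this file is thread_group_cputime_adjusted() further down):

/* Illustrative only: whole-thread-group CPU consumption, split into the
 * tick-sampled user+system total and the raw scheduler runtime. */
static void thread_group_totals(struct task_struct *task,
				u64 *accounted_ns, u64 *runtime_ns)
{
	struct task_cputime cputime;

	thread_group_cputime(task, &cputime);
	*accounted_ns = cputime.utime + cputime.stime;
	*runtime_ns = cputime.sum_exec_runtime;
}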
    334
    335#ifdef CONFIG_IRQ_TIME_ACCOUNTING
    336/*
    337 * Account a tick to a process and cpustat
    338 * @p: the process that the CPU time gets accounted to
    339 * @user_tick: is the tick from userspace
     340 * @ticks: number of ticks being accounted
    341 *
    342 * Tick demultiplexing follows the order
    343 * - pending hardirq update
    344 * - pending softirq update
    345 * - user_time
    346 * - idle_time
    347 * - system time
    348 *   - check for guest_time
    349 *   - else account as system_time
    350 *
     351 * The check for hardirq is done for both system and user time, as there
     352 * is no timer going off while we are in a hardirq and hence we may never
     353 * get an opportunity to update it solely in system time.
     354 * p->stime and friends are only updated on system time and not on irq or
     355 * softirq time, as those no longer count in the task's exec_runtime.
    356 */
    357static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
    358					 int ticks)
    359{
    360	u64 other, cputime = TICK_NSEC * ticks;
    361
    362	/*
    363	 * When returning from idle, many ticks can get accounted at
    364	 * once, including some ticks of steal, irq, and softirq time.
    365	 * Subtract those ticks from the amount of time accounted to
    366	 * idle, or potentially user or system time. Due to rounding,
    367	 * other time can exceed ticks occasionally.
    368	 */
    369	other = account_other_time(ULONG_MAX);
    370	if (other >= cputime)
    371		return;
    372
    373	cputime -= other;
    374
    375	if (this_cpu_ksoftirqd() == p) {
    376		/*
     377		 * ksoftirqd time does not get accounted in cpu_softirq_time.
    378		 * So, we have to handle it separately here.
    379		 * Also, p->stime needs to be updated for ksoftirqd.
    380		 */
    381		account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
    382	} else if (user_tick) {
    383		account_user_time(p, cputime);
    384	} else if (p == this_rq()->idle) {
    385		account_idle_time(cputime);
    386	} else if (p->flags & PF_VCPU) { /* System time or guest time */
    387		account_guest_time(p, cputime);
    388	} else {
    389		account_system_index_time(p, cputime, CPUTIME_SYSTEM);
    390	}
    391}
    392
    393static void irqtime_account_idle_ticks(int ticks)
    394{
    395	irqtime_account_process_tick(current, 0, ticks);
    396}
    397#else /* CONFIG_IRQ_TIME_ACCOUNTING */
    398static inline void irqtime_account_idle_ticks(int ticks) { }
    399static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
    400						int nr_ticks) { }
    401#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
    402
    403/*
    404 * Use precise platform statistics if available:
    405 */
    406#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
    407
    408# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
    409void vtime_task_switch(struct task_struct *prev)
    410{
    411	if (is_idle_task(prev))
    412		vtime_account_idle(prev);
    413	else
    414		vtime_account_kernel(prev);
    415
    416	vtime_flush(prev);
    417	arch_vtime_task_switch(prev);
    418}
    419# endif
    420
    421void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
    422{
    423	unsigned int pc = irq_count() - offset;
    424
    425	if (pc & HARDIRQ_OFFSET) {
    426		vtime_account_hardirq(tsk);
    427	} else if (pc & SOFTIRQ_OFFSET) {
    428		vtime_account_softirq(tsk);
    429	} else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
    430		   is_idle_task(tsk)) {
    431		vtime_account_idle(tsk);
    432	} else {
    433		vtime_account_kernel(tsk);
    434	}
    435}
    436
    437void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
    438		    u64 *ut, u64 *st)
    439{
    440	*ut = curr->utime;
    441	*st = curr->stime;
    442}
    443
    444void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
    445{
    446	*ut = p->utime;
    447	*st = p->stime;
    448}
    449EXPORT_SYMBOL_GPL(task_cputime_adjusted);
    450
    451void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
    452{
    453	struct task_cputime cputime;
    454
    455	thread_group_cputime(p, &cputime);
    456
    457	*ut = cputime.utime;
    458	*st = cputime.stime;
    459}
    460
    461#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
    462
    463/*
    464 * Account a single tick of CPU time.
    465 * @p: the process that the CPU time gets accounted to
    466 * @user_tick: indicates if the tick is a user or a system tick
    467 */
    468void account_process_tick(struct task_struct *p, int user_tick)
    469{
    470	u64 cputime, steal;
    471
    472	if (vtime_accounting_enabled_this_cpu())
    473		return;
    474
    475	if (sched_clock_irqtime) {
    476		irqtime_account_process_tick(p, user_tick, 1);
    477		return;
    478	}
    479
    480	cputime = TICK_NSEC;
    481	steal = steal_account_process_time(ULONG_MAX);
    482
    483	if (steal >= cputime)
    484		return;
    485
    486	cputime -= steal;
    487
    488	if (user_tick)
    489		account_user_time(p, cputime);
    490	else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
    491		account_system_time(p, HARDIRQ_OFFSET, cputime);
    492	else
    493		account_idle_time(cputime);
    494}
    495
    496/*
    497 * Account multiple ticks of idle time.
    498 * @ticks: number of stolen ticks
    499 */
    500void account_idle_ticks(unsigned long ticks)
    501{
    502	u64 cputime, steal;
    503
    504	if (sched_clock_irqtime) {
    505		irqtime_account_idle_ticks(ticks);
    506		return;
    507	}
    508
    509	cputime = ticks * TICK_NSEC;
    510	steal = steal_account_process_time(ULONG_MAX);
    511
    512	if (steal >= cputime)
    513		return;
    514
    515	cputime -= steal;
    516	account_idle_time(cputime);
    517}
    518
    519/*
    520 * Adjust tick based cputime random precision against scheduler runtime
    521 * accounting.
    522 *
     523 * Tick based cputime accounting depends on whether the random scheduling
     524 * timeslices of a task happen to be interrupted by the timer or not.
     525 * Depending on these circumstances, the number of these interrupts may
     526 * overestimate or underestimate the real user and system cputime, matching
     527 * it only with variable precision.
    528 *
    529 * Fix this by scaling these tick based values against the total runtime
    530 * accounted by the CFS scheduler.
    531 *
    532 * This code provides the following guarantees:
    533 *
    534 *   stime + utime == rtime
    535 *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
    536 *
    537 * Assuming that rtime_i+1 >= rtime_i.
    538 */
    539void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
    540		    u64 *ut, u64 *st)
    541{
    542	u64 rtime, stime, utime;
    543	unsigned long flags;
    544
    545	/* Serialize concurrent callers such that we can honour our guarantees */
    546	raw_spin_lock_irqsave(&prev->lock, flags);
    547	rtime = curr->sum_exec_runtime;
    548
    549	/*
    550	 * This is possible under two circumstances:
    551	 *  - rtime isn't monotonic after all (a bug);
    552	 *  - we got reordered by the lock.
    553	 *
    554	 * In both cases this acts as a filter such that the rest of the code
    555	 * can assume it is monotonic regardless of anything else.
    556	 */
    557	if (prev->stime + prev->utime >= rtime)
    558		goto out;
    559
    560	stime = curr->stime;
    561	utime = curr->utime;
    562
    563	/*
    564	 * If either stime or utime are 0, assume all runtime is userspace.
    565	 * Once a task gets some ticks, the monotonicity code at 'update:'
    566	 * will ensure things converge to the observed ratio.
    567	 */
    568	if (stime == 0) {
    569		utime = rtime;
    570		goto update;
    571	}
    572
    573	if (utime == 0) {
    574		stime = rtime;
    575		goto update;
    576	}
    577
    578	stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
    579
    580update:
    581	/*
    582	 * Make sure stime doesn't go backwards; this preserves monotonicity
    583	 * for utime because rtime is monotonic.
    584	 *
    585	 *  utime_i+1 = rtime_i+1 - stime_i
    586	 *            = rtime_i+1 - (rtime_i - utime_i)
    587	 *            = (rtime_i+1 - rtime_i) + utime_i
    588	 *            >= utime_i
    589	 */
    590	if (stime < prev->stime)
    591		stime = prev->stime;
    592	utime = rtime - stime;
    593
    594	/*
    595	 * Make sure utime doesn't go backwards; this still preserves
    596	 * monotonicity for stime, analogous argument to above.
    597	 */
    598	if (utime < prev->utime) {
    599		utime = prev->utime;
    600		stime = rtime - utime;
    601	}
    602
    603	prev->stime = stime;
    604	prev->utime = utime;
    605out:
    606	*ut = prev->utime;
    607	*st = prev->stime;
    608	raw_spin_unlock_irqrestore(&prev->lock, flags);
    609}
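
A standalone model of the scaling and clamping above, with two worked samples
(illustration only, not part of cputime.c; a compiler-provided 128-bit
multiply stands in for the kernel's mul_u64_u64_div_u64()):

#include <stdio.h>
#include <stdint.h>

struct prev_cputime { uint64_t utime, stime; };

static void cputime_adjust_model(uint64_t rtime, uint64_t stime, uint64_t utime,
				 struct prev_cputime *prev,
				 uint64_t *ut, uint64_t *st)
{
	/* Nothing to do if the previously reported total already covers rtime. */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	if (stime == 0) {
		utime = rtime;
	} else if (utime == 0) {
		stime = rtime;
	} else {
		/* stime = stime * rtime / (stime + utime), as in the kernel */
		stime = (uint64_t)(((unsigned __int128)stime * rtime) /
				   (stime + utime));
	}

	/* Monotonicity clamps: neither stime nor utime may go backwards. */
	if (stime < prev->stime)
		stime = prev->stime;
	utime = rtime - stime;

	if (utime < prev->utime) {
		utime = prev->utime;
		stime = rtime - utime;
	}

	prev->stime = stime;
	prev->utime = utime;
out:
	*ut = prev->utime;
	*st = prev->stime;
}

int main(void)
{
	struct prev_cputime prev = { 0, 0 };
	uint64_t ut, st;

	/* 10 ms of runtime with tick samples 3 system : 1 user gives
	 * stime = 10 ms * 3/4 = 7.5 ms and utime = 2.5 ms. */
	cputime_adjust_model(10000000, 3, 1, &prev, &ut, &st);
	printf("stime=%llu utime=%llu\n",
	       (unsigned long long)st, (unsigned long long)ut);

	/* The ratio later flips to 1 : 3, but stime may not shrink: it stays
	 * clamped at 7.5 ms and utime absorbs the additional runtime. */
	cputime_adjust_model(12000000, 1, 3, &prev, &ut, &st);
	printf("stime=%llu utime=%llu\n",
	       (unsigned long long)st, (unsigned long long)ut);
	return 0;
}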
    610
    611void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
    612{
    613	struct task_cputime cputime = {
    614		.sum_exec_runtime = p->se.sum_exec_runtime,
    615	};
    616
    617	if (task_cputime(p, &cputime.utime, &cputime.stime))
    618		cputime.sum_exec_runtime = task_sched_runtime(p);
    619	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
    620}
    621EXPORT_SYMBOL_GPL(task_cputime_adjusted);
    622
    623void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
    624{
    625	struct task_cputime cputime;
    626
    627	thread_group_cputime(p, &cputime);
    628	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
    629}
    630#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
    631
    632#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
    633static u64 vtime_delta(struct vtime *vtime)
    634{
    635	unsigned long long clock;
    636
    637	clock = sched_clock();
    638	if (clock < vtime->starttime)
    639		return 0;
    640
    641	return clock - vtime->starttime;
    642}
    643
    644static u64 get_vtime_delta(struct vtime *vtime)
    645{
    646	u64 delta = vtime_delta(vtime);
    647	u64 other;
    648
    649	/*
      650	 * Unlike tick based timing, vtime based timing never has lost
      651	 * ticks, so there is no need for steal time accounting to make
      652	 * up for them. Vtime accounts a rounded version of the actual
    653	 * elapsed time. Limit account_other_time to prevent rounding
    654	 * errors from causing elapsed vtime to go negative.
    655	 */
    656	other = account_other_time(delta);
    657	WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
    658	vtime->starttime += delta;
    659
    660	return delta - other;
    661}
    662
    663static void vtime_account_system(struct task_struct *tsk,
    664				 struct vtime *vtime)
    665{
    666	vtime->stime += get_vtime_delta(vtime);
    667	if (vtime->stime >= TICK_NSEC) {
    668		account_system_time(tsk, irq_count(), vtime->stime);
    669		vtime->stime = 0;
    670	}
    671}
    672
    673static void vtime_account_guest(struct task_struct *tsk,
    674				struct vtime *vtime)
    675{
    676	vtime->gtime += get_vtime_delta(vtime);
    677	if (vtime->gtime >= TICK_NSEC) {
    678		account_guest_time(tsk, vtime->gtime);
    679		vtime->gtime = 0;
    680	}
    681}
    682
    683static void __vtime_account_kernel(struct task_struct *tsk,
    684				   struct vtime *vtime)
    685{
    686	/* We might have scheduled out from guest path */
    687	if (vtime->state == VTIME_GUEST)
    688		vtime_account_guest(tsk, vtime);
    689	else
    690		vtime_account_system(tsk, vtime);
    691}
    692
    693void vtime_account_kernel(struct task_struct *tsk)
    694{
    695	struct vtime *vtime = &tsk->vtime;
    696
    697	if (!vtime_delta(vtime))
    698		return;
    699
    700	write_seqcount_begin(&vtime->seqcount);
    701	__vtime_account_kernel(tsk, vtime);
    702	write_seqcount_end(&vtime->seqcount);
    703}
    704
    705void vtime_user_enter(struct task_struct *tsk)
    706{
    707	struct vtime *vtime = &tsk->vtime;
    708
    709	write_seqcount_begin(&vtime->seqcount);
    710	vtime_account_system(tsk, vtime);
    711	vtime->state = VTIME_USER;
    712	write_seqcount_end(&vtime->seqcount);
    713}
    714
    715void vtime_user_exit(struct task_struct *tsk)
    716{
    717	struct vtime *vtime = &tsk->vtime;
    718
    719	write_seqcount_begin(&vtime->seqcount);
    720	vtime->utime += get_vtime_delta(vtime);
    721	if (vtime->utime >= TICK_NSEC) {
    722		account_user_time(tsk, vtime->utime);
    723		vtime->utime = 0;
    724	}
    725	vtime->state = VTIME_SYS;
    726	write_seqcount_end(&vtime->seqcount);
    727}
    728
    729void vtime_guest_enter(struct task_struct *tsk)
    730{
    731	struct vtime *vtime = &tsk->vtime;
    732	/*
    733	 * The flags must be updated under the lock with
    734	 * the vtime_starttime flush and update.
     735	 * That enforces the right ordering and update-sequence
     736	 * synchronization against the reader (task_gtime()),
     737	 * which can thus safely catch up with a tickless delta.
    738	 */
    739	write_seqcount_begin(&vtime->seqcount);
    740	vtime_account_system(tsk, vtime);
    741	tsk->flags |= PF_VCPU;
    742	vtime->state = VTIME_GUEST;
    743	write_seqcount_end(&vtime->seqcount);
    744}
    745EXPORT_SYMBOL_GPL(vtime_guest_enter);
    746
    747void vtime_guest_exit(struct task_struct *tsk)
    748{
    749	struct vtime *vtime = &tsk->vtime;
    750
    751	write_seqcount_begin(&vtime->seqcount);
    752	vtime_account_guest(tsk, vtime);
    753	tsk->flags &= ~PF_VCPU;
    754	vtime->state = VTIME_SYS;
    755	write_seqcount_end(&vtime->seqcount);
    756}
    757EXPORT_SYMBOL_GPL(vtime_guest_exit);
    758
    759void vtime_account_idle(struct task_struct *tsk)
    760{
    761	account_idle_time(get_vtime_delta(&tsk->vtime));
    762}
    763
    764void vtime_task_switch_generic(struct task_struct *prev)
    765{
    766	struct vtime *vtime = &prev->vtime;
    767
    768	write_seqcount_begin(&vtime->seqcount);
    769	if (vtime->state == VTIME_IDLE)
    770		vtime_account_idle(prev);
    771	else
    772		__vtime_account_kernel(prev, vtime);
    773	vtime->state = VTIME_INACTIVE;
    774	vtime->cpu = -1;
    775	write_seqcount_end(&vtime->seqcount);
    776
    777	vtime = &current->vtime;
    778
    779	write_seqcount_begin(&vtime->seqcount);
    780	if (is_idle_task(current))
    781		vtime->state = VTIME_IDLE;
    782	else if (current->flags & PF_VCPU)
    783		vtime->state = VTIME_GUEST;
    784	else
    785		vtime->state = VTIME_SYS;
    786	vtime->starttime = sched_clock();
    787	vtime->cpu = smp_processor_id();
    788	write_seqcount_end(&vtime->seqcount);
    789}
    790
    791void vtime_init_idle(struct task_struct *t, int cpu)
    792{
    793	struct vtime *vtime = &t->vtime;
    794	unsigned long flags;
    795
    796	local_irq_save(flags);
    797	write_seqcount_begin(&vtime->seqcount);
    798	vtime->state = VTIME_IDLE;
    799	vtime->starttime = sched_clock();
    800	vtime->cpu = cpu;
    801	write_seqcount_end(&vtime->seqcount);
    802	local_irq_restore(flags);
    803}
    804
    805u64 task_gtime(struct task_struct *t)
    806{
    807	struct vtime *vtime = &t->vtime;
    808	unsigned int seq;
    809	u64 gtime;
    810
    811	if (!vtime_accounting_enabled())
    812		return t->gtime;
    813
    814	do {
    815		seq = read_seqcount_begin(&vtime->seqcount);
    816
    817		gtime = t->gtime;
    818		if (vtime->state == VTIME_GUEST)
    819			gtime += vtime->gtime + vtime_delta(vtime);
    820
    821	} while (read_seqcount_retry(&vtime->seqcount, seq));
    822
    823	return gtime;
    824}
    825
    826/*
    827 * Fetch cputime raw values from fields of task_struct and
    828 * add up the pending nohz execution time since the last
    829 * cputime snapshot.
    830 */
    831bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
    832{
    833	struct vtime *vtime = &t->vtime;
    834	unsigned int seq;
    835	u64 delta;
    836	int ret;
    837
    838	if (!vtime_accounting_enabled()) {
    839		*utime = t->utime;
    840		*stime = t->stime;
    841		return false;
    842	}
    843
    844	do {
    845		ret = false;
    846		seq = read_seqcount_begin(&vtime->seqcount);
    847
    848		*utime = t->utime;
    849		*stime = t->stime;
    850
    851		/* Task is sleeping or idle, nothing to add */
    852		if (vtime->state < VTIME_SYS)
    853			continue;
    854
    855		ret = true;
    856		delta = vtime_delta(vtime);
    857
    858		/*
    859		 * Task runs either in user (including guest) or kernel space,
    860		 * add pending nohz time to the right place.
    861		 */
    862		if (vtime->state == VTIME_SYS)
    863			*stime += vtime->stime + delta;
    864		else
    865			*utime += vtime->utime + delta;
    866	} while (read_seqcount_retry(&vtime->seqcount, seq));
    867
    868	return ret;
    869}
    870
    871static int vtime_state_fetch(struct vtime *vtime, int cpu)
    872{
    873	int state = READ_ONCE(vtime->state);
    874
    875	/*
    876	 * We raced against a context switch, fetch the
    877	 * kcpustat task again.
    878	 */
    879	if (vtime->cpu != cpu && vtime->cpu != -1)
    880		return -EAGAIN;
    881
    882	/*
    883	 * Two possible things here:
    884	 * 1) We are seeing the scheduling out task (prev) or any past one.
    885	 * 2) We are seeing the scheduling in task (next) but it hasn't
     886	 *    passed through vtime_task_switch() yet so the pending
    887	 *    cputime of the prev task may not be flushed yet.
    888	 *
    889	 * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
    890	 */
    891	if (state == VTIME_INACTIVE)
    892		return -EAGAIN;
    893
    894	return state;
    895}
    896
    897static u64 kcpustat_user_vtime(struct vtime *vtime)
    898{
    899	if (vtime->state == VTIME_USER)
    900		return vtime->utime + vtime_delta(vtime);
    901	else if (vtime->state == VTIME_GUEST)
    902		return vtime->gtime + vtime_delta(vtime);
    903	return 0;
    904}
    905
    906static int kcpustat_field_vtime(u64 *cpustat,
    907				struct task_struct *tsk,
    908				enum cpu_usage_stat usage,
    909				int cpu, u64 *val)
    910{
    911	struct vtime *vtime = &tsk->vtime;
    912	unsigned int seq;
    913
    914	do {
    915		int state;
    916
    917		seq = read_seqcount_begin(&vtime->seqcount);
    918
    919		state = vtime_state_fetch(vtime, cpu);
    920		if (state < 0)
    921			return state;
    922
    923		*val = cpustat[usage];
    924
     925		 * Nice vs. unnice cputime accounting may be inaccurate if
     926		 * the nice value has changed since the last vtime update.
     927		 * But a proper fix would involve interrupting the target on
     928		 * nice updates, which is a no-go on nohz_full (although the
     929		 * scheduler may still interrupt the target if rescheduling is needed...)
    930		 * may still interrupt the target if rescheduling is needed...)
    931		 */
    932		switch (usage) {
    933		case CPUTIME_SYSTEM:
    934			if (state == VTIME_SYS)
    935				*val += vtime->stime + vtime_delta(vtime);
    936			break;
    937		case CPUTIME_USER:
    938			if (task_nice(tsk) <= 0)
    939				*val += kcpustat_user_vtime(vtime);
    940			break;
    941		case CPUTIME_NICE:
    942			if (task_nice(tsk) > 0)
    943				*val += kcpustat_user_vtime(vtime);
    944			break;
    945		case CPUTIME_GUEST:
    946			if (state == VTIME_GUEST && task_nice(tsk) <= 0)
    947				*val += vtime->gtime + vtime_delta(vtime);
    948			break;
    949		case CPUTIME_GUEST_NICE:
    950			if (state == VTIME_GUEST && task_nice(tsk) > 0)
    951				*val += vtime->gtime + vtime_delta(vtime);
    952			break;
    953		default:
    954			break;
    955		}
    956	} while (read_seqcount_retry(&vtime->seqcount, seq));
    957
    958	return 0;
    959}
    960
    961u64 kcpustat_field(struct kernel_cpustat *kcpustat,
    962		   enum cpu_usage_stat usage, int cpu)
    963{
    964	u64 *cpustat = kcpustat->cpustat;
    965	u64 val = cpustat[usage];
    966	struct rq *rq;
    967	int err;
    968
    969	if (!vtime_accounting_enabled_cpu(cpu))
    970		return val;
    971
    972	rq = cpu_rq(cpu);
    973
    974	for (;;) {
    975		struct task_struct *curr;
    976
    977		rcu_read_lock();
    978		curr = rcu_dereference(rq->curr);
    979		if (WARN_ON_ONCE(!curr)) {
    980			rcu_read_unlock();
    981			return cpustat[usage];
    982		}
    983
    984		err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
    985		rcu_read_unlock();
    986
    987		if (!err)
    988			return val;
    989
    990		cpu_relax();
    991	}
    992}
    993EXPORT_SYMBOL_GPL(kcpustat_field);
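
A hypothetical caller, for illustration only (fs/proc/uptime.c sums
CPUTIME_IDLE in much this way): kcpustat_field() lets a reader pull a single
per-CPU counter that stays accurate even on nohz_full CPUs whose ticks are
stopped.

/* Illustrative only: total idle time in nanoseconds across all possible CPUs. */
static u64 total_idle_ns(void)
{
	u64 idle = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		idle += kcpustat_field(&kcpustat_cpu(cpu), CPUTIME_IDLE, cpu);

	return idle;
}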
    994
    995static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
    996				    const struct kernel_cpustat *src,
    997				    struct task_struct *tsk, int cpu)
    998{
    999	struct vtime *vtime = &tsk->vtime;
   1000	unsigned int seq;
   1001
   1002	do {
   1003		u64 *cpustat;
   1004		u64 delta;
   1005		int state;
   1006
   1007		seq = read_seqcount_begin(&vtime->seqcount);
   1008
   1009		state = vtime_state_fetch(vtime, cpu);
   1010		if (state < 0)
   1011			return state;
   1012
   1013		*dst = *src;
   1014		cpustat = dst->cpustat;
   1015
   1016		/* Task is sleeping, dead or idle, nothing to add */
   1017		if (state < VTIME_SYS)
   1018			continue;
   1019
   1020		delta = vtime_delta(vtime);
   1021
   1022		/*
   1023		 * Task runs either in user (including guest) or kernel space,
   1024		 * add pending nohz time to the right place.
   1025		 */
   1026		if (state == VTIME_SYS) {
   1027			cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
   1028		} else if (state == VTIME_USER) {
   1029			if (task_nice(tsk) > 0)
   1030				cpustat[CPUTIME_NICE] += vtime->utime + delta;
   1031			else
   1032				cpustat[CPUTIME_USER] += vtime->utime + delta;
   1033		} else {
   1034			WARN_ON_ONCE(state != VTIME_GUEST);
   1035			if (task_nice(tsk) > 0) {
   1036				cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
   1037				cpustat[CPUTIME_NICE] += vtime->gtime + delta;
   1038			} else {
   1039				cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
   1040				cpustat[CPUTIME_USER] += vtime->gtime + delta;
   1041			}
   1042		}
   1043	} while (read_seqcount_retry(&vtime->seqcount, seq));
   1044
   1045	return 0;
   1046}
   1047
   1048void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
   1049{
   1050	const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
   1051	struct rq *rq;
   1052	int err;
   1053
   1054	if (!vtime_accounting_enabled_cpu(cpu)) {
   1055		*dst = *src;
   1056		return;
   1057	}
   1058
   1059	rq = cpu_rq(cpu);
   1060
   1061	for (;;) {
   1062		struct task_struct *curr;
   1063
   1064		rcu_read_lock();
   1065		curr = rcu_dereference(rq->curr);
   1066		if (WARN_ON_ONCE(!curr)) {
   1067			rcu_read_unlock();
   1068			*dst = *src;
   1069			return;
   1070		}
   1071
   1072		err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
   1073		rcu_read_unlock();
   1074
   1075		if (!err)
   1076			return;
   1077
   1078		cpu_relax();
   1079	}
   1080}
   1081EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
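
A hypothetical caller, for illustration only (roughly how the /proc/stat code
takes its per-CPU snapshots): kcpustat_cpu_fetch() returns a whole cpustat
array with the pending nohz time of the currently running task folded in.

/* Illustrative only: snapshot one CPU's cpustat array and extract two fields. */
static void snapshot_cpu_times(int cpu, u64 *user_ns, u64 *system_ns)
{
	struct kernel_cpustat snap;

	kcpustat_cpu_fetch(&snap, cpu);
	*user_ns = snap.cpustat[CPUTIME_USER];
	*system_ns = snap.cpustat[CPUTIME_SYSTEM];
}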
   1082
   1083#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */