cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

menu.c (18246B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * menu.c - the menu idle governor
      4 *
      5 * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
      6 * Copyright (C) 2009 Intel Corporation
      7 * Author:
      8 *        Arjan van de Ven <arjan@linux.intel.com>
      9 */
     10
     11#include <linux/kernel.h>
     12#include <linux/cpuidle.h>
     13#include <linux/time.h>
     14#include <linux/ktime.h>
     15#include <linux/hrtimer.h>
     16#include <linux/tick.h>
     17#include <linux/sched.h>
     18#include <linux/sched/loadavg.h>
     19#include <linux/sched/stat.h>
     20#include <linux/math64.h>
     21
     22#define BUCKETS 12
     23#define INTERVAL_SHIFT 3
     24#define INTERVALS (1UL << INTERVAL_SHIFT)
     25#define RESOLUTION 1024
     26#define DECAY 8
     27#define MAX_INTERESTING (50000 * NSEC_PER_USEC)
     28
     29/*
     30 * Concepts and ideas behind the menu governor
     31 *
     32 * For the menu governor, there are 3 decision factors for picking a C
     33 * state:
     34 * 1) Energy break even point
     35 * 2) Performance impact
     36 * 3) Latency tolerance (from pmqos infrastructure)
     37 * These three factors are treated independently.
     38 *
     39 * Energy break even point
     40 * -----------------------
     41 * C state entry and exit have an energy cost, and a certain amount of time in
     42 * the C state is required to actually break even on this cost. CPUIDLE
     43 * provides us this duration in the "target_residency" field. So all that we
     44 * need is a good prediction of how long we'll be idle. Like the traditional
     45 * menu governor, we start with the actual known "next timer event" time.
     46 *
     47 * Since there are other sources of wakeups (interrupts for example) than
     48 * the next timer event, this estimation is rather optimistic. To get a
     49 * more realistic estimate, a correction factor is applied to the estimate,
     50 * that is based on historic behavior. For example, if in the past the actual
     51 * duration always was 50% of the next timer tick, the correction factor will
     52 * be 0.5.
     53 *
     54 * menu uses a running average for this correction factor; however, it uses a
     55 * set of factors, not just a single factor. This stems from the realization
     56 * that the ratio is dependent on the order of magnitude of the expected
     57 * duration; if we expect 500 milliseconds of idle time the likelihood of
     58 * getting an interrupt very early is much higher than if we expect 50 micro
     59 * seconds of idle time. A second independent factor that has big impact on
     60 * the actual factor is if there is (disk) IO outstanding or not.
     61 * (as a special twist, we consider every sleep longer than 50 milliseconds
     62 * as perfect; there are no power gains for sleeping longer than this)
     63 *
     64 * For these two reasons we keep an array of 12 independent factors, that gets
     65 * indexed based on the magnitude of the expected duration as well as the
     66 * "is IO outstanding" property.
     67 *
     68 * Repeatable-interval-detector
     69 * ----------------------------
     70 * There are some cases where "next timer" is a completely unusable predictor:
     71 * Those cases where the interval is fixed, for example due to hardware
     72 * interrupt mitigation, but also due to fixed transfer rate devices such as
     73 * mice.
     74 * For this, we use a different predictor: We track the duration of the last 8
     75 * intervals and if the standard deviation of these 8 intervals is below a
     76 * threshold value, we use the average of these intervals as prediction.
     77 *
     78 * Limiting Performance Impact
     79 * ---------------------------
     80 * C states, especially those with large exit latencies, can have a real
     81 * noticeable impact on workloads, which is not acceptable for most sysadmins,
     82 * and in addition, less performance has a power price of its own.
     83 *
     84 * As a general rule of thumb, menu assumes that the following heuristic
     85 * holds:
     86 *     The busier the system, the less impact from C states is acceptable
     87 *
     88 * This rule-of-thumb is implemented using a performance-multiplier:
     89 * If the exit latency times the performance multiplier is longer than
     90 * the predicted duration, the C state is not considered a candidate
     91 * for selection due to too high a performance impact. So the higher
     92 * this multiplier is, the longer we need to be idle to pick a deep C
     93 * state, and thus the less likely a busy CPU will hit such a deep
     94 * C state.
     95 *
     96 * Currently only one input determines this multiplier:
     97 * a value of 10 is added for each process that is waiting for
     98 * IO on this CPU.
     99 * (this value is experimentally determined)
    100 *
    101 * The iowait value gives a cpu local, instantaneous input to the
    102 * decision; processes blocked on IO are taken as a sign that the
    103 * workload is performance sensitive, so the more of them there are,
    104 * the longer the CPU needs to be predicted idle before a deep C
    105 * state is picked.
    106 *
    107 */
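
       /*
        * Worked example with illustrative numbers: a correction factor of
        * RESOLUTION * DECAY = 8192 means "trust the next timer event
        * completely".  If, in some bucket, the CPU historically sleeps only
        * half as long as the next timer event suggests, the running average
        * converges towards 8192 / 2 = 4096, so a next_timer_ns of 1 ms is
        * scaled down to a predicted idle time of roughly 500 us before the
        * repeating-interval detector is consulted.
        */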
    108
    109struct menu_device {
    110	int             needs_update;
    111	int             tick_wakeup;
    112
    113	u64		next_timer_ns;
    114	unsigned int	bucket;
    115	unsigned int	correction_factor[BUCKETS];
    116	unsigned int	intervals[INTERVALS];
    117	int		interval_ptr;
    118};
    119
    120static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
    121{
    122	int bucket = 0;
    123
    124	/*
    125	 * We keep two groups of stats; one with
    126	 * IO pending, one without.
    127	 * This allows us to calculate
    128	 * E(duration)|iowait
    129	 */
    130	if (nr_iowaiters)
    131		bucket = BUCKETS/2;
    132
    133	if (duration_ns < 10ULL * NSEC_PER_USEC)
    134		return bucket;
    135	if (duration_ns < 100ULL * NSEC_PER_USEC)
    136		return bucket + 1;
    137	if (duration_ns < 1000ULL * NSEC_PER_USEC)
    138		return bucket + 2;
    139	if (duration_ns < 10000ULL * NSEC_PER_USEC)
    140		return bucket + 3;
    141	if (duration_ns < 100000ULL * NSEC_PER_USEC)
    142		return bucket + 4;
    143	return bucket + 5;
    144}
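
       /*
        * For instance (hypothetical inputs): a predicted sleep of 250 us with
        * no tasks in iowait lands in bucket 2 (100 us <= d < 1000 us), while
        * the same sleep with iowait pending uses bucket BUCKETS/2 + 2 = 8, so
        * the two situations keep separate correction-factor history.
        */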
    145
    146/*
    147 * Return a multiplier for the exit latency that is intended
    148 * to take performance requirements into account.
    149 * The more performance critical we estimate the system
    150 * to be, the higher this multiplier, and thus the higher
    151 * the barrier to go to an expensive C state.
    152 */
    153static inline int performance_multiplier(unsigned int nr_iowaiters)
    154{
    155	/* for IO wait tasks (per cpu!) we add 10 each */
    156	return 1 + 10 * nr_iowaiters;
    157}
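
       /*
        * Example with hypothetical numbers: two tasks in iowait give a
        * multiplier of 1 + 10 * 2 = 21, so a predicted idle time of 1 ms caps
        * latency_req at about 1000 us / 21 ~= 47 us in menu_select() (when the
        * tick has not been stopped), ruling out any state whose exit latency
        * is larger than that.
        */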
    158
    159static DEFINE_PER_CPU(struct menu_device, menu_devices);
    160
    161static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
    162
    163/*
    164 * Try detecting repeating patterns by keeping track of the last 8
    165 * intervals, and checking if the standard deviation of that set
    166 * of points is below a threshold. If it is... then use the
    167 * average of these 8 points as the estimated value.
    168 */
    169static unsigned int get_typical_interval(struct menu_device *data,
    170					 unsigned int predicted_us)
    171{
    172	int i, divisor;
    173	unsigned int min, max, thresh, avg;
    174	uint64_t sum, variance;
    175
    176	thresh = INT_MAX; /* Discard outliers above this value */
    177
    178again:
    179
    180	/* First calculate the average of past intervals */
    181	min = UINT_MAX;
    182	max = 0;
    183	sum = 0;
    184	divisor = 0;
    185	for (i = 0; i < INTERVALS; i++) {
    186		unsigned int value = data->intervals[i];
    187		if (value <= thresh) {
    188			sum += value;
    189			divisor++;
    190			if (value > max)
    191				max = value;
    192
    193			if (value < min)
    194				min = value;
    195		}
    196	}
    197
    198	/*
    199	 * If the result of the computation is going to be discarded anyway,
    200	 * avoid the computation altogether.
    201	 */
    202	if (min >= predicted_us)
    203		return UINT_MAX;
    204
    205	if (divisor == INTERVALS)
    206		avg = sum >> INTERVAL_SHIFT;
    207	else
    208		avg = div_u64(sum, divisor);
    209
    210	/* Then try to determine variance */
    211	variance = 0;
    212	for (i = 0; i < INTERVALS; i++) {
    213		unsigned int value = data->intervals[i];
    214		if (value <= thresh) {
    215			int64_t diff = (int64_t)value - avg;
    216			variance += diff * diff;
    217		}
    218	}
    219	if (divisor == INTERVALS)
    220		variance >>= INTERVAL_SHIFT;
    221	else
    222		do_div(variance, divisor);
    223
    224	/*
    225	 * The typical interval is obtained when standard deviation is
    226	 * small (stddev <= 20 us, variance <= 400 us^2) or standard
    227	 * deviation is small compared to the average interval (avg >
    228	 * 6*stddev, avg^2 > 36*variance). The average is smaller than
    229	 * UINT_MAX aka U32_MAX, so computing its square does not
    230	 * overflow a u64. We simply reject this candidate average if
    231	 * the standard deviation is greater than 715 s (which is
    232	 * rather unlikely).
    233	 *
    234	 * Use this result only if there is no timer to wake us up sooner.
    235	 */
    236	if (likely(variance <= U64_MAX/36)) {
    237		if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3))
    238							|| variance <= 400) {
    239			return avg;
    240		}
    241	}
    242
    243	/*
    244	 * If we have outliers to the upside in our distribution, discard
    245	 * those by setting the threshold to exclude these outliers, then
    246	 * calculate the average and standard deviation again. Once we get
    247	 * down to the bottom 3/4 of our samples, stop excluding samples.
    248	 *
    249	 * This can deal with workloads that have long pauses interspersed
    250	 * with sporadic activity with a bunch of short pauses.
    251	 */
    252	if ((divisor * 4) <= INTERVALS * 3)
    253		return UINT_MAX;
    254
    255	thresh = max - 1;
    256	goto again;
    257}
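
       /*
        * Worked example with hypothetical samples: intervals of
        * { 480, 500, 500, 500, 500, 500, 500, 520 } us give avg = 500 and
        * variance = (2 * 20^2) / 8 = 100 <= 400, so 500 us is returned as the
        * typical interval.  If one of the samples were a 10 ms outlier
        * instead, the first pass would fail the variance check, the outlier
        * would be dropped via the threshold, and the remaining 7 samples
        * (still at least 3/4 of INTERVALS) would be averaged in a second pass.
        */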
    258
    259/**
    260 * menu_select - selects the next idle state to enter
    261 * @drv: cpuidle driver containing state data
    262 * @dev: the CPU
    263 * @stop_tick: indication on whether or not to stop the tick
    264 */
    265static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
    266		       bool *stop_tick)
    267{
    268	struct menu_device *data = this_cpu_ptr(&menu_devices);
    269	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
    270	unsigned int predicted_us;
    271	u64 predicted_ns;
    272	u64 interactivity_req;
    273	unsigned int nr_iowaiters;
    274	ktime_t delta, delta_tick;
    275	int i, idx;
    276
    277	if (data->needs_update) {
    278		menu_update(drv, dev);
    279		data->needs_update = 0;
    280	}
    281
    282	/* determine the expected residency time, round up */
    283	delta = tick_nohz_get_sleep_length(&delta_tick);
    284	if (unlikely(delta < 0)) {
    285		delta = 0;
    286		delta_tick = 0;
    287	}
    288	data->next_timer_ns = delta;
    289
    290	nr_iowaiters = nr_iowait_cpu(dev->cpu);
    291	data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
    292
    293	if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
    294	    ((data->next_timer_ns < drv->states[1].target_residency_ns ||
    295	      latency_req < drv->states[1].exit_latency_ns) &&
    296	     !dev->states_usage[0].disable)) {
    297		/*
    298		 * In this case state[0] will be used no matter what, so return
    299		 * it right away and keep the tick running if state[0] is a
    300		 * polling one.
    301		 */
    302		*stop_tick = !(drv->states[0].flags & CPUIDLE_FLAG_POLLING);
    303		return 0;
    304	}
    305
    306	/* Round the result to the nearest microsecond (round half up). */
    307	predicted_us = div_u64(data->next_timer_ns *
    308			       data->correction_factor[data->bucket] +
    309			       (RESOLUTION * DECAY * NSEC_PER_USEC) / 2,
    310			       RESOLUTION * DECAY * NSEC_PER_USEC);
    311	/* Use the lowest expected idle interval to pick the idle state. */
    312	predicted_ns = (u64)min(predicted_us,
    313				get_typical_interval(data, predicted_us)) *
    314				NSEC_PER_USEC;
    315
    316	if (tick_nohz_tick_stopped()) {
    317		/*
    318		 * If the tick is already stopped, the cost of possible short
    319		 * idle duration misprediction is much higher, because the CPU
    320		 * may be stuck in a shallow idle state for a long time as a
    321		 * result of it.  In that case say we might mispredict and use
    322		 * the known time till the closest timer event for the idle
    323		 * state selection.
    324		 */
    325		if (predicted_ns < TICK_NSEC)
    326			predicted_ns = data->next_timer_ns;
    327	} else {
    328		/*
    329		 * Use the performance multiplier and the user-configurable
    330		 * latency_req to determine the maximum exit latency.
    331		 */
    332		interactivity_req = div64_u64(predicted_ns,
    333					      performance_multiplier(nr_iowaiters));
    334		if (latency_req > interactivity_req)
    335			latency_req = interactivity_req;
    336	}
    337
    338	/*
    339	 * Find the idle state with the lowest power while satisfying
    340	 * our constraints.
    341	 */
    342	idx = -1;
    343	for (i = 0; i < drv->state_count; i++) {
    344		struct cpuidle_state *s = &drv->states[i];
    345
    346		if (dev->states_usage[i].disable)
    347			continue;
    348
    349		if (idx == -1)
    350			idx = i; /* first enabled state */
    351
    352		if (s->target_residency_ns > predicted_ns) {
    353			/*
    354			 * Use a physical idle state, not busy polling, unless
    355			 * a timer is going to trigger soon enough.
    356			 */
    357			if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
    358			    s->exit_latency_ns <= latency_req &&
    359			    s->target_residency_ns <= data->next_timer_ns) {
    360				predicted_ns = s->target_residency_ns;
    361				idx = i;
    362				break;
    363			}
    364			if (predicted_ns < TICK_NSEC)
    365				break;
    366
    367			if (!tick_nohz_tick_stopped()) {
    368				/*
    369				 * If the state selected so far is shallow,
    370				 * waking up early won't hurt, so retain the
    371				 * tick in that case and let the governor run
    372				 * again in the next iteration of the loop.
    373				 */
    374				predicted_ns = drv->states[idx].target_residency_ns;
    375				break;
    376			}
    377
    378			/*
    379			 * If the state selected so far is shallow and this
    380			 * state's target residency matches the time till the
    381			 * closest timer event, select this one to avoid getting
    382			 * stuck in the shallow one for too long.
    383			 */
    384			if (drv->states[idx].target_residency_ns < TICK_NSEC &&
    385			    s->target_residency_ns <= delta_tick)
    386				idx = i;
    387
    388			return idx;
    389		}
    390		if (s->exit_latency_ns > latency_req)
    391			break;
    392
    393		idx = i;
    394	}
    395
    396	if (idx == -1)
    397		idx = 0; /* No states enabled. Must use 0. */
    398
    399	/*
    400	 * Don't stop the tick if the selected state is a polling one or if the
    401	 * expected idle duration is shorter than the tick period length.
    402	 */
    403	if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
    404	     predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
    405		*stop_tick = false;
    406
    407		if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) {
    408			/*
    409			 * The tick is not going to be stopped and the target
    410			 * residency of the state to be returned is not within
    411			 * the time until the next timer event including the
    412			 * tick, so try to correct that.
    413			 */
    414			for (i = idx - 1; i >= 0; i--) {
    415				if (dev->states_usage[i].disable)
    416					continue;
    417
    418				idx = i;
    419				if (drv->states[i].target_residency_ns <= delta_tick)
    420					break;
    421			}
    422		}
    423	}
    424
    425	return idx;
    426}
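
       /*
        * Illustrative selection with a hypothetical state table: given a
        * predicted_ns of 300 us and enabled states C1 (target residency 2 us),
        * C2 (100 us) and C3 (1000 us), the loop above settles on C2: C3 is
        * rejected because its target residency exceeds the predicted idle
        * time, and C1 only remains selected if C2's exit latency violates
        * latency_req.
        */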
    427
    428/**
    429 * menu_reflect - records that data structures need update
    430 * @dev: the CPU
    431 * @index: the index of actual entered state
    432 *
    433 * NOTE: it's important to be fast here because this operation will add to
    434 *       the overall exit latency.
    435 */
    436static void menu_reflect(struct cpuidle_device *dev, int index)
    437{
    438	struct menu_device *data = this_cpu_ptr(&menu_devices);
    439
    440	dev->last_state_idx = index;
    441	data->needs_update = 1;
    442	data->tick_wakeup = tick_nohz_idle_got_tick();
    443}
    444
    445/**
    446 * menu_update - attempts to guess what happened after entry
    447 * @drv: cpuidle driver containing state data
    448 * @dev: the CPU
    449 */
    450static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
    451{
    452	struct menu_device *data = this_cpu_ptr(&menu_devices);
    453	int last_idx = dev->last_state_idx;
    454	struct cpuidle_state *target = &drv->states[last_idx];
    455	u64 measured_ns;
    456	unsigned int new_factor;
    457
    458	/*
    459	 * Try to figure out how much time passed between entry to low
    460	 * power state and occurrence of the wakeup event.
    461	 *
    462	 * If the entered idle state didn't support residency measurements,
    463	 * we use whatever was measured anyway if it is short, and if it is
    464	 * long, we truncate it to the whole expected time.
    465	 *
    466	 * Any measured amount of time will include the exit latency.
    467	 * Since we are interested in when the wakeup began, not when it
    468	 * was completed, we must subtract the exit latency. However, if
    469	 * the measured amount of time is less than the exit latency,
    470	 * assume the state was never reached and the exit latency is 0.
    471	 */
    472
    473	if (data->tick_wakeup && data->next_timer_ns > TICK_NSEC) {
    474		/*
    475		 * The nohz code said that there wouldn't be any events within
    476		 * the tick boundary (if the tick was stopped), but the idle
    477		 * duration predictor had a differing opinion.  Since the CPU
    478		 * was woken up by a tick (that wasn't stopped after all), the
    479		 * predictor was not quite right, so assume that the CPU could
    480		 * have been idle long (but not forever) to help the idle
    481		 * duration predictor do a better job next time.
    482		 */
    483		measured_ns = 9 * MAX_INTERESTING / 10;
    484	} else if ((drv->states[last_idx].flags & CPUIDLE_FLAG_POLLING) &&
    485		   dev->poll_time_limit) {
    486		/*
    487		 * The CPU exited the "polling" state due to a time limit, so
    488		 * the idle duration prediction leading to the selection of that
    489		 * state was inaccurate.  If a better prediction had been made,
    490		 * the CPU might have been woken up from idle by the next timer.
    491		 * Assume that to be the case.
    492		 */
    493		measured_ns = data->next_timer_ns;
    494	} else {
    495		/* measured value */
    496		measured_ns = dev->last_residency_ns;
    497
    498		/* Deduct exit latency */
    499		if (measured_ns > 2 * target->exit_latency_ns)
    500			measured_ns -= target->exit_latency_ns;
    501		else
    502			measured_ns /= 2;
    503	}
    504
    505	/* Make sure our coefficients do not exceed unity */
    506	if (measured_ns > data->next_timer_ns)
    507		measured_ns = data->next_timer_ns;
    508
    509	/* Update our correction ratio */
    510	new_factor = data->correction_factor[data->bucket];
    511	new_factor -= new_factor / DECAY;
    512
    513	if (data->next_timer_ns > 0 && measured_ns < MAX_INTERESTING)
    514		new_factor += div64_u64(RESOLUTION * measured_ns,
    515					data->next_timer_ns);
    516	else
    517		/*
    518		 * we were idle so long that we count it as a perfect
    519		 * prediction
    520		 */
    521		new_factor += RESOLUTION;
    522
    523	/*
    524	 * We don't want 0 as factor; we always want at least
    525	 * a tiny bit of estimated time. Fortunately, due to rounding,
    526	 * new_factor will stay nonzero regardless of measured_ns values
    527	 * and the compiler can eliminate this test as long as DECAY > 1.
    528	 */
    529	if (DECAY == 1 && unlikely(new_factor == 0))
    530		new_factor = 1;
    531
    532	data->correction_factor[data->bucket] = new_factor;
    533
    534	/* update the repeating-pattern data */
    535	data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
    536	if (data->interval_ptr >= INTERVALS)
    537		data->interval_ptr = 0;
    538}
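
       /*
        * Example of the factor update with hypothetical values: starting from
        * the unity factor of 8192 (RESOLUTION * DECAY), a sleep predicted at
        * 1000 us by the next timer event but measured at 500 us yields
        * new_factor = 8192 - 8192/8 + 1024 * 500/1000 = 7168 + 512 = 7680,
        * nudging future predictions for this bucket downwards.
        */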
    539
    540/**
    541 * menu_enable_device - scans a CPU's states and does setup
    542 * @drv: cpuidle driver
    543 * @dev: the CPU
    544 */
    545static int menu_enable_device(struct cpuidle_driver *drv,
    546				struct cpuidle_device *dev)
    547{
    548	struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
    549	int i;
    550
    551	memset(data, 0, sizeof(struct menu_device));
    552
    553	/*
    554	 * if the correction factor is 0 (e.g. first time init or cpu hotplug
    555	 * etc), we actually want to start out with a unity factor.
    556	 */
    557	for (i = 0; i < BUCKETS; i++)
    558		data->correction_factor[i] = RESOLUTION * DECAY;
    559
    560	return 0;
    561}
    562
    563static struct cpuidle_governor menu_governor = {
    564	.name =		"menu",
    565	.rating =	20,
    566	.enable =	menu_enable_device,
    567	.select =	menu_select,
    568	.reflect =	menu_reflect,
    569};
    570
    571/**
    572 * init_menu - initializes the governor
    573 */
    574static int __init init_menu(void)
    575{
    576	return cpuidle_register_governor(&menu_governor);
    577}
    578
    579postcore_initcall(init_menu);