cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

dm-ps-historical-service-time.c (14079B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Historical Service Time
      4 *
      5 *  Keeps a time-weighted exponential moving average of the historical
      6 *  service time. Estimates future service time based on the historical
      7 *  service time and the number of outstanding requests.
      8 *
      9 *  Marks paths stale if they have not finished within hst *
     10 *  num_paths. If a path is stale and unused, we will send a single
     11 *  request to probe in case the path has improved. This situation
     12 *  generally arises if the path is so much worse than others that it
     13 *  will never have the best estimated service time, or if the entire
     14 *  multipath device is unused. If a path is stale and in use, limit the
     15 *  number of requests it can receive with the assumption that the path
     16 *  has become degraded.
     17 *
     18 *  To avoid repeatedly calculating exponents for time weighting, times
     19 *  are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT)
     20 *  ns, and the weighting is pre-calculated.
     21 *
     22 */
     23
     24#include "dm.h"
     25#include "dm-path-selector.h"
     26
     27#include <linux/blkdev.h>
     28#include <linux/slab.h>
     29#include <linux/module.h>
     30
     31
     32#define DM_MSG_PREFIX	"multipath historical-service-time"
     33#define HST_MIN_IO 1
     34#define HST_VERSION "0.1.1"
     35
     36#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
     37#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
     38#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
     39#define HST_FIXED_95 972
     40#define HST_MAX_INFLIGHT HST_FIXED_1
     41#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
     42#define HST_WEIGHT_COUNT 64ULL
     43
     44struct selector {
     45	struct list_head valid_paths;
     46	struct list_head failed_paths;
     47	int valid_count;
     48	spinlock_t lock;
     49
     50	unsigned int weights[HST_WEIGHT_COUNT];
     51	unsigned int threshold_multiplier;
     52};
     53
     54struct path_info {
     55	struct list_head list;
     56	struct dm_path *path;
     57	unsigned int repeat_count;
     58
     59	spinlock_t lock;
     60
     61	u64 historical_service_time; /* Fixed point */
     62
     63	u64 stale_after;
     64	u64 last_finish;
     65
     66	u64 outstanding;
     67};
     68
     69/**
     70 * fixed_power - compute: x^n, in O(log n) time
     71 *
     72 * @x:         base of the power
     73 * @frac_bits: fractional bits of @x
     74 * @n:         power to raise @x to.
     75 *
     76 * By exploiting the relation between the definition of the natural power
     77 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
     78 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
     79 * (where: n_i \elem {0, 1}, the binary vector representing n),
     80 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
     81 * of course trivially computable in O(log_2 n), the length of our binary
     82 * vector.
     83 *
     84 * (see: kernel/sched/loadavg.c)
     85 */
     86static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
     87{
     88	unsigned long result = 1UL << frac_bits;
     89
     90	if (n) {
     91		for (;;) {
     92			if (n & 1) {
     93				result *= x;
     94				result += 1UL << (frac_bits - 1);
     95				result >>= frac_bits;
     96			}
     97			n >>= 1;
     98			if (!n)
     99				break;
    100			x *= x;
    101			x += 1UL << (frac_bits - 1);
    102			x >>= frac_bits;
    103		}
    104	}
    105
    106	return result;
    107}
    108
    109/*
    110 * Calculate the next value of an exponential moving average
    111 * a_1 = a_0 * e + a * (1 - e)
    112 *
    113 * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
    114 * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
    115 * @weight: [0, HST_FIXED_1]
    116 *
    117 * Note:
    118 *   To account for multiple periods in the same calculation,
    119 *   a_n = a_0 * e^n + a * (1 - e^n),
    120 *   so call fixed_ema(last, next, pow(weight, N))
    121 */
    122static u64 fixed_ema(u64 last, u64 next, u64 weight)
    123{
    124	last *= weight;
    125	last += next * (HST_FIXED_1 - weight);
    126	last += 1ULL << (HST_FIXED_SHIFT - 1);
    127	return last >> HST_FIXED_SHIFT;
    128}
    129
    130static struct selector *alloc_selector(void)
    131{
    132	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
    133
    134	if (s) {
    135		INIT_LIST_HEAD(&s->valid_paths);
    136		INIT_LIST_HEAD(&s->failed_paths);
    137		spin_lock_init(&s->lock);
    138		s->valid_count = 0;
    139	}
    140
    141	return s;
    142}
    143
    144/*
    145 * Get the weight for a given time span.
    146 */
    147static u64 hst_weight(struct path_selector *ps, u64 delta)
    148{
    149	struct selector *s = ps->context;
    150	int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
    151			   HST_WEIGHT_COUNT - 1);
    152
    153	return s->weights[bucket];
    154}
    155
    156/*
    157 * Set up the weights array.
    158 *
    159 * weights[len-1] = 0
    160 * weights[n] = base ^ (n + 1)
    161 */
    162static void hst_set_weights(struct path_selector *ps, unsigned int base)
    163{
    164	struct selector *s = ps->context;
    165	int i;
    166
    167	if (base >= HST_FIXED_1)
    168		return;
    169
    170	for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
    171		s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
    172	s->weights[HST_WEIGHT_COUNT - 1] = 0;
    173}
    174
    175static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
    176{
    177	struct selector *s;
    178	unsigned int base_weight = HST_FIXED_95;
    179	unsigned int threshold_multiplier = 0;
    180	char dummy;
    181
    182	/*
    183	 * Arguments: [<base_weight> [<threshold_multiplier>]]
    184	 *   <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
    185	 *                  value of 0 will completely ignore any history.
    186	 *                  If not given, default (HST_FIXED_95) is used.
    187	 *   <threshold_multiplier>: Minimum threshold multiplier for paths to
    188	 *                  be considered different. That is, a path is
    189	 *                  considered different iff (p1 > N * p2) where p1
    190	 *                  is the path with higher service time. A threshold
    191	 *                  of 1 or 0 has no effect. Defaults to 0.
    192	 */
    193	if (argc > 2)
    194		return -EINVAL;
    195
    196	if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
    197	     base_weight >= HST_FIXED_1)) {
    198		return -EINVAL;
    199	}
    200
    201	if (argc > 1 && (sscanf(argv[1], "%u%c",
    202				&threshold_multiplier, &dummy) != 1)) {
    203		return -EINVAL;
    204	}
    205
    206	s = alloc_selector();
    207	if (!s)
    208		return -ENOMEM;
    209
    210	ps->context = s;
    211
    212	hst_set_weights(ps, base_weight);
    213	s->threshold_multiplier = threshold_multiplier;
    214	return 0;
    215}
    216
    217static void free_paths(struct list_head *paths)
    218{
    219	struct path_info *pi, *next;
    220
    221	list_for_each_entry_safe(pi, next, paths, list) {
    222		list_del(&pi->list);
    223		kfree(pi);
    224	}
    225}
    226
    227static void hst_destroy(struct path_selector *ps)
    228{
    229	struct selector *s = ps->context;
    230
    231	free_paths(&s->valid_paths);
    232	free_paths(&s->failed_paths);
    233	kfree(s);
    234	ps->context = NULL;
    235}
    236
    237static int hst_status(struct path_selector *ps, struct dm_path *path,
    238		     status_type_t type, char *result, unsigned int maxlen)
    239{
    240	unsigned int sz = 0;
    241	struct path_info *pi;
    242
    243	if (!path) {
    244		struct selector *s = ps->context;
    245
    246		DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
    247	} else {
    248		pi = path->pscontext;
    249
    250		switch (type) {
    251		case STATUSTYPE_INFO:
    252			DMEMIT("%llu %llu %llu ", pi->historical_service_time,
    253			       pi->outstanding, pi->stale_after);
    254			break;
    255		case STATUSTYPE_TABLE:
    256			DMEMIT("0 ");
    257			break;
    258		case STATUSTYPE_IMA:
    259			*result = '\0';
    260			break;
    261		}
    262	}
    263
    264	return sz;
    265}
    266
    267static int hst_add_path(struct path_selector *ps, struct dm_path *path,
    268		       int argc, char **argv, char **error)
    269{
    270	struct selector *s = ps->context;
    271	struct path_info *pi;
    272	unsigned int repeat_count = HST_MIN_IO;
    273	char dummy;
    274	unsigned long flags;
    275
    276	/*
    277	 * Arguments: [<repeat_count>]
    278	 *   <repeat_count>: The number of I/Os before switching path.
    279	 *                   If not given, default (HST_MIN_IO) is used.
    280	 */
    281	if (argc > 1) {
    282		*error = "historical-service-time ps: incorrect number of arguments";
    283		return -EINVAL;
    284	}
    285
    286	if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
    287		*error = "historical-service-time ps: invalid repeat count";
    288		return -EINVAL;
    289	}
    290
    291	/* allocate the path */
    292	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
    293	if (!pi) {
    294		*error = "historical-service-time ps: Error allocating path context";
    295		return -ENOMEM;
    296	}
    297
    298	pi->path = path;
    299	pi->repeat_count = repeat_count;
    300
    301	pi->historical_service_time = HST_FIXED_1;
    302
    303	spin_lock_init(&pi->lock);
    304	pi->outstanding = 0;
    305
    306	pi->stale_after = 0;
    307	pi->last_finish = 0;
    308
    309	path->pscontext = pi;
    310
    311	spin_lock_irqsave(&s->lock, flags);
    312	list_add_tail(&pi->list, &s->valid_paths);
    313	s->valid_count++;
    314	spin_unlock_irqrestore(&s->lock, flags);
    315
    316	return 0;
    317}
    318
    319static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
    320{
    321	struct selector *s = ps->context;
    322	struct path_info *pi = path->pscontext;
    323	unsigned long flags;
    324
    325	spin_lock_irqsave(&s->lock, flags);
    326	list_move(&pi->list, &s->failed_paths);
    327	s->valid_count--;
    328	spin_unlock_irqrestore(&s->lock, flags);
    329}
    330
    331static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
    332{
    333	struct selector *s = ps->context;
    334	struct path_info *pi = path->pscontext;
    335	unsigned long flags;
    336
    337	spin_lock_irqsave(&s->lock, flags);
    338	list_move_tail(&pi->list, &s->valid_paths);
    339	s->valid_count++;
    340	spin_unlock_irqrestore(&s->lock, flags);
    341
    342	return 0;
    343}
    344
    345static void hst_fill_compare(struct path_info *pi, u64 *hst,
    346			     u64 *out, u64 *stale)
    347{
    348	unsigned long flags;
    349
    350	spin_lock_irqsave(&pi->lock, flags);
    351	*hst = pi->historical_service_time;
    352	*out = pi->outstanding;
    353	*stale = pi->stale_after;
    354	spin_unlock_irqrestore(&pi->lock, flags);
    355}
    356
    357/*
    358 * Compare the estimated service time of 2 paths, pi1 and pi2,
    359 * for the incoming I/O.
    360 *
    361 * Returns:
    362 * < 0 : pi1 is better
    363 * 0   : no difference between pi1 and pi2
    364 * > 0 : pi2 is better
    365 *
    366 */
    367static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
    368			     u64 time_now, struct path_selector *ps)
    369{
    370	struct selector *s = ps->context;
    371	u64 hst1, hst2;
    372	long long out1, out2, stale1, stale2;
    373	int pi2_better, over_threshold;
    374
    375	hst_fill_compare(pi1, &hst1, &out1, &stale1);
    376	hst_fill_compare(pi2, &hst2, &out2, &stale2);
    377
    378	/* Check here if estimated latency for two paths are too similar.
    379	 * If this is the case, we skip extra calculation and just compare
    380	 * outstanding requests. In this case, any unloaded paths will
    381	 * be preferred.
    382	 */
    383	if (hst1 > hst2)
    384		over_threshold = hst1 > (s->threshold_multiplier * hst2);
    385	else
    386		over_threshold = hst2 > (s->threshold_multiplier * hst1);
    387
    388	if (!over_threshold)
    389		return out1 - out2;
    390
    391	/*
    392	 * If an unloaded path is stale, choose it. If both paths are unloaded,
    393	 * choose path that is the most stale.
    394	 * (If one path is loaded, choose the other)
    395	 */
    396	if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
    397	    (!out1 && !out2))
    398		return (!out2 * stale1) - (!out1 * stale2);
    399
    400	/* Compare estimated service time. If outstanding is the same, we
    401	 * don't need to multiply
    402	 */
    403	if (out1 == out2) {
    404		pi2_better = hst1 > hst2;
    405	} else {
    406		/* Potential overflow with out >= 1024 */
    407		if (unlikely(out1 >= HST_MAX_INFLIGHT ||
    408			     out2 >= HST_MAX_INFLIGHT)) {
    409			/* If over 1023 in-flights, we may overflow if hst
    410			 * is at max. (With this shift we still overflow at
    411			 * 1048576 in-flights, which is high enough).
    412			 */
    413			hst1 >>= HST_FIXED_SHIFT;
    414			hst2 >>= HST_FIXED_SHIFT;
    415		}
    416		pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
    417	}
    418
    419	/* In the case that the 'winner' is stale, limit to equal usage. */
    420	if (pi2_better) {
    421		if (stale2 < time_now)
    422			return out1 - out2;
    423		return 1;
    424	}
    425	if (stale1 < time_now)
    426		return out1 - out2;
    427	return -1;
    428}
    429
    430static struct dm_path *hst_select_path(struct path_selector *ps,
    431				       size_t nr_bytes)
    432{
    433	struct selector *s = ps->context;
    434	struct path_info *pi = NULL, *best = NULL;
    435	u64 time_now = ktime_get_ns();
    436	struct dm_path *ret = NULL;
    437	unsigned long flags;
    438
    439	spin_lock_irqsave(&s->lock, flags);
    440	if (list_empty(&s->valid_paths))
    441		goto out;
    442
    443	list_for_each_entry(pi, &s->valid_paths, list) {
    444		if (!best || (hst_compare(pi, best, time_now, ps) < 0))
    445			best = pi;
    446	}
    447
    448	if (!best)
    449		goto out;
    450
    451	/* Move last used path to end (least preferred in case of ties) */
    452	list_move_tail(&best->list, &s->valid_paths);
    453
    454	ret = best->path;
    455
    456out:
    457	spin_unlock_irqrestore(&s->lock, flags);
    458	return ret;
    459}
    460
    461static int hst_start_io(struct path_selector *ps, struct dm_path *path,
    462			size_t nr_bytes)
    463{
    464	struct path_info *pi = path->pscontext;
    465	unsigned long flags;
    466
    467	spin_lock_irqsave(&pi->lock, flags);
    468	pi->outstanding++;
    469	spin_unlock_irqrestore(&pi->lock, flags);
    470
    471	return 0;
    472}
    473
    474static u64 path_service_time(struct path_info *pi, u64 start_time)
    475{
    476	u64 now = ktime_get_ns();
    477
    478	/* if a previous disk request has finished after this IO was
    479	 * sent to the hardware, pretend the submission happened
    480	 * serially.
    481	 */
    482	if (time_after64(pi->last_finish, start_time))
    483		start_time = pi->last_finish;
    484
    485	pi->last_finish = now;
    486	if (time_before64(now, start_time))
    487		return 0;
    488
    489	return now - start_time;
    490}
    491
    492static int hst_end_io(struct path_selector *ps, struct dm_path *path,
    493		      size_t nr_bytes, u64 start_time)
    494{
    495	struct path_info *pi = path->pscontext;
    496	struct selector *s = ps->context;
    497	unsigned long flags;
    498	u64 st;
    499
    500	spin_lock_irqsave(&pi->lock, flags);
    501
    502	st = path_service_time(pi, start_time);
    503	pi->outstanding--;
    504	pi->historical_service_time =
    505		fixed_ema(pi->historical_service_time,
    506			  min(st * HST_FIXED_1, HST_FIXED_MAX),
    507			  hst_weight(ps, st));
    508
    509	/*
    510	 * On request end, mark path as fresh. If a path hasn't
    511	 * finished any requests within the fresh period, the estimated
    512	 * service time is considered too optimistic and we limit the
    513	 * maximum requests on that path.
    514	 */
    515	pi->stale_after = pi->last_finish +
    516		(s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));
    517
    518	spin_unlock_irqrestore(&pi->lock, flags);
    519
    520	return 0;
    521}
    522
    523static struct path_selector_type hst_ps = {
    524	.name		= "historical-service-time",
    525	.module		= THIS_MODULE,
    526	.features	= DM_PS_USE_HR_TIMER,
    527	.table_args	= 1,
    528	.info_args	= 3,
    529	.create		= hst_create,
    530	.destroy	= hst_destroy,
    531	.status		= hst_status,
    532	.add_path	= hst_add_path,
    533	.fail_path	= hst_fail_path,
    534	.reinstate_path	= hst_reinstate_path,
    535	.select_path	= hst_select_path,
    536	.start_io	= hst_start_io,
    537	.end_io		= hst_end_io,
    538};
    539
    540static int __init dm_hst_init(void)
    541{
    542	int r = dm_register_path_selector(&hst_ps);
    543
    544	if (r < 0)
    545		DMERR("register failed %d", r);
    546
    547	DMINFO("version " HST_VERSION " loaded");
    548
    549	return r;
    550}
    551
    552static void __exit dm_hst_exit(void)
    553{
    554	int r = dm_unregister_path_selector(&hst_ps);
    555
    556	if (r < 0)
    557		DMERR("unregister failed %d", r);
    558}
    559
    560module_init(dm_hst_init);
    561module_exit(dm_hst_exit);
    562
    563MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
    564MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
    565MODULE_LICENSE("GPL");