cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

dm-cache-policy-smq.c (44970B)


      1/*
      2 * Copyright (C) 2015 Red Hat. All rights reserved.
      3 *
      4 * This file is released under the GPL.
      5 */
      6
      7#include "dm-cache-background-tracker.h"
      8#include "dm-cache-policy-internal.h"
      9#include "dm-cache-policy.h"
     10#include "dm.h"
     11
     12#include <linux/hash.h>
     13#include <linux/jiffies.h>
     14#include <linux/module.h>
     15#include <linux/mutex.h>
     16#include <linux/vmalloc.h>
     17#include <linux/math64.h>
     18
     19#define DM_MSG_PREFIX "cache-policy-smq"
     20
     21/*----------------------------------------------------------------*/
     22
     23/*
     24 * Safe division functions that return zero on divide by zero.
     25 */
     26static unsigned safe_div(unsigned n, unsigned d)
     27{
     28	return d ? n / d : 0u;
     29}
     30
     31static unsigned safe_mod(unsigned n, unsigned d)
     32{
     33	return d ? n % d : 0u;
     34}
     35
     36/*----------------------------------------------------------------*/
     37
     38struct entry {
     39	unsigned hash_next:28;
     40	unsigned prev:28;
     41	unsigned next:28;
     42	unsigned level:6;
     43	bool dirty:1;
     44	bool allocated:1;
     45	bool sentinel:1;
     46	bool pending_work:1;
     47
     48	dm_oblock_t oblock;
     49};
     50
     51/*----------------------------------------------------------------*/
     52
     53#define INDEXER_NULL ((1u << 28u) - 1u)
     54
     55/*
     56 * An entry_space manages a set of entries that we use for the queues.
     57 * The clean and dirty queues share entries, so this object is separate
     58 * from the queue itself.
     59 */
     60struct entry_space {
     61	struct entry *begin;
     62	struct entry *end;
     63};
     64
     65static int space_init(struct entry_space *es, unsigned nr_entries)
     66{
     67	if (!nr_entries) {
     68		es->begin = es->end = NULL;
     69		return 0;
     70	}
     71
     72	es->begin = vzalloc(array_size(nr_entries, sizeof(struct entry)));
     73	if (!es->begin)
     74		return -ENOMEM;
     75
     76	es->end = es->begin + nr_entries;
     77	return 0;
     78}
     79
     80static void space_exit(struct entry_space *es)
     81{
     82	vfree(es->begin);
     83}
     84
     85static struct entry *__get_entry(struct entry_space *es, unsigned block)
     86{
     87	struct entry *e;
     88
     89	e = es->begin + block;
     90	BUG_ON(e >= es->end);
     91
     92	return e;
     93}
     94
     95static unsigned to_index(struct entry_space *es, struct entry *e)
     96{
     97	BUG_ON(e < es->begin || e >= es->end);
     98	return e - es->begin;
     99}
    100
    101static struct entry *to_entry(struct entry_space *es, unsigned block)
    102{
    103	if (block == INDEXER_NULL)
    104		return NULL;
    105
    106	return __get_entry(es, block);
    107}
    108
    109/*----------------------------------------------------------------*/
    110
    111struct ilist {
    112	unsigned nr_elts;	/* excluding sentinel entries */
    113	unsigned head, tail;
    114};
    115
    116static void l_init(struct ilist *l)
    117{
    118	l->nr_elts = 0;
    119	l->head = l->tail = INDEXER_NULL;
    120}
    121
    122static struct entry *l_head(struct entry_space *es, struct ilist *l)
    123{
    124	return to_entry(es, l->head);
    125}
    126
    127static struct entry *l_tail(struct entry_space *es, struct ilist *l)
    128{
    129	return to_entry(es, l->tail);
    130}
    131
    132static struct entry *l_next(struct entry_space *es, struct entry *e)
    133{
    134	return to_entry(es, e->next);
    135}
    136
    137static struct entry *l_prev(struct entry_space *es, struct entry *e)
    138{
    139	return to_entry(es, e->prev);
    140}
    141
    142static bool l_empty(struct ilist *l)
    143{
    144	return l->head == INDEXER_NULL;
    145}
    146
    147static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
    148{
    149	struct entry *head = l_head(es, l);
    150
    151	e->next = l->head;
    152	e->prev = INDEXER_NULL;
    153
    154	if (head)
    155		head->prev = l->head = to_index(es, e);
    156	else
    157		l->head = l->tail = to_index(es, e);
    158
    159	if (!e->sentinel)
    160		l->nr_elts++;
    161}
    162
    163static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
    164{
    165	struct entry *tail = l_tail(es, l);
    166
    167	e->next = INDEXER_NULL;
    168	e->prev = l->tail;
    169
    170	if (tail)
    171		tail->next = l->tail = to_index(es, e);
    172	else
    173		l->head = l->tail = to_index(es, e);
    174
    175	if (!e->sentinel)
    176		l->nr_elts++;
    177}
    178
    179static void l_add_before(struct entry_space *es, struct ilist *l,
    180			 struct entry *old, struct entry *e)
    181{
    182	struct entry *prev = l_prev(es, old);
    183
    184	if (!prev)
    185		l_add_head(es, l, e);
    186
    187	else {
    188		e->prev = old->prev;
    189		e->next = to_index(es, old);
    190		prev->next = old->prev = to_index(es, e);
    191
    192		if (!e->sentinel)
    193			l->nr_elts++;
    194	}
    195}
    196
    197static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
    198{
    199	struct entry *prev = l_prev(es, e);
    200	struct entry *next = l_next(es, e);
    201
    202	if (prev)
    203		prev->next = e->next;
    204	else
    205		l->head = e->next;
    206
    207	if (next)
    208		next->prev = e->prev;
    209	else
    210		l->tail = e->prev;
    211
    212	if (!e->sentinel)
    213		l->nr_elts--;
    214}
    215
    216static struct entry *l_pop_head(struct entry_space *es, struct ilist *l)
    217{
    218	struct entry *e;
    219
    220	for (e = l_head(es, l); e; e = l_next(es, e))
    221		if (!e->sentinel) {
    222			l_del(es, l, e);
    223			return e;
    224		}
    225
    226	return NULL;
    227}
    228
    229static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
    230{
    231	struct entry *e;
    232
    233	for (e = l_tail(es, l); e; e = l_prev(es, e))
    234		if (!e->sentinel) {
    235			l_del(es, l, e);
    236			return e;
    237		}
    238
    239	return NULL;
    240}
    241
    242/*----------------------------------------------------------------*/
    243
    244/*
    245 * The stochastic-multi-queue is a set of lru lists stacked into levels.
    246 * Entries are moved up levels when they are used, which loosely orders the
    247 * most accessed entries in the top levels and least in the bottom.  This
    248 * structure is *much* better than a single lru list.
    249 */
    250#define MAX_LEVELS 64u
    251
    252struct queue {
    253	struct entry_space *es;
    254
    255	unsigned nr_elts;
    256	unsigned nr_levels;
    257	struct ilist qs[MAX_LEVELS];
    258
    259	/*
    260	 * We maintain a count of the number of entries we would like in each
    261	 * level.
    262	 */
    263	unsigned last_target_nr_elts;
    264	unsigned nr_top_levels;
    265	unsigned nr_in_top_levels;
    266	unsigned target_count[MAX_LEVELS];
    267};
    268
    269static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
    270{
    271	unsigned i;
    272
    273	q->es = es;
    274	q->nr_elts = 0;
    275	q->nr_levels = nr_levels;
    276
    277	for (i = 0; i < q->nr_levels; i++) {
    278		l_init(q->qs + i);
    279		q->target_count[i] = 0u;
    280	}
    281
    282	q->last_target_nr_elts = 0u;
    283	q->nr_top_levels = 0u;
    284	q->nr_in_top_levels = 0u;
    285}
    286
    287static unsigned q_size(struct queue *q)
    288{
    289	return q->nr_elts;
    290}
    291
    292/*
    293 * Insert an entry to the back of the given level.
    294 */
    295static void q_push(struct queue *q, struct entry *e)
    296{
    297	BUG_ON(e->pending_work);
    298
    299	if (!e->sentinel)
    300		q->nr_elts++;
    301
    302	l_add_tail(q->es, q->qs + e->level, e);
    303}
    304
    305static void q_push_front(struct queue *q, struct entry *e)
    306{
    307	BUG_ON(e->pending_work);
    308
    309	if (!e->sentinel)
    310		q->nr_elts++;
    311
    312	l_add_head(q->es, q->qs + e->level, e);
    313}
    314
    315static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
    316{
    317	BUG_ON(e->pending_work);
    318
    319	if (!e->sentinel)
    320		q->nr_elts++;
    321
    322	l_add_before(q->es, q->qs + e->level, old, e);
    323}
    324
    325static void q_del(struct queue *q, struct entry *e)
    326{
    327	l_del(q->es, q->qs + e->level, e);
    328	if (!e->sentinel)
    329		q->nr_elts--;
    330}
    331
    332/*
    333 * Return the oldest entry of the lowest populated level.
    334 */
    335static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
    336{
    337	unsigned level;
    338	struct entry *e;
    339
    340	max_level = min(max_level, q->nr_levels);
    341
    342	for (level = 0; level < max_level; level++)
    343		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
    344			if (e->sentinel) {
    345				if (can_cross_sentinel)
    346					continue;
    347				else
    348					break;
    349			}
    350
    351			return e;
    352		}
    353
    354	return NULL;
    355}
    356
    357static struct entry *q_pop(struct queue *q)
    358{
    359	struct entry *e = q_peek(q, q->nr_levels, true);
    360
    361	if (e)
    362		q_del(q, e);
    363
    364	return e;
    365}
    366
    367/*
    368 * This function assumes there is a non-sentinel entry to pop.  It's only
    369 * used by redistribute, so we know this is true.  It also doesn't adjust
    370 * the q->nr_elts count.
    371 */
    372static struct entry *__redist_pop_from(struct queue *q, unsigned level)
    373{
    374	struct entry *e;
    375
    376	for (; level < q->nr_levels; level++)
    377		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
    378			if (!e->sentinel) {
    379				l_del(q->es, q->qs + e->level, e);
    380				return e;
    381			}
    382
    383	return NULL;
    384}
    385
    386static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
    387{
    388	unsigned level, nr_levels, entries_per_level, remainder;
    389
    390	BUG_ON(lbegin > lend);
    391	BUG_ON(lend > q->nr_levels);
    392	nr_levels = lend - lbegin;
    393	entries_per_level = safe_div(nr_elts, nr_levels);
    394	remainder = safe_mod(nr_elts, nr_levels);
    395
    396	for (level = lbegin; level < lend; level++)
    397		q->target_count[level] =
    398			(level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
    399}
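
/*
 * For illustration (figures chosen arbitrarily): spreading nr_elts = 100
 * entries over the eight levels [lbegin = 0, lend = 8) gives
 * entries_per_level = 12 and remainder = 4, so levels 0-3 get a target
 * of 13 and levels 4-7 a target of 12 (4 * 13 + 4 * 12 = 100).
 */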
    400
    401/*
    402 * Typically we have fewer elements in the top few levels which allows us
    403 * to adjust the promote threshold nicely.
    404 */
    405static void q_set_targets(struct queue *q)
    406{
    407	if (q->last_target_nr_elts == q->nr_elts)
    408		return;
    409
    410	q->last_target_nr_elts = q->nr_elts;
    411
    412	if (q->nr_top_levels > q->nr_levels)
    413		q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
    414
    415	else {
    416		q_set_targets_subrange_(q, q->nr_in_top_levels,
    417					q->nr_levels - q->nr_top_levels, q->nr_levels);
    418
    419		if (q->nr_in_top_levels < q->nr_elts)
    420			q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
    421						0, q->nr_levels - q->nr_top_levels);
    422		else
    423			q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
    424	}
    425}
    426
    427static void q_redistribute(struct queue *q)
    428{
    429	unsigned target, level;
    430	struct ilist *l, *l_above;
    431	struct entry *e;
    432
    433	q_set_targets(q);
    434
    435	for (level = 0u; level < q->nr_levels - 1u; level++) {
    436		l = q->qs + level;
    437		target = q->target_count[level];
    438
    439		/*
    440		 * Pull down some entries from the level above.
    441		 */
    442		while (l->nr_elts < target) {
    443			e = __redist_pop_from(q, level + 1u);
    444			if (!e) {
    445				/* bug in nr_elts */
    446				break;
    447			}
    448
    449			e->level = level;
    450			l_add_tail(q->es, l, e);
    451		}
    452
    453		/*
    454		 * Push some entries up.
    455		 */
    456		l_above = q->qs + level + 1u;
    457		while (l->nr_elts > target) {
    458			e = l_pop_tail(q->es, l);
    459
    460			if (!e)
    461				/* bug in nr_elts */
    462				break;
    463
    464			e->level = level + 1u;
    465			l_add_tail(q->es, l_above, e);
    466		}
    467	}
    468}
    469
    470static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels,
    471		      struct entry *s1, struct entry *s2)
    472{
    473	struct entry *de;
    474	unsigned sentinels_passed = 0;
    475	unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels);
    476
    477	/* try and find an entry to swap with */
    478	if (extra_levels && (e->level < q->nr_levels - 1u)) {
    479		for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
    480			sentinels_passed++;
    481
    482		if (de) {
    483			q_del(q, de);
    484			de->level = e->level;
    485			if (s1) {
    486				switch (sentinels_passed) {
    487				case 0:
    488					q_push_before(q, s1, de);
    489					break;
    490
    491				case 1:
    492					q_push_before(q, s2, de);
    493					break;
    494
    495				default:
    496					q_push(q, de);
    497				}
    498			} else
    499				q_push(q, de);
    500		}
    501	}
    502
    503	q_del(q, e);
    504	e->level = new_level;
    505	q_push(q, e);
    506}
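
/*
 * Example of the swap above (figures are illustrative): with e at level 3
 * and extra_levels = 2, new_level becomes 5.  The oldest non-sentinel
 * entry found at level 5 is pulled down to level 3 so the level
 * populations stay roughly balanced, and e is pushed to the back of
 * level 5.  When s1/s2 are supplied (the dirty queue case), the displaced
 * entry is inserted just before the corresponding writeback sentinel so
 * its age relative to the periodic sentinel sweep is roughly preserved.
 */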
    507
    508/*----------------------------------------------------------------*/
    509
    510#define FP_SHIFT 8
    511#define SIXTEENTH (1u << (FP_SHIFT - 4u))
    512#define EIGHTH (1u << (FP_SHIFT - 3u))
    513
    514struct stats {
    515	unsigned hit_threshold;
    516	unsigned hits;
    517	unsigned misses;
    518};
    519
    520enum performance {
    521	Q_POOR,
    522	Q_FAIR,
    523	Q_WELL
    524};
    525
    526static void stats_init(struct stats *s, unsigned nr_levels)
    527{
    528	s->hit_threshold = (nr_levels * 3u) / 4u;
    529	s->hits = 0u;
    530	s->misses = 0u;
    531}
    532
    533static void stats_reset(struct stats *s)
    534{
    535	s->hits = s->misses = 0u;
    536}
    537
    538static void stats_level_accessed(struct stats *s, unsigned level)
    539{
    540	if (level >= s->hit_threshold)
    541		s->hits++;
    542	else
    543		s->misses++;
    544}
    545
    546static void stats_miss(struct stats *s)
    547{
    548	s->misses++;
    549}
    550
    551/*
    552 * There are times when we don't have any confidence in the hotspot queue.
    553 * Such as when a fresh cache is created and the blocks have been spread
    554 * out across the levels, or if an io load changes.  We detect this by
    555 * seeing how often a lookup is in the top levels of the hotspot queue.
    556 */
    557static enum performance stats_assess(struct stats *s)
    558{
    559	unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
    560
    561	if (confidence < SIXTEENTH)
    562		return Q_POOR;
    563
    564	else if (confidence < EIGHTH)
    565		return Q_FAIR;
    566
    567	else
    568		return Q_WELL;
    569}
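
/*
 * Worked example of the fixed point arithmetic above (FP_SHIFT = 8, so
 * confidence is expressed in 1/256ths): with hits = 10 and misses = 90,
 *
 *	confidence = (10 << 8) / 100 = 25
 *
 * which falls between SIXTEENTH (16) and EIGHTH (32), so the queue is
 * judged Q_FAIR.  Roughly: below a 6.25% hit ratio is Q_POOR, below
 * 12.5% is Q_FAIR, anything better is Q_WELL.
 */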
    570
    571/*----------------------------------------------------------------*/
    572
    573struct smq_hash_table {
    574	struct entry_space *es;
    575	unsigned long long hash_bits;
    576	unsigned *buckets;
    577};
    578
    579/*
    580 * All cache entries are stored in a chained hash table.  To save space we
    581 * use indexing again, and only store indexes to the next entry.
    582 */
    583static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries)
    584{
    585	unsigned i, nr_buckets;
    586
    587	ht->es = es;
    588	nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
    589	ht->hash_bits = __ffs(nr_buckets);
    590
    591	ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets)));
    592	if (!ht->buckets)
    593		return -ENOMEM;
    594
    595	for (i = 0; i < nr_buckets; i++)
    596		ht->buckets[i] = INDEXER_NULL;
    597
    598	return 0;
    599}
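
/*
 * Sizing example (entry count chosen for illustration): with
 * nr_entries = 10000 the bucket count is
 * roundup_pow_of_two(max(10000 / 4u, 16u)) = 4096, so hash_bits = 12,
 * i.e. roughly two to three entries per bucket on average.
 */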
    600
    601static void h_exit(struct smq_hash_table *ht)
    602{
    603	vfree(ht->buckets);
    604}
    605
    606static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket)
    607{
    608	return to_entry(ht->es, ht->buckets[bucket]);
    609}
    610
    611static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
    612{
    613	return to_entry(ht->es, e->hash_next);
    614}
    615
    616static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e)
    617{
    618	e->hash_next = ht->buckets[bucket];
    619	ht->buckets[bucket] = to_index(ht->es, e);
    620}
    621
    622static void h_insert(struct smq_hash_table *ht, struct entry *e)
    623{
    624	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
    625	__h_insert(ht, h, e);
    626}
    627
    628static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock,
    629				struct entry **prev)
    630{
    631	struct entry *e;
    632
    633	*prev = NULL;
    634	for (e = h_head(ht, h); e; e = h_next(ht, e)) {
    635		if (e->oblock == oblock)
    636			return e;
    637
    638		*prev = e;
    639	}
    640
    641	return NULL;
    642}
    643
    644static void __h_unlink(struct smq_hash_table *ht, unsigned h,
    645		       struct entry *e, struct entry *prev)
    646{
    647	if (prev)
    648		prev->hash_next = e->hash_next;
    649	else
    650		ht->buckets[h] = e->hash_next;
    651}
    652
    653/*
    654 * Also moves each entry to the front of the bucket.
    655 */
    656static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
    657{
    658	struct entry *e, *prev;
    659	unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
    660
    661	e = __h_lookup(ht, h, oblock, &prev);
    662	if (e && prev) {
    663		/*
    664		 * Move to the front because this entry is likely
    665		 * to be hit again.
    666		 */
    667		__h_unlink(ht, h, e, prev);
    668		__h_insert(ht, h, e);
    669	}
    670
    671	return e;
    672}
    673
    674static void h_remove(struct smq_hash_table *ht, struct entry *e)
    675{
    676	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
    677	struct entry *prev;
    678
    679	/*
    680	 * The down side of using a singly linked list is we have to
    681	 * iterate the bucket to remove an item.
    682	 */
    683	e = __h_lookup(ht, h, e->oblock, &prev);
    684	if (e)
    685		__h_unlink(ht, h, e, prev);
    686}
    687
    688/*----------------------------------------------------------------*/
    689
    690struct entry_alloc {
    691	struct entry_space *es;
    692	unsigned begin;
    693
    694	unsigned nr_allocated;
    695	struct ilist free;
    696};
    697
    698static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
    699			   unsigned begin, unsigned end)
    700{
    701	unsigned i;
    702
    703	ea->es = es;
    704	ea->nr_allocated = 0u;
    705	ea->begin = begin;
    706
    707	l_init(&ea->free);
    708	for (i = begin; i != end; i++)
    709		l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
    710}
    711
    712static void init_entry(struct entry *e)
    713{
    714	/*
    715	 * We can't memset because that would clear the hotspot and
    716	 * sentinel bits which remain constant.
    717	 */
    718	e->hash_next = INDEXER_NULL;
    719	e->next = INDEXER_NULL;
    720	e->prev = INDEXER_NULL;
    721	e->level = 0u;
    722	e->dirty = true;	/* FIXME: audit */
    723	e->allocated = true;
    724	e->sentinel = false;
    725	e->pending_work = false;
    726}
    727
    728static struct entry *alloc_entry(struct entry_alloc *ea)
    729{
    730	struct entry *e;
    731
    732	if (l_empty(&ea->free))
    733		return NULL;
    734
    735	e = l_pop_head(ea->es, &ea->free);
    736	init_entry(e);
    737	ea->nr_allocated++;
    738
    739	return e;
    740}
    741
    742/*
    743 * This assumes the cblock hasn't already been allocated.
    744 */
    745static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
    746{
    747	struct entry *e = __get_entry(ea->es, ea->begin + i);
    748
    749	BUG_ON(e->allocated);
    750
    751	l_del(ea->es, &ea->free, e);
    752	init_entry(e);
    753	ea->nr_allocated++;
    754
    755	return e;
    756}
    757
    758static void free_entry(struct entry_alloc *ea, struct entry *e)
    759{
    760	BUG_ON(!ea->nr_allocated);
    761	BUG_ON(!e->allocated);
    762
    763	ea->nr_allocated--;
    764	e->allocated = false;
    765	l_add_tail(ea->es, &ea->free, e);
    766}
    767
    768static bool allocator_empty(struct entry_alloc *ea)
    769{
    770	return l_empty(&ea->free);
    771}
    772
    773static unsigned get_index(struct entry_alloc *ea, struct entry *e)
    774{
    775	return to_index(ea->es, e) - ea->begin;
    776}
    777
    778static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
    779{
    780	return __get_entry(ea->es, ea->begin + index);
    781}
    782
    783/*----------------------------------------------------------------*/
    784
    785#define NR_HOTSPOT_LEVELS 64u
    786#define NR_CACHE_LEVELS 64u
    787
    788#define WRITEBACK_PERIOD (10ul * HZ)
    789#define DEMOTE_PERIOD (60ul * HZ)
    790
    791#define HOTSPOT_UPDATE_PERIOD (HZ)
    792#define CACHE_UPDATE_PERIOD (60ul * HZ)
    793
    794struct smq_policy {
    795	struct dm_cache_policy policy;
    796
    797	/* protects everything */
    798	spinlock_t lock;
    799	dm_cblock_t cache_size;
    800	sector_t cache_block_size;
    801
    802	sector_t hotspot_block_size;
    803	unsigned nr_hotspot_blocks;
    804	unsigned cache_blocks_per_hotspot_block;
    805	unsigned hotspot_level_jump;
    806
    807	struct entry_space es;
    808	struct entry_alloc writeback_sentinel_alloc;
    809	struct entry_alloc demote_sentinel_alloc;
    810	struct entry_alloc hotspot_alloc;
    811	struct entry_alloc cache_alloc;
    812
    813	unsigned long *hotspot_hit_bits;
    814	unsigned long *cache_hit_bits;
    815
    816	/*
    817	 * We maintain three queues of entries.  The cache proper,
    818	 * consisting of a clean and dirty queue, containing the currently
    819	 * active mappings.  The hotspot queue uses a larger block size to
    820	 * track blocks that are being hit frequently and potential
    821	 * candidates for promotion to the cache.
    822	 */
    823	struct queue hotspot;
    824	struct queue clean;
    825	struct queue dirty;
    826
    827	struct stats hotspot_stats;
    828	struct stats cache_stats;
    829
    830	/*
    831	 * Keeps track of time, incremented by the core.  We use this to
    832	 * avoid attributing multiple hits within the same tick.
    833	 */
    834	unsigned tick;
    835
    836	/*
     837	 * The hash tables allow us to quickly find an entry by origin
    838	 * block.
    839	 */
    840	struct smq_hash_table table;
    841	struct smq_hash_table hotspot_table;
    842
    843	bool current_writeback_sentinels;
    844	unsigned long next_writeback_period;
    845
    846	bool current_demote_sentinels;
    847	unsigned long next_demote_period;
    848
    849	unsigned write_promote_level;
    850	unsigned read_promote_level;
    851
    852	unsigned long next_hotspot_period;
    853	unsigned long next_cache_period;
    854
    855	struct background_tracker *bg_work;
    856
    857	bool migrations_allowed;
    858};
    859
    860/*----------------------------------------------------------------*/
    861
    862static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
    863{
    864	return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
    865}
    866
    867static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
    868{
    869	return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
    870}
    871
    872static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
    873{
    874	return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
    875}
    876
    877static void __update_writeback_sentinels(struct smq_policy *mq)
    878{
    879	unsigned level;
    880	struct queue *q = &mq->dirty;
    881	struct entry *sentinel;
    882
    883	for (level = 0; level < q->nr_levels; level++) {
    884		sentinel = writeback_sentinel(mq, level);
    885		q_del(q, sentinel);
    886		q_push(q, sentinel);
    887	}
    888}
    889
    890static void __update_demote_sentinels(struct smq_policy *mq)
    891{
    892	unsigned level;
    893	struct queue *q = &mq->clean;
    894	struct entry *sentinel;
    895
    896	for (level = 0; level < q->nr_levels; level++) {
    897		sentinel = demote_sentinel(mq, level);
    898		q_del(q, sentinel);
    899		q_push(q, sentinel);
    900	}
    901}
    902
    903static void update_sentinels(struct smq_policy *mq)
    904{
    905	if (time_after(jiffies, mq->next_writeback_period)) {
    906		mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
    907		mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
    908		__update_writeback_sentinels(mq);
    909	}
    910
    911	if (time_after(jiffies, mq->next_demote_period)) {
    912		mq->next_demote_period = jiffies + DEMOTE_PERIOD;
    913		mq->current_demote_sentinels = !mq->current_demote_sentinels;
    914		__update_demote_sentinels(mq);
    915	}
    916}
    917
    918static void __sentinels_init(struct smq_policy *mq)
    919{
    920	unsigned level;
    921	struct entry *sentinel;
    922
    923	for (level = 0; level < NR_CACHE_LEVELS; level++) {
    924		sentinel = writeback_sentinel(mq, level);
    925		sentinel->level = level;
    926		q_push(&mq->dirty, sentinel);
    927
    928		sentinel = demote_sentinel(mq, level);
    929		sentinel->level = level;
    930		q_push(&mq->clean, sentinel);
    931	}
    932}
    933
    934static void sentinels_init(struct smq_policy *mq)
    935{
    936	mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
    937	mq->next_demote_period = jiffies + DEMOTE_PERIOD;
    938
    939	mq->current_writeback_sentinels = false;
    940	mq->current_demote_sentinels = false;
    941	__sentinels_init(mq);
    942
    943	mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
    944	mq->current_demote_sentinels = !mq->current_demote_sentinels;
    945	__sentinels_init(mq);
    946}
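
/*
 * __sentinels_init() runs twice above, once per generation of sentinels,
 * so every level of the clean and dirty queues holds two sentinels.
 * update_sentinels() flips which generation is current once per
 * WRITEBACK_PERIOD / DEMOTE_PERIOD and re-pushes that generation to the
 * back of its level, so an entry found in front of the older sentinel of
 * a level has been sitting at that level for at least one full period.
 */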
    947
    948/*----------------------------------------------------------------*/
    949
    950static void del_queue(struct smq_policy *mq, struct entry *e)
    951{
    952	q_del(e->dirty ? &mq->dirty : &mq->clean, e);
    953}
    954
    955static void push_queue(struct smq_policy *mq, struct entry *e)
    956{
    957	if (e->dirty)
    958		q_push(&mq->dirty, e);
    959	else
    960		q_push(&mq->clean, e);
    961}
    962
    963// !h, !q, a -> h, q, a
    964static void push(struct smq_policy *mq, struct entry *e)
    965{
    966	h_insert(&mq->table, e);
    967	if (!e->pending_work)
    968		push_queue(mq, e);
    969}
    970
    971static void push_queue_front(struct smq_policy *mq, struct entry *e)
    972{
    973	if (e->dirty)
    974		q_push_front(&mq->dirty, e);
    975	else
    976		q_push_front(&mq->clean, e);
    977}
    978
    979static void push_front(struct smq_policy *mq, struct entry *e)
    980{
    981	h_insert(&mq->table, e);
    982	if (!e->pending_work)
    983		push_queue_front(mq, e);
    984}
    985
    986static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
    987{
    988	return to_cblock(get_index(&mq->cache_alloc, e));
    989}
    990
    991static void requeue(struct smq_policy *mq, struct entry *e)
    992{
    993	/*
    994	 * Pending work has temporarily been taken out of the queues.
    995	 */
    996	if (e->pending_work)
    997		return;
    998
    999	if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
   1000		if (!e->dirty) {
   1001			q_requeue(&mq->clean, e, 1u, NULL, NULL);
   1002			return;
   1003		}
   1004
   1005		q_requeue(&mq->dirty, e, 1u,
   1006			  get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
   1007			  get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
   1008	}
   1009}
   1010
   1011static unsigned default_promote_level(struct smq_policy *mq)
   1012{
   1013	/*
   1014	 * The promote level depends on the current performance of the
   1015	 * cache.
   1016	 *
   1017	 * If the cache is performing badly, then we can't afford
   1018	 * to promote much without causing performance to drop below that
   1019	 * of the origin device.
   1020	 *
   1021	 * If the cache is performing well, then we don't need to promote
   1022	 * much.  If it isn't broken, don't fix it.
   1023	 *
   1024	 * If the cache is middling then we promote more.
   1025	 *
   1026	 * This scheme reminds me of a graph of entropy vs probability of a
   1027	 * binary variable.
   1028	 */
   1029	static const unsigned int table[] = {
   1030		1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1
   1031	};
   1032
   1033	unsigned hits = mq->cache_stats.hits;
   1034	unsigned misses = mq->cache_stats.misses;
   1035	unsigned index = safe_div(hits << 4u, hits + misses);
   1036	return table[index];
   1037}
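
/*
 * For example (hit counts chosen arbitrarily): 30 hits and 70 misses give
 * index = (30 << 4) / 100 = 4 and a promote level of table[4] = 4.  Hit
 * ratios just under 50% land on the table's peak of 8, while a cache that
 * hits nearly every time maps back down to 1, matching the entropy-like
 * shape described above.
 */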
   1038
   1039static void update_promote_levels(struct smq_policy *mq)
   1040{
   1041	/*
   1042	 * If there are unused cache entries then we want to be really
   1043	 * eager to promote.
   1044	 */
   1045	unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
   1046		default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
   1047
   1048	threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);
   1049
   1050	/*
   1051	 * If the hotspot queue is performing badly then we have little
   1052	 * confidence that we know which blocks to promote.  So we cut down
   1053	 * the amount of promotions.
   1054	 */
   1055	switch (stats_assess(&mq->hotspot_stats)) {
   1056	case Q_POOR:
   1057		threshold_level /= 4u;
   1058		break;
   1059
   1060	case Q_FAIR:
   1061		threshold_level /= 2u;
   1062		break;
   1063
   1064	case Q_WELL:
   1065		break;
   1066	}
   1067
   1068	mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
   1069	mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
   1070}
   1071
   1072/*
   1073 * If the hotspot queue is performing badly, then we try and move entries
   1074 * around more quickly.
   1075 */
   1076static void update_level_jump(struct smq_policy *mq)
   1077{
   1078	switch (stats_assess(&mq->hotspot_stats)) {
   1079	case Q_POOR:
   1080		mq->hotspot_level_jump = 4u;
   1081		break;
   1082
   1083	case Q_FAIR:
   1084		mq->hotspot_level_jump = 2u;
   1085		break;
   1086
   1087	case Q_WELL:
   1088		mq->hotspot_level_jump = 1u;
   1089		break;
   1090	}
   1091}
   1092
   1093static void end_hotspot_period(struct smq_policy *mq)
   1094{
   1095	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
   1096	update_promote_levels(mq);
   1097
   1098	if (time_after(jiffies, mq->next_hotspot_period)) {
   1099		update_level_jump(mq);
   1100		q_redistribute(&mq->hotspot);
   1101		stats_reset(&mq->hotspot_stats);
   1102		mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
   1103	}
   1104}
   1105
   1106static void end_cache_period(struct smq_policy *mq)
   1107{
   1108	if (time_after(jiffies, mq->next_cache_period)) {
   1109		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
   1110
   1111		q_redistribute(&mq->dirty);
   1112		q_redistribute(&mq->clean);
   1113		stats_reset(&mq->cache_stats);
   1114
   1115		mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
   1116	}
   1117}
   1118
   1119/*----------------------------------------------------------------*/
   1120
   1121/*
   1122 * Targets are given as a percentage.
   1123 */
   1124#define CLEAN_TARGET 25u
   1125#define FREE_TARGET 25u
   1126
   1127static unsigned percent_to_target(struct smq_policy *mq, unsigned p)
   1128{
   1129	return from_cblock(mq->cache_size) * p / 100u;
   1130}
   1131
   1132static bool clean_target_met(struct smq_policy *mq, bool idle)
   1133{
   1134	/*
   1135	 * Cache entries may not be populated.  So we cannot rely on the
   1136	 * size of the clean queue.
   1137	 */
   1138	if (idle) {
   1139		/*
   1140		 * We'd like to clean everything.
   1141		 */
   1142		return q_size(&mq->dirty) == 0u;
   1143	}
   1144
   1145	/*
   1146	 * If we're busy we don't worry about cleaning at all.
   1147	 */
   1148	return true;
   1149}
   1150
   1151static bool free_target_met(struct smq_policy *mq)
   1152{
   1153	unsigned nr_free;
   1154
   1155	nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated;
   1156	return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
   1157		percent_to_target(mq, FREE_TARGET);
   1158}
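
/*
 * E.g. (cache size chosen for illustration): with a 100000 block cache
 * and FREE_TARGET of 25 percent, the target is met once at least 25000
 * blocks are either unallocated or already queued for demotion.
 */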
   1159
   1160/*----------------------------------------------------------------*/
   1161
   1162static void mark_pending(struct smq_policy *mq, struct entry *e)
   1163{
   1164	BUG_ON(e->sentinel);
   1165	BUG_ON(!e->allocated);
   1166	BUG_ON(e->pending_work);
   1167	e->pending_work = true;
   1168}
   1169
   1170static void clear_pending(struct smq_policy *mq, struct entry *e)
   1171{
   1172	BUG_ON(!e->pending_work);
   1173	e->pending_work = false;
   1174}
   1175
   1176static void queue_writeback(struct smq_policy *mq, bool idle)
   1177{
   1178	int r;
   1179	struct policy_work work;
   1180	struct entry *e;
   1181
   1182	e = q_peek(&mq->dirty, mq->dirty.nr_levels, idle);
   1183	if (e) {
   1184		mark_pending(mq, e);
   1185		q_del(&mq->dirty, e);
   1186
   1187		work.op = POLICY_WRITEBACK;
   1188		work.oblock = e->oblock;
   1189		work.cblock = infer_cblock(mq, e);
   1190
   1191		r = btracker_queue(mq->bg_work, &work, NULL);
   1192		if (r) {
   1193			clear_pending(mq, e);
   1194			q_push_front(&mq->dirty, e);
   1195		}
   1196	}
   1197}
   1198
   1199static void queue_demotion(struct smq_policy *mq)
   1200{
   1201	int r;
   1202	struct policy_work work;
   1203	struct entry *e;
   1204
   1205	if (WARN_ON_ONCE(!mq->migrations_allowed))
   1206		return;
   1207
   1208	e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
   1209	if (!e) {
   1210		if (!clean_target_met(mq, true))
   1211			queue_writeback(mq, false);
   1212		return;
   1213	}
   1214
   1215	mark_pending(mq, e);
   1216	q_del(&mq->clean, e);
   1217
   1218	work.op = POLICY_DEMOTE;
   1219	work.oblock = e->oblock;
   1220	work.cblock = infer_cblock(mq, e);
   1221	r = btracker_queue(mq->bg_work, &work, NULL);
   1222	if (r) {
   1223		clear_pending(mq, e);
   1224		q_push_front(&mq->clean, e);
   1225	}
   1226}
   1227
   1228static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
   1229			    struct policy_work **workp)
   1230{
   1231	int r;
   1232	struct entry *e;
   1233	struct policy_work work;
   1234
   1235	if (!mq->migrations_allowed)
   1236		return;
   1237
   1238	if (allocator_empty(&mq->cache_alloc)) {
   1239		/*
   1240		 * We always claim to be 'idle' to ensure some demotions happen
   1241		 * with continuous loads.
   1242		 */
   1243		if (!free_target_met(mq))
   1244			queue_demotion(mq);
   1245		return;
   1246	}
   1247
   1248	if (btracker_promotion_already_present(mq->bg_work, oblock))
   1249		return;
   1250
   1251	/*
   1252	 * We allocate the entry now to reserve the cblock.  If the
   1253	 * background work is aborted we must remember to free it.
   1254	 */
   1255	e = alloc_entry(&mq->cache_alloc);
   1256	BUG_ON(!e);
   1257	e->pending_work = true;
   1258	work.op = POLICY_PROMOTE;
   1259	work.oblock = oblock;
   1260	work.cblock = infer_cblock(mq, e);
   1261	r = btracker_queue(mq->bg_work, &work, workp);
   1262	if (r)
   1263		free_entry(&mq->cache_alloc, e);
   1264}
   1265
   1266/*----------------------------------------------------------------*/
   1267
   1268enum promote_result {
   1269	PROMOTE_NOT,
   1270	PROMOTE_TEMPORARY,
   1271	PROMOTE_PERMANENT
   1272};
   1273
   1274/*
   1275 * Converts a boolean into a promote result.
   1276 */
   1277static enum promote_result maybe_promote(bool promote)
   1278{
   1279	return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
   1280}
   1281
   1282static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
   1283					  int data_dir, bool fast_promote)
   1284{
   1285	if (data_dir == WRITE) {
   1286		if (!allocator_empty(&mq->cache_alloc) && fast_promote)
   1287			return PROMOTE_TEMPORARY;
   1288
   1289		return maybe_promote(hs_e->level >= mq->write_promote_level);
   1290	} else
   1291		return maybe_promote(hs_e->level >= mq->read_promote_level);
   1292}
   1293
   1294static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
   1295{
   1296	sector_t r = from_oblock(b);
   1297	(void) sector_div(r, mq->cache_blocks_per_hotspot_block);
   1298	return to_oblock(r);
   1299}
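
/*
 * i.e. origin blocks are grouped into runs of cache_blocks_per_hotspot_block:
 * with the initial factor of 16 chosen by calc_hotspot_params(), origin
 * blocks 0-15 map to hotspot block 0, blocks 16-31 to hotspot block 1,
 * and so on.
 */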
   1300
   1301static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
   1302{
   1303	unsigned hi;
   1304	dm_oblock_t hb = to_hblock(mq, b);
   1305	struct entry *e = h_lookup(&mq->hotspot_table, hb);
   1306
   1307	if (e) {
   1308		stats_level_accessed(&mq->hotspot_stats, e->level);
   1309
   1310		hi = get_index(&mq->hotspot_alloc, e);
   1311		q_requeue(&mq->hotspot, e,
   1312			  test_and_set_bit(hi, mq->hotspot_hit_bits) ?
   1313			  0u : mq->hotspot_level_jump,
   1314			  NULL, NULL);
   1315
   1316	} else {
   1317		stats_miss(&mq->hotspot_stats);
   1318
   1319		e = alloc_entry(&mq->hotspot_alloc);
   1320		if (!e) {
   1321			e = q_pop(&mq->hotspot);
   1322			if (e) {
   1323				h_remove(&mq->hotspot_table, e);
   1324				hi = get_index(&mq->hotspot_alloc, e);
   1325				clear_bit(hi, mq->hotspot_hit_bits);
   1326			}
   1327
   1328		}
   1329
   1330		if (e) {
   1331			e->oblock = hb;
   1332			q_push(&mq->hotspot, e);
   1333			h_insert(&mq->hotspot_table, e);
   1334		}
   1335	}
   1336
   1337	return e;
   1338}
   1339
   1340/*----------------------------------------------------------------*/
   1341
   1342/*
   1343 * Public interface, via the policy struct.  See dm-cache-policy.h for a
   1344 * description of these.
   1345 */
   1346
   1347static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
   1348{
   1349	return container_of(p, struct smq_policy, policy);
   1350}
   1351
   1352static void smq_destroy(struct dm_cache_policy *p)
   1353{
   1354	struct smq_policy *mq = to_smq_policy(p);
   1355
   1356	btracker_destroy(mq->bg_work);
   1357	h_exit(&mq->hotspot_table);
   1358	h_exit(&mq->table);
   1359	free_bitset(mq->hotspot_hit_bits);
   1360	free_bitset(mq->cache_hit_bits);
   1361	space_exit(&mq->es);
   1362	kfree(mq);
   1363}
   1364
   1365/*----------------------------------------------------------------*/
   1366
   1367static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
   1368		    int data_dir, bool fast_copy,
   1369		    struct policy_work **work, bool *background_work)
   1370{
   1371	struct entry *e, *hs_e;
   1372	enum promote_result pr;
   1373
   1374	*background_work = false;
   1375
   1376	e = h_lookup(&mq->table, oblock);
   1377	if (e) {
   1378		stats_level_accessed(&mq->cache_stats, e->level);
   1379
   1380		requeue(mq, e);
   1381		*cblock = infer_cblock(mq, e);
   1382		return 0;
   1383
   1384	} else {
   1385		stats_miss(&mq->cache_stats);
   1386
   1387		/*
   1388		 * The hotspot queue only gets updated with misses.
   1389		 */
   1390		hs_e = update_hotspot_queue(mq, oblock);
   1391
   1392		pr = should_promote(mq, hs_e, data_dir, fast_copy);
   1393		if (pr != PROMOTE_NOT) {
   1394			queue_promotion(mq, oblock, work);
   1395			*background_work = true;
   1396		}
   1397
   1398		return -ENOENT;
   1399	}
   1400}
   1401
   1402static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
   1403		      int data_dir, bool fast_copy,
   1404		      bool *background_work)
   1405{
   1406	int r;
   1407	unsigned long flags;
   1408	struct smq_policy *mq = to_smq_policy(p);
   1409
   1410	spin_lock_irqsave(&mq->lock, flags);
   1411	r = __lookup(mq, oblock, cblock,
   1412		     data_dir, fast_copy,
   1413		     NULL, background_work);
   1414	spin_unlock_irqrestore(&mq->lock, flags);
   1415
   1416	return r;
   1417}
   1418
   1419static int smq_lookup_with_work(struct dm_cache_policy *p,
   1420				dm_oblock_t oblock, dm_cblock_t *cblock,
   1421				int data_dir, bool fast_copy,
   1422				struct policy_work **work)
   1423{
   1424	int r;
   1425	bool background_queued;
   1426	unsigned long flags;
   1427	struct smq_policy *mq = to_smq_policy(p);
   1428
   1429	spin_lock_irqsave(&mq->lock, flags);
   1430	r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
   1431	spin_unlock_irqrestore(&mq->lock, flags);
   1432
   1433	return r;
   1434}
   1435
   1436static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
   1437				   struct policy_work **result)
   1438{
   1439	int r;
   1440	unsigned long flags;
   1441	struct smq_policy *mq = to_smq_policy(p);
   1442
   1443	spin_lock_irqsave(&mq->lock, flags);
   1444	r = btracker_issue(mq->bg_work, result);
   1445	if (r == -ENODATA) {
   1446		if (!clean_target_met(mq, idle)) {
   1447			queue_writeback(mq, idle);
   1448			r = btracker_issue(mq->bg_work, result);
   1449		}
   1450	}
   1451	spin_unlock_irqrestore(&mq->lock, flags);
   1452
   1453	return r;
   1454}
   1455
   1456/*
   1457 * We need to clear any pending work flags that have been set, and in the
   1458 * case of promotion free the entry for the destination cblock.
   1459 */
   1460static void __complete_background_work(struct smq_policy *mq,
   1461				       struct policy_work *work,
   1462				       bool success)
   1463{
   1464	struct entry *e = get_entry(&mq->cache_alloc,
   1465				    from_cblock(work->cblock));
   1466
   1467	switch (work->op) {
   1468	case POLICY_PROMOTE:
   1469		// !h, !q, a
   1470		clear_pending(mq, e);
   1471		if (success) {
   1472			e->oblock = work->oblock;
   1473			e->level = NR_CACHE_LEVELS - 1;
   1474			push(mq, e);
   1475			// h, q, a
   1476		} else {
   1477			free_entry(&mq->cache_alloc, e);
   1478			// !h, !q, !a
   1479		}
   1480		break;
   1481
   1482	case POLICY_DEMOTE:
   1483		// h, !q, a
   1484		if (success) {
   1485			h_remove(&mq->table, e);
   1486			free_entry(&mq->cache_alloc, e);
   1487			// !h, !q, !a
   1488		} else {
   1489			clear_pending(mq, e);
   1490			push_queue(mq, e);
   1491			// h, q, a
   1492		}
   1493		break;
   1494
   1495	case POLICY_WRITEBACK:
   1496		// h, !q, a
   1497		clear_pending(mq, e);
   1498		push_queue(mq, e);
   1499		// h, q, a
   1500		break;
   1501	}
   1502
   1503	btracker_complete(mq->bg_work, work);
   1504}
   1505
   1506static void smq_complete_background_work(struct dm_cache_policy *p,
   1507					 struct policy_work *work,
   1508					 bool success)
   1509{
   1510	unsigned long flags;
   1511	struct smq_policy *mq = to_smq_policy(p);
   1512
   1513	spin_lock_irqsave(&mq->lock, flags);
   1514	__complete_background_work(mq, work, success);
   1515	spin_unlock_irqrestore(&mq->lock, flags);
   1516}
   1517
   1518// in_hash(oblock) -> in_hash(oblock)
   1519static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
   1520{
   1521	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
   1522
   1523	if (e->pending_work)
   1524		e->dirty = set;
   1525	else {
   1526		del_queue(mq, e);
   1527		e->dirty = set;
   1528		push_queue(mq, e);
   1529	}
   1530}
   1531
   1532static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
   1533{
   1534	unsigned long flags;
   1535	struct smq_policy *mq = to_smq_policy(p);
   1536
   1537	spin_lock_irqsave(&mq->lock, flags);
   1538	__smq_set_clear_dirty(mq, cblock, true);
   1539	spin_unlock_irqrestore(&mq->lock, flags);
   1540}
   1541
   1542static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
   1543{
   1544	struct smq_policy *mq = to_smq_policy(p);
   1545	unsigned long flags;
   1546
   1547	spin_lock_irqsave(&mq->lock, flags);
   1548	__smq_set_clear_dirty(mq, cblock, false);
   1549	spin_unlock_irqrestore(&mq->lock, flags);
   1550}
   1551
   1552static unsigned random_level(dm_cblock_t cblock)
   1553{
   1554	return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
   1555}
   1556
   1557static int smq_load_mapping(struct dm_cache_policy *p,
   1558			    dm_oblock_t oblock, dm_cblock_t cblock,
   1559			    bool dirty, uint32_t hint, bool hint_valid)
   1560{
   1561	struct smq_policy *mq = to_smq_policy(p);
   1562	struct entry *e;
   1563
   1564	e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
   1565	e->oblock = oblock;
   1566	e->dirty = dirty;
   1567	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
   1568	e->pending_work = false;
   1569
   1570	/*
   1571	 * When we load mappings we push ahead of both sentinels in order to
   1572	 * allow demotions and cleaning to occur immediately.
   1573	 */
   1574	push_front(mq, e);
   1575
   1576	return 0;
   1577}
   1578
   1579static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
   1580{
   1581	struct smq_policy *mq = to_smq_policy(p);
   1582	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
   1583
   1584	if (!e->allocated)
   1585		return -ENODATA;
   1586
   1587	// FIXME: what if this block has pending background work?
   1588	del_queue(mq, e);
   1589	h_remove(&mq->table, e);
   1590	free_entry(&mq->cache_alloc, e);
   1591	return 0;
   1592}
   1593
   1594static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
   1595{
   1596	struct smq_policy *mq = to_smq_policy(p);
   1597	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
   1598
   1599	if (!e->allocated)
   1600		return 0;
   1601
   1602	return e->level;
   1603}
   1604
   1605static dm_cblock_t smq_residency(struct dm_cache_policy *p)
   1606{
   1607	dm_cblock_t r;
   1608	unsigned long flags;
   1609	struct smq_policy *mq = to_smq_policy(p);
   1610
   1611	spin_lock_irqsave(&mq->lock, flags);
   1612	r = to_cblock(mq->cache_alloc.nr_allocated);
   1613	spin_unlock_irqrestore(&mq->lock, flags);
   1614
   1615	return r;
   1616}
   1617
   1618static void smq_tick(struct dm_cache_policy *p, bool can_block)
   1619{
   1620	struct smq_policy *mq = to_smq_policy(p);
   1621	unsigned long flags;
   1622
   1623	spin_lock_irqsave(&mq->lock, flags);
   1624	mq->tick++;
   1625	update_sentinels(mq);
   1626	end_hotspot_period(mq);
   1627	end_cache_period(mq);
   1628	spin_unlock_irqrestore(&mq->lock, flags);
   1629}
   1630
   1631static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
   1632{
   1633	struct smq_policy *mq = to_smq_policy(p);
   1634	mq->migrations_allowed = allow;
   1635}
   1636
   1637/*
   1638 * smq has no config values, but the old mq policy did.  To avoid breaking
   1639 * software we continue to accept these configurables for the mq policy,
   1640 * but they have no effect.
   1641 */
   1642static int mq_set_config_value(struct dm_cache_policy *p,
   1643			       const char *key, const char *value)
   1644{
   1645	unsigned long tmp;
   1646
   1647	if (kstrtoul(value, 10, &tmp))
   1648		return -EINVAL;
   1649
   1650	if (!strcasecmp(key, "random_threshold") ||
   1651	    !strcasecmp(key, "sequential_threshold") ||
   1652	    !strcasecmp(key, "discard_promote_adjustment") ||
   1653	    !strcasecmp(key, "read_promote_adjustment") ||
   1654	    !strcasecmp(key, "write_promote_adjustment")) {
   1655		DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
   1656		return 0;
   1657	}
   1658
   1659	return -EINVAL;
   1660}
   1661
   1662static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
   1663				 unsigned maxlen, ssize_t *sz_ptr)
   1664{
   1665	ssize_t sz = *sz_ptr;
   1666
   1667	DMEMIT("10 random_threshold 0 "
   1668	       "sequential_threshold 0 "
   1669	       "discard_promote_adjustment 0 "
   1670	       "read_promote_adjustment 0 "
   1671	       "write_promote_adjustment 0 ");
   1672
   1673	*sz_ptr = sz;
   1674	return 0;
   1675}
   1676
   1677/* Init the policy plugin interface function pointers. */
   1678static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
   1679{
   1680	mq->policy.destroy = smq_destroy;
   1681	mq->policy.lookup = smq_lookup;
   1682	mq->policy.lookup_with_work = smq_lookup_with_work;
   1683	mq->policy.get_background_work = smq_get_background_work;
   1684	mq->policy.complete_background_work = smq_complete_background_work;
   1685	mq->policy.set_dirty = smq_set_dirty;
   1686	mq->policy.clear_dirty = smq_clear_dirty;
   1687	mq->policy.load_mapping = smq_load_mapping;
   1688	mq->policy.invalidate_mapping = smq_invalidate_mapping;
   1689	mq->policy.get_hint = smq_get_hint;
   1690	mq->policy.residency = smq_residency;
   1691	mq->policy.tick = smq_tick;
   1692	mq->policy.allow_migrations = smq_allow_migrations;
   1693
   1694	if (mimic_mq) {
   1695		mq->policy.set_config_value = mq_set_config_value;
   1696		mq->policy.emit_config_values = mq_emit_config_values;
   1697	}
   1698}
   1699
   1700static bool too_many_hotspot_blocks(sector_t origin_size,
   1701				    sector_t hotspot_block_size,
   1702				    unsigned nr_hotspot_blocks)
   1703{
   1704	return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
   1705}
   1706
   1707static void calc_hotspot_params(sector_t origin_size,
   1708				sector_t cache_block_size,
   1709				unsigned nr_cache_blocks,
   1710				sector_t *hotspot_block_size,
   1711				unsigned *nr_hotspot_blocks)
   1712{
   1713	*hotspot_block_size = cache_block_size * 16u;
   1714	*nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);
   1715
   1716	while ((*hotspot_block_size > cache_block_size) &&
   1717	       too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
   1718		*hotspot_block_size /= 2u;
   1719}
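
/*
 * Sizing example (figures are illustrative): with 64 KiB cache blocks
 * (128 sectors) and 100000 cache blocks, the initial hotspot block is
 * 1 MiB (2048 sectors) and nr_hotspot_blocks = max(25000u, 1024u) = 25000,
 * which tracks 25000 MiB of origin.  If the origin device is smaller than
 * that, the hotspot block size is halved until the tracked range fits or
 * the hotspot block shrinks to the cache block size.
 */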
   1720
   1721static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
   1722					    sector_t origin_size,
   1723					    sector_t cache_block_size,
   1724					    bool mimic_mq,
   1725					    bool migrations_allowed)
   1726{
   1727	unsigned i;
   1728	unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
   1729	unsigned total_sentinels = 2u * nr_sentinels_per_queue;
   1730	struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
   1731
   1732	if (!mq)
   1733		return NULL;
   1734
   1735	init_policy_functions(mq, mimic_mq);
   1736	mq->cache_size = cache_size;
   1737	mq->cache_block_size = cache_block_size;
   1738
   1739	calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
   1740			    &mq->hotspot_block_size, &mq->nr_hotspot_blocks);
   1741
   1742	mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
   1743	mq->hotspot_level_jump = 1u;
   1744	if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
   1745		DMERR("couldn't initialize entry space");
   1746		goto bad_pool_init;
   1747	}
   1748
   1749	init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
   1750	for (i = 0; i < nr_sentinels_per_queue; i++)
   1751		get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
   1752
   1753	init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
   1754	for (i = 0; i < nr_sentinels_per_queue; i++)
   1755		get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
   1756
   1757	init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
   1758		       total_sentinels + mq->nr_hotspot_blocks);
   1759
   1760	init_allocator(&mq->cache_alloc, &mq->es,
   1761		       total_sentinels + mq->nr_hotspot_blocks,
   1762		       total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));
   1763
   1764	mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
   1765	if (!mq->hotspot_hit_bits) {
   1766		DMERR("couldn't allocate hotspot hit bitset");
   1767		goto bad_hotspot_hit_bits;
   1768	}
   1769	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
   1770
   1771	if (from_cblock(cache_size)) {
   1772		mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
   1773		if (!mq->cache_hit_bits) {
   1774			DMERR("couldn't allocate cache hit bitset");
   1775			goto bad_cache_hit_bits;
   1776		}
   1777		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
   1778	} else
   1779		mq->cache_hit_bits = NULL;
   1780
   1781	mq->tick = 0;
   1782	spin_lock_init(&mq->lock);
   1783
   1784	q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
   1785	mq->hotspot.nr_top_levels = 8;
   1786	mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
   1787					   from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
   1788
   1789	q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
   1790	q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);
   1791
   1792	stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
   1793	stats_init(&mq->cache_stats, NR_CACHE_LEVELS);
   1794
   1795	if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
   1796		goto bad_alloc_table;
   1797
   1798	if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
   1799		goto bad_alloc_hotspot_table;
   1800
   1801	sentinels_init(mq);
   1802	mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
   1803
   1804	mq->next_hotspot_period = jiffies;
   1805	mq->next_cache_period = jiffies;
   1806
   1807	mq->bg_work = btracker_create(4096); /* FIXME: hard coded value */
   1808	if (!mq->bg_work)
   1809		goto bad_btracker;
   1810
   1811	mq->migrations_allowed = migrations_allowed;
   1812
   1813	return &mq->policy;
   1814
   1815bad_btracker:
   1816	h_exit(&mq->hotspot_table);
   1817bad_alloc_hotspot_table:
   1818	h_exit(&mq->table);
   1819bad_alloc_table:
   1820	free_bitset(mq->cache_hit_bits);
   1821bad_cache_hit_bits:
   1822	free_bitset(mq->hotspot_hit_bits);
   1823bad_hotspot_hit_bits:
   1824	space_exit(&mq->es);
   1825bad_pool_init:
   1826	kfree(mq);
   1827
   1828	return NULL;
   1829}
   1830
   1831static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
   1832					  sector_t origin_size,
   1833					  sector_t cache_block_size)
   1834{
   1835	return __smq_create(cache_size, origin_size, cache_block_size, false, true);
   1836}
   1837
   1838static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
   1839					 sector_t origin_size,
   1840					 sector_t cache_block_size)
   1841{
   1842	return __smq_create(cache_size, origin_size, cache_block_size, true, true);
   1843}
   1844
   1845static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
   1846					      sector_t origin_size,
   1847					      sector_t cache_block_size)
   1848{
   1849	return __smq_create(cache_size, origin_size, cache_block_size, false, false);
   1850}
   1851
   1852/*----------------------------------------------------------------*/
   1853
   1854static struct dm_cache_policy_type smq_policy_type = {
   1855	.name = "smq",
   1856	.version = {2, 0, 0},
   1857	.hint_size = 4,
   1858	.owner = THIS_MODULE,
   1859	.create = smq_create
   1860};
   1861
   1862static struct dm_cache_policy_type mq_policy_type = {
   1863	.name = "mq",
   1864	.version = {2, 0, 0},
   1865	.hint_size = 4,
   1866	.owner = THIS_MODULE,
   1867	.create = mq_create,
   1868};
   1869
   1870static struct dm_cache_policy_type cleaner_policy_type = {
   1871	.name = "cleaner",
   1872	.version = {2, 0, 0},
   1873	.hint_size = 4,
   1874	.owner = THIS_MODULE,
   1875	.create = cleaner_create,
   1876};
   1877
   1878static struct dm_cache_policy_type default_policy_type = {
   1879	.name = "default",
   1880	.version = {2, 0, 0},
   1881	.hint_size = 4,
   1882	.owner = THIS_MODULE,
   1883	.create = smq_create,
   1884	.real = &smq_policy_type
   1885};
   1886
   1887static int __init smq_init(void)
   1888{
   1889	int r;
   1890
   1891	r = dm_cache_policy_register(&smq_policy_type);
   1892	if (r) {
   1893		DMERR("register failed %d", r);
   1894		return -ENOMEM;
   1895	}
   1896
   1897	r = dm_cache_policy_register(&mq_policy_type);
   1898	if (r) {
   1899		DMERR("register failed (as mq) %d", r);
   1900		goto out_mq;
   1901	}
   1902
   1903	r = dm_cache_policy_register(&cleaner_policy_type);
   1904	if (r) {
   1905		DMERR("register failed (as cleaner) %d", r);
   1906		goto out_cleaner;
   1907	}
   1908
   1909	r = dm_cache_policy_register(&default_policy_type);
   1910	if (r) {
   1911		DMERR("register failed (as default) %d", r);
   1912		goto out_default;
   1913	}
   1914
   1915	return 0;
   1916
   1917out_default:
   1918	dm_cache_policy_unregister(&cleaner_policy_type);
   1919out_cleaner:
   1920	dm_cache_policy_unregister(&mq_policy_type);
   1921out_mq:
   1922	dm_cache_policy_unregister(&smq_policy_type);
   1923
   1924	return -ENOMEM;
   1925}
   1926
   1927static void __exit smq_exit(void)
   1928{
   1929	dm_cache_policy_unregister(&cleaner_policy_type);
   1930	dm_cache_policy_unregister(&smq_policy_type);
   1931	dm_cache_policy_unregister(&mq_policy_type);
   1932	dm_cache_policy_unregister(&default_policy_type);
   1933}
   1934
   1935module_init(smq_init);
   1936module_exit(smq_exit);
   1937
   1938MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
   1939MODULE_LICENSE("GPL");
   1940MODULE_DESCRIPTION("smq cache policy");
   1941
   1942MODULE_ALIAS("dm-cache-default");
   1943MODULE_ALIAS("dm-cache-mq");
   1944MODULE_ALIAS("dm-cache-cleaner");