cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

btree.h (14276B)


/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHE_BTREE_H
#define _BCACHE_BTREE_H

/*
 * THE BTREE:
 *
 * At a high level, bcache's btree is a relatively standard b+ tree. All keys
 * and pointers are in the leaves; interior nodes only have pointers to the
 * child nodes.
 *
 * In the interior nodes, a struct bkey always points to a child btree node, and
 * the key is the highest key in the child node - except that the highest key in
 * an interior node is always MAX_KEY. The size field refers to the size on disk
 * of the child node - this would allow us to have variable sized btree nodes
 * (handy for keeping the depth of the btree at 1 by expanding just the root).
 *
 * Btree nodes are themselves log structured, but this is hidden fairly
 * thoroughly. Btree nodes on disk will in practice have extents that overlap
 * (because they were written at different times), but in memory we never have
 * overlapping extents - when we read in a btree node from disk, the first thing
 * we do is re-sort all the sets of keys with a mergesort, and in the same pass
 * we check for overlapping extents and adjust them appropriately.
 *
 * struct btree_op is a central interface to the btree code. It's used for
 * specifying read vs. write locking, and the embedded closure is used for
 * waiting on IO or reserving memory.
 *
 * BTREE CACHE:
 *
 * Btree nodes are cached in memory; traversing the btree might require reading
 * in btree nodes, which is handled mostly transparently.
 *
 * bch_btree_node_get() looks up a btree node in the cache and reads it in from
 * disk if necessary. This function is almost never called directly though - the
 * bcache_btree() macro is used to get a btree node, call some function on it,
 * and unlock the node after the function returns.
 *
 * The root is special cased - it's taken out of the cache's lru (thus pinning
 * it in memory), so we can find the root of the btree by just dereferencing a
 * pointer instead of looking it up in the cache. This makes locking a bit
 * tricky, since the root pointer is protected by the lock in the btree node it
 * points to - the bcache_btree_root() macro handles this.
 *
 * In various places we must be able to allocate memory for multiple btree nodes
 * in order to make forward progress. To do this we use the btree cache itself
 * as a reserve; if __get_free_pages() fails, we'll find a node in the btree
 * cache we can reuse. We can't allow more than one thread to be doing this at a
 * time, so there's a lock, implemented by a pointer to the btree_op closure -
 * this allows the bcache_btree_root() macro to implicitly release this lock.
 *
 * BTREE IO:
 *
 * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles
 * this.
 *
 * For writing, we have two btree_write structs embedded in struct btree - one
 * write in flight, and one being set up, and we toggle between them.
 *
 * Writing is done with a single function - bch_btree_write() really serves two
 * different purposes and should be broken up into two different functions. When
 * passing now = false, it merely indicates that the node is now dirty - calling
 * it ensures that the dirty keys will be written at some point in the future.
 *
 * When passing now = true, bch_btree_write() causes a write to happen
 * "immediately" (if there was already a write in flight, it'll cause the write
 * to happen as soon as the previous write completes). It returns immediately
 * though - it takes a refcount on the closure in struct btree_op you passed
 * to it, so a closure_sync() later can be used to wait for the write to
 * complete.
 *
 * This is handy because btree_split() and garbage collection can issue writes
 * in parallel, reducing the amount of time they have to hold write locks.
 *
 * LOCKING:
 *
 * When traversing the btree, we may need write locks starting at some level -
 * inserting a key into the btree will typically only require a write lock on
 * the leaf node.
 *
 * This is specified with the lock field in struct btree_op; lock = 0 means we
 * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get()
 * checks this field and returns the node with the appropriate lock held.
 *
 * If, after traversing the btree, the insertion code discovers it has to split,
 * then it must restart from the root and take new locks - to do this it changes
 * the lock field and returns -EINTR, which causes the bcache_btree_root() macro
 * to loop.
 *
 * Handling cache misses requires a different mechanism for upgrading to a write
 * lock. We do cache lookups with only a read lock held, but if we get a cache
 * miss and we wish to insert this data into the cache, we have to insert a
 * placeholder key to detect races - otherwise, we could race with a write and
 * overwrite the data that was just written to the cache with stale data from
 * the backing device.
 *
 * For this we use a sequence number that write locks and unlocks increment - to
 * insert the check key it unlocks the btree node and then takes a write lock,
 * and fails if the sequence number doesn't match.
 */

#include "bset.h"
#include "debug.h"

struct btree_write {
	atomic_t		*journal;

	/* If btree_split() frees a btree node, it writes a new pointer to that
	 * btree node indicating it was freed; it takes a refcount on
	 * c->prio_blocked because we can't write the gens until the new
	 * pointer is on disk. This allows btree_write_endio() to release the
	 * refcount that btree_split() took.
	 */
	int			prio_blocked;
};

struct btree {
	/* Hottest entries first */
	struct hlist_node	hash;

	/* Key/pointer for this btree node */
	BKEY_PADDED(key);

	unsigned long		seq;
	struct rw_semaphore	lock;
	struct cache_set	*c;
	struct btree		*parent;

	struct mutex		write_lock;

	unsigned long		flags;
	uint16_t		written;	/* would be nice to kill */
	uint8_t			level;

	struct btree_keys	keys;

	/* For outstanding btree writes, used as a lock - protects write_idx */
	struct closure		io;
	struct semaphore	io_mutex;

	struct list_head	list;
	struct delayed_work	work;

	struct btree_write	writes[2];
	struct bio		*bio;
};

#define BTREE_FLAG(flag)						\
static inline bool btree_node_ ## flag(struct btree *b)			\
{	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
									\
static inline void set_btree_node_ ## flag(struct btree *b)		\
{	set_bit(BTREE_NODE_ ## flag, &b->flags); }

enum btree_flags {
	BTREE_NODE_io_error,
	BTREE_NODE_dirty,
	BTREE_NODE_write_idx,
	BTREE_NODE_journal_flush,
};

BTREE_FLAG(io_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);
BTREE_FLAG(journal_flush);

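/*
 * Added note (not in the original header): each BTREE_FLAG() invocation above
 * expands to a pair of accessors, e.g. BTREE_FLAG(dirty) generates
 * btree_node_dirty() and set_btree_node_dirty(), which operate on b->flags.
 * The helpers below use the BTREE_NODE_write_idx bit to pick which of the two
 * btree_write slots (see the "BTREE IO" comment) is currently being set up
 * and which one belongs to the previous/in-flight write.
 */
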
static inline struct btree_write *btree_current_write(struct btree *b)
{
	return b->writes + btree_node_write_idx(b);
}

static inline struct btree_write *btree_prev_write(struct btree *b)
{
	return b->writes + (btree_node_write_idx(b) ^ 1);
}

static inline struct bset *btree_bset_first(struct btree *b)
{
	return b->keys.set->data;
}

static inline struct bset *btree_bset_last(struct btree *b)
{
	return bset_tree_last(&b->keys)->data;
}

static inline unsigned int bset_block_offset(struct btree *b, struct bset *i)
{
	return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
}

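/*
 * Added note (not in the original header): sb.bucket_size is in sectors, so
 * the expression below arms the GC trigger at roughly 1/16 of the cache's
 * capacity - once that many sectors have been written, sectors_to_gc goes
 * negative and the gc thread is allowed to run.
 */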
static inline void set_gc_sectors(struct cache_set *c)
{
	atomic_set(&c->sectors_to_gc, c->cache->sb.bucket_size * c->nbuckets / 16);
}

void bkey_put(struct cache_set *c, struct bkey *k);

/* Looping macros */

#define for_each_cached_btree(b, c, iter)				\
	for (iter = 0;							\
	     iter < ARRAY_SIZE((c)->bucket_hash);			\
	     iter++)							\
		hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)

/* Recursing down the btree */

struct btree_op {
	/* for waiting on btree reserve in btree_split() */
	wait_queue_entry_t		wait;

	/* Btree level at which we start taking write locks */
	short			lock;

	unsigned int		insert_collision:1;
};

struct btree_check_state;
struct btree_check_info {
	struct btree_check_state	*state;
	struct task_struct		*thread;
	int				result;
};

#define BCH_BTR_CHKTHREAD_MAX	12
struct btree_check_state {
	struct cache_set		*c;
	int				total_threads;
	int				key_idx;
	spinlock_t			idx_lock;
	atomic_t			started;
	atomic_t			enough;
	wait_queue_head_t		wait;
	struct btree_check_info		infos[BCH_BTR_CHKTHREAD_MAX];
};

static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
{
	memset(op, 0, sizeof(struct btree_op));
	init_wait(&op->wait);
	op->lock = write_lock_level;
}

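/*
 * Illustrative sketch (not part of the original header; the helper name is
 * hypothetical): a caller that only needs to modify a leaf node would set up
 * its btree_op like this - lock = 0 means write locks are taken at level <= 0,
 * i.e. only on leaf nodes, per the "LOCKING" comment above.
 */
static inline void example_init_leaf_op(struct btree_op *op)
{
	/* write locks only on leaf nodes (level 0) */
	bch_btree_op_init(op, 0);
}
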
static inline void rw_lock(bool w, struct btree *b, int level)
{
	w ? down_write_nested(&b->lock, level + 1)
	  : down_read_nested(&b->lock, level + 1);
	if (w)
		b->seq++;
}

static inline void rw_unlock(bool w, struct btree *b)
{
	if (w)
		b->seq++;
	(w ? up_write : up_read)(&b->lock);
}

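/*
 * Illustrative sketch (not part of the original header; the helper is
 * hypothetical): the sequence-number check described at the top of this file.
 * b->seq is bumped on every write lock and unlock, so after dropping a read
 * lock and re-acquiring a write lock, seeing exactly our own increment means
 * no other writer slipped in between.
 */
static inline bool example_upgrade_to_write(struct btree *b, int level)
{
	unsigned long seq = b->seq;

	rw_unlock(false, b);
	rw_lock(true, b, level);

	/* only our own write lock bumped the count: safe to proceed */
	return b->seq == seq + 1;
}
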
void bch_btree_node_read_done(struct btree *b);
void __bch_btree_node_write(struct btree *b, struct closure *parent);
void bch_btree_node_write(struct btree *b, struct closure *parent);

void bch_btree_set_root(struct btree *b);
struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
				     int level, bool wait,
				     struct btree *parent);
struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
				 struct bkey *k, int level, bool write,
				 struct btree *parent);

int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
			       struct bkey *check_key);
int bch_btree_insert(struct cache_set *c, struct keylist *keys,
		     atomic_t *journal_ref, struct bkey *replace_key);

int bch_gc_thread_start(struct cache_set *c);
void bch_initial_gc_finish(struct cache_set *c);
void bch_moving_gc(struct cache_set *c);
int bch_btree_check(struct cache_set *c);
void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k);

static inline void wake_up_gc(struct cache_set *c)
{
	wake_up(&c->gc_wait);
}

static inline void force_wake_up_gc(struct cache_set *c)
{
	/*
	 * The garbage collection thread only works when sectors_to_gc < 0;
	 * calling wake_up_gc() won't start the gc thread if sectors_to_gc is
	 * not a negative value.
	 * Therefore sectors_to_gc is set to -1 here, before waking up the
	 * gc thread by calling wake_up_gc(). Then gc_should_run() will give
	 * the gc thread a chance to run. "Give a chance" means that before
	 * gc_should_run() is reached, there is still a possibility that
	 * c->sectors_to_gc gets set to some other positive value, so this
	 * routine doesn't guarantee that the gc thread will be woken up
	 * to run.
	 */
	atomic_set(&c->sectors_to_gc, -1);
	wake_up_gc(c);
}

/*
 * These macros are for recursing down the btree - they handle the details of
 * locking and looking up nodes in the cache for you. They're best treated as
 * mere syntax when reading code that uses them.
 *
 * op->lock determines whether we take a read or a write lock at a given depth.
 * If you've got a read lock and find that you need a write lock (i.e. you're
 * going to have to split), set op->lock and return -EINTR; bcache_btree_root()
 * will call you again and you'll have the correct lock.
 */

/**
 * bcache_btree - recurse down the btree on a specified key
 * @fn:		function to call, which will be passed the child node
 * @key:	key to recurse on
 * @b:		parent btree node
 * @op:		pointer to struct btree_op
 */
#define bcache_btree(fn, key, b, op, ...)				\
({									\
	int _r, l = (b)->level - 1;					\
	bool _w = l <= (op)->lock;					\
	struct btree *_child = bch_btree_node_get((b)->c, op, key, l,	\
						  _w, b);		\
	if (!IS_ERR(_child)) {						\
		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
		rw_unlock(_w, _child);					\
	} else								\
		_r = PTR_ERR(_child);					\
	_r;								\
})

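/*
 * Illustrative note (an assumption about typical usage, not taken verbatim
 * from btree.c): a recursion step over keys might invoke the macro as
 *
 *	ret = bcache_btree(map_keys_recurse, k, b, op, from, fn, flags);
 *
 * which looks up the child node that k points to, calls
 * bch_btree_map_keys_recurse() on it with the lock appropriate to op->lock
 * held, and drops that lock when the call returns.
 */
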
/**
 * bcache_btree_root - call a function on the root of the btree
 * @fn:		function to call, which will be passed the root node
 * @c:		cache set
 * @op:		pointer to struct btree_op
 */
#define bcache_btree_root(fn, c, op, ...)				\
({									\
	int _r = -EINTR;						\
	do {								\
		struct btree *_b = (c)->root;				\
		bool _w = insert_lock(op, _b);				\
		rw_lock(_w, _b, _b->level);				\
		if (_b == (c)->root &&					\
		    _w == insert_lock(op, _b)) {			\
			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
		}							\
		rw_unlock(_w, _b);					\
		bch_cannibalize_unlock(c);                              \
		if (_r == -EINTR)                                       \
			schedule();                                     \
	} while (_r == -EINTR);                                         \
									\
	finish_wait(&(c)->btree_cache_wait, &(op)->wait);               \
	_r;                                                             \
})

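/*
 * Illustrative note (an assumption about typical usage, not taken verbatim
 * from btree.c): a traversal is normally kicked off from the root, e.g.
 *
 *	ret = bcache_btree_root(map_keys_recurse, c, op, from, fn, flags);
 *
 * The macro re-checks that the locked node is still c->root, retries while
 * the callback returns -EINTR (which is how the "restart from the root with
 * new locks" protocol described above is implemented), and returns the final
 * result.
 */
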
#define MAP_DONE	0
#define MAP_CONTINUE	1

#define MAP_ALL_NODES	0
#define MAP_LEAF_NODES	1

#define MAP_END_KEY	1

typedef int (btree_map_nodes_fn)(struct btree_op *b_op, struct btree *b);
int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
			  struct bkey *from, btree_map_nodes_fn *fn, int flags);

static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
				      struct bkey *from, btree_map_nodes_fn *fn)
{
	return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES);
}

static inline int bch_btree_map_leaf_nodes(struct btree_op *op,
					   struct cache_set *c,
					   struct bkey *from,
					   btree_map_nodes_fn *fn)
{
	return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES);
}

typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b,
				struct bkey *k);
int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
		       struct bkey *from, btree_map_keys_fn *fn, int flags);
int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
			       struct bkey *from, btree_map_keys_fn *fn,
			       int flags);

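/*
 * Minimal sketch of a btree_map_keys_fn callback (hypothetical, added for
 * illustration): bch_btree_map_keys() calls it once per key, and the return
 * value steers the walk - MAP_CONTINUE keeps going, MAP_DONE stops it.
 */
static inline int example_visit_key(struct btree_op *op, struct btree *b,
				    struct bkey *k)
{
	/* inspect k here; returning MAP_DONE would end the traversal early */
	return MAP_CONTINUE;
}
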
typedef bool (keybuf_pred_fn)(struct keybuf *buf, struct bkey *k);

void bch_keybuf_init(struct keybuf *buf);
void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
		       struct bkey *end, keybuf_pred_fn *pred);
bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
				  struct bkey *end);
void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w);
struct keybuf_key *bch_keybuf_next(struct keybuf *buf);
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
					  struct keybuf *buf,
					  struct bkey *end,
					  keybuf_pred_fn *pred);
void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats);
#endif