cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

btree.c (64487B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
      4 *
      5 * Uses a block device as cache for other block devices; optimized for SSDs.
      6 * All allocation is done in buckets, which should match the erase block size
      7 * of the device.
      8 *
      9 * Buckets containing cached data are kept on a heap sorted by priority;
     10 * bucket priority is increased on cache hit, and periodically all the buckets
     11 * on the heap have their priority scaled down. This currently is just used as
     12 * an LRU but in the future should allow for more intelligent heuristics.
     13 *
     14 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
     15 * counter. Garbage collection is used to remove stale pointers.
     16 *
     17 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
     18 * as keys are inserted we only sort the pages that have not yet been written.
     19 * When garbage collection is run, we resort the entire node.
     20 *
     21 * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst.
     22 */
     23
     24#include "bcache.h"
     25#include "btree.h"
     26#include "debug.h"
     27#include "extents.h"
     28
     29#include <linux/slab.h>
     30#include <linux/bitops.h>
     31#include <linux/hash.h>
     32#include <linux/kthread.h>
     33#include <linux/prefetch.h>
     34#include <linux/random.h>
     35#include <linux/rcupdate.h>
     36#include <linux/sched/clock.h>
     37#include <linux/rculist.h>
     38#include <linux/delay.h>
     39#include <trace/events/bcache.h>
     40
     41/*
     42 * Todo:
     43 * register_bcache: Return errors out to userspace correctly
     44 *
     45 * Writeback: don't undirty key until after a cache flush
     46 *
     47 * Create an iterator for key pointers
     48 *
     49 * On btree write error, mark bucket such that it won't be freed from the cache
     50 *
     51 * Journalling:
     52 *   Check for bad keys in replay
     53 *   Propagate barriers
     54 *   Refcount journal entries in journal_replay
     55 *
     56 * Garbage collection:
     57 *   Finish incremental gc
     58 *   Gc should free old UUIDs, data for invalid UUIDs
     59 *
     60 * Provide a way to list backing device UUIDs we have data cached for, and
     61 * probably how long it's been since we've seen them, and a way to invalidate
     62 * dirty data for devices that will never be attached again
     63 *
     64 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
     65 * that based on that and how much dirty data we have we can keep writeback
     66 * from being starved
     67 *
     68 * Add a tracepoint or somesuch to watch for writeback starvation
     69 *
     70 * When btree depth > 1 and splitting an interior node, we have to make sure
     71 * alloc_bucket() cannot fail. This should be true but is not completely
     72 * obvious.
     73 *
     74 * Plugging?
     75 *
     76 * If data write is less than hard sector size of ssd, round up offset in open
     77 * bucket to the next whole sector
     78 *
     79 * Superblock needs to be fleshed out for multiple cache devices
     80 *
     81 * Add a sysfs tunable for the number of writeback IOs in flight
     82 *
     83 * Add a sysfs tunable for the number of open data buckets
     84 *
     85 * IO tracking: Can we track when one process is doing io on behalf of another?
     86 * IO tracking: Don't use just an average, weigh more recent stuff higher
     87 *
     88 * Test module load/unload
     89 */
     90
     91#define MAX_NEED_GC		64
     92#define MAX_SAVE_PRIO		72
     93#define MAX_GC_TIMES		100
     94#define MIN_GC_NODES		100
     95#define GC_SLEEP_MS		100
     96
     97#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))
     98
     99#define PTR_HASH(c, k)							\
    100	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
    101
    102static struct workqueue_struct *btree_io_wq;
    103
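        /*
         * s->lock is the highest btree level at which an insertion needs a
         * write lock; insert_lock() is true when node b must be taken with
         * a write lock rather than a read lock.
         */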
    104#define insert_lock(s, b)	((b)->level <= (s)->lock)
    105
    106
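        /*
         * Address of the first bset in the node's buffer that has not yet
         * been written to disk: b->written counts the blocks of this node
         * already on disk.
         */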
    107static inline struct bset *write_block(struct btree *b)
    108{
    109	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c->cache);
    110}
    111
    112static void bch_btree_init_next(struct btree *b)
    113{
    114	/* If not a leaf node, always sort */
    115	if (b->level && b->keys.nsets)
    116		bch_btree_sort(&b->keys, &b->c->sort);
    117	else
    118		bch_btree_sort_lazy(&b->keys, &b->c->sort);
    119
    120	if (b->written < btree_blocks(b))
    121		bch_bset_init_next(&b->keys, write_block(b),
    122				   bset_magic(&b->c->cache->sb));
    123
    124}
    125
    126/* Btree key manipulation */
    127
    128void bkey_put(struct cache_set *c, struct bkey *k)
    129{
    130	unsigned int i;
    131
    132	for (i = 0; i < KEY_PTRS(k); i++)
    133		if (ptr_available(c, k, i))
    134			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
    135}
    136
    137/* Btree IO */
    138
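        /*
         * 64-bit checksum of a bset: seeded with the node's first pointer,
         * computed over everything after the csum field itself (the first
         * 8 bytes of struct bset), and inverted at the end.
         */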
    139static uint64_t btree_csum_set(struct btree *b, struct bset *i)
    140{
    141	uint64_t crc = b->key.ptr[0];
    142	void *data = (void *) i + 8, *end = bset_bkey_last(i);
    143
    144	crc = crc64_be(crc, data, end - data);
    145	return crc ^ 0xffffffffffffffffULL;
    146}
    147
    148void bch_btree_node_read_done(struct btree *b)
    149{
    150	const char *err = "bad btree header";
    151	struct bset *i = btree_bset_first(b);
    152	struct btree_iter *iter;
    153
    154	/*
    155	 * c->fill_iter can allocate an iterator with more memory space
    156	 * than static MAX_BSETS.
     157	 * See the comment around cache_set->fill_iter.
    158	 */
    159	iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
    160	iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
    161	iter->used = 0;
    162
    163#ifdef CONFIG_BCACHE_DEBUG
    164	iter->b = &b->keys;
    165#endif
    166
    167	if (!i->seq)
    168		goto err;
    169
    170	for (;
    171	     b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
    172	     i = write_block(b)) {
    173		err = "unsupported bset version";
    174		if (i->version > BCACHE_BSET_VERSION)
    175			goto err;
    176
    177		err = "bad btree header";
    178		if (b->written + set_blocks(i, block_bytes(b->c->cache)) >
    179		    btree_blocks(b))
    180			goto err;
    181
    182		err = "bad magic";
    183		if (i->magic != bset_magic(&b->c->cache->sb))
    184			goto err;
    185
    186		err = "bad checksum";
    187		switch (i->version) {
    188		case 0:
    189			if (i->csum != csum_set(i))
    190				goto err;
    191			break;
    192		case BCACHE_BSET_VERSION:
    193			if (i->csum != btree_csum_set(b, i))
    194				goto err;
    195			break;
    196		}
    197
    198		err = "empty set";
    199		if (i != b->keys.set[0].data && !i->keys)
    200			goto err;
    201
    202		bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
    203
    204		b->written += set_blocks(i, block_bytes(b->c->cache));
    205	}
    206
    207	err = "corrupted btree";
    208	for (i = write_block(b);
    209	     bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
    210	     i = ((void *) i) + block_bytes(b->c->cache))
    211		if (i->seq == b->keys.set[0].data->seq)
    212			goto err;
    213
    214	bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
    215
    216	i = b->keys.set[0].data;
    217	err = "short btree key";
    218	if (b->keys.set[0].size &&
    219	    bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
    220		goto err;
    221
    222	if (b->written < btree_blocks(b))
    223		bch_bset_init_next(&b->keys, write_block(b),
    224				   bset_magic(&b->c->cache->sb));
    225out:
    226	mempool_free(iter, &b->c->fill_iter);
    227	return;
    228err:
    229	set_btree_node_io_error(b);
    230	bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
    231			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
    232			    bset_block_offset(b, i), i->keys);
    233	goto out;
    234}
    235
    236static void btree_node_read_endio(struct bio *bio)
    237{
    238	struct closure *cl = bio->bi_private;
    239
    240	closure_put(cl);
    241}
    242
    243static void bch_btree_node_read(struct btree *b)
    244{
    245	uint64_t start_time = local_clock();
    246	struct closure cl;
    247	struct bio *bio;
    248
    249	trace_bcache_btree_read(b);
    250
    251	closure_init_stack(&cl);
    252
    253	bio = bch_bbio_alloc(b->c);
    254	bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
    255	bio->bi_end_io	= btree_node_read_endio;
    256	bio->bi_private	= &cl;
    257	bio->bi_opf = REQ_OP_READ | REQ_META;
    258
    259	bch_bio_map(bio, b->keys.set[0].data);
    260
    261	bch_submit_bbio(bio, b->c, &b->key, 0);
    262	closure_sync(&cl);
    263
    264	if (bio->bi_status)
    265		set_btree_node_io_error(b);
    266
    267	bch_bbio_free(bio, b->c);
    268
    269	if (btree_node_io_error(b))
    270		goto err;
    271
    272	bch_btree_node_read_done(b);
    273	bch_time_stats_update(&b->c->btree_read_time, start_time);
    274
    275	return;
    276err:
    277	bch_cache_set_error(b->c, "io error reading bucket %zu",
    278			    PTR_BUCKET_NR(b->c, &b->key, 0));
    279}
    280
    281static void btree_complete_write(struct btree *b, struct btree_write *w)
    282{
    283	if (w->prio_blocked &&
    284	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
    285		wake_up_allocators(b->c);
    286
    287	if (w->journal) {
    288		atomic_dec_bug(w->journal);
    289		__closure_wake_up(&b->c->journal.wait);
    290	}
    291
    292	w->prio_blocked	= 0;
    293	w->journal	= NULL;
    294}
    295
    296static void btree_node_write_unlock(struct closure *cl)
    297{
    298	struct btree *b = container_of(cl, struct btree, io);
    299
    300	up(&b->io_mutex);
    301}
    302
    303static void __btree_node_write_done(struct closure *cl)
    304{
    305	struct btree *b = container_of(cl, struct btree, io);
    306	struct btree_write *w = btree_prev_write(b);
    307
    308	bch_bbio_free(b->bio, b->c);
    309	b->bio = NULL;
    310	btree_complete_write(b, w);
    311
    312	if (btree_node_dirty(b))
    313		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
    314
    315	closure_return_with_destructor(cl, btree_node_write_unlock);
    316}
    317
    318static void btree_node_write_done(struct closure *cl)
    319{
    320	struct btree *b = container_of(cl, struct btree, io);
    321
    322	bio_free_pages(b->bio);
    323	__btree_node_write_done(cl);
    324}
    325
    326static void btree_node_write_endio(struct bio *bio)
    327{
    328	struct closure *cl = bio->bi_private;
    329	struct btree *b = container_of(cl, struct btree, io);
    330
    331	if (bio->bi_status)
    332		set_btree_node_io_error(b);
    333
    334	bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree");
    335	closure_put(cl);
    336}
    337
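        /*
         * Write out the last bset of the node. If bounce pages can be
         * allocated, the bset is copied into them and the write completes
         * asynchronously; otherwise the bio maps the bset's memory directly
         * and we wait for the write to finish before the buffer is reused.
         */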
    338static void do_btree_node_write(struct btree *b)
    339{
    340	struct closure *cl = &b->io;
    341	struct bset *i = btree_bset_last(b);
    342	BKEY_PADDED(key) k;
    343
    344	i->version	= BCACHE_BSET_VERSION;
    345	i->csum		= btree_csum_set(b, i);
    346
    347	BUG_ON(b->bio);
    348	b->bio = bch_bbio_alloc(b->c);
    349
    350	b->bio->bi_end_io	= btree_node_write_endio;
    351	b->bio->bi_private	= cl;
    352	b->bio->bi_iter.bi_size	= roundup(set_bytes(i), block_bytes(b->c->cache));
    353	b->bio->bi_opf		= REQ_OP_WRITE | REQ_META | REQ_FUA;
    354	bch_bio_map(b->bio, i);
    355
    356	/*
    357	 * If we're appending to a leaf node, we don't technically need FUA -
    358	 * this write just needs to be persisted before the next journal write,
    359	 * which will be marked FLUSH|FUA.
    360	 *
    361	 * Similarly if we're writing a new btree root - the pointer is going to
    362	 * be in the next journal entry.
    363	 *
    364	 * But if we're writing a new btree node (that isn't a root) or
    365	 * appending to a non leaf btree node, we need either FUA or a flush
    366	 * when we write the parent with the new pointer. FUA is cheaper than a
    367	 * flush, and writes appending to leaf nodes aren't blocking anything so
    368	 * just make all btree node writes FUA to keep things sane.
    369	 */
    370
    371	bkey_copy(&k.key, &b->key);
    372	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
    373		       bset_sector_offset(&b->keys, i));
    374
    375	if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
    376		struct bio_vec *bv;
    377		void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
    378		struct bvec_iter_all iter_all;
    379
    380		bio_for_each_segment_all(bv, b->bio, iter_all) {
    381			memcpy(page_address(bv->bv_page), addr, PAGE_SIZE);
    382			addr += PAGE_SIZE;
    383		}
    384
    385		bch_submit_bbio(b->bio, b->c, &k.key, 0);
    386
    387		continue_at(cl, btree_node_write_done, NULL);
    388	} else {
    389		/*
    390		 * No problem for multipage bvec since the bio is
    391		 * just allocated
    392		 */
    393		b->bio->bi_vcnt = 0;
    394		bch_bio_map(b->bio, i);
    395
    396		bch_submit_bbio(b->bio, b->c, &k.key, 0);
    397
    398		closure_sync(cl);
    399		continue_at_nobarrier(cl, __btree_node_write_done, NULL);
    400	}
    401}
    402
    403void __bch_btree_node_write(struct btree *b, struct closure *parent)
    404{
    405	struct bset *i = btree_bset_last(b);
    406
    407	lockdep_assert_held(&b->write_lock);
    408
    409	trace_bcache_btree_write(b);
    410
    411	BUG_ON(current->bio_list);
    412	BUG_ON(b->written >= btree_blocks(b));
    413	BUG_ON(b->written && !i->keys);
    414	BUG_ON(btree_bset_first(b)->seq != i->seq);
    415	bch_check_keys(&b->keys, "writing");
    416
    417	cancel_delayed_work(&b->work);
    418
    419	/* If caller isn't waiting for write, parent refcount is cache set */
    420	down(&b->io_mutex);
    421	closure_init(&b->io, parent ?: &b->c->cl);
    422
    423	clear_bit(BTREE_NODE_dirty,	 &b->flags);
    424	change_bit(BTREE_NODE_write_idx, &b->flags);
    425
    426	do_btree_node_write(b);
    427
    428	atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size,
    429			&b->c->cache->btree_sectors_written);
    430
    431	b->written += set_blocks(i, block_bytes(b->c->cache));
    432}
    433
    434void bch_btree_node_write(struct btree *b, struct closure *parent)
    435{
    436	unsigned int nsets = b->keys.nsets;
    437
    438	lockdep_assert_held(&b->lock);
    439
    440	__bch_btree_node_write(b, parent);
    441
    442	/*
    443	 * do verify if there was more than one set initially (i.e. we did a
    444	 * sort) and we sorted down to a single set:
    445	 */
    446	if (nsets && !b->keys.nsets)
    447		bch_btree_verify(b);
    448
    449	bch_btree_init_next(b);
    450}
    451
    452static void bch_btree_node_write_sync(struct btree *b)
    453{
    454	struct closure cl;
    455
    456	closure_init_stack(&cl);
    457
    458	mutex_lock(&b->write_lock);
    459	bch_btree_node_write(b, &cl);
    460	mutex_unlock(&b->write_lock);
    461
    462	closure_sync(&cl);
    463}
    464
    465static void btree_node_write_work(struct work_struct *w)
    466{
    467	struct btree *b = container_of(to_delayed_work(w), struct btree, work);
    468
    469	mutex_lock(&b->write_lock);
    470	if (btree_node_dirty(b))
    471		__bch_btree_node_write(b, NULL);
    472	mutex_unlock(&b->write_lock);
    473}
    474
    475static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
    476{
    477	struct bset *i = btree_bset_last(b);
    478	struct btree_write *w = btree_current_write(b);
    479
    480	lockdep_assert_held(&b->write_lock);
    481
    482	BUG_ON(!b->written);
    483	BUG_ON(!i->keys);
    484
    485	if (!btree_node_dirty(b))
    486		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
    487
    488	set_btree_node_dirty(b);
    489
    490	/*
    491	 * w->journal is always the oldest journal pin of all bkeys
    492	 * in the leaf node, to make sure the oldest jset seq won't
    493	 * be increased before this btree node is flushed.
    494	 */
    495	if (journal_ref) {
    496		if (w->journal &&
    497		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
    498			atomic_dec_bug(w->journal);
    499			w->journal = NULL;
    500		}
    501
    502		if (!w->journal) {
    503			w->journal = journal_ref;
    504			atomic_inc(w->journal);
    505		}
    506	}
    507
    508	/* Force write if set is too big */
    509	if (set_bytes(i) > PAGE_SIZE - 48 &&
    510	    !current->bio_list)
    511		bch_btree_node_write(b, NULL);
    512}
    513
    514/*
    515 * Btree in memory cache - allocation/freeing
    516 * mca -> memory cache
    517 */
    518
    519#define mca_reserve(c)	(((!IS_ERR_OR_NULL(c->root) && c->root->level) \
    520			  ? c->root->level : 1) * 8 + 16)
    521#define mca_can_free(c)						\
    522	max_t(int, 0, c->btree_cache_used - mca_reserve(c))
    523
    524static void mca_data_free(struct btree *b)
    525{
    526	BUG_ON(b->io_mutex.count != 1);
    527
    528	bch_btree_keys_free(&b->keys);
    529
    530	b->c->btree_cache_used--;
    531	list_move(&b->list, &b->c->btree_cache_freed);
    532}
    533
    534static void mca_bucket_free(struct btree *b)
    535{
    536	BUG_ON(btree_node_dirty(b));
    537
    538	b->key.ptr[0] = 0;
    539	hlist_del_init_rcu(&b->hash);
    540	list_move(&b->list, &b->c->btree_cache_freeable);
    541}
    542
    543static unsigned int btree_order(struct bkey *k)
    544{
    545	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
    546}
    547
    548static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
    549{
    550	if (!bch_btree_keys_alloc(&b->keys,
    551				  max_t(unsigned int,
    552					ilog2(b->c->btree_pages),
    553					btree_order(k)),
    554				  gfp)) {
    555		b->c->btree_cache_used++;
    556		list_move(&b->list, &b->c->btree_cache);
    557	} else {
    558		list_move(&b->list, &b->c->btree_cache_freed);
    559	}
    560}
    561
    562static struct btree *mca_bucket_alloc(struct cache_set *c,
    563				      struct bkey *k, gfp_t gfp)
    564{
    565	/*
    566	 * kzalloc() is necessary here for initialization,
    567	 * see code comments in bch_btree_keys_init().
    568	 */
    569	struct btree *b = kzalloc(sizeof(struct btree), gfp);
    570
    571	if (!b)
    572		return NULL;
    573
    574	init_rwsem(&b->lock);
    575	lockdep_set_novalidate_class(&b->lock);
    576	mutex_init(&b->write_lock);
    577	lockdep_set_novalidate_class(&b->write_lock);
    578	INIT_LIST_HEAD(&b->list);
    579	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
    580	b->c = c;
    581	sema_init(&b->io_mutex, 1);
    582
    583	mca_data_alloc(b, k, gfp);
    584	return b;
    585}
    586
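        /*
         * Try to reclaim a cached btree node: returns 0 with the node
         * write-locked if it can be reused, -ENOMEM otherwise. With flush
         * set, dirty nodes are written out first; without it, dirty or
         * in-flight nodes are skipped.
         */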
    587static int mca_reap(struct btree *b, unsigned int min_order, bool flush)
    588{
    589	struct closure cl;
    590
    591	closure_init_stack(&cl);
    592	lockdep_assert_held(&b->c->bucket_lock);
    593
    594	if (!down_write_trylock(&b->lock))
    595		return -ENOMEM;
    596
    597	BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
    598
    599	if (b->keys.page_order < min_order)
    600		goto out_unlock;
    601
    602	if (!flush) {
    603		if (btree_node_dirty(b))
    604			goto out_unlock;
    605
    606		if (down_trylock(&b->io_mutex))
    607			goto out_unlock;
    608		up(&b->io_mutex);
    609	}
    610
    611retry:
    612	/*
    613	 * BTREE_NODE_dirty might be cleared in btree_flush_btree() by
    614	 * __bch_btree_node_write(). To avoid an extra flush, acquire
    615	 * b->write_lock before checking BTREE_NODE_dirty bit.
    616	 */
    617	mutex_lock(&b->write_lock);
    618	/*
    619	 * If this btree node is selected in btree_flush_write() by journal
    620	 * code, delay and retry until the node is flushed by journal code
    621	 * and BTREE_NODE_journal_flush bit cleared by btree_flush_write().
    622	 */
    623	if (btree_node_journal_flush(b)) {
    624		pr_debug("bnode %p is flushing by journal, retry\n", b);
    625		mutex_unlock(&b->write_lock);
    626		udelay(1);
    627		goto retry;
    628	}
    629
    630	if (btree_node_dirty(b))
    631		__bch_btree_node_write(b, &cl);
    632	mutex_unlock(&b->write_lock);
    633
    634	closure_sync(&cl);
    635
    636	/* wait for any in flight btree write */
    637	down(&b->io_mutex);
    638	up(&b->io_mutex);
    639
    640	return 0;
    641out_unlock:
    642	rw_unlock(true, b);
    643	return -ENOMEM;
    644}
    645
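        /*
         * Shrinker callback: free clean nodes from the freeable list first,
         * then from the cache proper, but never below the mca_reserve()
         * floor, so btree node allocation can always make progress.
         */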
    646static unsigned long bch_mca_scan(struct shrinker *shrink,
    647				  struct shrink_control *sc)
    648{
    649	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
    650	struct btree *b, *t;
    651	unsigned long i, nr = sc->nr_to_scan;
    652	unsigned long freed = 0;
    653	unsigned int btree_cache_used;
    654
    655	if (c->shrinker_disabled)
    656		return SHRINK_STOP;
    657
    658	if (c->btree_cache_alloc_lock)
    659		return SHRINK_STOP;
    660
    661	/* Return -1 if we can't do anything right now */
    662	if (sc->gfp_mask & __GFP_IO)
    663		mutex_lock(&c->bucket_lock);
    664	else if (!mutex_trylock(&c->bucket_lock))
    665		return -1;
    666
    667	/*
    668	 * It's _really_ critical that we don't free too many btree nodes - we
    669	 * have to always leave ourselves a reserve. The reserve is how we
    670	 * guarantee that allocating memory for a new btree node can always
    671	 * succeed, so that inserting keys into the btree can always succeed and
    672	 * IO can always make forward progress:
    673	 */
    674	nr /= c->btree_pages;
    675	if (nr == 0)
    676		nr = 1;
    677	nr = min_t(unsigned long, nr, mca_can_free(c));
    678
    679	i = 0;
    680	btree_cache_used = c->btree_cache_used;
    681	list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) {
    682		if (nr <= 0)
    683			goto out;
    684
    685		if (!mca_reap(b, 0, false)) {
    686			mca_data_free(b);
    687			rw_unlock(true, b);
    688			freed++;
    689		}
    690		nr--;
    691		i++;
    692	}
    693
    694	list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
    695		if (nr <= 0 || i >= btree_cache_used)
    696			goto out;
    697
    698		if (!mca_reap(b, 0, false)) {
    699			mca_bucket_free(b);
    700			mca_data_free(b);
    701			rw_unlock(true, b);
    702			freed++;
    703		}
    704
    705		nr--;
    706		i++;
    707	}
    708out:
    709	mutex_unlock(&c->bucket_lock);
    710	return freed * c->btree_pages;
    711}
    712
    713static unsigned long bch_mca_count(struct shrinker *shrink,
    714				   struct shrink_control *sc)
    715{
    716	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
    717
    718	if (c->shrinker_disabled)
    719		return 0;
    720
    721	if (c->btree_cache_alloc_lock)
    722		return 0;
    723
    724	return mca_can_free(c) * c->btree_pages;
    725}
    726
    727void bch_btree_cache_free(struct cache_set *c)
    728{
    729	struct btree *b;
    730	struct closure cl;
    731
    732	closure_init_stack(&cl);
    733
    734	if (c->shrink.list.next)
    735		unregister_shrinker(&c->shrink);
    736
    737	mutex_lock(&c->bucket_lock);
    738
    739#ifdef CONFIG_BCACHE_DEBUG
    740	if (c->verify_data)
    741		list_move(&c->verify_data->list, &c->btree_cache);
    742
    743	free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb)));
    744#endif
    745
    746	list_splice(&c->btree_cache_freeable,
    747		    &c->btree_cache);
    748
    749	while (!list_empty(&c->btree_cache)) {
    750		b = list_first_entry(&c->btree_cache, struct btree, list);
    751
    752		/*
    753		 * This function is called by cache_set_free(), no I/O
    754		 * request on cache now, it is unnecessary to acquire
    755		 * b->write_lock before clearing BTREE_NODE_dirty anymore.
    756		 */
    757		if (btree_node_dirty(b)) {
    758			btree_complete_write(b, btree_current_write(b));
    759			clear_bit(BTREE_NODE_dirty, &b->flags);
    760		}
    761		mca_data_free(b);
    762	}
    763
    764	while (!list_empty(&c->btree_cache_freed)) {
    765		b = list_first_entry(&c->btree_cache_freed,
    766				     struct btree, list);
    767		list_del(&b->list);
    768		cancel_delayed_work_sync(&b->work);
    769		kfree(b);
    770	}
    771
    772	mutex_unlock(&c->bucket_lock);
    773}
    774
    775int bch_btree_cache_alloc(struct cache_set *c)
    776{
    777	unsigned int i;
    778
    779	for (i = 0; i < mca_reserve(c); i++)
    780		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
    781			return -ENOMEM;
    782
    783	list_splice_init(&c->btree_cache,
    784			 &c->btree_cache_freeable);
    785
    786#ifdef CONFIG_BCACHE_DEBUG
    787	mutex_init(&c->verify_lock);
    788
    789	c->verify_ondisk = (void *)
    790		__get_free_pages(GFP_KERNEL|__GFP_COMP,
    791				 ilog2(meta_bucket_pages(&c->cache->sb)));
    792	if (!c->verify_ondisk) {
    793		/*
     794		 * Don't worry about the mca_reserve() buckets
    795		 * allocated in previous for-loop, they will be
    796		 * handled properly in bch_cache_set_unregister().
    797		 */
    798		return -ENOMEM;
    799	}
    800
    801	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
    802
    803	if (c->verify_data &&
    804	    c->verify_data->keys.set->data)
    805		list_del_init(&c->verify_data->list);
    806	else
    807		c->verify_data = NULL;
    808#endif
    809
    810	c->shrink.count_objects = bch_mca_count;
    811	c->shrink.scan_objects = bch_mca_scan;
    812	c->shrink.seeks = 4;
    813	c->shrink.batch = c->btree_pages * 2;
    814
    815	if (register_shrinker(&c->shrink))
    816		pr_warn("bcache: %s: could not register shrinker\n",
    817				__func__);
    818
    819	return 0;
    820}
    821
    822/* Btree in memory cache - hash table */
    823
    824static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
    825{
    826	return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
    827}
    828
    829static struct btree *mca_find(struct cache_set *c, struct bkey *k)
    830{
    831	struct btree *b;
    832
    833	rcu_read_lock();
    834	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
    835		if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
    836			goto out;
    837	b = NULL;
    838out:
    839	rcu_read_unlock();
    840	return b;
    841}
    842
    843static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
    844{
    845	spin_lock(&c->btree_cannibalize_lock);
    846	if (likely(c->btree_cache_alloc_lock == NULL)) {
    847		c->btree_cache_alloc_lock = current;
    848	} else if (c->btree_cache_alloc_lock != current) {
    849		if (op)
    850			prepare_to_wait(&c->btree_cache_wait, &op->wait,
    851					TASK_UNINTERRUPTIBLE);
    852		spin_unlock(&c->btree_cannibalize_lock);
    853		return -EINTR;
    854	}
    855	spin_unlock(&c->btree_cannibalize_lock);
    856
    857	return 0;
    858}
    859
    860static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
    861				     struct bkey *k)
    862{
    863	struct btree *b;
    864
    865	trace_bcache_btree_cache_cannibalize(c);
    866
    867	if (mca_cannibalize_lock(c, op))
    868		return ERR_PTR(-EINTR);
    869
    870	list_for_each_entry_reverse(b, &c->btree_cache, list)
    871		if (!mca_reap(b, btree_order(k), false))
    872			return b;
    873
    874	list_for_each_entry_reverse(b, &c->btree_cache, list)
    875		if (!mca_reap(b, btree_order(k), true))
    876			return b;
    877
    878	WARN(1, "btree cache cannibalize failed\n");
    879	return ERR_PTR(-ENOMEM);
    880}
    881
    882/*
    883 * We can only have one thread cannibalizing other cached btree nodes at a time,
    884 * or we'll deadlock. We use an open coded mutex to ensure that, which a
    885 * cannibalize_bucket() will take. This means every time we unlock the root of
    886 * the btree, we need to release this lock if we have it held.
    887 */
    888static void bch_cannibalize_unlock(struct cache_set *c)
    889{
    890	spin_lock(&c->btree_cannibalize_lock);
    891	if (c->btree_cache_alloc_lock == current) {
    892		c->btree_cache_alloc_lock = NULL;
    893		wake_up(&c->btree_cache_wait);
    894	}
    895	spin_unlock(&c->btree_cannibalize_lock);
    896}
    897
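        /*
         * Get an in-memory btree node for the node at key k: reuse a node
         * from the freeable or freed lists if possible, otherwise allocate
         * a new one, and as a last resort cannibalize another cached node.
         * Returns NULL if the node is already in the cache.
         */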
    898static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
    899			       struct bkey *k, int level)
    900{
    901	struct btree *b;
    902
    903	BUG_ON(current->bio_list);
    904
    905	lockdep_assert_held(&c->bucket_lock);
    906
    907	if (mca_find(c, k))
    908		return NULL;
    909
    910	/* btree_free() doesn't free memory; it sticks the node on the end of
    911	 * the list. Check if there's any freed nodes there:
    912	 */
    913	list_for_each_entry(b, &c->btree_cache_freeable, list)
    914		if (!mca_reap(b, btree_order(k), false))
    915			goto out;
    916
    917	/* We never free struct btree itself, just the memory that holds the on
    918	 * disk node. Check the freed list before allocating a new one:
    919	 */
    920	list_for_each_entry(b, &c->btree_cache_freed, list)
    921		if (!mca_reap(b, 0, false)) {
    922			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
    923			if (!b->keys.set[0].data)
    924				goto err;
    925			else
    926				goto out;
    927		}
    928
    929	b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
    930	if (!b)
    931		goto err;
    932
    933	BUG_ON(!down_write_trylock(&b->lock));
    934	if (!b->keys.set->data)
    935		goto err;
    936out:
    937	BUG_ON(b->io_mutex.count != 1);
    938
    939	bkey_copy(&b->key, k);
    940	list_move(&b->list, &c->btree_cache);
    941	hlist_del_init_rcu(&b->hash);
    942	hlist_add_head_rcu(&b->hash, mca_hash(c, k));
    943
    944	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
    945	b->parent	= (void *) ~0UL;
    946	b->flags	= 0;
    947	b->written	= 0;
    948	b->level	= level;
    949
    950	if (!b->level)
    951		bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
    952				    &b->c->expensive_debug_checks);
    953	else
    954		bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
    955				    &b->c->expensive_debug_checks);
    956
    957	return b;
    958err:
    959	if (b)
    960		rw_unlock(true, b);
    961
    962	b = mca_cannibalize(c, op, k);
    963	if (!IS_ERR(b))
    964		goto out;
    965
    966	return b;
    967}
    968
    969/*
    970 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
    971 * in from disk if necessary.
    972 *
    973 * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN.
    974 *
    975 * The btree node will have either a read or a write lock held, depending on
    976 * level and op->lock.
    977 */
    978struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
    979				 struct bkey *k, int level, bool write,
    980				 struct btree *parent)
    981{
    982	int i = 0;
    983	struct btree *b;
    984
    985	BUG_ON(level < 0);
    986retry:
    987	b = mca_find(c, k);
    988
    989	if (!b) {
    990		if (current->bio_list)
    991			return ERR_PTR(-EAGAIN);
    992
    993		mutex_lock(&c->bucket_lock);
    994		b = mca_alloc(c, op, k, level);
    995		mutex_unlock(&c->bucket_lock);
    996
    997		if (!b)
    998			goto retry;
    999		if (IS_ERR(b))
   1000			return b;
   1001
   1002		bch_btree_node_read(b);
   1003
   1004		if (!write)
   1005			downgrade_write(&b->lock);
   1006	} else {
   1007		rw_lock(write, b, level);
   1008		if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
   1009			rw_unlock(write, b);
   1010			goto retry;
   1011		}
   1012		BUG_ON(b->level != level);
   1013	}
   1014
   1015	if (btree_node_io_error(b)) {
   1016		rw_unlock(write, b);
   1017		return ERR_PTR(-EIO);
   1018	}
   1019
   1020	BUG_ON(!b->written);
   1021
   1022	b->parent = parent;
   1023
   1024	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
   1025		prefetch(b->keys.set[i].tree);
   1026		prefetch(b->keys.set[i].data);
   1027	}
   1028
   1029	for (; i <= b->keys.nsets; i++)
   1030		prefetch(b->keys.set[i].data);
   1031
   1032	return b;
   1033}
   1034
   1035static void btree_node_prefetch(struct btree *parent, struct bkey *k)
   1036{
   1037	struct btree *b;
   1038
   1039	mutex_lock(&parent->c->bucket_lock);
   1040	b = mca_alloc(parent->c, NULL, k, parent->level - 1);
   1041	mutex_unlock(&parent->c->bucket_lock);
   1042
   1043	if (!IS_ERR_OR_NULL(b)) {
   1044		b->parent = parent;
   1045		bch_btree_node_read(b);
   1046		rw_unlock(true, b);
   1047	}
   1048}
   1049
   1050/* Btree alloc */
   1051
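        /*
         * Free a btree node that is no longer referenced by its parent:
         * discard any pending write state, then return the on-disk bucket
         * to the allocator and the in-memory node to the freeable list.
         */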
   1052static void btree_node_free(struct btree *b)
   1053{
   1054	trace_bcache_btree_node_free(b);
   1055
   1056	BUG_ON(b == b->c->root);
   1057
   1058retry:
   1059	mutex_lock(&b->write_lock);
   1060	/*
   1061	 * If the btree node is selected and flushing in btree_flush_write(),
   1062	 * delay and retry until the BTREE_NODE_journal_flush bit cleared,
   1063	 * then it is safe to free the btree node here. Otherwise this btree
   1064	 * node will be in race condition.
   1065	 */
   1066	if (btree_node_journal_flush(b)) {
   1067		mutex_unlock(&b->write_lock);
   1068		pr_debug("bnode %p journal_flush set, retry\n", b);
   1069		udelay(1);
   1070		goto retry;
   1071	}
   1072
   1073	if (btree_node_dirty(b)) {
   1074		btree_complete_write(b, btree_current_write(b));
   1075		clear_bit(BTREE_NODE_dirty, &b->flags);
   1076	}
   1077
   1078	mutex_unlock(&b->write_lock);
   1079
   1080	cancel_delayed_work(&b->work);
   1081
   1082	mutex_lock(&b->c->bucket_lock);
   1083	bch_bucket_free(b->c, &b->key);
   1084	mca_bucket_free(b);
   1085	mutex_unlock(&b->c->bucket_lock);
   1086}
   1087
   1088struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
   1089				     int level, bool wait,
   1090				     struct btree *parent)
   1091{
   1092	BKEY_PADDED(key) k;
   1093	struct btree *b = ERR_PTR(-EAGAIN);
   1094
   1095	mutex_lock(&c->bucket_lock);
   1096retry:
   1097	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait))
   1098		goto err;
   1099
   1100	bkey_put(c, &k.key);
   1101	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
   1102
   1103	b = mca_alloc(c, op, &k.key, level);
   1104	if (IS_ERR(b))
   1105		goto err_free;
   1106
   1107	if (!b) {
   1108		cache_bug(c,
   1109			"Tried to allocate bucket that was in btree cache");
   1110		goto retry;
   1111	}
   1112
   1113	b->parent = parent;
   1114	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->cache->sb));
   1115
   1116	mutex_unlock(&c->bucket_lock);
   1117
   1118	trace_bcache_btree_node_alloc(b);
   1119	return b;
   1120err_free:
   1121	bch_bucket_free(c, &k.key);
   1122err:
   1123	mutex_unlock(&c->bucket_lock);
   1124
   1125	trace_bcache_btree_node_alloc_fail(c);
   1126	return b;
   1127}
   1128
   1129static struct btree *bch_btree_node_alloc(struct cache_set *c,
   1130					  struct btree_op *op, int level,
   1131					  struct btree *parent)
   1132{
   1133	return __bch_btree_node_alloc(c, op, level, op != NULL, parent);
   1134}
   1135
   1136static struct btree *btree_node_alloc_replacement(struct btree *b,
   1137						  struct btree_op *op)
   1138{
   1139	struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
   1140
   1141	if (!IS_ERR_OR_NULL(n)) {
   1142		mutex_lock(&n->write_lock);
   1143		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
   1144		bkey_copy_key(&n->key, &b->key);
   1145		mutex_unlock(&n->write_lock);
   1146	}
   1147
   1148	return n;
   1149}
   1150
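        /*
         * Build the key that will free node b: the same pointers as b->key
         * but keyed as ZERO_KEY, with each bucket's generation incremented
         * so the old pointers to b become stale.
         */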
   1151static void make_btree_freeing_key(struct btree *b, struct bkey *k)
   1152{
   1153	unsigned int i;
   1154
   1155	mutex_lock(&b->c->bucket_lock);
   1156
   1157	atomic_inc(&b->c->prio_blocked);
   1158
   1159	bkey_copy(k, &b->key);
   1160	bkey_copy_key(k, &ZERO_KEY);
   1161
   1162	for (i = 0; i < KEY_PTRS(k); i++)
   1163		SET_PTR_GEN(k, i,
   1164			    bch_inc_gen(b->c->cache,
   1165					PTR_BUCKET(b->c, &b->key, i)));
   1166
   1167	mutex_unlock(&b->c->bucket_lock);
   1168}
   1169
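        /*
         * Make sure enough buckets are reserved for splits between this
         * node and the root (two per level, plus one); returns -EINTR and
         * puts the caller on the wait queue if the reserve is too low.
         */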
   1170static int btree_check_reserve(struct btree *b, struct btree_op *op)
   1171{
   1172	struct cache_set *c = b->c;
   1173	struct cache *ca = c->cache;
   1174	unsigned int reserve = (c->root->level - b->level) * 2 + 1;
   1175
   1176	mutex_lock(&c->bucket_lock);
   1177
   1178	if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
   1179		if (op)
   1180			prepare_to_wait(&c->btree_cache_wait, &op->wait,
   1181					TASK_UNINTERRUPTIBLE);
   1182		mutex_unlock(&c->bucket_lock);
   1183		return -EINTR;
   1184	}
   1185
   1186	mutex_unlock(&c->bucket_lock);
   1187
   1188	return mca_cannibalize_lock(b->c, op);
   1189}
   1190
   1191/* Garbage collection */
   1192
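        /*
         * Mark the buckets a key points to for garbage collection: update
         * last_gc, set the GC mark (metadata/dirty/reclaimable) and the
         * sectors-used count, and return how stale the pointers are.
         */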
   1193static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
   1194				    struct bkey *k)
   1195{
   1196	uint8_t stale = 0;
   1197	unsigned int i;
   1198	struct bucket *g;
   1199
   1200	/*
   1201	 * ptr_invalid() can't return true for the keys that mark btree nodes as
   1202	 * freed, but since ptr_bad() returns true we'll never actually use them
    1203		 * for anything and thus we don't want to mark their pointers here
   1204	 */
   1205	if (!bkey_cmp(k, &ZERO_KEY))
   1206		return stale;
   1207
   1208	for (i = 0; i < KEY_PTRS(k); i++) {
   1209		if (!ptr_available(c, k, i))
   1210			continue;
   1211
   1212		g = PTR_BUCKET(c, k, i);
   1213
   1214		if (gen_after(g->last_gc, PTR_GEN(k, i)))
   1215			g->last_gc = PTR_GEN(k, i);
   1216
   1217		if (ptr_stale(c, k, i)) {
   1218			stale = max(stale, ptr_stale(c, k, i));
   1219			continue;
   1220		}
   1221
   1222		cache_bug_on(GC_MARK(g) &&
   1223			     (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
   1224			     c, "inconsistent ptrs: mark = %llu, level = %i",
   1225			     GC_MARK(g), level);
   1226
   1227		if (level)
   1228			SET_GC_MARK(g, GC_MARK_METADATA);
   1229		else if (KEY_DIRTY(k))
   1230			SET_GC_MARK(g, GC_MARK_DIRTY);
   1231		else if (!GC_MARK(g))
   1232			SET_GC_MARK(g, GC_MARK_RECLAIMABLE);
   1233
   1234		/* guard against overflow */
   1235		SET_GC_SECTORS_USED(g, min_t(unsigned int,
   1236					     GC_SECTORS_USED(g) + KEY_SIZE(k),
   1237					     MAX_GC_SECTORS_USED));
   1238
   1239		BUG_ON(!GC_SECTORS_USED(g));
   1240	}
   1241
   1242	return stale;
   1243}
   1244
   1245#define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)
   1246
   1247void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
   1248{
   1249	unsigned int i;
   1250
   1251	for (i = 0; i < KEY_PTRS(k); i++)
   1252		if (ptr_available(c, k, i) &&
   1253		    !ptr_stale(c, k, i)) {
   1254			struct bucket *b = PTR_BUCKET(c, k, i);
   1255
   1256			b->gen = PTR_GEN(k, i);
   1257
   1258			if (level && bkey_cmp(k, &ZERO_KEY))
   1259				b->prio = BTREE_PRIO;
   1260			else if (!level && b->prio == BTREE_PRIO)
   1261				b->prio = INITIAL_PRIO;
   1262		}
   1263
   1264	__bch_btree_mark_key(c, level, k);
   1265}
   1266
   1267void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats)
   1268{
   1269	stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets;
   1270}
   1271
   1272static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
   1273{
   1274	uint8_t stale = 0;
   1275	unsigned int keys = 0, good_keys = 0;
   1276	struct bkey *k;
   1277	struct btree_iter iter;
   1278	struct bset_tree *t;
   1279
   1280	gc->nodes++;
   1281
   1282	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
   1283		stale = max(stale, btree_mark_key(b, k));
   1284		keys++;
   1285
   1286		if (bch_ptr_bad(&b->keys, k))
   1287			continue;
   1288
   1289		gc->key_bytes += bkey_u64s(k);
   1290		gc->nkeys++;
   1291		good_keys++;
   1292
   1293		gc->data += KEY_SIZE(k);
   1294	}
   1295
   1296	for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
   1297		btree_bug_on(t->size &&
   1298			     bset_written(&b->keys, t) &&
   1299			     bkey_cmp(&b->key, &t->end) < 0,
   1300			     b, "found short btree key in gc");
   1301
   1302	if (b->c->gc_always_rewrite)
   1303		return true;
   1304
   1305	if (stale > 10)
   1306		return true;
   1307
   1308	if ((keys - good_keys) * 2 > keys)
   1309		return true;
   1310
   1311	return false;
   1312}
   1313
   1314#define GC_MERGE_NODES	4U
   1315
   1316struct gc_merge_info {
   1317	struct btree	*b;
   1318	unsigned int	keys;
   1319};
   1320
   1321static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
   1322				 struct keylist *insert_keys,
   1323				 atomic_t *journal_ref,
   1324				 struct bkey *replace_key);
   1325
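        /*
         * Try to merge the last GC_MERGE_NODES nodes visited by gc into
         * fewer nodes when their keys fit into roughly 2/3 of the space.
         * Returns -EINTR if the tree was modified (the caller's iterator
         * is no longer valid), 0 if nothing was coalesced.
         */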
   1326static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
   1327			     struct gc_stat *gc, struct gc_merge_info *r)
   1328{
   1329	unsigned int i, nodes = 0, keys = 0, blocks;
   1330	struct btree *new_nodes[GC_MERGE_NODES];
   1331	struct keylist keylist;
   1332	struct closure cl;
   1333	struct bkey *k;
   1334
   1335	bch_keylist_init(&keylist);
   1336
   1337	if (btree_check_reserve(b, NULL))
   1338		return 0;
   1339
   1340	memset(new_nodes, 0, sizeof(new_nodes));
   1341	closure_init_stack(&cl);
   1342
   1343	while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
   1344		keys += r[nodes++].keys;
   1345
   1346	blocks = btree_default_blocks(b->c) * 2 / 3;
   1347
   1348	if (nodes < 2 ||
   1349	    __set_blocks(b->keys.set[0].data, keys,
   1350			 block_bytes(b->c->cache)) > blocks * (nodes - 1))
   1351		return 0;
   1352
   1353	for (i = 0; i < nodes; i++) {
   1354		new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
   1355		if (IS_ERR_OR_NULL(new_nodes[i]))
   1356			goto out_nocoalesce;
   1357	}
   1358
   1359	/*
   1360	 * We have to check the reserve here, after we've allocated our new
   1361	 * nodes, to make sure the insert below will succeed - we also check
   1362	 * before as an optimization to potentially avoid a bunch of expensive
   1363	 * allocs/sorts
   1364	 */
   1365	if (btree_check_reserve(b, NULL))
   1366		goto out_nocoalesce;
   1367
   1368	for (i = 0; i < nodes; i++)
   1369		mutex_lock(&new_nodes[i]->write_lock);
   1370
   1371	for (i = nodes - 1; i > 0; --i) {
   1372		struct bset *n1 = btree_bset_first(new_nodes[i]);
   1373		struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
   1374		struct bkey *k, *last = NULL;
   1375
   1376		keys = 0;
   1377
   1378		if (i > 1) {
   1379			for (k = n2->start;
   1380			     k < bset_bkey_last(n2);
   1381			     k = bkey_next(k)) {
   1382				if (__set_blocks(n1, n1->keys + keys +
   1383						 bkey_u64s(k),
   1384						 block_bytes(b->c->cache)) > blocks)
   1385					break;
   1386
   1387				last = k;
   1388				keys += bkey_u64s(k);
   1389			}
   1390		} else {
   1391			/*
   1392			 * Last node we're not getting rid of - we're getting
   1393			 * rid of the node at r[0]. Have to try and fit all of
   1394			 * the remaining keys into this node; we can't ensure
   1395			 * they will always fit due to rounding and variable
   1396			 * length keys (shouldn't be possible in practice,
   1397			 * though)
   1398			 */
   1399			if (__set_blocks(n1, n1->keys + n2->keys,
   1400					 block_bytes(b->c->cache)) >
   1401			    btree_blocks(new_nodes[i]))
   1402				goto out_unlock_nocoalesce;
   1403
   1404			keys = n2->keys;
   1405			/* Take the key of the node we're getting rid of */
   1406			last = &r->b->key;
   1407		}
   1408
   1409		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) >
   1410		       btree_blocks(new_nodes[i]));
   1411
   1412		if (last)
   1413			bkey_copy_key(&new_nodes[i]->key, last);
   1414
   1415		memcpy(bset_bkey_last(n1),
   1416		       n2->start,
   1417		       (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
   1418
   1419		n1->keys += keys;
   1420		r[i].keys = n1->keys;
   1421
   1422		memmove(n2->start,
   1423			bset_bkey_idx(n2, keys),
   1424			(void *) bset_bkey_last(n2) -
   1425			(void *) bset_bkey_idx(n2, keys));
   1426
   1427		n2->keys -= keys;
   1428
   1429		if (__bch_keylist_realloc(&keylist,
   1430					  bkey_u64s(&new_nodes[i]->key)))
   1431			goto out_unlock_nocoalesce;
   1432
   1433		bch_btree_node_write(new_nodes[i], &cl);
   1434		bch_keylist_add(&keylist, &new_nodes[i]->key);
   1435	}
   1436
   1437	for (i = 0; i < nodes; i++)
   1438		mutex_unlock(&new_nodes[i]->write_lock);
   1439
   1440	closure_sync(&cl);
   1441
   1442	/* We emptied out this node */
   1443	BUG_ON(btree_bset_first(new_nodes[0])->keys);
   1444	btree_node_free(new_nodes[0]);
   1445	rw_unlock(true, new_nodes[0]);
   1446	new_nodes[0] = NULL;
   1447
   1448	for (i = 0; i < nodes; i++) {
   1449		if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
   1450			goto out_nocoalesce;
   1451
   1452		make_btree_freeing_key(r[i].b, keylist.top);
   1453		bch_keylist_push(&keylist);
   1454	}
   1455
   1456	bch_btree_insert_node(b, op, &keylist, NULL, NULL);
   1457	BUG_ON(!bch_keylist_empty(&keylist));
   1458
   1459	for (i = 0; i < nodes; i++) {
   1460		btree_node_free(r[i].b);
   1461		rw_unlock(true, r[i].b);
   1462
   1463		r[i].b = new_nodes[i];
   1464	}
   1465
   1466	memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
   1467	r[nodes - 1].b = ERR_PTR(-EINTR);
   1468
   1469	trace_bcache_btree_gc_coalesce(nodes);
   1470	gc->nodes--;
   1471
   1472	bch_keylist_free(&keylist);
   1473
   1474	/* Invalidated our iterator */
   1475	return -EINTR;
   1476
   1477out_unlock_nocoalesce:
   1478	for (i = 0; i < nodes; i++)
   1479		mutex_unlock(&new_nodes[i]->write_lock);
   1480
   1481out_nocoalesce:
   1482	closure_sync(&cl);
   1483
   1484	while ((k = bch_keylist_pop(&keylist)))
   1485		if (!bkey_cmp(k, &ZERO_KEY))
   1486			atomic_dec(&b->c->prio_blocked);
   1487	bch_keylist_free(&keylist);
   1488
   1489	for (i = 0; i < nodes; i++)
   1490		if (!IS_ERR_OR_NULL(new_nodes[i])) {
   1491			btree_node_free(new_nodes[i]);
   1492			rw_unlock(true, new_nodes[i]);
   1493		}
   1494	return 0;
   1495}
   1496
   1497static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
   1498				 struct btree *replace)
   1499{
   1500	struct keylist keys;
   1501	struct btree *n;
   1502
   1503	if (btree_check_reserve(b, NULL))
   1504		return 0;
   1505
   1506	n = btree_node_alloc_replacement(replace, NULL);
   1507
   1508	/* recheck reserve after allocating replacement node */
   1509	if (btree_check_reserve(b, NULL)) {
   1510		btree_node_free(n);
   1511		rw_unlock(true, n);
   1512		return 0;
   1513	}
   1514
   1515	bch_btree_node_write_sync(n);
   1516
   1517	bch_keylist_init(&keys);
   1518	bch_keylist_add(&keys, &n->key);
   1519
   1520	make_btree_freeing_key(replace, keys.top);
   1521	bch_keylist_push(&keys);
   1522
   1523	bch_btree_insert_node(b, op, &keys, NULL, NULL);
   1524	BUG_ON(!bch_keylist_empty(&keys));
   1525
   1526	btree_node_free(replace);
   1527	rw_unlock(true, n);
   1528
   1529	/* Invalidated our iterator */
   1530	return -EINTR;
   1531}
   1532
   1533static unsigned int btree_gc_count_keys(struct btree *b)
   1534{
   1535	struct bkey *k;
   1536	struct btree_iter iter;
   1537	unsigned int ret = 0;
   1538
   1539	for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
   1540		ret += bkey_u64s(k);
   1541
   1542	return ret;
   1543}
   1544
   1545static size_t btree_gc_min_nodes(struct cache_set *c)
   1546{
   1547	size_t min_nodes;
   1548
   1549	/*
    1550	 * Incremental GC pauses for GC_SLEEP_MS whenever front-end
    1551	 * I/O arrives, so if GC only processed a constant number
    1552	 * (100) of nodes per pass, GC on a large btree would take a
    1553	 * very long time and the front-end I/Os could run out of
    1554	 * buckets (no new bucket can be allocated during GC) and be
    1555	 * blocked again.
    1556	 * Instead of a constant, the number of nodes processed per
    1557	 * pass scales with the total number of btree nodes: GC is
    1558	 * divided into a constant number (MAX_GC_TIMES, 100) of
    1559	 * passes, so with many btree nodes GC processes more nodes
    1560	 * per pass, and with fewer it processes fewer per pass (but
    1561	 * never less than MIN_GC_NODES).
   1562	 */
   1563	min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
   1564	if (min_nodes < MIN_GC_NODES)
   1565		min_nodes = MIN_GC_NODES;
   1566
   1567	return min_nodes;
   1568}
   1569
   1570
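        /*
         * Walk the children of an interior node, keeping a window of the
         * last GC_MERGE_NODES children so they can be coalesced, rewriting
         * nodes the mark pass flagged, and recursing into interior
         * children. Returns -EAGAIN to yield to front-end I/O.
         */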
   1571static int btree_gc_recurse(struct btree *b, struct btree_op *op,
   1572			    struct closure *writes, struct gc_stat *gc)
   1573{
   1574	int ret = 0;
   1575	bool should_rewrite;
   1576	struct bkey *k;
   1577	struct btree_iter iter;
   1578	struct gc_merge_info r[GC_MERGE_NODES];
   1579	struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
   1580
   1581	bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
   1582
   1583	for (i = r; i < r + ARRAY_SIZE(r); i++)
   1584		i->b = ERR_PTR(-EINTR);
   1585
   1586	while (1) {
   1587		k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
   1588		if (k) {
   1589			r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
   1590						  true, b);
   1591			if (IS_ERR(r->b)) {
   1592				ret = PTR_ERR(r->b);
   1593				break;
   1594			}
   1595
   1596			r->keys = btree_gc_count_keys(r->b);
   1597
   1598			ret = btree_gc_coalesce(b, op, gc, r);
   1599			if (ret)
   1600				break;
   1601		}
   1602
   1603		if (!last->b)
   1604			break;
   1605
   1606		if (!IS_ERR(last->b)) {
   1607			should_rewrite = btree_gc_mark_node(last->b, gc);
   1608			if (should_rewrite) {
   1609				ret = btree_gc_rewrite_node(b, op, last->b);
   1610				if (ret)
   1611					break;
   1612			}
   1613
   1614			if (last->b->level) {
   1615				ret = btree_gc_recurse(last->b, op, writes, gc);
   1616				if (ret)
   1617					break;
   1618			}
   1619
   1620			bkey_copy_key(&b->c->gc_done, &last->b->key);
   1621
   1622			/*
   1623			 * Must flush leaf nodes before gc ends, since replace
   1624			 * operations aren't journalled
   1625			 */
   1626			mutex_lock(&last->b->write_lock);
   1627			if (btree_node_dirty(last->b))
   1628				bch_btree_node_write(last->b, writes);
   1629			mutex_unlock(&last->b->write_lock);
   1630			rw_unlock(true, last->b);
   1631		}
   1632
   1633		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
   1634		r->b = NULL;
   1635
   1636		if (atomic_read(&b->c->search_inflight) &&
   1637		    gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
   1638			gc->nodes_pre =  gc->nodes;
   1639			ret = -EAGAIN;
   1640			break;
   1641		}
   1642
   1643		if (need_resched()) {
   1644			ret = -EAGAIN;
   1645			break;
   1646		}
   1647	}
   1648
   1649	for (i = r; i < r + ARRAY_SIZE(r); i++)
   1650		if (!IS_ERR_OR_NULL(i->b)) {
   1651			mutex_lock(&i->b->write_lock);
   1652			if (btree_node_dirty(i->b))
   1653				bch_btree_node_write(i->b, writes);
   1654			mutex_unlock(&i->b->write_lock);
   1655			rw_unlock(true, i->b);
   1656		}
   1657
   1658	return ret;
   1659}
   1660
   1661static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
   1662			     struct closure *writes, struct gc_stat *gc)
   1663{
   1664	struct btree *n = NULL;
   1665	int ret = 0;
   1666	bool should_rewrite;
   1667
   1668	should_rewrite = btree_gc_mark_node(b, gc);
   1669	if (should_rewrite) {
   1670		n = btree_node_alloc_replacement(b, NULL);
   1671
   1672		if (!IS_ERR_OR_NULL(n)) {
   1673			bch_btree_node_write_sync(n);
   1674
   1675			bch_btree_set_root(n);
   1676			btree_node_free(b);
   1677			rw_unlock(true, n);
   1678
   1679			return -EINTR;
   1680		}
   1681	}
   1682
   1683	__bch_btree_mark_key(b->c, b->level + 1, &b->key);
   1684
   1685	if (b->level) {
   1686		ret = btree_gc_recurse(b, op, writes, gc);
   1687		if (ret)
   1688			return ret;
   1689	}
   1690
   1691	bkey_copy_key(&b->c->gc_done, &b->key);
   1692
   1693	return ret;
   1694}
   1695
   1696static void btree_gc_start(struct cache_set *c)
   1697{
   1698	struct cache *ca;
   1699	struct bucket *b;
   1700
   1701	if (!c->gc_mark_valid)
   1702		return;
   1703
   1704	mutex_lock(&c->bucket_lock);
   1705
   1706	c->gc_mark_valid = 0;
   1707	c->gc_done = ZERO_KEY;
   1708
   1709	ca = c->cache;
   1710	for_each_bucket(b, ca) {
   1711		b->last_gc = b->gen;
   1712		if (!atomic_read(&b->pin)) {
   1713			SET_GC_MARK(b, 0);
   1714			SET_GC_SECTORS_USED(b, 0);
   1715		}
   1716	}
   1717
   1718	mutex_unlock(&c->bucket_lock);
   1719}
   1720
   1721static void bch_btree_gc_finish(struct cache_set *c)
   1722{
   1723	struct bucket *b;
   1724	struct cache *ca;
   1725	unsigned int i, j;
   1726	uint64_t *k;
   1727
   1728	mutex_lock(&c->bucket_lock);
   1729
   1730	set_gc_sectors(c);
   1731	c->gc_mark_valid = 1;
   1732	c->need_gc	= 0;
   1733
   1734	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
   1735		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
   1736			    GC_MARK_METADATA);
   1737
   1738	/* don't reclaim buckets to which writeback keys point */
   1739	rcu_read_lock();
   1740	for (i = 0; i < c->devices_max_used; i++) {
   1741		struct bcache_device *d = c->devices[i];
   1742		struct cached_dev *dc;
   1743		struct keybuf_key *w, *n;
   1744
   1745		if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
   1746			continue;
   1747		dc = container_of(d, struct cached_dev, disk);
   1748
   1749		spin_lock(&dc->writeback_keys.lock);
   1750		rbtree_postorder_for_each_entry_safe(w, n,
   1751					&dc->writeback_keys.keys, node)
   1752			for (j = 0; j < KEY_PTRS(&w->key); j++)
   1753				SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
   1754					    GC_MARK_DIRTY);
   1755		spin_unlock(&dc->writeback_keys.lock);
   1756	}
   1757	rcu_read_unlock();
   1758
   1759	c->avail_nbuckets = 0;
   1760
   1761	ca = c->cache;
   1762	ca->invalidate_needs_gc = 0;
   1763
   1764	for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
   1765		SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
   1766
   1767	for (k = ca->prio_buckets;
   1768	     k < ca->prio_buckets + prio_buckets(ca) * 2; k++)
   1769		SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
   1770
   1771	for_each_bucket(b, ca) {
   1772		c->need_gc	= max(c->need_gc, bucket_gc_gen(b));
   1773
   1774		if (atomic_read(&b->pin))
   1775			continue;
   1776
   1777		BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
   1778
   1779		if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
   1780			c->avail_nbuckets++;
   1781	}
   1782
   1783	mutex_unlock(&c->bucket_lock);
   1784}
   1785
   1786static void bch_btree_gc(struct cache_set *c)
   1787{
   1788	int ret;
   1789	struct gc_stat stats;
   1790	struct closure writes;
   1791	struct btree_op op;
   1792	uint64_t start_time = local_clock();
   1793
   1794	trace_bcache_gc_start(c);
   1795
   1796	memset(&stats, 0, sizeof(struct gc_stat));
   1797	closure_init_stack(&writes);
   1798	bch_btree_op_init(&op, SHRT_MAX);
   1799
   1800	btree_gc_start(c);
   1801
   1802	/* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
   1803	do {
   1804		ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
   1805		closure_sync(&writes);
   1806		cond_resched();
   1807
   1808		if (ret == -EAGAIN)
   1809			schedule_timeout_interruptible(msecs_to_jiffies
   1810						       (GC_SLEEP_MS));
   1811		else if (ret)
   1812			pr_warn("gc failed!\n");
   1813	} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
   1814
   1815	bch_btree_gc_finish(c);
   1816	wake_up_allocators(c);
   1817
   1818	bch_time_stats_update(&c->btree_gc_time, start_time);
   1819
   1820	stats.key_bytes *= sizeof(uint64_t);
   1821	stats.data	<<= 9;
   1822	bch_update_bucket_in_use(c, &stats);
   1823	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
   1824
   1825	trace_bcache_gc_end(c);
   1826
   1827	bch_moving_gc(c);
   1828}
   1829
   1830static bool gc_should_run(struct cache_set *c)
   1831{
   1832	struct cache *ca = c->cache;
   1833
   1834	if (ca->invalidate_needs_gc)
   1835		return true;
   1836
   1837	if (atomic_read(&c->sectors_to_gc) < 0)
   1838		return true;
   1839
   1840	return false;
   1841}
   1842
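        /*
         * Body of the "bcache_gc" kthread: sleep until gc_should_run() says a
         * pass is needed (or the cache set is stopping), then run a full gc
         * pass.
         */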
   1843static int bch_gc_thread(void *arg)
   1844{
   1845	struct cache_set *c = arg;
   1846
   1847	while (1) {
   1848		wait_event_interruptible(c->gc_wait,
   1849			   kthread_should_stop() ||
   1850			   test_bit(CACHE_SET_IO_DISABLE, &c->flags) ||
   1851			   gc_should_run(c));
   1852
   1853		if (kthread_should_stop() ||
   1854		    test_bit(CACHE_SET_IO_DISABLE, &c->flags))
   1855			break;
   1856
   1857		set_gc_sectors(c);
   1858		bch_btree_gc(c);
   1859	}
   1860
   1861	wait_for_kthread_stop();
   1862	return 0;
   1863}
   1864
   1865int bch_gc_thread_start(struct cache_set *c)
   1866{
   1867	c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
   1868	return PTR_ERR_OR_ZERO(c->gc_thread);
   1869}
   1870
   1871/* Initial partial gc */
   1872
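        /*
         * Mark every key in this node for the initial gc, then recurse into
         * the child nodes, prefetching the next child and counting nodes into
         * c->gc_stats.nodes along the way.
         */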
   1873static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
   1874{
   1875	int ret = 0;
   1876	struct bkey *k, *p = NULL;
   1877	struct btree_iter iter;
   1878
   1879	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
   1880		bch_initial_mark_key(b->c, b->level, k);
   1881
   1882	bch_initial_mark_key(b->c, b->level + 1, &b->key);
   1883
   1884	if (b->level) {
   1885		bch_btree_iter_init(&b->keys, &iter, NULL);
   1886
   1887		do {
   1888			k = bch_btree_iter_next_filter(&iter, &b->keys,
   1889						       bch_ptr_bad);
   1890			if (k) {
   1891				btree_node_prefetch(b, k);
   1892				/*
    1893				 * initialize c->gc_stats.nodes
   1894				 * for incremental GC
   1895				 */
   1896				b->c->gc_stats.nodes++;
   1897			}
   1898
   1899			if (p)
   1900				ret = bcache_btree(check_recurse, p, b, op);
   1901
   1902			p = k;
   1903		} while (p && !ret);
   1904	}
   1905
   1906	return ret;
   1907}
   1908
   1909
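        /*
         * Worker thread for the parallel initial btree check: threads claim
         * root-node keys by index via check_state->key_idx, skip keys claimed
         * by other threads, and check the subtree under each claimed key.
         */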
   1910static int bch_btree_check_thread(void *arg)
   1911{
   1912	int ret;
   1913	struct btree_check_info *info = arg;
   1914	struct btree_check_state *check_state = info->state;
   1915	struct cache_set *c = check_state->c;
   1916	struct btree_iter iter;
   1917	struct bkey *k, *p;
   1918	int cur_idx, prev_idx, skip_nr;
   1919
   1920	k = p = NULL;
   1921	cur_idx = prev_idx = 0;
   1922	ret = 0;
   1923
    1924	/* root node keys are checked before this thread is created */
   1925	bch_btree_iter_init(&c->root->keys, &iter, NULL);
   1926	k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
   1927	BUG_ON(!k);
   1928
   1929	p = k;
   1930	while (k) {
   1931		/*
   1932		 * Fetch a root node key index, skip the keys which
   1933		 * should be fetched by other threads, then check the
   1934		 * sub-tree indexed by the fetched key.
   1935		 */
   1936		spin_lock(&check_state->idx_lock);
   1937		cur_idx = check_state->key_idx;
   1938		check_state->key_idx++;
   1939		spin_unlock(&check_state->idx_lock);
   1940
   1941		skip_nr = cur_idx - prev_idx;
   1942
   1943		while (skip_nr) {
   1944			k = bch_btree_iter_next_filter(&iter,
   1945						       &c->root->keys,
   1946						       bch_ptr_bad);
   1947			if (k)
   1948				p = k;
   1949			else {
   1950				/*
    1951				 * No more keys to check in the root node;
    1952				 * the check threads already running are
    1953				 * enough, so stop creating more.
   1954				 */
   1955				atomic_set(&check_state->enough, 1);
   1956				/* Update check_state->enough earlier */
   1957				smp_mb__after_atomic();
   1958				goto out;
   1959			}
   1960			skip_nr--;
   1961			cond_resched();
   1962		}
   1963
   1964		if (p) {
   1965			struct btree_op op;
   1966
   1967			btree_node_prefetch(c->root, p);
   1968			c->gc_stats.nodes++;
   1969			bch_btree_op_init(&op, 0);
   1970			ret = bcache_btree(check_recurse, p, c->root, &op);
   1971			if (ret)
   1972				goto out;
   1973		}
   1974		p = NULL;
   1975		prev_idx = cur_idx;
   1976		cond_resched();
   1977	}
   1978
   1979out:
   1980	info->result = ret;
   1981	/* update check_state->started among all CPUs */
   1982	smp_mb__before_atomic();
   1983	if (atomic_dec_and_test(&check_state->started))
   1984		wake_up(&check_state->wait);
   1985
   1986	return ret;
   1987}
   1988
   1989
   1990
   1991static int bch_btree_chkthread_nr(void)
   1992{
   1993	int n = num_online_cpus()/2;
   1994
   1995	if (n == 0)
   1996		n = 1;
   1997	else if (n > BCH_BTR_CHKTHREAD_MAX)
   1998		n = BCH_BTR_CHKTHREAD_MAX;
   1999
   2000	return n;
   2001}
   2002
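        /*
         * Initial check of the whole btree: mark the root node's keys here,
         * and if the tree has more than one level, check the subtrees below
         * the root in parallel with up to bch_btree_chkthread_nr() kthreads.
         */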
   2003int bch_btree_check(struct cache_set *c)
   2004{
   2005	int ret = 0;
   2006	int i;
   2007	struct bkey *k = NULL;
   2008	struct btree_iter iter;
   2009	struct btree_check_state check_state;
   2010
   2011	/* check and mark root node keys */
   2012	for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
   2013		bch_initial_mark_key(c, c->root->level, k);
   2014
   2015	bch_initial_mark_key(c, c->root->level + 1, &c->root->key);
   2016
   2017	if (c->root->level == 0)
   2018		return 0;
   2019
   2020	memset(&check_state, 0, sizeof(struct btree_check_state));
   2021	check_state.c = c;
   2022	check_state.total_threads = bch_btree_chkthread_nr();
   2023	check_state.key_idx = 0;
   2024	spin_lock_init(&check_state.idx_lock);
   2025	atomic_set(&check_state.started, 0);
   2026	atomic_set(&check_state.enough, 0);
   2027	init_waitqueue_head(&check_state.wait);
   2028
   2029	rw_lock(0, c->root, c->root->level);
   2030	/*
    2031	 * Run multiple threads to check btree nodes in parallel.
    2032	 * If check_state.enough is non-zero, the check threads
    2033	 * already running are enough, and it is unnecessary to
    2034	 * create more.
   2035	 */
   2036	for (i = 0; i < check_state.total_threads; i++) {
   2037		/* fetch latest check_state.enough earlier */
   2038		smp_mb__before_atomic();
   2039		if (atomic_read(&check_state.enough))
   2040			break;
   2041
   2042		check_state.infos[i].result = 0;
   2043		check_state.infos[i].state = &check_state;
   2044
   2045		check_state.infos[i].thread =
   2046			kthread_run(bch_btree_check_thread,
   2047				    &check_state.infos[i],
   2048				    "bch_btrchk[%d]", i);
   2049		if (IS_ERR(check_state.infos[i].thread)) {
   2050			pr_err("fails to run thread bch_btrchk[%d]\n", i);
   2051			for (--i; i >= 0; i--)
   2052				kthread_stop(check_state.infos[i].thread);
   2053			ret = -ENOMEM;
   2054			goto out;
   2055		}
   2056		atomic_inc(&check_state.started);
   2057	}
   2058
   2059	/*
   2060	 * Must wait for all threads to stop.
   2061	 */
   2062	wait_event(check_state.wait, atomic_read(&check_state.started) == 0);
   2063
   2064	for (i = 0; i < check_state.total_threads; i++) {
   2065		if (check_state.infos[i].result) {
   2066			ret = check_state.infos[i].result;
   2067			goto out;
   2068		}
   2069	}
   2070
   2071out:
   2072	rw_unlock(0, c->root);
   2073	return ret;
   2074}
   2075
   2076void bch_initial_gc_finish(struct cache_set *c)
   2077{
   2078	struct cache *ca = c->cache;
   2079	struct bucket *b;
   2080
   2081	bch_btree_gc_finish(c);
   2082
   2083	mutex_lock(&c->bucket_lock);
   2084
   2085	/*
   2086	 * We need to put some unused buckets directly on the prio freelist in
   2087	 * order to get the allocator thread started - it needs freed buckets in
   2088	 * order to rewrite the prios and gens, and it needs to rewrite prios
   2089	 * and gens in order to free buckets.
   2090	 *
   2091	 * This is only safe for buckets that have no live data in them, which
   2092	 * there should always be some of.
   2093	 */
   2094	for_each_bucket(b, ca) {
   2095		if (fifo_full(&ca->free[RESERVE_PRIO]) &&
   2096		    fifo_full(&ca->free[RESERVE_BTREE]))
   2097			break;
   2098
   2099		if (bch_can_invalidate_bucket(ca, b) &&
   2100		    !GC_MARK(b)) {
   2101			__bch_invalidate_one_bucket(ca, b);
   2102			if (!fifo_push(&ca->free[RESERVE_PRIO],
   2103			   b - ca->buckets))
   2104				fifo_push(&ca->free[RESERVE_BTREE],
   2105					  b - ca->buckets);
   2106		}
   2107	}
   2108
   2109	mutex_unlock(&c->bucket_lock);
   2110}
   2111
   2112/* Btree insertion */
   2113
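        /*
         * Insert a single key into the in-memory btree node; returns false if
         * bch_btree_insert_key() decided the insert was a no-op.
         */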
   2114static bool btree_insert_key(struct btree *b, struct bkey *k,
   2115			     struct bkey *replace_key)
   2116{
   2117	unsigned int status;
   2118
   2119	BUG_ON(bkey_cmp(k, &b->key) > 0);
   2120
   2121	status = bch_btree_insert_key(&b->keys, k, replace_key);
   2122	if (status != BTREE_INSERT_STATUS_NO_INSERT) {
   2123		bch_check_keys(&b->keys, "%u for %s", status,
   2124			       replace_key ? "replace" : "insert");
   2125
   2126		trace_bcache_btree_insert_key(b, k, replace_key != NULL,
   2127					      status);
   2128		return true;
   2129	} else
   2130		return false;
   2131}
   2132
   2133static size_t insert_u64s_remaining(struct btree *b)
   2134{
   2135	long ret = bch_btree_keys_u64s_remaining(&b->keys);
   2136
   2137	/*
   2138	 * Might land in the middle of an existing extent and have to split it
   2139	 */
   2140	if (b->keys.ops->is_extents)
   2141		ret -= KEY_MAX_U64S;
   2142
   2143	return max(ret, 0L);
   2144}
   2145
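        /*
         * Insert as many keys from @insert_keys as fit into this node,
         * stopping at the first key that lies past the node's end key; a key
         * straddling the boundary is split, with only the part covered by
         * this node inserted. Sets op->insert_collision if nothing was
         * inserted.
         */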
   2146static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
   2147				  struct keylist *insert_keys,
   2148				  struct bkey *replace_key)
   2149{
   2150	bool ret = false;
   2151	int oldsize = bch_count_data(&b->keys);
   2152
   2153	while (!bch_keylist_empty(insert_keys)) {
   2154		struct bkey *k = insert_keys->keys;
   2155
   2156		if (bkey_u64s(k) > insert_u64s_remaining(b))
   2157			break;
   2158
   2159		if (bkey_cmp(k, &b->key) <= 0) {
   2160			if (!b->level)
   2161				bkey_put(b->c, k);
   2162
   2163			ret |= btree_insert_key(b, k, replace_key);
   2164			bch_keylist_pop_front(insert_keys);
   2165		} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
   2166			BKEY_PADDED(key) temp;
   2167			bkey_copy(&temp.key, insert_keys->keys);
   2168
   2169			bch_cut_back(&b->key, &temp.key);
   2170			bch_cut_front(&b->key, insert_keys->keys);
   2171
   2172			ret |= btree_insert_key(b, &temp.key, replace_key);
   2173			break;
   2174		} else {
   2175			break;
   2176		}
   2177	}
   2178
   2179	if (!ret)
   2180		op->insert_collision = true;
   2181
   2182	BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
   2183
   2184	BUG_ON(bch_count_data(&b->keys) < oldsize);
   2185	return ret;
   2186}
   2187
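        /*
         * Rewrite @b as one or two new nodes (splitting roughly 3/5 : 2/5
         * when the result would overflow a single node), insert @insert_keys
         * into the new node(s), and add the new keys to the parent,
         * allocating a new root when @b was the root and the tree has to
         * grow a level.
         */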
   2188static int btree_split(struct btree *b, struct btree_op *op,
   2189		       struct keylist *insert_keys,
   2190		       struct bkey *replace_key)
   2191{
   2192	bool split;
   2193	struct btree *n1, *n2 = NULL, *n3 = NULL;
   2194	uint64_t start_time = local_clock();
   2195	struct closure cl;
   2196	struct keylist parent_keys;
   2197
   2198	closure_init_stack(&cl);
   2199	bch_keylist_init(&parent_keys);
   2200
   2201	if (btree_check_reserve(b, op)) {
   2202		if (!b->level)
   2203			return -EINTR;
   2204		else
   2205			WARN(1, "insufficient reserve for split\n");
   2206	}
   2207
   2208	n1 = btree_node_alloc_replacement(b, op);
   2209	if (IS_ERR(n1))
   2210		goto err;
   2211
   2212	split = set_blocks(btree_bset_first(n1),
   2213			   block_bytes(n1->c->cache)) > (btree_blocks(b) * 4) / 5;
   2214
   2215	if (split) {
   2216		unsigned int keys = 0;
   2217
   2218		trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
   2219
   2220		n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent);
   2221		if (IS_ERR(n2))
   2222			goto err_free1;
   2223
   2224		if (!b->parent) {
   2225			n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL);
   2226			if (IS_ERR(n3))
   2227				goto err_free2;
   2228		}
   2229
   2230		mutex_lock(&n1->write_lock);
   2231		mutex_lock(&n2->write_lock);
   2232
   2233		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
   2234
   2235		/*
   2236		 * Has to be a linear search because we don't have an auxiliary
   2237		 * search tree yet
   2238		 */
   2239
   2240		while (keys < (btree_bset_first(n1)->keys * 3) / 5)
   2241			keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
   2242							keys));
   2243
   2244		bkey_copy_key(&n1->key,
   2245			      bset_bkey_idx(btree_bset_first(n1), keys));
   2246		keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
   2247
   2248		btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
   2249		btree_bset_first(n1)->keys = keys;
   2250
   2251		memcpy(btree_bset_first(n2)->start,
   2252		       bset_bkey_last(btree_bset_first(n1)),
   2253		       btree_bset_first(n2)->keys * sizeof(uint64_t));
   2254
   2255		bkey_copy_key(&n2->key, &b->key);
   2256
   2257		bch_keylist_add(&parent_keys, &n2->key);
   2258		bch_btree_node_write(n2, &cl);
   2259		mutex_unlock(&n2->write_lock);
   2260		rw_unlock(true, n2);
   2261	} else {
   2262		trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
   2263
   2264		mutex_lock(&n1->write_lock);
   2265		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
   2266	}
   2267
   2268	bch_keylist_add(&parent_keys, &n1->key);
   2269	bch_btree_node_write(n1, &cl);
   2270	mutex_unlock(&n1->write_lock);
   2271
   2272	if (n3) {
   2273		/* Depth increases, make a new root */
   2274		mutex_lock(&n3->write_lock);
   2275		bkey_copy_key(&n3->key, &MAX_KEY);
   2276		bch_btree_insert_keys(n3, op, &parent_keys, NULL);
   2277		bch_btree_node_write(n3, &cl);
   2278		mutex_unlock(&n3->write_lock);
   2279
   2280		closure_sync(&cl);
   2281		bch_btree_set_root(n3);
   2282		rw_unlock(true, n3);
   2283	} else if (!b->parent) {
   2284		/* Root filled up but didn't need to be split */
   2285		closure_sync(&cl);
   2286		bch_btree_set_root(n1);
   2287	} else {
   2288		/* Split a non root node */
   2289		closure_sync(&cl);
   2290		make_btree_freeing_key(b, parent_keys.top);
   2291		bch_keylist_push(&parent_keys);
   2292
   2293		bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
   2294		BUG_ON(!bch_keylist_empty(&parent_keys));
   2295	}
   2296
   2297	btree_node_free(b);
   2298	rw_unlock(true, n1);
   2299
   2300	bch_time_stats_update(&b->c->btree_split_time, start_time);
   2301
   2302	return 0;
   2303err_free2:
   2304	bkey_put(b->c, &n2->key);
   2305	btree_node_free(n2);
   2306	rw_unlock(true, n2);
   2307err_free1:
   2308	bkey_put(b->c, &n1->key);
   2309	btree_node_free(n1);
   2310	rw_unlock(true, n1);
   2311err:
   2312	WARN(1, "bcache: btree split failed (level %u)", b->level);
   2313
   2314	if (n3 == ERR_PTR(-EAGAIN) ||
   2315	    n2 == ERR_PTR(-EAGAIN) ||
   2316	    n1 == ERR_PTR(-EAGAIN))
   2317		return -EAGAIN;
   2318
   2319	return -ENOMEM;
   2320}
   2321
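        /*
         * Insert keys into a single btree node under its write lock,
         * splitting the node if they don't fit. Returns -EAGAIN or -EINTR
         * (with op->lock raised) when the caller has to retry, e.g. from a
         * context that may block or while holding a write lock higher up the
         * tree.
         */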
   2322static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
   2323				 struct keylist *insert_keys,
   2324				 atomic_t *journal_ref,
   2325				 struct bkey *replace_key)
   2326{
   2327	struct closure cl;
   2328
   2329	BUG_ON(b->level && replace_key);
   2330
   2331	closure_init_stack(&cl);
   2332
   2333	mutex_lock(&b->write_lock);
   2334
   2335	if (write_block(b) != btree_bset_last(b) &&
   2336	    b->keys.last_set_unwritten)
   2337		bch_btree_init_next(b); /* just wrote a set */
   2338
   2339	if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
   2340		mutex_unlock(&b->write_lock);
   2341		goto split;
   2342	}
   2343
   2344	BUG_ON(write_block(b) != btree_bset_last(b));
   2345
   2346	if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
   2347		if (!b->level)
   2348			bch_btree_leaf_dirty(b, journal_ref);
   2349		else
   2350			bch_btree_node_write(b, &cl);
   2351	}
   2352
   2353	mutex_unlock(&b->write_lock);
   2354
   2355	/* wait for btree node write if necessary, after unlock */
   2356	closure_sync(&cl);
   2357
   2358	return 0;
   2359split:
   2360	if (current->bio_list) {
   2361		op->lock = b->c->root->level + 1;
   2362		return -EAGAIN;
   2363	} else if (op->lock <= b->c->root->level) {
   2364		op->lock = b->c->root->level + 1;
   2365		return -EINTR;
   2366	} else {
   2367		/* Invalidated all iterators */
   2368		int ret = btree_split(b, op, insert_keys, replace_key);
   2369
   2370		if (bch_keylist_empty(insert_keys))
   2371			return 0;
   2372		else if (!ret)
   2373			return -EINTR;
   2374		return ret;
   2375	}
   2376}
   2377
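        /*
         * Insert a single key whose one pointer is random and points at
         * PTR_CHECK_DEV, upgrading @b's read lock to a write lock when
         * op->lock == -1; bails out with -EINTR if the node changed while the
         * lock was being upgraded.
         */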
   2378int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
   2379			       struct bkey *check_key)
   2380{
   2381	int ret = -EINTR;
   2382	uint64_t btree_ptr = b->key.ptr[0];
   2383	unsigned long seq = b->seq;
   2384	struct keylist insert;
   2385	bool upgrade = op->lock == -1;
   2386
   2387	bch_keylist_init(&insert);
   2388
   2389	if (upgrade) {
   2390		rw_unlock(false, b);
   2391		rw_lock(true, b, b->level);
   2392
   2393		if (b->key.ptr[0] != btree_ptr ||
   2394		    b->seq != seq + 1) {
   2395			op->lock = b->level;
   2396			goto out;
   2397		}
   2398	}
   2399
   2400	SET_KEY_PTRS(check_key, 1);
   2401	get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
   2402
   2403	SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
   2404
   2405	bch_keylist_add(&insert, check_key);
   2406
   2407	ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);
   2408
   2409	BUG_ON(!ret && !bch_keylist_empty(&insert));
   2410out:
   2411	if (upgrade)
   2412		downgrade_write(&b->lock);
   2413	return ret;
   2414}
   2415
   2416struct btree_insert_op {
   2417	struct btree_op	op;
   2418	struct keylist	*keys;
   2419	atomic_t	*journal_ref;
   2420	struct bkey	*replace_key;
   2421};
   2422
   2423static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
   2424{
   2425	struct btree_insert_op *op = container_of(b_op,
   2426					struct btree_insert_op, op);
   2427
   2428	int ret = bch_btree_insert_node(b, &op->op, op->keys,
   2429					op->journal_ref, op->replace_key);
   2430	if (ret && !bch_keylist_empty(op->keys))
   2431		return ret;
   2432	else
   2433		return MAP_DONE;
   2434}
   2435
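        /*
         * Top-level insert: map bch_btree_insert_node() over the leaf nodes
         * covered by @keys until the keylist is drained; returns -ESRCH if an
         * insert collision was recorded.
         */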
   2436int bch_btree_insert(struct cache_set *c, struct keylist *keys,
   2437		     atomic_t *journal_ref, struct bkey *replace_key)
   2438{
   2439	struct btree_insert_op op;
   2440	int ret = 0;
   2441
   2442	BUG_ON(current->bio_list);
   2443	BUG_ON(bch_keylist_empty(keys));
   2444
   2445	bch_btree_op_init(&op.op, 0);
   2446	op.keys		= keys;
   2447	op.journal_ref	= journal_ref;
   2448	op.replace_key	= replace_key;
   2449
   2450	while (!ret && !bch_keylist_empty(keys)) {
   2451		op.op.lock = 0;
   2452		ret = bch_btree_map_leaf_nodes(&op.op, c,
   2453					       &START_KEY(keys->keys),
   2454					       btree_insert_fn);
   2455	}
   2456
   2457	if (ret) {
   2458		struct bkey *k;
   2459
   2460		pr_err("error %i\n", ret);
   2461
   2462		while ((k = bch_keylist_pop(keys)))
   2463			bkey_put(c, k);
   2464	} else if (op.op.insert_collision)
   2465		ret = -ESRCH;
   2466
   2467	return ret;
   2468}
   2469
   2470void bch_btree_set_root(struct btree *b)
   2471{
   2472	unsigned int i;
   2473	struct closure cl;
   2474
   2475	closure_init_stack(&cl);
   2476
   2477	trace_bcache_btree_set_root(b);
   2478
   2479	BUG_ON(!b->written);
   2480
   2481	for (i = 0; i < KEY_PTRS(&b->key); i++)
   2482		BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
   2483
   2484	mutex_lock(&b->c->bucket_lock);
   2485	list_del_init(&b->list);
   2486	mutex_unlock(&b->c->bucket_lock);
   2487
   2488	b->c->root = b;
   2489
   2490	bch_journal_meta(b->c, &cl);
   2491	closure_sync(&cl);
   2492}
   2493
   2494/* Map across nodes or keys */
   2495
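        /*
         * Depth-first walk over btree nodes starting at @from, calling @fn on
         * leaf nodes (and on interior nodes too when flags == MAP_ALL_NODES);
         * @fn returns MAP_CONTINUE to keep walking or another value to stop.
         */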
   2496static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
   2497				       struct bkey *from,
   2498				       btree_map_nodes_fn *fn, int flags)
   2499{
   2500	int ret = MAP_CONTINUE;
   2501
   2502	if (b->level) {
   2503		struct bkey *k;
   2504		struct btree_iter iter;
   2505
   2506		bch_btree_iter_init(&b->keys, &iter, from);
   2507
   2508		while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
   2509						       bch_ptr_bad))) {
   2510			ret = bcache_btree(map_nodes_recurse, k, b,
   2511				    op, from, fn, flags);
   2512			from = NULL;
   2513
   2514			if (ret != MAP_CONTINUE)
   2515				return ret;
   2516		}
   2517	}
   2518
   2519	if (!b->level || flags == MAP_ALL_NODES)
   2520		ret = fn(op, b);
   2521
   2522	return ret;
   2523}
   2524
   2525int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
   2526			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
   2527{
   2528	return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags);
   2529}
   2530
   2531int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
   2532				      struct bkey *from, btree_map_keys_fn *fn,
   2533				      int flags)
   2534{
   2535	int ret = MAP_CONTINUE;
   2536	struct bkey *k;
   2537	struct btree_iter iter;
   2538
   2539	bch_btree_iter_init(&b->keys, &iter, from);
   2540
   2541	while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
   2542		ret = !b->level
   2543			? fn(op, b, k)
   2544			: bcache_btree(map_keys_recurse, k,
   2545				       b, op, from, fn, flags);
   2546		from = NULL;
   2547
   2548		if (ret != MAP_CONTINUE)
   2549			return ret;
   2550	}
   2551
   2552	if (!b->level && (flags & MAP_END_KEY))
   2553		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
   2554				     KEY_OFFSET(&b->key), 0));
   2555
   2556	return ret;
   2557}
   2558
   2559int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
   2560		       struct bkey *from, btree_map_keys_fn *fn, int flags)
   2561{
   2562	return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags);
   2563}
   2564
   2565/* Keybuf code */
   2566
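        /*
         * A keybuf holds a red-black tree of keys gathered from the btree by
         * a predicate (see bch_refill_keybuf()); consumers take keys one at a
         * time with bch_keybuf_next() and drop them with bch_keybuf_del().
         */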
   2567static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
   2568{
   2569	/* Overlapping keys compare equal */
   2570	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
   2571		return -1;
   2572	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
   2573		return 1;
   2574	return 0;
   2575}
   2576
   2577static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
   2578					    struct keybuf_key *r)
   2579{
   2580	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
   2581}
   2582
   2583struct refill {
   2584	struct btree_op	op;
   2585	unsigned int	nr_found;
   2586	struct keybuf	*buf;
   2587	struct bkey	*end;
   2588	keybuf_pred_fn	*pred;
   2589};
   2590
   2591static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
   2592			    struct bkey *k)
   2593{
   2594	struct refill *refill = container_of(op, struct refill, op);
   2595	struct keybuf *buf = refill->buf;
   2596	int ret = MAP_CONTINUE;
   2597
   2598	if (bkey_cmp(k, refill->end) > 0) {
   2599		ret = MAP_DONE;
   2600		goto out;
   2601	}
   2602
   2603	if (!KEY_SIZE(k)) /* end key */
   2604		goto out;
   2605
   2606	if (refill->pred(buf, k)) {
   2607		struct keybuf_key *w;
   2608
   2609		spin_lock(&buf->lock);
   2610
   2611		w = array_alloc(&buf->freelist);
   2612		if (!w) {
   2613			spin_unlock(&buf->lock);
   2614			return MAP_DONE;
   2615		}
   2616
   2617		w->private = NULL;
   2618		bkey_copy(&w->key, k);
   2619
   2620		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
   2621			array_free(&buf->freelist, w);
   2622		else
   2623			refill->nr_found++;
   2624
   2625		if (array_freelist_empty(&buf->freelist))
   2626			ret = MAP_DONE;
   2627
   2628		spin_unlock(&buf->lock);
   2629	}
   2630out:
   2631	buf->last_scanned = *k;
   2632	return ret;
   2633}
   2634
   2635void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
   2636		       struct bkey *end, keybuf_pred_fn *pred)
   2637{
   2638	struct bkey start = buf->last_scanned;
   2639	struct refill refill;
   2640
   2641	cond_resched();
   2642
   2643	bch_btree_op_init(&refill.op, -1);
   2644	refill.nr_found	= 0;
   2645	refill.buf	= buf;
   2646	refill.end	= end;
   2647	refill.pred	= pred;
   2648
   2649	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
   2650			   refill_keybuf_fn, MAP_END_KEY);
   2651
   2652	trace_bcache_keyscan(refill.nr_found,
   2653			     KEY_INODE(&start), KEY_OFFSET(&start),
   2654			     KEY_INODE(&buf->last_scanned),
   2655			     KEY_OFFSET(&buf->last_scanned));
   2656
   2657	spin_lock(&buf->lock);
   2658
   2659	if (!RB_EMPTY_ROOT(&buf->keys)) {
   2660		struct keybuf_key *w;
   2661
   2662		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
   2663		buf->start	= START_KEY(&w->key);
   2664
   2665		w = RB_LAST(&buf->keys, struct keybuf_key, node);
   2666		buf->end	= w->key;
   2667	} else {
   2668		buf->start	= MAX_KEY;
   2669		buf->end	= MAX_KEY;
   2670	}
   2671
   2672	spin_unlock(&buf->lock);
   2673}
   2674
   2675static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
   2676{
   2677	rb_erase(&w->node, &buf->keys);
   2678	array_free(&buf->freelist, w);
   2679}
   2680
   2681void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
   2682{
   2683	spin_lock(&buf->lock);
   2684	__bch_keybuf_del(buf, w);
   2685	spin_unlock(&buf->lock);
   2686}
   2687
   2688bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
   2689				  struct bkey *end)
   2690{
   2691	bool ret = false;
   2692	struct keybuf_key *p, *w, s;
   2693
   2694	s.key = *start;
   2695
   2696	if (bkey_cmp(end, &buf->start) <= 0 ||
   2697	    bkey_cmp(start, &buf->end) >= 0)
   2698		return false;
   2699
   2700	spin_lock(&buf->lock);
   2701	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
   2702
   2703	while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
   2704		p = w;
   2705		w = RB_NEXT(w, node);
   2706
   2707		if (p->private)
   2708			ret = true;
   2709		else
   2710			__bch_keybuf_del(buf, p);
   2711	}
   2712
   2713	spin_unlock(&buf->lock);
   2714	return ret;
   2715}
   2716
   2717struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
   2718{
   2719	struct keybuf_key *w;
   2720
   2721	spin_lock(&buf->lock);
   2722
   2723	w = RB_FIRST(&buf->keys, struct keybuf_key, node);
   2724
   2725	while (w && w->private)
   2726		w = RB_NEXT(w, node);
   2727
   2728	if (w)
   2729		w->private = ERR_PTR(-EINTR);
   2730
   2731	spin_unlock(&buf->lock);
   2732	return w;
   2733}
   2734
   2735struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
   2736					  struct keybuf *buf,
   2737					  struct bkey *end,
   2738					  keybuf_pred_fn *pred)
   2739{
   2740	struct keybuf_key *ret;
   2741
   2742	while (1) {
   2743		ret = bch_keybuf_next(buf);
   2744		if (ret)
   2745			break;
   2746
   2747		if (bkey_cmp(&buf->last_scanned, end) >= 0) {
   2748			pr_debug("scan finished\n");
   2749			break;
   2750		}
   2751
   2752		bch_refill_keybuf(c, buf, end, pred);
   2753	}
   2754
   2755	return ret;
   2756}
   2757
   2758void bch_keybuf_init(struct keybuf *buf)
   2759{
   2760	buf->last_scanned	= MAX_KEY;
   2761	buf->keys		= RB_ROOT;
   2762
   2763	spin_lock_init(&buf->lock);
   2764	array_allocator_init(&buf->freelist);
   2765}
   2766
   2767void bch_btree_exit(void)
   2768{
   2769	if (btree_io_wq)
   2770		destroy_workqueue(btree_io_wq);
   2771}
   2772
   2773int __init bch_btree_init(void)
   2774{
   2775	btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0);
   2776	if (!btree_io_wq)
   2777		return -ENOMEM;
   2778
   2779	return 0;
   2780}