cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

dm-btree.c (38649B)


      1/*
      2 * Copyright (C) 2011 Red Hat, Inc.
      3 *
      4 * This file is released under the GPL.
      5 */
      6
      7#include "dm-btree-internal.h"
      8#include "dm-space-map.h"
      9#include "dm-transaction-manager.h"
     10
     11#include <linux/export.h>
     12#include <linux/device-mapper.h>
     13
     14#define DM_MSG_PREFIX "btree"
     15
     16/*----------------------------------------------------------------
     17 * Array manipulation
     18 *--------------------------------------------------------------*/
     19static void memcpy_disk(void *dest, const void *src, size_t len)
     20	__dm_written_to_disk(src)
     21{
     22	memcpy(dest, src, len);
     23	__dm_unbless_for_disk(src);
     24}
     25
     26static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
     27			 unsigned index, void *elt)
     28	__dm_written_to_disk(elt)
     29{
     30	if (index < nr_elts)
     31		memmove(base + (elt_size * (index + 1)),
     32			base + (elt_size * index),
     33			(nr_elts - index) * elt_size);
     34
     35	memcpy_disk(base + (elt_size * index), elt, elt_size);
     36}
     37
     38/*----------------------------------------------------------------*/
     39
     40/* makes the assumption that no two keys are the same. */
     41static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
     42{
     43	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
     44
     45	while (hi - lo > 1) {
     46		int mid = lo + ((hi - lo) / 2);
     47		uint64_t mid_key = le64_to_cpu(n->keys[mid]);
     48
     49		if (mid_key == key)
     50			return mid;
     51
     52		if (mid_key < key)
     53			lo = mid;
     54		else
     55			hi = mid;
     56	}
     57
     58	return want_hi ? hi : lo;
     59}
     60
     61int lower_bound(struct btree_node *n, uint64_t key)
     62{
     63	return bsearch(n, key, 0);
     64}
     65
     66static int upper_bound(struct btree_node *n, uint64_t key)
     67{
     68	return bsearch(n, key, 1);
     69}
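        /*
         * A worked example of the bounds above (illustrative only): for a
         * node whose keys are { 10, 20, 30 }, lower_bound() yields the index
         * of the largest key <= the wanted key and upper_bound() the index
         * of the smallest key >= it, with misses falling off either end:
         *
         *	lower_bound(n, 20) == 1		upper_bound(n, 20) == 1
         *	lower_bound(n, 25) == 1		upper_bound(n, 25) == 2
         *	lower_bound(n, 5)  == -1	upper_bound(n, 5)  == 0
         *	lower_bound(n, 35) == 2		upper_bound(n, 35) == 3
         */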
     70
     71void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
     72		  struct dm_btree_value_type *vt)
     73{
     74	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
     75
     76	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
     77		dm_tm_with_runs(tm, value_ptr(n, 0), nr_entries, dm_tm_inc_range);
     78
     79	else if (vt->inc)
     80		vt->inc(vt->context, value_ptr(n, 0), nr_entries);
     81}
     82
     83static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
     84		     uint64_t key, void *value)
     85	__dm_written_to_disk(value)
     86{
     87	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
     88	uint32_t max_entries = le32_to_cpu(node->header.max_entries);
     89	__le64 key_le = cpu_to_le64(key);
     90
     91	if (index > nr_entries ||
     92	    index >= max_entries ||
     93	    nr_entries >= max_entries) {
     94		DMERR("too many entries in btree node for insert");
     95		__dm_unbless_for_disk(value);
     96		return -ENOMEM;
     97	}
     98
     99	__dm_bless_for_disk(&key_le);
    100
    101	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
    102	array_insert(value_base(node), value_size, nr_entries, index, value);
    103	node->header.nr_entries = cpu_to_le32(nr_entries + 1);
    104
    105	return 0;
    106}
    107
    108/*----------------------------------------------------------------*/
    109
    110/*
    111 * We want 3n entries (for some n).  This works more nicely for repeated
     112 * insert/remove loops than (2n + 1).
    113 */
    114static uint32_t calc_max_entries(size_t value_size, size_t block_size)
    115{
    116	uint32_t total, n;
    117	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */
    118
    119	block_size -= sizeof(struct node_header);
    120	total = block_size / elt_size;
    121	n = total / 3;		/* rounds down */
    122
    123	return 3 * n;
    124}
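        /*
         * Worked example (illustrative): with a 4096 byte block, 8 byte
         * values (e.g. __le64 block pointers) and assuming the 32 byte
         * node_header:
         *
         *	elt_size = 8 (key) + 8 (value)	= 16
         *	total	 = (4096 - 32) / 16	= 254
         *	n	 = 254 / 3		= 84
         *	result	 = 3 * 84		= 252
         *
         * so up to 252 entries fit, with the remainder bytes left unused.
         */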
    125
    126int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
    127{
    128	int r;
    129	struct dm_block *b;
    130	struct btree_node *n;
    131	size_t block_size;
    132	uint32_t max_entries;
    133
    134	r = new_block(info, &b);
    135	if (r < 0)
    136		return r;
    137
    138	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
    139	max_entries = calc_max_entries(info->value_type.size, block_size);
    140
    141	n = dm_block_data(b);
    142	memset(n, 0, block_size);
    143	n->header.flags = cpu_to_le32(LEAF_NODE);
    144	n->header.nr_entries = cpu_to_le32(0);
    145	n->header.max_entries = cpu_to_le32(max_entries);
    146	n->header.value_size = cpu_to_le32(info->value_type.size);
    147
    148	*root = dm_block_location(b);
    149	unlock_block(info, b);
    150
    151	return 0;
    152}
    153EXPORT_SYMBOL_GPL(dm_btree_empty);
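        /*
         * Minimal usage sketch (hypothetical caller; assumes a transaction
         * manager @tm already set up, e.g. via dm_tm_create_with_sm()):
         *
         *	struct dm_btree_info info = {
         *		.tm = tm,
         *		.levels = 1,
         *		.value_type = { .size = sizeof(__le64) },
         *	};
         *	dm_block_t root;
         *	int r = dm_btree_empty(&info, &root);
         *
         * On success @root names an empty leaf; since the tree is copy on
         * write, later operations hand back a new root to use in its place.
         */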
    154
    155/*----------------------------------------------------------------*/
    156
    157/*
     158 * Deletion uses a recursive algorithm; since we have limited stack space,
    159 * we explicitly manage our own stack on the heap.
    160 */
    161#define MAX_SPINE_DEPTH 64
    162struct frame {
    163	struct dm_block *b;
    164	struct btree_node *n;
    165	unsigned level;
    166	unsigned nr_children;
    167	unsigned current_child;
    168};
    169
    170struct del_stack {
    171	struct dm_btree_info *info;
    172	struct dm_transaction_manager *tm;
    173	int top;
    174	struct frame spine[MAX_SPINE_DEPTH];
    175};
    176
    177static int top_frame(struct del_stack *s, struct frame **f)
    178{
    179	if (s->top < 0) {
    180		DMERR("btree deletion stack empty");
    181		return -EINVAL;
    182	}
    183
    184	*f = s->spine + s->top;
    185
    186	return 0;
    187}
    188
    189static int unprocessed_frames(struct del_stack *s)
    190{
    191	return s->top >= 0;
    192}
    193
    194static void prefetch_children(struct del_stack *s, struct frame *f)
    195{
    196	unsigned i;
    197	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);
    198
    199	for (i = 0; i < f->nr_children; i++)
    200		dm_bm_prefetch(bm, value64(f->n, i));
    201}
    202
    203static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
    204{
    205	return f->level < (info->levels - 1);
    206}
    207
    208static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
    209{
    210	int r;
    211	uint32_t ref_count;
    212
    213	if (s->top >= MAX_SPINE_DEPTH - 1) {
    214		DMERR("btree deletion stack out of memory");
    215		return -ENOMEM;
    216	}
    217
    218	r = dm_tm_ref(s->tm, b, &ref_count);
    219	if (r)
    220		return r;
    221
    222	if (ref_count > 1)
    223		/*
     224		 * This is a shared node, so we can just decrement its
    225		 * reference counter and leave the children.
    226		 */
    227		dm_tm_dec(s->tm, b);
    228
    229	else {
    230		uint32_t flags;
    231		struct frame *f = s->spine + ++s->top;
    232
    233		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
    234		if (r) {
    235			s->top--;
    236			return r;
    237		}
    238
    239		f->n = dm_block_data(f->b);
    240		f->level = level;
    241		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
    242		f->current_child = 0;
    243
    244		flags = le32_to_cpu(f->n->header.flags);
    245		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
    246			prefetch_children(s, f);
    247	}
    248
    249	return 0;
    250}
    251
    252static void pop_frame(struct del_stack *s)
    253{
    254	struct frame *f = s->spine + s->top--;
    255
    256	dm_tm_dec(s->tm, dm_block_location(f->b));
    257	dm_tm_unlock(s->tm, f->b);
    258}
    259
    260static void unlock_all_frames(struct del_stack *s)
    261{
    262	struct frame *f;
    263
    264	while (unprocessed_frames(s)) {
    265		f = s->spine + s->top--;
    266		dm_tm_unlock(s->tm, f->b);
    267	}
    268}
    269
    270int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
    271{
    272	int r;
    273	struct del_stack *s;
    274
    275	/*
     276	 * dm_btree_del() is called via an ioctl, and as such should be
    277	 * considered an FS op.  We can't recurse back into the FS, so we
    278	 * allocate GFP_NOFS.
    279	 */
    280	s = kmalloc(sizeof(*s), GFP_NOFS);
    281	if (!s)
    282		return -ENOMEM;
    283	s->info = info;
    284	s->tm = info->tm;
    285	s->top = -1;
    286
    287	r = push_frame(s, root, 0);
    288	if (r)
    289		goto out;
    290
    291	while (unprocessed_frames(s)) {
    292		uint32_t flags;
    293		struct frame *f;
    294		dm_block_t b;
    295
    296		r = top_frame(s, &f);
    297		if (r)
    298			goto out;
    299
    300		if (f->current_child >= f->nr_children) {
    301			pop_frame(s);
    302			continue;
    303		}
    304
    305		flags = le32_to_cpu(f->n->header.flags);
    306		if (flags & INTERNAL_NODE) {
    307			b = value64(f->n, f->current_child);
    308			f->current_child++;
    309			r = push_frame(s, b, f->level);
    310			if (r)
    311				goto out;
    312
    313		} else if (is_internal_level(info, f)) {
    314			b = value64(f->n, f->current_child);
    315			f->current_child++;
    316			r = push_frame(s, b, f->level + 1);
    317			if (r)
    318				goto out;
    319
    320		} else {
    321			if (info->value_type.dec)
    322				info->value_type.dec(info->value_type.context,
    323						     value_ptr(f->n, 0), f->nr_children);
    324			pop_frame(s);
    325		}
    326	}
    327out:
    328	if (r) {
    329		/* cleanup all frames of del_stack */
    330		unlock_all_frames(s);
    331	}
    332	kfree(s);
    333
    334	return r;
    335}
    336EXPORT_SYMBOL_GPL(dm_btree_del);
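        /*
         * Usage note (sketch): a whole tree is dropped with a single call,
         * e.g. when a thin device or snapshot is deleted:
         *
         *	r = dm_btree_del(&info, root);
         *
         * Subtrees whose blocks are shared with other trees only have their
         * reference counts decremented (see push_frame() above), so deleting
         * one snapshot leaves its siblings' data intact.
         */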
    337
    338/*----------------------------------------------------------------*/
    339
    340static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
    341			    int (*search_fn)(struct btree_node *, uint64_t),
    342			    uint64_t *result_key, void *v, size_t value_size)
    343{
    344	int i, r;
    345	uint32_t flags, nr_entries;
    346
    347	do {
    348		r = ro_step(s, block);
    349		if (r < 0)
    350			return r;
    351
    352		i = search_fn(ro_node(s), key);
    353
    354		flags = le32_to_cpu(ro_node(s)->header.flags);
    355		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
    356		if (i < 0 || i >= nr_entries)
    357			return -ENODATA;
    358
    359		if (flags & INTERNAL_NODE)
    360			block = value64(ro_node(s), i);
    361
    362	} while (!(flags & LEAF_NODE));
    363
    364	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
    365	if (v)
    366		memcpy(v, value_ptr(ro_node(s), i), value_size);
    367
    368	return 0;
    369}
    370
    371int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
    372		    uint64_t *keys, void *value_le)
    373{
    374	unsigned level, last_level = info->levels - 1;
    375	int r = -ENODATA;
    376	uint64_t rkey;
    377	__le64 internal_value_le;
    378	struct ro_spine spine;
    379
    380	init_ro_spine(&spine, info);
    381	for (level = 0; level < info->levels; level++) {
    382		size_t size;
    383		void *value_p;
    384
    385		if (level == last_level) {
    386			value_p = value_le;
    387			size = info->value_type.size;
    388
    389		} else {
    390			value_p = &internal_value_le;
    391			size = sizeof(uint64_t);
    392		}
    393
    394		r = btree_lookup_raw(&spine, root, keys[level],
    395				     lower_bound, &rkey,
    396				     value_p, size);
    397
    398		if (!r) {
    399			if (rkey != keys[level]) {
    400				exit_ro_spine(&spine);
    401				return -ENODATA;
    402			}
    403		} else {
    404			exit_ro_spine(&spine);
    405			return r;
    406		}
    407
    408		root = le64_to_cpu(internal_value_le);
    409	}
    410	exit_ro_spine(&spine);
    411
    412	return r;
    413}
    414EXPORT_SYMBOL_GPL(dm_btree_lookup);
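        /*
         * Lookup sketch (hypothetical): for a two level tree indexed by
         * (dev_id, block), keys[] carries one 64 bit key per level and the
         * value comes back in little endian form:
         *
         *	uint64_t keys[2] = { dev_id, block };
         *	__le64 value_le;
         *
         *	r = dm_btree_lookup(&info, root, keys, &value_le);
         *
         * -ENODATA means the key is not present.
         */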
    415
    416static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
    417				       uint64_t key, uint64_t *rkey, void *value_le)
    418{
    419	int r, i;
    420	uint32_t flags, nr_entries;
    421	struct dm_block *node;
    422	struct btree_node *n;
    423
    424	r = bn_read_lock(info, root, &node);
    425	if (r)
    426		return r;
    427
    428	n = dm_block_data(node);
    429	flags = le32_to_cpu(n->header.flags);
    430	nr_entries = le32_to_cpu(n->header.nr_entries);
    431
    432	if (flags & INTERNAL_NODE) {
    433		i = lower_bound(n, key);
    434		if (i < 0) {
    435			/*
    436			 * avoid early -ENODATA return when all entries are
    437			 * higher than the search @key.
    438			 */
    439			i = 0;
    440		}
    441		if (i >= nr_entries) {
    442			r = -ENODATA;
    443			goto out;
    444		}
    445
    446		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
    447		if (r == -ENODATA && i < (nr_entries - 1)) {
    448			i++;
    449			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
    450		}
    451
    452	} else {
    453		i = upper_bound(n, key);
    454		if (i < 0 || i >= nr_entries) {
    455			r = -ENODATA;
    456			goto out;
    457		}
    458
    459		*rkey = le64_to_cpu(n->keys[i]);
    460		memcpy(value_le, value_ptr(n, i), info->value_type.size);
    461	}
    462out:
    463	dm_tm_unlock(info->tm, node);
    464	return r;
    465}
    466
    467int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
    468			 uint64_t *keys, uint64_t *rkey, void *value_le)
    469{
    470	unsigned level;
    471	int r = -ENODATA;
    472	__le64 internal_value_le;
    473	struct ro_spine spine;
    474
    475	init_ro_spine(&spine, info);
    476	for (level = 0; level < info->levels - 1u; level++) {
    477		r = btree_lookup_raw(&spine, root, keys[level],
    478				     lower_bound, rkey,
    479				     &internal_value_le, sizeof(uint64_t));
    480		if (r)
    481			goto out;
    482
    483		if (*rkey != keys[level]) {
    484			r = -ENODATA;
    485			goto out;
    486		}
    487
    488		root = le64_to_cpu(internal_value_le);
    489	}
    490
    491	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
    492out:
    493	exit_ro_spine(&spine);
    494	return r;
    495}
    496
    497EXPORT_SYMBOL_GPL(dm_btree_lookup_next);
    498
    499/*----------------------------------------------------------------*/
    500
    501/*
    502 * Copies entries from one region of a btree node to another.  The regions
    503 * must not overlap.
    504 */
    505static void copy_entries(struct btree_node *dest, unsigned dest_offset,
    506			 struct btree_node *src, unsigned src_offset,
    507			 unsigned count)
    508{
    509	size_t value_size = le32_to_cpu(dest->header.value_size);
    510	memcpy(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
    511	memcpy(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
    512}
    513
    514/*
     515 * Moves entries from one region of a btree node to another.  The regions
    516 * may overlap.
    517 */
    518static void move_entries(struct btree_node *dest, unsigned dest_offset,
    519			 struct btree_node *src, unsigned src_offset,
    520			 unsigned count)
    521{
    522	size_t value_size = le32_to_cpu(dest->header.value_size);
    523	memmove(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
    524	memmove(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
    525}
    526
    527/*
    528 * Erases the first 'count' entries of a btree node, shifting following
    529 * entries down into their place.
    530 */
    531static void shift_down(struct btree_node *n, unsigned count)
    532{
    533	move_entries(n, 0, n, count, le32_to_cpu(n->header.nr_entries) - count);
    534}
    535
    536/*
    537 * Moves entries in a btree node up 'count' places, making space for
    538 * new entries at the start of the node.
    539 */
    540static void shift_up(struct btree_node *n, unsigned count)
    541{
    542	move_entries(n, count, n, 0, le32_to_cpu(n->header.nr_entries));
    543}
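        /*
         * E.g. (illustrative) for a node currently holding { A, B, C, D }:
         *
         *	shift_down(n, 2):  { C, D, .., .. }
         *	shift_up(n, 2):	   { .., .., A, B, C, D }
         *
         * Slots marked '..' hold stale data; callers fill them in and/or fix
         * up header.nr_entries afterwards.
         */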
    544
    545/*
    546 * Redistributes entries between two btree nodes to make them
    547 * have similar numbers of entries.
    548 */
    549static void redistribute2(struct btree_node *left, struct btree_node *right)
    550{
    551	unsigned nr_left = le32_to_cpu(left->header.nr_entries);
    552	unsigned nr_right = le32_to_cpu(right->header.nr_entries);
    553	unsigned total = nr_left + nr_right;
    554	unsigned target_left = total / 2;
    555	unsigned target_right = total - target_left;
    556
    557	if (nr_left < target_left) {
    558		unsigned delta = target_left - nr_left;
    559		copy_entries(left, nr_left, right, 0, delta);
    560		shift_down(right, delta);
    561	} else if (nr_left > target_left) {
    562		unsigned delta = nr_left - target_left;
    563		if (nr_right)
    564			shift_up(right, delta);
    565		copy_entries(right, 0, left, target_left, delta);
    566	}
    567
    568	left->header.nr_entries = cpu_to_le32(target_left);
    569	right->header.nr_entries = cpu_to_le32(target_right);
    570}
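        /*
         * Worked example (illustrative): nr_left = 7, nr_right = 1 gives
         * total = 8 and targets of 4/4.  Since nr_left > target_left, the
         * right node is shifted up by delta = 3 and the last three entries
         * of the left node are copied across, leaving 4 entries on each side.
         */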
    571
    572/*
    573 * Redistribute entries between three nodes.  Assumes the central
    574 * node is empty.
    575 */
    576static void redistribute3(struct btree_node *left, struct btree_node *center,
    577			  struct btree_node *right)
    578{
    579	unsigned nr_left = le32_to_cpu(left->header.nr_entries);
    580	unsigned nr_center = le32_to_cpu(center->header.nr_entries);
    581	unsigned nr_right = le32_to_cpu(right->header.nr_entries);
    582	unsigned total, target_left, target_center, target_right;
    583
    584	BUG_ON(nr_center);
    585
    586	total = nr_left + nr_right;
    587	target_left = total / 3;
    588	target_center = (total - target_left) / 2;
    589	target_right = (total - target_left - target_center);
    590
    591	if (nr_left < target_left) {
    592		unsigned left_short = target_left - nr_left;
    593		copy_entries(left, nr_left, right, 0, left_short);
    594		copy_entries(center, 0, right, left_short, target_center);
    595		shift_down(right, nr_right - target_right);
    596
    597	} else if (nr_left < (target_left + target_center)) {
    598		unsigned left_to_center = nr_left - target_left;
    599		copy_entries(center, 0, left, target_left, left_to_center);
    600		copy_entries(center, left_to_center, right, 0, target_center - left_to_center);
    601		shift_down(right, nr_right - target_right);
    602
    603	} else {
    604		unsigned right_short = target_right - nr_right;
    605		shift_up(right, right_short);
    606		copy_entries(right, 0, left, nr_left - right_short, right_short);
    607		copy_entries(center, 0, left, target_left, nr_left - target_left);
    608	}
    609
    610	left->header.nr_entries = cpu_to_le32(target_left);
    611	center->header.nr_entries = cpu_to_le32(target_center);
    612	right->header.nr_entries = cpu_to_le32(target_right);
    613}
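        /*
         * Worked example (illustrative): nr_left = 2, nr_right = 10 gives
         * total = 12 and targets of 4/4/4.  The first branch runs with
         * left_short = 2: right[0..2) tops up the left node, right[2..6)
         * fills the center and shift_down(right, 6) leaves the final four
         * entries in the right node.
         */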
    614
    615/*
    616 * Splits a node by creating a sibling node and shifting half the nodes
    617 * contents across.  Assumes there is a parent node, and it has room for
    618 * another child.
    619 *
    620 * Before:
    621 *	  +--------+
    622 *	  | Parent |
    623 *	  +--------+
    624 *	     |
    625 *	     v
    626 *	+----------+
    627 *	| A ++++++ |
    628 *	+----------+
    629 *
    630 *
    631 * After:
    632 *		+--------+
    633 *		| Parent |
    634 *		+--------+
    635 *		  |	|
    636 *		  v	+------+
    637 *	    +---------+	       |
    638 *	    | A* +++  |	       v
    639 *	    +---------+	  +-------+
    640 *			  | B +++ |
    641 *			  +-------+
    642 *
    643 * Where A* is a shadow of A.
    644 */
    645static int split_one_into_two(struct shadow_spine *s, unsigned parent_index,
    646			      struct dm_btree_value_type *vt, uint64_t key)
    647{
    648	int r;
    649	struct dm_block *left, *right, *parent;
    650	struct btree_node *ln, *rn, *pn;
    651	__le64 location;
    652
    653	left = shadow_current(s);
    654
    655	r = new_block(s->info, &right);
    656	if (r < 0)
    657		return r;
    658
    659	ln = dm_block_data(left);
    660	rn = dm_block_data(right);
    661
    662	rn->header.flags = ln->header.flags;
    663	rn->header.nr_entries = cpu_to_le32(0);
    664	rn->header.max_entries = ln->header.max_entries;
    665	rn->header.value_size = ln->header.value_size;
    666	redistribute2(ln, rn);
    667
    668	/* patch up the parent */
    669	parent = shadow_parent(s);
    670	pn = dm_block_data(parent);
    671
    672	location = cpu_to_le64(dm_block_location(right));
    673	__dm_bless_for_disk(&location);
    674	r = insert_at(sizeof(__le64), pn, parent_index + 1,
    675		      le64_to_cpu(rn->keys[0]), &location);
    676	if (r) {
    677		unlock_block(s->info, right);
    678		return r;
    679	}
    680
    681	/* patch up the spine */
    682	if (key < le64_to_cpu(rn->keys[0])) {
    683		unlock_block(s->info, right);
    684		s->nodes[1] = left;
    685	} else {
    686		unlock_block(s->info, left);
    687		s->nodes[1] = right;
    688	}
    689
    690	return 0;
    691}
    692
    693/*
    694 * We often need to modify a sibling node.  This function shadows a particular
    695 * child of the given parent node.  Making sure to update the parent to point
    696 * to the new shadow.
    697 */
    698static int shadow_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
    699			struct btree_node *parent, unsigned index,
    700			struct dm_block **result)
    701{
    702	int r, inc;
    703	dm_block_t root;
    704	struct btree_node *node;
    705
    706	root = value64(parent, index);
    707
    708	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
    709			       result, &inc);
    710	if (r)
    711		return r;
    712
    713	node = dm_block_data(*result);
    714
    715	if (inc)
    716		inc_children(info->tm, node, vt);
    717
    718	*((__le64 *) value_ptr(parent, index)) =
    719		cpu_to_le64(dm_block_location(*result));
    720
    721	return 0;
    722}
    723
    724/*
    725 * Splits two nodes into three.  This is more work, but results in fuller
    726 * nodes, so saves metadata space.
    727 */
    728static int split_two_into_three(struct shadow_spine *s, unsigned parent_index,
     729				struct dm_btree_value_type *vt, uint64_t key)
    730{
    731	int r;
    732	unsigned middle_index;
    733	struct dm_block *left, *middle, *right, *parent;
    734	struct btree_node *ln, *rn, *mn, *pn;
    735	__le64 location;
    736
    737	parent = shadow_parent(s);
    738	pn = dm_block_data(parent);
    739
    740	if (parent_index == 0) {
    741		middle_index = 1;
    742		left = shadow_current(s);
    743		r = shadow_child(s->info, vt, pn, parent_index + 1, &right);
    744		if (r)
    745			return r;
    746	} else {
    747		middle_index = parent_index;
    748		right = shadow_current(s);
    749		r = shadow_child(s->info, vt, pn, parent_index - 1, &left);
    750		if (r)
    751			return r;
    752	}
    753
    754	r = new_block(s->info, &middle);
    755	if (r < 0)
    756		return r;
    757
    758	ln = dm_block_data(left);
    759	mn = dm_block_data(middle);
    760	rn = dm_block_data(right);
    761
    762	mn->header.nr_entries = cpu_to_le32(0);
    763	mn->header.flags = ln->header.flags;
    764	mn->header.max_entries = ln->header.max_entries;
    765	mn->header.value_size = ln->header.value_size;
    766
    767	redistribute3(ln, mn, rn);
    768
    769	/* patch up the parent */
    770	pn->keys[middle_index] = rn->keys[0];
    771	location = cpu_to_le64(dm_block_location(middle));
    772	__dm_bless_for_disk(&location);
    773	r = insert_at(sizeof(__le64), pn, middle_index,
    774		      le64_to_cpu(mn->keys[0]), &location);
    775	if (r) {
    776		if (shadow_current(s) != left)
    777			unlock_block(s->info, left);
    778
    779		unlock_block(s->info, middle);
    780
    781		if (shadow_current(s) != right)
    782			unlock_block(s->info, right);
    783
     784		return r;
    785	}
    786
    787
    788	/* patch up the spine */
    789	if (key < le64_to_cpu(mn->keys[0])) {
    790		unlock_block(s->info, middle);
    791		unlock_block(s->info, right);
    792		s->nodes[1] = left;
    793	} else if (key < le64_to_cpu(rn->keys[0])) {
    794		unlock_block(s->info, left);
    795		unlock_block(s->info, right);
    796		s->nodes[1] = middle;
    797	} else {
    798		unlock_block(s->info, left);
    799		unlock_block(s->info, middle);
    800		s->nodes[1] = right;
    801	}
    802
    803	return 0;
    804}
    805
    806/*----------------------------------------------------------------*/
    807
    808/*
    809 * Splits a node by creating two new children beneath the given node.
    810 *
    811 * Before:
    812 *	  +----------+
    813 *	  | A ++++++ |
    814 *	  +----------+
    815 *
    816 *
    817 * After:
    818 *	+------------+
    819 *	| A (shadow) |
    820 *	+------------+
    821 *	    |	|
    822 *   +------+	+----+
    823 *   |		     |
    824 *   v		     v
    825 * +-------+	 +-------+
    826 * | B +++ |	 | C +++ |
    827 * +-------+	 +-------+
    828 */
    829static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
    830{
    831	int r;
    832	size_t size;
    833	unsigned nr_left, nr_right;
    834	struct dm_block *left, *right, *new_parent;
    835	struct btree_node *pn, *ln, *rn;
    836	__le64 val;
    837
    838	new_parent = shadow_current(s);
    839
    840	pn = dm_block_data(new_parent);
    841	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
    842		sizeof(__le64) : s->info->value_type.size;
    843
    844	/* create & init the left block */
    845	r = new_block(s->info, &left);
    846	if (r < 0)
    847		return r;
    848
    849	ln = dm_block_data(left);
    850	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
    851
    852	ln->header.flags = pn->header.flags;
    853	ln->header.nr_entries = cpu_to_le32(nr_left);
    854	ln->header.max_entries = pn->header.max_entries;
    855	ln->header.value_size = pn->header.value_size;
    856	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
    857	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
    858
    859	/* create & init the right block */
    860	r = new_block(s->info, &right);
    861	if (r < 0) {
    862		unlock_block(s->info, left);
    863		return r;
    864	}
    865
    866	rn = dm_block_data(right);
    867	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
    868
    869	rn->header.flags = pn->header.flags;
    870	rn->header.nr_entries = cpu_to_le32(nr_right);
    871	rn->header.max_entries = pn->header.max_entries;
    872	rn->header.value_size = pn->header.value_size;
    873	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
    874	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
    875	       nr_right * size);
    876
    877	/* new_parent should just point to l and r now */
    878	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
    879	pn->header.nr_entries = cpu_to_le32(2);
    880	pn->header.max_entries = cpu_to_le32(
    881		calc_max_entries(sizeof(__le64),
    882				 dm_bm_block_size(
    883					 dm_tm_get_bm(s->info->tm))));
    884	pn->header.value_size = cpu_to_le32(sizeof(__le64));
    885
    886	val = cpu_to_le64(dm_block_location(left));
    887	__dm_bless_for_disk(&val);
    888	pn->keys[0] = ln->keys[0];
    889	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
    890
    891	val = cpu_to_le64(dm_block_location(right));
    892	__dm_bless_for_disk(&val);
    893	pn->keys[1] = rn->keys[0];
    894	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
    895
    896	unlock_block(s->info, left);
    897	unlock_block(s->info, right);
    898	return 0;
    899}
    900
    901/*----------------------------------------------------------------*/
    902
    903/*
    904 * Redistributes a node's entries with its left sibling.
    905 */
    906static int rebalance_left(struct shadow_spine *s, struct dm_btree_value_type *vt,
    907			  unsigned parent_index, uint64_t key)
    908{
    909	int r;
    910	struct dm_block *sib;
    911	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));
    912
    913	r = shadow_child(s->info, vt, parent, parent_index - 1, &sib);
    914	if (r)
    915		return r;
    916
    917	left = dm_block_data(sib);
    918	right = dm_block_data(shadow_current(s));
    919	redistribute2(left, right);
    920	*key_ptr(parent, parent_index) = right->keys[0];
    921
    922	if (key < le64_to_cpu(right->keys[0])) {
    923		unlock_block(s->info, s->nodes[1]);
    924		s->nodes[1] = sib;
    925	} else {
    926		unlock_block(s->info, sib);
    927	}
    928
    929	return 0;
    930}
    931
    932/*
     933 * Redistributes a node's entries with its right sibling.
    934 */
    935static int rebalance_right(struct shadow_spine *s, struct dm_btree_value_type *vt,
    936			   unsigned parent_index, uint64_t key)
    937{
    938	int r;
    939	struct dm_block *sib;
    940	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));
    941
    942	r = shadow_child(s->info, vt, parent, parent_index + 1, &sib);
    943	if (r)
    944		return r;
    945
    946	left = dm_block_data(shadow_current(s));
    947	right = dm_block_data(sib);
    948	redistribute2(left, right);
    949	*key_ptr(parent, parent_index + 1) = right->keys[0];
    950
    951	if (key < le64_to_cpu(right->keys[0])) {
    952		unlock_block(s->info, sib);
    953	} else {
    954		unlock_block(s->info, s->nodes[1]);
    955		s->nodes[1] = sib;
    956	}
    957
    958	return 0;
    959}
    960
    961/*
    962 * Returns the number of spare entries in a node.
    963 */
    964static int get_node_free_space(struct dm_btree_info *info, dm_block_t b, unsigned *space)
    965{
    966	int r;
    967	unsigned nr_entries;
    968	struct dm_block *block;
    969	struct btree_node *node;
    970
    971	r = bn_read_lock(info, b, &block);
    972	if (r)
    973		return r;
    974
    975	node = dm_block_data(block);
    976	nr_entries = le32_to_cpu(node->header.nr_entries);
    977	*space = le32_to_cpu(node->header.max_entries) - nr_entries;
    978
    979	unlock_block(info, block);
    980	return 0;
    981}
    982
    983/*
    984 * Make space in a node, either by moving some entries to a sibling,
    985 * or creating a new sibling node.  SPACE_THRESHOLD defines the minimum
    986 * number of free entries that must be in the sibling to make the move
    987 * worth while.  If the siblings are shared (eg, part of a snapshot),
    988 * then they are not touched, since this break sharing and so consume
    989 * more space than we save.
    990 */
    991#define SPACE_THRESHOLD 8
    992static int rebalance_or_split(struct shadow_spine *s, struct dm_btree_value_type *vt,
    993			      unsigned parent_index, uint64_t key)
    994{
    995	int r;
    996	struct btree_node *parent = dm_block_data(shadow_parent(s));
    997	unsigned nr_parent = le32_to_cpu(parent->header.nr_entries);
    998	unsigned free_space;
    999	int left_shared = 0, right_shared = 0;
   1000
   1001	/* Should we move entries to the left sibling? */
   1002	if (parent_index > 0) {
   1003		dm_block_t left_b = value64(parent, parent_index - 1);
   1004		r = dm_tm_block_is_shared(s->info->tm, left_b, &left_shared);
   1005		if (r)
   1006			return r;
   1007
   1008		if (!left_shared) {
   1009			r = get_node_free_space(s->info, left_b, &free_space);
   1010			if (r)
   1011				return r;
   1012
   1013			if (free_space >= SPACE_THRESHOLD)
   1014				return rebalance_left(s, vt, parent_index, key);
   1015		}
   1016	}
   1017
   1018	/* Should we move entries to the right sibling? */
   1019	if (parent_index < (nr_parent - 1)) {
   1020		dm_block_t right_b = value64(parent, parent_index + 1);
   1021		r = dm_tm_block_is_shared(s->info->tm, right_b, &right_shared);
   1022		if (r)
   1023			return r;
   1024
   1025		if (!right_shared) {
   1026			r = get_node_free_space(s->info, right_b, &free_space);
   1027			if (r)
   1028				return r;
   1029
   1030			if (free_space >= SPACE_THRESHOLD)
   1031				return rebalance_right(s, vt, parent_index, key);
   1032		}
   1033	}
   1034
   1035	/*
    1036	 * We need to split the node; normally we split two nodes
    1037	 * into three.  But when inserting a sequence that is either
    1038	 * monotonically increasing or decreasing, it's better to split
   1039	 * a single node into two.
   1040	 */
   1041	if (left_shared || right_shared || (nr_parent <= 2) ||
   1042	    (parent_index == 0) || (parent_index + 1 == nr_parent)) {
   1043		return split_one_into_two(s, parent_index, vt, key);
   1044	} else {
   1045		return split_two_into_three(s, parent_index, vt, key);
   1046	}
   1047}
   1048
   1049/*
   1050 * Does the node contain a particular key?
   1051 */
   1052static bool contains_key(struct btree_node *node, uint64_t key)
   1053{
   1054	int i = lower_bound(node, key);
   1055
   1056	if (i >= 0 && le64_to_cpu(node->keys[i]) == key)
   1057		return true;
   1058
   1059	return false;
   1060}
   1061
   1062/*
   1063 * In general we preemptively make sure there's a free entry in every
   1064 * node on the spine when doing an insert.  But we can avoid that with
   1065 * leaf nodes if we know it's an overwrite.
   1066 */
   1067static bool has_space_for_insert(struct btree_node *node, uint64_t key)
   1068{
   1069	if (node->header.nr_entries == node->header.max_entries) {
   1070		if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
   1071			/* we don't need space if it's an overwrite */
   1072			return contains_key(node, key);
   1073		}
   1074
   1075		return false;
   1076	}
   1077
   1078	return true;
   1079}
   1080
   1081static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
   1082			    struct dm_btree_value_type *vt,
   1083			    uint64_t key, unsigned *index)
   1084{
   1085	int r, i = *index, top = 1;
   1086	struct btree_node *node;
   1087
   1088	for (;;) {
   1089		r = shadow_step(s, root, vt);
   1090		if (r < 0)
   1091			return r;
   1092
   1093		node = dm_block_data(shadow_current(s));
   1094
   1095		/*
   1096		 * We have to patch up the parent node, ugly, but I don't
   1097		 * see a way to do this automatically as part of the spine
   1098		 * op.
   1099		 */
    1100		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary. */
   1101			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
   1102
   1103			__dm_bless_for_disk(&location);
   1104			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
   1105				    &location, sizeof(__le64));
   1106		}
   1107
   1108		node = dm_block_data(shadow_current(s));
   1109
   1110		if (!has_space_for_insert(node, key)) {
   1111			if (top)
   1112				r = btree_split_beneath(s, key);
   1113			else
   1114				r = rebalance_or_split(s, vt, i, key);
   1115
   1116			if (r < 0)
   1117				return r;
   1118
   1119			/* making space can cause the current node to change */
   1120			node = dm_block_data(shadow_current(s));
   1121		}
   1122
   1123		i = lower_bound(node, key);
   1124
   1125		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
   1126			break;
   1127
   1128		if (i < 0) {
   1129			/* change the bounds on the lowest key */
   1130			node->keys[0] = cpu_to_le64(key);
   1131			i = 0;
   1132		}
   1133
   1134		root = value64(node, i);
   1135		top = 0;
   1136	}
   1137
   1138	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
   1139		i++;
   1140
   1141	*index = i;
   1142	return 0;
   1143}
   1144
   1145static int __btree_get_overwrite_leaf(struct shadow_spine *s, dm_block_t root,
   1146				      uint64_t key, int *index)
   1147{
   1148	int r, i = -1;
   1149	struct btree_node *node;
   1150
   1151	*index = 0;
   1152	for (;;) {
   1153		r = shadow_step(s, root, &s->info->value_type);
   1154		if (r < 0)
   1155			return r;
   1156
   1157		node = dm_block_data(shadow_current(s));
   1158
   1159		/*
   1160		 * We have to patch up the parent node, ugly, but I don't
   1161		 * see a way to do this automatically as part of the spine
   1162		 * op.
   1163		 */
   1164		if (shadow_has_parent(s) && i >= 0) {
   1165			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
   1166
   1167			__dm_bless_for_disk(&location);
   1168			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
   1169				    &location, sizeof(__le64));
   1170		}
   1171
   1172		node = dm_block_data(shadow_current(s));
   1173		i = lower_bound(node, key);
   1174
   1175		BUG_ON(i < 0);
   1176		BUG_ON(i >= le32_to_cpu(node->header.nr_entries));
   1177
   1178		if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
   1179			if (key != le64_to_cpu(node->keys[i]))
   1180				return -EINVAL;
   1181			break;
   1182		}
   1183
   1184		root = value64(node, i);
   1185	}
   1186
   1187	*index = i;
   1188	return 0;
   1189}
   1190
   1191int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root,
   1192			     uint64_t key, int *index,
   1193			     dm_block_t *new_root, struct dm_block **leaf)
   1194{
   1195	int r;
   1196	struct shadow_spine spine;
   1197
   1198	BUG_ON(info->levels > 1);
   1199	init_shadow_spine(&spine, info);
   1200	r = __btree_get_overwrite_leaf(&spine, root, key, index);
   1201	if (!r) {
   1202		*new_root = shadow_root(&spine);
   1203		*leaf = shadow_current(&spine);
   1204
   1205		/*
   1206		 * Decrement the count so exit_shadow_spine() doesn't
   1207		 * unlock the leaf.
   1208		 */
   1209		spine.count--;
   1210	}
   1211	exit_shadow_spine(&spine);
   1212
   1213	return r;
   1214}
   1215
   1216static bool need_insert(struct btree_node *node, uint64_t *keys,
   1217			unsigned level, unsigned index)
   1218{
    1219	return ((index >= le32_to_cpu(node->header.nr_entries)) ||
   1220		(le64_to_cpu(node->keys[index]) != keys[level]));
   1221}
   1222
   1223static int insert(struct dm_btree_info *info, dm_block_t root,
   1224		  uint64_t *keys, void *value, dm_block_t *new_root,
   1225		  int *inserted)
   1226		  __dm_written_to_disk(value)
   1227{
   1228	int r;
   1229	unsigned level, index = -1, last_level = info->levels - 1;
   1230	dm_block_t block = root;
   1231	struct shadow_spine spine;
   1232	struct btree_node *n;
   1233	struct dm_btree_value_type le64_type;
   1234
   1235	init_le64_type(info->tm, &le64_type);
   1236	init_shadow_spine(&spine, info);
   1237
   1238	for (level = 0; level < (info->levels - 1); level++) {
   1239		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
   1240		if (r < 0)
   1241			goto bad;
   1242
   1243		n = dm_block_data(shadow_current(&spine));
   1244
   1245		if (need_insert(n, keys, level, index)) {
   1246			dm_block_t new_tree;
   1247			__le64 new_le;
   1248
   1249			r = dm_btree_empty(info, &new_tree);
   1250			if (r < 0)
   1251				goto bad;
   1252
   1253			new_le = cpu_to_le64(new_tree);
   1254			__dm_bless_for_disk(&new_le);
   1255
   1256			r = insert_at(sizeof(uint64_t), n, index,
   1257				      keys[level], &new_le);
   1258			if (r)
   1259				goto bad;
   1260		}
   1261
   1262		if (level < last_level)
   1263			block = value64(n, index);
   1264	}
   1265
   1266	r = btree_insert_raw(&spine, block, &info->value_type,
   1267			     keys[level], &index);
   1268	if (r < 0)
   1269		goto bad;
   1270
   1271	n = dm_block_data(shadow_current(&spine));
   1272
   1273	if (need_insert(n, keys, level, index)) {
   1274		if (inserted)
   1275			*inserted = 1;
   1276
   1277		r = insert_at(info->value_type.size, n, index,
   1278			      keys[level], value);
   1279		if (r)
   1280			goto bad_unblessed;
   1281	} else {
   1282		if (inserted)
   1283			*inserted = 0;
   1284
   1285		if (info->value_type.dec &&
   1286		    (!info->value_type.equal ||
   1287		     !info->value_type.equal(
   1288			     info->value_type.context,
   1289			     value_ptr(n, index),
   1290			     value))) {
   1291			info->value_type.dec(info->value_type.context,
   1292					     value_ptr(n, index), 1);
   1293		}
   1294		memcpy_disk(value_ptr(n, index),
   1295			    value, info->value_type.size);
   1296	}
   1297
   1298	*new_root = shadow_root(&spine);
   1299	exit_shadow_spine(&spine);
   1300
   1301	return 0;
   1302
   1303bad:
   1304	__dm_unbless_for_disk(value);
   1305bad_unblessed:
   1306	exit_shadow_spine(&spine);
   1307	return r;
   1308}
   1309
   1310int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
   1311		    uint64_t *keys, void *value, dm_block_t *new_root)
   1312		    __dm_written_to_disk(value)
   1313{
   1314	return insert(info, root, keys, value, new_root, NULL);
   1315}
   1316EXPORT_SYMBOL_GPL(dm_btree_insert);
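        /*
         * Insert sketch (hypothetical; a single level tree of __le64 values):
         *
         *	uint64_t key = 123;
         *	__le64 value_le = cpu_to_le64(456);
         *	dm_block_t new_root;
         *
         *	__dm_bless_for_disk(&value_le);
         *	r = dm_btree_insert(&info, root, &key, &value_le, &new_root);
         *
         * The tree is copy on write, so on success the caller must switch to
         * @new_root; inserting a key that already exists overwrites its value.
         */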
   1317
   1318int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
   1319			   uint64_t *keys, void *value, dm_block_t *new_root,
   1320			   int *inserted)
   1321			   __dm_written_to_disk(value)
   1322{
   1323	return insert(info, root, keys, value, new_root, inserted);
   1324}
   1325EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
   1326
   1327/*----------------------------------------------------------------*/
   1328
   1329static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
   1330		    uint64_t *result_key, dm_block_t *next_block)
   1331{
   1332	int i, r;
   1333	uint32_t flags;
   1334
   1335	do {
   1336		r = ro_step(s, block);
   1337		if (r < 0)
   1338			return r;
   1339
   1340		flags = le32_to_cpu(ro_node(s)->header.flags);
   1341		i = le32_to_cpu(ro_node(s)->header.nr_entries);
   1342		if (!i)
   1343			return -ENODATA;
   1344		else
   1345			i--;
   1346
   1347		if (find_highest)
   1348			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
   1349		else
   1350			*result_key = le64_to_cpu(ro_node(s)->keys[0]);
   1351
   1352		if (next_block || flags & INTERNAL_NODE) {
   1353			if (find_highest)
   1354				block = value64(ro_node(s), i);
   1355			else
   1356				block = value64(ro_node(s), 0);
   1357		}
   1358
   1359	} while (flags & INTERNAL_NODE);
   1360
   1361	if (next_block)
   1362		*next_block = block;
   1363	return 0;
   1364}
   1365
   1366static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
   1367			     bool find_highest, uint64_t *result_keys)
   1368{
   1369	int r = 0, count = 0, level;
   1370	struct ro_spine spine;
   1371
   1372	init_ro_spine(&spine, info);
   1373	for (level = 0; level < info->levels; level++) {
   1374		r = find_key(&spine, root, find_highest, result_keys + level,
   1375			     level == info->levels - 1 ? NULL : &root);
   1376		if (r == -ENODATA) {
   1377			r = 0;
   1378			break;
   1379
   1380		} else if (r)
   1381			break;
   1382
   1383		count++;
   1384	}
   1385	exit_ro_spine(&spine);
   1386
   1387	return r ? r : count;
   1388}
   1389
   1390int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
   1391			      uint64_t *result_keys)
   1392{
   1393	return dm_btree_find_key(info, root, true, result_keys);
   1394}
   1395EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
   1396
   1397int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
   1398			     uint64_t *result_keys)
   1399{
   1400	return dm_btree_find_key(info, root, false, result_keys);
   1401}
   1402EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
   1403
   1404/*----------------------------------------------------------------*/
   1405
   1406/*
   1407 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
    1408 * space.  Also, this only works for single-level trees.
   1409 */
   1410static int walk_node(struct dm_btree_info *info, dm_block_t block,
   1411		     int (*fn)(void *context, uint64_t *keys, void *leaf),
   1412		     void *context)
   1413{
   1414	int r;
   1415	unsigned i, nr;
   1416	struct dm_block *node;
   1417	struct btree_node *n;
   1418	uint64_t keys;
   1419
   1420	r = bn_read_lock(info, block, &node);
   1421	if (r)
   1422		return r;
   1423
   1424	n = dm_block_data(node);
   1425
   1426	nr = le32_to_cpu(n->header.nr_entries);
   1427	for (i = 0; i < nr; i++) {
   1428		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
   1429			r = walk_node(info, value64(n, i), fn, context);
   1430			if (r)
   1431				goto out;
   1432		} else {
   1433			keys = le64_to_cpu(*key_ptr(n, i));
   1434			r = fn(context, &keys, value_ptr(n, i));
   1435			if (r)
   1436				goto out;
   1437		}
   1438	}
   1439
   1440out:
   1441	dm_tm_unlock(info->tm, node);
   1442	return r;
   1443}
   1444
   1445int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
   1446		  int (*fn)(void *context, uint64_t *keys, void *leaf),
   1447		  void *context)
   1448{
   1449	BUG_ON(info->levels > 1);
   1450	return walk_node(info, root, fn, context);
   1451}
   1452EXPORT_SYMBOL_GPL(dm_btree_walk);
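        /*
         * Walk sketch (hypothetical callback that just counts leaf entries):
         *
         *	static int count_leaf(void *context, uint64_t *keys, void *leaf)
         *	{
         *		(*(unsigned *) context)++;
         *		return 0;
         *	}
         *
         *	unsigned nr = 0;
         *	r = dm_btree_walk(&info, root, count_leaf, &nr);
         *
         * A non-zero return from the callback aborts the walk and is
         * propagated to the caller.
         */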
   1453
   1454/*----------------------------------------------------------------*/
   1455
   1456static void prefetch_values(struct dm_btree_cursor *c)
   1457{
   1458	unsigned i, nr;
   1459	__le64 value_le;
   1460	struct cursor_node *n = c->nodes + c->depth - 1;
   1461	struct btree_node *bn = dm_block_data(n->b);
   1462	struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);
   1463
   1464	BUG_ON(c->info->value_type.size != sizeof(value_le));
   1465
   1466	nr = le32_to_cpu(bn->header.nr_entries);
   1467	for (i = 0; i < nr; i++) {
   1468		memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
   1469		dm_bm_prefetch(bm, le64_to_cpu(value_le));
   1470	}
   1471}
   1472
   1473static bool leaf_node(struct dm_btree_cursor *c)
   1474{
   1475	struct cursor_node *n = c->nodes + c->depth - 1;
   1476	struct btree_node *bn = dm_block_data(n->b);
   1477
   1478	return le32_to_cpu(bn->header.flags) & LEAF_NODE;
   1479}
   1480
   1481static int push_node(struct dm_btree_cursor *c, dm_block_t b)
   1482{
   1483	int r;
   1484	struct cursor_node *n = c->nodes + c->depth;
   1485
   1486	if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
   1487		DMERR("couldn't push cursor node, stack depth too high");
   1488		return -EINVAL;
   1489	}
   1490
   1491	r = bn_read_lock(c->info, b, &n->b);
   1492	if (r)
   1493		return r;
   1494
   1495	n->index = 0;
   1496	c->depth++;
   1497
   1498	if (c->prefetch_leaves || !leaf_node(c))
   1499		prefetch_values(c);
   1500
   1501	return 0;
   1502}
   1503
   1504static void pop_node(struct dm_btree_cursor *c)
   1505{
   1506	c->depth--;
   1507	unlock_block(c->info, c->nodes[c->depth].b);
   1508}
   1509
   1510static int inc_or_backtrack(struct dm_btree_cursor *c)
   1511{
   1512	struct cursor_node *n;
   1513	struct btree_node *bn;
   1514
   1515	for (;;) {
   1516		if (!c->depth)
   1517			return -ENODATA;
   1518
   1519		n = c->nodes + c->depth - 1;
   1520		bn = dm_block_data(n->b);
   1521
   1522		n->index++;
   1523		if (n->index < le32_to_cpu(bn->header.nr_entries))
   1524			break;
   1525
   1526		pop_node(c);
   1527	}
   1528
   1529	return 0;
   1530}
   1531
   1532static int find_leaf(struct dm_btree_cursor *c)
   1533{
   1534	int r = 0;
   1535	struct cursor_node *n;
   1536	struct btree_node *bn;
   1537	__le64 value_le;
   1538
   1539	for (;;) {
   1540		n = c->nodes + c->depth - 1;
   1541		bn = dm_block_data(n->b);
   1542
   1543		if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
   1544			break;
   1545
   1546		memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
   1547		r = push_node(c, le64_to_cpu(value_le));
   1548		if (r) {
   1549			DMERR("push_node failed");
   1550			break;
   1551		}
   1552	}
   1553
   1554	if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
   1555		return -ENODATA;
   1556
   1557	return r;
   1558}
   1559
   1560int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
   1561			  bool prefetch_leaves, struct dm_btree_cursor *c)
   1562{
   1563	int r;
   1564
   1565	c->info = info;
   1566	c->root = root;
   1567	c->depth = 0;
   1568	c->prefetch_leaves = prefetch_leaves;
   1569
   1570	r = push_node(c, root);
   1571	if (r)
   1572		return r;
   1573
   1574	return find_leaf(c);
   1575}
   1576EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);
   1577
   1578void dm_btree_cursor_end(struct dm_btree_cursor *c)
   1579{
   1580	while (c->depth)
   1581		pop_node(c);
   1582}
   1583EXPORT_SYMBOL_GPL(dm_btree_cursor_end);
   1584
   1585int dm_btree_cursor_next(struct dm_btree_cursor *c)
   1586{
   1587	int r = inc_or_backtrack(c);
   1588	if (!r) {
   1589		r = find_leaf(c);
   1590		if (r)
   1591			DMERR("find_leaf failed");
   1592	}
   1593
   1594	return r;
   1595}
   1596EXPORT_SYMBOL_GPL(dm_btree_cursor_next);
   1597
   1598int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
   1599{
   1600	int r = 0;
   1601
   1602	while (count-- && !r)
   1603		r = dm_btree_cursor_next(c);
   1604
   1605	return r;
   1606}
   1607EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);
   1608
   1609int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
   1610{
   1611	if (c->depth) {
   1612		struct cursor_node *n = c->nodes + c->depth - 1;
   1613		struct btree_node *bn = dm_block_data(n->b);
   1614
   1615		if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
   1616			return -EINVAL;
   1617
   1618		*key = le64_to_cpu(*key_ptr(bn, n->index));
   1619		memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
   1620		return 0;
   1621
   1622	} else
   1623		return -ENODATA;
   1624}
   1625EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);
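        /*
         * Cursor sketch (hypothetical; assumes 8 byte values, which is what
         * prefetch_values() expects):
         *
         *	struct dm_btree_cursor c;
         *	uint64_t key;
         *	__le64 value_le;
         *
         *	r = dm_btree_cursor_begin(&info, root, false, &c);
         *	while (!r) {
         *		r = dm_btree_cursor_get_value(&c, &key, &value_le);
         *		if (r)
         *			break;
         *		... use key and value_le ...
         *		r = dm_btree_cursor_next(&c);
         *	}
         *	dm_btree_cursor_end(&c);
         *
         * -ENODATA from begin or next just means the iteration is finished.
         */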