cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

ctree.c (129617B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
      4 */
      5
      6#include <linux/sched.h>
      7#include <linux/slab.h>
      8#include <linux/rbtree.h>
      9#include <linux/mm.h>
     10#include <linux/error-injection.h>
     11#include "ctree.h"
     12#include "disk-io.h"
     13#include "transaction.h"
     14#include "print-tree.h"
     15#include "locking.h"
     16#include "volumes.h"
     17#include "qgroup.h"
     18#include "tree-mod-log.h"
     19#include "tree-checker.h"
     20
     21static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
     22		      *root, struct btrfs_path *path, int level);
     23static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
     24		      const struct btrfs_key *ins_key, struct btrfs_path *path,
     25		      int data_size, int extend);
     26static int push_node_left(struct btrfs_trans_handle *trans,
     27			  struct extent_buffer *dst,
     28			  struct extent_buffer *src, int empty);
     29static int balance_node_right(struct btrfs_trans_handle *trans,
     30			      struct extent_buffer *dst_buf,
     31			      struct extent_buffer *src_buf);
     32static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
     33		    int level, int slot);
     34
     35static const struct btrfs_csums {
     36	u16		size;
     37	const char	name[10];
     38	const char	driver[12];
     39} btrfs_csums[] = {
     40	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
     41	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
     42	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
     43	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
     44				     .driver = "blake2b-256" },
     45};
     46
     47int btrfs_super_csum_size(const struct btrfs_super_block *s)
     48{
     49	u16 t = btrfs_super_csum_type(s);
     50	/*
     51	 * csum type is validated at mount time
     52	 */
     53	return btrfs_csums[t].size;
     54}
     55
     56const char *btrfs_super_csum_name(u16 csum_type)
     57{
     58	/* csum type is validated at mount time */
     59	return btrfs_csums[csum_type].name;
     60}
     61
     62/*
     63 * Return driver name if defined, otherwise the name that's also a valid driver
     64 * name
     65 */
     66const char *btrfs_super_csum_driver(u16 csum_type)
     67{
     68	/* csum type is validated at mount time */
     69	return btrfs_csums[csum_type].driver[0] ?
     70		btrfs_csums[csum_type].driver :
     71		btrfs_csums[csum_type].name;
     72}
     73
     74size_t __attribute_const__ btrfs_get_num_csums(void)
     75{
     76	return ARRAY_SIZE(btrfs_csums);
     77}
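
/*
 * A minimal userspace sketch of the checksum lookup-table pattern used by
 * btrfs_csums[] above. The sizes and names mirror the kernel array; the
 * struct name, csum_driver() helper and main() are illustrative only.
 */
#include <stdio.h>
#include <stddef.h>

struct csum_desc {
	unsigned short	size;
	const char	name[10];
	const char	driver[12];
};

static const struct csum_desc csums[] = {
	{ .size = 4,  .name = "crc32c" },
	{ .size = 8,  .name = "xxhash64" },
	{ .size = 32, .name = "sha256" },
	{ .size = 32, .name = "blake2b", .driver = "blake2b-256" },
};

/* Same fallback rule as btrfs_super_csum_driver(): use .name unless a
 * distinct .driver string was set. */
static const char *csum_driver(size_t type)
{
	return csums[type].driver[0] ? csums[type].driver : csums[type].name;
}

int main(void)
{
	for (size_t i = 0; i < sizeof(csums) / sizeof(csums[0]); i++)
		printf("%-8s size=%2u driver=%s\n", csums[i].name,
		       (unsigned int)csums[i].size, csum_driver(i));
	return 0;
}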
     78
     79struct btrfs_path *btrfs_alloc_path(void)
     80{
     81	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
     82}
     83
     84/* this also releases the path */
     85void btrfs_free_path(struct btrfs_path *p)
     86{
     87	if (!p)
     88		return;
     89	btrfs_release_path(p);
     90	kmem_cache_free(btrfs_path_cachep, p);
     91}
     92
     93/*
     94 * path release drops references on the extent buffers in the path
     95 * and it drops any locks held by this path
     96 *
      97 * It is safe to call this on paths that hold no locks or extent buffers.
     98 */
     99noinline void btrfs_release_path(struct btrfs_path *p)
    100{
    101	int i;
    102
    103	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
    104		p->slots[i] = 0;
    105		if (!p->nodes[i])
    106			continue;
    107		if (p->locks[i]) {
    108			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
    109			p->locks[i] = 0;
    110		}
    111		free_extent_buffer(p->nodes[i]);
    112		p->nodes[i] = NULL;
    113	}
    114}
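
/*
 * The canonical btrfs_path lifecycle implied by the two helpers above -- a
 * hedged sketch, not code from this file; error handling trimmed to the
 * minimum.
 */
#if 0	/* illustrative only */
static int example_path_lifecycle(struct btrfs_root *root,
				  const struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Read-only lookup: no transaction handle, ins_len = 0, cow = 0. */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);

	/*
	 * btrfs_free_path() calls btrfs_release_path() first, dropping all
	 * locks and extent buffer references, then frees the path itself.
	 */
	btrfs_free_path(path);
	return ret;
}
#endif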
    115
    116/*
    117 * safely gets a reference on the root node of a tree.  A lock
    118 * is not taken, so a concurrent writer may put a different node
    119 * at the root of the tree.  See btrfs_lock_root_node for the
    120 * looping required.
    121 *
    122 * The extent buffer returned by this has a reference taken, so
    123 * it won't disappear.  It may stop being the root of the tree
    124 * at any time because there are no locks held.
    125 */
    126struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
    127{
    128	struct extent_buffer *eb;
    129
    130	while (1) {
    131		rcu_read_lock();
    132		eb = rcu_dereference(root->node);
    133
    134		/*
     135		 * RCU really hurts here: we could free up the root node because
     136		 * it was COWed, but we may not get the new root node yet, so do
     137		 * the inc_not_zero dance; if it doesn't work, synchronize_rcu
     138		 * and try again.
    139		 */
    140		if (atomic_inc_not_zero(&eb->refs)) {
    141			rcu_read_unlock();
    142			break;
    143		}
    144		rcu_read_unlock();
    145		synchronize_rcu();
    146	}
    147	return eb;
    148}
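
/*
 * The retry loop above is the generic idiom for taking a counted reference
 * on an RCU-protected pointer. A hedged skeleton of the pattern follows;
 * the names (obj, head, ptr, refs) are placeholders, not from this file.
 */
#if 0	/* illustrative only */
	while (1) {
		rcu_read_lock();
		obj = rcu_dereference(head->ptr);
		if (atomic_inc_not_zero(&obj->refs)) {
			rcu_read_unlock();
			break;			/* got a stable reference */
		}
		rcu_read_unlock();
		synchronize_rcu();		/* let the freeing side finish */
	}
#endif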
    149
    150/*
     151 * Cow-only roots (not-shareable trees: everything that is not a subvolume or
     152 * reloc root) just get put onto a simple dirty list.  The transaction code
     153 * walks this list to make sure they get properly updated on disk.
    154 */
    155static void add_root_to_dirty_list(struct btrfs_root *root)
    156{
    157	struct btrfs_fs_info *fs_info = root->fs_info;
    158
    159	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
    160	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
    161		return;
    162
    163	spin_lock(&fs_info->trans_lock);
    164	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
    165		/* Want the extent tree to be the last on the list */
    166		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
    167			list_move_tail(&root->dirty_list,
    168				       &fs_info->dirty_cowonly_roots);
    169		else
    170			list_move(&root->dirty_list,
    171				  &fs_info->dirty_cowonly_roots);
    172	}
    173	spin_unlock(&fs_info->trans_lock);
    174}
    175
    176/*
    177 * used by snapshot creation to make a copy of a root for a tree with
    178 * a given objectid.  The buffer with the new root node is returned in
     179 * cow_ret, and this function returns zero on success or a negative error code.
    180 */
    181int btrfs_copy_root(struct btrfs_trans_handle *trans,
    182		      struct btrfs_root *root,
    183		      struct extent_buffer *buf,
    184		      struct extent_buffer **cow_ret, u64 new_root_objectid)
    185{
    186	struct btrfs_fs_info *fs_info = root->fs_info;
    187	struct extent_buffer *cow;
    188	int ret = 0;
    189	int level;
    190	struct btrfs_disk_key disk_key;
    191
    192	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
    193		trans->transid != fs_info->running_transaction->transid);
    194	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
    195		trans->transid != root->last_trans);
    196
    197	level = btrfs_header_level(buf);
    198	if (level == 0)
    199		btrfs_item_key(buf, &disk_key, 0);
    200	else
    201		btrfs_node_key(buf, &disk_key, 0);
    202
    203	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
    204				     &disk_key, level, buf->start, 0,
    205				     BTRFS_NESTING_NEW_ROOT);
    206	if (IS_ERR(cow))
    207		return PTR_ERR(cow);
    208
    209	copy_extent_buffer_full(cow, buf);
    210	btrfs_set_header_bytenr(cow, cow->start);
    211	btrfs_set_header_generation(cow, trans->transid);
    212	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
    213	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
    214				     BTRFS_HEADER_FLAG_RELOC);
    215	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
    216		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
    217	else
    218		btrfs_set_header_owner(cow, new_root_objectid);
    219
    220	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
    221
    222	WARN_ON(btrfs_header_generation(buf) > trans->transid);
    223	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
    224		ret = btrfs_inc_ref(trans, root, cow, 1);
    225	else
    226		ret = btrfs_inc_ref(trans, root, cow, 0);
    227	if (ret) {
    228		btrfs_tree_unlock(cow);
    229		free_extent_buffer(cow);
    230		btrfs_abort_transaction(trans, ret);
    231		return ret;
    232	}
    233
    234	btrfs_mark_buffer_dirty(cow);
    235	*cow_ret = cow;
    236	return 0;
    237}
    238
    239/*
    240 * check if the tree block can be shared by multiple trees
    241 */
    242int btrfs_block_can_be_shared(struct btrfs_root *root,
    243			      struct extent_buffer *buf)
    244{
    245	/*
    246	 * Tree blocks not in shareable trees and tree roots are never shared.
    247	 * If a block was allocated after the last snapshot and the block was
    248	 * not allocated by tree relocation, we know the block is not shared.
    249	 */
    250	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
    251	    buf != root->node && buf != root->commit_root &&
    252	    (btrfs_header_generation(buf) <=
    253	     btrfs_root_last_snapshot(&root->root_item) ||
    254	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
    255		return 1;
    256
    257	return 0;
    258}
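
/*
 * The sharing test above, restated over plain booleans -- a hedged sketch of
 * when a block may be referenced by more than one tree; the demo function is
 * illustrative only.
 */
#include <stdbool.h>

static bool block_can_be_shared_demo(bool shareable_root, bool is_root_node,
				     bool newer_than_last_snapshot,
				     bool reloc_flag)
{
	/*
	 * Only shareable (subvolume) trees share blocks, never via the tree
	 * root or commit root themselves, and only for blocks that predate
	 * the last snapshot or were produced by tree relocation.
	 */
	return shareable_root && !is_root_node &&
	       (!newer_than_last_snapshot || reloc_flag);
}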
    259
    260static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
    261				       struct btrfs_root *root,
    262				       struct extent_buffer *buf,
    263				       struct extent_buffer *cow,
    264				       int *last_ref)
    265{
    266	struct btrfs_fs_info *fs_info = root->fs_info;
    267	u64 refs;
    268	u64 owner;
    269	u64 flags;
    270	u64 new_flags = 0;
    271	int ret;
    272
    273	/*
    274	 * Backrefs update rules:
    275	 *
    276	 * Always use full backrefs for extent pointers in tree block
    277	 * allocated by tree relocation.
    278	 *
    279	 * If a shared tree block is no longer referenced by its owner
    280	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
    281	 * use full backrefs for extent pointers in tree block.
    282	 *
     283	 * If a tree block is being relocated
     284	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
     285	 * use full backrefs for extent pointers in tree block.
     286	 * The reason for this is that some operations (such as drop tree)
     287	 * are only allowed on blocks that use full backrefs.
    288	 */
    289
    290	if (btrfs_block_can_be_shared(root, buf)) {
    291		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
    292					       btrfs_header_level(buf), 1,
    293					       &refs, &flags);
    294		if (ret)
    295			return ret;
    296		if (refs == 0) {
    297			ret = -EROFS;
    298			btrfs_handle_fs_error(fs_info, ret, NULL);
    299			return ret;
    300		}
    301	} else {
    302		refs = 1;
    303		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
    304		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
    305			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
    306		else
    307			flags = 0;
    308	}
    309
    310	owner = btrfs_header_owner(buf);
    311	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
    312	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
    313
    314	if (refs > 1) {
    315		if ((owner == root->root_key.objectid ||
    316		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
    317		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
    318			ret = btrfs_inc_ref(trans, root, buf, 1);
    319			if (ret)
    320				return ret;
    321
    322			if (root->root_key.objectid ==
    323			    BTRFS_TREE_RELOC_OBJECTID) {
    324				ret = btrfs_dec_ref(trans, root, buf, 0);
    325				if (ret)
    326					return ret;
    327				ret = btrfs_inc_ref(trans, root, cow, 1);
    328				if (ret)
    329					return ret;
    330			}
    331			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
    332		} else {
    333
    334			if (root->root_key.objectid ==
    335			    BTRFS_TREE_RELOC_OBJECTID)
    336				ret = btrfs_inc_ref(trans, root, cow, 1);
    337			else
    338				ret = btrfs_inc_ref(trans, root, cow, 0);
    339			if (ret)
    340				return ret;
    341		}
    342		if (new_flags != 0) {
    343			int level = btrfs_header_level(buf);
    344
    345			ret = btrfs_set_disk_extent_flags(trans, buf,
    346							  new_flags, level);
    347			if (ret)
    348				return ret;
    349		}
    350	} else {
    351		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
    352			if (root->root_key.objectid ==
    353			    BTRFS_TREE_RELOC_OBJECTID)
    354				ret = btrfs_inc_ref(trans, root, cow, 1);
    355			else
    356				ret = btrfs_inc_ref(trans, root, cow, 0);
    357			if (ret)
    358				return ret;
    359			ret = btrfs_dec_ref(trans, root, buf, 1);
    360			if (ret)
    361				return ret;
    362		}
    363		btrfs_clean_tree_block(buf);
    364		*last_ref = 1;
    365	}
    366	return 0;
    367}
    368
    369/*
    370 * does the dirty work in cow of a single block.  The parent block (if
    371 * supplied) is updated to point to the new cow copy.  The new buffer is marked
    372 * dirty and returned locked.  If you modify the block it needs to be marked
    373 * dirty again.
    374 *
    375 * search_start -- an allocation hint for the new block
    376 *
    377 * empty_size -- a hint that you plan on doing more cow.  This is the size in
    378 * bytes the allocator should try to find free next to the block it returns.
    379 * This is just a hint and may be ignored by the allocator.
    380 */
    381static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
    382			     struct btrfs_root *root,
    383			     struct extent_buffer *buf,
    384			     struct extent_buffer *parent, int parent_slot,
    385			     struct extent_buffer **cow_ret,
    386			     u64 search_start, u64 empty_size,
    387			     enum btrfs_lock_nesting nest)
    388{
    389	struct btrfs_fs_info *fs_info = root->fs_info;
    390	struct btrfs_disk_key disk_key;
    391	struct extent_buffer *cow;
    392	int level, ret;
    393	int last_ref = 0;
    394	int unlock_orig = 0;
    395	u64 parent_start = 0;
    396
    397	if (*cow_ret == buf)
    398		unlock_orig = 1;
    399
    400	btrfs_assert_tree_write_locked(buf);
    401
    402	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
    403		trans->transid != fs_info->running_transaction->transid);
    404	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
    405		trans->transid != root->last_trans);
    406
    407	level = btrfs_header_level(buf);
    408
    409	if (level == 0)
    410		btrfs_item_key(buf, &disk_key, 0);
    411	else
    412		btrfs_node_key(buf, &disk_key, 0);
    413
    414	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
    415		parent_start = parent->start;
    416
    417	cow = btrfs_alloc_tree_block(trans, root, parent_start,
    418				     root->root_key.objectid, &disk_key, level,
    419				     search_start, empty_size, nest);
    420	if (IS_ERR(cow))
    421		return PTR_ERR(cow);
    422
    423	/* cow is set to blocking by btrfs_init_new_buffer */
    424
    425	copy_extent_buffer_full(cow, buf);
    426	btrfs_set_header_bytenr(cow, cow->start);
    427	btrfs_set_header_generation(cow, trans->transid);
    428	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
    429	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
    430				     BTRFS_HEADER_FLAG_RELOC);
    431	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
    432		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
    433	else
    434		btrfs_set_header_owner(cow, root->root_key.objectid);
    435
    436	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
    437
    438	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
    439	if (ret) {
    440		btrfs_tree_unlock(cow);
    441		free_extent_buffer(cow);
    442		btrfs_abort_transaction(trans, ret);
    443		return ret;
    444	}
    445
    446	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
    447		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
    448		if (ret) {
    449			btrfs_tree_unlock(cow);
    450			free_extent_buffer(cow);
    451			btrfs_abort_transaction(trans, ret);
    452			return ret;
    453		}
    454	}
    455
    456	if (buf == root->node) {
    457		WARN_ON(parent && parent != buf);
    458		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
    459		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
    460			parent_start = buf->start;
    461
    462		atomic_inc(&cow->refs);
    463		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
    464		BUG_ON(ret < 0);
    465		rcu_assign_pointer(root->node, cow);
    466
    467		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
    468				      parent_start, last_ref);
    469		free_extent_buffer(buf);
    470		add_root_to_dirty_list(root);
    471	} else {
    472		WARN_ON(trans->transid != btrfs_header_generation(parent));
    473		btrfs_tree_mod_log_insert_key(parent, parent_slot,
    474					      BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
    475		btrfs_set_node_blockptr(parent, parent_slot,
    476					cow->start);
    477		btrfs_set_node_ptr_generation(parent, parent_slot,
    478					      trans->transid);
    479		btrfs_mark_buffer_dirty(parent);
    480		if (last_ref) {
    481			ret = btrfs_tree_mod_log_free_eb(buf);
    482			if (ret) {
    483				btrfs_tree_unlock(cow);
    484				free_extent_buffer(cow);
    485				btrfs_abort_transaction(trans, ret);
    486				return ret;
    487			}
    488		}
    489		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
    490				      parent_start, last_ref);
    491	}
    492	if (unlock_orig)
    493		btrfs_tree_unlock(buf);
    494	free_extent_buffer_stale(buf);
    495	btrfs_mark_buffer_dirty(cow);
    496	*cow_ret = cow;
    497	return 0;
    498}
    499
    500static inline int should_cow_block(struct btrfs_trans_handle *trans,
    501				   struct btrfs_root *root,
    502				   struct extent_buffer *buf)
    503{
    504	if (btrfs_is_testing(root->fs_info))
    505		return 0;
    506
    507	/* Ensure we can see the FORCE_COW bit */
    508	smp_mb__before_atomic();
    509
    510	/*
    511	 * We do not need to cow a block if
    512	 * 1) this block is not created or changed in this transaction;
    513	 * 2) this block does not belong to TREE_RELOC tree;
    514	 * 3) the root is not forced COW.
    515	 *
    516	 * What is forced COW:
    517	 *    when we create snapshot during committing the transaction,
    518	 *    after we've finished copying src root, we must COW the shared
    519	 *    block to ensure the metadata consistency.
    520	 */
    521	if (btrfs_header_generation(buf) == trans->transid &&
    522	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
    523	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
    524	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
    525	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
    526		return 0;
    527	return 1;
    528}
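
/*
 * The inverted condition in should_cow_block() above is dense; here it is
 * restated as a hedged standalone predicate over plain flags, for
 * readability only.
 */
#include <stdbool.h>

static bool needs_cow_demo(bool created_this_transaction, bool written,
			   bool reloc_flag, bool is_reloc_tree, bool force_cow)
{
	/* A block may be modified in place only if all of these hold. */
	if (created_this_transaction && !written &&
	    !(!is_reloc_tree && reloc_flag) && !force_cow)
		return false;
	return true;
}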
    529
    530/*
    531 * cows a single block, see __btrfs_cow_block for the real work.
    532 * This version of it has extra checks so that a block isn't COWed more than
    533 * once per transaction, as long as it hasn't been written yet
    534 */
    535noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
    536		    struct btrfs_root *root, struct extent_buffer *buf,
    537		    struct extent_buffer *parent, int parent_slot,
    538		    struct extent_buffer **cow_ret,
    539		    enum btrfs_lock_nesting nest)
    540{
    541	struct btrfs_fs_info *fs_info = root->fs_info;
    542	u64 search_start;
    543	int ret;
    544
    545	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
    546		btrfs_err(fs_info,
    547			"COW'ing blocks on a fs root that's being dropped");
    548
    549	if (trans->transaction != fs_info->running_transaction)
    550		WARN(1, KERN_CRIT "trans %llu running %llu\n",
    551		       trans->transid,
    552		       fs_info->running_transaction->transid);
    553
    554	if (trans->transid != fs_info->generation)
    555		WARN(1, KERN_CRIT "trans %llu running %llu\n",
    556		       trans->transid, fs_info->generation);
    557
    558	if (!should_cow_block(trans, root, buf)) {
    559		*cow_ret = buf;
    560		return 0;
    561	}
    562
    563	search_start = buf->start & ~((u64)SZ_1G - 1);
    564
    565	/*
    566	 * Before CoWing this block for later modification, check if it's
    567	 * the subtree root and do the delayed subtree trace if needed.
    568	 *
     569	 * Also, we don't care about the error, as it's handled internally.
    570	 */
    571	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
    572	ret = __btrfs_cow_block(trans, root, buf, parent,
    573				 parent_slot, cow_ret, search_start, 0, nest);
    574
    575	trace_btrfs_cow_block(root, buf, *cow_ret);
    576
    577	return ret;
    578}
    579ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
    580
    581/*
    582 * helper function for defrag to decide if two blocks pointed to by a
    583 * node are actually close by
    584 */
    585static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
    586{
    587	if (blocknr < other && other - (blocknr + blocksize) < 32768)
    588		return 1;
    589	if (blocknr > other && blocknr - (other + blocksize) < 32768)
    590		return 1;
    591	return 0;
    592}
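
/*
 * A worked example of the 32 KiB proximity window used by close_blocks()
 * above; the demo copy and the sample numbers are illustrative only.
 */
#include <assert.h>
#include <stdint.h>

static int close_blocks_demo(uint64_t blocknr, uint64_t other, uint32_t blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

int main(void)
{
	/* With 16 KiB nodes, a 16 KiB gap is close; a 48 KiB gap is not. */
	assert(close_blocks_demo(0x10000, 0x18000, 0x4000) == 1);
	assert(close_blocks_demo(0x10000, 0x20000, 0x4000) == 0);
	return 0;
}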
    593
    594#ifdef __LITTLE_ENDIAN
    595
    596/*
    597 * Compare two keys, on little-endian the disk order is same as CPU order and
    598 * we can avoid the conversion.
    599 */
    600static int comp_keys(const struct btrfs_disk_key *disk_key,
    601		     const struct btrfs_key *k2)
    602{
    603	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
    604
    605	return btrfs_comp_cpu_keys(k1, k2);
    606}
    607
    608#else
    609
    610/*
    611 * compare two keys in a memcmp fashion
    612 */
    613static int comp_keys(const struct btrfs_disk_key *disk,
    614		     const struct btrfs_key *k2)
    615{
    616	struct btrfs_key k1;
    617
    618	btrfs_disk_key_to_cpu(&k1, disk);
    619
    620	return btrfs_comp_cpu_keys(&k1, k2);
    621}
    622#endif
    623
    624/*
     625 * same as comp_keys, only with two struct btrfs_key arguments
    626 */
    627int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
    628{
    629	if (k1->objectid > k2->objectid)
    630		return 1;
    631	if (k1->objectid < k2->objectid)
    632		return -1;
    633	if (k1->type > k2->type)
    634		return 1;
    635	if (k1->type < k2->type)
    636		return -1;
    637	if (k1->offset > k2->offset)
    638		return 1;
    639	if (k1->offset < k2->offset)
    640		return -1;
    641	return 0;
    642}
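
/*
 * A hedged standalone illustration of the (objectid, type, offset)
 * lexicographic order implemented by btrfs_comp_cpu_keys() above; the demo
 * struct and sample keys are illustrative only.
 */
#include <assert.h>
#include <stdint.h>

struct demo_key { uint64_t objectid; uint8_t type; uint64_t offset; };

static int comp_demo(const struct demo_key *k1, const struct demo_key *k2)
{
	if (k1->objectid != k2->objectid)
		return k1->objectid > k2->objectid ? 1 : -1;
	if (k1->type != k2->type)
		return k1->type > k2->type ? 1 : -1;
	if (k1->offset != k2->offset)
		return k1->offset > k2->offset ? 1 : -1;
	return 0;
}

int main(void)
{
	struct demo_key a = { 256, 1, 0 }, b = { 256, 1, 4096 };

	assert(comp_demo(&a, &b) == -1);	/* same objectid and type, lower offset */
	assert(comp_demo(&b, &a) == 1);
	assert(comp_demo(&a, &a) == 0);
	return 0;
}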
    643
    644/*
    645 * this is used by the defrag code to go through all the
    646 * leaves pointed to by a node and reallocate them so that
    647 * disk order is close to key order
    648 */
    649int btrfs_realloc_node(struct btrfs_trans_handle *trans,
    650		       struct btrfs_root *root, struct extent_buffer *parent,
    651		       int start_slot, u64 *last_ret,
    652		       struct btrfs_key *progress)
    653{
    654	struct btrfs_fs_info *fs_info = root->fs_info;
    655	struct extent_buffer *cur;
    656	u64 blocknr;
    657	u64 search_start = *last_ret;
    658	u64 last_block = 0;
    659	u64 other;
    660	u32 parent_nritems;
    661	int end_slot;
    662	int i;
    663	int err = 0;
    664	u32 blocksize;
    665	int progress_passed = 0;
    666	struct btrfs_disk_key disk_key;
    667
    668	WARN_ON(trans->transaction != fs_info->running_transaction);
    669	WARN_ON(trans->transid != fs_info->generation);
    670
    671	parent_nritems = btrfs_header_nritems(parent);
    672	blocksize = fs_info->nodesize;
    673	end_slot = parent_nritems - 1;
    674
    675	if (parent_nritems <= 1)
    676		return 0;
    677
    678	for (i = start_slot; i <= end_slot; i++) {
    679		int close = 1;
    680
    681		btrfs_node_key(parent, &disk_key, i);
    682		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
    683			continue;
    684
    685		progress_passed = 1;
    686		blocknr = btrfs_node_blockptr(parent, i);
    687		if (last_block == 0)
    688			last_block = blocknr;
    689
    690		if (i > 0) {
    691			other = btrfs_node_blockptr(parent, i - 1);
    692			close = close_blocks(blocknr, other, blocksize);
    693		}
    694		if (!close && i < end_slot) {
    695			other = btrfs_node_blockptr(parent, i + 1);
    696			close = close_blocks(blocknr, other, blocksize);
    697		}
    698		if (close) {
    699			last_block = blocknr;
    700			continue;
    701		}
    702
    703		cur = btrfs_read_node_slot(parent, i);
    704		if (IS_ERR(cur))
    705			return PTR_ERR(cur);
    706		if (search_start == 0)
    707			search_start = last_block;
    708
    709		btrfs_tree_lock(cur);
    710		err = __btrfs_cow_block(trans, root, cur, parent, i,
    711					&cur, search_start,
    712					min(16 * blocksize,
    713					    (end_slot - i) * blocksize),
    714					BTRFS_NESTING_COW);
    715		if (err) {
    716			btrfs_tree_unlock(cur);
    717			free_extent_buffer(cur);
    718			break;
    719		}
    720		search_start = cur->start;
    721		last_block = cur->start;
    722		*last_ret = search_start;
    723		btrfs_tree_unlock(cur);
    724		free_extent_buffer(cur);
    725	}
    726	return err;
    727}
    728
    729/*
    730 * Search for a key in the given extent_buffer.
    731 *
    732 * The lower boundary for the search is specified by the slot number @low. Use a
    733 * value of 0 to search over the whole extent buffer.
    734 *
    735 * The slot in the extent buffer is returned via @slot. If the key exists in the
    736 * extent buffer, then @slot will point to the slot where the key is, otherwise
    737 * it points to the slot where you would insert the key.
    738 *
    739 * Slot may point to the total number of items (i.e. one position beyond the last
    740 * key) if the key is bigger than the last key in the extent buffer.
    741 */
    742static noinline int generic_bin_search(struct extent_buffer *eb, int low,
    743				       const struct btrfs_key *key, int *slot)
    744{
    745	unsigned long p;
    746	int item_size;
    747	int high = btrfs_header_nritems(eb);
    748	int ret;
    749	const int key_size = sizeof(struct btrfs_disk_key);
    750
    751	if (low > high) {
    752		btrfs_err(eb->fs_info,
    753		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
    754			  __func__, low, high, eb->start,
    755			  btrfs_header_owner(eb), btrfs_header_level(eb));
    756		return -EINVAL;
    757	}
    758
    759	if (btrfs_header_level(eb) == 0) {
    760		p = offsetof(struct btrfs_leaf, items);
    761		item_size = sizeof(struct btrfs_item);
    762	} else {
    763		p = offsetof(struct btrfs_node, ptrs);
    764		item_size = sizeof(struct btrfs_key_ptr);
    765	}
    766
    767	while (low < high) {
    768		unsigned long oip;
    769		unsigned long offset;
    770		struct btrfs_disk_key *tmp;
    771		struct btrfs_disk_key unaligned;
    772		int mid;
    773
    774		mid = (low + high) / 2;
    775		offset = p + mid * item_size;
    776		oip = offset_in_page(offset);
    777
    778		if (oip + key_size <= PAGE_SIZE) {
    779			const unsigned long idx = get_eb_page_index(offset);
    780			char *kaddr = page_address(eb->pages[idx]);
    781
    782			oip = get_eb_offset_in_page(eb, offset);
    783			tmp = (struct btrfs_disk_key *)(kaddr + oip);
    784		} else {
    785			read_extent_buffer(eb, &unaligned, offset, key_size);
    786			tmp = &unaligned;
    787		}
    788
    789		ret = comp_keys(tmp, key);
    790
    791		if (ret < 0)
    792			low = mid + 1;
    793		else if (ret > 0)
    794			high = mid;
    795		else {
    796			*slot = mid;
    797			return 0;
    798		}
    799	}
    800	*slot = low;
    801	return 1;
    802}
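
/*
 * generic_bin_search() above is a classic lower-bound binary search. A
 * hedged userspace equivalent over a sorted int array, keeping the same
 * return convention (0 = exact match, 1 = insertion position in *slot):
 */
#include <assert.h>

static int bin_search_demo(const int *arr, int nritems, int key, int *slot)
{
	int low = 0, high = nritems;

	while (low < high) {
		int mid = (low + high) / 2;

		if (arr[mid] < key)
			low = mid + 1;
		else if (arr[mid] > key)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;	/* may equal nritems: insert past the last key */
	return 1;
}

int main(void)
{
	const int keys[] = { 10, 20, 30 };
	int slot;

	assert(bin_search_demo(keys, 3, 20, &slot) == 0 && slot == 1);
	assert(bin_search_demo(keys, 3, 25, &slot) == 1 && slot == 2);
	assert(bin_search_demo(keys, 3, 99, &slot) == 1 && slot == 3);
	return 0;
}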
    803
    804/*
    805 * Simple binary search on an extent buffer. Works for both leaves and nodes, and
    806 * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
    807 */
    808int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
    809		     int *slot)
    810{
    811	return generic_bin_search(eb, 0, key, slot);
    812}
    813
    814static void root_add_used(struct btrfs_root *root, u32 size)
    815{
    816	spin_lock(&root->accounting_lock);
    817	btrfs_set_root_used(&root->root_item,
    818			    btrfs_root_used(&root->root_item) + size);
    819	spin_unlock(&root->accounting_lock);
    820}
    821
    822static void root_sub_used(struct btrfs_root *root, u32 size)
    823{
    824	spin_lock(&root->accounting_lock);
    825	btrfs_set_root_used(&root->root_item,
    826			    btrfs_root_used(&root->root_item) - size);
    827	spin_unlock(&root->accounting_lock);
    828}
    829
     830/* given a node and slot number, this reads the block it points to.  The
    831 * extent buffer is returned with a reference taken (but unlocked).
    832 */
    833struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
    834					   int slot)
    835{
    836	int level = btrfs_header_level(parent);
    837	struct extent_buffer *eb;
    838	struct btrfs_key first_key;
    839
    840	if (slot < 0 || slot >= btrfs_header_nritems(parent))
    841		return ERR_PTR(-ENOENT);
    842
    843	BUG_ON(level == 0);
    844
    845	btrfs_node_key_to_cpu(parent, &first_key, slot);
    846	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
    847			     btrfs_header_owner(parent),
    848			     btrfs_node_ptr_generation(parent, slot),
    849			     level - 1, &first_key);
    850	if (IS_ERR(eb))
    851		return eb;
    852	if (!extent_buffer_uptodate(eb)) {
    853		free_extent_buffer(eb);
    854		return ERR_PTR(-EIO);
    855	}
    856
    857	return eb;
    858}
    859
    860/*
    861 * node level balancing, used to make sure nodes are in proper order for
    862 * item deletion.  We balance from the top down, so we have to make sure
     863 * that a deletion won't leave a node completely empty later on.
    864 */
    865static noinline int balance_level(struct btrfs_trans_handle *trans,
    866			 struct btrfs_root *root,
    867			 struct btrfs_path *path, int level)
    868{
    869	struct btrfs_fs_info *fs_info = root->fs_info;
    870	struct extent_buffer *right = NULL;
    871	struct extent_buffer *mid;
    872	struct extent_buffer *left = NULL;
    873	struct extent_buffer *parent = NULL;
    874	int ret = 0;
    875	int wret;
    876	int pslot;
    877	int orig_slot = path->slots[level];
    878	u64 orig_ptr;
    879
    880	ASSERT(level > 0);
    881
    882	mid = path->nodes[level];
    883
    884	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
    885	WARN_ON(btrfs_header_generation(mid) != trans->transid);
    886
    887	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
    888
    889	if (level < BTRFS_MAX_LEVEL - 1) {
    890		parent = path->nodes[level + 1];
    891		pslot = path->slots[level + 1];
    892	}
    893
    894	/*
    895	 * deal with the case where there is only one pointer in the root
    896	 * by promoting the node below to a root
    897	 */
    898	if (!parent) {
    899		struct extent_buffer *child;
    900
    901		if (btrfs_header_nritems(mid) != 1)
    902			return 0;
    903
    904		/* promote the child to a root */
    905		child = btrfs_read_node_slot(mid, 0);
    906		if (IS_ERR(child)) {
    907			ret = PTR_ERR(child);
    908			btrfs_handle_fs_error(fs_info, ret, NULL);
    909			goto enospc;
    910		}
    911
    912		btrfs_tree_lock(child);
    913		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
    914				      BTRFS_NESTING_COW);
    915		if (ret) {
    916			btrfs_tree_unlock(child);
    917			free_extent_buffer(child);
    918			goto enospc;
    919		}
    920
    921		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
    922		BUG_ON(ret < 0);
    923		rcu_assign_pointer(root->node, child);
    924
    925		add_root_to_dirty_list(root);
    926		btrfs_tree_unlock(child);
    927
    928		path->locks[level] = 0;
    929		path->nodes[level] = NULL;
    930		btrfs_clean_tree_block(mid);
    931		btrfs_tree_unlock(mid);
    932		/* once for the path */
    933		free_extent_buffer(mid);
    934
    935		root_sub_used(root, mid->len);
    936		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
    937		/* once for the root ptr */
    938		free_extent_buffer_stale(mid);
    939		return 0;
    940	}
    941	if (btrfs_header_nritems(mid) >
    942	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
    943		return 0;
    944
    945	left = btrfs_read_node_slot(parent, pslot - 1);
    946	if (IS_ERR(left))
    947		left = NULL;
    948
    949	if (left) {
    950		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
    951		wret = btrfs_cow_block(trans, root, left,
    952				       parent, pslot - 1, &left,
    953				       BTRFS_NESTING_LEFT_COW);
    954		if (wret) {
    955			ret = wret;
    956			goto enospc;
    957		}
    958	}
    959
    960	right = btrfs_read_node_slot(parent, pslot + 1);
    961	if (IS_ERR(right))
    962		right = NULL;
    963
    964	if (right) {
    965		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
    966		wret = btrfs_cow_block(trans, root, right,
    967				       parent, pslot + 1, &right,
    968				       BTRFS_NESTING_RIGHT_COW);
    969		if (wret) {
    970			ret = wret;
    971			goto enospc;
    972		}
    973	}
    974
    975	/* first, try to make some room in the middle buffer */
    976	if (left) {
    977		orig_slot += btrfs_header_nritems(left);
    978		wret = push_node_left(trans, left, mid, 1);
    979		if (wret < 0)
    980			ret = wret;
    981	}
    982
    983	/*
     984	 * then try to empty the rightmost buffer into the middle
    985	 */
    986	if (right) {
    987		wret = push_node_left(trans, mid, right, 1);
    988		if (wret < 0 && wret != -ENOSPC)
    989			ret = wret;
    990		if (btrfs_header_nritems(right) == 0) {
    991			btrfs_clean_tree_block(right);
    992			btrfs_tree_unlock(right);
    993			del_ptr(root, path, level + 1, pslot + 1);
    994			root_sub_used(root, right->len);
    995			btrfs_free_tree_block(trans, btrfs_root_id(root), right,
    996					      0, 1);
    997			free_extent_buffer_stale(right);
    998			right = NULL;
    999		} else {
   1000			struct btrfs_disk_key right_key;
   1001			btrfs_node_key(right, &right_key, 0);
   1002			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
   1003					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
   1004			BUG_ON(ret < 0);
   1005			btrfs_set_node_key(parent, &right_key, pslot + 1);
   1006			btrfs_mark_buffer_dirty(parent);
   1007		}
   1008	}
   1009	if (btrfs_header_nritems(mid) == 1) {
   1010		/*
   1011		 * we're not allowed to leave a node with one item in the
   1012		 * tree during a delete.  A deletion from lower in the tree
   1013		 * could try to delete the only pointer in this node.
   1014		 * So, pull some keys from the left.
   1015		 * There has to be a left pointer at this point because
   1016		 * otherwise we would have pulled some pointers from the
   1017		 * right
   1018		 */
   1019		if (!left) {
   1020			ret = -EROFS;
   1021			btrfs_handle_fs_error(fs_info, ret, NULL);
   1022			goto enospc;
   1023		}
   1024		wret = balance_node_right(trans, mid, left);
   1025		if (wret < 0) {
   1026			ret = wret;
   1027			goto enospc;
   1028		}
   1029		if (wret == 1) {
   1030			wret = push_node_left(trans, left, mid, 1);
   1031			if (wret < 0)
   1032				ret = wret;
   1033		}
   1034		BUG_ON(wret == 1);
   1035	}
   1036	if (btrfs_header_nritems(mid) == 0) {
   1037		btrfs_clean_tree_block(mid);
   1038		btrfs_tree_unlock(mid);
   1039		del_ptr(root, path, level + 1, pslot);
   1040		root_sub_used(root, mid->len);
   1041		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
   1042		free_extent_buffer_stale(mid);
   1043		mid = NULL;
   1044	} else {
   1045		/* update the parent key to reflect our changes */
   1046		struct btrfs_disk_key mid_key;
   1047		btrfs_node_key(mid, &mid_key, 0);
   1048		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
   1049				BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
   1050		BUG_ON(ret < 0);
   1051		btrfs_set_node_key(parent, &mid_key, pslot);
   1052		btrfs_mark_buffer_dirty(parent);
   1053	}
   1054
   1055	/* update the path */
   1056	if (left) {
   1057		if (btrfs_header_nritems(left) > orig_slot) {
   1058			atomic_inc(&left->refs);
   1059			/* left was locked after cow */
   1060			path->nodes[level] = left;
   1061			path->slots[level + 1] -= 1;
   1062			path->slots[level] = orig_slot;
   1063			if (mid) {
   1064				btrfs_tree_unlock(mid);
   1065				free_extent_buffer(mid);
   1066			}
   1067		} else {
   1068			orig_slot -= btrfs_header_nritems(left);
   1069			path->slots[level] = orig_slot;
   1070		}
   1071	}
   1072	/* double check we haven't messed things up */
   1073	if (orig_ptr !=
   1074	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
   1075		BUG();
   1076enospc:
   1077	if (right) {
   1078		btrfs_tree_unlock(right);
   1079		free_extent_buffer(right);
   1080	}
   1081	if (left) {
   1082		if (path->nodes[level] != left)
   1083			btrfs_tree_unlock(left);
   1084		free_extent_buffer(left);
   1085	}
   1086	return ret;
   1087}
   1088
   1089/* Node balancing for insertion.  Here we only split or push nodes around
   1090 * when they are completely full.  This is also done top down, so we
   1091 * have to be pessimistic.
   1092 */
   1093static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
   1094					  struct btrfs_root *root,
   1095					  struct btrfs_path *path, int level)
   1096{
   1097	struct btrfs_fs_info *fs_info = root->fs_info;
   1098	struct extent_buffer *right = NULL;
   1099	struct extent_buffer *mid;
   1100	struct extent_buffer *left = NULL;
   1101	struct extent_buffer *parent = NULL;
   1102	int ret = 0;
   1103	int wret;
   1104	int pslot;
   1105	int orig_slot = path->slots[level];
   1106
   1107	if (level == 0)
   1108		return 1;
   1109
   1110	mid = path->nodes[level];
   1111	WARN_ON(btrfs_header_generation(mid) != trans->transid);
   1112
   1113	if (level < BTRFS_MAX_LEVEL - 1) {
   1114		parent = path->nodes[level + 1];
   1115		pslot = path->slots[level + 1];
   1116	}
   1117
   1118	if (!parent)
   1119		return 1;
   1120
   1121	left = btrfs_read_node_slot(parent, pslot - 1);
   1122	if (IS_ERR(left))
   1123		left = NULL;
   1124
   1125	/* first, try to make some room in the middle buffer */
   1126	if (left) {
   1127		u32 left_nr;
   1128
   1129		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
   1130
   1131		left_nr = btrfs_header_nritems(left);
   1132		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
   1133			wret = 1;
   1134		} else {
   1135			ret = btrfs_cow_block(trans, root, left, parent,
   1136					      pslot - 1, &left,
   1137					      BTRFS_NESTING_LEFT_COW);
   1138			if (ret)
   1139				wret = 1;
   1140			else {
   1141				wret = push_node_left(trans, left, mid, 0);
   1142			}
   1143		}
   1144		if (wret < 0)
   1145			ret = wret;
   1146		if (wret == 0) {
   1147			struct btrfs_disk_key disk_key;
   1148			orig_slot += left_nr;
   1149			btrfs_node_key(mid, &disk_key, 0);
   1150			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
   1151					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
   1152			BUG_ON(ret < 0);
   1153			btrfs_set_node_key(parent, &disk_key, pslot);
   1154			btrfs_mark_buffer_dirty(parent);
   1155			if (btrfs_header_nritems(left) > orig_slot) {
   1156				path->nodes[level] = left;
   1157				path->slots[level + 1] -= 1;
   1158				path->slots[level] = orig_slot;
   1159				btrfs_tree_unlock(mid);
   1160				free_extent_buffer(mid);
   1161			} else {
   1162				orig_slot -=
   1163					btrfs_header_nritems(left);
   1164				path->slots[level] = orig_slot;
   1165				btrfs_tree_unlock(left);
   1166				free_extent_buffer(left);
   1167			}
   1168			return 0;
   1169		}
   1170		btrfs_tree_unlock(left);
   1171		free_extent_buffer(left);
   1172	}
   1173	right = btrfs_read_node_slot(parent, pslot + 1);
   1174	if (IS_ERR(right))
   1175		right = NULL;
   1176
   1177	/*
    1178	 * then try to empty the rightmost buffer into the middle
   1179	 */
   1180	if (right) {
   1181		u32 right_nr;
   1182
   1183		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
   1184
   1185		right_nr = btrfs_header_nritems(right);
   1186		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
   1187			wret = 1;
   1188		} else {
   1189			ret = btrfs_cow_block(trans, root, right,
   1190					      parent, pslot + 1,
   1191					      &right, BTRFS_NESTING_RIGHT_COW);
   1192			if (ret)
   1193				wret = 1;
   1194			else {
   1195				wret = balance_node_right(trans, right, mid);
   1196			}
   1197		}
   1198		if (wret < 0)
   1199			ret = wret;
   1200		if (wret == 0) {
   1201			struct btrfs_disk_key disk_key;
   1202
   1203			btrfs_node_key(right, &disk_key, 0);
   1204			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
   1205					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
   1206			BUG_ON(ret < 0);
   1207			btrfs_set_node_key(parent, &disk_key, pslot + 1);
   1208			btrfs_mark_buffer_dirty(parent);
   1209
   1210			if (btrfs_header_nritems(mid) <= orig_slot) {
   1211				path->nodes[level] = right;
   1212				path->slots[level + 1] += 1;
   1213				path->slots[level] = orig_slot -
   1214					btrfs_header_nritems(mid);
   1215				btrfs_tree_unlock(mid);
   1216				free_extent_buffer(mid);
   1217			} else {
   1218				btrfs_tree_unlock(right);
   1219				free_extent_buffer(right);
   1220			}
   1221			return 0;
   1222		}
   1223		btrfs_tree_unlock(right);
   1224		free_extent_buffer(right);
   1225	}
   1226	return 1;
   1227}
   1228
   1229/*
   1230 * readahead one full node of leaves, finding things that are close
   1231 * to the block in 'slot', and triggering ra on them.
   1232 */
   1233static void reada_for_search(struct btrfs_fs_info *fs_info,
   1234			     struct btrfs_path *path,
   1235			     int level, int slot, u64 objectid)
   1236{
   1237	struct extent_buffer *node;
   1238	struct btrfs_disk_key disk_key;
   1239	u32 nritems;
   1240	u64 search;
   1241	u64 target;
   1242	u64 nread = 0;
   1243	u64 nread_max;
   1244	u32 nr;
   1245	u32 blocksize;
   1246	u32 nscan = 0;
   1247
   1248	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
   1249		return;
   1250
   1251	if (!path->nodes[level])
   1252		return;
   1253
   1254	node = path->nodes[level];
   1255
   1256	/*
   1257	 * Since the time between visiting leaves is much shorter than the time
   1258	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
   1259	 * much IO at once (possibly random).
   1260	 */
   1261	if (path->reada == READA_FORWARD_ALWAYS) {
   1262		if (level > 1)
   1263			nread_max = node->fs_info->nodesize;
   1264		else
   1265			nread_max = SZ_128K;
   1266	} else {
   1267		nread_max = SZ_64K;
   1268	}
   1269
   1270	search = btrfs_node_blockptr(node, slot);
   1271	blocksize = fs_info->nodesize;
   1272	if (path->reada != READA_FORWARD_ALWAYS) {
   1273		struct extent_buffer *eb;
   1274
   1275		eb = find_extent_buffer(fs_info, search);
   1276		if (eb) {
   1277			free_extent_buffer(eb);
   1278			return;
   1279		}
   1280	}
   1281
   1282	target = search;
   1283
   1284	nritems = btrfs_header_nritems(node);
   1285	nr = slot;
   1286
   1287	while (1) {
   1288		if (path->reada == READA_BACK) {
   1289			if (nr == 0)
   1290				break;
   1291			nr--;
   1292		} else if (path->reada == READA_FORWARD ||
   1293			   path->reada == READA_FORWARD_ALWAYS) {
   1294			nr++;
   1295			if (nr >= nritems)
   1296				break;
   1297		}
   1298		if (path->reada == READA_BACK && objectid) {
   1299			btrfs_node_key(node, &disk_key, nr);
   1300			if (btrfs_disk_key_objectid(&disk_key) != objectid)
   1301				break;
   1302		}
   1303		search = btrfs_node_blockptr(node, nr);
   1304		if (path->reada == READA_FORWARD_ALWAYS ||
   1305		    (search <= target && target - search <= 65536) ||
   1306		    (search > target && search - target <= 65536)) {
   1307			btrfs_readahead_node_child(node, nr);
   1308			nread += blocksize;
   1309		}
   1310		nscan++;
   1311		if (nread > nread_max || nscan > 32)
   1312			break;
   1313	}
   1314}
   1315
   1316static noinline void reada_for_balance(struct btrfs_path *path, int level)
   1317{
   1318	struct extent_buffer *parent;
   1319	int slot;
   1320	int nritems;
   1321
   1322	parent = path->nodes[level + 1];
   1323	if (!parent)
   1324		return;
   1325
   1326	nritems = btrfs_header_nritems(parent);
   1327	slot = path->slots[level + 1];
   1328
   1329	if (slot > 0)
   1330		btrfs_readahead_node_child(parent, slot - 1);
   1331	if (slot + 1 < nritems)
   1332		btrfs_readahead_node_child(parent, slot + 1);
   1333}
   1334
   1335
   1336/*
   1337 * when we walk down the tree, it is usually safe to unlock the higher layers
   1338 * in the tree.  The exceptions are when our path goes through slot 0, because
   1339 * operations on the tree might require changing key pointers higher up in the
   1340 * tree.
   1341 *
   1342 * callers might also have set path->keep_locks, which tells this code to keep
   1343 * the lock if the path points to the last slot in the block.  This is part of
   1344 * walking through the tree, and selecting the next slot in the higher block.
   1345 *
   1346 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
   1347 * if lowest_unlock is 1, level 0 won't be unlocked
   1348 */
   1349static noinline void unlock_up(struct btrfs_path *path, int level,
   1350			       int lowest_unlock, int min_write_lock_level,
   1351			       int *write_lock_level)
   1352{
   1353	int i;
   1354	int skip_level = level;
   1355	bool check_skip = true;
   1356
   1357	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
   1358		if (!path->nodes[i])
   1359			break;
   1360		if (!path->locks[i])
   1361			break;
   1362
   1363		if (check_skip) {
   1364			if (path->slots[i] == 0) {
   1365				skip_level = i + 1;
   1366				continue;
   1367			}
   1368
   1369			if (path->keep_locks) {
   1370				u32 nritems;
   1371
   1372				nritems = btrfs_header_nritems(path->nodes[i]);
   1373				if (nritems < 1 || path->slots[i] >= nritems - 1) {
   1374					skip_level = i + 1;
   1375					continue;
   1376				}
   1377			}
   1378		}
   1379
   1380		if (i >= lowest_unlock && i > skip_level) {
   1381			check_skip = false;
   1382			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
   1383			path->locks[i] = 0;
   1384			if (write_lock_level &&
   1385			    i > min_write_lock_level &&
   1386			    i <= *write_lock_level) {
   1387				*write_lock_level = i - 1;
   1388			}
   1389		}
   1390	}
   1391}
   1392
   1393/*
   1394 * Helper function for btrfs_search_slot() and other functions that do a search
   1395 * on a btree. The goal is to find a tree block in the cache (the radix tree at
   1396 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
   1397 * its pages from disk.
   1398 *
   1399 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
   1400 * whole btree search, starting again from the current root node.
   1401 */
   1402static int
   1403read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
   1404		      struct extent_buffer **eb_ret, int level, int slot,
   1405		      const struct btrfs_key *key)
   1406{
   1407	struct btrfs_fs_info *fs_info = root->fs_info;
   1408	u64 blocknr;
   1409	u64 gen;
   1410	struct extent_buffer *tmp;
   1411	struct btrfs_key first_key;
   1412	int ret;
   1413	int parent_level;
   1414	bool unlock_up;
   1415
   1416	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
   1417	blocknr = btrfs_node_blockptr(*eb_ret, slot);
   1418	gen = btrfs_node_ptr_generation(*eb_ret, slot);
   1419	parent_level = btrfs_header_level(*eb_ret);
   1420	btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
   1421
   1422	/*
   1423	 * If we need to read an extent buffer from disk and we are holding locks
   1424	 * on upper level nodes, we unlock all the upper nodes before reading the
   1425	 * extent buffer, and then return -EAGAIN to the caller as it needs to
   1426	 * restart the search. We don't release the lock on the current level
   1427	 * because we need to walk this node to figure out which blocks to read.
   1428	 */
   1429	tmp = find_extent_buffer(fs_info, blocknr);
   1430	if (tmp) {
   1431		if (p->reada == READA_FORWARD_ALWAYS)
   1432			reada_for_search(fs_info, p, level, slot, key->objectid);
   1433
   1434		/* first we do an atomic uptodate check */
   1435		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
   1436			/*
   1437			 * Do extra check for first_key, eb can be stale due to
   1438			 * being cached, read from scrub, or have multiple
   1439			 * parents (shared tree blocks).
   1440			 */
   1441			if (btrfs_verify_level_key(tmp,
   1442					parent_level - 1, &first_key, gen)) {
   1443				free_extent_buffer(tmp);
   1444				return -EUCLEAN;
   1445			}
   1446			*eb_ret = tmp;
   1447			return 0;
   1448		}
   1449
   1450		if (unlock_up)
   1451			btrfs_unlock_up_safe(p, level + 1);
   1452
   1453		/* now we're allowed to do a blocking uptodate check */
   1454		ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key);
   1455		if (ret) {
   1456			free_extent_buffer(tmp);
   1457			btrfs_release_path(p);
   1458			return -EIO;
   1459		}
   1460		if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
   1461			free_extent_buffer(tmp);
   1462			btrfs_release_path(p);
   1463			return -EUCLEAN;
   1464		}
   1465
   1466		if (unlock_up)
   1467			ret = -EAGAIN;
   1468
   1469		goto out;
   1470	}
   1471
   1472	if (unlock_up) {
   1473		btrfs_unlock_up_safe(p, level + 1);
   1474		ret = -EAGAIN;
   1475	} else {
   1476		ret = 0;
   1477	}
   1478
   1479	if (p->reada != READA_NONE)
   1480		reada_for_search(fs_info, p, level, slot, key->objectid);
   1481
   1482	tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
   1483			      gen, parent_level - 1, &first_key);
   1484	if (IS_ERR(tmp)) {
   1485		btrfs_release_path(p);
   1486		return PTR_ERR(tmp);
   1487	}
   1488	/*
   1489	 * If the read above didn't mark this buffer up to date,
   1490	 * it will never end up being up to date.  Set ret to EIO now
   1491	 * and give up so that our caller doesn't loop forever
   1492	 * on our EAGAINs.
   1493	 */
   1494	if (!extent_buffer_uptodate(tmp))
   1495		ret = -EIO;
   1496
   1497out:
   1498	if (ret == 0) {
   1499		*eb_ret = tmp;
   1500	} else {
   1501		free_extent_buffer(tmp);
   1502		btrfs_release_path(p);
   1503	}
   1504
   1505	return ret;
   1506}
   1507
   1508/*
   1509 * helper function for btrfs_search_slot.  This does all of the checks
   1510 * for node-level blocks and does any balancing required based on
   1511 * the ins_len.
   1512 *
   1513 * If no extra work was required, zero is returned.  If we had to
   1514 * drop the path, -EAGAIN is returned and btrfs_search_slot must
   1515 * start over
   1516 */
   1517static int
   1518setup_nodes_for_search(struct btrfs_trans_handle *trans,
   1519		       struct btrfs_root *root, struct btrfs_path *p,
   1520		       struct extent_buffer *b, int level, int ins_len,
   1521		       int *write_lock_level)
   1522{
   1523	struct btrfs_fs_info *fs_info = root->fs_info;
   1524	int ret = 0;
   1525
   1526	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
   1527	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
   1528
   1529		if (*write_lock_level < level + 1) {
   1530			*write_lock_level = level + 1;
   1531			btrfs_release_path(p);
   1532			return -EAGAIN;
   1533		}
   1534
   1535		reada_for_balance(p, level);
   1536		ret = split_node(trans, root, p, level);
   1537
   1538		b = p->nodes[level];
   1539	} else if (ins_len < 0 && btrfs_header_nritems(b) <
   1540		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
   1541
   1542		if (*write_lock_level < level + 1) {
   1543			*write_lock_level = level + 1;
   1544			btrfs_release_path(p);
   1545			return -EAGAIN;
   1546		}
   1547
   1548		reada_for_balance(p, level);
   1549		ret = balance_level(trans, root, p, level);
   1550		if (ret)
   1551			return ret;
   1552
   1553		b = p->nodes[level];
   1554		if (!b) {
   1555			btrfs_release_path(p);
   1556			return -EAGAIN;
   1557		}
   1558		BUG_ON(btrfs_header_nritems(b) == 1);
   1559	}
   1560	return ret;
   1561}
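
/*
 * The two balancing triggers above, restated as a hedged standalone
 * predicate; "capacity" stands in for BTRFS_NODEPTRS_PER_BLOCK(). Returns
 * 1 for "split the node" (nearly full before an insert), -1 for "rebalance"
 * (under half full before a delete), 0 for nothing to do.
 */
#include <stdbool.h>

static int node_balance_action_demo(int nritems, int capacity, int ins_len,
				    bool search_for_split)
{
	if ((search_for_split || ins_len > 0) && nritems >= capacity - 3)
		return 1;
	if (ins_len < 0 && nritems < capacity / 2)
		return -1;
	return 0;
}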
   1562
   1563int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
   1564		u64 iobjectid, u64 ioff, u8 key_type,
   1565		struct btrfs_key *found_key)
   1566{
   1567	int ret;
   1568	struct btrfs_key key;
   1569	struct extent_buffer *eb;
   1570
   1571	ASSERT(path);
   1572	ASSERT(found_key);
   1573
   1574	key.type = key_type;
   1575	key.objectid = iobjectid;
   1576	key.offset = ioff;
   1577
   1578	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
   1579	if (ret < 0)
   1580		return ret;
   1581
   1582	eb = path->nodes[0];
   1583	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
   1584		ret = btrfs_next_leaf(fs_root, path);
   1585		if (ret)
   1586			return ret;
   1587		eb = path->nodes[0];
   1588	}
   1589
   1590	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
   1591	if (found_key->type != key.type ||
   1592			found_key->objectid != key.objectid)
   1593		return 1;
   1594
   1595	return 0;
   1596}
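
/*
 * A hedged usage sketch for btrfs_find_item() above; the wrapper function,
 * its name and its error handling are illustrative, not from this file.
 */
#if 0	/* illustrative only */
static int example_lookup_root_item(struct btrfs_root *fs_root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key found_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Returns 0 on an exact match, 1 if not found, < 0 on error. */
	ret = btrfs_find_item(fs_root, path, objectid, 0,
			      BTRFS_ROOT_ITEM_KEY, &found_key);
	btrfs_free_path(path);
	return ret;
}
#endif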
   1597
   1598static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
   1599							struct btrfs_path *p,
   1600							int write_lock_level)
   1601{
   1602	struct extent_buffer *b;
   1603	int root_lock = 0;
   1604	int level = 0;
   1605
   1606	if (p->search_commit_root) {
   1607		b = root->commit_root;
   1608		atomic_inc(&b->refs);
   1609		level = btrfs_header_level(b);
   1610		/*
   1611		 * Ensure that all callers have set skip_locking when
   1612		 * p->search_commit_root = 1.
   1613		 */
   1614		ASSERT(p->skip_locking == 1);
   1615
   1616		goto out;
   1617	}
   1618
   1619	if (p->skip_locking) {
   1620		b = btrfs_root_node(root);
   1621		level = btrfs_header_level(b);
   1622		goto out;
   1623	}
   1624
   1625	/* We try very hard to do read locks on the root */
   1626	root_lock = BTRFS_READ_LOCK;
   1627
   1628	/*
   1629	 * If the level is set to maximum, we can skip trying to get the read
   1630	 * lock.
   1631	 */
   1632	if (write_lock_level < BTRFS_MAX_LEVEL) {
   1633		/*
   1634		 * We don't know the level of the root node until we actually
   1635		 * have it read locked
   1636		 */
   1637		b = btrfs_read_lock_root_node(root);
   1638		level = btrfs_header_level(b);
   1639		if (level > write_lock_level)
   1640			goto out;
   1641
   1642		/* Whoops, must trade for write lock */
   1643		btrfs_tree_read_unlock(b);
   1644		free_extent_buffer(b);
   1645	}
   1646
   1647	b = btrfs_lock_root_node(root);
   1648	root_lock = BTRFS_WRITE_LOCK;
   1649
   1650	/* The level might have changed, check again */
   1651	level = btrfs_header_level(b);
   1652
   1653out:
   1654	/*
   1655	 * The root may have failed to write out at some point, and thus is no
   1656	 * longer valid, return an error in this case.
   1657	 */
   1658	if (!extent_buffer_uptodate(b)) {
   1659		if (root_lock)
   1660			btrfs_tree_unlock_rw(b, root_lock);
   1661		free_extent_buffer(b);
   1662		return ERR_PTR(-EIO);
   1663	}
   1664
   1665	p->nodes[level] = b;
   1666	if (!p->skip_locking)
   1667		p->locks[level] = root_lock;
   1668	/*
   1669	 * Callers are responsible for dropping b's references.
   1670	 */
   1671	return b;
   1672}
   1673
   1674/*
   1675 * Replace the extent buffer at the lowest level of the path with a cloned
   1676 * version. The purpose is to be able to use it safely, after releasing the
   1677 * commit root semaphore, even if relocation is happening in parallel, the
   1678 * transaction used for relocation is committed and the extent buffer is
   1679 * reallocated in the next transaction.
   1680 *
   1681 * This is used in a context where the caller does not prevent transaction
   1682 * commits from happening, either by holding a transaction handle or holding
   1683 * some lock, while it's doing searches through a commit root.
   1684 * At the moment it's only used for send operations.
   1685 */
   1686static int finish_need_commit_sem_search(struct btrfs_path *path)
   1687{
   1688	const int i = path->lowest_level;
   1689	const int slot = path->slots[i];
   1690	struct extent_buffer *lowest = path->nodes[i];
   1691	struct extent_buffer *clone;
   1692
   1693	ASSERT(path->need_commit_sem);
   1694
   1695	if (!lowest)
   1696		return 0;
   1697
   1698	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
   1699
   1700	clone = btrfs_clone_extent_buffer(lowest);
   1701	if (!clone)
   1702		return -ENOMEM;
   1703
   1704	btrfs_release_path(path);
   1705	path->nodes[i] = clone;
   1706	path->slots[i] = slot;
   1707
   1708	return 0;
   1709}
   1710
   1711static inline int search_for_key_slot(struct extent_buffer *eb,
   1712				      int search_low_slot,
   1713				      const struct btrfs_key *key,
   1714				      int prev_cmp,
   1715				      int *slot)
   1716{
   1717	/*
   1718	 * If a previous call to btrfs_bin_search() on a parent node returned an
   1719	 * exact match (prev_cmp == 0), we can safely assume the target key will
   1720	 * always be at slot 0 on lower levels, since each key pointer
   1721	 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
   1722	 * subtree it points to. Thus we can skip searching lower levels.
   1723	 */
   1724	if (prev_cmp == 0) {
   1725		*slot = 0;
   1726		return 0;
   1727	}
   1728
   1729	return generic_bin_search(eb, search_low_slot, key, slot);
   1730}
   1731
   1732static int search_leaf(struct btrfs_trans_handle *trans,
   1733		       struct btrfs_root *root,
   1734		       const struct btrfs_key *key,
   1735		       struct btrfs_path *path,
   1736		       int ins_len,
   1737		       int prev_cmp)
   1738{
   1739	struct extent_buffer *leaf = path->nodes[0];
   1740	int leaf_free_space = -1;
   1741	int search_low_slot = 0;
   1742	int ret;
   1743	bool do_bin_search = true;
   1744
   1745	/*
   1746	 * If we are doing an insertion, the leaf has enough free space and the
   1747	 * destination slot for the key is not slot 0, then we can unlock our
   1748	 * write lock on the parent, and any other upper nodes, before doing the
   1749	 * binary search on the leaf (with search_for_key_slot()), allowing other
   1750	 * tasks to lock the parent and any other upper nodes.
   1751	 */
   1752	if (ins_len > 0) {
   1753		/*
   1754		 * Cache the leaf free space, since we will need it later and it
   1755		 * will not change until then.
   1756		 */
   1757		leaf_free_space = btrfs_leaf_free_space(leaf);
   1758
   1759		/*
    1760		 * !path->locks[1] means we have a single node tree, where the leaf is
   1761		 * the root of the tree.
   1762		 */
   1763		if (path->locks[1] && leaf_free_space >= ins_len) {
   1764			struct btrfs_disk_key first_key;
   1765
   1766			ASSERT(btrfs_header_nritems(leaf) > 0);
   1767			btrfs_item_key(leaf, &first_key, 0);
   1768
   1769			/*
   1770			 * Doing the extra comparison with the first key is cheap,
   1771			 * taking into account that the first key is very likely
   1772			 * already in a cache line because it immediately follows
   1773			 * the extent buffer's header and we have recently accessed
   1774			 * the header's level field.
   1775			 */
   1776			ret = comp_keys(&first_key, key);
   1777			if (ret < 0) {
   1778				/*
   1779				 * The first key is smaller than the key we want
   1780				 * to insert, so we are safe to unlock all upper
   1781				 * nodes and we have to do the binary search.
   1782				 *
   1783				 * We do use btrfs_unlock_up_safe() and not
    1784				 * unlock_up() because the latter does not unlock
   1785				 * nodes with a slot of 0 - we can safely unlock
   1786				 * any node even if its slot is 0 since in this
   1787				 * case the key does not end up at slot 0 of the
   1788				 * leaf and there's no need to split the leaf.
   1789				 */
   1790				btrfs_unlock_up_safe(path, 1);
   1791				search_low_slot = 1;
   1792			} else {
   1793				/*
    1794				 * The first key is >= the key we want to
   1795				 * insert, so we can skip the binary search as
   1796				 * the target key will be at slot 0.
   1797				 *
    1798				 * We cannot unlock upper nodes when the key is
   1799				 * less than the first key, because we will need
   1800				 * to update the key at slot 0 of the parent node
   1801				 * and possibly of other upper nodes too.
   1802				 * If the key matches the first key, then we can
   1803				 * unlock all the upper nodes, using
   1804				 * btrfs_unlock_up_safe() instead of unlock_up()
   1805				 * as stated above.
   1806				 */
   1807				if (ret == 0)
   1808					btrfs_unlock_up_safe(path, 1);
   1809				/*
   1810				 * ret is already 0 or 1, matching the result of
   1811				 * a btrfs_bin_search() call, so there is no need
   1812				 * to adjust it.
   1813				 */
   1814				do_bin_search = false;
   1815				path->slots[0] = 0;
   1816			}
   1817		}
   1818	}
   1819
   1820	if (do_bin_search) {
   1821		ret = search_for_key_slot(leaf, search_low_slot, key,
   1822					  prev_cmp, &path->slots[0]);
   1823		if (ret < 0)
   1824			return ret;
   1825	}
   1826
   1827	if (ins_len > 0) {
   1828		/*
   1829		 * Item key already exists. In this case, if we are allowed to
   1830		 * insert the item (for example, in dir_item case, item key
   1831		 * collision is allowed), it will be merged with the original
   1832		 * item. Only the item size grows, no new btrfs item will be
   1833		 * added. If search_for_extension is not set, ins_len already
    1834		 * accounts for the size of struct btrfs_item; deduct it here
    1835		 * so the leaf space check will be correct.
   1836		 */
   1837		if (ret == 0 && !path->search_for_extension) {
   1838			ASSERT(ins_len >= sizeof(struct btrfs_item));
   1839			ins_len -= sizeof(struct btrfs_item);
   1840		}
   1841
   1842		ASSERT(leaf_free_space >= 0);
   1843
   1844		if (leaf_free_space < ins_len) {
   1845			int err;
   1846
   1847			err = split_leaf(trans, root, key, path, ins_len,
   1848					 (ret == 0));
   1849			ASSERT(err <= 0);
   1850			if (WARN_ON(err > 0))
   1851				err = -EUCLEAN;
   1852			if (err)
   1853				ret = err;
   1854		}
   1855	}
   1856
   1857	return ret;
   1858}
   1859
   1860/*
   1861 * btrfs_search_slot - look for a key in a tree and perform necessary
   1862 * modifications to preserve tree invariants.
   1863 *
   1864 * @trans:	Handle of transaction, used when modifying the tree
   1865 * @p:		Holds all btree nodes along the search path
   1866 * @root:	The root node of the tree
   1867 * @key:	The key we are looking for
   1868 * @ins_len:	Indicates purpose of search:
    1869 *              >0  for inserts, the size of the item being inserted (*)
   1870 *              <0  for deletions
   1871 *               0  for plain searches, not modifying the tree
   1872 *
   1873 *              (*) If size of item inserted doesn't include
   1874 *              sizeof(struct btrfs_item), then p->search_for_extension must
   1875 *              be set.
    1876 * @cow:	whether CoW operations should be performed. Must always be 1
   1877 *		when modifying the tree.
   1878 *
   1879 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
   1880 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
   1881 *
   1882 * If @key is found, 0 is returned and you can find the item in the leaf level
   1883 * of the path (level 0)
   1884 *
   1885 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
   1886 * points to the slot where it should be inserted
   1887 *
   1888 * If an error is encountered while searching the tree a negative error number
   1889 * is returned
   1890 */
   1891int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
   1892		      const struct btrfs_key *key, struct btrfs_path *p,
   1893		      int ins_len, int cow)
   1894{
   1895	struct btrfs_fs_info *fs_info = root->fs_info;
   1896	struct extent_buffer *b;
   1897	int slot;
   1898	int ret;
   1899	int err;
   1900	int level;
   1901	int lowest_unlock = 1;
   1902	/* everything at write_lock_level or lower must be write locked */
   1903	int write_lock_level = 0;
   1904	u8 lowest_level = 0;
   1905	int min_write_lock_level;
   1906	int prev_cmp;
   1907
   1908	lowest_level = p->lowest_level;
   1909	WARN_ON(lowest_level && ins_len > 0);
   1910	WARN_ON(p->nodes[0] != NULL);
   1911	BUG_ON(!cow && ins_len);
   1912
   1913	if (ins_len < 0) {
   1914		lowest_unlock = 2;
   1915
   1916		/* when we are removing items, we might have to go up to level
    1917		 * two as we update tree pointers.  Make sure we keep write
    1918		 * locks for those levels as well.
   1919		 */
   1920		write_lock_level = 2;
   1921	} else if (ins_len > 0) {
   1922		/*
   1923		 * for inserting items, make sure we have a write lock on
   1924		 * level 1 so we can update keys
   1925		 */
   1926		write_lock_level = 1;
   1927	}
   1928
   1929	if (!cow)
   1930		write_lock_level = -1;
   1931
   1932	if (cow && (p->keep_locks || p->lowest_level))
   1933		write_lock_level = BTRFS_MAX_LEVEL;
   1934
   1935	min_write_lock_level = write_lock_level;
   1936
   1937	if (p->need_commit_sem) {
   1938		ASSERT(p->search_commit_root);
   1939		down_read(&fs_info->commit_root_sem);
   1940	}
   1941
   1942again:
   1943	prev_cmp = -1;
   1944	b = btrfs_search_slot_get_root(root, p, write_lock_level);
   1945	if (IS_ERR(b)) {
   1946		ret = PTR_ERR(b);
   1947		goto done;
   1948	}
   1949
   1950	while (b) {
   1951		int dec = 0;
   1952
   1953		level = btrfs_header_level(b);
   1954
   1955		if (cow) {
   1956			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
   1957
   1958			/*
   1959			 * if we don't really need to cow this block
   1960			 * then we don't want to set the path blocking,
   1961			 * so we test it here
   1962			 */
   1963			if (!should_cow_block(trans, root, b))
   1964				goto cow_done;
   1965
   1966			/*
   1967			 * must have write locks on this node and the
   1968			 * parent
   1969			 */
   1970			if (level > write_lock_level ||
   1971			    (level + 1 > write_lock_level &&
   1972			    level + 1 < BTRFS_MAX_LEVEL &&
   1973			    p->nodes[level + 1])) {
   1974				write_lock_level = level + 1;
   1975				btrfs_release_path(p);
   1976				goto again;
   1977			}
   1978
   1979			if (last_level)
   1980				err = btrfs_cow_block(trans, root, b, NULL, 0,
   1981						      &b,
   1982						      BTRFS_NESTING_COW);
   1983			else
   1984				err = btrfs_cow_block(trans, root, b,
   1985						      p->nodes[level + 1],
   1986						      p->slots[level + 1], &b,
   1987						      BTRFS_NESTING_COW);
   1988			if (err) {
   1989				ret = err;
   1990				goto done;
   1991			}
   1992		}
   1993cow_done:
   1994		p->nodes[level] = b;
   1995
   1996		/*
   1997		 * we have a lock on b and as long as we aren't changing
    1998		 * the tree, there is no way for the items in b to change.
   1999		 * It is safe to drop the lock on our parent before we
   2000		 * go through the expensive btree search on b.
   2001		 *
   2002		 * If we're inserting or deleting (ins_len != 0), then we might
   2003		 * be changing slot zero, which may require changing the parent.
   2004		 * So, we can't drop the lock until after we know which slot
   2005		 * we're operating on.
   2006		 */
   2007		if (!ins_len && !p->keep_locks) {
   2008			int u = level + 1;
   2009
   2010			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
   2011				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
   2012				p->locks[u] = 0;
   2013			}
   2014		}
   2015
   2016		if (level == 0) {
   2017			if (ins_len > 0)
   2018				ASSERT(write_lock_level >= 1);
   2019
   2020			ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
   2021			if (!p->search_for_split)
   2022				unlock_up(p, level, lowest_unlock,
   2023					  min_write_lock_level, NULL);
   2024			goto done;
   2025		}
   2026
   2027		ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
   2028		if (ret < 0)
   2029			goto done;
   2030		prev_cmp = ret;
   2031
   2032		if (ret && slot > 0) {
   2033			dec = 1;
   2034			slot--;
   2035		}
   2036		p->slots[level] = slot;
   2037		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
   2038					     &write_lock_level);
   2039		if (err == -EAGAIN)
   2040			goto again;
   2041		if (err) {
   2042			ret = err;
   2043			goto done;
   2044		}
   2045		b = p->nodes[level];
   2046		slot = p->slots[level];
   2047
   2048		/*
    2049		 * Slot 0 is special: if we change the key we have to update
   2050		 * the parent pointer which means we must have a write lock on
   2051		 * the parent
   2052		 */
   2053		if (slot == 0 && ins_len && write_lock_level < level + 1) {
   2054			write_lock_level = level + 1;
   2055			btrfs_release_path(p);
   2056			goto again;
   2057		}
   2058
   2059		unlock_up(p, level, lowest_unlock, min_write_lock_level,
   2060			  &write_lock_level);
   2061
   2062		if (level == lowest_level) {
   2063			if (dec)
   2064				p->slots[level]++;
   2065			goto done;
   2066		}
   2067
   2068		err = read_block_for_search(root, p, &b, level, slot, key);
   2069		if (err == -EAGAIN)
   2070			goto again;
   2071		if (err) {
   2072			ret = err;
   2073			goto done;
   2074		}
   2075
   2076		if (!p->skip_locking) {
   2077			level = btrfs_header_level(b);
   2078			if (level <= write_lock_level) {
   2079				btrfs_tree_lock(b);
   2080				p->locks[level] = BTRFS_WRITE_LOCK;
   2081			} else {
   2082				btrfs_tree_read_lock(b);
   2083				p->locks[level] = BTRFS_READ_LOCK;
   2084			}
   2085			p->nodes[level] = b;
   2086		}
   2087	}
   2088	ret = 1;
   2089done:
   2090	if (ret < 0 && !p->skip_release_on_error)
   2091		btrfs_release_path(p);
   2092
   2093	if (p->need_commit_sem) {
   2094		int ret2;
   2095
   2096		ret2 = finish_need_commit_sem_search(p);
   2097		up_read(&fs_info->commit_root_sem);
   2098		if (ret2)
   2099			ret = ret2;
   2100	}
   2101
   2102	return ret;
   2103}
   2104ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
   2105
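/*
 * Illustrative sketch, not part of the original file: a minimal read-only
 * lookup with btrfs_search_slot().  The helper name and the choice of
 * BTRFS_INODE_ITEM_KEY are assumptions made for the example only.
 */
static int __maybe_unused btrfs_search_slot_example(struct btrfs_root *root,
						    u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	/* trans == NULL, ins_len == 0, cow == 0: nothing in the tree changes. */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret == 0) {
		/* Exact match: item at path->nodes[0], slot path->slots[0]. */
	} else if (ret == 1) {
		/* Not found: the slot is where the key would be inserted. */
	}

	btrfs_free_path(path);
	return ret;
}
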
   2106/*
   2107 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
   2108 * current state of the tree together with the operations recorded in the tree
   2109 * modification log to search for the key in a previous version of this tree, as
   2110 * denoted by the time_seq parameter.
   2111 *
   2112 * Naturally, there is no support for insert, delete or cow operations.
   2113 *
   2114 * The resulting path and return value will be set up as if we called
   2115 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
   2116 */
   2117int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
   2118			  struct btrfs_path *p, u64 time_seq)
   2119{
   2120	struct btrfs_fs_info *fs_info = root->fs_info;
   2121	struct extent_buffer *b;
   2122	int slot;
   2123	int ret;
   2124	int err;
   2125	int level;
   2126	int lowest_unlock = 1;
   2127	u8 lowest_level = 0;
   2128
   2129	lowest_level = p->lowest_level;
   2130	WARN_ON(p->nodes[0] != NULL);
   2131
   2132	if (p->search_commit_root) {
   2133		BUG_ON(time_seq);
   2134		return btrfs_search_slot(NULL, root, key, p, 0, 0);
   2135	}
   2136
   2137again:
   2138	b = btrfs_get_old_root(root, time_seq);
   2139	if (!b) {
   2140		ret = -EIO;
   2141		goto done;
   2142	}
   2143	level = btrfs_header_level(b);
   2144	p->locks[level] = BTRFS_READ_LOCK;
   2145
   2146	while (b) {
   2147		int dec = 0;
   2148
   2149		level = btrfs_header_level(b);
   2150		p->nodes[level] = b;
   2151
   2152		/*
   2153		 * we have a lock on b and as long as we aren't changing
    2154		 * the tree, there is no way for the items in b to change.
   2155		 * It is safe to drop the lock on our parent before we
   2156		 * go through the expensive btree search on b.
   2157		 */
   2158		btrfs_unlock_up_safe(p, level + 1);
   2159
   2160		ret = btrfs_bin_search(b, key, &slot);
   2161		if (ret < 0)
   2162			goto done;
   2163
   2164		if (level == 0) {
   2165			p->slots[level] = slot;
   2166			unlock_up(p, level, lowest_unlock, 0, NULL);
   2167			goto done;
   2168		}
   2169
   2170		if (ret && slot > 0) {
   2171			dec = 1;
   2172			slot--;
   2173		}
   2174		p->slots[level] = slot;
   2175		unlock_up(p, level, lowest_unlock, 0, NULL);
   2176
   2177		if (level == lowest_level) {
   2178			if (dec)
   2179				p->slots[level]++;
   2180			goto done;
   2181		}
   2182
   2183		err = read_block_for_search(root, p, &b, level, slot, key);
   2184		if (err == -EAGAIN)
   2185			goto again;
   2186		if (err) {
   2187			ret = err;
   2188			goto done;
   2189		}
   2190
   2191		level = btrfs_header_level(b);
   2192		btrfs_tree_read_lock(b);
   2193		b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
   2194		if (!b) {
   2195			ret = -ENOMEM;
   2196			goto done;
   2197		}
   2198		p->locks[level] = BTRFS_READ_LOCK;
   2199		p->nodes[level] = b;
   2200	}
   2201	ret = 1;
   2202done:
   2203	if (ret < 0)
   2204		btrfs_release_path(p);
   2205
   2206	return ret;
   2207}
   2208
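/*
 * Illustrative sketch, not part of the original file: searching a past
 * version of the tree.  Obtaining a valid @time_seq from the tree mod log
 * sequence machinery is assumed to happen elsewhere; the helper name is
 * hypothetical.
 */
static int __maybe_unused search_old_slot_example(struct btrfs_root *root,
						  const struct btrfs_key *key,
						  u64 time_seq)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Same return convention as btrfs_search_slot() with ins_len == cow == 0. */
	ret = btrfs_search_old_slot(root, key, path, time_seq);
	if (ret == 0) {
		/* The old version of the item is at path->nodes[0]. */
	}

	btrfs_free_path(path);
	return ret;
}
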
   2209/*
    2210 * helper to use instead of btrfs_search_slot if no exact match is needed but
   2211 * instead the next or previous item should be returned.
   2212 * When find_higher is true, the next higher item is returned, the next lower
   2213 * otherwise.
   2214 * When return_any and find_higher are both true, and no higher item is found,
   2215 * return the next lower instead.
   2216 * When return_any is true and find_higher is false, and no lower item is found,
   2217 * return the next higher instead.
   2218 * It returns 0 if any item is found, 1 if none is found (tree empty), and
   2219 * < 0 on error
   2220 */
   2221int btrfs_search_slot_for_read(struct btrfs_root *root,
   2222			       const struct btrfs_key *key,
   2223			       struct btrfs_path *p, int find_higher,
   2224			       int return_any)
   2225{
   2226	int ret;
   2227	struct extent_buffer *leaf;
   2228
   2229again:
   2230	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
   2231	if (ret <= 0)
   2232		return ret;
   2233	/*
   2234	 * a return value of 1 means the path is at the position where the
   2235	 * item should be inserted. Normally this is the next bigger item,
   2236	 * but in case the previous item is the last in a leaf, path points
   2237	 * to the first free slot in the previous leaf, i.e. at an invalid
   2238	 * item.
   2239	 */
   2240	leaf = p->nodes[0];
   2241
   2242	if (find_higher) {
   2243		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
   2244			ret = btrfs_next_leaf(root, p);
   2245			if (ret <= 0)
   2246				return ret;
   2247			if (!return_any)
   2248				return 1;
   2249			/*
   2250			 * no higher item found, return the next
   2251			 * lower instead
   2252			 */
   2253			return_any = 0;
   2254			find_higher = 0;
   2255			btrfs_release_path(p);
   2256			goto again;
   2257		}
   2258	} else {
   2259		if (p->slots[0] == 0) {
   2260			ret = btrfs_prev_leaf(root, p);
   2261			if (ret < 0)
   2262				return ret;
   2263			if (!ret) {
   2264				leaf = p->nodes[0];
   2265				if (p->slots[0] == btrfs_header_nritems(leaf))
   2266					p->slots[0]--;
   2267				return 0;
   2268			}
   2269			if (!return_any)
   2270				return 1;
   2271			/*
   2272			 * no lower item found, return the next
   2273			 * higher instead
   2274			 */
   2275			return_any = 0;
   2276			find_higher = 1;
   2277			btrfs_release_path(p);
   2278			goto again;
   2279		} else {
   2280			--p->slots[0];
   2281		}
   2282	}
   2283	return 0;
   2284}
   2285
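/*
 * Illustrative sketch, not part of the original file: find the item at or
 * after @key, falling back to the next lower one when nothing higher exists
 * (find_higher == 1, return_any == 1).  The helper name is hypothetical.
 */
static int __maybe_unused search_for_read_example(struct btrfs_root *root,
						  const struct btrfs_key *key)
{
	struct btrfs_path *path;
	struct btrfs_key found;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot_for_read(root, key, path, 1, 1);
	if (ret == 0)
		/* Some item was found, read back which key it has. */
		btrfs_item_key_to_cpu(path->nodes[0], &found, path->slots[0]);

	btrfs_free_path(path);
	return ret;
}
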
   2286/*
   2287 * Execute search and call btrfs_previous_item to traverse backwards if the item
   2288 * was not found.
   2289 *
   2290 * Return 0 if found, 1 if not found and < 0 if error.
   2291 */
   2292int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
   2293			   struct btrfs_path *path)
   2294{
   2295	int ret;
   2296
   2297	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
   2298	if (ret > 0)
   2299		ret = btrfs_previous_item(root, path, key->objectid, key->type);
   2300
   2301	if (ret == 0)
   2302		btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
   2303
   2304	return ret;
   2305}
   2306
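/*
 * Illustrative sketch, not part of the original file: find the last item
 * with the given objectid and type whose key sorts at or before
 * (objectid, type, (u64)-1).  On success @key is updated in place.  The
 * helper name is hypothetical.
 */
static int __maybe_unused search_backwards_example(struct btrfs_root *root,
						   u64 objectid, u8 type)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = type;
	key.offset = (u64)-1;

	/* 0: @key holds the found item's key; 1: not found; < 0: error. */
	ret = btrfs_search_backwards(root, &key, path);

	btrfs_free_path(path);
	return ret;
}
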
   2307/**
   2308 * Search for a valid slot for the given path.
   2309 *
   2310 * @root:	The root node of the tree.
    2311 * @key:	Will contain the key of a valid item if found.
   2312 * @path:	The starting point to validate the slot.
   2313 *
   2314 * Return: 0  if the item is valid
   2315 *         1  if not found
   2316 *         <0 if error.
   2317 */
   2318int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
   2319			      struct btrfs_path *path)
   2320{
   2321	while (1) {
   2322		int ret;
   2323		const int slot = path->slots[0];
   2324		const struct extent_buffer *leaf = path->nodes[0];
   2325
   2326		/* This is where we start walking the path. */
   2327		if (slot >= btrfs_header_nritems(leaf)) {
   2328			/*
   2329			 * If we've reached the last slot in this leaf we need
   2330			 * to go to the next leaf and reset the path.
   2331			 */
   2332			ret = btrfs_next_leaf(root, path);
   2333			if (ret)
   2334				return ret;
   2335			continue;
   2336		}
   2337		/* Store the found, valid item in @key. */
   2338		btrfs_item_key_to_cpu(leaf, key, slot);
   2339		break;
   2340	}
   2341	return 0;
   2342}
   2343
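/*
 * Illustrative sketch, not part of the original file: walk every item of a
 * tree starting from @first_key, letting btrfs_get_next_valid_item() hop
 * across leaf boundaries.  The helper name is hypothetical.
 */
static int __maybe_unused iterate_items_example(struct btrfs_root *root,
						const struct btrfs_key *first_key)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, first_key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		ret = btrfs_get_next_valid_item(root, &key, path);
		if (ret)
			break;
		/* The valid item is at path->nodes[0], slot path->slots[0]. */
		path->slots[0]++;
	}
	if (ret > 0)	/* walked off the end of the tree */
		ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
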
   2344/*
   2345 * adjust the pointers going up the tree, starting at level
    2346 * making sure the right key of each node points to 'key'.
   2347 * This is used after shifting pointers to the left, so it stops
   2348 * fixing up pointers when a given leaf/node is not in slot 0 of the
    2349 * higher levels.
   2350 *
   2351 */
   2352static void fixup_low_keys(struct btrfs_path *path,
   2353			   struct btrfs_disk_key *key, int level)
   2354{
   2355	int i;
   2356	struct extent_buffer *t;
   2357	int ret;
   2358
   2359	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
   2360		int tslot = path->slots[i];
   2361
   2362		if (!path->nodes[i])
   2363			break;
   2364		t = path->nodes[i];
   2365		ret = btrfs_tree_mod_log_insert_key(t, tslot,
   2366				BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
   2367		BUG_ON(ret < 0);
   2368		btrfs_set_node_key(t, key, tslot);
   2369		btrfs_mark_buffer_dirty(path->nodes[i]);
   2370		if (tslot != 0)
   2371			break;
   2372	}
   2373}
   2374
   2375/*
   2376 * update item key.
   2377 *
   2378 * This function isn't completely safe. It's the caller's responsibility
   2379 * that the new key won't break the order
   2380 */
   2381void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
   2382			     struct btrfs_path *path,
   2383			     const struct btrfs_key *new_key)
   2384{
   2385	struct btrfs_disk_key disk_key;
   2386	struct extent_buffer *eb;
   2387	int slot;
   2388
   2389	eb = path->nodes[0];
   2390	slot = path->slots[0];
   2391	if (slot > 0) {
   2392		btrfs_item_key(eb, &disk_key, slot - 1);
   2393		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
   2394			btrfs_crit(fs_info,
   2395		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
   2396				   slot, btrfs_disk_key_objectid(&disk_key),
   2397				   btrfs_disk_key_type(&disk_key),
   2398				   btrfs_disk_key_offset(&disk_key),
   2399				   new_key->objectid, new_key->type,
   2400				   new_key->offset);
   2401			btrfs_print_leaf(eb);
   2402			BUG();
   2403		}
   2404	}
   2405	if (slot < btrfs_header_nritems(eb) - 1) {
   2406		btrfs_item_key(eb, &disk_key, slot + 1);
   2407		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
   2408			btrfs_crit(fs_info,
   2409		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
   2410				   slot, btrfs_disk_key_objectid(&disk_key),
   2411				   btrfs_disk_key_type(&disk_key),
   2412				   btrfs_disk_key_offset(&disk_key),
   2413				   new_key->objectid, new_key->type,
   2414				   new_key->offset);
   2415			btrfs_print_leaf(eb);
   2416			BUG();
   2417		}
   2418	}
   2419
   2420	btrfs_cpu_key_to_disk(&disk_key, new_key);
   2421	btrfs_set_item_key(eb, &disk_key, slot);
   2422	btrfs_mark_buffer_dirty(eb);
   2423	if (slot == 0)
   2424		fixup_low_keys(path, &disk_key, 1);
   2425}
   2426
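/*
 * Illustrative sketch, not part of the original file: rewrite the offset of
 * the item the path currently points at.  The caller must guarantee that the
 * new key still sorts between its neighbours, otherwise
 * btrfs_set_item_key_safe() hits BUG().  The helper name is hypothetical.
 */
static void __maybe_unused set_item_key_example(struct btrfs_fs_info *fs_info,
						struct btrfs_path *path,
						u64 new_offset)
{
	struct btrfs_key key;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	key.offset = new_offset;
	btrfs_set_item_key_safe(fs_info, path, &key);
}
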
   2427/*
   2428 * Check key order of two sibling extent buffers.
   2429 *
   2430 * Return true if something is wrong.
   2431 * Return false if everything is fine.
   2432 *
   2433 * Tree-checker only works inside one tree block, thus the following
    2434 * corruption cannot be detected by tree-checker:
   2435 *
   2436 * Leaf @left			| Leaf @right
   2437 * --------------------------------------------------------------
   2438 * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
   2439 *
   2440 * Key f6 in leaf @left itself is valid, but not valid when the next
   2441 * key in leaf @right is 7.
   2442 * This can only be checked at tree block merge time.
   2443 * And since tree checker has ensured all key order in each tree block
   2444 * is correct, we only need to bother the last key of @left and the first
   2445 * key of @right.
   2446 */
   2447static bool check_sibling_keys(struct extent_buffer *left,
   2448			       struct extent_buffer *right)
   2449{
   2450	struct btrfs_key left_last;
   2451	struct btrfs_key right_first;
   2452	int level = btrfs_header_level(left);
   2453	int nr_left = btrfs_header_nritems(left);
   2454	int nr_right = btrfs_header_nritems(right);
   2455
   2456	/* No key to check in one of the tree blocks */
   2457	if (!nr_left || !nr_right)
   2458		return false;
   2459
   2460	if (level) {
   2461		btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
   2462		btrfs_node_key_to_cpu(right, &right_first, 0);
   2463	} else {
   2464		btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
   2465		btrfs_item_key_to_cpu(right, &right_first, 0);
   2466	}
   2467
   2468	if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
   2469		btrfs_crit(left->fs_info,
   2470"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
   2471			   left_last.objectid, left_last.type,
   2472			   left_last.offset, right_first.objectid,
   2473			   right_first.type, right_first.offset);
   2474		return true;
   2475	}
   2476	return false;
   2477}
   2478
   2479/*
   2480 * try to push data from one node into the next node left in the
   2481 * tree.
   2482 *
   2483 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
   2484 * error, and > 0 if there was no room in the left hand block.
   2485 */
   2486static int push_node_left(struct btrfs_trans_handle *trans,
   2487			  struct extent_buffer *dst,
   2488			  struct extent_buffer *src, int empty)
   2489{
   2490	struct btrfs_fs_info *fs_info = trans->fs_info;
   2491	int push_items = 0;
   2492	int src_nritems;
   2493	int dst_nritems;
   2494	int ret = 0;
   2495
   2496	src_nritems = btrfs_header_nritems(src);
   2497	dst_nritems = btrfs_header_nritems(dst);
   2498	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
   2499	WARN_ON(btrfs_header_generation(src) != trans->transid);
   2500	WARN_ON(btrfs_header_generation(dst) != trans->transid);
   2501
   2502	if (!empty && src_nritems <= 8)
   2503		return 1;
   2504
   2505	if (push_items <= 0)
   2506		return 1;
   2507
   2508	if (empty) {
   2509		push_items = min(src_nritems, push_items);
   2510		if (push_items < src_nritems) {
   2511			/* leave at least 8 pointers in the node if
   2512			 * we aren't going to empty it
   2513			 */
   2514			if (src_nritems - push_items < 8) {
   2515				if (push_items <= 8)
   2516					return 1;
   2517				push_items -= 8;
   2518			}
   2519		}
   2520	} else
   2521		push_items = min(src_nritems - 8, push_items);
   2522
   2523	/* dst is the left eb, src is the middle eb */
   2524	if (check_sibling_keys(dst, src)) {
   2525		ret = -EUCLEAN;
   2526		btrfs_abort_transaction(trans, ret);
   2527		return ret;
   2528	}
   2529	ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
   2530	if (ret) {
   2531		btrfs_abort_transaction(trans, ret);
   2532		return ret;
   2533	}
   2534	copy_extent_buffer(dst, src,
   2535			   btrfs_node_key_ptr_offset(dst_nritems),
   2536			   btrfs_node_key_ptr_offset(0),
   2537			   push_items * sizeof(struct btrfs_key_ptr));
   2538
   2539	if (push_items < src_nritems) {
   2540		/*
   2541		 * Don't call btrfs_tree_mod_log_insert_move() here, key removal
   2542		 * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
   2543		 */
   2544		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
   2545				      btrfs_node_key_ptr_offset(push_items),
   2546				      (src_nritems - push_items) *
   2547				      sizeof(struct btrfs_key_ptr));
   2548	}
   2549	btrfs_set_header_nritems(src, src_nritems - push_items);
   2550	btrfs_set_header_nritems(dst, dst_nritems + push_items);
   2551	btrfs_mark_buffer_dirty(src);
   2552	btrfs_mark_buffer_dirty(dst);
   2553
   2554	return ret;
   2555}
   2556
   2557/*
   2558 * try to push data from one node into the next node right in the
   2559 * tree.
   2560 *
   2561 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
   2562 * error, and > 0 if there was no room in the right hand block.
   2563 *
    2564 * this will only push up to 1/2 the contents of the left node over
   2565 */
   2566static int balance_node_right(struct btrfs_trans_handle *trans,
   2567			      struct extent_buffer *dst,
   2568			      struct extent_buffer *src)
   2569{
   2570	struct btrfs_fs_info *fs_info = trans->fs_info;
   2571	int push_items = 0;
   2572	int max_push;
   2573	int src_nritems;
   2574	int dst_nritems;
   2575	int ret = 0;
   2576
   2577	WARN_ON(btrfs_header_generation(src) != trans->transid);
   2578	WARN_ON(btrfs_header_generation(dst) != trans->transid);
   2579
   2580	src_nritems = btrfs_header_nritems(src);
   2581	dst_nritems = btrfs_header_nritems(dst);
   2582	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
   2583	if (push_items <= 0)
   2584		return 1;
   2585
   2586	if (src_nritems < 4)
   2587		return 1;
   2588
   2589	max_push = src_nritems / 2 + 1;
   2590	/* don't try to empty the node */
   2591	if (max_push >= src_nritems)
   2592		return 1;
   2593
   2594	if (max_push < push_items)
   2595		push_items = max_push;
   2596
   2597	/* dst is the right eb, src is the middle eb */
   2598	if (check_sibling_keys(src, dst)) {
   2599		ret = -EUCLEAN;
   2600		btrfs_abort_transaction(trans, ret);
   2601		return ret;
   2602	}
   2603	ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
   2604	BUG_ON(ret < 0);
   2605	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
   2606				      btrfs_node_key_ptr_offset(0),
   2607				      (dst_nritems) *
   2608				      sizeof(struct btrfs_key_ptr));
   2609
   2610	ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
   2611					 push_items);
   2612	if (ret) {
   2613		btrfs_abort_transaction(trans, ret);
   2614		return ret;
   2615	}
   2616	copy_extent_buffer(dst, src,
   2617			   btrfs_node_key_ptr_offset(0),
   2618			   btrfs_node_key_ptr_offset(src_nritems - push_items),
   2619			   push_items * sizeof(struct btrfs_key_ptr));
   2620
   2621	btrfs_set_header_nritems(src, src_nritems - push_items);
   2622	btrfs_set_header_nritems(dst, dst_nritems + push_items);
   2623
   2624	btrfs_mark_buffer_dirty(src);
   2625	btrfs_mark_buffer_dirty(dst);
   2626
   2627	return ret;
   2628}
   2629
   2630/*
   2631 * helper function to insert a new root level in the tree.
   2632 * A new node is allocated, and a single item is inserted to
   2633 * point to the existing root
   2634 *
   2635 * returns zero on success or < 0 on failure.
   2636 */
   2637static noinline int insert_new_root(struct btrfs_trans_handle *trans,
   2638			   struct btrfs_root *root,
   2639			   struct btrfs_path *path, int level)
   2640{
   2641	struct btrfs_fs_info *fs_info = root->fs_info;
   2642	u64 lower_gen;
   2643	struct extent_buffer *lower;
   2644	struct extent_buffer *c;
   2645	struct extent_buffer *old;
   2646	struct btrfs_disk_key lower_key;
   2647	int ret;
   2648
   2649	BUG_ON(path->nodes[level]);
   2650	BUG_ON(path->nodes[level-1] != root->node);
   2651
   2652	lower = path->nodes[level-1];
   2653	if (level == 1)
   2654		btrfs_item_key(lower, &lower_key, 0);
   2655	else
   2656		btrfs_node_key(lower, &lower_key, 0);
   2657
   2658	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
   2659				   &lower_key, level, root->node->start, 0,
   2660				   BTRFS_NESTING_NEW_ROOT);
   2661	if (IS_ERR(c))
   2662		return PTR_ERR(c);
   2663
   2664	root_add_used(root, fs_info->nodesize);
   2665
   2666	btrfs_set_header_nritems(c, 1);
   2667	btrfs_set_node_key(c, &lower_key, 0);
   2668	btrfs_set_node_blockptr(c, 0, lower->start);
   2669	lower_gen = btrfs_header_generation(lower);
   2670	WARN_ON(lower_gen != trans->transid);
   2671
   2672	btrfs_set_node_ptr_generation(c, 0, lower_gen);
   2673
   2674	btrfs_mark_buffer_dirty(c);
   2675
   2676	old = root->node;
   2677	ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
   2678	BUG_ON(ret < 0);
   2679	rcu_assign_pointer(root->node, c);
   2680
   2681	/* the super has an extra ref to root->node */
   2682	free_extent_buffer(old);
   2683
   2684	add_root_to_dirty_list(root);
   2685	atomic_inc(&c->refs);
   2686	path->nodes[level] = c;
   2687	path->locks[level] = BTRFS_WRITE_LOCK;
   2688	path->slots[level] = 0;
   2689	return 0;
   2690}
   2691
   2692/*
   2693 * worker function to insert a single pointer in a node.
   2694 * the node should have enough room for the pointer already
   2695 *
   2696 * slot and level indicate where you want the key to go, and
    2697 * bytenr is the block the key points to.
   2698 */
   2699static void insert_ptr(struct btrfs_trans_handle *trans,
   2700		       struct btrfs_path *path,
   2701		       struct btrfs_disk_key *key, u64 bytenr,
   2702		       int slot, int level)
   2703{
   2704	struct extent_buffer *lower;
   2705	int nritems;
   2706	int ret;
   2707
   2708	BUG_ON(!path->nodes[level]);
   2709	btrfs_assert_tree_write_locked(path->nodes[level]);
   2710	lower = path->nodes[level];
   2711	nritems = btrfs_header_nritems(lower);
   2712	BUG_ON(slot > nritems);
   2713	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
   2714	if (slot != nritems) {
   2715		if (level) {
   2716			ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
   2717					slot, nritems - slot);
   2718			BUG_ON(ret < 0);
   2719		}
   2720		memmove_extent_buffer(lower,
   2721			      btrfs_node_key_ptr_offset(slot + 1),
   2722			      btrfs_node_key_ptr_offset(slot),
   2723			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
   2724	}
   2725	if (level) {
   2726		ret = btrfs_tree_mod_log_insert_key(lower, slot,
   2727					    BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
   2728		BUG_ON(ret < 0);
   2729	}
   2730	btrfs_set_node_key(lower, key, slot);
   2731	btrfs_set_node_blockptr(lower, slot, bytenr);
   2732	WARN_ON(trans->transid == 0);
   2733	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
   2734	btrfs_set_header_nritems(lower, nritems + 1);
   2735	btrfs_mark_buffer_dirty(lower);
   2736}
   2737
   2738/*
   2739 * split the node at the specified level in path in two.
   2740 * The path is corrected to point to the appropriate node after the split
   2741 *
   2742 * Before splitting this tries to make some room in the node by pushing
    2743 * left and right; if either one works, it returns right away.
   2744 *
   2745 * returns 0 on success and < 0 on failure
   2746 */
   2747static noinline int split_node(struct btrfs_trans_handle *trans,
   2748			       struct btrfs_root *root,
   2749			       struct btrfs_path *path, int level)
   2750{
   2751	struct btrfs_fs_info *fs_info = root->fs_info;
   2752	struct extent_buffer *c;
   2753	struct extent_buffer *split;
   2754	struct btrfs_disk_key disk_key;
   2755	int mid;
   2756	int ret;
   2757	u32 c_nritems;
   2758
   2759	c = path->nodes[level];
   2760	WARN_ON(btrfs_header_generation(c) != trans->transid);
   2761	if (c == root->node) {
   2762		/*
    2763		 * trying to split the root, let's make a new one
    2764		 *
    2765		 * tree mod log: We don't log removal of the old root in
   2766		 * insert_new_root, because that root buffer will be kept as a
   2767		 * normal node. We are going to log removal of half of the
   2768		 * elements below with btrfs_tree_mod_log_eb_copy(). We're
   2769		 * holding a tree lock on the buffer, which is why we cannot
   2770		 * race with other tree_mod_log users.
   2771		 */
   2772		ret = insert_new_root(trans, root, path, level + 1);
   2773		if (ret)
   2774			return ret;
   2775	} else {
   2776		ret = push_nodes_for_insert(trans, root, path, level);
   2777		c = path->nodes[level];
   2778		if (!ret && btrfs_header_nritems(c) <
   2779		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
   2780			return 0;
   2781		if (ret < 0)
   2782			return ret;
   2783	}
   2784
   2785	c_nritems = btrfs_header_nritems(c);
   2786	mid = (c_nritems + 1) / 2;
   2787	btrfs_node_key(c, &disk_key, mid);
   2788
   2789	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
   2790				       &disk_key, level, c->start, 0,
   2791				       BTRFS_NESTING_SPLIT);
   2792	if (IS_ERR(split))
   2793		return PTR_ERR(split);
   2794
   2795	root_add_used(root, fs_info->nodesize);
   2796	ASSERT(btrfs_header_level(c) == level);
   2797
   2798	ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
   2799	if (ret) {
   2800		btrfs_abort_transaction(trans, ret);
   2801		return ret;
   2802	}
   2803	copy_extent_buffer(split, c,
   2804			   btrfs_node_key_ptr_offset(0),
   2805			   btrfs_node_key_ptr_offset(mid),
   2806			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
   2807	btrfs_set_header_nritems(split, c_nritems - mid);
   2808	btrfs_set_header_nritems(c, mid);
   2809
   2810	btrfs_mark_buffer_dirty(c);
   2811	btrfs_mark_buffer_dirty(split);
   2812
   2813	insert_ptr(trans, path, &disk_key, split->start,
   2814		   path->slots[level + 1] + 1, level + 1);
   2815
   2816	if (path->slots[level] >= mid) {
   2817		path->slots[level] -= mid;
   2818		btrfs_tree_unlock(c);
   2819		free_extent_buffer(c);
   2820		path->nodes[level] = split;
   2821		path->slots[level + 1] += 1;
   2822	} else {
   2823		btrfs_tree_unlock(split);
   2824		free_extent_buffer(split);
   2825	}
   2826	return 0;
   2827}
   2828
   2829/*
   2830 * how many bytes are required to store the items in a leaf.  start
   2831 * and nr indicate which items in the leaf to check.  This totals up the
   2832 * space used both by the item structs and the item data
   2833 */
   2834static int leaf_space_used(struct extent_buffer *l, int start, int nr)
   2835{
   2836	int data_len;
   2837	int nritems = btrfs_header_nritems(l);
   2838	int end = min(nritems, start + nr) - 1;
   2839
   2840	if (!nr)
   2841		return 0;
   2842	data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
   2843	data_len = data_len - btrfs_item_offset(l, end);
   2844	data_len += sizeof(struct btrfs_item) * nr;
   2845	WARN_ON(data_len < 0);
   2846	return data_len;
   2847}
   2848
   2849/*
   2850 * The space between the end of the leaf items and
   2851 * the start of the leaf data.  IOW, how much room
   2852 * the leaf has left for both items and data
   2853 */
   2854noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
   2855{
   2856	struct btrfs_fs_info *fs_info = leaf->fs_info;
   2857	int nritems = btrfs_header_nritems(leaf);
   2858	int ret;
   2859
   2860	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
   2861	if (ret < 0) {
   2862		btrfs_crit(fs_info,
   2863			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
   2864			   ret,
   2865			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
   2866			   leaf_space_used(leaf, 0, nritems), nritems);
   2867	}
   2868	return ret;
   2869}
   2870
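/*
 * Illustrative example, not part of the original file: a leaf holding 10
 * items whose data totals 1000 bytes uses
 * 10 * sizeof(struct btrfs_item) + 1000 bytes, with item structs growing
 * from the front of the leaf and item data growing back from the end, so
 * btrfs_leaf_free_space() returns BTRFS_LEAF_DATA_SIZE(fs_info) minus that
 * sum.
 */
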
   2871/*
   2872 * min slot controls the lowest index we're willing to push to the
   2873 * right.  We'll push up to and including min_slot, but no lower
   2874 */
   2875static noinline int __push_leaf_right(struct btrfs_path *path,
   2876				      int data_size, int empty,
   2877				      struct extent_buffer *right,
   2878				      int free_space, u32 left_nritems,
   2879				      u32 min_slot)
   2880{
   2881	struct btrfs_fs_info *fs_info = right->fs_info;
   2882	struct extent_buffer *left = path->nodes[0];
   2883	struct extent_buffer *upper = path->nodes[1];
   2884	struct btrfs_map_token token;
   2885	struct btrfs_disk_key disk_key;
   2886	int slot;
   2887	u32 i;
   2888	int push_space = 0;
   2889	int push_items = 0;
   2890	u32 nr;
   2891	u32 right_nritems;
   2892	u32 data_end;
   2893	u32 this_item_size;
   2894
   2895	if (empty)
   2896		nr = 0;
   2897	else
   2898		nr = max_t(u32, 1, min_slot);
   2899
   2900	if (path->slots[0] >= left_nritems)
   2901		push_space += data_size;
   2902
   2903	slot = path->slots[1];
   2904	i = left_nritems - 1;
   2905	while (i >= nr) {
   2906		if (!empty && push_items > 0) {
   2907			if (path->slots[0] > i)
   2908				break;
   2909			if (path->slots[0] == i) {
   2910				int space = btrfs_leaf_free_space(left);
   2911
   2912				if (space + push_space * 2 > free_space)
   2913					break;
   2914			}
   2915		}
   2916
   2917		if (path->slots[0] == i)
   2918			push_space += data_size;
   2919
   2920		this_item_size = btrfs_item_size(left, i);
   2921		if (this_item_size + sizeof(struct btrfs_item) +
   2922		    push_space > free_space)
   2923			break;
   2924
   2925		push_items++;
   2926		push_space += this_item_size + sizeof(struct btrfs_item);
   2927		if (i == 0)
   2928			break;
   2929		i--;
   2930	}
   2931
   2932	if (push_items == 0)
   2933		goto out_unlock;
   2934
   2935	WARN_ON(!empty && push_items == left_nritems);
   2936
   2937	/* push left to right */
   2938	right_nritems = btrfs_header_nritems(right);
   2939
   2940	push_space = btrfs_item_data_end(left, left_nritems - push_items);
   2941	push_space -= leaf_data_end(left);
   2942
   2943	/* make room in the right data area */
   2944	data_end = leaf_data_end(right);
   2945	memmove_extent_buffer(right,
   2946			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
   2947			      BTRFS_LEAF_DATA_OFFSET + data_end,
   2948			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
   2949
   2950	/* copy from the left data area */
   2951	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
   2952		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
   2953		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
   2954		     push_space);
   2955
   2956	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
   2957			      btrfs_item_nr_offset(0),
   2958			      right_nritems * sizeof(struct btrfs_item));
   2959
   2960	/* copy the items from left to right */
   2961	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
   2962		   btrfs_item_nr_offset(left_nritems - push_items),
   2963		   push_items * sizeof(struct btrfs_item));
   2964
   2965	/* update the item pointers */
   2966	btrfs_init_map_token(&token, right);
   2967	right_nritems += push_items;
   2968	btrfs_set_header_nritems(right, right_nritems);
   2969	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
   2970	for (i = 0; i < right_nritems; i++) {
   2971		push_space -= btrfs_token_item_size(&token, i);
   2972		btrfs_set_token_item_offset(&token, i, push_space);
   2973	}
   2974
   2975	left_nritems -= push_items;
   2976	btrfs_set_header_nritems(left, left_nritems);
   2977
   2978	if (left_nritems)
   2979		btrfs_mark_buffer_dirty(left);
   2980	else
   2981		btrfs_clean_tree_block(left);
   2982
   2983	btrfs_mark_buffer_dirty(right);
   2984
   2985	btrfs_item_key(right, &disk_key, 0);
   2986	btrfs_set_node_key(upper, &disk_key, slot + 1);
   2987	btrfs_mark_buffer_dirty(upper);
   2988
   2989	/* then fixup the leaf pointer in the path */
   2990	if (path->slots[0] >= left_nritems) {
   2991		path->slots[0] -= left_nritems;
   2992		if (btrfs_header_nritems(path->nodes[0]) == 0)
   2993			btrfs_clean_tree_block(path->nodes[0]);
   2994		btrfs_tree_unlock(path->nodes[0]);
   2995		free_extent_buffer(path->nodes[0]);
   2996		path->nodes[0] = right;
   2997		path->slots[1] += 1;
   2998	} else {
   2999		btrfs_tree_unlock(right);
   3000		free_extent_buffer(right);
   3001	}
   3002	return 0;
   3003
   3004out_unlock:
   3005	btrfs_tree_unlock(right);
   3006	free_extent_buffer(right);
   3007	return 1;
   3008}
   3009
   3010/*
   3011 * push some data in the path leaf to the right, trying to free up at
   3012 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
   3013 *
   3014 * returns 1 if the push failed because the other node didn't have enough
   3015 * room, 0 if everything worked out and < 0 if there were major errors.
   3016 *
   3017 * this will push starting from min_slot to the end of the leaf.  It won't
   3018 * push any slot lower than min_slot
   3019 */
   3020static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
   3021			   *root, struct btrfs_path *path,
   3022			   int min_data_size, int data_size,
   3023			   int empty, u32 min_slot)
   3024{
   3025	struct extent_buffer *left = path->nodes[0];
   3026	struct extent_buffer *right;
   3027	struct extent_buffer *upper;
   3028	int slot;
   3029	int free_space;
   3030	u32 left_nritems;
   3031	int ret;
   3032
   3033	if (!path->nodes[1])
   3034		return 1;
   3035
   3036	slot = path->slots[1];
   3037	upper = path->nodes[1];
   3038	if (slot >= btrfs_header_nritems(upper) - 1)
   3039		return 1;
   3040
   3041	btrfs_assert_tree_write_locked(path->nodes[1]);
   3042
   3043	right = btrfs_read_node_slot(upper, slot + 1);
   3044	/*
    3045	 * If slot + 1 is not valid or we failed to read the right node,
    3046	 * it's no big deal, just return.
   3047	 */
   3048	if (IS_ERR(right))
   3049		return 1;
   3050
   3051	__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
   3052
   3053	free_space = btrfs_leaf_free_space(right);
   3054	if (free_space < data_size)
   3055		goto out_unlock;
   3056
   3057	ret = btrfs_cow_block(trans, root, right, upper,
   3058			      slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
   3059	if (ret)
   3060		goto out_unlock;
   3061
   3062	left_nritems = btrfs_header_nritems(left);
   3063	if (left_nritems == 0)
   3064		goto out_unlock;
   3065
   3066	if (check_sibling_keys(left, right)) {
   3067		ret = -EUCLEAN;
   3068		btrfs_tree_unlock(right);
   3069		free_extent_buffer(right);
   3070		return ret;
   3071	}
   3072	if (path->slots[0] == left_nritems && !empty) {
   3073		/* Key greater than all keys in the leaf, right neighbor has
   3074		 * enough room for it and we're not emptying our leaf to delete
   3075		 * it, therefore use right neighbor to insert the new item and
   3076		 * no need to touch/dirty our left leaf. */
   3077		btrfs_tree_unlock(left);
   3078		free_extent_buffer(left);
   3079		path->nodes[0] = right;
   3080		path->slots[0] = 0;
   3081		path->slots[1]++;
   3082		return 0;
   3083	}
   3084
   3085	return __push_leaf_right(path, min_data_size, empty,
   3086				right, free_space, left_nritems, min_slot);
   3087out_unlock:
   3088	btrfs_tree_unlock(right);
   3089	free_extent_buffer(right);
   3090	return 1;
   3091}
   3092
   3093/*
   3094 * push some data in the path leaf to the left, trying to free up at
   3095 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
   3096 *
   3097 * max_slot can put a limit on how far into the leaf we'll push items.  The
   3098 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
    3099 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
   3100 */
   3101static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
   3102				     int empty, struct extent_buffer *left,
   3103				     int free_space, u32 right_nritems,
   3104				     u32 max_slot)
   3105{
   3106	struct btrfs_fs_info *fs_info = left->fs_info;
   3107	struct btrfs_disk_key disk_key;
   3108	struct extent_buffer *right = path->nodes[0];
   3109	int i;
   3110	int push_space = 0;
   3111	int push_items = 0;
   3112	u32 old_left_nritems;
   3113	u32 nr;
   3114	int ret = 0;
   3115	u32 this_item_size;
   3116	u32 old_left_item_size;
   3117	struct btrfs_map_token token;
   3118
   3119	if (empty)
   3120		nr = min(right_nritems, max_slot);
   3121	else
   3122		nr = min(right_nritems - 1, max_slot);
   3123
   3124	for (i = 0; i < nr; i++) {
   3125		if (!empty && push_items > 0) {
   3126			if (path->slots[0] < i)
   3127				break;
   3128			if (path->slots[0] == i) {
   3129				int space = btrfs_leaf_free_space(right);
   3130
   3131				if (space + push_space * 2 > free_space)
   3132					break;
   3133			}
   3134		}
   3135
   3136		if (path->slots[0] == i)
   3137			push_space += data_size;
   3138
   3139		this_item_size = btrfs_item_size(right, i);
   3140		if (this_item_size + sizeof(struct btrfs_item) + push_space >
   3141		    free_space)
   3142			break;
   3143
   3144		push_items++;
   3145		push_space += this_item_size + sizeof(struct btrfs_item);
   3146	}
   3147
   3148	if (push_items == 0) {
   3149		ret = 1;
   3150		goto out;
   3151	}
   3152	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
   3153
   3154	/* push data from right to left */
   3155	copy_extent_buffer(left, right,
   3156			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
   3157			   btrfs_item_nr_offset(0),
   3158			   push_items * sizeof(struct btrfs_item));
   3159
   3160	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
   3161		     btrfs_item_offset(right, push_items - 1);
   3162
   3163	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
   3164		     leaf_data_end(left) - push_space,
   3165		     BTRFS_LEAF_DATA_OFFSET +
   3166		     btrfs_item_offset(right, push_items - 1),
   3167		     push_space);
   3168	old_left_nritems = btrfs_header_nritems(left);
   3169	BUG_ON(old_left_nritems <= 0);
   3170
   3171	btrfs_init_map_token(&token, left);
   3172	old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
   3173	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
   3174		u32 ioff;
   3175
   3176		ioff = btrfs_token_item_offset(&token, i);
   3177		btrfs_set_token_item_offset(&token, i,
   3178		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
   3179	}
   3180	btrfs_set_header_nritems(left, old_left_nritems + push_items);
   3181
   3182	/* fixup right node */
   3183	if (push_items > right_nritems)
   3184		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
   3185		       right_nritems);
   3186
   3187	if (push_items < right_nritems) {
   3188		push_space = btrfs_item_offset(right, push_items - 1) -
   3189						  leaf_data_end(right);
   3190		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
   3191				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
   3192				      BTRFS_LEAF_DATA_OFFSET +
   3193				      leaf_data_end(right), push_space);
   3194
   3195		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
   3196			      btrfs_item_nr_offset(push_items),
   3197			     (btrfs_header_nritems(right) - push_items) *
   3198			     sizeof(struct btrfs_item));
   3199	}
   3200
   3201	btrfs_init_map_token(&token, right);
   3202	right_nritems -= push_items;
   3203	btrfs_set_header_nritems(right, right_nritems);
   3204	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
   3205	for (i = 0; i < right_nritems; i++) {
   3206		push_space = push_space - btrfs_token_item_size(&token, i);
   3207		btrfs_set_token_item_offset(&token, i, push_space);
   3208	}
   3209
   3210	btrfs_mark_buffer_dirty(left);
   3211	if (right_nritems)
   3212		btrfs_mark_buffer_dirty(right);
   3213	else
   3214		btrfs_clean_tree_block(right);
   3215
   3216	btrfs_item_key(right, &disk_key, 0);
   3217	fixup_low_keys(path, &disk_key, 1);
   3218
   3219	/* then fixup the leaf pointer in the path */
   3220	if (path->slots[0] < push_items) {
   3221		path->slots[0] += old_left_nritems;
   3222		btrfs_tree_unlock(path->nodes[0]);
   3223		free_extent_buffer(path->nodes[0]);
   3224		path->nodes[0] = left;
   3225		path->slots[1] -= 1;
   3226	} else {
   3227		btrfs_tree_unlock(left);
   3228		free_extent_buffer(left);
   3229		path->slots[0] -= push_items;
   3230	}
   3231	BUG_ON(path->slots[0] < 0);
   3232	return ret;
   3233out:
   3234	btrfs_tree_unlock(left);
   3235	free_extent_buffer(left);
   3236	return ret;
   3237}
   3238
   3239/*
   3240 * push some data in the path leaf to the left, trying to free up at
   3241 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
   3242 *
   3243 * max_slot can put a limit on how far into the leaf we'll push items.  The
   3244 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
   3245 * items
   3246 */
   3247static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
   3248			  *root, struct btrfs_path *path, int min_data_size,
   3249			  int data_size, int empty, u32 max_slot)
   3250{
   3251	struct extent_buffer *right = path->nodes[0];
   3252	struct extent_buffer *left;
   3253	int slot;
   3254	int free_space;
   3255	u32 right_nritems;
   3256	int ret = 0;
   3257
   3258	slot = path->slots[1];
   3259	if (slot == 0)
   3260		return 1;
   3261	if (!path->nodes[1])
   3262		return 1;
   3263
   3264	right_nritems = btrfs_header_nritems(right);
   3265	if (right_nritems == 0)
   3266		return 1;
   3267
   3268	btrfs_assert_tree_write_locked(path->nodes[1]);
   3269
   3270	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
   3271	/*
    3272	 * If slot - 1 is not valid or we failed to read the left node,
    3273	 * it's no big deal, just return.
   3274	 */
   3275	if (IS_ERR(left))
   3276		return 1;
   3277
   3278	__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
   3279
   3280	free_space = btrfs_leaf_free_space(left);
   3281	if (free_space < data_size) {
   3282		ret = 1;
   3283		goto out;
   3284	}
   3285
   3286	ret = btrfs_cow_block(trans, root, left,
   3287			      path->nodes[1], slot - 1, &left,
   3288			      BTRFS_NESTING_LEFT_COW);
   3289	if (ret) {
   3290		/* we hit -ENOSPC, but it isn't fatal here */
   3291		if (ret == -ENOSPC)
   3292			ret = 1;
   3293		goto out;
   3294	}
   3295
   3296	if (check_sibling_keys(left, right)) {
   3297		ret = -EUCLEAN;
   3298		goto out;
   3299	}
   3300	return __push_leaf_left(path, min_data_size,
   3301			       empty, left, free_space, right_nritems,
   3302			       max_slot);
   3303out:
   3304	btrfs_tree_unlock(left);
   3305	free_extent_buffer(left);
   3306	return ret;
   3307}
   3308
   3309/*
   3310 * split the path's leaf in two, making sure there is at least data_size
   3311 * available for the resulting leaf level of the path.
   3312 */
   3313static noinline void copy_for_split(struct btrfs_trans_handle *trans,
   3314				    struct btrfs_path *path,
   3315				    struct extent_buffer *l,
   3316				    struct extent_buffer *right,
   3317				    int slot, int mid, int nritems)
   3318{
   3319	struct btrfs_fs_info *fs_info = trans->fs_info;
   3320	int data_copy_size;
   3321	int rt_data_off;
   3322	int i;
   3323	struct btrfs_disk_key disk_key;
   3324	struct btrfs_map_token token;
   3325
   3326	nritems = nritems - mid;
   3327	btrfs_set_header_nritems(right, nritems);
   3328	data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
   3329
   3330	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
   3331			   btrfs_item_nr_offset(mid),
   3332			   nritems * sizeof(struct btrfs_item));
   3333
   3334	copy_extent_buffer(right, l,
   3335		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
   3336		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
   3337		     leaf_data_end(l), data_copy_size);
   3338
   3339	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
   3340
   3341	btrfs_init_map_token(&token, right);
   3342	for (i = 0; i < nritems; i++) {
   3343		u32 ioff;
   3344
   3345		ioff = btrfs_token_item_offset(&token, i);
   3346		btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
   3347	}
   3348
   3349	btrfs_set_header_nritems(l, mid);
   3350	btrfs_item_key(right, &disk_key, 0);
   3351	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
   3352
   3353	btrfs_mark_buffer_dirty(right);
   3354	btrfs_mark_buffer_dirty(l);
   3355	BUG_ON(path->slots[0] != slot);
   3356
   3357	if (mid <= slot) {
   3358		btrfs_tree_unlock(path->nodes[0]);
   3359		free_extent_buffer(path->nodes[0]);
   3360		path->nodes[0] = right;
   3361		path->slots[0] -= mid;
   3362		path->slots[1] += 1;
   3363	} else {
   3364		btrfs_tree_unlock(right);
   3365		free_extent_buffer(right);
   3366	}
   3367
   3368	BUG_ON(path->slots[0] < 0);
   3369}
   3370
   3371/*
   3372 * double splits happen when we need to insert a big item in the middle
   3373 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
   3374 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
   3375 *          A                 B                 C
   3376 *
   3377 * We avoid this by trying to push the items on either side of our target
   3378 * into the adjacent leaves.  If all goes well we can avoid the double split
   3379 * completely.
   3380 */
   3381static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
   3382					  struct btrfs_root *root,
   3383					  struct btrfs_path *path,
   3384					  int data_size)
   3385{
   3386	int ret;
   3387	int progress = 0;
   3388	int slot;
   3389	u32 nritems;
   3390	int space_needed = data_size;
   3391
   3392	slot = path->slots[0];
   3393	if (slot < btrfs_header_nritems(path->nodes[0]))
   3394		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
   3395
   3396	/*
   3397	 * try to push all the items after our slot into the
   3398	 * right leaf
   3399	 */
   3400	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
   3401	if (ret < 0)
   3402		return ret;
   3403
   3404	if (ret == 0)
   3405		progress++;
   3406
   3407	nritems = btrfs_header_nritems(path->nodes[0]);
   3408	/*
   3409	 * our goal is to get our slot at the start or end of a leaf.  If
   3410	 * we've done so we're done
   3411	 */
   3412	if (path->slots[0] == 0 || path->slots[0] == nritems)
   3413		return 0;
   3414
   3415	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
   3416		return 0;
   3417
    3418	/* try to push all the items before our slot into the left leaf */
   3419	slot = path->slots[0];
   3420	space_needed = data_size;
   3421	if (slot > 0)
   3422		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
   3423	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
   3424	if (ret < 0)
   3425		return ret;
   3426
   3427	if (ret == 0)
   3428		progress++;
   3429
   3430	if (progress)
   3431		return 0;
   3432	return 1;
   3433}
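
/*
 * Worked example (illustrative numbers): a leaf holds items in slots 0-99
 * and we must insert a large item at slot 50.  Neither the items before
 * slot 50 nor the ones after it fit in a single leaf together with the new
 * item, so split_leaf() would have to cut twice (the A/B/C picture above).
 * If push_leaf_right() can move the items after our slot into the right
 * sibling, or push_leaf_left() the ones before it into the left sibling,
 * our slot lands at the start or end of the leaf and a single split (or
 * none at all) is enough.
 */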
   3434
   3435/*
   3436 * split the path's leaf in two, making sure there is at least data_size
   3437 * available for the resulting leaf level of the path.
   3438 *
   3439 * returns 0 if all went well and < 0 on failure.
   3440 */
   3441static noinline int split_leaf(struct btrfs_trans_handle *trans,
   3442			       struct btrfs_root *root,
   3443			       const struct btrfs_key *ins_key,
   3444			       struct btrfs_path *path, int data_size,
   3445			       int extend)
   3446{
   3447	struct btrfs_disk_key disk_key;
   3448	struct extent_buffer *l;
   3449	u32 nritems;
   3450	int mid;
   3451	int slot;
   3452	struct extent_buffer *right;
   3453	struct btrfs_fs_info *fs_info = root->fs_info;
   3454	int ret = 0;
   3455	int wret;
   3456	int split;
   3457	int num_doubles = 0;
   3458	int tried_avoid_double = 0;
   3459
   3460	l = path->nodes[0];
   3461	slot = path->slots[0];
   3462	if (extend && data_size + btrfs_item_size(l, slot) +
   3463	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
   3464		return -EOVERFLOW;
   3465
   3466	/* first try to make some room by pushing left and right */
   3467	if (data_size && path->nodes[1]) {
   3468		int space_needed = data_size;
   3469
   3470		if (slot < btrfs_header_nritems(l))
   3471			space_needed -= btrfs_leaf_free_space(l);
   3472
   3473		wret = push_leaf_right(trans, root, path, space_needed,
   3474				       space_needed, 0, 0);
   3475		if (wret < 0)
   3476			return wret;
   3477		if (wret) {
   3478			space_needed = data_size;
   3479			if (slot > 0)
   3480				space_needed -= btrfs_leaf_free_space(l);
   3481			wret = push_leaf_left(trans, root, path, space_needed,
   3482					      space_needed, 0, (u32)-1);
   3483			if (wret < 0)
   3484				return wret;
   3485		}
   3486		l = path->nodes[0];
   3487
   3488		/* did the pushes work? */
   3489		if (btrfs_leaf_free_space(l) >= data_size)
   3490			return 0;
   3491	}
   3492
   3493	if (!path->nodes[1]) {
   3494		ret = insert_new_root(trans, root, path, 1);
   3495		if (ret)
   3496			return ret;
   3497	}
   3498again:
   3499	split = 1;
   3500	l = path->nodes[0];
   3501	slot = path->slots[0];
   3502	nritems = btrfs_header_nritems(l);
   3503	mid = (nritems + 1) / 2;
   3504
   3505	if (mid <= slot) {
   3506		if (nritems == 1 ||
   3507		    leaf_space_used(l, mid, nritems - mid) + data_size >
   3508			BTRFS_LEAF_DATA_SIZE(fs_info)) {
   3509			if (slot >= nritems) {
   3510				split = 0;
   3511			} else {
   3512				mid = slot;
   3513				if (mid != nritems &&
   3514				    leaf_space_used(l, mid, nritems - mid) +
   3515				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
   3516					if (data_size && !tried_avoid_double)
   3517						goto push_for_double;
   3518					split = 2;
   3519				}
   3520			}
   3521		}
   3522	} else {
   3523		if (leaf_space_used(l, 0, mid) + data_size >
   3524			BTRFS_LEAF_DATA_SIZE(fs_info)) {
   3525			if (!extend && data_size && slot == 0) {
   3526				split = 0;
   3527			} else if ((extend || !data_size) && slot == 0) {
   3528				mid = 1;
   3529			} else {
   3530				mid = slot;
   3531				if (mid != nritems &&
   3532				    leaf_space_used(l, mid, nritems - mid) +
   3533				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
   3534					if (data_size && !tried_avoid_double)
   3535						goto push_for_double;
   3536					split = 2;
   3537				}
   3538			}
   3539		}
   3540	}
   3541
   3542	if (split == 0)
   3543		btrfs_cpu_key_to_disk(&disk_key, ins_key);
   3544	else
   3545		btrfs_item_key(l, &disk_key, mid);
   3546
   3547	/*
    3548	 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
   3549	 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
   3550	 * subclasses, which is 8 at the time of this patch, and we've maxed it
   3551	 * out.  In the future we could add a
   3552	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
   3553	 * use BTRFS_NESTING_NEW_ROOT.
   3554	 */
   3555	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
   3556				       &disk_key, 0, l->start, 0,
   3557				       num_doubles ? BTRFS_NESTING_NEW_ROOT :
   3558				       BTRFS_NESTING_SPLIT);
   3559	if (IS_ERR(right))
   3560		return PTR_ERR(right);
   3561
   3562	root_add_used(root, fs_info->nodesize);
   3563
   3564	if (split == 0) {
   3565		if (mid <= slot) {
   3566			btrfs_set_header_nritems(right, 0);
   3567			insert_ptr(trans, path, &disk_key,
   3568				   right->start, path->slots[1] + 1, 1);
   3569			btrfs_tree_unlock(path->nodes[0]);
   3570			free_extent_buffer(path->nodes[0]);
   3571			path->nodes[0] = right;
   3572			path->slots[0] = 0;
   3573			path->slots[1] += 1;
   3574		} else {
   3575			btrfs_set_header_nritems(right, 0);
   3576			insert_ptr(trans, path, &disk_key,
   3577				   right->start, path->slots[1], 1);
   3578			btrfs_tree_unlock(path->nodes[0]);
   3579			free_extent_buffer(path->nodes[0]);
   3580			path->nodes[0] = right;
   3581			path->slots[0] = 0;
   3582			if (path->slots[1] == 0)
   3583				fixup_low_keys(path, &disk_key, 1);
   3584		}
   3585		/*
   3586		 * We create a new leaf 'right' for the required ins_len and
   3587		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
    3588		 * ins_len worth of content into it.
   3589		 */
   3590		return ret;
   3591	}
   3592
   3593	copy_for_split(trans, path, l, right, slot, mid, nritems);
   3594
   3595	if (split == 2) {
   3596		BUG_ON(num_doubles != 0);
   3597		num_doubles++;
   3598		goto again;
   3599	}
   3600
   3601	return 0;
   3602
   3603push_for_double:
   3604	push_for_double_split(trans, root, path, data_size);
   3605	tried_avoid_double = 1;
   3606	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
   3607		return 0;
   3608	goto again;
   3609}
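
/*
 * A worked example of the split point selection above (illustrative):
 * with nritems == 10 we first try mid == 5.  If slot == 7 and the items in
 * [5, 10) plus data_size do not fit in one leaf, we retry with mid == slot;
 * if even [7, 10) plus data_size overflows, we fall back to a double split
 * (split == 2) after giving push_for_double_split() a chance to avoid it.
 * slot >= nritems means we are appending, so a brand new empty right leaf
 * (split == 0) is all that is needed.
 */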
   3610
   3611static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
   3612					 struct btrfs_root *root,
   3613					 struct btrfs_path *path, int ins_len)
   3614{
   3615	struct btrfs_key key;
   3616	struct extent_buffer *leaf;
   3617	struct btrfs_file_extent_item *fi;
   3618	u64 extent_len = 0;
   3619	u32 item_size;
   3620	int ret;
   3621
   3622	leaf = path->nodes[0];
   3623	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
   3624
   3625	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
   3626	       key.type != BTRFS_EXTENT_CSUM_KEY);
   3627
   3628	if (btrfs_leaf_free_space(leaf) >= ins_len)
   3629		return 0;
   3630
   3631	item_size = btrfs_item_size(leaf, path->slots[0]);
   3632	if (key.type == BTRFS_EXTENT_DATA_KEY) {
   3633		fi = btrfs_item_ptr(leaf, path->slots[0],
   3634				    struct btrfs_file_extent_item);
   3635		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
   3636	}
   3637	btrfs_release_path(path);
   3638
   3639	path->keep_locks = 1;
   3640	path->search_for_split = 1;
   3641	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
   3642	path->search_for_split = 0;
   3643	if (ret > 0)
   3644		ret = -EAGAIN;
   3645	if (ret < 0)
   3646		goto err;
   3647
   3648	ret = -EAGAIN;
   3649	leaf = path->nodes[0];
   3650	/* if our item isn't there, return now */
   3651	if (item_size != btrfs_item_size(leaf, path->slots[0]))
   3652		goto err;
   3653
    3654	/* the leaf has changed, it now has room.  return now */
   3655	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
   3656		goto err;
   3657
   3658	if (key.type == BTRFS_EXTENT_DATA_KEY) {
   3659		fi = btrfs_item_ptr(leaf, path->slots[0],
   3660				    struct btrfs_file_extent_item);
   3661		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
   3662			goto err;
   3663	}
   3664
   3665	ret = split_leaf(trans, root, &key, path, ins_len, 1);
   3666	if (ret)
   3667		goto err;
   3668
   3669	path->keep_locks = 0;
   3670	btrfs_unlock_up_safe(path, 1);
   3671	return 0;
   3672err:
   3673	path->keep_locks = 0;
   3674	return ret;
   3675}
   3676
   3677static noinline int split_item(struct btrfs_path *path,
   3678			       const struct btrfs_key *new_key,
   3679			       unsigned long split_offset)
   3680{
   3681	struct extent_buffer *leaf;
   3682	int orig_slot, slot;
   3683	char *buf;
   3684	u32 nritems;
   3685	u32 item_size;
   3686	u32 orig_offset;
   3687	struct btrfs_disk_key disk_key;
   3688
   3689	leaf = path->nodes[0];
   3690	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
   3691
   3692	orig_slot = path->slots[0];
   3693	orig_offset = btrfs_item_offset(leaf, path->slots[0]);
   3694	item_size = btrfs_item_size(leaf, path->slots[0]);
   3695
   3696	buf = kmalloc(item_size, GFP_NOFS);
   3697	if (!buf)
   3698		return -ENOMEM;
   3699
   3700	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
   3701			    path->slots[0]), item_size);
   3702
   3703	slot = path->slots[0] + 1;
   3704	nritems = btrfs_header_nritems(leaf);
   3705	if (slot != nritems) {
   3706		/* shift the items */
   3707		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
   3708				btrfs_item_nr_offset(slot),
   3709				(nritems - slot) * sizeof(struct btrfs_item));
   3710	}
   3711
   3712	btrfs_cpu_key_to_disk(&disk_key, new_key);
   3713	btrfs_set_item_key(leaf, &disk_key, slot);
   3714
   3715	btrfs_set_item_offset(leaf, slot, orig_offset);
   3716	btrfs_set_item_size(leaf, slot, item_size - split_offset);
   3717
   3718	btrfs_set_item_offset(leaf, orig_slot,
   3719				 orig_offset + item_size - split_offset);
   3720	btrfs_set_item_size(leaf, orig_slot, split_offset);
   3721
   3722	btrfs_set_header_nritems(leaf, nritems + 1);
   3723
   3724	/* write the data for the start of the original item */
   3725	write_extent_buffer(leaf, buf,
   3726			    btrfs_item_ptr_offset(leaf, path->slots[0]),
   3727			    split_offset);
   3728
   3729	/* write the data for the new item */
   3730	write_extent_buffer(leaf, buf + split_offset,
   3731			    btrfs_item_ptr_offset(leaf, slot),
   3732			    item_size - split_offset);
   3733	btrfs_mark_buffer_dirty(leaf);
   3734
   3735	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
   3736	kfree(buf);
   3737	return 0;
   3738}
   3739
   3740/*
   3741 * This function splits a single item into two items,
   3742 * giving 'new_key' to the new item and splitting the
   3743 * old one at split_offset (from the start of the item).
   3744 *
   3745 * The path may be released by this operation.  After
   3746 * the split, the path is pointing to the old item.  The
   3747 * new item is going to be in the same node as the old one.
   3748 *
    3749 * Note, the item being split must be small enough to live alone on
   3750 * a tree block with room for one extra struct btrfs_item
   3751 *
   3752 * This allows us to split the item in place, keeping a lock on the
   3753 * leaf the entire time.
   3754 */
   3755int btrfs_split_item(struct btrfs_trans_handle *trans,
   3756		     struct btrfs_root *root,
   3757		     struct btrfs_path *path,
   3758		     const struct btrfs_key *new_key,
   3759		     unsigned long split_offset)
   3760{
   3761	int ret;
   3762	ret = setup_leaf_for_split(trans, root, path,
   3763				   sizeof(struct btrfs_item));
   3764	if (ret)
   3765		return ret;
   3766
   3767	ret = split_item(path, new_key, split_offset);
   3768	return ret;
   3769}
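
/*
 * Usage sketch (hypothetical caller, names assumed): split the csum item
 * the path points at so that the bytes from 'split_offset' onwards live in
 * a second item keyed at the split point.
 *
 *	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 *	key.offset += bytes_covered_by_front_half;
 *	ret = btrfs_split_item(trans, root, path, &key, split_offset);
 *	if (ret)
 *		return ret;
 *
 * On success the path still points at the front half; the new item holding
 * the tail sits at path->slots[0] + 1 in the same leaf.
 */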
   3770
   3771/*
   3772 * make the item pointed to by the path smaller.  new_size indicates
   3773 * how small to make it, and from_end tells us if we just chop bytes
   3774 * off the end of the item or if we shift the item to chop bytes off
   3775 * the front.
   3776 */
   3777void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
   3778{
   3779	int slot;
   3780	struct extent_buffer *leaf;
   3781	u32 nritems;
   3782	unsigned int data_end;
   3783	unsigned int old_data_start;
   3784	unsigned int old_size;
   3785	unsigned int size_diff;
   3786	int i;
   3787	struct btrfs_map_token token;
   3788
   3789	leaf = path->nodes[0];
   3790	slot = path->slots[0];
   3791
   3792	old_size = btrfs_item_size(leaf, slot);
   3793	if (old_size == new_size)
   3794		return;
   3795
   3796	nritems = btrfs_header_nritems(leaf);
   3797	data_end = leaf_data_end(leaf);
   3798
   3799	old_data_start = btrfs_item_offset(leaf, slot);
   3800
   3801	size_diff = old_size - new_size;
   3802
   3803	BUG_ON(slot < 0);
   3804	BUG_ON(slot >= nritems);
   3805
   3806	/*
   3807	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
   3808	 */
   3809	/* first correct the data pointers */
   3810	btrfs_init_map_token(&token, leaf);
   3811	for (i = slot; i < nritems; i++) {
   3812		u32 ioff;
   3813
   3814		ioff = btrfs_token_item_offset(&token, i);
   3815		btrfs_set_token_item_offset(&token, i, ioff + size_diff);
   3816	}
   3817
   3818	/* shift the data */
   3819	if (from_end) {
   3820		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
   3821			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
   3822			      data_end, old_data_start + new_size - data_end);
   3823	} else {
   3824		struct btrfs_disk_key disk_key;
   3825		u64 offset;
   3826
   3827		btrfs_item_key(leaf, &disk_key, slot);
   3828
   3829		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
   3830			unsigned long ptr;
   3831			struct btrfs_file_extent_item *fi;
   3832
   3833			fi = btrfs_item_ptr(leaf, slot,
   3834					    struct btrfs_file_extent_item);
   3835			fi = (struct btrfs_file_extent_item *)(
   3836			     (unsigned long)fi - size_diff);
   3837
   3838			if (btrfs_file_extent_type(leaf, fi) ==
   3839			    BTRFS_FILE_EXTENT_INLINE) {
   3840				ptr = btrfs_item_ptr_offset(leaf, slot);
   3841				memmove_extent_buffer(leaf, ptr,
   3842				      (unsigned long)fi,
   3843				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
   3844			}
   3845		}
   3846
   3847		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
   3848			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
   3849			      data_end, old_data_start - data_end);
   3850
   3851		offset = btrfs_disk_key_offset(&disk_key);
   3852		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
   3853		btrfs_set_item_key(leaf, &disk_key, slot);
   3854		if (slot == 0)
   3855			fixup_low_keys(path, &disk_key, 1);
   3856	}
   3857
   3858	btrfs_set_item_size(leaf, slot, new_size);
   3859	btrfs_mark_buffer_dirty(leaf);
   3860
   3861	if (btrfs_leaf_free_space(leaf) < 0) {
   3862		btrfs_print_leaf(leaf);
   3863		BUG();
   3864	}
   3865}
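
/*
 * Worked example (illustrative numbers): take an item whose data occupies
 * [3000, 3100) in the leaf data area (old_size == 100) with
 * leaf_data_end() == 2800, truncated to new_size == 60 (size_diff == 40).
 *
 * from_end == 1: [2800, 3060) is moved up to [2840, 3100), the item offset
 * becomes 3040 and the surviving bytes are the first 60 of the original.
 *
 * from_end == 0: [2800, 3000) is moved up to [2840, 3040), the item offset
 * again becomes 3040 but the surviving bytes are the last 60, and the
 * item's key.offset is bumped by size_diff so the key keeps describing the
 * same bytes.
 */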
   3866
   3867/*
   3868 * make the item pointed to by the path bigger, data_size is the added size.
   3869 */
   3870void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
   3871{
   3872	int slot;
   3873	struct extent_buffer *leaf;
   3874	u32 nritems;
   3875	unsigned int data_end;
   3876	unsigned int old_data;
   3877	unsigned int old_size;
   3878	int i;
   3879	struct btrfs_map_token token;
   3880
   3881	leaf = path->nodes[0];
   3882
   3883	nritems = btrfs_header_nritems(leaf);
   3884	data_end = leaf_data_end(leaf);
   3885
   3886	if (btrfs_leaf_free_space(leaf) < data_size) {
   3887		btrfs_print_leaf(leaf);
   3888		BUG();
   3889	}
   3890	slot = path->slots[0];
   3891	old_data = btrfs_item_data_end(leaf, slot);
   3892
   3893	BUG_ON(slot < 0);
   3894	if (slot >= nritems) {
   3895		btrfs_print_leaf(leaf);
   3896		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
   3897			   slot, nritems);
   3898		BUG();
   3899	}
   3900
   3901	/*
   3902	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
   3903	 */
   3904	/* first correct the data pointers */
   3905	btrfs_init_map_token(&token, leaf);
   3906	for (i = slot; i < nritems; i++) {
   3907		u32 ioff;
   3908
   3909		ioff = btrfs_token_item_offset(&token, i);
   3910		btrfs_set_token_item_offset(&token, i, ioff - data_size);
   3911	}
   3912
   3913	/* shift the data */
   3914	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
   3915		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
   3916		      data_end, old_data - data_end);
   3917
   3918	data_end = old_data;
   3919	old_size = btrfs_item_size(leaf, slot);
   3920	btrfs_set_item_size(leaf, slot, old_size + data_size);
   3921	btrfs_mark_buffer_dirty(leaf);
   3922
   3923	if (btrfs_leaf_free_space(leaf) < 0) {
   3924		btrfs_print_leaf(leaf);
   3925		BUG();
   3926	}
   3927}
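
/*
 * Worked example (illustrative numbers): extending an item whose data
 * occupies [3000, 3100) (old_size == 100) by data_size == 40 when
 * leaf_data_end() == 2800 moves [2800, 3100) down to [2760, 3060); the
 * item's offset becomes 2960 and its size 140, so the original payload now
 * sits at [2960, 3060) and the 40 new bytes at the end of the item are
 * left for the caller to fill, starting at old_size within the item.
 */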
   3928
   3929/**
   3930 * setup_items_for_insert - Helper called before inserting one or more items
    3931 * into a leaf. Main purpose is to save stack depth by doing the bulk of the work
   3932 * in a function that doesn't call btrfs_search_slot
   3933 *
   3934 * @root:	root we are inserting items to
   3935 * @path:	points to the leaf/slot where we are going to insert new items
   3936 * @batch:      information about the batch of items to insert
   3937 */
   3938static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
   3939				   const struct btrfs_item_batch *batch)
   3940{
   3941	struct btrfs_fs_info *fs_info = root->fs_info;
   3942	int i;
   3943	u32 nritems;
   3944	unsigned int data_end;
   3945	struct btrfs_disk_key disk_key;
   3946	struct extent_buffer *leaf;
   3947	int slot;
   3948	struct btrfs_map_token token;
   3949	u32 total_size;
   3950
   3951	/*
   3952	 * Before anything else, update keys in the parent and other ancestors
   3953	 * if needed, then release the write locks on them, so that other tasks
   3954	 * can use them while we modify the leaf.
   3955	 */
   3956	if (path->slots[0] == 0) {
   3957		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
   3958		fixup_low_keys(path, &disk_key, 1);
   3959	}
   3960	btrfs_unlock_up_safe(path, 1);
   3961
   3962	leaf = path->nodes[0];
   3963	slot = path->slots[0];
   3964
   3965	nritems = btrfs_header_nritems(leaf);
   3966	data_end = leaf_data_end(leaf);
   3967	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
   3968
   3969	if (btrfs_leaf_free_space(leaf) < total_size) {
   3970		btrfs_print_leaf(leaf);
   3971		btrfs_crit(fs_info, "not enough freespace need %u have %d",
   3972			   total_size, btrfs_leaf_free_space(leaf));
   3973		BUG();
   3974	}
   3975
   3976	btrfs_init_map_token(&token, leaf);
   3977	if (slot != nritems) {
   3978		unsigned int old_data = btrfs_item_data_end(leaf, slot);
   3979
   3980		if (old_data < data_end) {
   3981			btrfs_print_leaf(leaf);
   3982			btrfs_crit(fs_info,
   3983		"item at slot %d with data offset %u beyond data end of leaf %u",
   3984				   slot, old_data, data_end);
   3985			BUG();
   3986		}
   3987		/*
   3988		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
   3989		 */
   3990		/* first correct the data pointers */
   3991		for (i = slot; i < nritems; i++) {
   3992			u32 ioff;
   3993
   3994			ioff = btrfs_token_item_offset(&token, i);
   3995			btrfs_set_token_item_offset(&token, i,
   3996						       ioff - batch->total_data_size);
   3997		}
   3998		/* shift the items */
   3999		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
   4000			      btrfs_item_nr_offset(slot),
   4001			      (nritems - slot) * sizeof(struct btrfs_item));
   4002
   4003		/* shift the data */
   4004		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
   4005				      data_end - batch->total_data_size,
   4006				      BTRFS_LEAF_DATA_OFFSET + data_end,
   4007				      old_data - data_end);
   4008		data_end = old_data;
   4009	}
   4010
   4011	/* setup the item for the new data */
   4012	for (i = 0; i < batch->nr; i++) {
   4013		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
   4014		btrfs_set_item_key(leaf, &disk_key, slot + i);
   4015		data_end -= batch->data_sizes[i];
   4016		btrfs_set_token_item_offset(&token, slot + i, data_end);
   4017		btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
   4018	}
   4019
   4020	btrfs_set_header_nritems(leaf, nritems + batch->nr);
   4021	btrfs_mark_buffer_dirty(leaf);
   4022
   4023	if (btrfs_leaf_free_space(leaf) < 0) {
   4024		btrfs_print_leaf(leaf);
   4025		BUG();
   4026	}
   4027}
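
/*
 * The leaf layout this function maintains, for reference:
 *
 *	[header][item 0][item 1]..[item N-1]  ..free..  [data N-1]..[data 0]
 *
 * Item headers grow upwards from the front of the leaf while item data
 * grows downwards from the end, and leaf_data_end() is the lowest used
 * data offset.  Inserting a batch of nr items at 'slot' therefore shifts
 * the headers [slot, nritems) up by nr positions, shifts their data down
 * by total_data_size bytes, and carves the new data areas out of the gap.
 */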
   4028
   4029/*
   4030 * Insert a new item into a leaf.
   4031 *
   4032 * @root:      The root of the btree.
   4033 * @path:      A path pointing to the target leaf and slot.
   4034 * @key:       The key of the new item.
   4035 * @data_size: The size of the data associated with the new key.
   4036 */
   4037void btrfs_setup_item_for_insert(struct btrfs_root *root,
   4038				 struct btrfs_path *path,
   4039				 const struct btrfs_key *key,
   4040				 u32 data_size)
   4041{
   4042	struct btrfs_item_batch batch;
   4043
   4044	batch.keys = key;
   4045	batch.data_sizes = &data_size;
   4046	batch.total_data_size = data_size;
   4047	batch.nr = 1;
   4048
   4049	setup_items_for_insert(root, path, &batch);
   4050}
   4051
   4052/*
   4053 * Given a key and some data, insert items into the tree.
   4054 * This does all the path init required, making room in the tree if needed.
   4055 */
   4056int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
   4057			    struct btrfs_root *root,
   4058			    struct btrfs_path *path,
   4059			    const struct btrfs_item_batch *batch)
   4060{
   4061	int ret = 0;
   4062	int slot;
   4063	u32 total_size;
   4064
   4065	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
   4066	ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
   4067	if (ret == 0)
   4068		return -EEXIST;
   4069	if (ret < 0)
   4070		return ret;
   4071
   4072	slot = path->slots[0];
   4073	BUG_ON(slot < 0);
   4074
   4075	setup_items_for_insert(root, path, batch);
   4076	return 0;
   4077}
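
/*
 * Usage sketch (hypothetical, names assumed): reserve room for two new
 * items in one leaf operation.  The keys must be in sorted order and must
 * not already exist in the tree.
 *
 *	struct btrfs_key keys[2];
 *	u32 sizes[2] = { first_size, second_size };
 *	struct btrfs_item_batch batch = {
 *		.keys = keys,
 *		.data_sizes = sizes,
 *		.total_data_size = first_size + second_size,
 *		.nr = 2,
 *	};
 *
 *	ret = btrfs_insert_empty_items(trans, root, path, &batch);
 *
 * On success the items at path->slots[0] and path->slots[0] + 1 are ready
 * to be filled in and dirtied by the caller.
 */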
   4078
   4079/*
   4080 * Given a key and some data, insert an item into the tree.
   4081 * This does all the path init required, making room in the tree if needed.
   4082 */
   4083int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
   4084		      const struct btrfs_key *cpu_key, void *data,
   4085		      u32 data_size)
   4086{
   4087	int ret = 0;
   4088	struct btrfs_path *path;
   4089	struct extent_buffer *leaf;
   4090	unsigned long ptr;
   4091
   4092	path = btrfs_alloc_path();
   4093	if (!path)
   4094		return -ENOMEM;
   4095	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
   4096	if (!ret) {
   4097		leaf = path->nodes[0];
   4098		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
   4099		write_extent_buffer(leaf, data, ptr, data_size);
   4100		btrfs_mark_buffer_dirty(leaf);
   4101	}
   4102	btrfs_free_path(path);
   4103	return ret;
   4104}
   4105
   4106/*
   4107 * This function duplicates an item, giving 'new_key' to the new item.
   4108 * It guarantees both items live in the same tree leaf and the new item is
   4109 * contiguous with the original item.
   4110 *
   4111 * This allows us to split a file extent in place, keeping a lock on the leaf
   4112 * the entire time.
   4113 */
   4114int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
   4115			 struct btrfs_root *root,
   4116			 struct btrfs_path *path,
   4117			 const struct btrfs_key *new_key)
   4118{
   4119	struct extent_buffer *leaf;
   4120	int ret;
   4121	u32 item_size;
   4122
   4123	leaf = path->nodes[0];
   4124	item_size = btrfs_item_size(leaf, path->slots[0]);
   4125	ret = setup_leaf_for_split(trans, root, path,
   4126				   item_size + sizeof(struct btrfs_item));
   4127	if (ret)
   4128		return ret;
   4129
   4130	path->slots[0]++;
   4131	btrfs_setup_item_for_insert(root, path, new_key, item_size);
   4132	leaf = path->nodes[0];
   4133	memcpy_extent_buffer(leaf,
   4134			     btrfs_item_ptr_offset(leaf, path->slots[0]),
   4135			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
   4136			     item_size);
   4137	return 0;
   4138}
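
/*
 * Usage sketch (hypothetical, mirroring what the file extent splitting
 * code does): duplicate the extent item under a key at the split position,
 * so the caller can then shrink the original and advance the copy.
 *
 *	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 *	new_key = key;
 *	new_key.offset = split_pos;
 *	ret = btrfs_duplicate_item(trans, root, path, &new_key);
 *	if (ret)
 *		return ret;
 *
 * Afterwards path->slots[0] points at the copy; the original sits at
 * path->slots[0] - 1 in the same leaf.
 */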
   4139
   4140/*
   4141 * delete the pointer from a given node.
   4142 *
   4143 * the tree should have been previously balanced so the deletion does not
   4144 * empty a node.
   4145 */
   4146static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
   4147		    int level, int slot)
   4148{
   4149	struct extent_buffer *parent = path->nodes[level];
   4150	u32 nritems;
   4151	int ret;
   4152
   4153	nritems = btrfs_header_nritems(parent);
   4154	if (slot != nritems - 1) {
   4155		if (level) {
   4156			ret = btrfs_tree_mod_log_insert_move(parent, slot,
   4157					slot + 1, nritems - slot - 1);
   4158			BUG_ON(ret < 0);
   4159		}
   4160		memmove_extent_buffer(parent,
   4161			      btrfs_node_key_ptr_offset(slot),
   4162			      btrfs_node_key_ptr_offset(slot + 1),
   4163			      sizeof(struct btrfs_key_ptr) *
   4164			      (nritems - slot - 1));
   4165	} else if (level) {
   4166		ret = btrfs_tree_mod_log_insert_key(parent, slot,
   4167				BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
   4168		BUG_ON(ret < 0);
   4169	}
   4170
   4171	nritems--;
   4172	btrfs_set_header_nritems(parent, nritems);
   4173	if (nritems == 0 && parent == root->node) {
   4174		BUG_ON(btrfs_header_level(root->node) != 1);
   4175		/* just turn the root into a leaf and break */
   4176		btrfs_set_header_level(root->node, 0);
   4177	} else if (slot == 0) {
   4178		struct btrfs_disk_key disk_key;
   4179
   4180		btrfs_node_key(parent, &disk_key, 0);
   4181		fixup_low_keys(path, &disk_key, level + 1);
   4182	}
   4183	btrfs_mark_buffer_dirty(parent);
   4184}
   4185
   4186/*
   4187 * a helper function to delete the leaf pointed to by path->slots[1] and
   4188 * path->nodes[1].
   4189 *
   4190 * This deletes the pointer in path->nodes[1] and frees the leaf
   4191 * block extent.  zero is returned if it all worked out, < 0 otherwise.
    4192 * block extent.
    4193 *
    4194 * The path must have already been set up for deleting the leaf, including
   4195 */
   4196static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
   4197				    struct btrfs_root *root,
   4198				    struct btrfs_path *path,
   4199				    struct extent_buffer *leaf)
   4200{
   4201	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
   4202	del_ptr(root, path, 1, path->slots[1]);
   4203
   4204	/*
   4205	 * btrfs_free_extent is expensive, we want to make sure we
   4206	 * aren't holding any locks when we call it
   4207	 */
   4208	btrfs_unlock_up_safe(path, 0);
   4209
   4210	root_sub_used(root, leaf->len);
   4211
   4212	atomic_inc(&leaf->refs);
   4213	btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
   4214	free_extent_buffer_stale(leaf);
    4215}

    4216/*
   4217 * delete the item at the leaf level in path.  If that empties
   4218 * the leaf, remove it from the tree
   4219 */
   4220int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
   4221		    struct btrfs_path *path, int slot, int nr)
   4222{
   4223	struct btrfs_fs_info *fs_info = root->fs_info;
   4224	struct extent_buffer *leaf;
   4225	int ret = 0;
   4226	int wret;
   4227	u32 nritems;
   4228
   4229	leaf = path->nodes[0];
   4230	nritems = btrfs_header_nritems(leaf);
   4231
   4232	if (slot + nr != nritems) {
   4233		const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
   4234		const int data_end = leaf_data_end(leaf);
   4235		struct btrfs_map_token token;
   4236		u32 dsize = 0;
   4237		int i;
   4238
   4239		for (i = 0; i < nr; i++)
   4240			dsize += btrfs_item_size(leaf, slot + i);
   4241
   4242		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
   4243			      data_end + dsize,
   4244			      BTRFS_LEAF_DATA_OFFSET + data_end,
   4245			      last_off - data_end);
   4246
   4247		btrfs_init_map_token(&token, leaf);
   4248		for (i = slot + nr; i < nritems; i++) {
   4249			u32 ioff;
   4250
   4251			ioff = btrfs_token_item_offset(&token, i);
   4252			btrfs_set_token_item_offset(&token, i, ioff + dsize);
   4253		}
   4254
   4255		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
   4256			      btrfs_item_nr_offset(slot + nr),
   4257			      sizeof(struct btrfs_item) *
   4258			      (nritems - slot - nr));
   4259	}
   4260	btrfs_set_header_nritems(leaf, nritems - nr);
   4261	nritems -= nr;
   4262
   4263	/* delete the leaf if we've emptied it */
   4264	if (nritems == 0) {
   4265		if (leaf == root->node) {
   4266			btrfs_set_header_level(leaf, 0);
   4267		} else {
   4268			btrfs_clean_tree_block(leaf);
   4269			btrfs_del_leaf(trans, root, path, leaf);
   4270		}
   4271	} else {
   4272		int used = leaf_space_used(leaf, 0, nritems);
   4273		if (slot == 0) {
   4274			struct btrfs_disk_key disk_key;
   4275
   4276			btrfs_item_key(leaf, &disk_key, 0);
   4277			fixup_low_keys(path, &disk_key, 1);
   4278		}
   4279
   4280		/*
   4281		 * Try to delete the leaf if it is mostly empty. We do this by
   4282		 * trying to move all its items into its left and right neighbours.
   4283		 * If we can't move all the items, then we don't delete it - it's
   4284		 * not ideal, but future insertions might fill the leaf with more
   4285		 * items, or items from other leaves might be moved later into our
   4286		 * leaf due to deletions on those leaves.
   4287		 */
   4288		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
   4289			u32 min_push_space;
   4290
    4291			/*
    4292			 * push_leaf_left fixes the path; make sure it
    4293			 * still points to our leaf for the del_ptr call below
    4294			 */
   4295			slot = path->slots[1];
   4296			atomic_inc(&leaf->refs);
   4297			/*
   4298			 * We want to be able to at least push one item to the
   4299			 * left neighbour leaf, and that's the first item.
   4300			 */
   4301			min_push_space = sizeof(struct btrfs_item) +
   4302				btrfs_item_size(leaf, 0);
   4303			wret = push_leaf_left(trans, root, path, 0,
   4304					      min_push_space, 1, (u32)-1);
   4305			if (wret < 0 && wret != -ENOSPC)
   4306				ret = wret;
   4307
   4308			if (path->nodes[0] == leaf &&
   4309			    btrfs_header_nritems(leaf)) {
   4310				/*
   4311				 * If we were not able to push all items from our
   4312				 * leaf to its left neighbour, then attempt to
   4313				 * either push all the remaining items to the
   4314				 * right neighbour or none. There's no advantage
   4315				 * in pushing only some items, instead of all, as
   4316				 * it's pointless to end up with a leaf having
   4317				 * too few items while the neighbours can be full
   4318				 * or nearly full.
   4319				 */
   4320				nritems = btrfs_header_nritems(leaf);
   4321				min_push_space = leaf_space_used(leaf, 0, nritems);
   4322				wret = push_leaf_right(trans, root, path, 0,
   4323						       min_push_space, 1, 0);
   4324				if (wret < 0 && wret != -ENOSPC)
   4325					ret = wret;
   4326			}
   4327
   4328			if (btrfs_header_nritems(leaf) == 0) {
   4329				path->slots[1] = slot;
   4330				btrfs_del_leaf(trans, root, path, leaf);
   4331				free_extent_buffer(leaf);
   4332				ret = 0;
   4333			} else {
   4334				/* if we're still in the path, make sure
   4335				 * we're dirty.  Otherwise, one of the
   4336				 * push_leaf functions must have already
   4337				 * dirtied this buffer
   4338				 */
   4339				if (path->nodes[0] == leaf)
   4340					btrfs_mark_buffer_dirty(leaf);
   4341				free_extent_buffer(leaf);
   4342			}
   4343		} else {
   4344			btrfs_mark_buffer_dirty(leaf);
   4345		}
   4346	}
   4347	return ret;
   4348}
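
/*
 * Usage sketch (hypothetical): delete the single item matching 'key'.
 * btrfs_del_item() in ctree.h wraps the nr == 1 case.
 *
 *	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 *	if (ret < 0)
 *		return ret;
 *	if (ret > 0)
 *		return -ENOENT;		(key not found)
 *	ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
 */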
   4349
   4350/*
    4351 * search the tree again to find a leaf with lesser keys.
   4352 * returns 0 if it found something or 1 if there are no lesser leaves.
   4353 * returns < 0 on io errors.
   4354 *
   4355 * This may release the path, and so you may lose any locks held at the
   4356 * time you call it.
   4357 */
   4358int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
   4359{
   4360	struct btrfs_key key;
   4361	struct btrfs_disk_key found_key;
   4362	int ret;
   4363
   4364	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
   4365
   4366	if (key.offset > 0) {
   4367		key.offset--;
   4368	} else if (key.type > 0) {
   4369		key.type--;
   4370		key.offset = (u64)-1;
   4371	} else if (key.objectid > 0) {
   4372		key.objectid--;
   4373		key.type = (u8)-1;
   4374		key.offset = (u64)-1;
   4375	} else {
   4376		return 1;
   4377	}
   4378
   4379	btrfs_release_path(path);
   4380	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   4381	if (ret < 0)
   4382		return ret;
   4383	btrfs_item_key(path->nodes[0], &found_key, 0);
   4384	ret = comp_keys(&found_key, &key);
   4385	/*
   4386	 * We might have had an item with the previous key in the tree right
   4387	 * before we released our path. And after we released our path, that
   4388	 * item might have been pushed to the first slot (0) of the leaf we
   4389	 * were holding due to a tree balance. Alternatively, an item with the
   4390	 * previous key can exist as the only element of a leaf (big fat item).
   4391	 * Therefore account for these 2 cases, so that our callers (like
   4392	 * btrfs_previous_item) don't miss an existing item with a key matching
   4393	 * the previous key we computed above.
   4394	 */
   4395	if (ret <= 0)
   4396		return 0;
   4397	return 1;
   4398}
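
/*
 * Example of the key decrement above (illustrative): keys order by
 * (objectid, type, offset), so the largest key strictly smaller than
 * (42, 12, 0) is (42, 11, (u64)-1), and the largest key smaller than
 * (42, 0, 0) is (41, (u8)-1, (u64)-1).  Searching for the computed key
 * lands on the last leaf that can possibly contain lesser keys.
 */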
   4399
   4400/*
   4401 * A helper function to walk down the tree starting at min_key, and looking
    4402 * for nodes or leaves that have a minimum transaction id.
    4403 * This is used by the btree defrag code and by tree logging.
   4404 *
   4405 * This does not cow, but it does stuff the starting key it finds back
   4406 * into min_key, so you can call btrfs_search_slot with cow=1 on the
   4407 * key and get a writable path.
   4408 *
   4409 * This honors path->lowest_level to prevent descent past a given level
   4410 * of the tree.
   4411 *
   4412 * min_trans indicates the oldest transaction that you are interested
   4413 * in walking through.  Any nodes or leaves older than min_trans are
   4414 * skipped over (without reading them).
   4415 *
   4416 * returns zero if something useful was found, < 0 on error and 1 if there
   4417 * was nothing in the tree that matched the search criteria.
   4418 */
   4419int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
   4420			 struct btrfs_path *path,
   4421			 u64 min_trans)
   4422{
   4423	struct extent_buffer *cur;
   4424	struct btrfs_key found_key;
   4425	int slot;
   4426	int sret;
   4427	u32 nritems;
   4428	int level;
   4429	int ret = 1;
   4430	int keep_locks = path->keep_locks;
   4431
   4432	path->keep_locks = 1;
   4433again:
   4434	cur = btrfs_read_lock_root_node(root);
   4435	level = btrfs_header_level(cur);
   4436	WARN_ON(path->nodes[level]);
   4437	path->nodes[level] = cur;
   4438	path->locks[level] = BTRFS_READ_LOCK;
   4439
   4440	if (btrfs_header_generation(cur) < min_trans) {
   4441		ret = 1;
   4442		goto out;
   4443	}
   4444	while (1) {
   4445		nritems = btrfs_header_nritems(cur);
   4446		level = btrfs_header_level(cur);
   4447		sret = btrfs_bin_search(cur, min_key, &slot);
   4448		if (sret < 0) {
   4449			ret = sret;
   4450			goto out;
   4451		}
   4452
   4453		/* at the lowest level, we're done, setup the path and exit */
   4454		if (level == path->lowest_level) {
   4455			if (slot >= nritems)
   4456				goto find_next_key;
   4457			ret = 0;
   4458			path->slots[level] = slot;
   4459			btrfs_item_key_to_cpu(cur, &found_key, slot);
   4460			goto out;
   4461		}
   4462		if (sret && slot > 0)
   4463			slot--;
   4464		/*
    4465		 * check this node pointer against the min_trans parameter.
   4466		 * If it is too old, skip to the next one.
   4467		 */
   4468		while (slot < nritems) {
   4469			u64 gen;
   4470
   4471			gen = btrfs_node_ptr_generation(cur, slot);
   4472			if (gen < min_trans) {
   4473				slot++;
   4474				continue;
   4475			}
   4476			break;
   4477		}
   4478find_next_key:
   4479		/*
   4480		 * we didn't find a candidate key in this node, walk forward
   4481		 * and find another one
   4482		 */
   4483		if (slot >= nritems) {
   4484			path->slots[level] = slot;
   4485			sret = btrfs_find_next_key(root, path, min_key, level,
   4486						  min_trans);
   4487			if (sret == 0) {
   4488				btrfs_release_path(path);
   4489				goto again;
   4490			} else {
   4491				goto out;
   4492			}
   4493		}
   4494		/* save our key for returning back */
   4495		btrfs_node_key_to_cpu(cur, &found_key, slot);
   4496		path->slots[level] = slot;
   4497		if (level == path->lowest_level) {
   4498			ret = 0;
   4499			goto out;
   4500		}
   4501		cur = btrfs_read_node_slot(cur, slot);
   4502		if (IS_ERR(cur)) {
   4503			ret = PTR_ERR(cur);
   4504			goto out;
   4505		}
   4506
   4507		btrfs_tree_read_lock(cur);
   4508
   4509		path->locks[level - 1] = BTRFS_READ_LOCK;
   4510		path->nodes[level - 1] = cur;
   4511		unlock_up(path, level, 1, 0, NULL);
   4512	}
   4513out:
   4514	path->keep_locks = keep_locks;
   4515	if (ret == 0) {
   4516		btrfs_unlock_up_safe(path, path->lowest_level + 1);
   4517		memcpy(min_key, &found_key, sizeof(found_key));
   4518	}
   4519	return ret;
   4520}
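
/*
 * Usage sketch (hypothetical, modelled on the defrag-style scan): visit
 * every key reachable through nodes newer than 'min_trans'.
 *
 *	min_key.objectid = 0;
 *	min_key.type = 0;
 *	min_key.offset = 0;
 *	path->lowest_level = 0;
 *	while (btrfs_search_forward(root, &min_key, path, min_trans) == 0) {
 *		process(path, &min_key);	(hypothetical per-item hook)
 *		btrfs_release_path(path);
 *		then advance min_key past the key just returned, bumping
 *		offset and carrying into type/objectid on overflow
 *	}
 */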
   4521
   4522/*
   4523 * this is similar to btrfs_next_leaf, but does not try to preserve
   4524 * and fixup the path.  It looks for and returns the next key in the
   4525 * tree based on the current path and the min_trans parameters.
   4526 *
   4527 * 0 is returned if another key is found, < 0 if there are any errors
   4528 * and 1 is returned if there are no higher keys in the tree
   4529 *
   4530 * path->keep_locks should be set to 1 on the search made before
   4531 * calling this function.
   4532 */
   4533int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
   4534			struct btrfs_key *key, int level, u64 min_trans)
   4535{
   4536	int slot;
   4537	struct extent_buffer *c;
   4538
   4539	WARN_ON(!path->keep_locks && !path->skip_locking);
   4540	while (level < BTRFS_MAX_LEVEL) {
   4541		if (!path->nodes[level])
   4542			return 1;
   4543
   4544		slot = path->slots[level] + 1;
   4545		c = path->nodes[level];
   4546next:
   4547		if (slot >= btrfs_header_nritems(c)) {
   4548			int ret;
   4549			int orig_lowest;
   4550			struct btrfs_key cur_key;
   4551			if (level + 1 >= BTRFS_MAX_LEVEL ||
   4552			    !path->nodes[level + 1])
   4553				return 1;
   4554
   4555			if (path->locks[level + 1] || path->skip_locking) {
   4556				level++;
   4557				continue;
   4558			}
   4559
   4560			slot = btrfs_header_nritems(c) - 1;
   4561			if (level == 0)
   4562				btrfs_item_key_to_cpu(c, &cur_key, slot);
   4563			else
   4564				btrfs_node_key_to_cpu(c, &cur_key, slot);
   4565
   4566			orig_lowest = path->lowest_level;
   4567			btrfs_release_path(path);
   4568			path->lowest_level = level;
   4569			ret = btrfs_search_slot(NULL, root, &cur_key, path,
   4570						0, 0);
   4571			path->lowest_level = orig_lowest;
   4572			if (ret < 0)
   4573				return ret;
   4574
   4575			c = path->nodes[level];
   4576			slot = path->slots[level];
   4577			if (ret == 0)
   4578				slot++;
   4579			goto next;
   4580		}
   4581
   4582		if (level == 0)
   4583			btrfs_item_key_to_cpu(c, key, slot);
   4584		else {
   4585			u64 gen = btrfs_node_ptr_generation(c, slot);
   4586
   4587			if (gen < min_trans) {
   4588				slot++;
   4589				goto next;
   4590			}
   4591			btrfs_node_key_to_cpu(c, key, slot);
   4592		}
   4593		return 0;
   4594	}
   4595	return 1;
   4596}
   4597
   4598int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
   4599			u64 time_seq)
   4600{
   4601	int slot;
   4602	int level;
   4603	struct extent_buffer *c;
   4604	struct extent_buffer *next;
   4605	struct btrfs_fs_info *fs_info = root->fs_info;
   4606	struct btrfs_key key;
   4607	bool need_commit_sem = false;
   4608	u32 nritems;
   4609	int ret;
   4610	int i;
   4611
   4612	nritems = btrfs_header_nritems(path->nodes[0]);
   4613	if (nritems == 0)
   4614		return 1;
   4615
   4616	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
   4617again:
   4618	level = 1;
   4619	next = NULL;
   4620	btrfs_release_path(path);
   4621
   4622	path->keep_locks = 1;
   4623
   4624	if (time_seq) {
   4625		ret = btrfs_search_old_slot(root, &key, path, time_seq);
   4626	} else {
   4627		if (path->need_commit_sem) {
   4628			path->need_commit_sem = 0;
   4629			need_commit_sem = true;
   4630			down_read(&fs_info->commit_root_sem);
   4631		}
   4632		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   4633	}
   4634	path->keep_locks = 0;
   4635
   4636	if (ret < 0)
   4637		goto done;
   4638
   4639	nritems = btrfs_header_nritems(path->nodes[0]);
   4640	/*
   4641	 * by releasing the path above we dropped all our locks.  A balance
   4642	 * could have added more items next to the key that used to be
   4643	 * at the very end of the block.  So, check again here and
   4644	 * advance the path if there are now more items available.
   4645	 */
   4646	if (nritems > 0 && path->slots[0] < nritems - 1) {
   4647		if (ret == 0)
   4648			path->slots[0]++;
   4649		ret = 0;
   4650		goto done;
   4651	}
   4652	/*
   4653	 * So the above check misses one case:
   4654	 * - after releasing the path above, someone has removed the item that
    4655	 *   used to be at the very end of the block, and a balance between
    4656	 *   leaves gets another one with a bigger key.offset to replace it.
    4657	 *
    4658	 * This one should be returned as well, or we can get leaf corruption
    4659	 * later (especially in __btrfs_drop_extents()).
    4660	 *
    4661	 * A bit more explanation of this check:
   4662	 * with ret > 0, the key isn't found, the path points to the slot
   4663	 * where it should be inserted, so the path->slots[0] item must be the
   4664	 * bigger one.
   4665	 */
   4666	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
   4667		ret = 0;
   4668		goto done;
   4669	}
   4670
   4671	while (level < BTRFS_MAX_LEVEL) {
   4672		if (!path->nodes[level]) {
   4673			ret = 1;
   4674			goto done;
   4675		}
   4676
   4677		slot = path->slots[level] + 1;
   4678		c = path->nodes[level];
   4679		if (slot >= btrfs_header_nritems(c)) {
   4680			level++;
   4681			if (level == BTRFS_MAX_LEVEL) {
   4682				ret = 1;
   4683				goto done;
   4684			}
   4685			continue;
   4686		}
    4687
   4689		/*
   4690		 * Our current level is where we're going to start from, and to
   4691		 * make sure lockdep doesn't complain we need to drop our locks
   4692		 * and nodes from 0 to our current level.
   4693		 */
   4694		for (i = 0; i < level; i++) {
    4695			if (path->locks[i]) {
   4696				btrfs_tree_read_unlock(path->nodes[i]);
   4697				path->locks[i] = 0;
   4698			}
   4699			free_extent_buffer(path->nodes[i]);
   4700			path->nodes[i] = NULL;
   4701		}
   4702
   4703		next = c;
   4704		ret = read_block_for_search(root, path, &next, level,
   4705					    slot, &key);
   4706		if (ret == -EAGAIN)
   4707			goto again;
   4708
   4709		if (ret < 0) {
   4710			btrfs_release_path(path);
   4711			goto done;
   4712		}
   4713
   4714		if (!path->skip_locking) {
   4715			ret = btrfs_try_tree_read_lock(next);
   4716			if (!ret && time_seq) {
   4717				/*
   4718				 * If we don't get the lock, we may be racing
   4719				 * with push_leaf_left, holding that lock while
   4720				 * itself waiting for the leaf we've currently
   4721				 * locked. To solve this situation, we give up
   4722				 * on our lock and cycle.
   4723				 */
   4724				free_extent_buffer(next);
   4725				btrfs_release_path(path);
   4726				cond_resched();
   4727				goto again;
   4728			}
   4729			if (!ret)
   4730				btrfs_tree_read_lock(next);
   4731		}
   4732		break;
   4733	}
   4734	path->slots[level] = slot;
   4735	while (1) {
   4736		level--;
   4737		path->nodes[level] = next;
   4738		path->slots[level] = 0;
   4739		if (!path->skip_locking)
   4740			path->locks[level] = BTRFS_READ_LOCK;
   4741		if (!level)
   4742			break;
   4743
   4744		ret = read_block_for_search(root, path, &next, level,
   4745					    0, &key);
   4746		if (ret == -EAGAIN)
   4747			goto again;
   4748
   4749		if (ret < 0) {
   4750			btrfs_release_path(path);
   4751			goto done;
   4752		}
   4753
   4754		if (!path->skip_locking)
   4755			btrfs_tree_read_lock(next);
   4756	}
   4757	ret = 0;
   4758done:
   4759	unlock_up(path, 0, 1, 0, NULL);
   4760	if (need_commit_sem) {
   4761		int ret2;
   4762
   4763		path->need_commit_sem = 1;
   4764		ret2 = finish_need_commit_sem_search(path);
   4765		up_read(&fs_info->commit_root_sem);
   4766		if (ret2)
   4767			ret = ret2;
   4768	}
   4769
   4770	return ret;
   4771}
   4772
   4773/*
   4774 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
   4775 * searching until it gets past min_objectid or finds an item of 'type'
   4776 *
   4777 * returns 0 if something is found, 1 if nothing was found and < 0 on error
   4778 */
   4779int btrfs_previous_item(struct btrfs_root *root,
   4780			struct btrfs_path *path, u64 min_objectid,
   4781			int type)
   4782{
   4783	struct btrfs_key found_key;
   4784	struct extent_buffer *leaf;
   4785	u32 nritems;
   4786	int ret;
   4787
   4788	while (1) {
   4789		if (path->slots[0] == 0) {
   4790			ret = btrfs_prev_leaf(root, path);
   4791			if (ret != 0)
   4792				return ret;
   4793		} else {
   4794			path->slots[0]--;
   4795		}
   4796		leaf = path->nodes[0];
   4797		nritems = btrfs_header_nritems(leaf);
   4798		if (nritems == 0)
   4799			return 1;
   4800		if (path->slots[0] == nritems)
   4801			path->slots[0]--;
   4802
   4803		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
   4804		if (found_key.objectid < min_objectid)
   4805			break;
   4806		if (found_key.type == type)
   4807			return 0;
   4808		if (found_key.objectid == min_objectid &&
   4809		    found_key.type < type)
   4810			break;
   4811	}
   4812	return 1;
   4813}
   4814
   4815/*
   4816 * search in extent tree to find a previous Metadata/Data extent item with
    4817 * min objectid.
   4818 *
   4819 * returns 0 if something is found, 1 if nothing was found and < 0 on error
   4820 */
   4821int btrfs_previous_extent_item(struct btrfs_root *root,
   4822			struct btrfs_path *path, u64 min_objectid)
   4823{
   4824	struct btrfs_key found_key;
   4825	struct extent_buffer *leaf;
   4826	u32 nritems;
   4827	int ret;
   4828
   4829	while (1) {
   4830		if (path->slots[0] == 0) {
   4831			ret = btrfs_prev_leaf(root, path);
   4832			if (ret != 0)
   4833				return ret;
   4834		} else {
   4835			path->slots[0]--;
   4836		}
   4837		leaf = path->nodes[0];
   4838		nritems = btrfs_header_nritems(leaf);
   4839		if (nritems == 0)
   4840			return 1;
   4841		if (path->slots[0] == nritems)
   4842			path->slots[0]--;
   4843
   4844		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
   4845		if (found_key.objectid < min_objectid)
   4846			break;
   4847		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
   4848		    found_key.type == BTRFS_METADATA_ITEM_KEY)
   4849			return 0;
   4850		if (found_key.objectid == min_objectid &&
   4851		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
   4852			break;
   4853	}
   4854	return 1;
   4855}