cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

tree-log.c (200109B)


// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "inode-item.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 *  2a is actually the more important variant.  Without the extra logging
 *  a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};
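
/*
 * Editor's sketch (assumption, not part of the original file): log recovery
 * is expected to run one full walk of each log tree per stage, in increasing
 * stage order, roughly:
 *
 *	for (wc.stage = LOG_WALK_PIN_ONLY; wc.stage <= LOG_WALK_REPLAY_ALL;
 *	     wc.stage++)
 *		walk_log_tree(trans, log, &wc);
 *
 * where walk_log_tree() is a hypothetical name for the walker driven by the
 * struct walk_control defined further below.
 */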

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */
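
/*
 * Editor's sketch (assumption, not part of the original file): an fsync is
 * expected to drive this machinery roughly as follows, using the helpers
 * defined below:
 *
 *	ret = start_log_trans(trans, root, ctx);
 *	if (!ret) {
 *		ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx);
 *		btrfs_end_log_trans(root);
 *	}
 *
 * after which a log commit writes the per-subvolume log tree to disk and
 * completes the fsync without a full transaction commit.
 */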

/*
 * start a sub transaction and set up the log tree
 * this increments the log tree writer count to make people syncing
 * the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	const bool zoned = btrfs_is_zoned(fs_info);
	int ret = 0;
	bool created = false;

	/*
	 * First check if the log root tree was already created. If not, create
	 * it before locking the root's log_mutex, just to keep lockdep happy.
	 */
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
		mutex_lock(&tree_root->log_mutex);
		if (!fs_info->log_root_tree) {
			ret = btrfs_init_log_root_tree(trans, fs_info);
			if (!ret) {
				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
				created = true;
			}
		}
		mutex_unlock(&tree_root->log_mutex);
		if (ret)
			return ret;
	}

	mutex_lock(&root->log_mutex);

again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		if (btrfs_need_log_full_commit(trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		/*
		 * This means fs_info->log_root_tree was already created
		 * for some other FS trees. Do the full commit not to mix
		 * nodes from multiple log transactions to do sequential
		 * writing.
		 */
		if (zoned && !created) {
			ret = -EAGAIN;
			goto out;
		}

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_writers);
	if (!ctx->logging_new_name) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there were no transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	const bool zoned = btrfs_is_zoned(root->fs_info);
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		ret = 0;
		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}

static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	filemap_fdatawait_range(buf->pages[0]->mapping,
			        buf->start, buf->start + buf->len - 1);
}

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};
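
/*
 * Editor's sketch (assumption, not part of the original file): the pin-only
 * recovery stage is expected to configure a walk_control along these lines,
 * with process_one_buffer() below as the per-block callback:
 *
 *	struct walk_control wc = {
 *		.pin = 1,
 *		.stage = LOG_WALK_PIN_ONLY,
 *		.trans = trans,
 *		.process_func = process_one_buffer,
 *	};
 */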

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin) {
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);
		if (ret)
			return ret;

		if (btrfs_buffer_uptodate(eb, gen, 0) &&
		    btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
	}
	return ret;
}

static int do_overwrite_item(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct extent_buffer *eb, int slot,
			     struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* Our caller must have done a search for the key for us. */
	ASSERT(path->nodes[0] != NULL);

	/*
	 * And the slot must point to the exact key or the slot where the key
	 * should be at (the first item with a key greater than 'key')
	 */
	if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
		struct btrfs_key found_key;

		btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
		ret = btrfs_comp_cpu_keys(&found_key, key);
		ASSERT(ret >= 0);
	} else {
		ret = 1;
	}

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * They have the same contents, so just return. This saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static int overwrite_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct btrfs_path *path,
			  struct extent_buffer *eb, int slot,
			  struct btrfs_key *key)
{
	int ret;

	/* Look for the key in the destination tree. */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	return do_overwrite_item(trans, root, path, eb, slot, key);
}
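
/*
 * Editor's note (illustrative assumption): during replay, items found in log
 * leaves are expected to be copied into the subvolume with calls of the form
 *
 *	btrfs_item_key_to_cpu(eb, &key, slot);
 *	ret = overwrite_item(trans, root, path, eb, slot, &key);
 *
 * as replay_one_extent() below does for inline extents.
 */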

/*
 * simple helper to read an inode off the disk from a given root.
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, objectid, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before btrfs_drop_extents runs
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
			btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	drop_args.start = start;
	drop_args.end = extent_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				(unsigned long)item,  sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record the dirty extent: here we did a shallow
		 * file extent item copy and skipped the normal backref
		 * update, modifying the extent tree all by ourselves.
		 * So we need to manually record the dirty extent for qgroup,
		 * as the owner of the file extent changed from the log tree
		 * (doesn't affect qgroup) to the fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						ins.offset);
			if (ret < 0) {
				goto out;
			} else if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset, 0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher will end up
			 * looking at the second csum item only, which does
			 * not contain the checksum for any block starting
			 * at offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				struct btrfs_root *csum_root;

				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				csum_root = btrfs_csum_root(fs_info,
							    sums->bytenr);
				if (!ret)
					ret = btrfs_del_csums(trans, csum_root,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
								     csum_root,
								     sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

update_inode:
	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
	iput(inode);
	return ret;
}

static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
				       struct btrfs_inode *dir,
				       struct btrfs_inode *inode,
				       const char *name,
				       int name_len)
{
	int ret;

	ret = btrfs_unlink_inode(trans, dir, inode, name, name_len);
	if (ret)
		return ret;
	/*
	 * Whenever we need to check if a name exists or not, we check the
	 * fs/subvolume tree. So after an unlink we must run delayed items, so
	 * that future checks for a name during log replay see that the name
	 * does not exist anymore.
	 */
	return btrfs_run_delayed_items(trans);
}
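
/*
 * Editor's note (illustrative assumption): the replay helpers below are
 * expected to unlink through this wrapper rather than calling
 * btrfs_unlink_inode() directly, e.g.:
 *
 *	ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode),
 *					  name, name_len);
 *
 * so that delayed items are flushed and later name lookups in the
 * fs/subvolume tree observe the unlink.
 */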

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_root *root = dir->root;
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name,
			name_len);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int ret = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}

	btrfs_release_path(path);
	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid == objectid)
			ret = 1;
	}
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item;
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid,
						       name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}
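
/*
 * Editor's note (illustrative assumption): callers are expected to treat the
 * return value as a tristate, as __add_inode_ref() below does:
 *
 *	ret = backref_in_log(log_root, &search_key, parent_objectid,
 *			     victim_name, victim_name_len);
 *	if (ret < 0)
 *		return ret;
 *	else if (!ret)
 *		unlink the victim name, since it is not in the log
 */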

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory?
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  If so, we allow them to stay;
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = unlink_inode_for_log_replay(trans, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched the root tree and checked the
		 * corresponding ref, so we do not need to check it again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = unlink_inode_for_log_replay(trans,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

/*
 * Take an inode reference item from the log tree and iterate all names from the
 * inode reference item in the subvolume tree with the same key (if it exists).
 * For any name that is not in the inode reference item from the log tree, do a
 * proper unlink of that name (that is, remove its entry from the inode
 * reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, name,
							       namelen);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
							   name, namelen);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
 out:
	btrfs_release_path(path);
	return ret;
}

static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
				  const u8 ref_type, const char *name,
				  const int namelen)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = ref_type;
	if (key.type == BTRFS_INODE_REF_KEY)
		key.offset = parent_id;
	else
		key.offset = btrfs_extref_hash(parent_id, name, namelen);

	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (key.type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
				path->slots[0], parent_id, name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
						   name, namelen);

out:
	btrfs_free_path(path);
	return ret;
}

static int add_link(struct btrfs_trans_handle *trans,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_dir_item *dir_item;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct inode *other_inode = NULL;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	dir_item = btrfs_lookup_dir_item(NULL, root, path,
					 btrfs_ino(BTRFS_I(dir)),
					 name, namelen, 0);
	if (!dir_item) {
		btrfs_release_path(path);
		goto add_link;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		goto out;
	}

	/*
	 * Our inode's dentry collides with the dentry of another inode which is
	 * in the log but not yet processed since it has a higher inode number.
	 * So delete that other dentry.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
	btrfs_release_path(path);
	other_inode = read_one_inode(root, key.objectid);
	if (!other_inode) {
		ret = -ENOENT;
		goto out;
	}
	ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode),
					  name, namelen);
	if (ret)
		goto out;
	/*
	 * If we dropped the link count to 0, bump it so that later the iput()
	 * on the inode will not free it. We will fixup the link count later.
	 */
	if (other_inode->i_nlink == 0)
		inc_nlink(other_inode);
add_link:
	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
out:
	iput(other_inode);
	btrfs_free_path(path);

	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				   btrfs_ino(BTRFS_I(inode)), ref_index,
				   name, namelen);
		if (ret < 0) {
			goto out;
		} else if (ret == 0) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/*
			 * If a reference item already exists for this inode
			 * with the same parent and name, but different index,
			 * drop it and the corresponding directory index entries
			 * from the parent before adding the new reference item
			 * and dir index entries, otherwise we would fail with
			 * -EEXIST returned from btrfs_add_link() below.
			 */
			ret = btrfs_inode_ref_exists(inode, dir, key->type,
						     name, namelen);
			if (ret > 0) {
				ret = unlink_inode_for_log_replay(trans,
							 BTRFS_I(dir),
							 BTRFS_I(inode),
							 name, namelen);
				/*
				 * If we dropped the link count to 0, bump it so
				 * that later the iput() on the inode will not
				 * free it. We will fixup the link count later.
				 */
				if (!ret && inode->i_nlink == 0)
					inc_nlink(inode);
			}
			if (ret < 0)
				goto out;

			/* insert our name */
			ret = add_link(trans, dir, inode, name, namelen,
				       ref_index);
			if (ret)
				goto out;

			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
			if (ret)
				goto out;
		}
		/* Else, ret == 1, we already have a perfect match, we're done. */

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}
   1651
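        /*
         * Count the number of names stored in extended reference items
         * (BTRFS_INODE_EXTREF_KEY) for the given inode. Each item packs one
         * or more entries back to back, every entry being a struct
         * btrfs_inode_extref immediately followed by its name, which is why
         * the inner loop advances by sizeof(*extref) + name_len. Returns the
         * number of names found or a negative errno on failure.
         */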
   1652static int count_inode_extrefs(struct btrfs_root *root,
   1653		struct btrfs_inode *inode, struct btrfs_path *path)
   1654{
   1655	int ret = 0;
   1656	int name_len;
   1657	unsigned int nlink = 0;
   1658	u32 item_size;
   1659	u32 cur_offset = 0;
   1660	u64 inode_objectid = btrfs_ino(inode);
   1661	u64 offset = 0;
   1662	unsigned long ptr;
   1663	struct btrfs_inode_extref *extref;
   1664	struct extent_buffer *leaf;
   1665
   1666	while (1) {
   1667		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
   1668					    &extref, &offset);
   1669		if (ret)
   1670			break;
   1671
   1672		leaf = path->nodes[0];
   1673		item_size = btrfs_item_size(leaf, path->slots[0]);
   1674		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
   1675		cur_offset = 0;
   1676
   1677		while (cur_offset < item_size) {
   1678			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
   1679			name_len = btrfs_inode_extref_name_len(leaf, extref);
   1680
   1681			nlink++;
   1682
   1683			cur_offset += name_len + sizeof(*extref);
   1684		}
   1685
   1686		offset++;
   1687		btrfs_release_path(path);
   1688	}
   1689	btrfs_release_path(path);
   1690
   1691	if (ret < 0 && ret != -ENOENT)
   1692		return ret;
   1693	return nlink;
   1694}
   1695
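        /*
         * Count the number of names stored in regular reference items
         * (BTRFS_INODE_REF_KEY) for the given inode, searching from the
         * highest possible key offset backwards. Each item may pack several
         * struct btrfs_inode_ref entries back to back, each one followed by
         * its name. Returns the number of names found; a failed tree search
         * simply ends the scan early.
         */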
   1696static int count_inode_refs(struct btrfs_root *root,
   1697			struct btrfs_inode *inode, struct btrfs_path *path)
   1698{
   1699	int ret;
   1700	struct btrfs_key key;
   1701	unsigned int nlink = 0;
   1702	unsigned long ptr;
   1703	unsigned long ptr_end;
   1704	int name_len;
   1705	u64 ino = btrfs_ino(inode);
   1706
   1707	key.objectid = ino;
   1708	key.type = BTRFS_INODE_REF_KEY;
   1709	key.offset = (u64)-1;
   1710
   1711	while (1) {
   1712		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   1713		if (ret < 0)
   1714			break;
   1715		if (ret > 0) {
   1716			if (path->slots[0] == 0)
   1717				break;
   1718			path->slots[0]--;
   1719		}
   1720process_slot:
   1721		btrfs_item_key_to_cpu(path->nodes[0], &key,
   1722				      path->slots[0]);
   1723		if (key.objectid != ino ||
   1724		    key.type != BTRFS_INODE_REF_KEY)
   1725			break;
   1726		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
   1727		ptr_end = ptr + btrfs_item_size(path->nodes[0],
   1728						   path->slots[0]);
   1729		while (ptr < ptr_end) {
   1730			struct btrfs_inode_ref *ref;
   1731
   1732			ref = (struct btrfs_inode_ref *)ptr;
   1733			name_len = btrfs_inode_ref_name_len(path->nodes[0],
   1734							    ref);
   1735			ptr = (unsigned long)(ref + 1) + name_len;
   1736			nlink++;
   1737		}
   1738
   1739		if (key.offset == 0)
   1740			break;
   1741		if (path->slots[0] > 0) {
   1742			path->slots[0]--;
   1743			goto process_slot;
   1744		}
   1745		key.offset--;
   1746		btrfs_release_path(path);
   1747	}
   1748	btrfs_release_path(path);
   1749
   1750	return nlink;
   1751}
   1752
   1753/*
   1754 * There are a few corners where the link count of the file can't
   1755 * be properly maintained during replay.  So, instead of adding
   1756 * lots of complexity to the log code, we just scan the backrefs
   1757 * for any file that has been through replay.
   1758 *
   1759 * The scan will update the link count on the inode to reflect the
   1760 * number of back refs found.  If it goes down to zero, the iput
   1761 * will free the inode.
   1762 */
   1763static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
   1764					   struct btrfs_root *root,
   1765					   struct inode *inode)
   1766{
   1767	struct btrfs_path *path;
   1768	int ret;
   1769	u64 nlink = 0;
   1770	u64 ino = btrfs_ino(BTRFS_I(inode));
   1771
   1772	path = btrfs_alloc_path();
   1773	if (!path)
   1774		return -ENOMEM;
   1775
   1776	ret = count_inode_refs(root, BTRFS_I(inode), path);
   1777	if (ret < 0)
   1778		goto out;
   1779
   1780	nlink = ret;
   1781
   1782	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
   1783	if (ret < 0)
   1784		goto out;
   1785
   1786	nlink += ret;
   1787
   1788	ret = 0;
   1789
   1790	if (nlink != inode->i_nlink) {
   1791		set_nlink(inode, nlink);
   1792		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
   1793		if (ret)
   1794			goto out;
   1795	}
   1796	BTRFS_I(inode)->index_cnt = (u64)-1;
   1797
   1798	if (inode->i_nlink == 0) {
   1799		if (S_ISDIR(inode->i_mode)) {
   1800			ret = replay_dir_deletes(trans, root, NULL, path,
   1801						 ino, 1);
   1802			if (ret)
   1803				goto out;
   1804		}
   1805		ret = btrfs_insert_orphan_item(trans, root, ino);
   1806		if (ret == -EEXIST)
   1807			ret = 0;
   1808	}
   1809
   1810out:
   1811	btrfs_free_path(path);
   1812	return ret;
   1813}
   1814
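        /*
         * Walk all the orphan items recorded under
         * BTRFS_TREE_LOG_FIXUP_OBJECTID, deleting each item and recounting
         * the link count of the inode it refers to (the inode number is
         * stored in the key offset). Since fixing up a directory can create
         * new fixup entries, the search restarts from the highest possible
         * offset after every iteration.
         */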
   1815static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
   1816					    struct btrfs_root *root,
   1817					    struct btrfs_path *path)
   1818{
   1819	int ret;
   1820	struct btrfs_key key;
   1821	struct inode *inode;
   1822
   1823	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
   1824	key.type = BTRFS_ORPHAN_ITEM_KEY;
   1825	key.offset = (u64)-1;
   1826	while (1) {
   1827		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
   1828		if (ret < 0)
   1829			break;
   1830
   1831		if (ret == 1) {
   1832			ret = 0;
   1833			if (path->slots[0] == 0)
   1834				break;
   1835			path->slots[0]--;
   1836		}
   1837
   1838		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
   1839		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
   1840		    key.type != BTRFS_ORPHAN_ITEM_KEY)
   1841			break;
   1842
   1843		ret = btrfs_del_item(trans, root, path);
   1844		if (ret)
   1845			break;
   1846
   1847		btrfs_release_path(path);
   1848		inode = read_one_inode(root, key.offset);
   1849		if (!inode) {
   1850			ret = -EIO;
   1851			break;
   1852		}
   1853
   1854		ret = fixup_inode_link_count(trans, root, inode);
   1855		iput(inode);
   1856		if (ret)
   1857			break;
   1858
   1859		/*
   1860		 * fixup on a directory may create new entries,
    1861		 * make sure we always look for the highest possible
   1862		 * offset
   1863		 */
   1864		key.offset = (u64)-1;
   1865	}
   1866	btrfs_release_path(path);
   1867	return ret;
   1868}
   1869
   1870
   1871/*
   1872 * record a given inode in the fixup dir so we can check its link
   1873 * count when replay is done.  The link count is incremented here
   1874 * so the inode won't go away until we check it
   1875 */
   1876static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
   1877				      struct btrfs_root *root,
   1878				      struct btrfs_path *path,
   1879				      u64 objectid)
   1880{
   1881	struct btrfs_key key;
   1882	int ret = 0;
   1883	struct inode *inode;
   1884
   1885	inode = read_one_inode(root, objectid);
   1886	if (!inode)
   1887		return -EIO;
   1888
   1889	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
   1890	key.type = BTRFS_ORPHAN_ITEM_KEY;
   1891	key.offset = objectid;
   1892
   1893	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
   1894
   1895	btrfs_release_path(path);
   1896	if (ret == 0) {
   1897		if (!inode->i_nlink)
   1898			set_nlink(inode, 1);
   1899		else
   1900			inc_nlink(inode);
   1901		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
   1902	} else if (ret == -EEXIST) {
   1903		ret = 0;
   1904	}
   1905	iput(inode);
   1906
   1907	return ret;
   1908}
   1909
   1910/*
   1911 * when replaying the log for a directory, we only insert names
   1912 * for inodes that actually exist.  This means an fsync on a directory
   1913 * does not implicitly fsync all the new files in it
   1914 */
   1915static noinline int insert_one_name(struct btrfs_trans_handle *trans,
   1916				    struct btrfs_root *root,
   1917				    u64 dirid, u64 index,
   1918				    char *name, int name_len,
   1919				    struct btrfs_key *location)
   1920{
   1921	struct inode *inode;
   1922	struct inode *dir;
   1923	int ret;
   1924
   1925	inode = read_one_inode(root, location->objectid);
   1926	if (!inode)
   1927		return -ENOENT;
   1928
   1929	dir = read_one_inode(root, dirid);
   1930	if (!dir) {
   1931		iput(inode);
   1932		return -EIO;
   1933	}
   1934
   1935	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
   1936			name_len, 1, index);
   1937
   1938	/* FIXME, put inode into FIXUP list */
   1939
   1940	iput(inode);
   1941	iput(dir);
   1942	return ret;
   1943}
   1944
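        /*
         * Decide what to do with a directory entry that conflicts with the
         * entry being replayed from the log. Returns 1 if the existing entry
         * already matches the logged one (nothing to do), 0 if it was kept
         * because the inode for the new entry does not exist, and otherwise
         * drops the conflicting entry, returning 0 on success or a negative
         * errno.
         */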
   1945static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
   1946					struct btrfs_inode *dir,
   1947					struct btrfs_path *path,
   1948					struct btrfs_dir_item *dst_di,
   1949					const struct btrfs_key *log_key,
   1950					u8 log_type,
   1951					bool exists)
   1952{
   1953	struct btrfs_key found_key;
   1954
   1955	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
   1956	/* The existing dentry points to the same inode, don't delete it. */
   1957	if (found_key.objectid == log_key->objectid &&
   1958	    found_key.type == log_key->type &&
   1959	    found_key.offset == log_key->offset &&
   1960	    btrfs_dir_type(path->nodes[0], dst_di) == log_type)
   1961		return 1;
   1962
   1963	/*
   1964	 * Don't drop the conflicting directory entry if the inode for the new
   1965	 * entry doesn't exist.
   1966	 */
   1967	if (!exists)
   1968		return 0;
   1969
   1970	return drop_one_dir_item(trans, path, dir, dst_di);
   1971}
   1972
   1973/*
   1974 * take a single entry in a log directory item and replay it into
   1975 * the subvolume.
   1976 *
   1977 * if a conflicting item exists in the subdirectory already,
   1978 * the inode it points to is unlinked and put into the link count
   1979 * fix up tree.
   1980 *
   1981 * If a name from the log points to a file or directory that does
   1982 * not exist in the FS, it is skipped.  fsyncs on directories
   1983 * do not force down inodes inside that directory, just changes to the
   1984 * names or unlinks in a directory.
   1985 *
   1986 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
   1987 * non-existing inode) and 1 if the name was replayed.
   1988 */
   1989static noinline int replay_one_name(struct btrfs_trans_handle *trans,
   1990				    struct btrfs_root *root,
   1991				    struct btrfs_path *path,
   1992				    struct extent_buffer *eb,
   1993				    struct btrfs_dir_item *di,
   1994				    struct btrfs_key *key)
   1995{
   1996	char *name;
   1997	int name_len;
   1998	struct btrfs_dir_item *dir_dst_di;
   1999	struct btrfs_dir_item *index_dst_di;
   2000	bool dir_dst_matches = false;
   2001	bool index_dst_matches = false;
   2002	struct btrfs_key log_key;
   2003	struct btrfs_key search_key;
   2004	struct inode *dir;
   2005	u8 log_type;
   2006	bool exists;
   2007	int ret;
   2008	bool update_size = true;
   2009	bool name_added = false;
   2010
   2011	dir = read_one_inode(root, key->objectid);
   2012	if (!dir)
   2013		return -EIO;
   2014
   2015	name_len = btrfs_dir_name_len(eb, di);
   2016	name = kmalloc(name_len, GFP_NOFS);
   2017	if (!name) {
   2018		ret = -ENOMEM;
   2019		goto out;
   2020	}
   2021
   2022	log_type = btrfs_dir_type(eb, di);
   2023	read_extent_buffer(eb, name, (unsigned long)(di + 1),
   2024		   name_len);
   2025
   2026	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
   2027	ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
   2028	btrfs_release_path(path);
   2029	if (ret < 0)
   2030		goto out;
   2031	exists = (ret == 0);
   2032	ret = 0;
   2033
   2034	dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
   2035					   name, name_len, 1);
   2036	if (IS_ERR(dir_dst_di)) {
   2037		ret = PTR_ERR(dir_dst_di);
   2038		goto out;
   2039	} else if (dir_dst_di) {
   2040		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
   2041						   dir_dst_di, &log_key, log_type,
   2042						   exists);
   2043		if (ret < 0)
   2044			goto out;
   2045		dir_dst_matches = (ret == 1);
   2046	}
   2047
   2048	btrfs_release_path(path);
   2049
   2050	index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
   2051						   key->objectid, key->offset,
   2052						   name, name_len, 1);
   2053	if (IS_ERR(index_dst_di)) {
   2054		ret = PTR_ERR(index_dst_di);
   2055		goto out;
   2056	} else if (index_dst_di) {
   2057		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
   2058						   index_dst_di, &log_key,
   2059						   log_type, exists);
   2060		if (ret < 0)
   2061			goto out;
   2062		index_dst_matches = (ret == 1);
   2063	}
   2064
   2065	btrfs_release_path(path);
   2066
   2067	if (dir_dst_matches && index_dst_matches) {
   2068		ret = 0;
   2069		update_size = false;
   2070		goto out;
   2071	}
   2072
   2073	/*
   2074	 * Check if the inode reference exists in the log for the given name,
   2075	 * inode and parent inode
   2076	 */
   2077	search_key.objectid = log_key.objectid;
   2078	search_key.type = BTRFS_INODE_REF_KEY;
   2079	search_key.offset = key->objectid;
   2080	ret = backref_in_log(root->log_root, &search_key, 0, name, name_len);
   2081	if (ret < 0) {
   2082	        goto out;
   2083	} else if (ret) {
   2084	        /* The dentry will be added later. */
   2085	        ret = 0;
   2086	        update_size = false;
   2087	        goto out;
   2088	}
   2089
   2090	search_key.objectid = log_key.objectid;
   2091	search_key.type = BTRFS_INODE_EXTREF_KEY;
   2092	search_key.offset = key->objectid;
   2093	ret = backref_in_log(root->log_root, &search_key, key->objectid, name,
   2094			     name_len);
   2095	if (ret < 0) {
   2096		goto out;
   2097	} else if (ret) {
   2098		/* The dentry will be added later. */
   2099		ret = 0;
   2100		update_size = false;
   2101		goto out;
   2102	}
   2103	btrfs_release_path(path);
   2104	ret = insert_one_name(trans, root, key->objectid, key->offset,
   2105			      name, name_len, &log_key);
   2106	if (ret && ret != -ENOENT && ret != -EEXIST)
   2107		goto out;
   2108	if (!ret)
   2109		name_added = true;
   2110	update_size = false;
   2111	ret = 0;
   2112
   2113out:
   2114	if (!ret && update_size) {
   2115		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
   2116		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
   2117	}
   2118	kfree(name);
   2119	iput(dir);
   2120	if (!ret && name_added)
   2121		ret = 1;
   2122	return ret;
   2123}
   2124
   2125/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
   2126static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
   2127					struct btrfs_root *root,
   2128					struct btrfs_path *path,
   2129					struct extent_buffer *eb, int slot,
   2130					struct btrfs_key *key)
   2131{
   2132	int ret;
   2133	struct btrfs_dir_item *di;
   2134
   2135	/* We only log dir index keys, which only contain a single dir item. */
   2136	ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
   2137
   2138	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
   2139	ret = replay_one_name(trans, root, path, eb, di, key);
   2140	if (ret < 0)
   2141		return ret;
   2142
   2143	/*
    2144	 * If this entry refers to a non-directory (directories cannot have a
   2145	 * link count > 1) and it was added in the transaction that was not
   2146	 * committed, make sure we fixup the link count of the inode the entry
   2147	 * points to. Otherwise something like the following would result in a
    2148	 * directory pointing to an inode with a wrong link count that does not
    2149	 * account for this dir entry:
   2150	 *
   2151	 * mkdir testdir
   2152	 * touch testdir/foo
   2153	 * touch testdir/bar
   2154	 * sync
   2155	 *
   2156	 * ln testdir/bar testdir/bar_link
   2157	 * ln testdir/foo testdir/foo_link
   2158	 * xfs_io -c "fsync" testdir/bar
   2159	 *
   2160	 * <power failure>
   2161	 *
   2162	 * mount fs, log replay happens
   2163	 *
   2164	 * File foo would remain with a link count of 1 when it has two entries
   2165	 * pointing to it in the directory testdir. This would make it impossible
    2166	 * to ever delete the parent directory as it would result in stale
   2167	 * dentries that can never be deleted.
   2168	 */
   2169	if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
   2170		struct btrfs_path *fixup_path;
   2171		struct btrfs_key di_key;
   2172
   2173		fixup_path = btrfs_alloc_path();
   2174		if (!fixup_path)
   2175			return -ENOMEM;
   2176
   2177		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
   2178		ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
   2179		btrfs_free_path(fixup_path);
   2180	}
   2181
   2182	return ret;
   2183}
   2184
   2185/*
   2186 * directory replay has two parts.  There are the standard directory
   2187 * items in the log copied from the subvolume, and range items
   2188 * created in the log while the subvolume was logged.
   2189 *
   2190 * The range items tell us which parts of the key space the log
   2191 * is authoritative for.  During replay, if a key in the subvolume
   2192 * directory is in a logged range item, but not actually in the log
   2193 * that means it was deleted from the directory before the fsync
   2194 * and should be removed.
   2195 */
   2196static noinline int find_dir_range(struct btrfs_root *root,
   2197				   struct btrfs_path *path,
   2198				   u64 dirid,
   2199				   u64 *start_ret, u64 *end_ret)
   2200{
   2201	struct btrfs_key key;
   2202	u64 found_end;
   2203	struct btrfs_dir_log_item *item;
   2204	int ret;
   2205	int nritems;
   2206
   2207	if (*start_ret == (u64)-1)
   2208		return 1;
   2209
   2210	key.objectid = dirid;
   2211	key.type = BTRFS_DIR_LOG_INDEX_KEY;
   2212	key.offset = *start_ret;
   2213
   2214	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   2215	if (ret < 0)
   2216		goto out;
   2217	if (ret > 0) {
   2218		if (path->slots[0] == 0)
   2219			goto out;
   2220		path->slots[0]--;
   2221	}
   2222	if (ret != 0)
   2223		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
   2224
   2225	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
   2226		ret = 1;
   2227		goto next;
   2228	}
   2229	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
   2230			      struct btrfs_dir_log_item);
   2231	found_end = btrfs_dir_log_end(path->nodes[0], item);
   2232
   2233	if (*start_ret >= key.offset && *start_ret <= found_end) {
   2234		ret = 0;
   2235		*start_ret = key.offset;
   2236		*end_ret = found_end;
   2237		goto out;
   2238	}
   2239	ret = 1;
   2240next:
   2241	/* check the next slot in the tree to see if it is a valid item */
   2242	nritems = btrfs_header_nritems(path->nodes[0]);
   2243	path->slots[0]++;
   2244	if (path->slots[0] >= nritems) {
   2245		ret = btrfs_next_leaf(root, path);
   2246		if (ret)
   2247			goto out;
   2248	}
   2249
   2250	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
   2251
   2252	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
   2253		ret = 1;
   2254		goto out;
   2255	}
   2256	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
   2257			      struct btrfs_dir_log_item);
   2258	found_end = btrfs_dir_log_end(path->nodes[0], item);
   2259	*start_ret = key.offset;
   2260	*end_ret = found_end;
   2261	ret = 0;
   2262out:
   2263	btrfs_release_path(path);
   2264	return ret;
   2265}
   2266
   2267/*
   2268 * this looks for a given directory item in the log.  If the directory
   2269 * item is not in the log, the item is removed and the inode it points
   2270 * to is unlinked
   2271 */
   2272static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
   2273				      struct btrfs_root *log,
   2274				      struct btrfs_path *path,
   2275				      struct btrfs_path *log_path,
   2276				      struct inode *dir,
   2277				      struct btrfs_key *dir_key)
   2278{
   2279	struct btrfs_root *root = BTRFS_I(dir)->root;
   2280	int ret;
   2281	struct extent_buffer *eb;
   2282	int slot;
   2283	struct btrfs_dir_item *di;
   2284	int name_len;
   2285	char *name;
   2286	struct inode *inode = NULL;
   2287	struct btrfs_key location;
   2288
   2289	/*
    2290	 * Currently we only log dir index keys. Even if we replay a log created
   2291	 * by an older kernel that logged both dir index and dir item keys, all
   2292	 * we need to do is process the dir index keys, we (and our caller) can
   2293	 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
   2294	 */
   2295	ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
   2296
   2297	eb = path->nodes[0];
   2298	slot = path->slots[0];
   2299	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
   2300	name_len = btrfs_dir_name_len(eb, di);
   2301	name = kmalloc(name_len, GFP_NOFS);
   2302	if (!name) {
   2303		ret = -ENOMEM;
   2304		goto out;
   2305	}
   2306
   2307	read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len);
   2308
   2309	if (log) {
   2310		struct btrfs_dir_item *log_di;
   2311
   2312		log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
   2313						     dir_key->objectid,
   2314						     dir_key->offset,
   2315						     name, name_len, 0);
   2316		if (IS_ERR(log_di)) {
   2317			ret = PTR_ERR(log_di);
   2318			goto out;
   2319		} else if (log_di) {
   2320			/* The dentry exists in the log, we have nothing to do. */
   2321			ret = 0;
   2322			goto out;
   2323		}
   2324	}
   2325
   2326	btrfs_dir_item_key_to_cpu(eb, di, &location);
   2327	btrfs_release_path(path);
   2328	btrfs_release_path(log_path);
   2329	inode = read_one_inode(root, location.objectid);
   2330	if (!inode) {
   2331		ret = -EIO;
   2332		goto out;
   2333	}
   2334
   2335	ret = link_to_fixup_dir(trans, root, path, location.objectid);
   2336	if (ret)
   2337		goto out;
   2338
   2339	inc_nlink(inode);
   2340	ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
   2341					  name, name_len);
   2342	/*
   2343	 * Unlike dir item keys, dir index keys can only have one name (entry) in
   2344	 * them, as there are no key collisions since each key has a unique offset
   2345	 * (an index number), so we're done.
   2346	 */
   2347out:
   2348	btrfs_release_path(path);
   2349	btrfs_release_path(log_path);
   2350	kfree(name);
   2351	iput(inode);
   2352	return ret;
   2353}
   2354
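        /*
         * The log is authoritative for the xattr set of any inode it
         * contains, so delete from the subvolume every xattr of the given
         * inode that has no matching entry in the log tree. After each
         * deletion the search restarts, as the leaf layout may have changed.
         */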
   2355static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
   2356			      struct btrfs_root *root,
   2357			      struct btrfs_root *log,
   2358			      struct btrfs_path *path,
   2359			      const u64 ino)
   2360{
   2361	struct btrfs_key search_key;
   2362	struct btrfs_path *log_path;
   2363	int i;
   2364	int nritems;
   2365	int ret;
   2366
   2367	log_path = btrfs_alloc_path();
   2368	if (!log_path)
   2369		return -ENOMEM;
   2370
   2371	search_key.objectid = ino;
   2372	search_key.type = BTRFS_XATTR_ITEM_KEY;
   2373	search_key.offset = 0;
   2374again:
   2375	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
   2376	if (ret < 0)
   2377		goto out;
   2378process_leaf:
   2379	nritems = btrfs_header_nritems(path->nodes[0]);
   2380	for (i = path->slots[0]; i < nritems; i++) {
   2381		struct btrfs_key key;
   2382		struct btrfs_dir_item *di;
   2383		struct btrfs_dir_item *log_di;
   2384		u32 total_size;
   2385		u32 cur;
   2386
   2387		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
   2388		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
   2389			ret = 0;
   2390			goto out;
   2391		}
   2392
   2393		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
   2394		total_size = btrfs_item_size(path->nodes[0], i);
   2395		cur = 0;
   2396		while (cur < total_size) {
   2397			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
   2398			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
   2399			u32 this_len = sizeof(*di) + name_len + data_len;
   2400			char *name;
   2401
   2402			name = kmalloc(name_len, GFP_NOFS);
   2403			if (!name) {
   2404				ret = -ENOMEM;
   2405				goto out;
   2406			}
   2407			read_extent_buffer(path->nodes[0], name,
   2408					   (unsigned long)(di + 1), name_len);
   2409
   2410			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
   2411						    name, name_len, 0);
   2412			btrfs_release_path(log_path);
   2413			if (!log_di) {
   2414				/* Doesn't exist in log tree, so delete it. */
   2415				btrfs_release_path(path);
   2416				di = btrfs_lookup_xattr(trans, root, path, ino,
   2417							name, name_len, -1);
   2418				kfree(name);
   2419				if (IS_ERR(di)) {
   2420					ret = PTR_ERR(di);
   2421					goto out;
   2422				}
   2423				ASSERT(di);
   2424				ret = btrfs_delete_one_dir_name(trans, root,
   2425								path, di);
   2426				if (ret)
   2427					goto out;
   2428				btrfs_release_path(path);
   2429				search_key = key;
   2430				goto again;
   2431			}
   2432			kfree(name);
   2433			if (IS_ERR(log_di)) {
   2434				ret = PTR_ERR(log_di);
   2435				goto out;
   2436			}
   2437			cur += this_len;
   2438			di = (struct btrfs_dir_item *)((char *)di + this_len);
   2439		}
   2440	}
   2441	ret = btrfs_next_leaf(root, path);
   2442	if (ret > 0)
   2443		ret = 0;
   2444	else if (ret == 0)
   2445		goto process_leaf;
   2446out:
   2447	btrfs_free_path(log_path);
   2448	btrfs_release_path(path);
   2449	return ret;
   2450}
   2451
   2452
   2453/*
   2454 * deletion replay happens before we copy any new directory items
   2455 * out of the log or out of backreferences from inodes.  It
   2456 * scans the log to find ranges of keys that log is authoritative for,
   2457 * and then scans the directory to find items in those ranges that are
   2458 * not present in the log.
   2459 *
   2460 * Anything we don't find in the log is unlinked and removed from the
   2461 * directory.
   2462 */
   2463static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
   2464				       struct btrfs_root *root,
   2465				       struct btrfs_root *log,
   2466				       struct btrfs_path *path,
   2467				       u64 dirid, int del_all)
   2468{
   2469	u64 range_start;
   2470	u64 range_end;
   2471	int ret = 0;
   2472	struct btrfs_key dir_key;
   2473	struct btrfs_key found_key;
   2474	struct btrfs_path *log_path;
   2475	struct inode *dir;
   2476
   2477	dir_key.objectid = dirid;
   2478	dir_key.type = BTRFS_DIR_INDEX_KEY;
   2479	log_path = btrfs_alloc_path();
   2480	if (!log_path)
   2481		return -ENOMEM;
   2482
   2483	dir = read_one_inode(root, dirid);
   2484	/* it isn't an error if the inode isn't there, that can happen
   2485	 * because we replay the deletes before we copy in the inode item
   2486	 * from the log
   2487	 */
   2488	if (!dir) {
   2489		btrfs_free_path(log_path);
   2490		return 0;
   2491	}
   2492
   2493	range_start = 0;
   2494	range_end = 0;
   2495	while (1) {
   2496		if (del_all)
   2497			range_end = (u64)-1;
   2498		else {
   2499			ret = find_dir_range(log, path, dirid,
   2500					     &range_start, &range_end);
   2501			if (ret < 0)
   2502				goto out;
   2503			else if (ret > 0)
   2504				break;
   2505		}
   2506
   2507		dir_key.offset = range_start;
   2508		while (1) {
   2509			int nritems;
   2510			ret = btrfs_search_slot(NULL, root, &dir_key, path,
   2511						0, 0);
   2512			if (ret < 0)
   2513				goto out;
   2514
   2515			nritems = btrfs_header_nritems(path->nodes[0]);
   2516			if (path->slots[0] >= nritems) {
   2517				ret = btrfs_next_leaf(root, path);
   2518				if (ret == 1)
   2519					break;
   2520				else if (ret < 0)
   2521					goto out;
   2522			}
   2523			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
   2524					      path->slots[0]);
   2525			if (found_key.objectid != dirid ||
   2526			    found_key.type != dir_key.type) {
   2527				ret = 0;
   2528				goto out;
   2529			}
   2530
   2531			if (found_key.offset > range_end)
   2532				break;
   2533
   2534			ret = check_item_in_log(trans, log, path,
   2535						log_path, dir,
   2536						&found_key);
   2537			if (ret)
   2538				goto out;
   2539			if (found_key.offset == (u64)-1)
   2540				break;
   2541			dir_key.offset = found_key.offset + 1;
   2542		}
   2543		btrfs_release_path(path);
   2544		if (range_end == (u64)-1)
   2545			break;
   2546		range_start = range_end + 1;
   2547	}
   2548	ret = 0;
   2549out:
   2550	btrfs_release_path(path);
   2551	btrfs_free_path(log_path);
   2552	iput(dir);
   2553	return ret;
   2554}
   2555
   2556/*
   2557 * the process_func used to replay items from the log tree.  This
   2558 * gets called in two different stages.  The first stage just looks
   2559 * for inodes and makes sure they are all copied into the subvolume.
   2560 *
   2561 * The second stage copies all the other item types from the log into
   2562 * the subvolume.  The two stage approach is slower, but gets rid of
   2563 * lots of complexity around inodes referencing other inodes that exist
   2564 * only in the log (references come from either directory items or inode
   2565 * back refs).
   2566 */
   2567static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
   2568			     struct walk_control *wc, u64 gen, int level)
   2569{
   2570	int nritems;
   2571	struct btrfs_path *path;
   2572	struct btrfs_root *root = wc->replay_dest;
   2573	struct btrfs_key key;
   2574	int i;
   2575	int ret;
   2576
   2577	ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
   2578	if (ret)
   2579		return ret;
   2580
   2581	level = btrfs_header_level(eb);
   2582
   2583	if (level != 0)
   2584		return 0;
   2585
   2586	path = btrfs_alloc_path();
   2587	if (!path)
   2588		return -ENOMEM;
   2589
   2590	nritems = btrfs_header_nritems(eb);
   2591	for (i = 0; i < nritems; i++) {
   2592		btrfs_item_key_to_cpu(eb, &key, i);
   2593
   2594		/* inode keys are done during the first stage */
   2595		if (key.type == BTRFS_INODE_ITEM_KEY &&
   2596		    wc->stage == LOG_WALK_REPLAY_INODES) {
   2597			struct btrfs_inode_item *inode_item;
   2598			u32 mode;
   2599
   2600			inode_item = btrfs_item_ptr(eb, i,
   2601					    struct btrfs_inode_item);
   2602			/*
   2603			 * If we have a tmpfile (O_TMPFILE) that got fsync'ed
   2604			 * and never got linked before the fsync, skip it, as
   2605			 * replaying it is pointless since it would be deleted
   2606			 * later. We skip logging tmpfiles, but it's always
   2607			 * possible we are replaying a log created with a kernel
   2608			 * that used to log tmpfiles.
   2609			 */
   2610			if (btrfs_inode_nlink(eb, inode_item) == 0) {
   2611				wc->ignore_cur_inode = true;
   2612				continue;
   2613			} else {
   2614				wc->ignore_cur_inode = false;
   2615			}
   2616			ret = replay_xattr_deletes(wc->trans, root, log,
   2617						   path, key.objectid);
   2618			if (ret)
   2619				break;
   2620			mode = btrfs_inode_mode(eb, inode_item);
   2621			if (S_ISDIR(mode)) {
   2622				ret = replay_dir_deletes(wc->trans,
   2623					 root, log, path, key.objectid, 0);
   2624				if (ret)
   2625					break;
   2626			}
   2627			ret = overwrite_item(wc->trans, root, path,
   2628					     eb, i, &key);
   2629			if (ret)
   2630				break;
   2631
   2632			/*
   2633			 * Before replaying extents, truncate the inode to its
   2634			 * size. We need to do it now and not after log replay
   2635			 * because before an fsync we can have prealloc extents
   2636			 * added beyond the inode's i_size. If we did it after,
   2637			 * through orphan cleanup for example, we would drop
   2638			 * those prealloc extents just after replaying them.
   2639			 */
   2640			if (S_ISREG(mode)) {
   2641				struct btrfs_drop_extents_args drop_args = { 0 };
   2642				struct inode *inode;
   2643				u64 from;
   2644
   2645				inode = read_one_inode(root, key.objectid);
   2646				if (!inode) {
   2647					ret = -EIO;
   2648					break;
   2649				}
   2650				from = ALIGN(i_size_read(inode),
   2651					     root->fs_info->sectorsize);
   2652				drop_args.start = from;
   2653				drop_args.end = (u64)-1;
   2654				drop_args.drop_cache = true;
   2655				ret = btrfs_drop_extents(wc->trans, root,
   2656							 BTRFS_I(inode),
   2657							 &drop_args);
   2658				if (!ret) {
   2659					inode_sub_bytes(inode,
   2660							drop_args.bytes_found);
   2661					/* Update the inode's nbytes. */
   2662					ret = btrfs_update_inode(wc->trans,
   2663							root, BTRFS_I(inode));
   2664				}
   2665				iput(inode);
   2666				if (ret)
   2667					break;
   2668			}
   2669
   2670			ret = link_to_fixup_dir(wc->trans, root,
   2671						path, key.objectid);
   2672			if (ret)
   2673				break;
   2674		}
   2675
   2676		if (wc->ignore_cur_inode)
   2677			continue;
   2678
   2679		if (key.type == BTRFS_DIR_INDEX_KEY &&
   2680		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
   2681			ret = replay_one_dir_item(wc->trans, root, path,
   2682						  eb, i, &key);
   2683			if (ret)
   2684				break;
   2685		}
   2686
   2687		if (wc->stage < LOG_WALK_REPLAY_ALL)
   2688			continue;
   2689
   2690		/* these keys are simply copied */
   2691		if (key.type == BTRFS_XATTR_ITEM_KEY) {
   2692			ret = overwrite_item(wc->trans, root, path,
   2693					     eb, i, &key);
   2694			if (ret)
   2695				break;
   2696		} else if (key.type == BTRFS_INODE_REF_KEY ||
   2697			   key.type == BTRFS_INODE_EXTREF_KEY) {
   2698			ret = add_inode_ref(wc->trans, root, log, path,
   2699					    eb, i, &key);
   2700			if (ret && ret != -ENOENT)
   2701				break;
   2702			ret = 0;
   2703		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
   2704			ret = replay_one_extent(wc->trans, root, path,
   2705						eb, i, &key);
   2706			if (ret)
   2707				break;
   2708		}
   2709		/*
   2710		 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
   2711		 * BTRFS_DIR_INDEX_KEY items which we use to derive the
   2712		 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
   2713		 * older kernel with such keys, ignore them.
   2714		 */
   2715	}
   2716	btrfs_free_path(path);
   2717	return ret;
   2718}
   2719
   2720/*
   2721 * Correctly adjust the reserved bytes occupied by a log tree extent buffer
   2722 */
   2723static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
   2724{
   2725	struct btrfs_block_group *cache;
   2726
   2727	cache = btrfs_lookup_block_group(fs_info, start);
   2728	if (!cache) {
   2729		btrfs_err(fs_info, "unable to find block group for %llu", start);
   2730		return;
   2731	}
   2732
   2733	spin_lock(&cache->space_info->lock);
   2734	spin_lock(&cache->lock);
   2735	cache->reserved -= fs_info->nodesize;
   2736	cache->space_info->bytes_reserved -= fs_info->nodesize;
   2737	spin_unlock(&cache->lock);
   2738	spin_unlock(&cache->space_info->lock);
   2739
   2740	btrfs_put_block_group(cache);
   2741}
   2742
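        /*
         * Walk down the log tree from the current position in @path, calling
         * wc->process_func on every child of the level 1 nodes, i.e. on every
         * leaf. When wc->free is set, each visited leaf is also released:
         * pinned for later removal if we have a transaction handle, or merely
         * unaccounted otherwise. Returns 0 once the slots at the current
         * level are exhausted (walk_up_log_tree() then takes over), or a
         * negative errno on failure.
         */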
   2743static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
   2744				   struct btrfs_root *root,
   2745				   struct btrfs_path *path, int *level,
   2746				   struct walk_control *wc)
   2747{
   2748	struct btrfs_fs_info *fs_info = root->fs_info;
   2749	u64 bytenr;
   2750	u64 ptr_gen;
   2751	struct extent_buffer *next;
   2752	struct extent_buffer *cur;
   2753	u32 blocksize;
   2754	int ret = 0;
   2755
   2756	while (*level > 0) {
   2757		struct btrfs_key first_key;
   2758
   2759		cur = path->nodes[*level];
   2760
   2761		WARN_ON(btrfs_header_level(cur) != *level);
   2762
   2763		if (path->slots[*level] >=
   2764		    btrfs_header_nritems(cur))
   2765			break;
   2766
   2767		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
   2768		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
   2769		btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
   2770		blocksize = fs_info->nodesize;
   2771
   2772		next = btrfs_find_create_tree_block(fs_info, bytenr,
   2773						    btrfs_header_owner(cur),
   2774						    *level - 1);
   2775		if (IS_ERR(next))
   2776			return PTR_ERR(next);
   2777
   2778		if (*level == 1) {
   2779			ret = wc->process_func(root, next, wc, ptr_gen,
   2780					       *level - 1);
   2781			if (ret) {
   2782				free_extent_buffer(next);
   2783				return ret;
   2784			}
   2785
   2786			path->slots[*level]++;
   2787			if (wc->free) {
   2788				ret = btrfs_read_extent_buffer(next, ptr_gen,
   2789							*level - 1, &first_key);
   2790				if (ret) {
   2791					free_extent_buffer(next);
   2792					return ret;
   2793				}
   2794
   2795				if (trans) {
   2796					btrfs_tree_lock(next);
   2797					btrfs_clean_tree_block(next);
   2798					btrfs_wait_tree_block_writeback(next);
   2799					btrfs_tree_unlock(next);
   2800					ret = btrfs_pin_reserved_extent(trans,
   2801							bytenr, blocksize);
   2802					if (ret) {
   2803						free_extent_buffer(next);
   2804						return ret;
   2805					}
   2806					btrfs_redirty_list_add(
   2807						trans->transaction, next);
   2808				} else {
   2809					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
   2810						clear_extent_buffer_dirty(next);
   2811					unaccount_log_buffer(fs_info, bytenr);
   2812				}
   2813			}
   2814			free_extent_buffer(next);
   2815			continue;
   2816		}
   2817		ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key);
   2818		if (ret) {
   2819			free_extent_buffer(next);
   2820			return ret;
   2821		}
   2822
   2823		if (path->nodes[*level-1])
   2824			free_extent_buffer(path->nodes[*level-1]);
   2825		path->nodes[*level-1] = next;
   2826		*level = btrfs_header_level(next);
   2827		path->slots[*level] = 0;
   2828		cond_resched();
   2829	}
   2830	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
   2831
   2832	cond_resched();
   2833	return 0;
   2834}
   2835
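        /*
         * Walk back up the log tree, calling wc->process_func on (and, when
         * wc->free is set, releasing) every node whose children have all been
         * visited. Returns 0 if a sibling slot was found to descend into
         * again, 1 once the whole tree below the root has been processed, or
         * a negative errno on failure.
         */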
   2836static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
   2837				 struct btrfs_root *root,
   2838				 struct btrfs_path *path, int *level,
   2839				 struct walk_control *wc)
   2840{
   2841	struct btrfs_fs_info *fs_info = root->fs_info;
   2842	int i;
   2843	int slot;
   2844	int ret;
   2845
   2846	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
   2847		slot = path->slots[i];
   2848		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
   2849			path->slots[i]++;
   2850			*level = i;
   2851			WARN_ON(*level == 0);
   2852			return 0;
   2853		} else {
   2854			ret = wc->process_func(root, path->nodes[*level], wc,
   2855				 btrfs_header_generation(path->nodes[*level]),
   2856				 *level);
   2857			if (ret)
   2858				return ret;
   2859
   2860			if (wc->free) {
   2861				struct extent_buffer *next;
   2862
   2863				next = path->nodes[*level];
   2864
   2865				if (trans) {
   2866					btrfs_tree_lock(next);
   2867					btrfs_clean_tree_block(next);
   2868					btrfs_wait_tree_block_writeback(next);
   2869					btrfs_tree_unlock(next);
   2870					ret = btrfs_pin_reserved_extent(trans,
   2871						     path->nodes[*level]->start,
   2872						     path->nodes[*level]->len);
   2873					if (ret)
   2874						return ret;
   2875					btrfs_redirty_list_add(trans->transaction,
   2876							       next);
   2877				} else {
   2878					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
   2879						clear_extent_buffer_dirty(next);
   2880
   2881					unaccount_log_buffer(fs_info,
   2882						path->nodes[*level]->start);
   2883				}
   2884			}
   2885			free_extent_buffer(path->nodes[*level]);
   2886			path->nodes[*level] = NULL;
   2887			*level = i + 1;
   2888		}
   2889	}
   2890	return 1;
   2891}
   2892
   2893/*
    2894 * drop the reference count on the tree rooted at 'log'.  This traverses
   2895 * the tree freeing any blocks that have a ref count of zero after being
   2896 * decremented.
   2897 */
   2898static int walk_log_tree(struct btrfs_trans_handle *trans,
   2899			 struct btrfs_root *log, struct walk_control *wc)
   2900{
   2901	struct btrfs_fs_info *fs_info = log->fs_info;
   2902	int ret = 0;
   2903	int wret;
   2904	int level;
   2905	struct btrfs_path *path;
   2906	int orig_level;
   2907
   2908	path = btrfs_alloc_path();
   2909	if (!path)
   2910		return -ENOMEM;
   2911
   2912	level = btrfs_header_level(log->node);
   2913	orig_level = level;
   2914	path->nodes[level] = log->node;
   2915	atomic_inc(&log->node->refs);
   2916	path->slots[level] = 0;
   2917
   2918	while (1) {
   2919		wret = walk_down_log_tree(trans, log, path, &level, wc);
   2920		if (wret > 0)
   2921			break;
   2922		if (wret < 0) {
   2923			ret = wret;
   2924			goto out;
   2925		}
   2926
   2927		wret = walk_up_log_tree(trans, log, path, &level, wc);
   2928		if (wret > 0)
   2929			break;
   2930		if (wret < 0) {
   2931			ret = wret;
   2932			goto out;
   2933		}
   2934	}
   2935
   2936	/* was the root node processed? if not, catch it here */
   2937	if (path->nodes[orig_level]) {
   2938		ret = wc->process_func(log, path->nodes[orig_level], wc,
   2939			 btrfs_header_generation(path->nodes[orig_level]),
   2940			 orig_level);
   2941		if (ret)
   2942			goto out;
   2943		if (wc->free) {
   2944			struct extent_buffer *next;
   2945
   2946			next = path->nodes[orig_level];
   2947
   2948			if (trans) {
   2949				btrfs_tree_lock(next);
   2950				btrfs_clean_tree_block(next);
   2951				btrfs_wait_tree_block_writeback(next);
   2952				btrfs_tree_unlock(next);
   2953				ret = btrfs_pin_reserved_extent(trans,
   2954						next->start, next->len);
   2955				if (ret)
   2956					goto out;
   2957				btrfs_redirty_list_add(trans->transaction, next);
   2958			} else {
   2959				if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
   2960					clear_extent_buffer_dirty(next);
   2961				unaccount_log_buffer(fs_info, next->start);
   2962			}
   2963		}
   2964	}
   2965
   2966out:
   2967	btrfs_free_path(path);
   2968	return ret;
   2969}
   2970
   2971/*
    2972 * helper function to update the item for a given subvolume's log root
   2973 * in the tree of log roots
   2974 */
   2975static int update_log_root(struct btrfs_trans_handle *trans,
   2976			   struct btrfs_root *log,
   2977			   struct btrfs_root_item *root_item)
   2978{
   2979	struct btrfs_fs_info *fs_info = log->fs_info;
   2980	int ret;
   2981
   2982	if (log->log_transid == 1) {
   2983		/* insert root item on the first sync */
   2984		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
   2985				&log->root_key, root_item);
   2986	} else {
   2987		ret = btrfs_update_root(trans, fs_info->log_root_tree,
   2988				&log->root_key, root_item);
   2989	}
   2990	return ret;
   2991}
   2992
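        /*
         * Wait for log transaction @transid on @root to be fully committed.
         * Only two log transactions can be in flight at any time, alternating
         * between the two slots indexed by transid % 2, so checking the
         * slot's log_commit flag together with log_transid_committed is
         * sufficient. The log_mutex is dropped while sleeping and retaken
         * before rechecking.
         */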
   2993static void wait_log_commit(struct btrfs_root *root, int transid)
   2994{
   2995	DEFINE_WAIT(wait);
   2996	int index = transid % 2;
   2997
   2998	/*
   2999	 * we only allow two pending log transactions at a time,
   3000	 * so we know that if ours is more than 2 older than the
   3001	 * current transaction, we're done
   3002	 */
   3003	for (;;) {
   3004		prepare_to_wait(&root->log_commit_wait[index],
   3005				&wait, TASK_UNINTERRUPTIBLE);
   3006
   3007		if (!(root->log_transid_committed < transid &&
   3008		      atomic_read(&root->log_commit[index])))
   3009			break;
   3010
   3011		mutex_unlock(&root->log_mutex);
   3012		schedule();
   3013		mutex_lock(&root->log_mutex);
   3014	}
   3015	finish_wait(&root->log_commit_wait[index], &wait);
   3016}
   3017
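        /*
         * Wait until there are no more tasks writing to the current log
         * transaction, dropping the log_mutex while sleeping so that the
         * writers can make progress.
         */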
   3018static void wait_for_writer(struct btrfs_root *root)
   3019{
   3020	DEFINE_WAIT(wait);
   3021
   3022	for (;;) {
   3023		prepare_to_wait(&root->log_writer_wait, &wait,
   3024				TASK_UNINTERRUPTIBLE);
   3025		if (!atomic_read(&root->log_writers))
   3026			break;
   3027
   3028		mutex_unlock(&root->log_mutex);
   3029		schedule();
   3030		mutex_lock(&root->log_mutex);
   3031	}
   3032	finish_wait(&root->log_writer_wait, &wait);
   3033}
   3034
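        /*
         * Remove a log context from the list of contexts waiting on the
         * current log transaction of the given root.
         */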
   3035static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
   3036					struct btrfs_log_ctx *ctx)
   3037{
   3038	mutex_lock(&root->log_mutex);
   3039	list_del_init(&ctx->list);
   3040	mutex_unlock(&root->log_mutex);
   3041}
   3042
    3043/*
    3044 * Invoked with the log mutex held, or when it is otherwise guaranteed
    3045 * that no other task can access the list.
    3046 */
   3047static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
   3048					     int index, int error)
   3049{
   3050	struct btrfs_log_ctx *ctx;
   3051	struct btrfs_log_ctx *safe;
   3052
   3053	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
   3054		list_del_init(&ctx->list);
   3055		ctx->log_ret = error;
   3056	}
   3057}
   3058
   3059/*
    3060 * btrfs_sync_log sends a given tree log down to the disk and
   3061 * updates the super blocks to record it.  When this call is done,
   3062 * you know that any inodes previously logged are safely on disk only
   3063 * if it returns 0.
   3064 *
   3065 * Any other return value means you need to call btrfs_commit_transaction.
   3066 * Some of the edge cases for fsyncing directories that have had unlinks
   3067 * or renames done in the past mean that sometimes the only safe
   3068 * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
   3069 * that has happened.
   3070 */
   3071int btrfs_sync_log(struct btrfs_trans_handle *trans,
   3072		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
   3073{
   3074	int index1;
   3075	int index2;
   3076	int mark;
   3077	int ret;
   3078	struct btrfs_fs_info *fs_info = root->fs_info;
   3079	struct btrfs_root *log = root->log_root;
   3080	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
   3081	struct btrfs_root_item new_root_item;
   3082	int log_transid = 0;
   3083	struct btrfs_log_ctx root_log_ctx;
   3084	struct blk_plug plug;
   3085	u64 log_root_start;
   3086	u64 log_root_level;
   3087
   3088	mutex_lock(&root->log_mutex);
   3089	log_transid = ctx->log_transid;
   3090	if (root->log_transid_committed >= log_transid) {
   3091		mutex_unlock(&root->log_mutex);
   3092		return ctx->log_ret;
   3093	}
   3094
   3095	index1 = log_transid % 2;
   3096	if (atomic_read(&root->log_commit[index1])) {
   3097		wait_log_commit(root, log_transid);
   3098		mutex_unlock(&root->log_mutex);
   3099		return ctx->log_ret;
   3100	}
   3101	ASSERT(log_transid == root->log_transid);
   3102	atomic_set(&root->log_commit[index1], 1);
   3103
   3104	/* wait for previous tree log sync to complete */
   3105	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
   3106		wait_log_commit(root, log_transid - 1);
   3107
   3108	while (1) {
   3109		int batch = atomic_read(&root->log_batch);
   3110		/* when we're on an ssd, just kick the log commit out */
   3111		if (!btrfs_test_opt(fs_info, SSD) &&
   3112		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
   3113			mutex_unlock(&root->log_mutex);
   3114			schedule_timeout_uninterruptible(1);
   3115			mutex_lock(&root->log_mutex);
   3116		}
   3117		wait_for_writer(root);
   3118		if (batch == atomic_read(&root->log_batch))
   3119			break;
   3120	}
   3121
   3122	/* bail out if we need to do a full commit */
   3123	if (btrfs_need_log_full_commit(trans)) {
   3124		ret = -EAGAIN;
   3125		mutex_unlock(&root->log_mutex);
   3126		goto out;
   3127	}
   3128
   3129	if (log_transid % 2 == 0)
   3130		mark = EXTENT_DIRTY;
   3131	else
   3132		mark = EXTENT_NEW;
   3133
    3134	/* we start IO on all the marked extents here, but we don't actually
   3135	 * wait for them until later.
   3136	 */
   3137	blk_start_plug(&plug);
   3138	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
   3139	/*
   3140	 * -EAGAIN happens when someone, e.g., a concurrent transaction
   3141	 *  commit, writes a dirty extent in this tree-log commit. This
    3142	 *  concurrent write creates a hole in the extents being written
    3143	 *  out, and we cannot proceed on a zoned filesystem, which requires
    3144	 *  sequential writing. While we could bail out to a full commit
    3145	 *  here, we continue instead, hoping that the concurrent writing
    3146	 *  fills the hole.
   3147	 */
   3148	if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
   3149		ret = 0;
   3150	if (ret) {
   3151		blk_finish_plug(&plug);
   3152		btrfs_abort_transaction(trans, ret);
   3153		btrfs_set_log_full_commit(trans);
   3154		mutex_unlock(&root->log_mutex);
   3155		goto out;
   3156	}
   3157
   3158	/*
   3159	 * We _must_ update under the root->log_mutex in order to make sure we
   3160	 * have a consistent view of the log root we are trying to commit at
   3161	 * this moment.
   3162	 *
   3163	 * We _must_ copy this into a local copy, because we are not holding the
   3164	 * log_root_tree->log_mutex yet.  This is important because when we
   3165	 * commit the log_root_tree we must have a consistent view of the
   3166	 * log_root_tree when we update the super block to point at the
   3167	 * log_root_tree bytenr.  If we update the log_root_tree here we'll race
   3168	 * with the commit and possibly point at the new block which we may not
   3169	 * have written out.
   3170	 */
   3171	btrfs_set_root_node(&log->root_item, log->node);
   3172	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
   3173
   3174	root->log_transid++;
   3175	log->log_transid = root->log_transid;
   3176	root->log_start_pid = 0;
   3177	/*
    3178	 * IO has been started, blocks of the log tree have the WRITTEN flag
    3179	 * set in their headers. New modifications of the log will be written
    3180	 * to new positions, so it's safe to allow log writers to go in.
   3181	 */
   3182	mutex_unlock(&root->log_mutex);
   3183
   3184	if (btrfs_is_zoned(fs_info)) {
   3185		mutex_lock(&fs_info->tree_root->log_mutex);
   3186		if (!log_root_tree->node) {
   3187			ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
   3188			if (ret) {
   3189				mutex_unlock(&fs_info->tree_root->log_mutex);
   3190				blk_finish_plug(&plug);
   3191				goto out;
   3192			}
   3193		}
   3194		mutex_unlock(&fs_info->tree_root->log_mutex);
   3195	}
   3196
   3197	btrfs_init_log_ctx(&root_log_ctx, NULL);
   3198
   3199	mutex_lock(&log_root_tree->log_mutex);
   3200
   3201	index2 = log_root_tree->log_transid % 2;
   3202	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
   3203	root_log_ctx.log_transid = log_root_tree->log_transid;
   3204
   3205	/*
   3206	 * Now we are safe to update the log_root_tree because we're under the
   3207	 * log_mutex, and we're a current writer so we're holding the commit
   3208	 * open until we drop the log_mutex.
   3209	 */
   3210	ret = update_log_root(trans, log, &new_root_item);
   3211	if (ret) {
   3212		if (!list_empty(&root_log_ctx.list))
   3213			list_del_init(&root_log_ctx.list);
   3214
   3215		blk_finish_plug(&plug);
   3216		btrfs_set_log_full_commit(trans);
   3217
   3218		if (ret != -ENOSPC) {
   3219			btrfs_abort_transaction(trans, ret);
   3220			mutex_unlock(&log_root_tree->log_mutex);
   3221			goto out;
   3222		}
   3223		btrfs_wait_tree_log_extents(log, mark);
   3224		mutex_unlock(&log_root_tree->log_mutex);
   3225		ret = -EAGAIN;
   3226		goto out;
   3227	}
   3228
   3229	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
   3230		blk_finish_plug(&plug);
   3231		list_del_init(&root_log_ctx.list);
   3232		mutex_unlock(&log_root_tree->log_mutex);
   3233		ret = root_log_ctx.log_ret;
   3234		goto out;
   3235	}
   3236
   3237	index2 = root_log_ctx.log_transid % 2;
   3238	if (atomic_read(&log_root_tree->log_commit[index2])) {
   3239		blk_finish_plug(&plug);
   3240		ret = btrfs_wait_tree_log_extents(log, mark);
   3241		wait_log_commit(log_root_tree,
   3242				root_log_ctx.log_transid);
   3243		mutex_unlock(&log_root_tree->log_mutex);
   3244		if (!ret)
   3245			ret = root_log_ctx.log_ret;
   3246		goto out;
   3247	}
   3248	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
   3249	atomic_set(&log_root_tree->log_commit[index2], 1);
   3250
   3251	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
   3252		wait_log_commit(log_root_tree,
   3253				root_log_ctx.log_transid - 1);
   3254	}
   3255
   3256	/*
   3257	 * now that we've moved on to the tree of log tree roots,
   3258	 * check the full commit flag again
   3259	 */
   3260	if (btrfs_need_log_full_commit(trans)) {
   3261		blk_finish_plug(&plug);
   3262		btrfs_wait_tree_log_extents(log, mark);
   3263		mutex_unlock(&log_root_tree->log_mutex);
   3264		ret = -EAGAIN;
   3265		goto out_wake_log_root;
   3266	}
   3267
   3268	ret = btrfs_write_marked_extents(fs_info,
   3269					 &log_root_tree->dirty_log_pages,
   3270					 EXTENT_DIRTY | EXTENT_NEW);
   3271	blk_finish_plug(&plug);
   3272	/*
   3273	 * As described above, -EAGAIN indicates a hole in the extents. We
    3274	 * cannot wait for these writeouts since waiting would cause a
   3275	 * deadlock. Bail out to the full commit instead.
   3276	 */
   3277	if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
   3278		btrfs_set_log_full_commit(trans);
   3279		btrfs_wait_tree_log_extents(log, mark);
   3280		mutex_unlock(&log_root_tree->log_mutex);
   3281		goto out_wake_log_root;
   3282	} else if (ret) {
   3283		btrfs_set_log_full_commit(trans);
   3284		btrfs_abort_transaction(trans, ret);
   3285		mutex_unlock(&log_root_tree->log_mutex);
   3286		goto out_wake_log_root;
   3287	}
   3288	ret = btrfs_wait_tree_log_extents(log, mark);
   3289	if (!ret)
   3290		ret = btrfs_wait_tree_log_extents(log_root_tree,
   3291						  EXTENT_NEW | EXTENT_DIRTY);
   3292	if (ret) {
   3293		btrfs_set_log_full_commit(trans);
   3294		mutex_unlock(&log_root_tree->log_mutex);
   3295		goto out_wake_log_root;
   3296	}
   3297
   3298	log_root_start = log_root_tree->node->start;
   3299	log_root_level = btrfs_header_level(log_root_tree->node);
   3300	log_root_tree->log_transid++;
   3301	mutex_unlock(&log_root_tree->log_mutex);
   3302
   3303	/*
   3304	 * Here we are guaranteed that nobody is going to write the superblock
    3305	 * for the current transaction before us, and that we do not write
   3306	 * our superblock before the previous transaction finishes its commit
   3307	 * and writes its superblock, because:
   3308	 *
    3309	 * 1) We are holding a handle on the current transaction, so nobody
   3310	 *    can commit it until we release the handle;
   3311	 *
   3312	 * 2) Before writing our superblock we acquire the tree_log_mutex, so
   3313	 *    if the previous transaction is still committing, and hasn't yet
   3314	 *    written its superblock, we wait for it to do it, because a
   3315	 *    transaction commit acquires the tree_log_mutex when the commit
   3316	 *    begins and releases it only after writing its superblock.
   3317	 */
   3318	mutex_lock(&fs_info->tree_log_mutex);
   3319
   3320	/*
   3321	 * The previous transaction writeout phase could have failed, and thus
   3322	 * marked the fs in an error state.  We must not commit here, as we
   3323	 * could have updated our generation in the super_for_commit and
   3324	 * writing the super here would result in transid mismatches.  If there
   3325	 * is an error here just bail.
   3326	 */
   3327	if (BTRFS_FS_ERROR(fs_info)) {
   3328		ret = -EIO;
   3329		btrfs_set_log_full_commit(trans);
   3330		btrfs_abort_transaction(trans, ret);
   3331		mutex_unlock(&fs_info->tree_log_mutex);
   3332		goto out_wake_log_root;
   3333	}
   3334
   3335	btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
   3336	btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
   3337	ret = write_all_supers(fs_info, 1);
   3338	mutex_unlock(&fs_info->tree_log_mutex);
   3339	if (ret) {
   3340		btrfs_set_log_full_commit(trans);
   3341		btrfs_abort_transaction(trans, ret);
   3342		goto out_wake_log_root;
   3343	}
   3344
   3345	/*
   3346	 * We know there can only be one task here, since we have not yet set
   3347	 * root->log_commit[index1] to 0 and any task attempting to sync the
   3348	 * log must wait for the previous log transaction to commit if it's
   3349	 * still in progress or wait for the current log transaction commit if
   3350	 * someone else already started it. We use <= and not < because the
   3351	 * first log transaction has an ID of 0.
   3352	 */
   3353	ASSERT(root->last_log_commit <= log_transid);
   3354	root->last_log_commit = log_transid;
   3355
   3356out_wake_log_root:
   3357	mutex_lock(&log_root_tree->log_mutex);
   3358	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
   3359
   3360	log_root_tree->log_transid_committed++;
   3361	atomic_set(&log_root_tree->log_commit[index2], 0);
   3362	mutex_unlock(&log_root_tree->log_mutex);
   3363
   3364	/*
   3365	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
   3366	 * all the updates above are seen by the woken threads. It might not be
   3367	 * necessary, but proving that seems to be hard.
   3368	 */
   3369	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
   3370out:
   3371	mutex_lock(&root->log_mutex);
   3372	btrfs_remove_all_log_ctxs(root, index1, ret);
   3373	root->log_transid_committed++;
   3374	atomic_set(&root->log_commit[index1], 0);
   3375	mutex_unlock(&root->log_mutex);
   3376
   3377	/*
   3378	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
   3379	 * all the updates above are seen by the woken threads. It might not be
   3380	 * necessary, but proving that seems to be hard.
   3381	 */
   3382	cond_wake_up(&root->log_commit_wait[index1]);
   3383	return ret;
   3384}
   3385
   3386static void free_log_tree(struct btrfs_trans_handle *trans,
   3387			  struct btrfs_root *log)
   3388{
   3389	int ret;
   3390	struct walk_control wc = {
   3391		.free = 1,
   3392		.process_func = process_one_buffer
   3393	};
   3394
   3395	if (log->node) {
   3396		ret = walk_log_tree(trans, log, &wc);
   3397		if (ret) {
   3398			/*
   3399			 * We weren't able to traverse the entire log tree, the
   3400			 * typical scenario is getting an -EIO when reading an
   3401			 * extent buffer of the tree, due to a previous writeback
   3402			 * failure of it.
   3403			 */
   3404			set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
   3405				&log->fs_info->fs_state);
   3406
   3407			/*
   3408			 * Some extent buffers of the log tree may still be dirty
   3409			 * and not yet written back to storage, because we may
    3410			 * have updates to a log tree without syncing it,
   3411			 * such as during rename and link operations. So flush
   3412			 * them out and wait for their writeback to complete, so
   3413			 * that we properly cleanup their state and pages.
   3414			 */
   3415			btrfs_write_marked_extents(log->fs_info,
   3416						   &log->dirty_log_pages,
   3417						   EXTENT_DIRTY | EXTENT_NEW);
   3418			btrfs_wait_tree_log_extents(log,
   3419						    EXTENT_DIRTY | EXTENT_NEW);
   3420
   3421			if (trans)
   3422				btrfs_abort_transaction(trans, ret);
   3423			else
   3424				btrfs_handle_fs_error(log->fs_info, ret, NULL);
   3425		}
   3426	}
   3427
   3428	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
   3429			  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
   3430	extent_io_tree_release(&log->log_csum_range);
   3431
   3432	btrfs_put_root(log);
   3433}
   3434
   3435/*
   3436 * free all the extents used by the tree log.  This should be called
   3437 * at commit time of the full transaction
   3438 */
   3439int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
   3440{
   3441	if (root->log_root) {
   3442		free_log_tree(trans, root->log_root);
   3443		root->log_root = NULL;
   3444		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
   3445	}
   3446	return 0;
   3447}
   3448
   3449int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
   3450			     struct btrfs_fs_info *fs_info)
   3451{
   3452	if (fs_info->log_root_tree) {
   3453		free_log_tree(trans, fs_info->log_root_tree);
   3454		fs_info->log_root_tree = NULL;
   3455		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
   3456	}
   3457	return 0;
   3458}
   3459
   3460/*
   3461 * Check if an inode was logged in the current transaction. This correctly deals
   3462 * with the case where the inode was logged but has a logged_trans of 0, which
    3463	 * happens if the inode is evicted and loaded again, as logged_trans is an
    3464	 * in-memory only field (not persisted).
   3465 *
    3466	 * Returns 1 if the inode was logged before in the current transaction,
    3467	 * 0 if it was not, and < 0 on error.
   3468 */
   3469static int inode_logged(struct btrfs_trans_handle *trans,
   3470			struct btrfs_inode *inode,
   3471			struct btrfs_path *path_in)
   3472{
   3473	struct btrfs_path *path = path_in;
   3474	struct btrfs_key key;
   3475	int ret;
   3476
   3477	if (inode->logged_trans == trans->transid)
   3478		return 1;
   3479
   3480	/*
    3481	 * If logged_trans is not 0, then we know the inode was not logged in
    3482	 * this transaction, so we can return 0 right away.
   3483	 */
   3484	if (inode->logged_trans > 0)
   3485		return 0;
   3486
   3487	/*
   3488	 * If no log tree was created for this root in this transaction, then
    3489	 * the inode cannot have been logged in this transaction. In that case
   3490	 * set logged_trans to anything greater than 0 and less than the current
   3491	 * transaction's ID, to avoid the search below in a future call in case
   3492	 * a log tree gets created after this.
   3493	 */
   3494	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
   3495		inode->logged_trans = trans->transid - 1;
   3496		return 0;
   3497	}
   3498
   3499	/*
   3500	 * We have a log tree and the inode's logged_trans is 0. We can't tell
   3501	 * for sure if the inode was logged before in this transaction by looking
   3502	 * only at logged_trans. We could be pessimistic and assume it was, but
   3503	 * that can lead to unnecessarily logging an inode during rename and link
   3504	 * operations, and then further updating the log in followup rename and
    3505	 * link operations, especially if it's a directory, which adds latency
   3506	 * visible to applications doing a series of rename or link operations.
   3507	 *
   3508	 * A logged_trans of 0 here can mean several things:
   3509	 *
   3510	 * 1) The inode was never logged since the filesystem was mounted, and may
    3511	 *    or may not have been evicted and loaded again;
   3512	 *
   3513	 * 2) The inode was logged in a previous transaction, then evicted and
   3514	 *    then loaded again;
   3515	 *
   3516	 * 3) The inode was logged in the current transaction, then evicted and
   3517	 *    then loaded again.
   3518	 *
   3519	 * For cases 1) and 2) we don't want to return true, but we need to detect
   3520	 * case 3) and return true. So we do a search in the log root for the inode
   3521	 * item.
   3522	 */
   3523	key.objectid = btrfs_ino(inode);
   3524	key.type = BTRFS_INODE_ITEM_KEY;
   3525	key.offset = 0;
   3526
   3527	if (!path) {
   3528		path = btrfs_alloc_path();
   3529		if (!path)
   3530			return -ENOMEM;
   3531	}
   3532
   3533	ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
   3534
   3535	if (path_in)
   3536		btrfs_release_path(path);
   3537	else
   3538		btrfs_free_path(path);
   3539
   3540	/*
   3541	 * Logging an inode always results in logging its inode item. So if we
   3542	 * did not find the item we know the inode was not logged for sure.
    3543	 * did not find the item, we know for sure that the inode was not logged.
   3544	if (ret < 0) {
   3545		return ret;
   3546	} else if (ret > 0) {
   3547		/*
    3548	 * Set logged_trans to a value greater than 0 and less than the
   3549		 * current transaction to avoid doing the search in future calls.
   3550		 */
   3551		inode->logged_trans = trans->transid - 1;
   3552		return 0;
   3553	}
   3554
   3555	/*
   3556	 * The inode was previously logged and then evicted, set logged_trans to
    3557	 * the current transaction's ID, to avoid future tree searches as long as
   3558	 * the inode is not evicted again.
   3559	 */
   3560	inode->logged_trans = trans->transid;
   3561
   3562	/*
   3563	 * If it's a directory, then we must set last_dir_index_offset to the
   3564	 * maximum possible value, so that the next attempt to log the inode does
   3565	 * not skip checking if dir index keys found in modified subvolume tree
   3566	 * leaves have been logged before, otherwise it would result in attempts
   3567	 * to insert duplicate dir index keys in the log tree. This must be done
   3568	 * because last_dir_index_offset is an in-memory only field, not persisted
   3569	 * in the inode item or any other on-disk structure, so its value is lost
   3570	 * once the inode is evicted.
   3571	 */
   3572	if (S_ISDIR(inode->vfs_inode.i_mode))
   3573		inode->last_dir_index_offset = (u64)-1;
   3574
   3575	return 1;
   3576}
   3577
   3578/*
   3579 * Delete a directory entry from the log if it exists.
   3580 *
   3581 * Returns < 0 on error
    3582	 *           1 if the entry does not exist
   3583 *           0 if the entry existed and was successfully deleted
   3584 */
   3585static int del_logged_dentry(struct btrfs_trans_handle *trans,
   3586			     struct btrfs_root *log,
   3587			     struct btrfs_path *path,
   3588			     u64 dir_ino,
   3589			     const char *name, int name_len,
   3590			     u64 index)
   3591{
   3592	struct btrfs_dir_item *di;
   3593
   3594	/*
   3595	 * We only log dir index items of a directory, so we don't need to look
   3596	 * for dir item keys.
   3597	 */
   3598	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
   3599					 index, name, name_len, -1);
   3600	if (IS_ERR(di))
   3601		return PTR_ERR(di);
   3602	else if (!di)
   3603		return 1;
   3604
   3605	/*
   3606	 * We do not need to update the size field of the directory's
   3607	 * inode item because on log replay we update the field to reflect
   3608	 * all existing entries in the directory (see overwrite_item()).
   3609	 */
   3610	return btrfs_delete_one_dir_name(trans, log, path, di);
   3611}
   3612
   3613/*
   3614 * If both a file and directory are logged, and unlinks or renames are
   3615 * mixed in, we have a few interesting corners:
   3616 *
   3617 * create file X in dir Y
   3618 * link file X to X.link in dir Y
   3619 * fsync file X
   3620 * unlink file X but leave X.link
   3621 * fsync dir Y
   3622 *
   3623 * After a crash we would expect only X.link to exist.  But file X
   3624 * didn't get fsync'd again so the log has back refs for X and X.link.
   3625 *
   3626 * We solve this by removing directory entries and inode backrefs from the
   3627 * log when a file that was logged in the current transaction is
   3628 * unlinked.  Any later fsync will include the updated log entries, and
   3629 * we'll be able to reconstruct the proper directory items from backrefs.
   3630 *
    3631	 * This optimization allows us to avoid relogging the entire inode
   3632 * or the entire directory.
   3633 */
   3634void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
   3635				  struct btrfs_root *root,
   3636				  const char *name, int name_len,
   3637				  struct btrfs_inode *dir, u64 index)
   3638{
   3639	struct btrfs_path *path;
   3640	int ret;
   3641
   3642	ret = inode_logged(trans, dir, NULL);
   3643	if (ret == 0)
   3644		return;
   3645	else if (ret < 0) {
   3646		btrfs_set_log_full_commit(trans);
   3647		return;
   3648	}
   3649
   3650	ret = join_running_log_trans(root);
   3651	if (ret)
   3652		return;
   3653
   3654	mutex_lock(&dir->log_mutex);
   3655
   3656	path = btrfs_alloc_path();
   3657	if (!path) {
   3658		ret = -ENOMEM;
   3659		goto out_unlock;
   3660	}
   3661
   3662	ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
   3663				name, name_len, index);
   3664	btrfs_free_path(path);
   3665out_unlock:
   3666	mutex_unlock(&dir->log_mutex);
   3667	if (ret < 0)
   3668		btrfs_set_log_full_commit(trans);
   3669	btrfs_end_log_trans(root);
   3670}
   3671
   3672/* see comments for btrfs_del_dir_entries_in_log */
   3673void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
   3674				struct btrfs_root *root,
   3675				const char *name, int name_len,
   3676				struct btrfs_inode *inode, u64 dirid)
   3677{
   3678	struct btrfs_root *log;
   3679	u64 index;
   3680	int ret;
   3681
   3682	ret = inode_logged(trans, inode, NULL);
   3683	if (ret == 0)
   3684		return;
   3685	else if (ret < 0) {
   3686		btrfs_set_log_full_commit(trans);
   3687		return;
   3688	}
   3689
   3690	ret = join_running_log_trans(root);
   3691	if (ret)
   3692		return;
   3693	log = root->log_root;
   3694	mutex_lock(&inode->log_mutex);
   3695
   3696	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
   3697				  dirid, &index);
   3698	mutex_unlock(&inode->log_mutex);
   3699	if (ret < 0 && ret != -ENOENT)
   3700		btrfs_set_log_full_commit(trans);
   3701	btrfs_end_log_trans(root);
   3702}
   3703
   3704/*
   3705 * creates a range item in the log for 'dirid'.  first_offset and
   3706 * last_offset tell us which parts of the key space the log should
   3707 * be considered authoritative for.
   3708 */
   3709static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
   3710				       struct btrfs_root *log,
   3711				       struct btrfs_path *path,
   3712				       u64 dirid,
   3713				       u64 first_offset, u64 last_offset)
   3714{
   3715	int ret;
   3716	struct btrfs_key key;
   3717	struct btrfs_dir_log_item *item;
   3718
   3719	key.objectid = dirid;
   3720	key.offset = first_offset;
   3721	key.type = BTRFS_DIR_LOG_INDEX_KEY;
   3722	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
   3723	/*
   3724	 * -EEXIST is fine and can happen sporadically when we are logging a
   3725	 * directory and have concurrent insertions in the subvolume's tree for
    3726	 * items from other inodes that result in some dir items being pushed
    3727	 * from one leaf to another in order to accommodate the new items.
   3728	 * This results in logging the same dir index range key.
   3729	 */
   3730	if (ret && ret != -EEXIST)
   3731		return ret;
   3732
   3733	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
   3734			      struct btrfs_dir_log_item);
   3735	if (ret == -EEXIST) {
   3736		const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
   3737
   3738		/*
   3739		 * btrfs_del_dir_entries_in_log() might have been called during
   3740		 * an unlink between the initial insertion of this key and the
   3741		 * current update, or we might be logging a single entry deletion
   3742		 * during a rename, so set the new last_offset to the max value.
   3743		 */
   3744		last_offset = max(last_offset, curr_end);
   3745	}
   3746	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
   3747	btrfs_mark_buffer_dirty(path->nodes[0]);
   3748	btrfs_release_path(path);
   3749	return 0;
   3750}
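/*
 * Editorial illustration of the range item semantics, with hypothetical
 * numbers: if the log holds dir index keys 10 and 15 for a directory, plus a
 * BTRFS_DIR_LOG_INDEX_KEY item covering [8, 20], then on log replay any index
 * key of that directory found in the subvolume tree within [8, 20] but absent
 * from the log (say 12) is treated as deleted, while index keys outside
 * [8, 20] are left untouched.
 */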
   3751
   3752static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
   3753				 struct btrfs_root *log,
   3754				 struct extent_buffer *src,
   3755				 struct btrfs_path *dst_path,
   3756				 int start_slot,
   3757				 int count)
   3758{
   3759	char *ins_data = NULL;
   3760	struct btrfs_item_batch batch;
   3761	struct extent_buffer *dst;
   3762	unsigned long src_offset;
   3763	unsigned long dst_offset;
   3764	struct btrfs_key key;
   3765	u32 item_size;
   3766	int ret;
   3767	int i;
   3768
   3769	ASSERT(count > 0);
   3770	batch.nr = count;
   3771
   3772	if (count == 1) {
   3773		btrfs_item_key_to_cpu(src, &key, start_slot);
   3774		item_size = btrfs_item_size(src, start_slot);
   3775		batch.keys = &key;
   3776		batch.data_sizes = &item_size;
   3777		batch.total_data_size = item_size;
   3778	} else {
   3779		struct btrfs_key *ins_keys;
   3780		u32 *ins_sizes;
   3781
   3782		ins_data = kmalloc(count * sizeof(u32) +
   3783				   count * sizeof(struct btrfs_key), GFP_NOFS);
   3784		if (!ins_data)
   3785			return -ENOMEM;
   3786
   3787		ins_sizes = (u32 *)ins_data;
   3788		ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
   3789		batch.keys = ins_keys;
   3790		batch.data_sizes = ins_sizes;
   3791		batch.total_data_size = 0;
   3792
   3793		for (i = 0; i < count; i++) {
   3794			const int slot = start_slot + i;
   3795
   3796			btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
   3797			ins_sizes[i] = btrfs_item_size(src, slot);
   3798			batch.total_data_size += ins_sizes[i];
   3799		}
   3800	}
   3801
   3802	ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
   3803	if (ret)
   3804		goto out;
   3805
   3806	dst = dst_path->nodes[0];
   3807	/*
   3808	 * Copy all the items in bulk, in a single copy operation. Item data is
   3809	 * organized such that it's placed at the end of a leaf and from right
   3810	 * to left. For example, the data for the second item ends at an offset
   3811	 * that matches the offset where the data for the first item starts, the
   3812	 * data for the third item ends at an offset that matches the offset
    3813	 * where the data of the second item starts, and so on.
   3814	 * Therefore our source and destination start offsets for copy match the
   3815	 * offsets of the last items (highest slots).
   3816	 */
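	/*
	 * Editorial sketch of that layout (item data fills the leaf from the
	 * end, right to left):
	 *
	 *	[hdr][item 0][item 1][item 2]...free...[data 2][data 1][data 0]
	 *	                                       ^
	 *	                                       item_ptr_offset of the
	 *	                                       highest slot, where the
	 *	                                       bulk copy begins
	 */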
   3817	dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
   3818	src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
   3819	copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
   3820	btrfs_release_path(dst_path);
   3821out:
   3822	kfree(ins_data);
   3823
   3824	return ret;
   3825}
   3826
   3827static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
   3828				  struct btrfs_inode *inode,
   3829				  struct btrfs_path *path,
   3830				  struct btrfs_path *dst_path,
   3831				  struct btrfs_log_ctx *ctx,
   3832				  u64 *last_old_dentry_offset)
   3833{
   3834	struct btrfs_root *log = inode->root->log_root;
   3835	struct extent_buffer *src = path->nodes[0];
   3836	const int nritems = btrfs_header_nritems(src);
   3837	const u64 ino = btrfs_ino(inode);
   3838	bool last_found = false;
   3839	int batch_start = 0;
   3840	int batch_size = 0;
   3841	int i;
   3842
   3843	for (i = path->slots[0]; i < nritems; i++) {
   3844		struct btrfs_dir_item *di;
   3845		struct btrfs_key key;
   3846		int ret;
   3847
   3848		btrfs_item_key_to_cpu(src, &key, i);
   3849
   3850		if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
   3851			last_found = true;
   3852			break;
   3853		}
   3854
   3855		di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
   3856		ctx->last_dir_item_offset = key.offset;
   3857
   3858		/*
    3859		 * Skip ranges of items that consist only of dir index keys created
   3860		 * in past transactions. However if we find a gap, we must log a
   3861		 * dir index range item for that gap, so that index keys in that
   3862		 * gap are deleted during log replay.
   3863		 */
   3864		if (btrfs_dir_transid(src, di) < trans->transid) {
   3865			if (key.offset > *last_old_dentry_offset + 1) {
   3866				ret = insert_dir_log_key(trans, log, dst_path,
   3867						 ino, *last_old_dentry_offset + 1,
   3868						 key.offset - 1);
   3869				if (ret < 0)
   3870					return ret;
   3871			}
   3872
   3873			*last_old_dentry_offset = key.offset;
   3874			continue;
   3875		}
   3876		/*
   3877		 * We must make sure that when we log a directory entry, the
   3878		 * corresponding inode, after log replay, has a matching link
   3879		 * count. For example:
   3880		 *
   3881		 * touch foo
   3882		 * mkdir mydir
   3883		 * sync
   3884		 * ln foo mydir/bar
   3885		 * xfs_io -c "fsync" mydir
   3886		 * <crash>
   3887		 * <mount fs and log replay>
   3888		 *
    3889		 * This would result in an fsync log that, when replayed, leaves
    3890		 * our file inode with a link count of 1 but two directory entries
    3891		 * pointing to the same inode. After removing one of the names,
    3892		 * it would not be possible to remove the other name, which always
    3893		 * resulted in stale file handle errors, and it would not be
   3894		 * possible to rmdir the parent directory, since its i_size could
   3895		 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
   3896		 * resulting in -ENOTEMPTY errors.
   3897		 */
   3898		if (!ctx->log_new_dentries) {
   3899			struct btrfs_key di_key;
   3900
   3901			btrfs_dir_item_key_to_cpu(src, di, &di_key);
   3902			if (di_key.type != BTRFS_ROOT_ITEM_KEY)
   3903				ctx->log_new_dentries = true;
   3904		}
   3905
   3906		if (!ctx->logged_before)
   3907			goto add_to_batch;
   3908
   3909		/*
    3910		 * If the inode was logged before and we have logged dir items, we
    3911		 * can skip checking if any item with a key offset larger than the
    3912		 * last one we logged is in the log tree, saving time and avoiding
   3913		 * contention on the log tree. We can only rely on the value of
   3914		 * last_dir_index_offset when we know for sure that the inode was
   3915		 * previously logged in the current transaction.
   3916		 */
   3917		if (key.offset > inode->last_dir_index_offset)
   3918			goto add_to_batch;
   3919		/*
   3920		 * Check if the key was already logged before. If not we can add
   3921		 * it to a batch for bulk insertion.
   3922		 */
   3923		ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0);
   3924		if (ret < 0) {
   3925			return ret;
   3926		} else if (ret > 0) {
   3927			btrfs_release_path(dst_path);
   3928			goto add_to_batch;
   3929		}
   3930
   3931		/*
   3932		 * Item exists in the log. Overwrite the item in the log if it
   3933		 * has different content or do nothing if it has exactly the same
   3934		 * content. And then flush the current batch if any - do it after
   3935		 * overwriting the current item, or we would deadlock otherwise,
   3936		 * since we are holding a path for the existing item.
   3937		 */
   3938		ret = do_overwrite_item(trans, log, dst_path, src, i, &key);
   3939		if (ret < 0)
   3940			return ret;
   3941
   3942		if (batch_size > 0) {
   3943			ret = flush_dir_items_batch(trans, log, src, dst_path,
   3944						    batch_start, batch_size);
   3945			if (ret < 0)
   3946				return ret;
   3947			batch_size = 0;
   3948		}
   3949		continue;
   3950add_to_batch:
   3951		if (batch_size == 0)
   3952			batch_start = i;
   3953		batch_size++;
   3954	}
   3955
   3956	if (batch_size > 0) {
   3957		int ret;
   3958
   3959		ret = flush_dir_items_batch(trans, log, src, dst_path,
   3960					    batch_start, batch_size);
   3961		if (ret < 0)
   3962			return ret;
   3963	}
   3964
   3965	return last_found ? 1 : 0;
   3966}
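/*
 * Editorial note on the batching in process_dir_items_leaf(): contiguous dir
 * index items that are new to the log are accumulated into the range
 * [batch_start, batch_start + batch_size) and copied with a single bulk
 * insertion; when an item already exists in the log it is overwritten in
 * place first and any pending batch is flushed right after, in that order
 * because a path is held on the existing item during the overwrite.
 */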
   3967
   3968/*
   3969 * log all the items included in the current transaction for a given
   3970 * directory.  This also creates the range items in the log tree required
   3971 * to replay anything deleted before the fsync
   3972 */
   3973static noinline int log_dir_items(struct btrfs_trans_handle *trans,
   3974			  struct btrfs_inode *inode,
   3975			  struct btrfs_path *path,
   3976			  struct btrfs_path *dst_path,
   3977			  struct btrfs_log_ctx *ctx,
   3978			  u64 min_offset, u64 *last_offset_ret)
   3979{
   3980	struct btrfs_key min_key;
   3981	struct btrfs_root *root = inode->root;
   3982	struct btrfs_root *log = root->log_root;
   3983	int err = 0;
   3984	int ret;
   3985	u64 last_old_dentry_offset = min_offset - 1;
   3986	u64 last_offset = (u64)-1;
   3987	u64 ino = btrfs_ino(inode);
   3988
   3989	min_key.objectid = ino;
   3990	min_key.type = BTRFS_DIR_INDEX_KEY;
   3991	min_key.offset = min_offset;
   3992
   3993	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
   3994
   3995	/*
   3996	 * we didn't find anything from this transaction, see if there
   3997	 * is anything at all
   3998	 */
   3999	if (ret != 0 || min_key.objectid != ino ||
   4000	    min_key.type != BTRFS_DIR_INDEX_KEY) {
   4001		min_key.objectid = ino;
   4002		min_key.type = BTRFS_DIR_INDEX_KEY;
   4003		min_key.offset = (u64)-1;
   4004		btrfs_release_path(path);
   4005		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
   4006		if (ret < 0) {
   4007			btrfs_release_path(path);
   4008			return ret;
   4009		}
   4010		ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
   4011
    4012	/* If ret == 0 there are items for this type,
    4013	 * create a range to tell us the last key of this type.
    4014	 * Otherwise, there are no items in this directory after
    4015	 * *min_offset, and we create a range to indicate that.
    4016	 */
   4017		if (ret == 0) {
   4018			struct btrfs_key tmp;
   4019
   4020			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
   4021					      path->slots[0]);
   4022			if (tmp.type == BTRFS_DIR_INDEX_KEY)
   4023				last_old_dentry_offset = tmp.offset;
   4024		}
   4025		goto done;
   4026	}
   4027
   4028	/* go backward to find any previous key */
   4029	ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
   4030	if (ret == 0) {
   4031		struct btrfs_key tmp;
   4032
   4033		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
   4034		/*
   4035		 * The dir index key before the first one we found that needs to
   4036		 * be logged might be in a previous leaf, and there might be a
   4037		 * gap between these keys, meaning that we had deletions that
   4038		 * happened. So the key range item we log (key type
   4039		 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
   4040		 * previous key's offset plus 1, so that those deletes are replayed.
   4041		 */
   4042		if (tmp.type == BTRFS_DIR_INDEX_KEY)
   4043			last_old_dentry_offset = tmp.offset;
   4044	}
   4045	btrfs_release_path(path);
   4046
   4047	/*
   4048	 * Find the first key from this transaction again.  See the note for
   4049	 * log_new_dir_dentries, if we're logging a directory recursively we
   4050	 * won't be holding its i_mutex, which means we can modify the directory
   4051	 * while we're logging it.  If we remove an entry between our first
   4052	 * search and this search we'll not find the key again and can just
   4053	 * bail.
   4054	 */
   4055search:
   4056	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
   4057	if (ret != 0)
   4058		goto done;
   4059
   4060	/*
   4061	 * we have a block from this transaction, log every item in it
   4062	 * from our directory
   4063	 */
   4064	while (1) {
   4065		ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
   4066					     &last_old_dentry_offset);
   4067		if (ret != 0) {
   4068			if (ret < 0)
   4069				err = ret;
   4070			goto done;
   4071		}
   4072		path->slots[0] = btrfs_header_nritems(path->nodes[0]);
   4073
   4074		/*
   4075		 * look ahead to the next item and see if it is also
   4076		 * from this directory and from this transaction
   4077		 */
   4078		ret = btrfs_next_leaf(root, path);
   4079		if (ret) {
   4080			if (ret == 1)
   4081				last_offset = (u64)-1;
   4082			else
   4083				err = ret;
   4084			goto done;
   4085		}
   4086		btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
   4087		if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
   4088			last_offset = (u64)-1;
   4089			goto done;
   4090		}
   4091		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
   4092			/*
   4093			 * The next leaf was not changed in the current transaction
   4094			 * and has at least one dir index key.
   4095			 * We check for the next key because there might have been
   4096			 * one or more deletions between the last key we logged and
   4097			 * that next key. So the key range item we log (key type
   4098			 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
   4099			 * offset minus 1, so that those deletes are replayed.
   4100			 */
   4101			last_offset = min_key.offset - 1;
   4102			goto done;
   4103		}
   4104		if (need_resched()) {
   4105			btrfs_release_path(path);
   4106			cond_resched();
   4107			goto search;
   4108		}
   4109	}
   4110done:
   4111	btrfs_release_path(path);
   4112	btrfs_release_path(dst_path);
   4113
   4114	if (err == 0) {
   4115		*last_offset_ret = last_offset;
   4116		/*
   4117		 * In case the leaf was changed in the current transaction but
   4118		 * all its dir items are from a past transaction, the last item
   4119		 * in the leaf is a dir item and there's no gap between that last
   4120		 * dir item and the first one on the next leaf (which did not
   4121		 * change in the current transaction), then we don't need to log
    4122		 * a range, as last_old_dentry_offset equals last_offset.
   4123		 */
   4124		ASSERT(last_old_dentry_offset <= last_offset);
   4125		if (last_old_dentry_offset < last_offset) {
   4126			ret = insert_dir_log_key(trans, log, path, ino,
   4127						 last_old_dentry_offset + 1,
   4128						 last_offset);
   4129			if (ret)
   4130				err = ret;
   4131		}
   4132	}
   4133	return err;
   4134}
   4135
   4136/*
    4137	 * Logging directories is very similar to logging inodes. We find all the items
   4138 * from the current transaction and write them to the log.
   4139 *
   4140 * The recovery code scans the directory in the subvolume, and if it finds a
   4141 * key in the range logged that is not present in the log tree, then it means
   4142 * that dir entry was unlinked during the transaction.
   4143 *
   4144 * In order for that scan to work, we must include one key smaller than
    4145	 * the smallest key logged by this transaction and one key larger than
    4146	 * the largest key logged by this transaction.
   4147 */
   4148static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
   4149			  struct btrfs_inode *inode,
   4150			  struct btrfs_path *path,
   4151			  struct btrfs_path *dst_path,
   4152			  struct btrfs_log_ctx *ctx)
   4153{
   4154	u64 min_key;
   4155	u64 max_key;
   4156	int ret;
   4157
   4158	min_key = BTRFS_DIR_START_INDEX;
   4159	max_key = 0;
   4160	ctx->last_dir_item_offset = inode->last_dir_index_offset;
   4161
   4162	while (1) {
   4163		ret = log_dir_items(trans, inode, path, dst_path,
   4164				ctx, min_key, &max_key);
   4165		if (ret)
   4166			return ret;
   4167		if (max_key == (u64)-1)
   4168			break;
   4169		min_key = max_key + 1;
   4170	}
   4171
   4172	inode->last_dir_index_offset = ctx->last_dir_item_offset;
   4173
   4174	return 0;
   4175}
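/*
 * Editorial walk-through of the loop above, with hypothetical numbers:
 * suppose dir index items 10-20 were added in this transaction and older
 * entries exist at 30-35 on the next leaf. A first log_dir_items() pass logs
 * items 10-20 plus a range item claiming [2, 29] and returns max_key == 29.
 * The second pass starts at 30, finds nothing from this transaction, logs a
 * range item claiming [36, (u64)-1] (past the last existing entry) and
 * returns max_key == (u64)-1, ending the loop. On replay, entries missing
 * from the log are deleted only within the claimed ranges, so the old
 * entries at 30-35 survive.
 */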
   4176
   4177/*
   4178 * a helper function to drop items from the log before we relog an
   4179 * inode.  max_key_type indicates the highest item type to remove.
   4180 * This cannot be run for file data extents because it does not
   4181 * free the extents they point to.
   4182 */
   4183static int drop_inode_items(struct btrfs_trans_handle *trans,
   4184				  struct btrfs_root *log,
   4185				  struct btrfs_path *path,
   4186				  struct btrfs_inode *inode,
   4187				  int max_key_type)
   4188{
   4189	int ret;
   4190	struct btrfs_key key;
   4191	struct btrfs_key found_key;
   4192	int start_slot;
   4193
   4194	key.objectid = btrfs_ino(inode);
   4195	key.type = max_key_type;
   4196	key.offset = (u64)-1;
   4197
   4198	while (1) {
   4199		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
   4200		BUG_ON(ret == 0); /* Logic error */
   4201		if (ret < 0)
   4202			break;
   4203
   4204		if (path->slots[0] == 0)
   4205			break;
   4206
   4207		path->slots[0]--;
   4208		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
   4209				      path->slots[0]);
   4210
   4211		if (found_key.objectid != key.objectid)
   4212			break;
   4213
   4214		found_key.offset = 0;
   4215		found_key.type = 0;
   4216		ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
   4217		if (ret < 0)
   4218			break;
   4219
   4220		ret = btrfs_del_items(trans, log, path, start_slot,
   4221				      path->slots[0] - start_slot + 1);
   4222		/*
   4223		 * If start slot isn't 0 then we don't need to re-search, we've
   4224		 * found the last guy with the objectid in this tree.
   4225		 */
   4226		if (ret || start_slot != 0)
   4227			break;
   4228		btrfs_release_path(path);
   4229	}
   4230	btrfs_release_path(path);
   4231	if (ret > 0)
   4232		ret = 0;
   4233	return ret;
   4234}
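/*
 * Editorial walk-through of drop_inode_items(), with hypothetical values:
 * for objectid 257 and max_key_type BTRFS_XATTR_ITEM_KEY, each iteration
 * searches for (257, BTRFS_XATTR_ITEM_KEY, (u64)-1), steps back one slot to
 * the last item at or below that key, bin-searches the same leaf for
 * (257, 0, 0) to find the first slot belonging to the inode, and deletes
 * that slot range in one go. If the range did not start at slot 0, all
 * remaining items of the inode were in this leaf and the loop ends.
 */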
   4235
   4236static int truncate_inode_items(struct btrfs_trans_handle *trans,
   4237				struct btrfs_root *log_root,
   4238				struct btrfs_inode *inode,
   4239				u64 new_size, u32 min_type)
   4240{
   4241	struct btrfs_truncate_control control = {
   4242		.new_size = new_size,
   4243		.ino = btrfs_ino(inode),
   4244		.min_type = min_type,
   4245		.skip_ref_updates = true,
   4246	};
   4247
   4248	return btrfs_truncate_inode_items(trans, log_root, &control);
   4249}
   4250
   4251static void fill_inode_item(struct btrfs_trans_handle *trans,
   4252			    struct extent_buffer *leaf,
   4253			    struct btrfs_inode_item *item,
   4254			    struct inode *inode, int log_inode_only,
   4255			    u64 logged_isize)
   4256{
   4257	struct btrfs_map_token token;
   4258	u64 flags;
   4259
   4260	btrfs_init_map_token(&token, leaf);
   4261
   4262	if (log_inode_only) {
    4263		/* Set the generation to zero so the recovery code
    4264		 * can tell the difference between logging
    4265		 * just to say 'this inode exists' and logging
    4266		 * to say 'update this inode with these values'.
    4267		 */
   4268		btrfs_set_token_inode_generation(&token, item, 0);
   4269		btrfs_set_token_inode_size(&token, item, logged_isize);
   4270	} else {
   4271		btrfs_set_token_inode_generation(&token, item,
   4272						 BTRFS_I(inode)->generation);
   4273		btrfs_set_token_inode_size(&token, item, inode->i_size);
   4274	}
   4275
   4276	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
   4277	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
   4278	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
   4279	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
   4280
   4281	btrfs_set_token_timespec_sec(&token, &item->atime,
   4282				     inode->i_atime.tv_sec);
   4283	btrfs_set_token_timespec_nsec(&token, &item->atime,
   4284				      inode->i_atime.tv_nsec);
   4285
   4286	btrfs_set_token_timespec_sec(&token, &item->mtime,
   4287				     inode->i_mtime.tv_sec);
   4288	btrfs_set_token_timespec_nsec(&token, &item->mtime,
   4289				      inode->i_mtime.tv_nsec);
   4290
   4291	btrfs_set_token_timespec_sec(&token, &item->ctime,
   4292				     inode->i_ctime.tv_sec);
   4293	btrfs_set_token_timespec_nsec(&token, &item->ctime,
   4294				      inode->i_ctime.tv_nsec);
   4295
   4296	/*
    4297	 * We do not need to set the nbytes field; in fact during a fast fsync
    4298	 * its value may not even be correct, since a fast fsync does not wait
    4299	 * for ordered extent completion (which is where we update nbytes), it
    4300	 * only waits for writeback to complete. During log replay, as we find
   4301	 * file extent items and replay them, we adjust the nbytes field of the
   4302	 * inode item in subvolume tree as needed (see overwrite_item()).
   4303	 */
   4304
   4305	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
   4306	btrfs_set_token_inode_transid(&token, item, trans->transid);
   4307	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
   4308	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
   4309					  BTRFS_I(inode)->ro_flags);
   4310	btrfs_set_token_inode_flags(&token, item, flags);
   4311	btrfs_set_token_inode_block_group(&token, item, 0);
   4312}
   4313
   4314static int log_inode_item(struct btrfs_trans_handle *trans,
   4315			  struct btrfs_root *log, struct btrfs_path *path,
   4316			  struct btrfs_inode *inode, bool inode_item_dropped)
   4317{
   4318	struct btrfs_inode_item *inode_item;
   4319	int ret;
   4320
   4321	/*
   4322	 * If we are doing a fast fsync and the inode was logged before in the
   4323	 * current transaction, then we know the inode was previously logged and
   4324	 * it exists in the log tree. For performance reasons, in this case use
   4325	 * btrfs_search_slot() directly with ins_len set to 0 so that we never
   4326	 * attempt a write lock on the leaf's parent, which adds unnecessary lock
   4327	 * contention in case there are concurrent fsyncs for other inodes of the
   4328	 * same subvolume. Using btrfs_insert_empty_item() when the inode item
   4329	 * already exists can also result in unnecessarily splitting a leaf.
   4330	 */
   4331	if (!inode_item_dropped && inode->logged_trans == trans->transid) {
   4332		ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
   4333		ASSERT(ret <= 0);
   4334		if (ret > 0)
   4335			ret = -ENOENT;
   4336	} else {
   4337		/*
   4338		 * This means it is the first fsync in the current transaction,
   4339		 * so the inode item is not in the log and we need to insert it.
    4340		 * We can never get -EEXIST because we are only called for a fast
    4341		 * fsync, and if an inode eviction happens after the inode was
    4342		 * logged in the current transaction, then when we load the inode
    4343		 * again we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime flags
    4344		 * and set ->logged_trans to 0.
   4345		 */
   4346		ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
   4347					      sizeof(*inode_item));
   4348		ASSERT(ret != -EEXIST);
   4349	}
   4350	if (ret)
   4351		return ret;
   4352	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
   4353				    struct btrfs_inode_item);
   4354	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
   4355			0, 0);
   4356	btrfs_release_path(path);
   4357	return 0;
   4358}
   4359
   4360static int log_csums(struct btrfs_trans_handle *trans,
   4361		     struct btrfs_inode *inode,
   4362		     struct btrfs_root *log_root,
   4363		     struct btrfs_ordered_sum *sums)
   4364{
   4365	const u64 lock_end = sums->bytenr + sums->len - 1;
   4366	struct extent_state *cached_state = NULL;
   4367	int ret;
   4368
   4369	/*
    4370	 * If this inode was not the target of reflink operations with new
    4371	 * extents in the current transaction, then take the fast path: no need
    4372	 * to worry about logging checksum items with overlapping ranges.
   4373	 */
   4374	if (inode->last_reflink_trans < trans->transid)
   4375		return btrfs_csum_file_blocks(trans, log_root, sums);
   4376
   4377	/*
   4378	 * Serialize logging for checksums. This is to avoid racing with the
   4379	 * same checksum being logged by another task that is logging another
   4380	 * file which happens to refer to the same extent as well. Such races
   4381	 * can leave checksum items in the log with overlapping ranges.
   4382	 */
   4383	ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
   4384			       lock_end, &cached_state);
   4385	if (ret)
   4386		return ret;
   4387	/*
   4388	 * Due to extent cloning, we might have logged a csum item that covers a
   4389	 * subrange of a cloned extent, and later we can end up logging a csum
   4390	 * item for a larger subrange of the same extent or the entire range.
   4391	 * This would leave csum items in the log tree that cover the same range
   4392	 * and break the searches for checksums in the log tree, resulting in
   4393	 * some checksums missing in the fs/subvolume tree. So just delete (or
   4394	 * trim and adjust) any existing csum items in the log for this range.
   4395	 */
   4396	ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
   4397	if (!ret)
   4398		ret = btrfs_csum_file_blocks(trans, log_root, sums);
   4399
   4400	unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
   4401			     &cached_state);
   4402
   4403	return ret;
   4404}
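/*
 * Editorial sketch of the overlap that log_csums() guards against, with
 * hypothetical ranges: due to extent cloning, an earlier fsync may have
 * logged a csum item for a subrange of an extent while the current one
 * covers a larger subrange:
 *
 *	csum item already in the log:        [2048, 4096)
 *	csum item being logged now:      [0, 8192)
 *
 * Deleting or trimming the existing item first keeps the log free of
 * overlapping csum items, which would otherwise break csum lookups during
 * log replay.
 */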
   4405
   4406static noinline int copy_items(struct btrfs_trans_handle *trans,
   4407			       struct btrfs_inode *inode,
   4408			       struct btrfs_path *dst_path,
   4409			       struct btrfs_path *src_path,
   4410			       int start_slot, int nr, int inode_only,
   4411			       u64 logged_isize)
   4412{
   4413	struct btrfs_root *log = inode->root->log_root;
   4414	struct btrfs_file_extent_item *extent;
   4415	struct extent_buffer *src = src_path->nodes[0];
   4416	int ret = 0;
   4417	struct btrfs_key *ins_keys;
   4418	u32 *ins_sizes;
   4419	struct btrfs_item_batch batch;
   4420	char *ins_data;
   4421	int i;
   4422	int dst_index;
   4423	const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
   4424	const u64 i_size = i_size_read(&inode->vfs_inode);
   4425
   4426	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
   4427			   nr * sizeof(u32), GFP_NOFS);
   4428	if (!ins_data)
   4429		return -ENOMEM;
   4430
   4431	ins_sizes = (u32 *)ins_data;
   4432	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
   4433	batch.keys = ins_keys;
   4434	batch.data_sizes = ins_sizes;
   4435	batch.total_data_size = 0;
   4436	batch.nr = 0;
   4437
   4438	dst_index = 0;
   4439	for (i = 0; i < nr; i++) {
   4440		const int src_slot = start_slot + i;
   4441		struct btrfs_root *csum_root;
   4442		struct btrfs_ordered_sum *sums;
   4443		struct btrfs_ordered_sum *sums_next;
   4444		LIST_HEAD(ordered_sums);
   4445		u64 disk_bytenr;
   4446		u64 disk_num_bytes;
   4447		u64 extent_offset;
   4448		u64 extent_num_bytes;
   4449		bool is_old_extent;
   4450
   4451		btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
   4452
   4453		if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
   4454			goto add_to_batch;
   4455
   4456		extent = btrfs_item_ptr(src, src_slot,
   4457					struct btrfs_file_extent_item);
   4458
   4459		is_old_extent = (btrfs_file_extent_generation(src, extent) <
   4460				 trans->transid);
   4461
   4462		/*
   4463		 * Don't copy extents from past generations. That would make us
   4464		 * log a lot more metadata for common cases like doing only a
    4465		 * few random writes into a file and then fsyncing it for the first
   4466		 * time or after the full sync flag is set on the inode. We can
   4467		 * get leaves full of extent items, most of which are from past
   4468		 * generations, so we can skip them - as long as the inode has
   4469		 * not been the target of a reflink operation in this transaction,
   4470		 * as in that case it might have had file extent items with old
   4471		 * generations copied into it. We also must always log prealloc
   4472		 * extents that start at or beyond eof, otherwise we would lose
   4473		 * them on log replay.
   4474		 */
   4475		if (is_old_extent &&
   4476		    ins_keys[dst_index].offset < i_size &&
   4477		    inode->last_reflink_trans < trans->transid)
   4478			continue;
   4479
   4480		if (skip_csum)
   4481			goto add_to_batch;
   4482
   4483		/* Only regular extents have checksums. */
   4484		if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
   4485			goto add_to_batch;
   4486
   4487		/*
   4488		 * If it's an extent created in a past transaction, then its
   4489		 * checksums are already accessible from the committed csum tree,
   4490		 * no need to log them.
   4491		 */
   4492		if (is_old_extent)
   4493			goto add_to_batch;
   4494
   4495		disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
   4496		/* If it's an explicit hole, there are no checksums. */
   4497		if (disk_bytenr == 0)
   4498			goto add_to_batch;
   4499
   4500		disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
   4501
   4502		if (btrfs_file_extent_compression(src, extent)) {
   4503			extent_offset = 0;
   4504			extent_num_bytes = disk_num_bytes;
   4505		} else {
   4506			extent_offset = btrfs_file_extent_offset(src, extent);
   4507			extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
   4508		}
   4509
   4510		csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
   4511		disk_bytenr += extent_offset;
   4512		ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
   4513					       disk_bytenr + extent_num_bytes - 1,
   4514					       &ordered_sums, 0);
   4515		if (ret)
   4516			goto out;
   4517
   4518		list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
   4519			if (!ret)
   4520				ret = log_csums(trans, inode, log, sums);
   4521			list_del(&sums->list);
   4522			kfree(sums);
   4523		}
   4524		if (ret)
   4525			goto out;
   4526
   4527add_to_batch:
   4528		ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
   4529		batch.total_data_size += ins_sizes[dst_index];
   4530		batch.nr++;
   4531		dst_index++;
   4532	}
   4533
   4534	/*
   4535	 * We have a leaf full of old extent items that don't need to be logged,
   4536	 * so we don't need to do anything.
   4537	 */
   4538	if (batch.nr == 0)
   4539		goto out;
   4540
   4541	ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
   4542	if (ret)
   4543		goto out;
   4544
   4545	dst_index = 0;
   4546	for (i = 0; i < nr; i++) {
   4547		const int src_slot = start_slot + i;
   4548		const int dst_slot = dst_path->slots[0] + dst_index;
   4549		struct btrfs_key key;
   4550		unsigned long src_offset;
   4551		unsigned long dst_offset;
   4552
   4553		/*
   4554		 * We're done, all the remaining items in the source leaf
   4555		 * correspond to old file extent items.
   4556		 */
   4557		if (dst_index >= batch.nr)
   4558			break;
   4559
   4560		btrfs_item_key_to_cpu(src, &key, src_slot);
   4561
   4562		if (key.type != BTRFS_EXTENT_DATA_KEY)
   4563			goto copy_item;
   4564
   4565		extent = btrfs_item_ptr(src, src_slot,
   4566					struct btrfs_file_extent_item);
   4567
   4568		/* See the comment in the previous loop, same logic. */
   4569		if (btrfs_file_extent_generation(src, extent) < trans->transid &&
   4570		    key.offset < i_size &&
   4571		    inode->last_reflink_trans < trans->transid)
   4572			continue;
   4573
   4574copy_item:
   4575		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
   4576		src_offset = btrfs_item_ptr_offset(src, src_slot);
   4577
   4578		if (key.type == BTRFS_INODE_ITEM_KEY) {
   4579			struct btrfs_inode_item *inode_item;
   4580
   4581			inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
   4582						    struct btrfs_inode_item);
   4583			fill_inode_item(trans, dst_path->nodes[0], inode_item,
   4584					&inode->vfs_inode,
   4585					inode_only == LOG_INODE_EXISTS,
   4586					logged_isize);
   4587		} else {
   4588			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
   4589					   src_offset, ins_sizes[dst_index]);
   4590		}
   4591
   4592		dst_index++;
   4593	}
   4594
   4595	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
   4596	btrfs_release_path(dst_path);
   4597out:
   4598	kfree(ins_data);
   4599
   4600	return ret;
   4601}
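/*
 * Editorial note on copy_items(): the first loop above selects which source
 * items to copy, skipping file extent items from past generations, and logs
 * the checksums of new regular extents; once the batch insertion has
 * reserved space in the log leaf, the second loop walks the same slots again
 * and copies the item data, rewriting inode items through fill_inode_item()
 * instead of copying them verbatim.
 */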
   4602
   4603static int extent_cmp(void *priv, const struct list_head *a,
   4604		      const struct list_head *b)
   4605{
   4606	const struct extent_map *em1, *em2;
   4607
   4608	em1 = list_entry(a, struct extent_map, list);
   4609	em2 = list_entry(b, struct extent_map, list);
   4610
   4611	if (em1->start < em2->start)
   4612		return -1;
   4613	else if (em1->start > em2->start)
   4614		return 1;
   4615	return 0;
   4616}
   4617
   4618static int log_extent_csums(struct btrfs_trans_handle *trans,
   4619			    struct btrfs_inode *inode,
   4620			    struct btrfs_root *log_root,
   4621			    const struct extent_map *em,
   4622			    struct btrfs_log_ctx *ctx)
   4623{
   4624	struct btrfs_ordered_extent *ordered;
   4625	struct btrfs_root *csum_root;
   4626	u64 csum_offset;
   4627	u64 csum_len;
   4628	u64 mod_start = em->mod_start;
   4629	u64 mod_len = em->mod_len;
   4630	LIST_HEAD(ordered_sums);
   4631	int ret = 0;
   4632
   4633	if (inode->flags & BTRFS_INODE_NODATASUM ||
   4634	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
   4635	    em->block_start == EXTENT_MAP_HOLE)
   4636		return 0;
   4637
   4638	list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
   4639		const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
   4640		const u64 mod_end = mod_start + mod_len;
   4641		struct btrfs_ordered_sum *sums;
   4642
   4643		if (mod_len == 0)
   4644			break;
   4645
   4646		if (ordered_end <= mod_start)
   4647			continue;
   4648		if (mod_end <= ordered->file_offset)
   4649			break;
   4650
   4651		/*
   4652		 * We are going to copy all the csums on this ordered extent, so
   4653		 * go ahead and adjust mod_start and mod_len in case this ordered
   4654		 * extent has already been logged.
   4655		 */
   4656		if (ordered->file_offset > mod_start) {
   4657			if (ordered_end >= mod_end)
   4658				mod_len = ordered->file_offset - mod_start;
   4659			/*
   4660			 * If we have this case
   4661			 *
   4662			 * |--------- logged extent ---------|
   4663			 *       |----- ordered extent ----|
   4664			 *
   4665			 * Just don't mess with mod_start and mod_len, we'll
   4666			 * just end up logging more csums than we need and it
   4667			 * will be ok.
   4668			 */
   4669		} else {
   4670			if (ordered_end < mod_end) {
   4671				mod_len = mod_end - ordered_end;
   4672				mod_start = ordered_end;
   4673			} else {
   4674				mod_len = 0;
   4675			}
   4676		}
   4677
   4678		/*
    4679		 * This keeps us from looping in the above case of an ordered
    4680		 * extent that falls inside the logged extent.
   4681		 */
   4682		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
   4683			continue;
   4684
   4685		list_for_each_entry(sums, &ordered->list, list) {
   4686			ret = log_csums(trans, inode, log_root, sums);
   4687			if (ret)
   4688				return ret;
   4689		}
   4690	}
   4691
   4692	/* We're done, found all csums in the ordered extents. */
   4693	if (mod_len == 0)
   4694		return 0;
   4695
    4696	/* If the extent is compressed we have to save the entire range of csums. */
   4697	if (em->compress_type) {
   4698		csum_offset = 0;
   4699		csum_len = max(em->block_len, em->orig_block_len);
   4700	} else {
   4701		csum_offset = mod_start - em->start;
   4702		csum_len = mod_len;
   4703	}
   4704
    4705	/* Block start is already adjusted for the file extent offset. */
   4706	csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
   4707	ret = btrfs_lookup_csums_range(csum_root,
   4708				       em->block_start + csum_offset,
   4709				       em->block_start + csum_offset +
   4710				       csum_len - 1, &ordered_sums, 0);
   4711	if (ret)
   4712		return ret;
   4713
   4714	while (!list_empty(&ordered_sums)) {
   4715		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
   4716						   struct btrfs_ordered_sum,
   4717						   list);
   4718		if (!ret)
   4719			ret = log_csums(trans, inode, log_root, sums);
   4720		list_del(&sums->list);
   4721		kfree(sums);
   4722	}
   4723
   4724	return ret;
   4725}
   4726
   4727static int log_one_extent(struct btrfs_trans_handle *trans,
   4728			  struct btrfs_inode *inode,
   4729			  const struct extent_map *em,
   4730			  struct btrfs_path *path,
   4731			  struct btrfs_log_ctx *ctx)
   4732{
   4733	struct btrfs_drop_extents_args drop_args = { 0 };
   4734	struct btrfs_root *log = inode->root->log_root;
   4735	struct btrfs_file_extent_item fi = { 0 };
   4736	struct extent_buffer *leaf;
   4737	struct btrfs_key key;
   4738	u64 extent_offset = em->start - em->orig_start;
   4739	u64 block_len;
   4740	int ret;
   4741
   4742	btrfs_set_stack_file_extent_generation(&fi, trans->transid);
   4743	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
   4744		btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
   4745	else
   4746		btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
   4747
   4748	block_len = max(em->block_len, em->orig_block_len);
   4749	if (em->compress_type != BTRFS_COMPRESS_NONE) {
   4750		btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
   4751		btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
   4752	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
   4753		btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
   4754							extent_offset);
   4755		btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
   4756	}
   4757
   4758	btrfs_set_stack_file_extent_offset(&fi, extent_offset);
   4759	btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
   4760	btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
   4761	btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
   4762
   4763	ret = log_extent_csums(trans, inode, log, em, ctx);
   4764	if (ret)
   4765		return ret;
   4766
   4767	/*
   4768	 * If this is the first time we are logging the inode in the current
   4769	 * transaction, we can avoid btrfs_drop_extents(), which is expensive
   4770	 * because it does a deletion search, which always acquires write locks
   4771	 * for extent buffers at levels 2, 1 and 0. This not only wastes time
   4772	 * but also adds significant contention in a log tree, since log trees
   4773	 * are small, with a root at level 2 or 3 at most, due to their short
   4774	 * life span.
   4775	 */
   4776	if (ctx->logged_before) {
   4777		drop_args.path = path;
   4778		drop_args.start = em->start;
   4779		drop_args.end = em->start + em->len;
   4780		drop_args.replace_extent = true;
   4781		drop_args.extent_item_size = sizeof(fi);
   4782		ret = btrfs_drop_extents(trans, log, inode, &drop_args);
   4783		if (ret)
   4784			return ret;
   4785	}
   4786
   4787	if (!drop_args.extent_inserted) {
   4788		key.objectid = btrfs_ino(inode);
   4789		key.type = BTRFS_EXTENT_DATA_KEY;
   4790		key.offset = em->start;
   4791
   4792		ret = btrfs_insert_empty_item(trans, log, path, &key,
   4793					      sizeof(fi));
   4794		if (ret)
   4795			return ret;
   4796	}
   4797	leaf = path->nodes[0];
   4798	write_extent_buffer(leaf, &fi,
   4799			    btrfs_item_ptr_offset(leaf, path->slots[0]),
   4800			    sizeof(fi));
   4801	btrfs_mark_buffer_dirty(leaf);
   4802
   4803	btrfs_release_path(path);
   4804
   4805	return ret;
   4806}
   4807
   4808/*
   4809 * Log all prealloc extents beyond the inode's i_size to make sure we do not
   4810 * lose them after doing a full/fast fsync and replaying the log. We scan the
   4811 * subvolume's root instead of iterating the inode's extent map tree because
   4812 * otherwise we can log incorrect extent items based on extent map conversion.
    4813	 * That can happen because extent maps are merged when they
   4814 * are not in the extent map tree's list of modified extents.
   4815 */
   4816static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
   4817				      struct btrfs_inode *inode,
   4818				      struct btrfs_path *path)
   4819{
   4820	struct btrfs_root *root = inode->root;
   4821	struct btrfs_key key;
   4822	const u64 i_size = i_size_read(&inode->vfs_inode);
   4823	const u64 ino = btrfs_ino(inode);
   4824	struct btrfs_path *dst_path = NULL;
   4825	bool dropped_extents = false;
   4826	u64 truncate_offset = i_size;
   4827	struct extent_buffer *leaf;
   4828	int slot;
   4829	int ins_nr = 0;
   4830	int start_slot;
   4831	int ret;
   4832
   4833	if (!(inode->flags & BTRFS_INODE_PREALLOC))
   4834		return 0;
   4835
   4836	key.objectid = ino;
   4837	key.type = BTRFS_EXTENT_DATA_KEY;
   4838	key.offset = i_size;
   4839	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   4840	if (ret < 0)
   4841		goto out;
   4842
   4843	/*
   4844	 * We must check if there is a prealloc extent that starts before the
   4845	 * i_size and crosses the i_size boundary. This is to ensure later we
   4846	 * truncate down to the end of that extent and not to the i_size, as
   4847	 * otherwise we end up losing part of the prealloc extent after a log
   4848	 * replay and with an implicit hole if there is another prealloc extent
   4849	 * that starts at an offset beyond i_size.
   4850	 */
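	/*
	 * Editorial sketch of that boundary case (hypothetical layout):
	 *
	 *	                    i_size
	 *	                      |
	 *	[ prealloc extent A   |  ][ prealloc extent B ]
	 *	                          |
	 *	                          truncate_offset = end of A
	 *
	 * Truncating log items at i_size instead of at the end of extent A
	 * would drop A's tail and leave an implicit hole before B after log
	 * replay.
	 */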
   4851	ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
   4852	if (ret < 0)
   4853		goto out;
   4854
   4855	if (ret == 0) {
   4856		struct btrfs_file_extent_item *ei;
   4857
   4858		leaf = path->nodes[0];
   4859		slot = path->slots[0];
   4860		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
   4861
   4862		if (btrfs_file_extent_type(leaf, ei) ==
   4863		    BTRFS_FILE_EXTENT_PREALLOC) {
   4864			u64 extent_end;
   4865
   4866			btrfs_item_key_to_cpu(leaf, &key, slot);
   4867			extent_end = key.offset +
   4868				btrfs_file_extent_num_bytes(leaf, ei);
   4869
   4870			if (extent_end > i_size)
   4871				truncate_offset = extent_end;
   4872		}
   4873	} else {
   4874		ret = 0;
   4875	}
   4876
   4877	while (true) {
   4878		leaf = path->nodes[0];
   4879		slot = path->slots[0];
   4880
   4881		if (slot >= btrfs_header_nritems(leaf)) {
   4882			if (ins_nr > 0) {
   4883				ret = copy_items(trans, inode, dst_path, path,
   4884						 start_slot, ins_nr, 1, 0);
   4885				if (ret < 0)
   4886					goto out;
   4887				ins_nr = 0;
   4888			}
   4889			ret = btrfs_next_leaf(root, path);
   4890			if (ret < 0)
   4891				goto out;
   4892			if (ret > 0) {
   4893				ret = 0;
   4894				break;
   4895			}
   4896			continue;
   4897		}
   4898
   4899		btrfs_item_key_to_cpu(leaf, &key, slot);
   4900		if (key.objectid > ino)
   4901			break;
   4902		if (WARN_ON_ONCE(key.objectid < ino) ||
   4903		    key.type < BTRFS_EXTENT_DATA_KEY ||
   4904		    key.offset < i_size) {
   4905			path->slots[0]++;
   4906			continue;
   4907		}
   4908		if (!dropped_extents) {
   4909			/*
   4910			 * Avoid logging extent items logged in past fsync calls
   4911			 * and leading to duplicate keys in the log tree.
   4912			 */
   4913			ret = truncate_inode_items(trans, root->log_root, inode,
   4914						   truncate_offset,
   4915						   BTRFS_EXTENT_DATA_KEY);
   4916			if (ret)
   4917				goto out;
   4918			dropped_extents = true;
   4919		}
   4920		if (ins_nr == 0)
   4921			start_slot = slot;
   4922		ins_nr++;
   4923		path->slots[0]++;
   4924		if (!dst_path) {
   4925			dst_path = btrfs_alloc_path();
   4926			if (!dst_path) {
   4927				ret = -ENOMEM;
   4928				goto out;
   4929			}
   4930		}
   4931	}
   4932	if (ins_nr > 0)
   4933		ret = copy_items(trans, inode, dst_path, path,
   4934				 start_slot, ins_nr, 1, 0);
   4935out:
   4936	btrfs_release_path(path);
   4937	btrfs_free_path(dst_path);
   4938	return ret;
   4939}
   4940
   4941static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
   4942				     struct btrfs_inode *inode,
   4943				     struct btrfs_path *path,
   4944				     struct btrfs_log_ctx *ctx)
   4945{
   4946	struct btrfs_ordered_extent *ordered;
   4947	struct btrfs_ordered_extent *tmp;
   4948	struct extent_map *em, *n;
   4949	struct list_head extents;
   4950	struct extent_map_tree *tree = &inode->extent_tree;
   4951	int ret = 0;
   4952	int num = 0;
   4953
   4954	INIT_LIST_HEAD(&extents);
   4955
   4956	write_lock(&tree->lock);
   4957
   4958	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
   4959		list_del_init(&em->list);
   4960		/*
    4961		 * Just an arbitrary number. This can get really CPU intensive
    4962		 * once we start getting a lot of extents, and once we have a
    4963		 * bunch of extents we just want to commit, since that will be
    4964		 * faster.
   4965		 */
   4966		if (++num > 32768) {
   4967			list_del_init(&tree->modified_extents);
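			/*
			 * Any error returned from here, including this one,
			 * makes fsync fall back to a full transaction commit.
			 */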
   4968			ret = -EFBIG;
   4969			goto process;
   4970		}
   4971
   4972		if (em->generation < trans->transid)
   4973			continue;
   4974
   4975		/* We log prealloc extents beyond eof later. */
   4976		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
   4977		    em->start >= i_size_read(&inode->vfs_inode))
   4978			continue;
   4979
   4980		/* Need a ref to keep it from getting evicted from cache */
   4981		refcount_inc(&em->refs);
   4982		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
   4983		list_add_tail(&em->list, &extents);
   4984		num++;
   4985	}
   4986
   4987	list_sort(NULL, &extents, extent_cmp);
   4988process:
   4989	while (!list_empty(&extents)) {
   4990		em = list_entry(extents.next, struct extent_map, list);
   4991
   4992		list_del_init(&em->list);
   4993
   4994		/*
   4995		 * If we had an error we just need to delete everybody from our
   4996		 * private list.
   4997		 */
   4998		if (ret) {
   4999			clear_em_logging(tree, em);
   5000			free_extent_map(em);
   5001			continue;
   5002		}
   5003
   5004		write_unlock(&tree->lock);
   5005
   5006		ret = log_one_extent(trans, inode, em, path, ctx);
   5007		write_lock(&tree->lock);
   5008		clear_em_logging(tree, em);
   5009		free_extent_map(em);
   5010	}
   5011	WARN_ON(!list_empty(&extents));
   5012	write_unlock(&tree->lock);
   5013
   5014	if (!ret)
   5015		ret = btrfs_log_prealloc_extents(trans, inode, path);
   5016	if (ret)
   5017		return ret;
   5018
   5019	/*
   5020	 * We have logged all extents successfully, now make sure the commit of
   5021	 * the current transaction waits for the ordered extents to complete
   5022	 * before it commits and wipes out the log trees, otherwise we would
    5023 * lose data if an ordered extent completes after the transaction
   5024	 * commits and a power failure happens after the transaction commit.
   5025	 */
   5026	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
   5027		list_del_init(&ordered->log_list);
   5028		set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
   5029
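		/*
		 * Test, then re-test under the ordered tree lock: the ordered
		 * extent may complete concurrently, and we must only count it
		 * as pending while it is still incomplete.
		 */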
   5030		if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
   5031			spin_lock_irq(&inode->ordered_tree.lock);
   5032			if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
   5033				set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
   5034				atomic_inc(&trans->transaction->pending_ordered);
   5035			}
   5036			spin_unlock_irq(&inode->ordered_tree.lock);
   5037		}
   5038		btrfs_put_ordered_extent(ordered);
   5039	}
   5040
   5041	return 0;
   5042}
   5043
   5044static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
   5045			     struct btrfs_path *path, u64 *size_ret)
   5046{
   5047	struct btrfs_key key;
   5048	int ret;
   5049
   5050	key.objectid = btrfs_ino(inode);
   5051	key.type = BTRFS_INODE_ITEM_KEY;
   5052	key.offset = 0;
   5053
   5054	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
   5055	if (ret < 0) {
   5056		return ret;
   5057	} else if (ret > 0) {
   5058		*size_ret = 0;
   5059	} else {
   5060		struct btrfs_inode_item *item;
   5061
   5062		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
   5063				      struct btrfs_inode_item);
   5064		*size_ret = btrfs_inode_size(path->nodes[0], item);
   5065		/*
    5066		 * If the in-memory inode's i_size is smaller than the inode
    5067		 * size stored in the btree, return the inode's i_size, so
    5068		 * that we get a correct inode size after replaying the log
    5069		 * when, before a power failure, we had a shrinking truncate
    5070		 * followed by the addition of a new name (rename / new hard link).
   5071		 * Otherwise return the inode size from the btree, to avoid
   5072		 * data loss when replaying a log due to previously doing a
   5073		 * write that expands the inode's size and logging a new name
   5074		 * immediately after.
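		 *
		 * An illustrative sequence for the first case: truncate a file
		 * from 8K down to 4K, add a hard link to it, sync the log via
		 * an fsync of some other inode and crash - after replaying the
		 * log the file must be 4K, not 8K.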
   5075		 */
   5076		if (*size_ret > inode->vfs_inode.i_size)
   5077			*size_ret = inode->vfs_inode.i_size;
   5078	}
   5079
   5080	btrfs_release_path(path);
   5081	return 0;
   5082}
   5083
   5084/*
    5085 * At the moment we always log all xattrs. This is to figure out at log replay
    5086 * time which xattrs must have their deletion replayed. If an xattr is missing
    5087 * in the log tree and exists in the fs/subvol tree, we delete it. This is
    5088 * because if an xattr is deleted, the inode is fsynced, and then a power
    5089 * failure happens, causing the log to be replayed the next time the fs is
    5090 * mounted, we want the xattr to no longer exist (same behaviour as other
    5091 * filesystems with a journal: ext3/4, xfs, f2fs, etc).
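 *
 * An illustrative sequence:
 *
 *   setfattr -n user.foo -v bar /mnt/file
 *   sync
 *   setfattr -x user.foo /mnt/file
 *   xfs_io -c fsync /mnt/file
 *   <power fail>
 *
 * After the log is replayed on the next mount, user.foo must not reappear.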
   5092 */
   5093static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
   5094				struct btrfs_inode *inode,
   5095				struct btrfs_path *path,
   5096				struct btrfs_path *dst_path)
   5097{
   5098	struct btrfs_root *root = inode->root;
   5099	int ret;
   5100	struct btrfs_key key;
   5101	const u64 ino = btrfs_ino(inode);
   5102	int ins_nr = 0;
   5103	int start_slot = 0;
   5104	bool found_xattrs = false;
   5105
   5106	if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
   5107		return 0;
   5108
   5109	key.objectid = ino;
   5110	key.type = BTRFS_XATTR_ITEM_KEY;
   5111	key.offset = 0;
   5112
   5113	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   5114	if (ret < 0)
   5115		return ret;
   5116
   5117	while (true) {
   5118		int slot = path->slots[0];
   5119		struct extent_buffer *leaf = path->nodes[0];
   5120		int nritems = btrfs_header_nritems(leaf);
   5121
   5122		if (slot >= nritems) {
   5123			if (ins_nr > 0) {
   5124				ret = copy_items(trans, inode, dst_path, path,
   5125						 start_slot, ins_nr, 1, 0);
   5126				if (ret < 0)
   5127					return ret;
   5128				ins_nr = 0;
   5129			}
   5130			ret = btrfs_next_leaf(root, path);
   5131			if (ret < 0)
   5132				return ret;
   5133			else if (ret > 0)
   5134				break;
   5135			continue;
   5136		}
   5137
   5138		btrfs_item_key_to_cpu(leaf, &key, slot);
   5139		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
   5140			break;
   5141
   5142		if (ins_nr == 0)
   5143			start_slot = slot;
   5144		ins_nr++;
   5145		path->slots[0]++;
   5146		found_xattrs = true;
   5147		cond_resched();
   5148	}
   5149	if (ins_nr > 0) {
   5150		ret = copy_items(trans, inode, dst_path, path,
   5151				 start_slot, ins_nr, 1, 0);
   5152		if (ret < 0)
   5153			return ret;
   5154	}
   5155
   5156	if (!found_xattrs)
   5157		set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
   5158
   5159	return 0;
   5160}
   5161
   5162/*
   5163 * When using the NO_HOLES feature if we punched a hole that causes the
   5164 * deletion of entire leafs or all the extent items of the first leaf (the one
   5165 * that contains the inode item and references) we may end up not processing
   5166 * any extents, because there are no leafs with a generation matching the
   5167 * current transaction that have extent items for our inode. So we need to find
   5168 * if any holes exist and then log them. We also need to log holes after any
   5169 * truncate operation that changes the inode's size.
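 *
 * An illustrative sequence (with the NO_HOLES feature enabled):
 *
 *   xfs_io -f -c "pwrite 0 256K" -c "fsync" /mnt/foo
 *   xfs_io -c "fpunch 0 256K" -c "fsync" /mnt/foo
 *   <power fail>
 *
 * The punch removes all the extent items, so without this pass the second
 * fsync could log no trace of the hole.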
   5170 */
   5171static int btrfs_log_holes(struct btrfs_trans_handle *trans,
   5172			   struct btrfs_inode *inode,
   5173			   struct btrfs_path *path)
   5174{
   5175	struct btrfs_root *root = inode->root;
   5176	struct btrfs_fs_info *fs_info = root->fs_info;
   5177	struct btrfs_key key;
   5178	const u64 ino = btrfs_ino(inode);
   5179	const u64 i_size = i_size_read(&inode->vfs_inode);
   5180	u64 prev_extent_end = 0;
   5181	int ret;
   5182
   5183	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
   5184		return 0;
   5185
   5186	key.objectid = ino;
   5187	key.type = BTRFS_EXTENT_DATA_KEY;
   5188	key.offset = 0;
   5189
   5190	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   5191	if (ret < 0)
   5192		return ret;
   5193
   5194	while (true) {
   5195		struct extent_buffer *leaf = path->nodes[0];
   5196
   5197		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
   5198			ret = btrfs_next_leaf(root, path);
   5199			if (ret < 0)
   5200				return ret;
   5201			if (ret > 0) {
   5202				ret = 0;
   5203				break;
   5204			}
   5205			leaf = path->nodes[0];
   5206		}
   5207
   5208		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
   5209		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
   5210			break;
   5211
   5212		/* We have a hole, log it. */
   5213		if (prev_extent_end < key.offset) {
   5214			const u64 hole_len = key.offset - prev_extent_end;
   5215
   5216			/*
   5217			 * Release the path to avoid deadlocks with other code
   5218			 * paths that search the root while holding locks on
   5219			 * leafs from the log root.
   5220			 */
   5221			btrfs_release_path(path);
   5222			ret = btrfs_insert_file_extent(trans, root->log_root,
   5223						       ino, prev_extent_end, 0,
   5224						       0, hole_len, 0, hole_len,
   5225						       0, 0, 0);
   5226			if (ret < 0)
   5227				return ret;
   5228
   5229			/*
   5230			 * Search for the same key again in the root. Since it's
   5231			 * an extent item and we are holding the inode lock, the
    5232			 * key must still exist. If it doesn't, emit a warning and
    5233			 * return an error to fall back to a transaction commit.
   5234			 * commit.
   5235			 */
   5236			ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   5237			if (ret < 0)
   5238				return ret;
   5239			if (WARN_ON(ret > 0))
   5240				return -ENOENT;
   5241			leaf = path->nodes[0];
   5242		}
   5243
   5244		prev_extent_end = btrfs_file_extent_end(path);
   5245		path->slots[0]++;
   5246		cond_resched();
   5247	}
   5248
   5249	if (prev_extent_end < i_size) {
   5250		u64 hole_len;
   5251
   5252		btrfs_release_path(path);
   5253		hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
   5254		ret = btrfs_insert_file_extent(trans, root->log_root,
   5255					       ino, prev_extent_end, 0, 0,
   5256					       hole_len, 0, hole_len,
   5257					       0, 0, 0);
   5258		if (ret < 0)
   5259			return ret;
   5260	}
   5261
   5262	return 0;
   5263}
   5264
   5265/*
   5266 * When we are logging a new inode X, check if it doesn't have a reference that
   5267 * matches the reference from some other inode Y created in a past transaction
   5268 * and that was renamed in the current transaction. If we don't do this, then at
   5269 * log replay time we can lose inode Y (and all its files if it's a directory):
   5270 *
   5271 * mkdir /mnt/x
   5272 * echo "hello world" > /mnt/x/foobar
   5273 * sync
   5274 * mv /mnt/x /mnt/y
   5275 * mkdir /mnt/x                 # or touch /mnt/x
   5276 * xfs_io -c fsync /mnt/x
   5277 * <power fail>
   5278 * mount fs, trigger log replay
   5279 *
   5280 * After the log replay procedure, we would lose the first directory and all its
   5281 * files (file foobar).
   5282 * For the case where inode Y is not a directory we simply end up losing it:
   5283 *
   5284 * echo "123" > /mnt/foo
   5285 * sync
   5286 * mv /mnt/foo /mnt/bar
   5287 * echo "abc" > /mnt/foo
   5288 * xfs_io -c fsync /mnt/foo
   5289 * <power fail>
   5290 *
   5291 * We also need this for cases where a snapshot entry is replaced by some other
    5292 * entry (file or directory), otherwise we end up with an unreplayable log due to
   5293 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
   5294 * if it were a regular entry:
   5295 *
   5296 * mkdir /mnt/x
   5297 * btrfs subvolume snapshot /mnt /mnt/x/snap
   5298 * btrfs subvolume delete /mnt/x/snap
   5299 * rmdir /mnt/x
   5300 * mkdir /mnt/x
   5301 * fsync /mnt/x or fsync some new file inside it
   5302 * <power fail>
   5303 *
   5304 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
   5305 * the same transaction.
   5306 */
   5307static int btrfs_check_ref_name_override(struct extent_buffer *eb,
   5308					 const int slot,
   5309					 const struct btrfs_key *key,
   5310					 struct btrfs_inode *inode,
   5311					 u64 *other_ino, u64 *other_parent)
   5312{
   5313	int ret;
   5314	struct btrfs_path *search_path;
   5315	char *name = NULL;
   5316	u32 name_len = 0;
   5317	u32 item_size = btrfs_item_size(eb, slot);
   5318	u32 cur_offset = 0;
   5319	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
   5320
   5321	search_path = btrfs_alloc_path();
   5322	if (!search_path)
   5323		return -ENOMEM;
   5324	search_path->search_commit_root = 1;
   5325	search_path->skip_locking = 1;
   5326
   5327	while (cur_offset < item_size) {
   5328		u64 parent;
   5329		u32 this_name_len;
   5330		u32 this_len;
   5331		unsigned long name_ptr;
   5332		struct btrfs_dir_item *di;
   5333
   5334		if (key->type == BTRFS_INODE_REF_KEY) {
   5335			struct btrfs_inode_ref *iref;
   5336
   5337			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
   5338			parent = key->offset;
   5339			this_name_len = btrfs_inode_ref_name_len(eb, iref);
   5340			name_ptr = (unsigned long)(iref + 1);
   5341			this_len = sizeof(*iref) + this_name_len;
   5342		} else {
   5343			struct btrfs_inode_extref *extref;
   5344
   5345			extref = (struct btrfs_inode_extref *)(ptr +
   5346							       cur_offset);
   5347			parent = btrfs_inode_extref_parent(eb, extref);
   5348			this_name_len = btrfs_inode_extref_name_len(eb, extref);
   5349			name_ptr = (unsigned long)&extref->name;
   5350			this_len = sizeof(*extref) + this_name_len;
   5351		}
   5352
   5353		if (this_name_len > name_len) {
   5354			char *new_name;
   5355
   5356			new_name = krealloc(name, this_name_len, GFP_NOFS);
   5357			if (!new_name) {
   5358				ret = -ENOMEM;
   5359				goto out;
   5360			}
   5361			name_len = this_name_len;
   5362			name = new_name;
   5363		}
   5364
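		/*
		 * Look the name up in the subvolume's commit root: if it
		 * resolves to a different inode, that inode's entry conflicts
		 * with the one being logged.
		 */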
   5365		read_extent_buffer(eb, name, name_ptr, this_name_len);
   5366		di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
   5367				parent, name, this_name_len, 0);
   5368		if (di && !IS_ERR(di)) {
   5369			struct btrfs_key di_key;
   5370
   5371			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
   5372						  di, &di_key);
   5373			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
   5374				if (di_key.objectid != key->objectid) {
   5375					ret = 1;
   5376					*other_ino = di_key.objectid;
   5377					*other_parent = parent;
   5378				} else {
   5379					ret = 0;
   5380				}
   5381			} else {
   5382				ret = -EAGAIN;
   5383			}
   5384			goto out;
   5385		} else if (IS_ERR(di)) {
   5386			ret = PTR_ERR(di);
   5387			goto out;
   5388		}
   5389		btrfs_release_path(search_path);
   5390
   5391		cur_offset += this_len;
   5392	}
   5393	ret = 0;
   5394out:
   5395	btrfs_free_path(search_path);
   5396	kfree(name);
   5397	return ret;
   5398}
   5399
   5400struct btrfs_ino_list {
   5401	u64 ino;
   5402	u64 parent;
   5403	struct list_head list;
   5404};
   5405
   5406static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
   5407				  struct btrfs_root *root,
   5408				  struct btrfs_path *path,
   5409				  struct btrfs_log_ctx *ctx,
   5410				  u64 ino, u64 parent)
   5411{
   5412	struct btrfs_ino_list *ino_elem;
   5413	LIST_HEAD(inode_list);
   5414	int ret = 0;
   5415
   5416	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
   5417	if (!ino_elem)
   5418		return -ENOMEM;
   5419	ino_elem->ino = ino;
   5420	ino_elem->parent = parent;
   5421	list_add_tail(&ino_elem->list, &inode_list);
   5422
   5423	while (!list_empty(&inode_list)) {
   5424		struct btrfs_fs_info *fs_info = root->fs_info;
   5425		struct btrfs_key key;
   5426		struct inode *inode;
   5427
   5428		ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
   5429					    list);
   5430		ino = ino_elem->ino;
   5431		parent = ino_elem->parent;
   5432		list_del(&ino_elem->list);
   5433		kfree(ino_elem);
   5434		if (ret)
   5435			continue;
   5436
   5437		btrfs_release_path(path);
   5438
   5439		inode = btrfs_iget(fs_info->sb, ino, root);
   5440		/*
   5441		 * If the other inode that had a conflicting dir entry was
   5442		 * deleted in the current transaction, we need to log its parent
   5443		 * directory.
   5444		 */
   5445		if (IS_ERR(inode)) {
   5446			ret = PTR_ERR(inode);
   5447			if (ret == -ENOENT) {
   5448				inode = btrfs_iget(fs_info->sb, parent, root);
   5449				if (IS_ERR(inode)) {
   5450					ret = PTR_ERR(inode);
   5451				} else {
   5452					ret = btrfs_log_inode(trans,
   5453						      BTRFS_I(inode),
   5454						      LOG_OTHER_INODE_ALL,
   5455						      ctx);
   5456					btrfs_add_delayed_iput(inode);
   5457				}
   5458			}
   5459			continue;
   5460		}
   5461		/*
   5462		 * If the inode was already logged skip it - otherwise we can
   5463		 * hit an infinite loop. Example:
   5464		 *
   5465		 * From the commit root (previous transaction) we have the
   5466		 * following inodes:
   5467		 *
   5468		 * inode 257 a directory
   5469		 * inode 258 with references "zz" and "zz_link" on inode 257
   5470		 * inode 259 with reference "a" on inode 257
   5471		 *
   5472		 * And in the current (uncommitted) transaction we have:
   5473		 *
   5474		 * inode 257 a directory, unchanged
   5475		 * inode 258 with references "a" and "a2" on inode 257
   5476		 * inode 259 with reference "zz_link" on inode 257
   5477		 * inode 261 with reference "zz" on inode 257
   5478		 *
   5479		 * When logging inode 261 the following infinite loop could
   5480		 * happen if we don't skip already logged inodes:
   5481		 *
   5482		 * - we detect inode 258 as a conflicting inode, with inode 261
   5483		 *   on reference "zz", and log it;
   5484		 *
   5485		 * - we detect inode 259 as a conflicting inode, with inode 258
   5486		 *   on reference "a", and log it;
   5487		 *
   5488		 * - we detect inode 258 as a conflicting inode, with inode 259
   5489		 *   on reference "zz_link", and log it - again! After this we
   5490		 *   repeat the above steps forever.
   5491		 */
   5492		spin_lock(&BTRFS_I(inode)->lock);
   5493		/*
   5494		 * Check the inode's logged_trans only instead of
   5495		 * btrfs_inode_in_log(). This is because the last_log_commit of
   5496		 * the inode is not updated when we only log that it exists (see
   5497		 * btrfs_log_inode()).
   5498		 */
   5499		if (BTRFS_I(inode)->logged_trans == trans->transid) {
   5500			spin_unlock(&BTRFS_I(inode)->lock);
   5501			btrfs_add_delayed_iput(inode);
   5502			continue;
   5503		}
   5504		spin_unlock(&BTRFS_I(inode)->lock);
   5505		/*
   5506		 * We are safe logging the other inode without acquiring its
   5507		 * lock as long as we log with the LOG_INODE_EXISTS mode. We
   5508		 * are safe against concurrent renames of the other inode as
   5509		 * well because during a rename we pin the log and update the
   5510		 * log with the new name before we unpin it.
   5511		 */
   5512		ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx);
   5513		if (ret) {
   5514			btrfs_add_delayed_iput(inode);
   5515			continue;
   5516		}
   5517
   5518		key.objectid = ino;
   5519		key.type = BTRFS_INODE_REF_KEY;
   5520		key.offset = 0;
   5521		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   5522		if (ret < 0) {
   5523			btrfs_add_delayed_iput(inode);
   5524			continue;
   5525		}
   5526
   5527		while (true) {
   5528			struct extent_buffer *leaf = path->nodes[0];
   5529			int slot = path->slots[0];
   5530			u64 other_ino = 0;
   5531			u64 other_parent = 0;
   5532
   5533			if (slot >= btrfs_header_nritems(leaf)) {
   5534				ret = btrfs_next_leaf(root, path);
   5535				if (ret < 0) {
   5536					break;
   5537				} else if (ret > 0) {
   5538					ret = 0;
   5539					break;
   5540				}
   5541				continue;
   5542			}
   5543
   5544			btrfs_item_key_to_cpu(leaf, &key, slot);
   5545			if (key.objectid != ino ||
   5546			    (key.type != BTRFS_INODE_REF_KEY &&
   5547			     key.type != BTRFS_INODE_EXTREF_KEY)) {
   5548				ret = 0;
   5549				break;
   5550			}
   5551
   5552			ret = btrfs_check_ref_name_override(leaf, slot, &key,
   5553					BTRFS_I(inode), &other_ino,
   5554					&other_parent);
   5555			if (ret < 0)
   5556				break;
   5557			if (ret > 0) {
   5558				ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
   5559				if (!ino_elem) {
   5560					ret = -ENOMEM;
   5561					break;
   5562				}
   5563				ino_elem->ino = other_ino;
   5564				ino_elem->parent = other_parent;
   5565				list_add_tail(&ino_elem->list, &inode_list);
   5566				ret = 0;
   5567			}
   5568			path->slots[0]++;
   5569		}
   5570		btrfs_add_delayed_iput(inode);
   5571	}
   5572
   5573	return ret;
   5574}
   5575
   5576static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
   5577				   struct btrfs_inode *inode,
   5578				   struct btrfs_key *min_key,
   5579				   const struct btrfs_key *max_key,
   5580				   struct btrfs_path *path,
   5581				   struct btrfs_path *dst_path,
   5582				   const u64 logged_isize,
   5583				   const bool recursive_logging,
   5584				   const int inode_only,
   5585				   struct btrfs_log_ctx *ctx,
   5586				   bool *need_log_inode_item)
   5587{
   5588	const u64 i_size = i_size_read(&inode->vfs_inode);
   5589	struct btrfs_root *root = inode->root;
   5590	int ins_start_slot = 0;
   5591	int ins_nr = 0;
   5592	int ret;
   5593
   5594	while (1) {
   5595		ret = btrfs_search_forward(root, min_key, path, trans->transid);
   5596		if (ret < 0)
   5597			return ret;
   5598		if (ret > 0) {
   5599			ret = 0;
   5600			break;
   5601		}
   5602again:
   5603		/* Note, ins_nr might be > 0 here, cleanup outside the loop */
   5604		if (min_key->objectid != max_key->objectid)
   5605			break;
   5606		if (min_key->type > max_key->type)
   5607			break;
   5608
   5609		if (min_key->type == BTRFS_INODE_ITEM_KEY) {
   5610			*need_log_inode_item = false;
   5611		} else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
   5612			   min_key->offset >= i_size) {
   5613			/*
   5614			 * Extents at and beyond eof are logged with
   5615			 * btrfs_log_prealloc_extents().
   5616			 * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
   5617			 * and no keys greater than that, so bail out.
   5618			 */
   5619			break;
   5620		} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
   5621			    min_key->type == BTRFS_INODE_EXTREF_KEY) &&
   5622			   inode->generation == trans->transid &&
   5623			   !recursive_logging) {
   5624			u64 other_ino = 0;
   5625			u64 other_parent = 0;
   5626
   5627			ret = btrfs_check_ref_name_override(path->nodes[0],
   5628					path->slots[0], min_key, inode,
   5629					&other_ino, &other_parent);
   5630			if (ret < 0) {
   5631				return ret;
   5632			} else if (ret > 0 &&
   5633				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
   5634				if (ins_nr > 0) {
   5635					ins_nr++;
   5636				} else {
   5637					ins_nr = 1;
   5638					ins_start_slot = path->slots[0];
   5639				}
   5640				ret = copy_items(trans, inode, dst_path, path,
   5641						 ins_start_slot, ins_nr,
   5642						 inode_only, logged_isize);
   5643				if (ret < 0)
   5644					return ret;
   5645				ins_nr = 0;
   5646
   5647				ret = log_conflicting_inodes(trans, root, path,
   5648						ctx, other_ino, other_parent);
   5649				if (ret)
   5650					return ret;
   5651				btrfs_release_path(path);
   5652				goto next_key;
   5653			}
   5654		} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
   5655			/* Skip xattrs, logged later with btrfs_log_all_xattrs() */
   5656			if (ins_nr == 0)
   5657				goto next_slot;
   5658			ret = copy_items(trans, inode, dst_path, path,
   5659					 ins_start_slot,
   5660					 ins_nr, inode_only, logged_isize);
   5661			if (ret < 0)
   5662				return ret;
   5663			ins_nr = 0;
   5664			goto next_slot;
   5665		}
   5666
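		/*
		 * Batch runs of contiguous leaf slots so that copy_items() is
		 * called once per run instead of once per item.
		 */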
   5667		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
   5668			ins_nr++;
   5669			goto next_slot;
   5670		} else if (!ins_nr) {
   5671			ins_start_slot = path->slots[0];
   5672			ins_nr = 1;
   5673			goto next_slot;
   5674		}
   5675
   5676		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
   5677				 ins_nr, inode_only, logged_isize);
   5678		if (ret < 0)
   5679			return ret;
   5680		ins_nr = 1;
   5681		ins_start_slot = path->slots[0];
   5682next_slot:
   5683		path->slots[0]++;
   5684		if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
   5685			btrfs_item_key_to_cpu(path->nodes[0], min_key,
   5686					      path->slots[0]);
   5687			goto again;
   5688		}
   5689		if (ins_nr) {
   5690			ret = copy_items(trans, inode, dst_path, path,
   5691					 ins_start_slot, ins_nr, inode_only,
   5692					 logged_isize);
   5693			if (ret < 0)
   5694				return ret;
   5695			ins_nr = 0;
   5696		}
   5697		btrfs_release_path(path);
   5698next_key:
   5699		if (min_key->offset < (u64)-1) {
   5700			min_key->offset++;
   5701		} else if (min_key->type < max_key->type) {
   5702			min_key->type++;
   5703			min_key->offset = 0;
   5704		} else {
   5705			break;
   5706		}
   5707
   5708		/*
   5709		 * We may process many leaves full of items for our inode, so
   5710		 * avoid monopolizing a cpu for too long by rescheduling while
   5711		 * not holding locks on any tree.
   5712		 */
   5713		cond_resched();
   5714	}
   5715	if (ins_nr) {
   5716		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
   5717				 ins_nr, inode_only, logged_isize);
   5718		if (ret)
   5719			return ret;
   5720	}
   5721
   5722	if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
   5723		/*
   5724		 * Release the path because otherwise we might attempt to double
   5725		 * lock the same leaf with btrfs_log_prealloc_extents() below.
   5726		 */
   5727		btrfs_release_path(path);
   5728		ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
   5729	}
   5730
   5731	return ret;
   5732}
   5733
   5734/* log a single inode in the tree log.
   5735 * At least one parent directory for this inode must exist in the tree
   5736 * or be logged already.
   5737 *
   5738 * Any items from this inode changed by the current transaction are copied
   5739 * to the log tree.  An extra reference is taken on any extents in this
   5740 * file, allowing us to avoid a whole pile of corner cases around logging
   5741 * blocks that have been removed from the tree.
   5742 *
   5743 * See LOG_INODE_ALL and related defines for a description of what inode_only
   5744 * does.
   5745 *
   5746 * This handles both files and directories.
   5747 */
   5748static int btrfs_log_inode(struct btrfs_trans_handle *trans,
   5749			   struct btrfs_inode *inode,
   5750			   int inode_only,
   5751			   struct btrfs_log_ctx *ctx)
   5752{
   5753	struct btrfs_path *path;
   5754	struct btrfs_path *dst_path;
   5755	struct btrfs_key min_key;
   5756	struct btrfs_key max_key;
   5757	struct btrfs_root *log = inode->root->log_root;
   5758	int ret;
   5759	bool fast_search = false;
   5760	u64 ino = btrfs_ino(inode);
   5761	struct extent_map_tree *em_tree = &inode->extent_tree;
   5762	u64 logged_isize = 0;
   5763	bool need_log_inode_item = true;
   5764	bool xattrs_logged = false;
   5765	bool recursive_logging = false;
   5766	bool inode_item_dropped = true;
   5767	const bool orig_logged_before = ctx->logged_before;
   5768
   5769	path = btrfs_alloc_path();
   5770	if (!path)
   5771		return -ENOMEM;
   5772	dst_path = btrfs_alloc_path();
   5773	if (!dst_path) {
   5774		btrfs_free_path(path);
   5775		return -ENOMEM;
   5776	}
   5777
   5778	min_key.objectid = ino;
   5779	min_key.type = BTRFS_INODE_ITEM_KEY;
   5780	min_key.offset = 0;
   5781
   5782	max_key.objectid = ino;
    5783
   5785	/* today the code can only do partial logging of directories */
   5786	if (S_ISDIR(inode->vfs_inode.i_mode) ||
   5787	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
   5788		       &inode->runtime_flags) &&
   5789	     inode_only >= LOG_INODE_EXISTS))
   5790		max_key.type = BTRFS_XATTR_ITEM_KEY;
   5791	else
   5792		max_key.type = (u8)-1;
   5793	max_key.offset = (u64)-1;
   5794
   5795	/*
   5796	 * Only run delayed items if we are a directory. We want to make sure
   5797	 * all directory indexes hit the fs/subvolume tree so we can find them
   5798	 * and figure out which index ranges have to be logged.
   5799	 */
   5800	if (S_ISDIR(inode->vfs_inode.i_mode)) {
   5801		ret = btrfs_commit_inode_delayed_items(trans, inode);
   5802		if (ret)
   5803			goto out;
   5804	}
   5805
   5806	if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
   5807		recursive_logging = true;
   5808		if (inode_only == LOG_OTHER_INODE)
   5809			inode_only = LOG_INODE_EXISTS;
   5810		else
   5811			inode_only = LOG_INODE_ALL;
   5812		mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
   5813	} else {
   5814		mutex_lock(&inode->log_mutex);
   5815	}
   5816
   5817	/*
   5818	 * For symlinks, we must always log their content, which is stored in an
   5819	 * inline extent, otherwise we could end up with an empty symlink after
    5820	 * log replay, which is invalid on Linux (symlink(2) returns -ENOENT if
    5821	 * one attempts to create an empty symlink).
    5822	 * We don't need to worry about flushing delalloc, because we create
   5823	 * the inline extent when the symlink is created (we never have delalloc
   5824	 * for symlinks).
   5825	 */
   5826	if (S_ISLNK(inode->vfs_inode.i_mode))
   5827		inode_only = LOG_INODE_ALL;
   5828
   5829	/*
   5830	 * Before logging the inode item, cache the value returned by
    5831	 * inode_logged(), because further below we need to know if the inode
    5832	 * was previously logged in this transaction.
   5833	 */
   5834	ret = inode_logged(trans, inode, path);
   5835	if (ret < 0)
   5836		goto out_unlock;
   5837	ctx->logged_before = (ret == 1);
   5838	ret = 0;
   5839
   5840	/*
    5841	 * This is for cases where logging a directory could result in losing
    5842	 * a file after replaying the log. For example, if we move a file from a
    5843	 * directory A to a directory B, then fsync directory A, we have no way
    5844	 * to know the file was moved from A to B, so logging just A would
   5845	 * result in losing the file after a log replay.
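	 *
	 * An illustrative sequence:
	 *
	 *   mv /mnt/A/foo /mnt/B/foo
	 *   xfs_io -c fsync /mnt/A
	 *   <power fail>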
   5846	 */
   5847	if (S_ISDIR(inode->vfs_inode.i_mode) &&
   5848	    inode_only == LOG_INODE_ALL &&
   5849	    inode->last_unlink_trans >= trans->transid) {
   5850		btrfs_set_log_full_commit(trans);
   5851		ret = 1;
   5852		goto out_unlock;
   5853	}
   5854
   5855	/*
    5856	 * A brute force approach to making sure we get the most up-to-date
    5857	 * copies of everything.
   5858	 */
   5859	if (S_ISDIR(inode->vfs_inode.i_mode)) {
   5860		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
   5861
   5862		clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
   5863		if (inode_only == LOG_INODE_EXISTS)
   5864			max_key_type = BTRFS_XATTR_ITEM_KEY;
   5865		if (ctx->logged_before)
   5866			ret = drop_inode_items(trans, log, path, inode,
   5867					       max_key_type);
   5868	} else {
   5869		if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
   5870			/*
   5871			 * Make sure the new inode item we write to the log has
   5872			 * the same isize as the current one (if it exists).
   5873			 * This is necessary to prevent data loss after log
   5874			 * replay, and also to prevent doing a wrong expanding
   5875			 * truncate - for e.g. create file, write 4K into offset
   5876			 * 0, fsync, write 4K into offset 4096, add hard link,
   5877			 * fsync some other file (to sync log), power fail - if
   5878			 * we use the inode's current i_size, after log replay
   5879			 * we get a 8Kb file, with the last 4Kb extent as a hole
   5880			 * (zeroes), as if an expanding truncate happened,
   5881			 * instead of getting a file of 4Kb only.
   5882			 */
   5883			ret = logged_inode_size(log, inode, path, &logged_isize);
   5884			if (ret)
   5885				goto out_unlock;
   5886		}
   5887		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
   5888			     &inode->runtime_flags)) {
   5889			if (inode_only == LOG_INODE_EXISTS) {
   5890				max_key.type = BTRFS_XATTR_ITEM_KEY;
   5891				if (ctx->logged_before)
   5892					ret = drop_inode_items(trans, log, path,
   5893							       inode, max_key.type);
   5894			} else {
   5895				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
   5896					  &inode->runtime_flags);
   5897				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
   5898					  &inode->runtime_flags);
   5899				if (ctx->logged_before)
   5900					ret = truncate_inode_items(trans, log,
   5901								   inode, 0, 0);
   5902			}
   5903		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
   5904					      &inode->runtime_flags) ||
   5905			   inode_only == LOG_INODE_EXISTS) {
   5906			if (inode_only == LOG_INODE_ALL)
   5907				fast_search = true;
   5908			max_key.type = BTRFS_XATTR_ITEM_KEY;
   5909			if (ctx->logged_before)
   5910				ret = drop_inode_items(trans, log, path, inode,
   5911						       max_key.type);
   5912		} else {
   5913			if (inode_only == LOG_INODE_ALL)
   5914				fast_search = true;
   5915			inode_item_dropped = false;
   5916			goto log_extents;
   5917		}
   5918
   5919	}
   5920	if (ret)
   5921		goto out_unlock;
   5922
   5923	ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
   5924				      path, dst_path, logged_isize,
   5925				      recursive_logging, inode_only, ctx,
   5926				      &need_log_inode_item);
   5927	if (ret)
   5928		goto out_unlock;
   5929
   5930	btrfs_release_path(path);
   5931	btrfs_release_path(dst_path);
   5932	ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
   5933	if (ret)
   5934		goto out_unlock;
   5935	xattrs_logged = true;
   5936	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
   5937		btrfs_release_path(path);
   5938		btrfs_release_path(dst_path);
   5939		ret = btrfs_log_holes(trans, inode, path);
   5940		if (ret)
   5941			goto out_unlock;
   5942	}
   5943log_extents:
   5944	btrfs_release_path(path);
   5945	btrfs_release_path(dst_path);
   5946	if (need_log_inode_item) {
   5947		ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
   5948		if (ret)
   5949			goto out_unlock;
   5950		/*
   5951		 * If we are doing a fast fsync and the inode was logged before
   5952		 * in this transaction, we don't need to log the xattrs because
   5953		 * they were logged before. If xattrs were added, changed or
   5954		 * deleted since the last time we logged the inode, then we have
   5955		 * already logged them because the inode had the runtime flag
   5956		 * BTRFS_INODE_COPY_EVERYTHING set.
   5957		 */
   5958		if (!xattrs_logged && inode->logged_trans < trans->transid) {
   5959			ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
   5960			if (ret)
   5961				goto out_unlock;
   5962			btrfs_release_path(path);
   5963		}
   5964	}
   5965	if (fast_search) {
   5966		ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
   5967		if (ret)
   5968			goto out_unlock;
   5969	} else if (inode_only == LOG_INODE_ALL) {
   5970		struct extent_map *em, *n;
   5971
   5972		write_lock(&em_tree->lock);
   5973		list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
   5974			list_del_init(&em->list);
   5975		write_unlock(&em_tree->lock);
   5976	}
   5977
   5978	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
   5979		ret = log_directory_changes(trans, inode, path, dst_path, ctx);
   5980		if (ret)
   5981			goto out_unlock;
   5982	}
   5983
   5984	spin_lock(&inode->lock);
   5985	inode->logged_trans = trans->transid;
   5986	/*
   5987	 * Don't update last_log_commit if we logged that an inode exists.
   5988	 * We do this for three reasons:
   5989	 *
   5990	 * 1) We might have had buffered writes to this inode that were
   5991	 *    flushed and had their ordered extents completed in this
   5992	 *    transaction, but we did not previously log the inode with
   5993	 *    LOG_INODE_ALL. Later the inode was evicted and after that
   5994	 *    it was loaded again and this LOG_INODE_EXISTS log operation
   5995	 *    happened. We must make sure that if an explicit fsync against
   5996	 *    the inode is performed later, it logs the new extents, an
   5997	 *    updated inode item, etc, and syncs the log. The same logic
   5998	 *    applies to direct IO writes instead of buffered writes.
   5999	 *
   6000	 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
   6001	 *    is logged with an i_size of 0 or whatever value was logged
   6002	 *    before. If later the i_size of the inode is increased by a
   6003	 *    truncate operation, the log is synced through an fsync of
   6004	 *    some other inode and then finally an explicit fsync against
   6005	 *    this inode is made, we must make sure this fsync logs the
   6006	 *    inode with the new i_size, the hole between old i_size and
   6007	 *    the new i_size, and syncs the log.
   6008	 *
   6009	 * 3) If we are logging that an ancestor inode exists as part of
   6010	 *    logging a new name from a link or rename operation, don't update
   6011	 *    its last_log_commit - otherwise if an explicit fsync is made
   6012	 *    against an ancestor, the fsync considers the inode in the log
   6013	 *    and doesn't sync the log, resulting in the ancestor missing after
   6014	 *    a power failure unless the log was synced as part of an fsync
   6015	 *    against any other unrelated inode.
   6016	 */
   6017	if (inode_only != LOG_INODE_EXISTS)
   6018		inode->last_log_commit = inode->last_sub_trans;
   6019	spin_unlock(&inode->lock);
   6020
   6021	/*
   6022	 * Reset the last_reflink_trans so that the next fsync does not need to
   6023	 * go through the slower path when logging extents and their checksums.
   6024	 */
   6025	if (inode_only == LOG_INODE_ALL)
   6026		inode->last_reflink_trans = 0;
   6027
   6028out_unlock:
   6029	mutex_unlock(&inode->log_mutex);
   6030out:
   6031	btrfs_free_path(path);
   6032	btrfs_free_path(dst_path);
   6033
   6034	if (recursive_logging)
   6035		ctx->logged_before = orig_logged_before;
   6036
   6037	return ret;
   6038}
   6039
   6040/*
   6041 * Check if we need to log an inode. This is used in contexts where while
   6042 * logging an inode we need to log another inode (either that it exists or in
    6043 * full mode). This is used instead of btrfs_inode_in_log() because the latter
   6044 * requires the inode to be in the log and have the log transaction committed,
   6045 * while here we do not care if the log transaction was already committed - our
   6046 * caller will commit the log later - and we want to avoid logging an inode
   6047 * multiple times when multiple tasks have joined the same log transaction.
   6048 */
   6049static bool need_log_inode(struct btrfs_trans_handle *trans,
   6050			   struct btrfs_inode *inode)
   6051{
   6052	/*
    6053	 * If a directory was not modified (no dentries added or removed), we
    6054	 * can and should avoid logging it.
   6055	 */
   6056	if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
   6057		return false;
   6058
   6059	/*
   6060	 * If this inode does not have new/updated/deleted xattrs since the last
   6061	 * time it was logged and is flagged as logged in the current transaction,
   6062	 * we can skip logging it. As for new/deleted names, those are updated in
   6063	 * the log by link/unlink/rename operations.
   6064	 * In case the inode was logged and then evicted and reloaded, its
   6065	 * logged_trans will be 0, in which case we have to fully log it since
   6066	 * logged_trans is a transient field, not persisted.
   6067	 */
   6068	if (inode->logged_trans == trans->transid &&
   6069	    !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
   6070		return false;
   6071
   6072	return true;
   6073}
   6074
   6075struct btrfs_dir_list {
   6076	u64 ino;
   6077	struct list_head list;
   6078};
   6079
   6080/*
   6081 * Log the inodes of the new dentries of a directory. See log_dir_items() for
    6082 * details about why it is needed.
   6083 * This is a recursive operation - if an existing dentry corresponds to a
   6084 * directory, that directory's new entries are logged too (same behaviour as
   6085 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
    6086 * the dentries point to, we do not lock their i_mutex, otherwise lockdep
   6087 * complains about the following circular lock dependency / possible deadlock:
   6088 *
   6089 *        CPU0                                        CPU1
   6090 *        ----                                        ----
   6091 * lock(&type->i_mutex_dir_key#3/2);
   6092 *                                            lock(sb_internal#2);
   6093 *                                            lock(&type->i_mutex_dir_key#3/2);
   6094 * lock(&sb->s_type->i_mutex_key#14);
   6095 *
   6096 * Where sb_internal is the lock (a counter that works as a lock) acquired by
   6097 * sb_start_intwrite() in btrfs_start_transaction().
   6098 * Not locking i_mutex of the inodes is still safe because:
   6099 *
   6100 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
   6101 *    that while logging the inode new references (names) are added or removed
   6102 *    from the inode, leaving the logged inode item with a link count that does
   6103 *    not match the number of logged inode reference items. This is fine because
   6104 *    at log replay time we compute the real number of links and correct the
   6105 *    link count in the inode item (see replay_one_buffer() and
   6106 *    link_to_fixup_dir());
   6107 *
   6108 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
   6109 *    while logging the inode's items new index items (key type
   6110 *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
   6111 *    has a size that doesn't match the sum of the lengths of all the logged
   6112 *    names - this is ok, not a problem, because at log replay time we set the
    6113 *    names - this is fine, because at log replay time we set the
   6114 *    do_overwrite_item()).
   6115 */
   6116static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
   6117				struct btrfs_root *root,
   6118				struct btrfs_inode *start_inode,
   6119				struct btrfs_log_ctx *ctx)
   6120{
   6121	struct btrfs_fs_info *fs_info = root->fs_info;
   6122	struct btrfs_path *path;
   6123	LIST_HEAD(dir_list);
   6124	struct btrfs_dir_list *dir_elem;
   6125	int ret = 0;
   6126
   6127	/*
   6128	 * If we are logging a new name, as part of a link or rename operation,
    6129	 * don't bother logging new dentries, as we just want to log the inode's
    6130	 * names and make sure any new parent directories exist in the log.
   6131	 */
   6132	if (ctx->logging_new_name)
   6133		return 0;
   6134
   6135	path = btrfs_alloc_path();
   6136	if (!path)
   6137		return -ENOMEM;
   6138
   6139	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
   6140	if (!dir_elem) {
   6141		btrfs_free_path(path);
   6142		return -ENOMEM;
   6143	}
   6144	dir_elem->ino = btrfs_ino(start_inode);
   6145	list_add_tail(&dir_elem->list, &dir_list);
   6146
   6147	while (!list_empty(&dir_list)) {
   6148		struct extent_buffer *leaf;
   6149		struct btrfs_key min_key;
   6150		int nritems;
   6151		int i;
   6152
   6153		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
   6154					    list);
   6155		if (ret)
   6156			goto next_dir_inode;
   6157
   6158		min_key.objectid = dir_elem->ino;
   6159		min_key.type = BTRFS_DIR_INDEX_KEY;
   6160		min_key.offset = 0;
   6161again:
   6162		btrfs_release_path(path);
   6163		ret = btrfs_search_forward(root, &min_key, path, trans->transid);
   6164		if (ret < 0) {
   6165			goto next_dir_inode;
   6166		} else if (ret > 0) {
   6167			ret = 0;
   6168			goto next_dir_inode;
   6169		}
   6170
   6171		leaf = path->nodes[0];
   6172		nritems = btrfs_header_nritems(leaf);
   6173		for (i = path->slots[0]; i < nritems; i++) {
   6174			struct btrfs_dir_item *di;
   6175			struct btrfs_key di_key;
   6176			struct inode *di_inode;
   6177			struct btrfs_dir_list *new_dir_elem;
   6178			int log_mode = LOG_INODE_EXISTS;
   6179			int type;
   6180
   6181			btrfs_item_key_to_cpu(leaf, &min_key, i);
   6182			if (min_key.objectid != dir_elem->ino ||
   6183			    min_key.type != BTRFS_DIR_INDEX_KEY)
   6184				goto next_dir_inode;
   6185
   6186			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
   6187			type = btrfs_dir_type(leaf, di);
   6188			if (btrfs_dir_transid(leaf, di) < trans->transid)
   6189				continue;
   6190			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
   6191			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
   6192				continue;
   6193
   6194			btrfs_release_path(path);
   6195			di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
   6196			if (IS_ERR(di_inode)) {
   6197				ret = PTR_ERR(di_inode);
   6198				goto next_dir_inode;
   6199			}
   6200
   6201			if (!need_log_inode(trans, BTRFS_I(di_inode))) {
   6202				btrfs_add_delayed_iput(di_inode);
   6203				break;
   6204			}
   6205
   6206			ctx->log_new_dentries = false;
   6207			if (type == BTRFS_FT_DIR)
   6208				log_mode = LOG_INODE_ALL;
   6209			ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
   6210					      log_mode, ctx);
   6211			btrfs_add_delayed_iput(di_inode);
   6212			if (ret)
   6213				goto next_dir_inode;
   6214			if (ctx->log_new_dentries) {
   6215				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
   6216						       GFP_NOFS);
   6217				if (!new_dir_elem) {
   6218					ret = -ENOMEM;
   6219					goto next_dir_inode;
   6220				}
   6221				new_dir_elem->ino = di_key.objectid;
   6222				list_add_tail(&new_dir_elem->list, &dir_list);
   6223			}
   6224			break;
   6225		}
   6226		if (min_key.offset < (u64)-1) {
   6227			min_key.offset++;
   6228			goto again;
   6229		}
   6230next_dir_inode:
   6231		list_del(&dir_elem->list);
   6232		kfree(dir_elem);
   6233	}
   6234
   6235	btrfs_free_path(path);
   6236	return ret;
   6237}
   6238
   6239static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
   6240				 struct btrfs_inode *inode,
   6241				 struct btrfs_log_ctx *ctx)
   6242{
   6243	struct btrfs_fs_info *fs_info = trans->fs_info;
   6244	int ret;
   6245	struct btrfs_path *path;
   6246	struct btrfs_key key;
   6247	struct btrfs_root *root = inode->root;
   6248	const u64 ino = btrfs_ino(inode);
   6249
   6250	path = btrfs_alloc_path();
   6251	if (!path)
   6252		return -ENOMEM;
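	/*
	 * The commit root is immutable, so it can be searched without taking
	 * locks, and it shows the inode's parent references as of the last
	 * committed transaction.
	 */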
   6253	path->skip_locking = 1;
   6254	path->search_commit_root = 1;
   6255
   6256	key.objectid = ino;
   6257	key.type = BTRFS_INODE_REF_KEY;
   6258	key.offset = 0;
   6259	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   6260	if (ret < 0)
   6261		goto out;
   6262
   6263	while (true) {
   6264		struct extent_buffer *leaf = path->nodes[0];
   6265		int slot = path->slots[0];
   6266		u32 cur_offset = 0;
   6267		u32 item_size;
   6268		unsigned long ptr;
   6269
   6270		if (slot >= btrfs_header_nritems(leaf)) {
   6271			ret = btrfs_next_leaf(root, path);
   6272			if (ret < 0)
   6273				goto out;
   6274			else if (ret > 0)
   6275				break;
   6276			continue;
   6277		}
   6278
   6279		btrfs_item_key_to_cpu(leaf, &key, slot);
   6280		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
   6281		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
   6282			break;
   6283
   6284		item_size = btrfs_item_size(leaf, slot);
   6285		ptr = btrfs_item_ptr_offset(leaf, slot);
   6286		while (cur_offset < item_size) {
   6287			struct btrfs_key inode_key;
   6288			struct inode *dir_inode;
   6289
   6290			inode_key.type = BTRFS_INODE_ITEM_KEY;
   6291			inode_key.offset = 0;
   6292
   6293			if (key.type == BTRFS_INODE_EXTREF_KEY) {
   6294				struct btrfs_inode_extref *extref;
   6295
   6296				extref = (struct btrfs_inode_extref *)
   6297					(ptr + cur_offset);
   6298				inode_key.objectid = btrfs_inode_extref_parent(
   6299					leaf, extref);
   6300				cur_offset += sizeof(*extref);
   6301				cur_offset += btrfs_inode_extref_name_len(leaf,
   6302					extref);
   6303			} else {
   6304				inode_key.objectid = key.offset;
   6305				cur_offset = item_size;
   6306			}
   6307
   6308			dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
   6309					       root);
   6310			/*
   6311			 * If the parent inode was deleted, return an error to
   6312			 * fallback to a transaction commit. This is to prevent
   6313			 * getting an inode that was moved from one parent A to
   6314			 * a parent B, got its former parent A deleted and then
   6315			 * it got fsync'ed, from existing at both parents after
   6316			 * a log replay (and the old parent still existing).
   6317			 * Example:
   6318			 *
   6319			 * mkdir /mnt/A
   6320			 * mkdir /mnt/B
   6321			 * touch /mnt/B/bar
   6322			 * sync
   6323			 * mv /mnt/B/bar /mnt/A/bar
   6324			 * mv -T /mnt/A /mnt/B
   6325			 * fsync /mnt/B/bar
   6326			 * <power fail>
   6327			 *
   6328			 * If we ignore the old parent B which got deleted,
   6329			 * after a log replay we would have file bar linked
   6330			 * at both parents and the old parent B would still
   6331			 * exist.
   6332			 */
   6333			if (IS_ERR(dir_inode)) {
   6334				ret = PTR_ERR(dir_inode);
   6335				goto out;
   6336			}
   6337
   6338			if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
   6339				btrfs_add_delayed_iput(dir_inode);
   6340				continue;
   6341			}
   6342
   6343			ctx->log_new_dentries = false;
   6344			ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
   6345					      LOG_INODE_ALL, ctx);
   6346			if (!ret && ctx->log_new_dentries)
   6347				ret = log_new_dir_dentries(trans, root,
   6348						   BTRFS_I(dir_inode), ctx);
   6349			btrfs_add_delayed_iput(dir_inode);
   6350			if (ret)
   6351				goto out;
   6352		}
   6353		path->slots[0]++;
   6354	}
   6355	ret = 0;
   6356out:
   6357	btrfs_free_path(path);
   6358	return ret;
   6359}
   6360
   6361static int log_new_ancestors(struct btrfs_trans_handle *trans,
   6362			     struct btrfs_root *root,
   6363			     struct btrfs_path *path,
   6364			     struct btrfs_log_ctx *ctx)
   6365{
   6366	struct btrfs_key found_key;
   6367
   6368	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
   6369
   6370	while (true) {
   6371		struct btrfs_fs_info *fs_info = root->fs_info;
   6372		struct extent_buffer *leaf = path->nodes[0];
   6373		int slot = path->slots[0];
   6374		struct btrfs_key search_key;
   6375		struct inode *inode;
   6376		u64 ino;
   6377		int ret = 0;
   6378
   6379		btrfs_release_path(path);
   6380
   6381		ino = found_key.offset;
   6382
   6383		search_key.objectid = found_key.offset;
   6384		search_key.type = BTRFS_INODE_ITEM_KEY;
   6385		search_key.offset = 0;
   6386		inode = btrfs_iget(fs_info->sb, ino, root);
   6387		if (IS_ERR(inode))
   6388			return PTR_ERR(inode);
   6389
   6390		if (BTRFS_I(inode)->generation >= trans->transid &&
   6391		    need_log_inode(trans, BTRFS_I(inode)))
   6392			ret = btrfs_log_inode(trans, BTRFS_I(inode),
   6393					      LOG_INODE_EXISTS, ctx);
   6394		btrfs_add_delayed_iput(inode);
   6395		if (ret)
   6396			return ret;
   6397
   6398		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
   6399			break;
   6400
   6401		search_key.type = BTRFS_INODE_REF_KEY;
   6402		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
   6403		if (ret < 0)
   6404			return ret;
   6405
   6406		leaf = path->nodes[0];
   6407		slot = path->slots[0];
   6408		if (slot >= btrfs_header_nritems(leaf)) {
   6409			ret = btrfs_next_leaf(root, path);
   6410			if (ret < 0)
   6411				return ret;
   6412			else if (ret > 0)
   6413				return -ENOENT;
   6414			leaf = path->nodes[0];
   6415			slot = path->slots[0];
   6416		}
   6417
   6418		btrfs_item_key_to_cpu(leaf, &found_key, slot);
   6419		if (found_key.objectid != search_key.objectid ||
   6420		    found_key.type != BTRFS_INODE_REF_KEY)
   6421			return -ENOENT;
   6422	}
   6423	return 0;
   6424}
   6425
   6426static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
   6427				  struct btrfs_inode *inode,
   6428				  struct dentry *parent,
   6429				  struct btrfs_log_ctx *ctx)
   6430{
   6431	struct btrfs_root *root = inode->root;
   6432	struct dentry *old_parent = NULL;
   6433	struct super_block *sb = inode->vfs_inode.i_sb;
   6434	int ret = 0;
   6435
   6436	while (true) {
   6437		if (!parent || d_really_is_negative(parent) ||
   6438		    sb != parent->d_sb)
   6439			break;
   6440
   6441		inode = BTRFS_I(d_inode(parent));
   6442		if (root != inode->root)
   6443			break;
   6444
   6445		if (inode->generation >= trans->transid &&
   6446		    need_log_inode(trans, inode)) {
   6447			ret = btrfs_log_inode(trans, inode,
   6448					      LOG_INODE_EXISTS, ctx);
   6449			if (ret)
   6450				break;
   6451		}
   6452		if (IS_ROOT(parent))
   6453			break;
   6454
   6455		parent = dget_parent(parent);
   6456		dput(old_parent);
   6457		old_parent = parent;
   6458	}
   6459	dput(old_parent);
   6460
   6461	return ret;
   6462}
   6463
   6464static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
   6465				 struct btrfs_inode *inode,
   6466				 struct dentry *parent,
   6467				 struct btrfs_log_ctx *ctx)
   6468{
   6469	struct btrfs_root *root = inode->root;
   6470	const u64 ino = btrfs_ino(inode);
   6471	struct btrfs_path *path;
   6472	struct btrfs_key search_key;
   6473	int ret;
   6474
   6475	/*
   6476	 * For a single hard link case, go through a fast path that does not
   6477	 * need to iterate the fs/subvolume tree.
   6478	 */
   6479	if (inode->vfs_inode.i_nlink < 2)
   6480		return log_new_ancestors_fast(trans, inode, parent, ctx);
   6481
   6482	path = btrfs_alloc_path();
   6483	if (!path)
   6484		return -ENOMEM;
   6485
   6486	search_key.objectid = ino;
   6487	search_key.type = BTRFS_INODE_REF_KEY;
   6488	search_key.offset = 0;
   6489again:
   6490	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
   6491	if (ret < 0)
   6492		goto out;
   6493	if (ret == 0)
   6494		path->slots[0]++;
   6495
   6496	while (true) {
   6497		struct extent_buffer *leaf = path->nodes[0];
   6498		int slot = path->slots[0];
   6499		struct btrfs_key found_key;
   6500
   6501		if (slot >= btrfs_header_nritems(leaf)) {
   6502			ret = btrfs_next_leaf(root, path);
   6503			if (ret < 0)
   6504				goto out;
   6505			else if (ret > 0)
   6506				break;
   6507			continue;
   6508		}
   6509
   6510		btrfs_item_key_to_cpu(leaf, &found_key, slot);
   6511		if (found_key.objectid != ino ||
   6512		    found_key.type > BTRFS_INODE_EXTREF_KEY)
   6513			break;
   6514
   6515		/*
    6516		 * Don't deal with extended references because they are rare
    6517		 * cases and too complex to handle (we would need to keep
    6518		 * track of which subitem we are processing for each item in
    6519		 * this loop, etc). So just return some error to fall back to
   6520		 * a transaction commit.
   6521		 */
   6522		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
   6523			ret = -EMLINK;
   6524			goto out;
   6525		}
   6526
   6527		/*
   6528		 * Logging ancestors needs to do more searches on the fs/subvol
   6529		 * tree, so it releases the path as needed to avoid deadlocks.
   6530		 * Keep track of the last inode ref key and resume from that key
   6531		 * after logging all new ancestors for the current hard link.
   6532		 */
   6533		memcpy(&search_key, &found_key, sizeof(search_key));
   6534
   6535		ret = log_new_ancestors(trans, root, path, ctx);
   6536		if (ret)
   6537			goto out;
   6538		btrfs_release_path(path);
   6539		goto again;
   6540	}
   6541	ret = 0;
   6542out:
   6543	btrfs_free_path(path);
   6544	return ret;
   6545}
   6546
   6547/*
   6548 * helper function around btrfs_log_inode to make sure newly created
    6549 * parent directories also end up in the log.  A minimal logging pass
    6550 * (inode item and backrefs only) is done for any parent directory that
    6551 * is newer than the last committed transaction
   6552 */
   6553static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
   6554				  struct btrfs_inode *inode,
   6555				  struct dentry *parent,
   6556				  int inode_only,
   6557				  struct btrfs_log_ctx *ctx)
   6558{
   6559	struct btrfs_root *root = inode->root;
   6560	struct btrfs_fs_info *fs_info = root->fs_info;
   6561	int ret = 0;
   6562	bool log_dentries = false;
   6563
   6564	if (btrfs_test_opt(fs_info, NOTREELOG)) {
   6565		ret = 1;
   6566		goto end_no_trans;
   6567	}
   6568
   6569	if (btrfs_root_refs(&root->root_item) == 0) {
   6570		ret = 1;
   6571		goto end_no_trans;
   6572	}
   6573
   6574	/*
   6575	 * Skip already logged inodes or inodes corresponding to tmpfiles
   6576	 * (since logging them is pointless, a link count of 0 means they
   6577	 * will never be accessible).
   6578	 */
   6579	if ((btrfs_inode_in_log(inode, trans->transid) &&
   6580	     list_empty(&ctx->ordered_extents)) ||
   6581	    inode->vfs_inode.i_nlink == 0) {
   6582		ret = BTRFS_NO_LOG_SYNC;
   6583		goto end_no_trans;
   6584	}
   6585
   6586	ret = start_log_trans(trans, root, ctx);
   6587	if (ret)
   6588		goto end_no_trans;
   6589
   6590	ret = btrfs_log_inode(trans, inode, inode_only, ctx);
   6591	if (ret)
   6592		goto end_trans;
   6593
   6594	/*
    6595	 * for a regular file, if its inode is already on disk, we don't
   6596	 * have to worry about the parents at all.  This is because
   6597	 * we can use the last_unlink_trans field to record renames
   6598	 * and other fun in this file.
   6599	 */
   6600	if (S_ISREG(inode->vfs_inode.i_mode) &&
   6601	    inode->generation < trans->transid &&
   6602	    inode->last_unlink_trans < trans->transid) {
   6603		ret = 0;
   6604		goto end_trans;
   6605	}
   6606
   6607	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
   6608		log_dentries = true;
   6609
   6610	/*
   6611	 * On unlink we must make sure all our current and old parent directory
   6612	 * inodes are fully logged. This is to prevent leaving dangling
   6613	 * directory index entries in directories that were our parents but are
    6614	 * not anymore. Not doing this results in the old parent directory being
   6615	 * impossible to delete after log replay (rmdir will always fail with
   6616	 * error -ENOTEMPTY).
   6617	 *
   6618	 * Example 1:
   6619	 *
   6620	 * mkdir testdir
   6621	 * touch testdir/foo
   6622	 * ln testdir/foo testdir/bar
   6623	 * sync
   6624	 * unlink testdir/bar
   6625	 * xfs_io -c fsync testdir/foo
   6626	 * <power failure>
   6627	 * mount fs, triggers log replay
   6628	 *
   6629	 * If we don't log the parent directory (testdir), after log replay the
   6630	 * directory still has an entry pointing to the file inode using the bar
   6631	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
   6632	 * the file inode has a link count of 1.
   6633	 *
   6634	 * Example 2:
   6635	 *
   6636	 * mkdir testdir
   6637	 * touch foo
   6638	 * ln foo testdir/foo2
   6639	 * ln foo testdir/foo3
   6640	 * sync
   6641	 * unlink testdir/foo3
   6642	 * xfs_io -c fsync foo
   6643	 * <power failure>
   6644	 * mount fs, triggers log replay
   6645	 *
    6646	 * Similar to the first example, after log replay the parent directory
    6647	 * testdir still has an entry pointing to the file inode with name foo3
   6648	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
   6649	 * and has a link count of 2.
   6650	 */
   6651	if (inode->last_unlink_trans >= trans->transid) {
   6652		ret = btrfs_log_all_parents(trans, inode, ctx);
   6653		if (ret)
   6654			goto end_trans;
   6655	}
   6656
   6657	ret = log_all_new_ancestors(trans, inode, parent, ctx);
   6658	if (ret)
   6659		goto end_trans;
   6660
   6661	if (log_dentries)
   6662		ret = log_new_dir_dentries(trans, root, inode, ctx);
   6663	else
   6664		ret = 0;
   6665end_trans:
   6666	if (ret < 0) {
   6667		btrfs_set_log_full_commit(trans);
   6668		ret = 1;
   6669	}
   6670
   6671	if (ret)
   6672		btrfs_remove_log_ctx(root, ctx);
   6673	btrfs_end_log_trans(root);
   6674end_no_trans:
   6675	return ret;
   6676}
   6677
   6678/*
    6679 * it is not safe to log a dentry if the chunk root has added new
   6680 * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
   6681 * If this returns 1, you must commit the transaction to safely get your
   6682 * data on disk.
   6683 */
   6684int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
   6685			  struct dentry *dentry,
   6686			  struct btrfs_log_ctx *ctx)
   6687{
   6688	struct dentry *parent = dget_parent(dentry);
   6689	int ret;
   6690
   6691	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
   6692				     LOG_INODE_ALL, ctx);
   6693	dput(parent);
   6694
   6695	return ret;
   6696}
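
        /*
         * The main caller is the fsync path (btrfs_sync_file()), which,
         * broadly, syncs the log tree when this returns 0 and falls back to
         * a full transaction commit when this returns 1.
         */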
   6697
   6698/*
    6699 * should be called during mount to replay any log trees
   6700 * from the FS
   6701 */
   6702int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
   6703{
   6704	int ret;
   6705	struct btrfs_path *path;
   6706	struct btrfs_trans_handle *trans;
   6707	struct btrfs_key key;
   6708	struct btrfs_key found_key;
   6709	struct btrfs_root *log;
   6710	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
   6711	struct walk_control wc = {
   6712		.process_func = process_one_buffer,
   6713		.stage = LOG_WALK_PIN_ONLY,
   6714	};
   6715
   6716	path = btrfs_alloc_path();
   6717	if (!path)
   6718		return -ENOMEM;
   6719
   6720	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
   6721
   6722	trans = btrfs_start_transaction(fs_info->tree_root, 0);
   6723	if (IS_ERR(trans)) {
   6724		ret = PTR_ERR(trans);
   6725		goto error;
   6726	}
   6727
   6728	wc.trans = trans;
   6729	wc.pin = 1;
   6730
   6731	ret = walk_log_tree(trans, log_root_tree, &wc);
   6732	if (ret) {
   6733		btrfs_abort_transaction(trans, ret);
   6734		goto error;
   6735	}
   6736
   6737again:
   6738	key.objectid = BTRFS_TREE_LOG_OBJECTID;
   6739	key.offset = (u64)-1;
   6740	key.type = BTRFS_ROOT_ITEM_KEY;
   6741
   6742	while (1) {
   6743		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
   6744
   6745		if (ret < 0) {
   6746			btrfs_abort_transaction(trans, ret);
   6747			goto error;
   6748		}
   6749		if (ret > 0) {
   6750			if (path->slots[0] == 0)
   6751				break;
   6752			path->slots[0]--;
   6753		}
   6754		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
   6755				      path->slots[0]);
   6756		btrfs_release_path(path);
   6757		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
   6758			break;
   6759
   6760		log = btrfs_read_tree_root(log_root_tree, &found_key);
   6761		if (IS_ERR(log)) {
   6762			ret = PTR_ERR(log);
   6763			btrfs_abort_transaction(trans, ret);
   6764			goto error;
   6765		}
   6766
   6767		wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
   6768						   true);
   6769		if (IS_ERR(wc.replay_dest)) {
   6770			ret = PTR_ERR(wc.replay_dest);
   6771
   6772			/*
   6773			 * We didn't find the subvol, likely because it was
   6774			 * deleted.  This is ok, simply skip this log and go to
   6775			 * the next one.
   6776			 *
   6777			 * We need to exclude the root because we can't have
   6778			 * other log replays overwriting this log as we'll read
   6779			 * it back in a few more times.  This will keep our
   6780			 * block from being modified, and we'll just bail for
   6781			 * each subsequent pass.
   6782			 */
   6783			if (ret == -ENOENT)
   6784				ret = btrfs_pin_extent_for_log_replay(trans,
   6785							log->node->start,
   6786							log->node->len);
   6787			btrfs_put_root(log);
   6788
   6789			if (!ret)
   6790				goto next;
   6791			btrfs_abort_transaction(trans, ret);
   6792			goto error;
   6793		}
   6794
   6795		wc.replay_dest->log_root = log;
   6796		ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
   6797		if (ret)
   6798			/* The loop needs to continue due to the root refs */
   6799			btrfs_abort_transaction(trans, ret);
   6800		else
   6801			ret = walk_log_tree(trans, log, &wc);
   6802
   6803		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
   6804			ret = fixup_inode_link_counts(trans, wc.replay_dest,
   6805						      path);
   6806			if (ret)
   6807				btrfs_abort_transaction(trans, ret);
   6808		}
   6809
   6810		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
   6811			struct btrfs_root *root = wc.replay_dest;
   6812
   6813			btrfs_release_path(path);
   6814
   6815			/*
   6816			 * We have just replayed everything, and the highest
   6817			 * objectid of fs roots probably has changed in case
    6818			 * some inode items got replayed.
    6819			 *
    6820			 * root->objectid_mutex is not acquired as log replay
    6821			 * can only happen during mount.
   6822			 */
   6823			ret = btrfs_init_root_free_objectid(root);
   6824			if (ret)
   6825				btrfs_abort_transaction(trans, ret);
   6826		}
   6827
   6828		wc.replay_dest->log_root = NULL;
   6829		btrfs_put_root(wc.replay_dest);
   6830		btrfs_put_root(log);
   6831
   6832		if (ret)
   6833			goto error;
   6834next:
   6835		if (found_key.offset == 0)
   6836			break;
   6837		key.offset = found_key.offset - 1;
   6838	}
   6839	btrfs_release_path(path);
   6840
   6841	/* step one is to pin it all, step two is to replay just inodes */
   6842	if (wc.pin) {
   6843		wc.pin = 0;
   6844		wc.process_func = replay_one_buffer;
   6845		wc.stage = LOG_WALK_REPLAY_INODES;
   6846		goto again;
   6847	}
   6848	/* step three is to replay everything */
   6849	if (wc.stage < LOG_WALK_REPLAY_ALL) {
   6850		wc.stage++;
   6851		goto again;
   6852	}
   6853
   6854	btrfs_free_path(path);
   6855
   6856	/* step 4: commit the transaction, which also unpins the blocks */
   6857	ret = btrfs_commit_transaction(trans);
   6858	if (ret)
   6859		return ret;
   6860
   6861	log_root_tree->log_root = NULL;
   6862	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
   6863	btrfs_put_root(log_root_tree);
   6864
   6865	return 0;
   6866error:
   6867	if (wc.trans)
   6868		btrfs_end_transaction(wc.trans);
   6869	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
   6870	btrfs_free_path(path);
   6871	return ret;
   6872}
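
        /*
         * To summarize, recovery walks every log tree found under
         * BTRFS_TREE_LOG_OBJECTID in the log root tree (iterating subvolume
         * ids in descending key order) once per stage: first
         * LOG_WALK_PIN_ONLY to pin all log blocks, then
         * LOG_WALK_REPLAY_INODES to recreate inode items, then each
         * remaining stage up to LOG_WALK_REPLAY_ALL, during which link
         * counts are also fixed up; the final transaction commit unpins
         * the blocks.
         */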
   6873
   6874/*
   6875 * there are some corner cases where we want to force a full
   6876 * commit instead of allowing a directory to be logged.
   6877 *
    6878 * They revolve around files that were unlinked from the directory, and
    6879 * this function updates the parent directory so that a full commit is
    6880 * properly done if the directory is fsync'd later, after the unlinks.
   6881 *
   6882 * Must be called before the unlink operations (updates to the subvolume tree,
   6883 * inodes, etc) are done.
   6884 */
   6885void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
   6886			     struct btrfs_inode *dir, struct btrfs_inode *inode,
   6887			     int for_rename)
   6888{
   6889	/*
   6890	 * when we're logging a file, if it hasn't been renamed
   6891	 * or unlinked, and its inode is fully committed on disk,
   6892	 * we don't have to worry about walking up the directory chain
   6893	 * to log its parents.
   6894	 *
   6895	 * So, we use the last_unlink_trans field to put this transid
   6896	 * into the file.  When the file is logged we check it and
   6897	 * don't log the parents if the file is fully on disk.
   6898	 */
   6899	mutex_lock(&inode->log_mutex);
   6900	inode->last_unlink_trans = trans->transid;
   6901	mutex_unlock(&inode->log_mutex);
   6902
   6903	/*
    6904	 * if this directory was already logged, any new
    6905	 * names for this file/dir will get recorded
   6906	 */
   6907	if (dir->logged_trans == trans->transid)
   6908		return;
   6909
   6910	/*
   6911	 * if the inode we're about to unlink was logged,
   6912	 * the log will be properly updated for any new names
   6913	 */
   6914	if (inode->logged_trans == trans->transid)
   6915		return;
   6916
   6917	/*
   6918	 * when renaming files across directories, if the directory
    6919	 * we're unlinking from gets fsync'd later on, there's
   6920	 * no way to find the destination directory later and fsync it
   6921	 * properly.  So, we have to be conservative and force commits
   6922	 * so the new name gets discovered.
   6923	 */
   6924	if (for_rename)
   6925		goto record;
   6926
   6927	/* we can safely do the unlink without any special recording */
   6928	return;
   6929
   6930record:
   6931	mutex_lock(&dir->log_mutex);
   6932	dir->last_unlink_trans = trans->transid;
   6933	mutex_unlock(&dir->log_mutex);
   6934}
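
        /*
         * For illustration, the rename case above corresponds to a sequence
         * like (a sketch):
         *
         * mkdir dir1 dir2
         * touch dir1/foo
         * sync
         * mv dir1/foo dir2/foo
         * xfs_io -c fsync dir1
         * <power failure>
         *
         * With dir1->last_unlink_trans set to the current transid, the fsync
         * of dir1 is forced to be conservative (a full transaction commit),
         * so the new name under dir2 cannot be lost.
         */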
   6935
   6936/*
   6937 * Make sure that if someone attempts to fsync the parent directory of a deleted
   6938 * snapshot, it ends up triggering a transaction commit. This is to guarantee
   6939 * that after replaying the log tree of the parent directory's root we will not
   6940 * see the snapshot anymore and at log replay time we will not see any log tree
   6941 * corresponding to the deleted snapshot's root, which could lead to replaying
   6942 * it after replaying the log tree of the parent directory (which would replay
   6943 * the snapshot delete operation).
   6944 *
   6945 * Must be called before the actual snapshot destroy operation (updates to the
   6946 * parent root and tree of tree roots trees, etc) are done.
   6947 */
   6948void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
   6949				   struct btrfs_inode *dir)
   6950{
   6951	mutex_lock(&dir->log_mutex);
   6952	dir->last_unlink_trans = trans->transid;
   6953	mutex_unlock(&dir->log_mutex);
   6954}
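
        /*
         * For illustration (a sketch):
         *
         * btrfs subvolume create dir/snap
         * sync
         * btrfs subvolume delete dir/snap
         * xfs_io -c fsync dir
         * <power failure>
         *
         * Recording the transid in dir->last_unlink_trans forces the fsync
         * of dir into a transaction commit, so log replay can never
         * resurrect the deleted snapshot's directory entry.
         */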
   6955
   6956/**
    6957 * btrfs_log_new_name - update the log after adding a new name for an inode
   6958 *
   6959 * @trans:              Transaction handle.
   6960 * @old_dentry:         The dentry associated with the old name and the old
   6961 *                      parent directory.
   6962 * @old_dir:            The inode of the previous parent directory for the case
   6963 *                      of a rename. For a link operation, it must be NULL.
   6964 * @old_dir_index:      The index number associated with the old name, meaningful
   6965 *                      only for rename operations (when @old_dir is not NULL).
   6966 *                      Ignored for link operations.
   6967 * @parent:             The dentry associated with the directory under which the
   6968 *                      new name is located.
   6969 *
   6970 * Call this after adding a new name for an inode, as a result of a link or
   6971 * rename operation, and it will properly update the log to reflect the new name.
   6972 */
   6973void btrfs_log_new_name(struct btrfs_trans_handle *trans,
   6974			struct dentry *old_dentry, struct btrfs_inode *old_dir,
   6975			u64 old_dir_index, struct dentry *parent)
   6976{
   6977	struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
   6978	struct btrfs_root *root = inode->root;
   6979	struct btrfs_log_ctx ctx;
   6980	bool log_pinned = false;
   6981	int ret;
   6982
   6983	/*
   6984	 * this will force the logging code to walk the dentry chain
   6985	 * up for the file
   6986	 */
   6987	if (!S_ISDIR(inode->vfs_inode.i_mode))
   6988		inode->last_unlink_trans = trans->transid;
   6989
   6990	/*
    6991	 * if this inode hasn't been logged and the directory we're renaming
    6992	 * it from hasn't been logged, we don't need to log it
   6993	 */
   6994	ret = inode_logged(trans, inode, NULL);
   6995	if (ret < 0) {
   6996		goto out;
   6997	} else if (ret == 0) {
   6998		if (!old_dir)
   6999			return;
   7000		/*
   7001		 * If the inode was not logged and we are doing a rename (old_dir is not
   7002		 * NULL), check if old_dir was logged - if it was not we can return and
   7003		 * do nothing.
   7004		 */
   7005		ret = inode_logged(trans, old_dir, NULL);
   7006		if (ret < 0)
   7007			goto out;
   7008		else if (ret == 0)
   7009			return;
   7010	}
   7011	ret = 0;
   7012
   7013	/*
   7014	 * If we are doing a rename (old_dir is not NULL) from a directory that
   7015	 * was previously logged, make sure that on log replay we get the old
   7016	 * dir entry deleted. This is needed because we will also log the new
   7017	 * name of the renamed inode, so we need to make sure that after log
   7018	 * replay we don't end up with both the new and old dir entries existing.
   7019	 */
   7020	if (old_dir && old_dir->logged_trans == trans->transid) {
   7021		struct btrfs_root *log = old_dir->root->log_root;
   7022		struct btrfs_path *path;
   7023
   7024		ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
   7025
   7026		/*
   7027		 * We have two inodes to update in the log, the old directory and
   7028		 * the inode that got renamed, so we must pin the log to prevent
   7029		 * anyone from syncing the log until we have updated both inodes
   7030		 * in the log.
   7031		 */
   7032		log_pinned = true;
   7033		btrfs_pin_log_trans(root);
   7034
   7035		path = btrfs_alloc_path();
   7036		if (!path) {
   7037			ret = -ENOMEM;
   7038			goto out;
   7039		}
   7040
   7041		/*
    7042		 * Another concurrent task might be logging the old directory,
    7043		 * as that can be triggered when logging another inode that had
    7044		 * or still has a dentry in the old directory. We lock the old
    7045		 * directory's log_mutex to ensure the deletion of the old
    7046		 * name is persisted, because during directory logging we
    7047		 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
    7048		 * the old name's dir index item may still be in the delayed
    7049		 * items, so it could be missed by an in-progress directory logging.
   7050		 */
   7051		mutex_lock(&old_dir->log_mutex);
   7052		ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
   7053					old_dentry->d_name.name,
   7054					old_dentry->d_name.len, old_dir_index);
   7055		if (ret > 0) {
   7056			/*
   7057			 * The dentry does not exist in the log, so record its
   7058			 * deletion.
   7059			 */
   7060			btrfs_release_path(path);
   7061			ret = insert_dir_log_key(trans, log, path,
   7062						 btrfs_ino(old_dir),
   7063						 old_dir_index, old_dir_index);
   7064		}
   7065		mutex_unlock(&old_dir->log_mutex);
   7066
   7067		btrfs_free_path(path);
   7068		if (ret < 0)
   7069			goto out;
   7070	}
   7071
   7072	btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
   7073	ctx.logging_new_name = true;
   7074	/*
   7075	 * We don't care about the return value. If we fail to log the new name
    7076	 * then we know the next attempt to sync the log will fall back to a full
   7077	 * transaction commit (due to a call to btrfs_set_log_full_commit()), so
   7078	 * we don't need to worry about getting a log committed that has an
   7079	 * inconsistent state after a rename operation.
   7080	 */
   7081	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
   7082out:
   7083	/*
   7084	 * If an error happened mark the log for a full commit because it's not
   7085	 * consistent and up to date or we couldn't find out if one of the
   7086	 * inodes was logged before in this transaction. Do it before unpinning
   7087	 * the log, to avoid any races with someone else trying to commit it.
   7088	 */
   7089	if (ret < 0)
   7090		btrfs_set_log_full_commit(trans);
   7091	if (log_pinned)
   7092		btrfs_end_log_trans(root);
   7093}
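
        /*
         * Callers (e.g. the rename and link paths in inode.c) invoke this
         * after inserting the new name in the subvolume tree, while the
         * transaction is still running, so a later log sync either sees a
         * consistent log for both the old and new names or is forced into
         * a full transaction commit.
         */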
   7094