cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

send.c (198254B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 2012 Alexander Block.  All rights reserved.
      4 */
      5
      6#include <linux/bsearch.h>
      7#include <linux/fs.h>
      8#include <linux/file.h>
      9#include <linux/sort.h>
     10#include <linux/mount.h>
     11#include <linux/xattr.h>
     12#include <linux/posix_acl_xattr.h>
     13#include <linux/vmalloc.h>
     14#include <linux/string.h>
     15#include <linux/compat.h>
     16#include <linux/crc32c.h>
     17
     18#include "send.h"
     19#include "backref.h"
     20#include "locking.h"
     21#include "disk-io.h"
     22#include "btrfs_inode.h"
     23#include "transaction.h"
     24#include "compression.h"
     25#include "xattr.h"
     26#include "print-tree.h"
     27
     28/*
     29 * Maximum number of references an extent can have in order for us to attempt to
     30 * issue clone operations instead of write operations. This currently exists to
     31 * avoid hitting limitations of the backreference walking code (taking a lot of
     32 * time and using too much memory for extents with large number of references).
     33 */
     34#define SEND_MAX_EXTENT_REFS	64
     35
     36/*
     37 * A fs_path is a helper to dynamically build path names with unknown size.
     38 * It reallocates the internal buffer on demand.
     39 * It allows fast adding of path elements on the right side (normal path) and
     40 * fast adding to the left side (reversed path). A reversed path can also be
     41 * unreversed if needed.
     42 */
     43struct fs_path {
     44	union {
     45		struct {
     46			char *start;
     47			char *end;
     48
     49			char *buf;
     50			unsigned short buf_len:15;
     51			unsigned short reversed:1;
     52			char inline_buf[];
     53		};
     54		/*
     55		 * Average path length does not exceed 200 bytes, we'll have
     56		 * better packing in the slab and higher chance to satisfy
     57		 * a allocation later during send.
     58		 */
     59		char pad[256];
     60	};
     61};
     62#define FS_PATH_INLINE_SIZE \
     63	(sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
     64
     65
     66/* reused for each extent */
     67struct clone_root {
     68	struct btrfs_root *root;
     69	u64 ino;
     70	u64 offset;
     71
     72	u64 found_refs;
     73};
     74
     75#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
     76#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
     77
     78struct send_ctx {
     79	struct file *send_filp;
     80	loff_t send_off;
     81	char *send_buf;
     82	u32 send_size;
     83	u32 send_max_size;
     84	u64 total_send_size;
     85	u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
     86	u64 flags;	/* 'flags' member of btrfs_ioctl_send_args is u64 */
     87	/* Protocol version compatibility requested */
     88	u32 proto;
     89
     90	struct btrfs_root *send_root;
     91	struct btrfs_root *parent_root;
     92	struct clone_root *clone_roots;
     93	int clone_roots_cnt;
     94
     95	/* current state of the compare_tree call */
     96	struct btrfs_path *left_path;
     97	struct btrfs_path *right_path;
     98	struct btrfs_key *cmp_key;
     99
    100	/*
    101	 * Keep track of the generation of the last transaction that was used
    102	 * for relocating a block group. This is periodically checked in order
    103	 * to detect if a relocation happened since the last check, so that we
    104	 * don't operate on stale extent buffers for nodes (level >= 1) or on
    105	 * stale disk_bytenr values of file extent items.
    106	 */
    107	u64 last_reloc_trans;
    108
    109	/*
    110	 * infos of the currently processed inode. In case of deleted inodes,
    111	 * these are the values from the deleted inode.
    112	 */
    113	u64 cur_ino;
    114	u64 cur_inode_gen;
    115	int cur_inode_new;
    116	int cur_inode_new_gen;
    117	int cur_inode_deleted;
    118	u64 cur_inode_size;
    119	u64 cur_inode_mode;
    120	u64 cur_inode_rdev;
    121	u64 cur_inode_last_extent;
    122	u64 cur_inode_next_write_offset;
    123	bool ignore_cur_inode;
    124
    125	u64 send_progress;
    126
    127	struct list_head new_refs;
    128	struct list_head deleted_refs;
    129
    130	struct xarray name_cache;
    131	struct list_head name_cache_list;
    132	int name_cache_size;
    133
    134	/*
    135	 * The inode we are currently processing. It's not NULL only when we
    136	 * need to issue write commands for data extents from this inode.
    137	 */
    138	struct inode *cur_inode;
    139	struct file_ra_state ra;
    140	u64 page_cache_clear_start;
    141	bool clean_page_cache;
    142
    143	/*
    144	 * We process inodes by their increasing order, so if before an
    145	 * incremental send we reverse the parent/child relationship of
    146	 * directories such that a directory with a lower inode number was
    147	 * the parent of a directory with a higher inode number, and the one
    148	 * becoming the new parent got renamed too, we can't rename/move the
    149	 * directory with lower inode number when we finish processing it - we
    150	 * must process the directory with higher inode number first, then
    151	 * rename/move it and then rename/move the directory with lower inode
    152	 * number. Example follows.
    153	 *
    154	 * Tree state when the first send was performed:
    155	 *
    156	 * .
    157	 * |-- a                   (ino 257)
    158	 *     |-- b               (ino 258)
    159	 *         |
    160	 *         |
    161	 *         |-- c           (ino 259)
    162	 *         |   |-- d       (ino 260)
    163	 *         |
    164	 *         |-- c2          (ino 261)
    165	 *
    166	 * Tree state when the second (incremental) send is performed:
    167	 *
    168	 * .
    169	 * |-- a                   (ino 257)
    170	 *     |-- b               (ino 258)
    171	 *         |-- c2          (ino 261)
    172	 *             |-- d2      (ino 260)
    173	 *                 |-- cc  (ino 259)
    174	 *
    175	 * The sequence of steps that lead to the second state was:
    176	 *
    177	 * mv /a/b/c/d /a/b/c2/d2
    178	 * mv /a/b/c /a/b/c2/d2/cc
    179	 *
    180	 * "c" has lower inode number, but we can't move it (2nd mv operation)
    181	 * before we move "d", which has higher inode number.
    182	 *
    183	 * So we just memorize which move/rename operations must be performed
    184	 * later when their respective parent is processed and moved/renamed.
    185	 */
    186
    187	/* Indexed by parent directory inode number. */
    188	struct rb_root pending_dir_moves;
    189
    190	/*
    191	 * Reverse index, indexed by the inode number of a directory that
    192	 * is waiting for the move/rename of its immediate parent before its
    193	 * own move/rename can be performed.
    194	 */
    195	struct rb_root waiting_dir_moves;
    196
    197	/*
    198	 * A directory that is going to be rm'ed might have a child directory
    199	 * which is in the pending directory moves index above. In this case,
    200	 * the directory can only be removed after the move/rename of its child
    201	 * is performed. Example:
    202	 *
    203	 * Parent snapshot:
    204	 *
    205	 * .                        (ino 256)
    206	 * |-- a/                   (ino 257)
    207	 *     |-- b/               (ino 258)
    208	 *         |-- c/           (ino 259)
    209	 *         |   |-- x/       (ino 260)
    210	 *         |
    211	 *         |-- y/           (ino 261)
    212	 *
    213	 * Send snapshot:
    214	 *
    215	 * .                        (ino 256)
    216	 * |-- a/                   (ino 257)
    217	 *     |-- b/               (ino 258)
    218	 *         |-- YY/          (ino 261)
    219	 *              |-- x/      (ino 260)
    220	 *
    221	 * Sequence of steps that lead to the send snapshot:
    222	 * rm -f /a/b/c/foo.txt
    223	 * mv /a/b/y /a/b/YY
    224	 * mv /a/b/c/x /a/b/YY
    225	 * rmdir /a/b/c
    226	 *
    227	 * When the child is processed, its move/rename is delayed until its
    228	 * parent is processed (as explained above), but all other operations
    229	 * like update utimes, chown, chgrp, etc, are performed and the paths
    230	 * that it uses for those operations must use the orphanized name of
    231	 * its parent (the directory we're going to rm later), so we need to
    232	 * memorize that name.
    233	 *
    234	 * Indexed by the inode number of the directory to be deleted.
    235	 */
    236	struct rb_root orphan_dirs;
    237};
    238
    239struct pending_dir_move {
    240	struct rb_node node;
    241	struct list_head list;
    242	u64 parent_ino;
    243	u64 ino;
    244	u64 gen;
    245	struct list_head update_refs;
    246};
    247
    248struct waiting_dir_move {
    249	struct rb_node node;
    250	u64 ino;
    251	/*
    252	 * There might be some directory that could not be removed because it
    253	 * was waiting for this directory inode to be moved first. Therefore
    254	 * after this directory is moved, we can try to rmdir the ino rmdir_ino.
    255	 */
    256	u64 rmdir_ino;
    257	u64 rmdir_gen;
    258	bool orphanized;
    259};
    260
    261struct orphan_dir_info {
    262	struct rb_node node;
    263	u64 ino;
    264	u64 gen;
    265	u64 last_dir_index_offset;
    266};
    267
    268struct name_cache_entry {
    269	struct list_head list;
    270	/*
    271	 * On 32bit kernels, xarray has only 32bit indices, but we need to
    272	 * handle 64bit inums. We use the lower 32bit of the 64bit inum to store
    273	 * it in the tree. If more than one inum would fall into the same entry,
    274	 * we use inum_aliases to store the additional entries. inum_aliases is
    275	 * also used to store entries with the same inum but different generations.
    276	 */
    277	struct list_head inum_aliases;
    278	u64 ino;
    279	u64 gen;
    280	u64 parent_ino;
    281	u64 parent_gen;
    282	int ret;
    283	int need_later_update;
    284	int name_len;
    285	char name[];
    286};
    287
    288#define ADVANCE							1
    289#define ADVANCE_ONLY_NEXT					-1
    290
    291enum btrfs_compare_tree_result {
    292	BTRFS_COMPARE_TREE_NEW,
    293	BTRFS_COMPARE_TREE_DELETED,
    294	BTRFS_COMPARE_TREE_CHANGED,
    295	BTRFS_COMPARE_TREE_SAME,
    296};
    297
    298__cold
    299static void inconsistent_snapshot_error(struct send_ctx *sctx,
    300					enum btrfs_compare_tree_result result,
    301					const char *what)
    302{
    303	const char *result_string;
    304
    305	switch (result) {
    306	case BTRFS_COMPARE_TREE_NEW:
    307		result_string = "new";
    308		break;
    309	case BTRFS_COMPARE_TREE_DELETED:
    310		result_string = "deleted";
    311		break;
    312	case BTRFS_COMPARE_TREE_CHANGED:
    313		result_string = "updated";
    314		break;
    315	case BTRFS_COMPARE_TREE_SAME:
    316		ASSERT(0);
    317		result_string = "unchanged";
    318		break;
    319	default:
    320		ASSERT(0);
    321		result_string = "unexpected";
    322	}
    323
    324	btrfs_err(sctx->send_root->fs_info,
    325		  "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
    326		  result_string, what, sctx->cmp_key->objectid,
    327		  sctx->send_root->root_key.objectid,
    328		  (sctx->parent_root ?
    329		   sctx->parent_root->root_key.objectid : 0));
    330}
    331
    332__maybe_unused
    333static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
    334{
    335	switch (sctx->proto) {
    336	case 1:	 return cmd < __BTRFS_SEND_C_MAX_V1;
    337	case 2:	 return cmd < __BTRFS_SEND_C_MAX_V2;
    338	default: return false;
    339	}
    340}
    341
    342static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
    343
    344static struct waiting_dir_move *
    345get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
    346
    347static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen);
    348
    349static int need_send_hole(struct send_ctx *sctx)
    350{
    351	return (sctx->parent_root && !sctx->cur_inode_new &&
    352		!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
    353		S_ISREG(sctx->cur_inode_mode));
    354}
    355
    356static void fs_path_reset(struct fs_path *p)
    357{
    358	if (p->reversed) {
    359		p->start = p->buf + p->buf_len - 1;
    360		p->end = p->start;
    361		*p->start = 0;
    362	} else {
    363		p->start = p->buf;
    364		p->end = p->start;
    365		*p->start = 0;
    366	}
    367}
    368
    369static struct fs_path *fs_path_alloc(void)
    370{
    371	struct fs_path *p;
    372
    373	p = kmalloc(sizeof(*p), GFP_KERNEL);
    374	if (!p)
    375		return NULL;
    376	p->reversed = 0;
    377	p->buf = p->inline_buf;
    378	p->buf_len = FS_PATH_INLINE_SIZE;
    379	fs_path_reset(p);
    380	return p;
    381}
    382
    383static struct fs_path *fs_path_alloc_reversed(void)
    384{
    385	struct fs_path *p;
    386
    387	p = fs_path_alloc();
    388	if (!p)
    389		return NULL;
    390	p->reversed = 1;
    391	fs_path_reset(p);
    392	return p;
    393}
    394
    395static void fs_path_free(struct fs_path *p)
    396{
    397	if (!p)
    398		return;
    399	if (p->buf != p->inline_buf)
    400		kfree(p->buf);
    401	kfree(p);
    402}
    403
    404static int fs_path_len(struct fs_path *p)
    405{
    406	return p->end - p->start;
    407}
    408
    409static int fs_path_ensure_buf(struct fs_path *p, int len)
    410{
    411	char *tmp_buf;
    412	int path_len;
    413	int old_buf_len;
    414
    415	len++;
    416
    417	if (p->buf_len >= len)
    418		return 0;
    419
    420	if (len > PATH_MAX) {
    421		WARN_ON(1);
    422		return -ENOMEM;
    423	}
    424
    425	path_len = p->end - p->start;
    426	old_buf_len = p->buf_len;
    427
    428	/*
    429	 * First time the inline_buf does not suffice
    430	 */
    431	if (p->buf == p->inline_buf) {
    432		tmp_buf = kmalloc(len, GFP_KERNEL);
    433		if (tmp_buf)
    434			memcpy(tmp_buf, p->buf, old_buf_len);
    435	} else {
    436		tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
    437	}
    438	if (!tmp_buf)
    439		return -ENOMEM;
    440	p->buf = tmp_buf;
    441	/*
    442	 * The real size of the buffer is bigger, this will let the fast path
    443	 * happen most of the time
    444	 */
    445	p->buf_len = ksize(p->buf);
    446
    447	if (p->reversed) {
    448		tmp_buf = p->buf + old_buf_len - path_len - 1;
    449		p->end = p->buf + p->buf_len - 1;
    450		p->start = p->end - path_len;
    451		memmove(p->start, tmp_buf, path_len + 1);
    452	} else {
    453		p->start = p->buf;
    454		p->end = p->start + path_len;
    455	}
    456	return 0;
    457}
    458
    459static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
    460				   char **prepared)
    461{
    462	int ret;
    463	int new_len;
    464
    465	new_len = p->end - p->start + name_len;
    466	if (p->start != p->end)
    467		new_len++;
    468	ret = fs_path_ensure_buf(p, new_len);
    469	if (ret < 0)
    470		goto out;
    471
    472	if (p->reversed) {
    473		if (p->start != p->end)
    474			*--p->start = '/';
    475		p->start -= name_len;
    476		*prepared = p->start;
    477	} else {
    478		if (p->start != p->end)
    479			*p->end++ = '/';
    480		*prepared = p->end;
    481		p->end += name_len;
    482		*p->end = 0;
    483	}
    484
    485out:
    486	return ret;
    487}
    488
    489static int fs_path_add(struct fs_path *p, const char *name, int name_len)
    490{
    491	int ret;
    492	char *prepared;
    493
    494	ret = fs_path_prepare_for_add(p, name_len, &prepared);
    495	if (ret < 0)
    496		goto out;
    497	memcpy(prepared, name, name_len);
    498
    499out:
    500	return ret;
    501}
    502
    503static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
    504{
    505	int ret;
    506	char *prepared;
    507
    508	ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
    509	if (ret < 0)
    510		goto out;
    511	memcpy(prepared, p2->start, p2->end - p2->start);
    512
    513out:
    514	return ret;
    515}
    516
    517static int fs_path_add_from_extent_buffer(struct fs_path *p,
    518					  struct extent_buffer *eb,
    519					  unsigned long off, int len)
    520{
    521	int ret;
    522	char *prepared;
    523
    524	ret = fs_path_prepare_for_add(p, len, &prepared);
    525	if (ret < 0)
    526		goto out;
    527
    528	read_extent_buffer(eb, prepared, off, len);
    529
    530out:
    531	return ret;
    532}
    533
    534static int fs_path_copy(struct fs_path *p, struct fs_path *from)
    535{
    536	p->reversed = from->reversed;
    537	fs_path_reset(p);
    538
    539	return fs_path_add_path(p, from);
    540}
    541
    542static void fs_path_unreverse(struct fs_path *p)
    543{
    544	char *tmp;
    545	int len;
    546
    547	if (!p->reversed)
    548		return;
    549
    550	tmp = p->start;
    551	len = p->end - p->start;
    552	p->start = p->buf;
    553	p->end = p->start + len;
    554	memmove(p->start, tmp, len + 1);
    555	p->reversed = 0;
    556}
    557
    558static struct btrfs_path *alloc_path_for_send(void)
    559{
    560	struct btrfs_path *path;
    561
    562	path = btrfs_alloc_path();
    563	if (!path)
    564		return NULL;
    565	path->search_commit_root = 1;
    566	path->skip_locking = 1;
    567	path->need_commit_sem = 1;
    568	return path;
    569}
    570
    571static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
    572{
    573	int ret;
    574	u32 pos = 0;
    575
    576	while (pos < len) {
    577		ret = kernel_write(filp, buf + pos, len - pos, off);
    578		/* TODO handle that correctly */
    579		/*if (ret == -ERESTARTSYS) {
    580			continue;
    581		}*/
    582		if (ret < 0)
    583			return ret;
    584		if (ret == 0) {
    585			return -EIO;
    586		}
    587		pos += ret;
    588	}
    589
    590	return 0;
    591}
    592
    593static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
    594{
    595	struct btrfs_tlv_header *hdr;
    596	int total_len = sizeof(*hdr) + len;
    597	int left = sctx->send_max_size - sctx->send_size;
    598
    599	if (unlikely(left < total_len))
    600		return -EOVERFLOW;
    601
    602	hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
    603	put_unaligned_le16(attr, &hdr->tlv_type);
    604	put_unaligned_le16(len, &hdr->tlv_len);
    605	memcpy(hdr + 1, data, len);
    606	sctx->send_size += total_len;
    607
    608	return 0;
    609}
    610
    611#define TLV_PUT_DEFINE_INT(bits) \
    612	static int tlv_put_u##bits(struct send_ctx *sctx,	 	\
    613			u##bits attr, u##bits value)			\
    614	{								\
    615		__le##bits __tmp = cpu_to_le##bits(value);		\
    616		return tlv_put(sctx, attr, &__tmp, sizeof(__tmp));	\
    617	}
    618
    619TLV_PUT_DEFINE_INT(64)
    620
    621static int tlv_put_string(struct send_ctx *sctx, u16 attr,
    622			  const char *str, int len)
    623{
    624	if (len == -1)
    625		len = strlen(str);
    626	return tlv_put(sctx, attr, str, len);
    627}
    628
    629static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
    630			const u8 *uuid)
    631{
    632	return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
    633}
    634
    635static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
    636				  struct extent_buffer *eb,
    637				  struct btrfs_timespec *ts)
    638{
    639	struct btrfs_timespec bts;
    640	read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
    641	return tlv_put(sctx, attr, &bts, sizeof(bts));
    642}
    643
    644
    645#define TLV_PUT(sctx, attrtype, data, attrlen) \
    646	do { \
    647		ret = tlv_put(sctx, attrtype, data, attrlen); \
    648		if (ret < 0) \
    649			goto tlv_put_failure; \
    650	} while (0)
    651
    652#define TLV_PUT_INT(sctx, attrtype, bits, value) \
    653	do { \
    654		ret = tlv_put_u##bits(sctx, attrtype, value); \
    655		if (ret < 0) \
    656			goto tlv_put_failure; \
    657	} while (0)
    658
    659#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
    660#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
    661#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
    662#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
    663#define TLV_PUT_STRING(sctx, attrtype, str, len) \
    664	do { \
    665		ret = tlv_put_string(sctx, attrtype, str, len); \
    666		if (ret < 0) \
    667			goto tlv_put_failure; \
    668	} while (0)
    669#define TLV_PUT_PATH(sctx, attrtype, p) \
    670	do { \
    671		ret = tlv_put_string(sctx, attrtype, p->start, \
    672			p->end - p->start); \
    673		if (ret < 0) \
    674			goto tlv_put_failure; \
    675	} while(0)
    676#define TLV_PUT_UUID(sctx, attrtype, uuid) \
    677	do { \
    678		ret = tlv_put_uuid(sctx, attrtype, uuid); \
    679		if (ret < 0) \
    680			goto tlv_put_failure; \
    681	} while (0)
    682#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
    683	do { \
    684		ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
    685		if (ret < 0) \
    686			goto tlv_put_failure; \
    687	} while (0)
    688
    689static int send_header(struct send_ctx *sctx)
    690{
    691	struct btrfs_stream_header hdr;
    692
    693	strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
    694	hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
    695
    696	return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
    697					&sctx->send_off);
    698}
    699
    700/*
    701 * For each command/item we want to send to userspace, we call this function.
    702 */
    703static int begin_cmd(struct send_ctx *sctx, int cmd)
    704{
    705	struct btrfs_cmd_header *hdr;
    706
    707	if (WARN_ON(!sctx->send_buf))
    708		return -EINVAL;
    709
    710	BUG_ON(sctx->send_size);
    711
    712	sctx->send_size += sizeof(*hdr);
    713	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
    714	put_unaligned_le16(cmd, &hdr->cmd);
    715
    716	return 0;
    717}
    718
    719static int send_cmd(struct send_ctx *sctx)
    720{
    721	int ret;
    722	struct btrfs_cmd_header *hdr;
    723	u32 crc;
    724
    725	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
    726	put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
    727	put_unaligned_le32(0, &hdr->crc);
    728
    729	crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
    730	put_unaligned_le32(crc, &hdr->crc);
    731
    732	ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
    733					&sctx->send_off);
    734
    735	sctx->total_send_size += sctx->send_size;
    736	sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size;
    737	sctx->send_size = 0;
    738
    739	return ret;
    740}
    741
    742/*
    743 * Sends a move instruction to user space
    744 */
    745static int send_rename(struct send_ctx *sctx,
    746		     struct fs_path *from, struct fs_path *to)
    747{
    748	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
    749	int ret;
    750
    751	btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start);
    752
    753	ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
    754	if (ret < 0)
    755		goto out;
    756
    757	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
    758	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
    759
    760	ret = send_cmd(sctx);
    761
    762tlv_put_failure:
    763out:
    764	return ret;
    765}
    766
    767/*
    768 * Sends a link instruction to user space
    769 */
    770static int send_link(struct send_ctx *sctx,
    771		     struct fs_path *path, struct fs_path *lnk)
    772{
    773	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
    774	int ret;
    775
    776	btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start);
    777
    778	ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
    779	if (ret < 0)
    780		goto out;
    781
    782	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
    783	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
    784
    785	ret = send_cmd(sctx);
    786
    787tlv_put_failure:
    788out:
    789	return ret;
    790}
    791
    792/*
    793 * Sends an unlink instruction to user space
    794 */
    795static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
    796{
    797	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
    798	int ret;
    799
    800	btrfs_debug(fs_info, "send_unlink %s", path->start);
    801
    802	ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
    803	if (ret < 0)
    804		goto out;
    805
    806	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
    807
    808	ret = send_cmd(sctx);
    809
    810tlv_put_failure:
    811out:
    812	return ret;
    813}
    814
    815/*
    816 * Sends a rmdir instruction to user space
    817 */
    818static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
    819{
    820	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
    821	int ret;
    822
    823	btrfs_debug(fs_info, "send_rmdir %s", path->start);
    824
    825	ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
    826	if (ret < 0)
    827		goto out;
    828
    829	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
    830
    831	ret = send_cmd(sctx);
    832
    833tlv_put_failure:
    834out:
    835	return ret;
    836}
    837
    838/*
    839 * Helper function to retrieve some fields from an inode item.
    840 */
    841static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
    842			  u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
    843			  u64 *gid, u64 *rdev)
    844{
    845	int ret;
    846	struct btrfs_inode_item *ii;
    847	struct btrfs_key key;
    848
    849	key.objectid = ino;
    850	key.type = BTRFS_INODE_ITEM_KEY;
    851	key.offset = 0;
    852	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    853	if (ret) {
    854		if (ret > 0)
    855			ret = -ENOENT;
    856		return ret;
    857	}
    858
    859	ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
    860			struct btrfs_inode_item);
    861	if (size)
    862		*size = btrfs_inode_size(path->nodes[0], ii);
    863	if (gen)
    864		*gen = btrfs_inode_generation(path->nodes[0], ii);
    865	if (mode)
    866		*mode = btrfs_inode_mode(path->nodes[0], ii);
    867	if (uid)
    868		*uid = btrfs_inode_uid(path->nodes[0], ii);
    869	if (gid)
    870		*gid = btrfs_inode_gid(path->nodes[0], ii);
    871	if (rdev)
    872		*rdev = btrfs_inode_rdev(path->nodes[0], ii);
    873
    874	return ret;
    875}
    876
    877static int get_inode_info(struct btrfs_root *root,
    878			  u64 ino, u64 *size, u64 *gen,
    879			  u64 *mode, u64 *uid, u64 *gid,
    880			  u64 *rdev)
    881{
    882	struct btrfs_path *path;
    883	int ret;
    884
    885	path = alloc_path_for_send();
    886	if (!path)
    887		return -ENOMEM;
    888	ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
    889			       rdev);
    890	btrfs_free_path(path);
    891	return ret;
    892}
    893
    894typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
    895				   struct fs_path *p,
    896				   void *ctx);
    897
    898/*
    899 * Helper function to iterate the entries in ONE btrfs_inode_ref or
    900 * btrfs_inode_extref.
    901 * The iterate callback may return a non zero value to stop iteration. This can
    902 * be a negative value for error codes or 1 to simply stop it.
    903 *
    904 * path must point to the INODE_REF or INODE_EXTREF when called.
    905 */
    906static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
    907			     struct btrfs_key *found_key, int resolve,
    908			     iterate_inode_ref_t iterate, void *ctx)
    909{
    910	struct extent_buffer *eb = path->nodes[0];
    911	struct btrfs_inode_ref *iref;
    912	struct btrfs_inode_extref *extref;
    913	struct btrfs_path *tmp_path;
    914	struct fs_path *p;
    915	u32 cur = 0;
    916	u32 total;
    917	int slot = path->slots[0];
    918	u32 name_len;
    919	char *start;
    920	int ret = 0;
    921	int num = 0;
    922	int index;
    923	u64 dir;
    924	unsigned long name_off;
    925	unsigned long elem_size;
    926	unsigned long ptr;
    927
    928	p = fs_path_alloc_reversed();
    929	if (!p)
    930		return -ENOMEM;
    931
    932	tmp_path = alloc_path_for_send();
    933	if (!tmp_path) {
    934		fs_path_free(p);
    935		return -ENOMEM;
    936	}
    937
    938
    939	if (found_key->type == BTRFS_INODE_REF_KEY) {
    940		ptr = (unsigned long)btrfs_item_ptr(eb, slot,
    941						    struct btrfs_inode_ref);
    942		total = btrfs_item_size(eb, slot);
    943		elem_size = sizeof(*iref);
    944	} else {
    945		ptr = btrfs_item_ptr_offset(eb, slot);
    946		total = btrfs_item_size(eb, slot);
    947		elem_size = sizeof(*extref);
    948	}
    949
    950	while (cur < total) {
    951		fs_path_reset(p);
    952
    953		if (found_key->type == BTRFS_INODE_REF_KEY) {
    954			iref = (struct btrfs_inode_ref *)(ptr + cur);
    955			name_len = btrfs_inode_ref_name_len(eb, iref);
    956			name_off = (unsigned long)(iref + 1);
    957			index = btrfs_inode_ref_index(eb, iref);
    958			dir = found_key->offset;
    959		} else {
    960			extref = (struct btrfs_inode_extref *)(ptr + cur);
    961			name_len = btrfs_inode_extref_name_len(eb, extref);
    962			name_off = (unsigned long)&extref->name;
    963			index = btrfs_inode_extref_index(eb, extref);
    964			dir = btrfs_inode_extref_parent(eb, extref);
    965		}
    966
    967		if (resolve) {
    968			start = btrfs_ref_to_path(root, tmp_path, name_len,
    969						  name_off, eb, dir,
    970						  p->buf, p->buf_len);
    971			if (IS_ERR(start)) {
    972				ret = PTR_ERR(start);
    973				goto out;
    974			}
    975			if (start < p->buf) {
    976				/* overflow , try again with larger buffer */
    977				ret = fs_path_ensure_buf(p,
    978						p->buf_len + p->buf - start);
    979				if (ret < 0)
    980					goto out;
    981				start = btrfs_ref_to_path(root, tmp_path,
    982							  name_len, name_off,
    983							  eb, dir,
    984							  p->buf, p->buf_len);
    985				if (IS_ERR(start)) {
    986					ret = PTR_ERR(start);
    987					goto out;
    988				}
    989				BUG_ON(start < p->buf);
    990			}
    991			p->start = start;
    992		} else {
    993			ret = fs_path_add_from_extent_buffer(p, eb, name_off,
    994							     name_len);
    995			if (ret < 0)
    996				goto out;
    997		}
    998
    999		cur += elem_size + name_len;
   1000		ret = iterate(num, dir, index, p, ctx);
   1001		if (ret)
   1002			goto out;
   1003		num++;
   1004	}
   1005
   1006out:
   1007	btrfs_free_path(tmp_path);
   1008	fs_path_free(p);
   1009	return ret;
   1010}
   1011
   1012typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
   1013				  const char *name, int name_len,
   1014				  const char *data, int data_len,
   1015				  void *ctx);
   1016
   1017/*
   1018 * Helper function to iterate the entries in ONE btrfs_dir_item.
   1019 * The iterate callback may return a non zero value to stop iteration. This can
   1020 * be a negative value for error codes or 1 to simply stop it.
   1021 *
   1022 * path must point to the dir item when called.
   1023 */
   1024static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
   1025			    iterate_dir_item_t iterate, void *ctx)
   1026{
   1027	int ret = 0;
   1028	struct extent_buffer *eb;
   1029	struct btrfs_dir_item *di;
   1030	struct btrfs_key di_key;
   1031	char *buf = NULL;
   1032	int buf_len;
   1033	u32 name_len;
   1034	u32 data_len;
   1035	u32 cur;
   1036	u32 len;
   1037	u32 total;
   1038	int slot;
   1039	int num;
   1040
   1041	/*
   1042	 * Start with a small buffer (1 page). If later we end up needing more
   1043	 * space, which can happen for xattrs on a fs with a leaf size greater
   1044	 * then the page size, attempt to increase the buffer. Typically xattr
   1045	 * values are small.
   1046	 */
   1047	buf_len = PATH_MAX;
   1048	buf = kmalloc(buf_len, GFP_KERNEL);
   1049	if (!buf) {
   1050		ret = -ENOMEM;
   1051		goto out;
   1052	}
   1053
   1054	eb = path->nodes[0];
   1055	slot = path->slots[0];
   1056	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
   1057	cur = 0;
   1058	len = 0;
   1059	total = btrfs_item_size(eb, slot);
   1060
   1061	num = 0;
   1062	while (cur < total) {
   1063		name_len = btrfs_dir_name_len(eb, di);
   1064		data_len = btrfs_dir_data_len(eb, di);
   1065		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
   1066
   1067		if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) {
   1068			if (name_len > XATTR_NAME_MAX) {
   1069				ret = -ENAMETOOLONG;
   1070				goto out;
   1071			}
   1072			if (name_len + data_len >
   1073					BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
   1074				ret = -E2BIG;
   1075				goto out;
   1076			}
   1077		} else {
   1078			/*
   1079			 * Path too long
   1080			 */
   1081			if (name_len + data_len > PATH_MAX) {
   1082				ret = -ENAMETOOLONG;
   1083				goto out;
   1084			}
   1085		}
   1086
   1087		if (name_len + data_len > buf_len) {
   1088			buf_len = name_len + data_len;
   1089			if (is_vmalloc_addr(buf)) {
   1090				vfree(buf);
   1091				buf = NULL;
   1092			} else {
   1093				char *tmp = krealloc(buf, buf_len,
   1094						GFP_KERNEL | __GFP_NOWARN);
   1095
   1096				if (!tmp)
   1097					kfree(buf);
   1098				buf = tmp;
   1099			}
   1100			if (!buf) {
   1101				buf = kvmalloc(buf_len, GFP_KERNEL);
   1102				if (!buf) {
   1103					ret = -ENOMEM;
   1104					goto out;
   1105				}
   1106			}
   1107		}
   1108
   1109		read_extent_buffer(eb, buf, (unsigned long)(di + 1),
   1110				name_len + data_len);
   1111
   1112		len = sizeof(*di) + name_len + data_len;
   1113		di = (struct btrfs_dir_item *)((char *)di + len);
   1114		cur += len;
   1115
   1116		ret = iterate(num, &di_key, buf, name_len, buf + name_len,
   1117			      data_len, ctx);
   1118		if (ret < 0)
   1119			goto out;
   1120		if (ret) {
   1121			ret = 0;
   1122			goto out;
   1123		}
   1124
   1125		num++;
   1126	}
   1127
   1128out:
   1129	kvfree(buf);
   1130	return ret;
   1131}
   1132
   1133static int __copy_first_ref(int num, u64 dir, int index,
   1134			    struct fs_path *p, void *ctx)
   1135{
   1136	int ret;
   1137	struct fs_path *pt = ctx;
   1138
   1139	ret = fs_path_copy(pt, p);
   1140	if (ret < 0)
   1141		return ret;
   1142
   1143	/* we want the first only */
   1144	return 1;
   1145}
   1146
   1147/*
   1148 * Retrieve the first path of an inode. If an inode has more then one
   1149 * ref/hardlink, this is ignored.
   1150 */
   1151static int get_inode_path(struct btrfs_root *root,
   1152			  u64 ino, struct fs_path *path)
   1153{
   1154	int ret;
   1155	struct btrfs_key key, found_key;
   1156	struct btrfs_path *p;
   1157
   1158	p = alloc_path_for_send();
   1159	if (!p)
   1160		return -ENOMEM;
   1161
   1162	fs_path_reset(path);
   1163
   1164	key.objectid = ino;
   1165	key.type = BTRFS_INODE_REF_KEY;
   1166	key.offset = 0;
   1167
   1168	ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
   1169	if (ret < 0)
   1170		goto out;
   1171	if (ret) {
   1172		ret = 1;
   1173		goto out;
   1174	}
   1175	btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
   1176	if (found_key.objectid != ino ||
   1177	    (found_key.type != BTRFS_INODE_REF_KEY &&
   1178	     found_key.type != BTRFS_INODE_EXTREF_KEY)) {
   1179		ret = -ENOENT;
   1180		goto out;
   1181	}
   1182
   1183	ret = iterate_inode_ref(root, p, &found_key, 1,
   1184				__copy_first_ref, path);
   1185	if (ret < 0)
   1186		goto out;
   1187	ret = 0;
   1188
   1189out:
   1190	btrfs_free_path(p);
   1191	return ret;
   1192}
   1193
   1194struct backref_ctx {
   1195	struct send_ctx *sctx;
   1196
   1197	/* number of total found references */
   1198	u64 found;
   1199
   1200	/*
   1201	 * used for clones found in send_root. clones found behind cur_objectid
   1202	 * and cur_offset are not considered as allowed clones.
   1203	 */
   1204	u64 cur_objectid;
   1205	u64 cur_offset;
   1206
   1207	/* may be truncated in case it's the last extent in a file */
   1208	u64 extent_len;
   1209
   1210	/* Just to check for bugs in backref resolving */
   1211	int found_itself;
   1212};
   1213
   1214static int __clone_root_cmp_bsearch(const void *key, const void *elt)
   1215{
   1216	u64 root = (u64)(uintptr_t)key;
   1217	const struct clone_root *cr = elt;
   1218
   1219	if (root < cr->root->root_key.objectid)
   1220		return -1;
   1221	if (root > cr->root->root_key.objectid)
   1222		return 1;
   1223	return 0;
   1224}
   1225
   1226static int __clone_root_cmp_sort(const void *e1, const void *e2)
   1227{
   1228	const struct clone_root *cr1 = e1;
   1229	const struct clone_root *cr2 = e2;
   1230
   1231	if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
   1232		return -1;
   1233	if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
   1234		return 1;
   1235	return 0;
   1236}
   1237
   1238/*
   1239 * Called for every backref that is found for the current extent.
   1240 * Results are collected in sctx->clone_roots->ino/offset/found_refs
   1241 */
   1242static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
   1243{
   1244	struct backref_ctx *bctx = ctx_;
   1245	struct clone_root *found;
   1246
   1247	/* First check if the root is in the list of accepted clone sources */
   1248	found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
   1249			bctx->sctx->clone_roots_cnt,
   1250			sizeof(struct clone_root),
   1251			__clone_root_cmp_bsearch);
   1252	if (!found)
   1253		return 0;
   1254
   1255	if (found->root == bctx->sctx->send_root &&
   1256	    ino == bctx->cur_objectid &&
   1257	    offset == bctx->cur_offset) {
   1258		bctx->found_itself = 1;
   1259	}
   1260
   1261	/*
   1262	 * Make sure we don't consider clones from send_root that are
   1263	 * behind the current inode/offset.
   1264	 */
   1265	if (found->root == bctx->sctx->send_root) {
   1266		/*
   1267		 * If the source inode was not yet processed we can't issue a
   1268		 * clone operation, as the source extent does not exist yet at
   1269		 * the destination of the stream.
   1270		 */
   1271		if (ino > bctx->cur_objectid)
   1272			return 0;
   1273		/*
   1274		 * We clone from the inode currently being sent as long as the
   1275		 * source extent is already processed, otherwise we could try
   1276		 * to clone from an extent that does not exist yet at the
   1277		 * destination of the stream.
   1278		 */
   1279		if (ino == bctx->cur_objectid &&
   1280		    offset + bctx->extent_len >
   1281		    bctx->sctx->cur_inode_next_write_offset)
   1282			return 0;
   1283	}
   1284
   1285	bctx->found++;
   1286	found->found_refs++;
   1287	if (ino < found->ino) {
   1288		found->ino = ino;
   1289		found->offset = offset;
   1290	} else if (found->ino == ino) {
   1291		/*
   1292		 * same extent found more then once in the same file.
   1293		 */
   1294		if (found->offset > offset + bctx->extent_len)
   1295			found->offset = offset;
   1296	}
   1297
   1298	return 0;
   1299}
   1300
   1301/*
   1302 * Given an inode, offset and extent item, it finds a good clone for a clone
   1303 * instruction. Returns -ENOENT when none could be found. The function makes
   1304 * sure that the returned clone is usable at the point where sending is at the
   1305 * moment. This means, that no clones are accepted which lie behind the current
   1306 * inode+offset.
   1307 *
   1308 * path must point to the extent item when called.
   1309 */
   1310static int find_extent_clone(struct send_ctx *sctx,
   1311			     struct btrfs_path *path,
   1312			     u64 ino, u64 data_offset,
   1313			     u64 ino_size,
   1314			     struct clone_root **found)
   1315{
   1316	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
   1317	int ret;
   1318	int extent_type;
   1319	u64 logical;
   1320	u64 disk_byte;
   1321	u64 num_bytes;
   1322	u64 extent_item_pos;
   1323	u64 flags = 0;
   1324	struct btrfs_file_extent_item *fi;
   1325	struct extent_buffer *eb = path->nodes[0];
   1326	struct backref_ctx backref_ctx = {0};
   1327	struct clone_root *cur_clone_root;
   1328	struct btrfs_key found_key;
   1329	struct btrfs_path *tmp_path;
   1330	struct btrfs_extent_item *ei;
   1331	int compressed;
   1332	u32 i;
   1333
   1334	tmp_path = alloc_path_for_send();
   1335	if (!tmp_path)
   1336		return -ENOMEM;
   1337
   1338	/* We only use this path under the commit sem */
   1339	tmp_path->need_commit_sem = 0;
   1340
   1341	if (data_offset >= ino_size) {
   1342		/*
   1343		 * There may be extents that lie behind the file's size.
   1344		 * I at least had this in combination with snapshotting while
   1345		 * writing large files.
   1346		 */
   1347		ret = 0;
   1348		goto out;
   1349	}
   1350
   1351	fi = btrfs_item_ptr(eb, path->slots[0],
   1352			struct btrfs_file_extent_item);
   1353	extent_type = btrfs_file_extent_type(eb, fi);
   1354	if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
   1355		ret = -ENOENT;
   1356		goto out;
   1357	}
   1358	compressed = btrfs_file_extent_compression(eb, fi);
   1359
   1360	num_bytes = btrfs_file_extent_num_bytes(eb, fi);
   1361	disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
   1362	if (disk_byte == 0) {
   1363		ret = -ENOENT;
   1364		goto out;
   1365	}
   1366	logical = disk_byte + btrfs_file_extent_offset(eb, fi);
   1367
   1368	down_read(&fs_info->commit_root_sem);
   1369	ret = extent_from_logical(fs_info, disk_byte, tmp_path,
   1370				  &found_key, &flags);
   1371	up_read(&fs_info->commit_root_sem);
   1372
   1373	if (ret < 0)
   1374		goto out;
   1375	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
   1376		ret = -EIO;
   1377		goto out;
   1378	}
   1379
   1380	ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
   1381			    struct btrfs_extent_item);
   1382	/*
   1383	 * Backreference walking (iterate_extent_inodes() below) is currently
   1384	 * too expensive when an extent has a large number of references, both
   1385	 * in time spent and used memory. So for now just fallback to write
   1386	 * operations instead of clone operations when an extent has more than
   1387	 * a certain amount of references.
   1388	 */
   1389	if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
   1390		ret = -ENOENT;
   1391		goto out;
   1392	}
   1393	btrfs_release_path(tmp_path);
   1394
   1395	/*
   1396	 * Setup the clone roots.
   1397	 */
   1398	for (i = 0; i < sctx->clone_roots_cnt; i++) {
   1399		cur_clone_root = sctx->clone_roots + i;
   1400		cur_clone_root->ino = (u64)-1;
   1401		cur_clone_root->offset = 0;
   1402		cur_clone_root->found_refs = 0;
   1403	}
   1404
   1405	backref_ctx.sctx = sctx;
   1406	backref_ctx.found = 0;
   1407	backref_ctx.cur_objectid = ino;
   1408	backref_ctx.cur_offset = data_offset;
   1409	backref_ctx.found_itself = 0;
   1410	backref_ctx.extent_len = num_bytes;
   1411
   1412	/*
   1413	 * The last extent of a file may be too large due to page alignment.
   1414	 * We need to adjust extent_len in this case so that the checks in
   1415	 * __iterate_backrefs work.
   1416	 */
   1417	if (data_offset + num_bytes >= ino_size)
   1418		backref_ctx.extent_len = ino_size - data_offset;
   1419
   1420	/*
   1421	 * Now collect all backrefs.
   1422	 */
   1423	if (compressed == BTRFS_COMPRESS_NONE)
   1424		extent_item_pos = logical - found_key.objectid;
   1425	else
   1426		extent_item_pos = 0;
   1427	ret = iterate_extent_inodes(fs_info, found_key.objectid,
   1428				    extent_item_pos, 1, __iterate_backrefs,
   1429				    &backref_ctx, false);
   1430
   1431	if (ret < 0)
   1432		goto out;
   1433
   1434	down_read(&fs_info->commit_root_sem);
   1435	if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
   1436		/*
   1437		 * A transaction commit for a transaction in which block group
   1438		 * relocation was done just happened.
   1439		 * The disk_bytenr of the file extent item we processed is
   1440		 * possibly stale, referring to the extent's location before
   1441		 * relocation. So act as if we haven't found any clone sources
   1442		 * and fallback to write commands, which will read the correct
   1443		 * data from the new extent location. Otherwise we will fail
   1444		 * below because we haven't found our own back reference or we
   1445		 * could be getting incorrect sources in case the old extent
   1446		 * was already reallocated after the relocation.
   1447		 */
   1448		up_read(&fs_info->commit_root_sem);
   1449		ret = -ENOENT;
   1450		goto out;
   1451	}
   1452	up_read(&fs_info->commit_root_sem);
   1453
   1454	if (!backref_ctx.found_itself) {
   1455		/* found a bug in backref code? */
   1456		ret = -EIO;
   1457		btrfs_err(fs_info,
   1458			  "did not find backref in send_root. inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu",
   1459			  ino, data_offset, disk_byte, found_key.objectid);
   1460		goto out;
   1461	}
   1462
   1463	btrfs_debug(fs_info,
   1464		    "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
   1465		    data_offset, ino, num_bytes, logical);
   1466
   1467	if (!backref_ctx.found)
   1468		btrfs_debug(fs_info, "no clones found");
   1469
   1470	cur_clone_root = NULL;
   1471	for (i = 0; i < sctx->clone_roots_cnt; i++) {
   1472		if (sctx->clone_roots[i].found_refs) {
   1473			if (!cur_clone_root)
   1474				cur_clone_root = sctx->clone_roots + i;
   1475			else if (sctx->clone_roots[i].root == sctx->send_root)
   1476				/* prefer clones from send_root over others */
   1477				cur_clone_root = sctx->clone_roots + i;
   1478		}
   1479
   1480	}
   1481
   1482	if (cur_clone_root) {
   1483		*found = cur_clone_root;
   1484		ret = 0;
   1485	} else {
   1486		ret = -ENOENT;
   1487	}
   1488
   1489out:
   1490	btrfs_free_path(tmp_path);
   1491	return ret;
   1492}
   1493
   1494static int read_symlink(struct btrfs_root *root,
   1495			u64 ino,
   1496			struct fs_path *dest)
   1497{
   1498	int ret;
   1499	struct btrfs_path *path;
   1500	struct btrfs_key key;
   1501	struct btrfs_file_extent_item *ei;
   1502	u8 type;
   1503	u8 compression;
   1504	unsigned long off;
   1505	int len;
   1506
   1507	path = alloc_path_for_send();
   1508	if (!path)
   1509		return -ENOMEM;
   1510
   1511	key.objectid = ino;
   1512	key.type = BTRFS_EXTENT_DATA_KEY;
   1513	key.offset = 0;
   1514	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   1515	if (ret < 0)
   1516		goto out;
   1517	if (ret) {
   1518		/*
   1519		 * An empty symlink inode. Can happen in rare error paths when
   1520		 * creating a symlink (transaction committed before the inode
   1521		 * eviction handler removed the symlink inode items and a crash
   1522		 * happened in between or the subvol was snapshoted in between).
   1523		 * Print an informative message to dmesg/syslog so that the user
   1524		 * can delete the symlink.
   1525		 */
   1526		btrfs_err(root->fs_info,
   1527			  "Found empty symlink inode %llu at root %llu",
   1528			  ino, root->root_key.objectid);
   1529		ret = -EIO;
   1530		goto out;
   1531	}
   1532
   1533	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
   1534			struct btrfs_file_extent_item);
   1535	type = btrfs_file_extent_type(path->nodes[0], ei);
   1536	compression = btrfs_file_extent_compression(path->nodes[0], ei);
   1537	BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
   1538	BUG_ON(compression);
   1539
   1540	off = btrfs_file_extent_inline_start(ei);
   1541	len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
   1542
   1543	ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
   1544
   1545out:
   1546	btrfs_free_path(path);
   1547	return ret;
   1548}
   1549
   1550/*
   1551 * Helper function to generate a file name that is unique in the root of
   1552 * send_root and parent_root. This is used to generate names for orphan inodes.
   1553 */
   1554static int gen_unique_name(struct send_ctx *sctx,
   1555			   u64 ino, u64 gen,
   1556			   struct fs_path *dest)
   1557{
   1558	int ret = 0;
   1559	struct btrfs_path *path;
   1560	struct btrfs_dir_item *di;
   1561	char tmp[64];
   1562	int len;
   1563	u64 idx = 0;
   1564
   1565	path = alloc_path_for_send();
   1566	if (!path)
   1567		return -ENOMEM;
   1568
   1569	while (1) {
   1570		len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
   1571				ino, gen, idx);
   1572		ASSERT(len < sizeof(tmp));
   1573
   1574		di = btrfs_lookup_dir_item(NULL, sctx->send_root,
   1575				path, BTRFS_FIRST_FREE_OBJECTID,
   1576				tmp, strlen(tmp), 0);
   1577		btrfs_release_path(path);
   1578		if (IS_ERR(di)) {
   1579			ret = PTR_ERR(di);
   1580			goto out;
   1581		}
   1582		if (di) {
   1583			/* not unique, try again */
   1584			idx++;
   1585			continue;
   1586		}
   1587
   1588		if (!sctx->parent_root) {
   1589			/* unique */
   1590			ret = 0;
   1591			break;
   1592		}
   1593
   1594		di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
   1595				path, BTRFS_FIRST_FREE_OBJECTID,
   1596				tmp, strlen(tmp), 0);
   1597		btrfs_release_path(path);
   1598		if (IS_ERR(di)) {
   1599			ret = PTR_ERR(di);
   1600			goto out;
   1601		}
   1602		if (di) {
   1603			/* not unique, try again */
   1604			idx++;
   1605			continue;
   1606		}
   1607		/* unique */
   1608		break;
   1609	}
   1610
   1611	ret = fs_path_add(dest, tmp, strlen(tmp));
   1612
   1613out:
   1614	btrfs_free_path(path);
   1615	return ret;
   1616}
   1617
   1618enum inode_state {
   1619	inode_state_no_change,
   1620	inode_state_will_create,
   1621	inode_state_did_create,
   1622	inode_state_will_delete,
   1623	inode_state_did_delete,
   1624};
   1625
   1626static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
   1627{
   1628	int ret;
   1629	int left_ret;
   1630	int right_ret;
   1631	u64 left_gen;
   1632	u64 right_gen;
   1633
   1634	ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
   1635			NULL, NULL);
   1636	if (ret < 0 && ret != -ENOENT)
   1637		goto out;
   1638	left_ret = ret;
   1639
   1640	if (!sctx->parent_root) {
   1641		right_ret = -ENOENT;
   1642	} else {
   1643		ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
   1644				NULL, NULL, NULL, NULL);
   1645		if (ret < 0 && ret != -ENOENT)
   1646			goto out;
   1647		right_ret = ret;
   1648	}
   1649
   1650	if (!left_ret && !right_ret) {
   1651		if (left_gen == gen && right_gen == gen) {
   1652			ret = inode_state_no_change;
   1653		} else if (left_gen == gen) {
   1654			if (ino < sctx->send_progress)
   1655				ret = inode_state_did_create;
   1656			else
   1657				ret = inode_state_will_create;
   1658		} else if (right_gen == gen) {
   1659			if (ino < sctx->send_progress)
   1660				ret = inode_state_did_delete;
   1661			else
   1662				ret = inode_state_will_delete;
   1663		} else  {
   1664			ret = -ENOENT;
   1665		}
   1666	} else if (!left_ret) {
   1667		if (left_gen == gen) {
   1668			if (ino < sctx->send_progress)
   1669				ret = inode_state_did_create;
   1670			else
   1671				ret = inode_state_will_create;
   1672		} else {
   1673			ret = -ENOENT;
   1674		}
   1675	} else if (!right_ret) {
   1676		if (right_gen == gen) {
   1677			if (ino < sctx->send_progress)
   1678				ret = inode_state_did_delete;
   1679			else
   1680				ret = inode_state_will_delete;
   1681		} else {
   1682			ret = -ENOENT;
   1683		}
   1684	} else {
   1685		ret = -ENOENT;
   1686	}
   1687
   1688out:
   1689	return ret;
   1690}
   1691
   1692static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
   1693{
   1694	int ret;
   1695
   1696	if (ino == BTRFS_FIRST_FREE_OBJECTID)
   1697		return 1;
   1698
   1699	ret = get_cur_inode_state(sctx, ino, gen);
   1700	if (ret < 0)
   1701		goto out;
   1702
   1703	if (ret == inode_state_no_change ||
   1704	    ret == inode_state_did_create ||
   1705	    ret == inode_state_will_delete)
   1706		ret = 1;
   1707	else
   1708		ret = 0;
   1709
   1710out:
   1711	return ret;
   1712}
   1713
   1714/*
   1715 * Helper function to lookup a dir item in a dir.
   1716 */
   1717static int lookup_dir_item_inode(struct btrfs_root *root,
   1718				 u64 dir, const char *name, int name_len,
   1719				 u64 *found_inode)
   1720{
   1721	int ret = 0;
   1722	struct btrfs_dir_item *di;
   1723	struct btrfs_key key;
   1724	struct btrfs_path *path;
   1725
   1726	path = alloc_path_for_send();
   1727	if (!path)
   1728		return -ENOMEM;
   1729
   1730	di = btrfs_lookup_dir_item(NULL, root, path,
   1731			dir, name, name_len, 0);
   1732	if (IS_ERR_OR_NULL(di)) {
   1733		ret = di ? PTR_ERR(di) : -ENOENT;
   1734		goto out;
   1735	}
   1736	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
   1737	if (key.type == BTRFS_ROOT_ITEM_KEY) {
   1738		ret = -ENOENT;
   1739		goto out;
   1740	}
   1741	*found_inode = key.objectid;
   1742
   1743out:
   1744	btrfs_free_path(path);
   1745	return ret;
   1746}
   1747
   1748/*
   1749 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
   1750 * generation of the parent dir and the name of the dir entry.
   1751 */
   1752static int get_first_ref(struct btrfs_root *root, u64 ino,
   1753			 u64 *dir, u64 *dir_gen, struct fs_path *name)
   1754{
   1755	int ret;
   1756	struct btrfs_key key;
   1757	struct btrfs_key found_key;
   1758	struct btrfs_path *path;
   1759	int len;
   1760	u64 parent_dir;
   1761
   1762	path = alloc_path_for_send();
   1763	if (!path)
   1764		return -ENOMEM;
   1765
   1766	key.objectid = ino;
   1767	key.type = BTRFS_INODE_REF_KEY;
   1768	key.offset = 0;
   1769
   1770	ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
   1771	if (ret < 0)
   1772		goto out;
   1773	if (!ret)
   1774		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
   1775				path->slots[0]);
   1776	if (ret || found_key.objectid != ino ||
   1777	    (found_key.type != BTRFS_INODE_REF_KEY &&
   1778	     found_key.type != BTRFS_INODE_EXTREF_KEY)) {
   1779		ret = -ENOENT;
   1780		goto out;
   1781	}
   1782
   1783	if (found_key.type == BTRFS_INODE_REF_KEY) {
   1784		struct btrfs_inode_ref *iref;
   1785		iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
   1786				      struct btrfs_inode_ref);
   1787		len = btrfs_inode_ref_name_len(path->nodes[0], iref);
   1788		ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
   1789						     (unsigned long)(iref + 1),
   1790						     len);
   1791		parent_dir = found_key.offset;
   1792	} else {
   1793		struct btrfs_inode_extref *extref;
   1794		extref = btrfs_item_ptr(path->nodes[0], path->slots[0],
   1795					struct btrfs_inode_extref);
   1796		len = btrfs_inode_extref_name_len(path->nodes[0], extref);
   1797		ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
   1798					(unsigned long)&extref->name, len);
   1799		parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
   1800	}
   1801	if (ret < 0)
   1802		goto out;
   1803	btrfs_release_path(path);
   1804
   1805	if (dir_gen) {
   1806		ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
   1807				     NULL, NULL, NULL);
   1808		if (ret < 0)
   1809			goto out;
   1810	}
   1811
   1812	*dir = parent_dir;
   1813
   1814out:
   1815	btrfs_free_path(path);
   1816	return ret;
   1817}
   1818
   1819static int is_first_ref(struct btrfs_root *root,
   1820			u64 ino, u64 dir,
   1821			const char *name, int name_len)
   1822{
   1823	int ret;
   1824	struct fs_path *tmp_name;
   1825	u64 tmp_dir;
   1826
   1827	tmp_name = fs_path_alloc();
   1828	if (!tmp_name)
   1829		return -ENOMEM;
   1830
   1831	ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
   1832	if (ret < 0)
   1833		goto out;
   1834
   1835	if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
   1836		ret = 0;
   1837		goto out;
   1838	}
   1839
   1840	ret = !memcmp(tmp_name->start, name, name_len);
   1841
   1842out:
   1843	fs_path_free(tmp_name);
   1844	return ret;
   1845}
   1846
   1847/*
   1848 * Used by process_recorded_refs to determine if a new ref would overwrite an
   1849 * already existing ref. In case it detects an overwrite, it returns the
   1850 * inode/gen in who_ino/who_gen.
   1851 * When an overwrite is detected, process_recorded_refs does proper orphanizing
   1852 * to make sure later references to the overwritten inode are possible.
   1853 * Orphanizing is however only required for the first ref of an inode.
   1854 * process_recorded_refs does an additional is_first_ref check to see if
   1855 * orphanizing is really required.
   1856 */
   1857static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
   1858			      const char *name, int name_len,
   1859			      u64 *who_ino, u64 *who_gen, u64 *who_mode)
   1860{
   1861	int ret = 0;
   1862	u64 gen;
   1863	u64 other_inode = 0;
   1864
   1865	if (!sctx->parent_root)
   1866		goto out;
   1867
   1868	ret = is_inode_existent(sctx, dir, dir_gen);
   1869	if (ret <= 0)
   1870		goto out;
   1871
   1872	/*
   1873	 * If we have a parent root we need to verify that the parent dir was
   1874	 * not deleted and then re-created, if it was then we have no overwrite
   1875	 * and we can just unlink this entry.
   1876	 */
   1877	if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
   1878		ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
   1879				     NULL, NULL, NULL);
   1880		if (ret < 0 && ret != -ENOENT)
   1881			goto out;
   1882		if (ret) {
   1883			ret = 0;
   1884			goto out;
   1885		}
   1886		if (gen != dir_gen)
   1887			goto out;
   1888	}
   1889
   1890	ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
   1891				    &other_inode);
   1892	if (ret < 0 && ret != -ENOENT)
   1893		goto out;
   1894	if (ret) {
   1895		ret = 0;
   1896		goto out;
   1897	}
   1898
   1899	/*
   1900	 * Check if the overwritten ref was already processed. If yes, the ref
   1901	 * was already unlinked/moved, so we can safely assume that we will not
   1902	 * overwrite anything at this point in time.
   1903	 */
   1904	if (other_inode > sctx->send_progress ||
   1905	    is_waiting_for_move(sctx, other_inode)) {
   1906		ret = get_inode_info(sctx->parent_root, other_inode, NULL,
   1907				who_gen, who_mode, NULL, NULL, NULL);
   1908		if (ret < 0)
   1909			goto out;
   1910
   1911		ret = 1;
   1912		*who_ino = other_inode;
   1913	} else {
   1914		ret = 0;
   1915	}
   1916
   1917out:
   1918	return ret;
   1919}
   1920
   1921/*
   1922 * Checks if the ref was overwritten by an already processed inode. This is
   1923 * used by __get_cur_name_and_parent to find out if the ref was orphanized and
   1924 * thus the orphan name needs be used.
   1925 * process_recorded_refs also uses it to avoid unlinking of refs that were
   1926 * overwritten.
   1927 */
   1928static int did_overwrite_ref(struct send_ctx *sctx,
   1929			    u64 dir, u64 dir_gen,
   1930			    u64 ino, u64 ino_gen,
   1931			    const char *name, int name_len)
   1932{
   1933	int ret = 0;
   1934	u64 gen;
   1935	u64 ow_inode;
   1936
   1937	if (!sctx->parent_root)
   1938		goto out;
   1939
   1940	ret = is_inode_existent(sctx, dir, dir_gen);
   1941	if (ret <= 0)
   1942		goto out;
   1943
   1944	if (dir != BTRFS_FIRST_FREE_OBJECTID) {
   1945		ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL,
   1946				     NULL, NULL, NULL);
   1947		if (ret < 0 && ret != -ENOENT)
   1948			goto out;
   1949		if (ret) {
   1950			ret = 0;
   1951			goto out;
   1952		}
   1953		if (gen != dir_gen)
   1954			goto out;
   1955	}
   1956
   1957	/* check if the ref was overwritten by another ref */
   1958	ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
   1959				    &ow_inode);
   1960	if (ret < 0 && ret != -ENOENT)
   1961		goto out;
   1962	if (ret) {
   1963		/* was never and will never be overwritten */
   1964		ret = 0;
   1965		goto out;
   1966	}
   1967
   1968	ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
   1969			NULL, NULL);
   1970	if (ret < 0)
   1971		goto out;
   1972
   1973	if (ow_inode == ino && gen == ino_gen) {
   1974		ret = 0;
   1975		goto out;
   1976	}
   1977
   1978	/*
   1979	 * We know that it is or will be overwritten. Check this now.
   1980	 * The current inode being processed might have been the one that caused
   1981	 * inode 'ino' to be orphanized, therefore check if ow_inode matches
   1982	 * the current inode being processed.
   1983	 */
   1984	if ((ow_inode < sctx->send_progress) ||
   1985	    (ino != sctx->cur_ino && ow_inode == sctx->cur_ino &&
   1986	     gen == sctx->cur_inode_gen))
   1987		ret = 1;
   1988	else
   1989		ret = 0;
   1990
   1991out:
   1992	return ret;
   1993}
   1994
   1995/*
   1996 * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
   1997 * that got overwritten. This is used by process_recorded_refs to determine
   1998 * if it has to use the path as returned by get_cur_path or the orphan name.
   1999 */
   2000static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
   2001{
   2002	int ret = 0;
   2003	struct fs_path *name = NULL;
   2004	u64 dir;
   2005	u64 dir_gen;
   2006
   2007	if (!sctx->parent_root)
   2008		goto out;
   2009
   2010	name = fs_path_alloc();
   2011	if (!name)
   2012		return -ENOMEM;
   2013
   2014	ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
   2015	if (ret < 0)
   2016		goto out;
   2017
   2018	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
   2019			name->start, fs_path_len(name));
   2020
   2021out:
   2022	fs_path_free(name);
   2023	return ret;
   2024}
   2025
   2026/*
   2027 * Insert a name cache entry. On 32bit kernels the xarray index is 32bit,
   2028 * so we need to do some special handling in case we have clashes. This function
   2029 * takes care of this with the help of name_cache_entry::inum_aliases.
   2030 * In case of error, nce is kfreed.
   2031 */
   2032static int name_cache_insert(struct send_ctx *sctx,
   2033			     struct name_cache_entry *nce)
   2034{
   2035	int ret = 0;
   2036	struct list_head *nce_head;
   2037
   2038	nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino);
   2039	if (!nce_head) {
   2040		nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
   2041		if (!nce_head) {
   2042			kfree(nce);
   2043			return -ENOMEM;
   2044		}
   2045		INIT_LIST_HEAD(nce_head);
   2046
   2047		ret = xa_insert(&sctx->name_cache, nce->ino, nce_head, GFP_KERNEL);
   2048		if (ret < 0) {
   2049			kfree(nce_head);
   2050			kfree(nce);
   2051			return ret;
   2052		}
   2053	}
   2054	list_add_tail(&nce->inum_aliases, nce_head);
   2055	list_add_tail(&nce->list, &sctx->name_cache_list);
   2056	sctx->name_cache_size++;
   2057
   2058	return ret;
   2059}
   2060
   2061static void name_cache_delete(struct send_ctx *sctx,
   2062			      struct name_cache_entry *nce)
   2063{
   2064	struct list_head *nce_head;
   2065
   2066	nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino);
   2067	if (!nce_head) {
   2068		btrfs_err(sctx->send_root->fs_info,
   2069	      "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
   2070			nce->ino, sctx->name_cache_size);
   2071	}
   2072
   2073	list_del(&nce->inum_aliases);
   2074	list_del(&nce->list);
   2075	sctx->name_cache_size--;
   2076
   2077	/*
   2078	 * We may not get to the final release of nce_head if the lookup fails
   2079	 */
   2080	if (nce_head && list_empty(nce_head)) {
   2081		xa_erase(&sctx->name_cache, (unsigned long)nce->ino);
   2082		kfree(nce_head);
   2083	}
   2084}
   2085
   2086static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
   2087						    u64 ino, u64 gen)
   2088{
   2089	struct list_head *nce_head;
   2090	struct name_cache_entry *cur;
   2091
   2092	nce_head = xa_load(&sctx->name_cache, (unsigned long)ino);
   2093	if (!nce_head)
   2094		return NULL;
   2095
   2096	list_for_each_entry(cur, nce_head, inum_aliases) {
   2097		if (cur->ino == ino && cur->gen == gen)
   2098			return cur;
   2099	}
   2100	return NULL;
   2101}
   2102
   2103/*
   2104 * Remove some entries from the beginning of name_cache_list.
   2105 */
   2106static void name_cache_clean_unused(struct send_ctx *sctx)
   2107{
   2108	struct name_cache_entry *nce;
   2109
   2110	if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
   2111		return;
   2112
   2113	while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
   2114		nce = list_entry(sctx->name_cache_list.next,
   2115				struct name_cache_entry, list);
   2116		name_cache_delete(sctx, nce);
   2117		kfree(nce);
   2118	}
   2119}
   2120
   2121static void name_cache_free(struct send_ctx *sctx)
   2122{
   2123	struct name_cache_entry *nce;
   2124
   2125	while (!list_empty(&sctx->name_cache_list)) {
   2126		nce = list_entry(sctx->name_cache_list.next,
   2127				struct name_cache_entry, list);
   2128		name_cache_delete(sctx, nce);
   2129		kfree(nce);
   2130	}
   2131}
   2132
   2133/*
   2134 * Used by get_cur_path for each ref up to the root.
   2135 * Returns 0 if it succeeded.
   2136 * Returns 1 if the inode is not existent or got overwritten. In that case, the
   2137 * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
   2138 * is returned, parent_ino/parent_gen are not guaranteed to be valid.
   2139 * Returns <0 in case of error.
   2140 */
   2141static int __get_cur_name_and_parent(struct send_ctx *sctx,
   2142				     u64 ino, u64 gen,
   2143				     u64 *parent_ino,
   2144				     u64 *parent_gen,
   2145				     struct fs_path *dest)
   2146{
   2147	int ret;
   2148	int nce_ret;
   2149	struct name_cache_entry *nce = NULL;
   2150
   2151	/*
   2152	 * First check if we already did a call to this function with the same
   2153	 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
   2154	 * return the cached result.
   2155	 */
   2156	nce = name_cache_search(sctx, ino, gen);
   2157	if (nce) {
   2158		if (ino < sctx->send_progress && nce->need_later_update) {
   2159			name_cache_delete(sctx, nce);
   2160			kfree(nce);
   2161			nce = NULL;
   2162		} else {
   2163			/*
   2164			 * Removes the entry from the list and adds it back to
   2165			 * the end.  This marks the entry as recently used so
   2166			 * that name_cache_clean_unused does not remove it.
   2167			 */
   2168			list_move_tail(&nce->list, &sctx->name_cache_list);
   2169
   2170			*parent_ino = nce->parent_ino;
   2171			*parent_gen = nce->parent_gen;
   2172			ret = fs_path_add(dest, nce->name, nce->name_len);
   2173			if (ret < 0)
   2174				goto out;
   2175			ret = nce->ret;
   2176			goto out;
   2177		}
   2178	}
   2179
   2180	/*
   2181	 * If the inode is not existent yet, add the orphan name and return 1.
   2182	 * This should only happen for the parent dir that we determine in
   2183	 * __record_new_ref
   2184	 */
   2185	ret = is_inode_existent(sctx, ino, gen);
   2186	if (ret < 0)
   2187		goto out;
   2188
   2189	if (!ret) {
   2190		ret = gen_unique_name(sctx, ino, gen, dest);
   2191		if (ret < 0)
   2192			goto out;
   2193		ret = 1;
   2194		goto out_cache;
   2195	}
   2196
   2197	/*
   2198	 * Depending on whether the inode was already processed or not, use
   2199	 * send_root or parent_root for ref lookup.
   2200	 */
   2201	if (ino < sctx->send_progress)
   2202		ret = get_first_ref(sctx->send_root, ino,
   2203				    parent_ino, parent_gen, dest);
   2204	else
   2205		ret = get_first_ref(sctx->parent_root, ino,
   2206				    parent_ino, parent_gen, dest);
   2207	if (ret < 0)
   2208		goto out;
   2209
   2210	/*
   2211	 * Check if the ref was overwritten by an inode's ref that was processed
   2212	 * earlier. If yes, treat as orphan and return 1.
   2213	 */
   2214	ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
   2215			dest->start, dest->end - dest->start);
   2216	if (ret < 0)
   2217		goto out;
   2218	if (ret) {
   2219		fs_path_reset(dest);
   2220		ret = gen_unique_name(sctx, ino, gen, dest);
   2221		if (ret < 0)
   2222			goto out;
   2223		ret = 1;
   2224	}
   2225
   2226out_cache:
   2227	/*
   2228	 * Store the result of the lookup in the name cache.
   2229	 */
   2230	nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL);
   2231	if (!nce) {
   2232		ret = -ENOMEM;
   2233		goto out;
   2234	}
   2235
   2236	nce->ino = ino;
   2237	nce->gen = gen;
   2238	nce->parent_ino = *parent_ino;
   2239	nce->parent_gen = *parent_gen;
   2240	nce->name_len = fs_path_len(dest);
   2241	nce->ret = ret;
   2242	strcpy(nce->name, dest->start);
   2243
   2244	if (ino < sctx->send_progress)
   2245		nce->need_later_update = 0;
   2246	else
   2247		nce->need_later_update = 1;
   2248
   2249	nce_ret = name_cache_insert(sctx, nce);
   2250	if (nce_ret < 0)
   2251		ret = nce_ret;
   2252	name_cache_clean_unused(sctx);
   2253
   2254out:
   2255	return ret;
   2256}
   2257
   2258/*
   2259 * Magic happens here. This function returns the first ref to an inode as it
   2260 * would look like while receiving the stream at this point in time.
   2261 * We walk the path up to the root. For every inode in between, we check if it
   2262 * was already processed/sent. If yes, we continue with the parent as found
   2263 * in send_root. If not, we continue with the parent as found in parent_root.
   2264 * If we encounter an inode that was deleted at this point in time, we use the
   2265 * inodes "orphan" name instead of the real name and stop. Same with new inodes
   2266 * that were not created yet and overwritten inodes/refs.
   2267 *
   2268 * When do we have orphan inodes:
   2269 * 1. When an inode is freshly created and thus no valid refs are available yet
   2270 * 2. When a directory lost all it's refs (deleted) but still has dir items
   2271 *    inside which were not processed yet (pending for move/delete). If anyone
   2272 *    tried to get the path to the dir items, it would get a path inside that
   2273 *    orphan directory.
   2274 * 3. When an inode is moved around or gets new links, it may overwrite the ref
   2275 *    of an unprocessed inode. If in that case the first ref would be
   2276 *    overwritten, the overwritten inode gets "orphanized". Later when we
   2277 *    process this overwritten inode, it is restored at a new place by moving
   2278 *    the orphan inode.
   2279 *
   2280 * sctx->send_progress tells this function at which point in time receiving
   2281 * would be.
   2282 */
   2283static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
   2284			struct fs_path *dest)
   2285{
   2286	int ret = 0;
   2287	struct fs_path *name = NULL;
   2288	u64 parent_inode = 0;
   2289	u64 parent_gen = 0;
   2290	int stop = 0;
   2291
   2292	name = fs_path_alloc();
   2293	if (!name) {
   2294		ret = -ENOMEM;
   2295		goto out;
   2296	}
   2297
   2298	dest->reversed = 1;
   2299	fs_path_reset(dest);
   2300
   2301	while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
   2302		struct waiting_dir_move *wdm;
   2303
   2304		fs_path_reset(name);
   2305
   2306		if (is_waiting_for_rm(sctx, ino, gen)) {
   2307			ret = gen_unique_name(sctx, ino, gen, name);
   2308			if (ret < 0)
   2309				goto out;
   2310			ret = fs_path_add_path(dest, name);
   2311			break;
   2312		}
   2313
   2314		wdm = get_waiting_dir_move(sctx, ino);
   2315		if (wdm && wdm->orphanized) {
   2316			ret = gen_unique_name(sctx, ino, gen, name);
   2317			stop = 1;
   2318		} else if (wdm) {
   2319			ret = get_first_ref(sctx->parent_root, ino,
   2320					    &parent_inode, &parent_gen, name);
   2321		} else {
   2322			ret = __get_cur_name_and_parent(sctx, ino, gen,
   2323							&parent_inode,
   2324							&parent_gen, name);
   2325			if (ret)
   2326				stop = 1;
   2327		}
   2328
   2329		if (ret < 0)
   2330			goto out;
   2331
   2332		ret = fs_path_add_path(dest, name);
   2333		if (ret < 0)
   2334			goto out;
   2335
   2336		ino = parent_inode;
   2337		gen = parent_gen;
   2338	}
   2339
   2340out:
   2341	fs_path_free(name);
   2342	if (!ret)
   2343		fs_path_unreverse(dest);
   2344	return ret;
   2345}
   2346
   2347/*
   2348 * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
   2349 */
   2350static int send_subvol_begin(struct send_ctx *sctx)
   2351{
   2352	int ret;
   2353	struct btrfs_root *send_root = sctx->send_root;
   2354	struct btrfs_root *parent_root = sctx->parent_root;
   2355	struct btrfs_path *path;
   2356	struct btrfs_key key;
   2357	struct btrfs_root_ref *ref;
   2358	struct extent_buffer *leaf;
   2359	char *name = NULL;
   2360	int namelen;
   2361
   2362	path = btrfs_alloc_path();
   2363	if (!path)
   2364		return -ENOMEM;
   2365
   2366	name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
   2367	if (!name) {
   2368		btrfs_free_path(path);
   2369		return -ENOMEM;
   2370	}
   2371
   2372	key.objectid = send_root->root_key.objectid;
   2373	key.type = BTRFS_ROOT_BACKREF_KEY;
   2374	key.offset = 0;
   2375
   2376	ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
   2377				&key, path, 1, 0);
   2378	if (ret < 0)
   2379		goto out;
   2380	if (ret) {
   2381		ret = -ENOENT;
   2382		goto out;
   2383	}
   2384
   2385	leaf = path->nodes[0];
   2386	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
   2387	if (key.type != BTRFS_ROOT_BACKREF_KEY ||
   2388	    key.objectid != send_root->root_key.objectid) {
   2389		ret = -ENOENT;
   2390		goto out;
   2391	}
   2392	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
   2393	namelen = btrfs_root_ref_name_len(leaf, ref);
   2394	read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
   2395	btrfs_release_path(path);
   2396
   2397	if (parent_root) {
   2398		ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
   2399		if (ret < 0)
   2400			goto out;
   2401	} else {
   2402		ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
   2403		if (ret < 0)
   2404			goto out;
   2405	}
   2406
   2407	TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
   2408
   2409	if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
   2410		TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
   2411			    sctx->send_root->root_item.received_uuid);
   2412	else
   2413		TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
   2414			    sctx->send_root->root_item.uuid);
   2415
   2416	TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
   2417		    btrfs_root_ctransid(&sctx->send_root->root_item));
   2418	if (parent_root) {
   2419		if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
   2420			TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
   2421				     parent_root->root_item.received_uuid);
   2422		else
   2423			TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
   2424				     parent_root->root_item.uuid);
   2425		TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
   2426			    btrfs_root_ctransid(&sctx->parent_root->root_item));
   2427	}
   2428
   2429	ret = send_cmd(sctx);
   2430
   2431tlv_put_failure:
   2432out:
   2433	btrfs_free_path(path);
   2434	kfree(name);
   2435	return ret;
   2436}
   2437
   2438static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
   2439{
   2440	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
   2441	int ret = 0;
   2442	struct fs_path *p;
   2443
   2444	btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size);
   2445
   2446	p = fs_path_alloc();
   2447	if (!p)
   2448		return -ENOMEM;
   2449
   2450	ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
   2451	if (ret < 0)
   2452		goto out;
   2453
   2454	ret = get_cur_path(sctx, ino, gen, p);
   2455	if (ret < 0)
   2456		goto out;
   2457	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
   2458	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
   2459
   2460	ret = send_cmd(sctx);
   2461
   2462tlv_put_failure:
   2463out:
   2464	fs_path_free(p);
   2465	return ret;
   2466}
   2467
   2468static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
   2469{
   2470	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
   2471	int ret = 0;
   2472	struct fs_path *p;
   2473
   2474	btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode);
   2475
   2476	p = fs_path_alloc();
   2477	if (!p)
   2478		return -ENOMEM;
   2479
   2480	ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
   2481	if (ret < 0)
   2482		goto out;
   2483
   2484	ret = get_cur_path(sctx, ino, gen, p);
   2485	if (ret < 0)
   2486		goto out;
   2487	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
   2488	TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
   2489
   2490	ret = send_cmd(sctx);
   2491
   2492tlv_put_failure:
   2493out:
   2494	fs_path_free(p);
   2495	return ret;
   2496}
   2497
   2498static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
   2499{
   2500	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
   2501	int ret = 0;
   2502	struct fs_path *p;
   2503
   2504	btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu",
   2505		    ino, uid, gid);
   2506
   2507	p = fs_path_alloc();
   2508	if (!p)
   2509		return -ENOMEM;
   2510
   2511	ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
   2512	if (ret < 0)
   2513		goto out;
   2514
   2515	ret = get_cur_path(sctx, ino, gen, p);
   2516	if (ret < 0)
   2517		goto out;
   2518	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
   2519	TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
   2520	TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
   2521
   2522	ret = send_cmd(sctx);
   2523
   2524tlv_put_failure:
   2525out:
   2526	fs_path_free(p);
   2527	return ret;
   2528}
   2529
   2530static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
   2531{
   2532	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
   2533	int ret = 0;
   2534	struct fs_path *p = NULL;
   2535	struct btrfs_inode_item *ii;
   2536	struct btrfs_path *path = NULL;
   2537	struct extent_buffer *eb;
   2538	struct btrfs_key key;
   2539	int slot;
   2540
   2541	btrfs_debug(fs_info, "send_utimes %llu", ino);
   2542
   2543	p = fs_path_alloc();
   2544	if (!p)
   2545		return -ENOMEM;
   2546
   2547	path = alloc_path_for_send();
   2548	if (!path) {
   2549		ret = -ENOMEM;
   2550		goto out;
   2551	}
   2552
   2553	key.objectid = ino;
   2554	key.type = BTRFS_INODE_ITEM_KEY;
   2555	key.offset = 0;
   2556	ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
   2557	if (ret > 0)
   2558		ret = -ENOENT;
   2559	if (ret < 0)
   2560		goto out;
   2561
   2562	eb = path->nodes[0];
   2563	slot = path->slots[0];
   2564	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
   2565
   2566	ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
   2567	if (ret < 0)
   2568		goto out;
   2569
   2570	ret = get_cur_path(sctx, ino, gen, p);
   2571	if (ret < 0)
   2572		goto out;
   2573	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
   2574	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
   2575	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
   2576	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
   2577	/* TODO Add otime support when the otime patches get into upstream */
   2578
   2579	ret = send_cmd(sctx);
   2580
   2581tlv_put_failure:
   2582out:
   2583	fs_path_free(p);
   2584	btrfs_free_path(path);
   2585	return ret;
   2586}
   2587
   2588/*
   2589 * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
   2590 * a valid path yet because we did not process the refs yet. So, the inode
   2591 * is created as orphan.
   2592 */
   2593static int send_create_inode(struct send_ctx *sctx, u64 ino)
   2594{
   2595	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
   2596	int ret = 0;
   2597	struct fs_path *p;
   2598	int cmd;
   2599	u64 gen;
   2600	u64 mode;
   2601	u64 rdev;
   2602
   2603	btrfs_debug(fs_info, "send_create_inode %llu", ino);
   2604
   2605	p = fs_path_alloc();
   2606	if (!p)
   2607		return -ENOMEM;
   2608
   2609	if (ino != sctx->cur_ino) {
   2610		ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
   2611				     NULL, NULL, &rdev);
   2612		if (ret < 0)
   2613			goto out;
   2614	} else {
   2615		gen = sctx->cur_inode_gen;
   2616		mode = sctx->cur_inode_mode;
   2617		rdev = sctx->cur_inode_rdev;
   2618	}
   2619
   2620	if (S_ISREG(mode)) {
   2621		cmd = BTRFS_SEND_C_MKFILE;
   2622	} else if (S_ISDIR(mode)) {
   2623		cmd = BTRFS_SEND_C_MKDIR;
   2624	} else if (S_ISLNK(mode)) {
   2625		cmd = BTRFS_SEND_C_SYMLINK;
   2626	} else if (S_ISCHR(mode) || S_ISBLK(mode)) {
   2627		cmd = BTRFS_SEND_C_MKNOD;
   2628	} else if (S_ISFIFO(mode)) {
   2629		cmd = BTRFS_SEND_C_MKFIFO;
   2630	} else if (S_ISSOCK(mode)) {
   2631		cmd = BTRFS_SEND_C_MKSOCK;
   2632	} else {
   2633		btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
   2634				(int)(mode & S_IFMT));
   2635		ret = -EOPNOTSUPP;
   2636		goto out;
   2637	}
   2638
   2639	ret = begin_cmd(sctx, cmd);
   2640	if (ret < 0)
   2641		goto out;
   2642
   2643	ret = gen_unique_name(sctx, ino, gen, p);
   2644	if (ret < 0)
   2645		goto out;
   2646
   2647	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
   2648	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
   2649
   2650	if (S_ISLNK(mode)) {
   2651		fs_path_reset(p);
   2652		ret = read_symlink(sctx->send_root, ino, p);
   2653		if (ret < 0)
   2654			goto out;
   2655		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
   2656	} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
   2657		   S_ISFIFO(mode) || S_ISSOCK(mode)) {
   2658		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
   2659		TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
   2660	}
   2661
   2662	ret = send_cmd(sctx);
   2663	if (ret < 0)
   2664		goto out;
   2665
   2666
   2667tlv_put_failure:
   2668out:
   2669	fs_path_free(p);
   2670	return ret;
   2671}
   2672
   2673/*
   2674 * We need some special handling for inodes that get processed before the parent
   2675 * directory got created. See process_recorded_refs for details.
   2676 * This function does the check if we already created the dir out of order.
   2677 */
   2678static int did_create_dir(struct send_ctx *sctx, u64 dir)
   2679{
   2680	int ret = 0;
   2681	int iter_ret = 0;
   2682	struct btrfs_path *path = NULL;
   2683	struct btrfs_key key;
   2684	struct btrfs_key found_key;
   2685	struct btrfs_key di_key;
   2686	struct btrfs_dir_item *di;
   2687
   2688	path = alloc_path_for_send();
   2689	if (!path)
   2690		return -ENOMEM;
   2691
   2692	key.objectid = dir;
   2693	key.type = BTRFS_DIR_INDEX_KEY;
   2694	key.offset = 0;
   2695
   2696	btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) {
   2697		struct extent_buffer *eb = path->nodes[0];
   2698
   2699		if (found_key.objectid != key.objectid ||
   2700		    found_key.type != key.type) {
   2701			ret = 0;
   2702			break;
   2703		}
   2704
   2705		di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item);
   2706		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
   2707
   2708		if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
   2709		    di_key.objectid < sctx->send_progress) {
   2710			ret = 1;
   2711			break;
   2712		}
   2713	}
   2714	/* Catch error found during iteration */
   2715	if (iter_ret < 0)
   2716		ret = iter_ret;
   2717
   2718	btrfs_free_path(path);
   2719	return ret;
   2720}
   2721
   2722/*
   2723 * Only creates the inode if it is:
   2724 * 1. Not a directory
   2725 * 2. Or a directory which was not created already due to out of order
   2726 *    directories. See did_create_dir and process_recorded_refs for details.
   2727 */
   2728static int send_create_inode_if_needed(struct send_ctx *sctx)
   2729{
   2730	int ret;
   2731
   2732	if (S_ISDIR(sctx->cur_inode_mode)) {
   2733		ret = did_create_dir(sctx, sctx->cur_ino);
   2734		if (ret < 0)
   2735			return ret;
   2736		else if (ret > 0)
   2737			return 0;
   2738	}
   2739
   2740	return send_create_inode(sctx, sctx->cur_ino);
   2741}
   2742
   2743struct recorded_ref {
   2744	struct list_head list;
   2745	char *name;
   2746	struct fs_path *full_path;
   2747	u64 dir;
   2748	u64 dir_gen;
   2749	int name_len;
   2750};
   2751
   2752static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
   2753{
   2754	ref->full_path = path;
   2755	ref->name = (char *)kbasename(ref->full_path->start);
   2756	ref->name_len = ref->full_path->end - ref->name;
   2757}
   2758
   2759/*
   2760 * We need to process new refs before deleted refs, but compare_tree gives us
   2761 * everything mixed. So we first record all refs and later process them.
   2762 * This function is a helper to record one ref.
   2763 */
   2764static int __record_ref(struct list_head *head, u64 dir,
   2765		      u64 dir_gen, struct fs_path *path)
   2766{
   2767	struct recorded_ref *ref;
   2768
   2769	ref = kmalloc(sizeof(*ref), GFP_KERNEL);
   2770	if (!ref)
   2771		return -ENOMEM;
   2772
   2773	ref->dir = dir;
   2774	ref->dir_gen = dir_gen;
   2775	set_ref_path(ref, path);
   2776	list_add_tail(&ref->list, head);
   2777	return 0;
   2778}
   2779
   2780static int dup_ref(struct recorded_ref *ref, struct list_head *list)
   2781{
   2782	struct recorded_ref *new;
   2783
   2784	new = kmalloc(sizeof(*ref), GFP_KERNEL);
   2785	if (!new)
   2786		return -ENOMEM;
   2787
   2788	new->dir = ref->dir;
   2789	new->dir_gen = ref->dir_gen;
   2790	new->full_path = NULL;
   2791	INIT_LIST_HEAD(&new->list);
   2792	list_add_tail(&new->list, list);
   2793	return 0;
   2794}
   2795
   2796static void __free_recorded_refs(struct list_head *head)
   2797{
   2798	struct recorded_ref *cur;
   2799
   2800	while (!list_empty(head)) {
   2801		cur = list_entry(head->next, struct recorded_ref, list);
   2802		fs_path_free(cur->full_path);
   2803		list_del(&cur->list);
   2804		kfree(cur);
   2805	}
   2806}
   2807
   2808static void free_recorded_refs(struct send_ctx *sctx)
   2809{
   2810	__free_recorded_refs(&sctx->new_refs);
   2811	__free_recorded_refs(&sctx->deleted_refs);
   2812}
   2813
   2814/*
   2815 * Renames/moves a file/dir to its orphan name. Used when the first
   2816 * ref of an unprocessed inode gets overwritten and for all non empty
   2817 * directories.
   2818 */
   2819static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
   2820			  struct fs_path *path)
   2821{
   2822	int ret;
   2823	struct fs_path *orphan;
   2824
   2825	orphan = fs_path_alloc();
   2826	if (!orphan)
   2827		return -ENOMEM;
   2828
   2829	ret = gen_unique_name(sctx, ino, gen, orphan);
   2830	if (ret < 0)
   2831		goto out;
   2832
   2833	ret = send_rename(sctx, path, orphan);
   2834
   2835out:
   2836	fs_path_free(orphan);
   2837	return ret;
   2838}
   2839
   2840static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx,
   2841						   u64 dir_ino, u64 dir_gen)
   2842{
   2843	struct rb_node **p = &sctx->orphan_dirs.rb_node;
   2844	struct rb_node *parent = NULL;
   2845	struct orphan_dir_info *entry, *odi;
   2846
   2847	while (*p) {
   2848		parent = *p;
   2849		entry = rb_entry(parent, struct orphan_dir_info, node);
   2850		if (dir_ino < entry->ino)
   2851			p = &(*p)->rb_left;
   2852		else if (dir_ino > entry->ino)
   2853			p = &(*p)->rb_right;
   2854		else if (dir_gen < entry->gen)
   2855			p = &(*p)->rb_left;
   2856		else if (dir_gen > entry->gen)
   2857			p = &(*p)->rb_right;
   2858		else
   2859			return entry;
   2860	}
   2861
   2862	odi = kmalloc(sizeof(*odi), GFP_KERNEL);
   2863	if (!odi)
   2864		return ERR_PTR(-ENOMEM);
   2865	odi->ino = dir_ino;
   2866	odi->gen = dir_gen;
   2867	odi->last_dir_index_offset = 0;
   2868
   2869	rb_link_node(&odi->node, parent, p);
   2870	rb_insert_color(&odi->node, &sctx->orphan_dirs);
   2871	return odi;
   2872}
   2873
   2874static struct orphan_dir_info *get_orphan_dir_info(struct send_ctx *sctx,
   2875						   u64 dir_ino, u64 gen)
   2876{
   2877	struct rb_node *n = sctx->orphan_dirs.rb_node;
   2878	struct orphan_dir_info *entry;
   2879
   2880	while (n) {
   2881		entry = rb_entry(n, struct orphan_dir_info, node);
   2882		if (dir_ino < entry->ino)
   2883			n = n->rb_left;
   2884		else if (dir_ino > entry->ino)
   2885			n = n->rb_right;
   2886		else if (gen < entry->gen)
   2887			n = n->rb_left;
   2888		else if (gen > entry->gen)
   2889			n = n->rb_right;
   2890		else
   2891			return entry;
   2892	}
   2893	return NULL;
   2894}
   2895
   2896static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen)
   2897{
   2898	struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen);
   2899
   2900	return odi != NULL;
   2901}
   2902
   2903static void free_orphan_dir_info(struct send_ctx *sctx,
   2904				 struct orphan_dir_info *odi)
   2905{
   2906	if (!odi)
   2907		return;
   2908	rb_erase(&odi->node, &sctx->orphan_dirs);
   2909	kfree(odi);
   2910}
   2911
   2912/*
   2913 * Returns 1 if a directory can be removed at this point in time.
   2914 * We check this by iterating all dir items and checking if the inode behind
   2915 * the dir item was already processed.
   2916 */
   2917static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
   2918		     u64 send_progress)
   2919{
   2920	int ret = 0;
   2921	int iter_ret = 0;
   2922	struct btrfs_root *root = sctx->parent_root;
   2923	struct btrfs_path *path;
   2924	struct btrfs_key key;
   2925	struct btrfs_key found_key;
   2926	struct btrfs_key loc;
   2927	struct btrfs_dir_item *di;
   2928	struct orphan_dir_info *odi = NULL;
   2929
   2930	/*
   2931	 * Don't try to rmdir the top/root subvolume dir.
   2932	 */
   2933	if (dir == BTRFS_FIRST_FREE_OBJECTID)
   2934		return 0;
   2935
   2936	path = alloc_path_for_send();
   2937	if (!path)
   2938		return -ENOMEM;
   2939
   2940	key.objectid = dir;
   2941	key.type = BTRFS_DIR_INDEX_KEY;
   2942	key.offset = 0;
   2943
   2944	odi = get_orphan_dir_info(sctx, dir, dir_gen);
   2945	if (odi)
   2946		key.offset = odi->last_dir_index_offset;
   2947
   2948	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
   2949		struct waiting_dir_move *dm;
   2950
   2951		if (found_key.objectid != key.objectid ||
   2952		    found_key.type != key.type)
   2953			break;
   2954
   2955		di = btrfs_item_ptr(path->nodes[0], path->slots[0],
   2956				struct btrfs_dir_item);
   2957		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
   2958
   2959		dm = get_waiting_dir_move(sctx, loc.objectid);
   2960		if (dm) {
   2961			odi = add_orphan_dir_info(sctx, dir, dir_gen);
   2962			if (IS_ERR(odi)) {
   2963				ret = PTR_ERR(odi);
   2964				goto out;
   2965			}
   2966			odi->gen = dir_gen;
   2967			odi->last_dir_index_offset = found_key.offset;
   2968			dm->rmdir_ino = dir;
   2969			dm->rmdir_gen = dir_gen;
   2970			ret = 0;
   2971			goto out;
   2972		}
   2973
   2974		if (loc.objectid > send_progress) {
   2975			odi = add_orphan_dir_info(sctx, dir, dir_gen);
   2976			if (IS_ERR(odi)) {
   2977				ret = PTR_ERR(odi);
   2978				goto out;
   2979			}
   2980			odi->gen = dir_gen;
   2981			odi->last_dir_index_offset = found_key.offset;
   2982			ret = 0;
   2983			goto out;
   2984		}
   2985	}
   2986	if (iter_ret < 0) {
   2987		ret = iter_ret;
   2988		goto out;
   2989	}
   2990	free_orphan_dir_info(sctx, odi);
   2991
   2992	ret = 1;
   2993
   2994out:
   2995	btrfs_free_path(path);
   2996	return ret;
   2997}
   2998
   2999static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
   3000{
   3001	struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
   3002
   3003	return entry != NULL;
   3004}
   3005
   3006static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
   3007{
   3008	struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
   3009	struct rb_node *parent = NULL;
   3010	struct waiting_dir_move *entry, *dm;
   3011
   3012	dm = kmalloc(sizeof(*dm), GFP_KERNEL);
   3013	if (!dm)
   3014		return -ENOMEM;
   3015	dm->ino = ino;
   3016	dm->rmdir_ino = 0;
   3017	dm->rmdir_gen = 0;
   3018	dm->orphanized = orphanized;
   3019
   3020	while (*p) {
   3021		parent = *p;
   3022		entry = rb_entry(parent, struct waiting_dir_move, node);
   3023		if (ino < entry->ino) {
   3024			p = &(*p)->rb_left;
   3025		} else if (ino > entry->ino) {
   3026			p = &(*p)->rb_right;
   3027		} else {
   3028			kfree(dm);
   3029			return -EEXIST;
   3030		}
   3031	}
   3032
   3033	rb_link_node(&dm->node, parent, p);
   3034	rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
   3035	return 0;
   3036}
   3037
   3038static struct waiting_dir_move *
   3039get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
   3040{
   3041	struct rb_node *n = sctx->waiting_dir_moves.rb_node;
   3042	struct waiting_dir_move *entry;
   3043
   3044	while (n) {
   3045		entry = rb_entry(n, struct waiting_dir_move, node);
   3046		if (ino < entry->ino)
   3047			n = n->rb_left;
   3048		else if (ino > entry->ino)
   3049			n = n->rb_right;
   3050		else
   3051			return entry;
   3052	}
   3053	return NULL;
   3054}
   3055
   3056static void free_waiting_dir_move(struct send_ctx *sctx,
   3057				  struct waiting_dir_move *dm)
   3058{
   3059	if (!dm)
   3060		return;
   3061	rb_erase(&dm->node, &sctx->waiting_dir_moves);
   3062	kfree(dm);
   3063}
   3064
   3065static int add_pending_dir_move(struct send_ctx *sctx,
   3066				u64 ino,
   3067				u64 ino_gen,
   3068				u64 parent_ino,
   3069				struct list_head *new_refs,
   3070				struct list_head *deleted_refs,
   3071				const bool is_orphan)
   3072{
   3073	struct rb_node **p = &sctx->pending_dir_moves.rb_node;
   3074	struct rb_node *parent = NULL;
   3075	struct pending_dir_move *entry = NULL, *pm;
   3076	struct recorded_ref *cur;
   3077	int exists = 0;
   3078	int ret;
   3079
   3080	pm = kmalloc(sizeof(*pm), GFP_KERNEL);
   3081	if (!pm)
   3082		return -ENOMEM;
   3083	pm->parent_ino = parent_ino;
   3084	pm->ino = ino;
   3085	pm->gen = ino_gen;
   3086	INIT_LIST_HEAD(&pm->list);
   3087	INIT_LIST_HEAD(&pm->update_refs);
   3088	RB_CLEAR_NODE(&pm->node);
   3089
   3090	while (*p) {
   3091		parent = *p;
   3092		entry = rb_entry(parent, struct pending_dir_move, node);
   3093		if (parent_ino < entry->parent_ino) {
   3094			p = &(*p)->rb_left;
   3095		} else if (parent_ino > entry->parent_ino) {
   3096			p = &(*p)->rb_right;
   3097		} else {
   3098			exists = 1;
   3099			break;
   3100		}
   3101	}
   3102
   3103	list_for_each_entry(cur, deleted_refs, list) {
   3104		ret = dup_ref(cur, &pm->update_refs);
   3105		if (ret < 0)
   3106			goto out;
   3107	}
   3108	list_for_each_entry(cur, new_refs, list) {
   3109		ret = dup_ref(cur, &pm->update_refs);
   3110		if (ret < 0)
   3111			goto out;
   3112	}
   3113
   3114	ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
   3115	if (ret)
   3116		goto out;
   3117
   3118	if (exists) {
   3119		list_add_tail(&pm->list, &entry->list);
   3120	} else {
   3121		rb_link_node(&pm->node, parent, p);
   3122		rb_insert_color(&pm->node, &sctx->pending_dir_moves);
   3123	}
   3124	ret = 0;
   3125out:
   3126	if (ret) {
   3127		__free_recorded_refs(&pm->update_refs);
   3128		kfree(pm);
   3129	}
   3130	return ret;
   3131}
   3132
   3133static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
   3134						      u64 parent_ino)
   3135{
   3136	struct rb_node *n = sctx->pending_dir_moves.rb_node;
   3137	struct pending_dir_move *entry;
   3138
   3139	while (n) {
   3140		entry = rb_entry(n, struct pending_dir_move, node);
   3141		if (parent_ino < entry->parent_ino)
   3142			n = n->rb_left;
   3143		else if (parent_ino > entry->parent_ino)
   3144			n = n->rb_right;
   3145		else
   3146			return entry;
   3147	}
   3148	return NULL;
   3149}
   3150
   3151static int path_loop(struct send_ctx *sctx, struct fs_path *name,
   3152		     u64 ino, u64 gen, u64 *ancestor_ino)
   3153{
   3154	int ret = 0;
   3155	u64 parent_inode = 0;
   3156	u64 parent_gen = 0;
   3157	u64 start_ino = ino;
   3158
   3159	*ancestor_ino = 0;
   3160	while (ino != BTRFS_FIRST_FREE_OBJECTID) {
   3161		fs_path_reset(name);
   3162
   3163		if (is_waiting_for_rm(sctx, ino, gen))
   3164			break;
   3165		if (is_waiting_for_move(sctx, ino)) {
   3166			if (*ancestor_ino == 0)
   3167				*ancestor_ino = ino;
   3168			ret = get_first_ref(sctx->parent_root, ino,
   3169					    &parent_inode, &parent_gen, name);
   3170		} else {
   3171			ret = __get_cur_name_and_parent(sctx, ino, gen,
   3172							&parent_inode,
   3173							&parent_gen, name);
   3174			if (ret > 0) {
   3175				ret = 0;
   3176				break;
   3177			}
   3178		}
   3179		if (ret < 0)
   3180			break;
   3181		if (parent_inode == start_ino) {
   3182			ret = 1;
   3183			if (*ancestor_ino == 0)
   3184				*ancestor_ino = ino;
   3185			break;
   3186		}
   3187		ino = parent_inode;
   3188		gen = parent_gen;
   3189	}
   3190	return ret;
   3191}
   3192
   3193static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
   3194{
   3195	struct fs_path *from_path = NULL;
   3196	struct fs_path *to_path = NULL;
   3197	struct fs_path *name = NULL;
   3198	u64 orig_progress = sctx->send_progress;
   3199	struct recorded_ref *cur;
   3200	u64 parent_ino, parent_gen;
   3201	struct waiting_dir_move *dm = NULL;
   3202	u64 rmdir_ino = 0;
   3203	u64 rmdir_gen;
   3204	u64 ancestor;
   3205	bool is_orphan;
   3206	int ret;
   3207
   3208	name = fs_path_alloc();
   3209	from_path = fs_path_alloc();
   3210	if (!name || !from_path) {
   3211		ret = -ENOMEM;
   3212		goto out;
   3213	}
   3214
   3215	dm = get_waiting_dir_move(sctx, pm->ino);
   3216	ASSERT(dm);
   3217	rmdir_ino = dm->rmdir_ino;
   3218	rmdir_gen = dm->rmdir_gen;
   3219	is_orphan = dm->orphanized;
   3220	free_waiting_dir_move(sctx, dm);
   3221
   3222	if (is_orphan) {
   3223		ret = gen_unique_name(sctx, pm->ino,
   3224				      pm->gen, from_path);
   3225	} else {
   3226		ret = get_first_ref(sctx->parent_root, pm->ino,
   3227				    &parent_ino, &parent_gen, name);
   3228		if (ret < 0)
   3229			goto out;
   3230		ret = get_cur_path(sctx, parent_ino, parent_gen,
   3231				   from_path);
   3232		if (ret < 0)
   3233			goto out;
   3234		ret = fs_path_add_path(from_path, name);
   3235	}
   3236	if (ret < 0)
   3237		goto out;
   3238
   3239	sctx->send_progress = sctx->cur_ino + 1;
   3240	ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
   3241	if (ret < 0)
   3242		goto out;
   3243	if (ret) {
   3244		LIST_HEAD(deleted_refs);
   3245		ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
   3246		ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
   3247					   &pm->update_refs, &deleted_refs,
   3248					   is_orphan);
   3249		if (ret < 0)
   3250			goto out;
   3251		if (rmdir_ino) {
   3252			dm = get_waiting_dir_move(sctx, pm->ino);
   3253			ASSERT(dm);
   3254			dm->rmdir_ino = rmdir_ino;
   3255			dm->rmdir_gen = rmdir_gen;
   3256		}
   3257		goto out;
   3258	}
   3259	fs_path_reset(name);
   3260	to_path = name;
   3261	name = NULL;
   3262	ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
   3263	if (ret < 0)
   3264		goto out;
   3265
   3266	ret = send_rename(sctx, from_path, to_path);
   3267	if (ret < 0)
   3268		goto out;
   3269
   3270	if (rmdir_ino) {
   3271		struct orphan_dir_info *odi;
   3272		u64 gen;
   3273
   3274		odi = get_orphan_dir_info(sctx, rmdir_ino, rmdir_gen);
   3275		if (!odi) {
   3276			/* already deleted */
   3277			goto finish;
   3278		}
   3279		gen = odi->gen;
   3280
   3281		ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino);
   3282		if (ret < 0)
   3283			goto out;
   3284		if (!ret)
   3285			goto finish;
   3286
   3287		name = fs_path_alloc();
   3288		if (!name) {
   3289			ret = -ENOMEM;
   3290			goto out;
   3291		}
   3292		ret = get_cur_path(sctx, rmdir_ino, gen, name);
   3293		if (ret < 0)
   3294			goto out;
   3295		ret = send_rmdir(sctx, name);
   3296		if (ret < 0)
   3297			goto out;
   3298	}
   3299
   3300finish:
   3301	ret = send_utimes(sctx, pm->ino, pm->gen);
   3302	if (ret < 0)
   3303		goto out;
   3304
   3305	/*
   3306	 * After rename/move, need to update the utimes of both new parent(s)
   3307	 * and old parent(s).
   3308	 */
   3309	list_for_each_entry(cur, &pm->update_refs, list) {
   3310		/*
   3311		 * The parent inode might have been deleted in the send snapshot
   3312		 */
   3313		ret = get_inode_info(sctx->send_root, cur->dir, NULL,
   3314				     NULL, NULL, NULL, NULL, NULL);
   3315		if (ret == -ENOENT) {
   3316			ret = 0;
   3317			continue;
   3318		}
   3319		if (ret < 0)
   3320			goto out;
   3321
   3322		ret = send_utimes(sctx, cur->dir, cur->dir_gen);
   3323		if (ret < 0)
   3324			goto out;
   3325	}
   3326
   3327out:
   3328	fs_path_free(name);
   3329	fs_path_free(from_path);
   3330	fs_path_free(to_path);
   3331	sctx->send_progress = orig_progress;
   3332
   3333	return ret;
   3334}
   3335
   3336static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
   3337{
   3338	if (!list_empty(&m->list))
   3339		list_del(&m->list);
   3340	if (!RB_EMPTY_NODE(&m->node))
   3341		rb_erase(&m->node, &sctx->pending_dir_moves);
   3342	__free_recorded_refs(&m->update_refs);
   3343	kfree(m);
   3344}
   3345
   3346static void tail_append_pending_moves(struct send_ctx *sctx,
   3347				      struct pending_dir_move *moves,
   3348				      struct list_head *stack)
   3349{
   3350	if (list_empty(&moves->list)) {
   3351		list_add_tail(&moves->list, stack);
   3352	} else {
   3353		LIST_HEAD(list);
   3354		list_splice_init(&moves->list, &list);
   3355		list_add_tail(&moves->list, stack);
   3356		list_splice_tail(&list, stack);
   3357	}
   3358	if (!RB_EMPTY_NODE(&moves->node)) {
   3359		rb_erase(&moves->node, &sctx->pending_dir_moves);
   3360		RB_CLEAR_NODE(&moves->node);
   3361	}
   3362}
   3363
   3364static int apply_children_dir_moves(struct send_ctx *sctx)
   3365{
   3366	struct pending_dir_move *pm;
   3367	struct list_head stack;
   3368	u64 parent_ino = sctx->cur_ino;
   3369	int ret = 0;
   3370
   3371	pm = get_pending_dir_moves(sctx, parent_ino);
   3372	if (!pm)
   3373		return 0;
   3374
   3375	INIT_LIST_HEAD(&stack);
   3376	tail_append_pending_moves(sctx, pm, &stack);
   3377
   3378	while (!list_empty(&stack)) {
   3379		pm = list_first_entry(&stack, struct pending_dir_move, list);
   3380		parent_ino = pm->ino;
   3381		ret = apply_dir_move(sctx, pm);
   3382		free_pending_move(sctx, pm);
   3383		if (ret)
   3384			goto out;
   3385		pm = get_pending_dir_moves(sctx, parent_ino);
   3386		if (pm)
   3387			tail_append_pending_moves(sctx, pm, &stack);
   3388	}
   3389	return 0;
   3390
   3391out:
   3392	while (!list_empty(&stack)) {
   3393		pm = list_first_entry(&stack, struct pending_dir_move, list);
   3394		free_pending_move(sctx, pm);
   3395	}
   3396	return ret;
   3397}
   3398
   3399/*
   3400 * We might need to delay a directory rename even when no ancestor directory
   3401 * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
   3402 * renamed. This happens when we rename a directory to the old name (the name
   3403 * in the parent root) of some other unrelated directory that got its rename
   3404 * delayed due to some ancestor with higher number that got renamed.
   3405 *
   3406 * Example:
   3407 *
   3408 * Parent snapshot:
   3409 * .                                       (ino 256)
   3410 * |---- a/                                (ino 257)
   3411 * |     |---- file                        (ino 260)
   3412 * |
   3413 * |---- b/                                (ino 258)
   3414 * |---- c/                                (ino 259)
   3415 *
   3416 * Send snapshot:
   3417 * .                                       (ino 256)
   3418 * |---- a/                                (ino 258)
   3419 * |---- x/                                (ino 259)
   3420 *       |---- y/                          (ino 257)
   3421 *             |----- file                 (ino 260)
   3422 *
   3423 * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
   3424 * from 'a' to 'x/y' happening first, which in turn depends on the rename of
   3425 * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
   3426 * must issue is:
   3427 *
   3428 * 1 - rename 259 from 'c' to 'x'
   3429 * 2 - rename 257 from 'a' to 'x/y'
   3430 * 3 - rename 258 from 'b' to 'a'
   3431 *
   3432 * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
   3433 * be done right away and < 0 on error.
   3434 */
   3435static int wait_for_dest_dir_move(struct send_ctx *sctx,
   3436				  struct recorded_ref *parent_ref,
   3437				  const bool is_orphan)
   3438{
   3439	struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info;
   3440	struct btrfs_path *path;
   3441	struct btrfs_key key;
   3442	struct btrfs_key di_key;
   3443	struct btrfs_dir_item *di;
   3444	u64 left_gen;
   3445	u64 right_gen;
   3446	int ret = 0;
   3447	struct waiting_dir_move *wdm;
   3448
   3449	if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
   3450		return 0;
   3451
   3452	path = alloc_path_for_send();
   3453	if (!path)
   3454		return -ENOMEM;
   3455
   3456	key.objectid = parent_ref->dir;
   3457	key.type = BTRFS_DIR_ITEM_KEY;
   3458	key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
   3459
   3460	ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
   3461	if (ret < 0) {
   3462		goto out;
   3463	} else if (ret > 0) {
   3464		ret = 0;
   3465		goto out;
   3466	}
   3467
   3468	di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name,
   3469				       parent_ref->name_len);
   3470	if (!di) {
   3471		ret = 0;
   3472		goto out;
   3473	}
   3474	/*
   3475	 * di_key.objectid has the number of the inode that has a dentry in the
   3476	 * parent directory with the same name that sctx->cur_ino is being
   3477	 * renamed to. We need to check if that inode is in the send root as
   3478	 * well and if it is currently marked as an inode with a pending rename,
   3479	 * if it is, we need to delay the rename of sctx->cur_ino as well, so
   3480	 * that it happens after that other inode is renamed.
   3481	 */
   3482	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
   3483	if (di_key.type != BTRFS_INODE_ITEM_KEY) {
   3484		ret = 0;
   3485		goto out;
   3486	}
   3487
   3488	ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
   3489			     &left_gen, NULL, NULL, NULL, NULL);
   3490	if (ret < 0)
   3491		goto out;
   3492	ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
   3493			     &right_gen, NULL, NULL, NULL, NULL);
   3494	if (ret < 0) {
   3495		if (ret == -ENOENT)
   3496			ret = 0;
   3497		goto out;
   3498	}
   3499
   3500	/* Different inode, no need to delay the rename of sctx->cur_ino */
   3501	if (right_gen != left_gen) {
   3502		ret = 0;
   3503		goto out;
   3504	}
   3505
   3506	wdm = get_waiting_dir_move(sctx, di_key.objectid);
   3507	if (wdm && !wdm->orphanized) {
   3508		ret = add_pending_dir_move(sctx,
   3509					   sctx->cur_ino,
   3510					   sctx->cur_inode_gen,
   3511					   di_key.objectid,
   3512					   &sctx->new_refs,
   3513					   &sctx->deleted_refs,
   3514					   is_orphan);
   3515		if (!ret)
   3516			ret = 1;
   3517	}
   3518out:
   3519	btrfs_free_path(path);
   3520	return ret;
   3521}
   3522
   3523/*
   3524 * Check if inode ino2, or any of its ancestors, is inode ino1.
   3525 * Return 1 if true, 0 if false and < 0 on error.
   3526 */
   3527static int check_ino_in_path(struct btrfs_root *root,
   3528			     const u64 ino1,
   3529			     const u64 ino1_gen,
   3530			     const u64 ino2,
   3531			     const u64 ino2_gen,
   3532			     struct fs_path *fs_path)
   3533{
   3534	u64 ino = ino2;
   3535
   3536	if (ino1 == ino2)
   3537		return ino1_gen == ino2_gen;
   3538
   3539	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
   3540		u64 parent;
   3541		u64 parent_gen;
   3542		int ret;
   3543
   3544		fs_path_reset(fs_path);
   3545		ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
   3546		if (ret < 0)
   3547			return ret;
   3548		if (parent == ino1)
   3549			return parent_gen == ino1_gen;
   3550		ino = parent;
   3551	}
   3552	return 0;
   3553}
   3554
   3555/*
   3556 * Check if inode ino1 is an ancestor of inode ino2 in the given root for any
   3557 * possible path (in case ino2 is not a directory and has multiple hard links).
   3558 * Return 1 if true, 0 if false and < 0 on error.
   3559 */
   3560static int is_ancestor(struct btrfs_root *root,
   3561		       const u64 ino1,
   3562		       const u64 ino1_gen,
   3563		       const u64 ino2,
   3564		       struct fs_path *fs_path)
   3565{
   3566	bool free_fs_path = false;
   3567	int ret = 0;
   3568	int iter_ret = 0;
   3569	struct btrfs_path *path = NULL;
   3570	struct btrfs_key key;
   3571
   3572	if (!fs_path) {
   3573		fs_path = fs_path_alloc();
   3574		if (!fs_path)
   3575			return -ENOMEM;
   3576		free_fs_path = true;
   3577	}
   3578
   3579	path = alloc_path_for_send();
   3580	if (!path) {
   3581		ret = -ENOMEM;
   3582		goto out;
   3583	}
   3584
   3585	key.objectid = ino2;
   3586	key.type = BTRFS_INODE_REF_KEY;
   3587	key.offset = 0;
   3588
   3589	btrfs_for_each_slot(root, &key, &key, path, iter_ret) {
   3590		struct extent_buffer *leaf = path->nodes[0];
   3591		int slot = path->slots[0];
   3592		u32 cur_offset = 0;
   3593		u32 item_size;
   3594
   3595		if (key.objectid != ino2)
   3596			break;
   3597		if (key.type != BTRFS_INODE_REF_KEY &&
   3598		    key.type != BTRFS_INODE_EXTREF_KEY)
   3599			break;
   3600
   3601		item_size = btrfs_item_size(leaf, slot);
   3602		while (cur_offset < item_size) {
   3603			u64 parent;
   3604			u64 parent_gen;
   3605
   3606			if (key.type == BTRFS_INODE_EXTREF_KEY) {
   3607				unsigned long ptr;
   3608				struct btrfs_inode_extref *extref;
   3609
   3610				ptr = btrfs_item_ptr_offset(leaf, slot);
   3611				extref = (struct btrfs_inode_extref *)
   3612					(ptr + cur_offset);
   3613				parent = btrfs_inode_extref_parent(leaf,
   3614								   extref);
   3615				cur_offset += sizeof(*extref);
   3616				cur_offset += btrfs_inode_extref_name_len(leaf,
   3617								  extref);
   3618			} else {
   3619				parent = key.offset;
   3620				cur_offset = item_size;
   3621			}
   3622
   3623			ret = get_inode_info(root, parent, NULL, &parent_gen,
   3624					     NULL, NULL, NULL, NULL);
   3625			if (ret < 0)
   3626				goto out;
   3627			ret = check_ino_in_path(root, ino1, ino1_gen,
   3628						parent, parent_gen, fs_path);
   3629			if (ret)
   3630				goto out;
   3631		}
   3632	}
   3633	ret = 0;
   3634	if (iter_ret < 0)
   3635		ret = iter_ret;
   3636
   3637out:
   3638	btrfs_free_path(path);
   3639	if (free_fs_path)
   3640		fs_path_free(fs_path);
   3641	return ret;
   3642}
   3643
   3644static int wait_for_parent_move(struct send_ctx *sctx,
   3645				struct recorded_ref *parent_ref,
   3646				const bool is_orphan)
   3647{
   3648	int ret = 0;
   3649	u64 ino = parent_ref->dir;
   3650	u64 ino_gen = parent_ref->dir_gen;
   3651	u64 parent_ino_before, parent_ino_after;
   3652	struct fs_path *path_before = NULL;
   3653	struct fs_path *path_after = NULL;
   3654	int len1, len2;
   3655
   3656	path_after = fs_path_alloc();
   3657	path_before = fs_path_alloc();
   3658	if (!path_after || !path_before) {
   3659		ret = -ENOMEM;
   3660		goto out;
   3661	}
   3662
   3663	/*
   3664	 * Our current directory inode may not yet be renamed/moved because some
   3665	 * ancestor (immediate or not) has to be renamed/moved first. So find if
   3666	 * such ancestor exists and make sure our own rename/move happens after
   3667	 * that ancestor is processed to avoid path build infinite loops (done
   3668	 * at get_cur_path()).
   3669	 */
   3670	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
   3671		u64 parent_ino_after_gen;
   3672
   3673		if (is_waiting_for_move(sctx, ino)) {
   3674			/*
   3675			 * If the current inode is an ancestor of ino in the
   3676			 * parent root, we need to delay the rename of the
   3677			 * current inode, otherwise don't delayed the rename
   3678			 * because we can end up with a circular dependency
   3679			 * of renames, resulting in some directories never
   3680			 * getting the respective rename operations issued in
   3681			 * the send stream or getting into infinite path build
   3682			 * loops.
   3683			 */
   3684			ret = is_ancestor(sctx->parent_root,
   3685					  sctx->cur_ino, sctx->cur_inode_gen,
   3686					  ino, path_before);
   3687			if (ret)
   3688				break;
   3689		}
   3690
   3691		fs_path_reset(path_before);
   3692		fs_path_reset(path_after);
   3693
   3694		ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
   3695				    &parent_ino_after_gen, path_after);
   3696		if (ret < 0)
   3697			goto out;
   3698		ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
   3699				    NULL, path_before);
   3700		if (ret < 0 && ret != -ENOENT) {
   3701			goto out;
   3702		} else if (ret == -ENOENT) {
   3703			ret = 0;
   3704			break;
   3705		}
   3706
   3707		len1 = fs_path_len(path_before);
   3708		len2 = fs_path_len(path_after);
   3709		if (ino > sctx->cur_ino &&
   3710		    (parent_ino_before != parent_ino_after || len1 != len2 ||
   3711		     memcmp(path_before->start, path_after->start, len1))) {
   3712			u64 parent_ino_gen;
   3713
   3714			ret = get_inode_info(sctx->parent_root, ino, NULL,
   3715					     &parent_ino_gen, NULL, NULL, NULL,
   3716					     NULL);
   3717			if (ret < 0)
   3718				goto out;
   3719			if (ino_gen == parent_ino_gen) {
   3720				ret = 1;
   3721				break;
   3722			}
   3723		}
   3724		ino = parent_ino_after;
   3725		ino_gen = parent_ino_after_gen;
   3726	}
   3727
   3728out:
   3729	fs_path_free(path_before);
   3730	fs_path_free(path_after);
   3731
   3732	if (ret == 1) {
   3733		ret = add_pending_dir_move(sctx,
   3734					   sctx->cur_ino,
   3735					   sctx->cur_inode_gen,
   3736					   ino,
   3737					   &sctx->new_refs,
   3738					   &sctx->deleted_refs,
   3739					   is_orphan);
   3740		if (!ret)
   3741			ret = 1;
   3742	}
   3743
   3744	return ret;
   3745}
   3746
   3747static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
   3748{
   3749	int ret;
   3750	struct fs_path *new_path;
   3751
   3752	/*
   3753	 * Our reference's name member points to its full_path member string, so
   3754	 * we use here a new path.
   3755	 */
   3756	new_path = fs_path_alloc();
   3757	if (!new_path)
   3758		return -ENOMEM;
   3759
   3760	ret = get_cur_path(sctx, ref->dir, ref->dir_gen, new_path);
   3761	if (ret < 0) {
   3762		fs_path_free(new_path);
   3763		return ret;
   3764	}
   3765	ret = fs_path_add(new_path, ref->name, ref->name_len);
   3766	if (ret < 0) {
   3767		fs_path_free(new_path);
   3768		return ret;
   3769	}
   3770
   3771	fs_path_free(ref->full_path);
   3772	set_ref_path(ref, new_path);
   3773
   3774	return 0;
   3775}
   3776
   3777/*
   3778 * When processing the new references for an inode we may orphanize an existing
   3779 * directory inode because its old name conflicts with one of the new references
   3780 * of the current inode. Later, when processing another new reference of our
   3781 * inode, we might need to orphanize another inode, but the path we have in the
   3782 * reference reflects the pre-orphanization name of the directory we previously
   3783 * orphanized. For example:
   3784 *
   3785 * parent snapshot looks like:
   3786 *
   3787 * .                                     (ino 256)
   3788 * |----- f1                             (ino 257)
   3789 * |----- f2                             (ino 258)
   3790 * |----- d1/                            (ino 259)
   3791 *        |----- d2/                     (ino 260)
   3792 *
   3793 * send snapshot looks like:
   3794 *
   3795 * .                                     (ino 256)
   3796 * |----- d1                             (ino 258)
   3797 * |----- f2/                            (ino 259)
   3798 *        |----- f2_link/                (ino 260)
   3799 *        |       |----- f1              (ino 257)
   3800 *        |
   3801 *        |----- d2                      (ino 258)
   3802 *
   3803 * When processing inode 257 we compute the name for inode 259 as "d1", and we
   3804 * cache it in the name cache. Later when we start processing inode 258, when
   3805 * collecting all its new references we set a full path of "d1/d2" for its new
   3806 * reference with name "d2". When we start processing the new references we
   3807 * start by processing the new reference with name "d1", and this results in
   3808 * orphanizing inode 259, since its old reference causes a conflict. Then we
   3809 * move on the next new reference, with name "d2", and we find out we must
   3810 * orphanize inode 260, as its old reference conflicts with ours - but for the
   3811 * orphanization we use a source path corresponding to the path we stored in the
   3812 * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
   3813 * receiver fail since the path component "d1/" no longer exists, it was renamed
   3814 * to "o259-6-0/" when processing the previous new reference. So in this case we
   3815 * must recompute the path in the new reference and use it for the new
   3816 * orphanization operation.
   3817 */
   3818static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
   3819{
   3820	char *name;
   3821	int ret;
   3822
   3823	name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
   3824	if (!name)
   3825		return -ENOMEM;
   3826
   3827	fs_path_reset(ref->full_path);
   3828	ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
   3829	if (ret < 0)
   3830		goto out;
   3831
   3832	ret = fs_path_add(ref->full_path, name, ref->name_len);
   3833	if (ret < 0)
   3834		goto out;
   3835
   3836	/* Update the reference's base name pointer. */
   3837	set_ref_path(ref, ref->full_path);
   3838out:
   3839	kfree(name);
   3840	return ret;
   3841}
   3842
   3843/*
   3844 * This does all the move/link/unlink/rmdir magic.
   3845 */
   3846static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
   3847{
   3848	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
   3849	int ret = 0;
   3850	struct recorded_ref *cur;
   3851	struct recorded_ref *cur2;
   3852	struct list_head check_dirs;
   3853	struct fs_path *valid_path = NULL;
   3854	u64 ow_inode = 0;
   3855	u64 ow_gen;
   3856	u64 ow_mode;
   3857	int did_overwrite = 0;
   3858	int is_orphan = 0;
   3859	u64 last_dir_ino_rm = 0;
   3860	bool can_rename = true;
   3861	bool orphanized_dir = false;
   3862	bool orphanized_ancestor = false;
   3863
   3864	btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);
   3865
   3866	/*
   3867	 * This should never happen as the root dir always has the same ref
   3868	 * which is always '..'
   3869	 */
   3870	BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
   3871	INIT_LIST_HEAD(&check_dirs);
   3872
   3873	valid_path = fs_path_alloc();
   3874	if (!valid_path) {
   3875		ret = -ENOMEM;
   3876		goto out;
   3877	}
   3878
   3879	/*
   3880	 * First, check if the first ref of the current inode was overwritten
   3881	 * before. If yes, we know that the current inode was already orphanized
   3882	 * and thus use the orphan name. If not, we can use get_cur_path to
   3883	 * get the path of the first ref as it would like while receiving at
   3884	 * this point in time.
   3885	 * New inodes are always orphan at the beginning, so force to use the
   3886	 * orphan name in this case.
   3887	 * The first ref is stored in valid_path and will be updated if it
   3888	 * gets moved around.
   3889	 */
   3890	if (!sctx->cur_inode_new) {
   3891		ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
   3892				sctx->cur_inode_gen);
   3893		if (ret < 0)
   3894			goto out;
   3895		if (ret)
   3896			did_overwrite = 1;
   3897	}
   3898	if (sctx->cur_inode_new || did_overwrite) {
   3899		ret = gen_unique_name(sctx, sctx->cur_ino,
   3900				sctx->cur_inode_gen, valid_path);
   3901		if (ret < 0)
   3902			goto out;
   3903		is_orphan = 1;
   3904	} else {
   3905		ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
   3906				valid_path);
   3907		if (ret < 0)
   3908			goto out;
   3909	}
   3910
   3911	/*
   3912	 * Before doing any rename and link operations, do a first pass on the
   3913	 * new references to orphanize any unprocessed inodes that may have a
   3914	 * reference that conflicts with one of the new references of the current
   3915	 * inode. This needs to happen first because a new reference may conflict
   3916	 * with the old reference of a parent directory, so we must make sure
   3917	 * that the path used for link and rename commands don't use an
   3918	 * orphanized name when an ancestor was not yet orphanized.
   3919	 *
   3920	 * Example:
   3921	 *
   3922	 * Parent snapshot:
   3923	 *
   3924	 * .                                                      (ino 256)
   3925	 * |----- testdir/                                        (ino 259)
   3926	 * |          |----- a                                    (ino 257)
   3927	 * |
   3928	 * |----- b                                               (ino 258)
   3929	 *
   3930	 * Send snapshot:
   3931	 *
   3932	 * .                                                      (ino 256)
   3933	 * |----- testdir_2/                                      (ino 259)
   3934	 * |          |----- a                                    (ino 260)
   3935	 * |
   3936	 * |----- testdir                                         (ino 257)
   3937	 * |----- b                                               (ino 257)
   3938	 * |----- b2                                              (ino 258)
   3939	 *
   3940	 * Processing the new reference for inode 257 with name "b" may happen
   3941	 * before processing the new reference with name "testdir". If so, we
   3942	 * must make sure that by the time we send a link command to create the
   3943	 * hard link "b", inode 259 was already orphanized, since the generated
   3944	 * path in "valid_path" already contains the orphanized name for 259.
   3945	 * We are processing inode 257, so only later when processing 259 we do
   3946	 * the rename operation to change its temporary (orphanized) name to
   3947	 * "testdir_2".
   3948	 */
   3949	list_for_each_entry(cur, &sctx->new_refs, list) {
   3950		ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
   3951		if (ret < 0)
   3952			goto out;
   3953		if (ret == inode_state_will_create)
   3954			continue;
   3955
   3956		/*
   3957		 * Check if this new ref would overwrite the first ref of another
   3958		 * unprocessed inode. If yes, orphanize the overwritten inode.
   3959		 * If we find an overwritten ref that is not the first ref,
   3960		 * simply unlink it.
   3961		 */
   3962		ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
   3963				cur->name, cur->name_len,
   3964				&ow_inode, &ow_gen, &ow_mode);
   3965		if (ret < 0)
   3966			goto out;
   3967		if (ret) {
   3968			ret = is_first_ref(sctx->parent_root,
   3969					   ow_inode, cur->dir, cur->name,
   3970					   cur->name_len);
   3971			if (ret < 0)
   3972				goto out;
   3973			if (ret) {
   3974				struct name_cache_entry *nce;
   3975				struct waiting_dir_move *wdm;
   3976
   3977				if (orphanized_dir) {
   3978					ret = refresh_ref_path(sctx, cur);
   3979					if (ret < 0)
   3980						goto out;
   3981				}
   3982
   3983				ret = orphanize_inode(sctx, ow_inode, ow_gen,
   3984						cur->full_path);
   3985				if (ret < 0)
   3986					goto out;
   3987				if (S_ISDIR(ow_mode))
   3988					orphanized_dir = true;
   3989
   3990				/*
   3991				 * If ow_inode has its rename operation delayed
   3992				 * make sure that its orphanized name is used in
   3993				 * the source path when performing its rename
   3994				 * operation.
   3995				 */
   3996				if (is_waiting_for_move(sctx, ow_inode)) {
   3997					wdm = get_waiting_dir_move(sctx,
   3998								   ow_inode);
   3999					ASSERT(wdm);
   4000					wdm->orphanized = true;
   4001				}
   4002
   4003				/*
   4004				 * Make sure we clear our orphanized inode's
   4005				 * name from the name cache. This is because the
   4006				 * inode ow_inode might be an ancestor of some
   4007				 * other inode that will be orphanized as well
   4008				 * later and has an inode number greater than
   4009				 * sctx->send_progress. We need to prevent
   4010				 * future name lookups from using the old name
   4011				 * and get instead the orphan name.
   4012				 */
   4013				nce = name_cache_search(sctx, ow_inode, ow_gen);
   4014				if (nce) {
   4015					name_cache_delete(sctx, nce);
   4016					kfree(nce);
   4017				}
   4018
   4019				/*
   4020				 * ow_inode might currently be an ancestor of
   4021				 * cur_ino, therefore compute valid_path (the
   4022				 * current path of cur_ino) again because it
   4023				 * might contain the pre-orphanization name of
   4024				 * ow_inode, which is no longer valid.
   4025				 */
   4026				ret = is_ancestor(sctx->parent_root,
   4027						  ow_inode, ow_gen,
   4028						  sctx->cur_ino, NULL);
   4029				if (ret > 0) {
   4030					orphanized_ancestor = true;
   4031					fs_path_reset(valid_path);
   4032					ret = get_cur_path(sctx, sctx->cur_ino,
   4033							   sctx->cur_inode_gen,
   4034							   valid_path);
   4035				}
   4036				if (ret < 0)
   4037					goto out;
   4038			} else {
   4039				/*
   4040				 * If we previously orphanized a directory that
   4041				 * collided with a new reference that we already
   4042				 * processed, recompute the current path because
   4043				 * that directory may be part of the path.
   4044				 */
   4045				if (orphanized_dir) {
   4046					ret = refresh_ref_path(sctx, cur);
   4047					if (ret < 0)
   4048						goto out;
   4049				}
   4050				ret = send_unlink(sctx, cur->full_path);
   4051				if (ret < 0)
   4052					goto out;
   4053			}
   4054		}
   4055
   4056	}
   4057
   4058	list_for_each_entry(cur, &sctx->new_refs, list) {
   4059		/*
   4060		 * We may have refs where the parent directory does not exist
   4061		 * yet. This happens if the parent directories inum is higher
   4062		 * than the current inum. To handle this case, we create the
   4063		 * parent directory out of order. But we need to check if this
   4064		 * did already happen before due to other refs in the same dir.
   4065		 */
   4066		ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
   4067		if (ret < 0)
   4068			goto out;
   4069		if (ret == inode_state_will_create) {
   4070			ret = 0;
   4071			/*
   4072			 * First check if any of the current inodes refs did
   4073			 * already create the dir.
   4074			 */
   4075			list_for_each_entry(cur2, &sctx->new_refs, list) {
   4076				if (cur == cur2)
   4077					break;
   4078				if (cur2->dir == cur->dir) {
   4079					ret = 1;
   4080					break;
   4081				}
   4082			}
   4083
   4084			/*
   4085			 * If that did not happen, check if a previous inode
   4086			 * did already create the dir.
   4087			 */
   4088			if (!ret)
   4089				ret = did_create_dir(sctx, cur->dir);
   4090			if (ret < 0)
   4091				goto out;
   4092			if (!ret) {
   4093				ret = send_create_inode(sctx, cur->dir);
   4094				if (ret < 0)
   4095					goto out;
   4096			}
   4097		}
   4098
   4099		if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
   4100			ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
   4101			if (ret < 0)
   4102				goto out;
   4103			if (ret == 1) {
   4104				can_rename = false;
   4105				*pending_move = 1;
   4106			}
   4107		}
   4108
   4109		if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
   4110		    can_rename) {
   4111			ret = wait_for_parent_move(sctx, cur, is_orphan);
   4112			if (ret < 0)
   4113				goto out;
   4114			if (ret == 1) {
   4115				can_rename = false;
   4116				*pending_move = 1;
   4117			}
   4118		}
   4119
   4120		/*
   4121		 * link/move the ref to the new place. If we have an orphan
   4122		 * inode, move it and update valid_path. If not, link or move
   4123		 * it depending on the inode mode.
   4124		 */
   4125		if (is_orphan && can_rename) {
   4126			ret = send_rename(sctx, valid_path, cur->full_path);
   4127			if (ret < 0)
   4128				goto out;
   4129			is_orphan = 0;
   4130			ret = fs_path_copy(valid_path, cur->full_path);
   4131			if (ret < 0)
   4132				goto out;
   4133		} else if (can_rename) {
   4134			if (S_ISDIR(sctx->cur_inode_mode)) {
   4135				/*
   4136				 * Dirs can't be linked, so move it. For moved
   4137				 * dirs, we always have one new and one deleted
   4138				 * ref. The deleted ref is ignored later.
   4139				 */
   4140				ret = send_rename(sctx, valid_path,
   4141						  cur->full_path);
   4142				if (!ret)
   4143					ret = fs_path_copy(valid_path,
   4144							   cur->full_path);
   4145				if (ret < 0)
   4146					goto out;
   4147			} else {
   4148				/*
   4149				 * We might have previously orphanized an inode
   4150				 * which is an ancestor of our current inode,
   4151				 * so our reference's full path, which was
   4152				 * computed before any such orphanizations, must
   4153				 * be updated.
   4154				 */
   4155				if (orphanized_dir) {
   4156					ret = update_ref_path(sctx, cur);
   4157					if (ret < 0)
   4158						goto out;
   4159				}
   4160				ret = send_link(sctx, cur->full_path,
   4161						valid_path);
   4162				if (ret < 0)
   4163					goto out;
   4164			}
   4165		}
   4166		ret = dup_ref(cur, &check_dirs);
   4167		if (ret < 0)
   4168			goto out;
   4169	}
   4170
   4171	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
   4172		/*
   4173		 * Check if we can already rmdir the directory. If not,
   4174		 * orphanize it. For every dir item inside that gets deleted
   4175		 * later, we do this check again and rmdir it then if possible.
   4176		 * See the use of check_dirs for more details.
   4177		 */
   4178		ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
   4179				sctx->cur_ino);
   4180		if (ret < 0)
   4181			goto out;
   4182		if (ret) {
   4183			ret = send_rmdir(sctx, valid_path);
   4184			if (ret < 0)
   4185				goto out;
   4186		} else if (!is_orphan) {
   4187			ret = orphanize_inode(sctx, sctx->cur_ino,
   4188					sctx->cur_inode_gen, valid_path);
   4189			if (ret < 0)
   4190				goto out;
   4191			is_orphan = 1;
   4192		}
   4193
   4194		list_for_each_entry(cur, &sctx->deleted_refs, list) {
   4195			ret = dup_ref(cur, &check_dirs);
   4196			if (ret < 0)
   4197				goto out;
   4198		}
   4199	} else if (S_ISDIR(sctx->cur_inode_mode) &&
   4200		   !list_empty(&sctx->deleted_refs)) {
   4201		/*
   4202		 * We have a moved dir. Add the old parent to check_dirs
   4203		 */
   4204		cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
   4205				list);
   4206		ret = dup_ref(cur, &check_dirs);
   4207		if (ret < 0)
   4208			goto out;
   4209	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
   4210		/*
   4211		 * We have a non dir inode. Go through all deleted refs and
   4212		 * unlink them if they were not already overwritten by other
   4213		 * inodes.
   4214		 */
   4215		list_for_each_entry(cur, &sctx->deleted_refs, list) {
   4216			ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
   4217					sctx->cur_ino, sctx->cur_inode_gen,
   4218					cur->name, cur->name_len);
   4219			if (ret < 0)
   4220				goto out;
   4221			if (!ret) {
   4222				/*
   4223				 * If we orphanized any ancestor before, we need
   4224				 * to recompute the full path for deleted names,
   4225				 * since any such path was computed before we
   4226				 * processed any references and orphanized any
   4227				 * ancestor inode.
   4228				 */
   4229				if (orphanized_ancestor) {
   4230					ret = update_ref_path(sctx, cur);
   4231					if (ret < 0)
   4232						goto out;
   4233				}
   4234				ret = send_unlink(sctx, cur->full_path);
   4235				if (ret < 0)
   4236					goto out;
   4237			}
   4238			ret = dup_ref(cur, &check_dirs);
   4239			if (ret < 0)
   4240				goto out;
   4241		}
   4242		/*
   4243		 * If the inode is still orphan, unlink the orphan. This may
   4244		 * happen when a previous inode did overwrite the first ref
   4245		 * of this inode and no new refs were added for the current
   4246		 * inode. Unlinking does not mean that the inode is deleted in
   4247		 * all cases. There may still be links to this inode in other
   4248		 * places.
   4249		 */
   4250		if (is_orphan) {
   4251			ret = send_unlink(sctx, valid_path);
   4252			if (ret < 0)
   4253				goto out;
   4254		}
   4255	}
   4256
   4257	/*
   4258	 * We did collect all parent dirs where cur_inode was once located. We
   4259	 * now go through all these dirs and check if they are pending for
   4260	 * deletion and if it's finally possible to perform the rmdir now.
   4261	 * We also update the inode stats of the parent dirs here.
   4262	 */
   4263	list_for_each_entry(cur, &check_dirs, list) {
   4264		/*
   4265		 * In case we had refs into dirs that were not processed yet,
   4266		 * we don't need to do the utime and rmdir logic for these dirs.
   4267		 * The dir will be processed later.
   4268		 */
   4269		if (cur->dir > sctx->cur_ino)
   4270			continue;
   4271
   4272		ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
   4273		if (ret < 0)
   4274			goto out;
   4275
   4276		if (ret == inode_state_did_create ||
   4277		    ret == inode_state_no_change) {
   4278			/* TODO delayed utimes */
   4279			ret = send_utimes(sctx, cur->dir, cur->dir_gen);
   4280			if (ret < 0)
   4281				goto out;
   4282		} else if (ret == inode_state_did_delete &&
   4283			   cur->dir != last_dir_ino_rm) {
   4284			ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
   4285					sctx->cur_ino);
   4286			if (ret < 0)
   4287				goto out;
   4288			if (ret) {
   4289				ret = get_cur_path(sctx, cur->dir,
   4290						   cur->dir_gen, valid_path);
   4291				if (ret < 0)
   4292					goto out;
   4293				ret = send_rmdir(sctx, valid_path);
   4294				if (ret < 0)
   4295					goto out;
   4296				last_dir_ino_rm = cur->dir;
   4297			}
   4298		}
   4299	}
   4300
   4301	ret = 0;
   4302
   4303out:
   4304	__free_recorded_refs(&check_dirs);
   4305	free_recorded_refs(sctx);
   4306	fs_path_free(valid_path);
   4307	return ret;
   4308}
   4309
   4310static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name,
   4311		      void *ctx, struct list_head *refs)
   4312{
   4313	int ret = 0;
   4314	struct send_ctx *sctx = ctx;
   4315	struct fs_path *p;
   4316	u64 gen;
   4317
   4318	p = fs_path_alloc();
   4319	if (!p)
   4320		return -ENOMEM;
   4321
   4322	ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
   4323			NULL, NULL);
   4324	if (ret < 0)
   4325		goto out;
   4326
   4327	ret = get_cur_path(sctx, dir, gen, p);
   4328	if (ret < 0)
   4329		goto out;
   4330	ret = fs_path_add_path(p, name);
   4331	if (ret < 0)
   4332		goto out;
   4333
   4334	ret = __record_ref(refs, dir, gen, p);
   4335
   4336out:
   4337	if (ret)
   4338		fs_path_free(p);
   4339	return ret;
   4340}
   4341
   4342static int __record_new_ref(int num, u64 dir, int index,
   4343			    struct fs_path *name,
   4344			    void *ctx)
   4345{
   4346	struct send_ctx *sctx = ctx;
   4347	return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs);
   4348}
   4349
   4350
   4351static int __record_deleted_ref(int num, u64 dir, int index,
   4352				struct fs_path *name,
   4353				void *ctx)
   4354{
   4355	struct send_ctx *sctx = ctx;
   4356	return record_ref(sctx->parent_root, dir, name, ctx,
   4357			  &sctx->deleted_refs);
   4358}
   4359
   4360static int record_new_ref(struct send_ctx *sctx)
   4361{
   4362	int ret;
   4363
   4364	ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
   4365				sctx->cmp_key, 0, __record_new_ref, sctx);
   4366	if (ret < 0)
   4367		goto out;
   4368	ret = 0;
   4369
   4370out:
   4371	return ret;
   4372}
   4373
   4374static int record_deleted_ref(struct send_ctx *sctx)
   4375{
   4376	int ret;
   4377
   4378	ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
   4379				sctx->cmp_key, 0, __record_deleted_ref, sctx);
   4380	if (ret < 0)
   4381		goto out;
   4382	ret = 0;
   4383
   4384out:
   4385	return ret;
   4386}
   4387
   4388struct find_ref_ctx {
   4389	u64 dir;
   4390	u64 dir_gen;
   4391	struct btrfs_root *root;
   4392	struct fs_path *name;
   4393	int found_idx;
   4394};
   4395
   4396static int __find_iref(int num, u64 dir, int index,
   4397		       struct fs_path *name,
   4398		       void *ctx_)
   4399{
   4400	struct find_ref_ctx *ctx = ctx_;
   4401	u64 dir_gen;
   4402	int ret;
   4403
   4404	if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) &&
   4405	    strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) {
   4406		/*
   4407		 * To avoid doing extra lookups we'll only do this if everything
   4408		 * else matches.
   4409		 */
   4410		ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL,
   4411				     NULL, NULL, NULL);
   4412		if (ret)
   4413			return ret;
   4414		if (dir_gen != ctx->dir_gen)
   4415			return 0;
   4416		ctx->found_idx = num;
   4417		return 1;
   4418	}
   4419	return 0;
   4420}
   4421
   4422static int find_iref(struct btrfs_root *root,
   4423		     struct btrfs_path *path,
   4424		     struct btrfs_key *key,
   4425		     u64 dir, u64 dir_gen, struct fs_path *name)
   4426{
   4427	int ret;
   4428	struct find_ref_ctx ctx;
   4429
   4430	ctx.dir = dir;
   4431	ctx.name = name;
   4432	ctx.dir_gen = dir_gen;
   4433	ctx.found_idx = -1;
   4434	ctx.root = root;
   4435
   4436	ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
   4437	if (ret < 0)
   4438		return ret;
   4439
   4440	if (ctx.found_idx == -1)
   4441		return -ENOENT;
   4442
   4443	return ctx.found_idx;
   4444}
   4445
   4446static int __record_changed_new_ref(int num, u64 dir, int index,
   4447				    struct fs_path *name,
   4448				    void *ctx)
   4449{
   4450	u64 dir_gen;
   4451	int ret;
   4452	struct send_ctx *sctx = ctx;
   4453
   4454	ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
   4455			     NULL, NULL, NULL);
   4456	if (ret)
   4457		return ret;
   4458
   4459	ret = find_iref(sctx->parent_root, sctx->right_path,
   4460			sctx->cmp_key, dir, dir_gen, name);
   4461	if (ret == -ENOENT)
   4462		ret = __record_new_ref(num, dir, index, name, sctx);
   4463	else if (ret > 0)
   4464		ret = 0;
   4465
   4466	return ret;
   4467}
   4468
   4469static int __record_changed_deleted_ref(int num, u64 dir, int index,
   4470					struct fs_path *name,
   4471					void *ctx)
   4472{
   4473	u64 dir_gen;
   4474	int ret;
   4475	struct send_ctx *sctx = ctx;
   4476
   4477	ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
   4478			     NULL, NULL, NULL);
   4479	if (ret)
   4480		return ret;
   4481
   4482	ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
   4483			dir, dir_gen, name);
   4484	if (ret == -ENOENT)
   4485		ret = __record_deleted_ref(num, dir, index, name, sctx);
   4486	else if (ret > 0)
   4487		ret = 0;
   4488
   4489	return ret;
   4490}
   4491
   4492static int record_changed_ref(struct send_ctx *sctx)
   4493{
   4494	int ret = 0;
   4495
   4496	ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
   4497			sctx->cmp_key, 0, __record_changed_new_ref, sctx);
   4498	if (ret < 0)
   4499		goto out;
   4500	ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
   4501			sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
   4502	if (ret < 0)
   4503		goto out;
   4504	ret = 0;
   4505
   4506out:
   4507	return ret;
   4508}
   4509
   4510/*
   4511 * Record and process all refs at once. Needed when an inode changes the
   4512 * generation number, which means that it was deleted and recreated.
   4513 */
   4514static int process_all_refs(struct send_ctx *sctx,
   4515			    enum btrfs_compare_tree_result cmd)
   4516{
   4517	int ret = 0;
   4518	int iter_ret = 0;
   4519	struct btrfs_root *root;
   4520	struct btrfs_path *path;
   4521	struct btrfs_key key;
   4522	struct btrfs_key found_key;
   4523	iterate_inode_ref_t cb;
   4524	int pending_move = 0;
   4525
   4526	path = alloc_path_for_send();
   4527	if (!path)
   4528		return -ENOMEM;
   4529
   4530	if (cmd == BTRFS_COMPARE_TREE_NEW) {
   4531		root = sctx->send_root;
   4532		cb = __record_new_ref;
   4533	} else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
   4534		root = sctx->parent_root;
   4535		cb = __record_deleted_ref;
   4536	} else {
   4537		btrfs_err(sctx->send_root->fs_info,
   4538				"Wrong command %d in process_all_refs", cmd);
   4539		ret = -EINVAL;
   4540		goto out;
   4541	}
   4542
   4543	key.objectid = sctx->cmp_key->objectid;
   4544	key.type = BTRFS_INODE_REF_KEY;
   4545	key.offset = 0;
   4546	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
   4547		if (found_key.objectid != key.objectid ||
   4548		    (found_key.type != BTRFS_INODE_REF_KEY &&
   4549		     found_key.type != BTRFS_INODE_EXTREF_KEY))
   4550			break;
   4551
   4552		ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
   4553		if (ret < 0)
   4554			goto out;
   4555	}
   4556	/* Catch error found during iteration */
   4557	if (iter_ret < 0) {
   4558		ret = iter_ret;
   4559		goto out;
   4560	}
   4561	btrfs_release_path(path);
   4562
   4563	/*
   4564	 * We don't actually care about pending_move as we are simply
   4565	 * re-creating this inode and will be rename'ing it into place once we
   4566	 * rename the parent directory.
   4567	 */
   4568	ret = process_recorded_refs(sctx, &pending_move);
   4569out:
   4570	btrfs_free_path(path);
   4571	return ret;
   4572}
   4573
   4574static int send_set_xattr(struct send_ctx *sctx,
   4575			  struct fs_path *path,
   4576			  const char *name, int name_len,
   4577			  const char *data, int data_len)
   4578{
   4579	int ret = 0;
   4580
   4581	ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
   4582	if (ret < 0)
   4583		goto out;
   4584
   4585	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
   4586	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
   4587	TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
   4588
   4589	ret = send_cmd(sctx);
   4590
   4591tlv_put_failure:
   4592out:
   4593	return ret;
   4594}
   4595
   4596static int send_remove_xattr(struct send_ctx *sctx,
   4597			  struct fs_path *path,
   4598			  const char *name, int name_len)
   4599{
   4600	int ret = 0;
   4601
   4602	ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
   4603	if (ret < 0)
   4604		goto out;
   4605
   4606	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
   4607	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
   4608
   4609	ret = send_cmd(sctx);
   4610
   4611tlv_put_failure:
   4612out:
   4613	return ret;
   4614}
   4615
   4616static int __process_new_xattr(int num, struct btrfs_key *di_key,
   4617			       const char *name, int name_len, const char *data,
   4618			       int data_len, void *ctx)
   4619{
   4620	int ret;
   4621	struct send_ctx *sctx = ctx;
   4622	struct fs_path *p;
   4623	struct posix_acl_xattr_header dummy_acl;
   4624
   4625	/* Capabilities are emitted by finish_inode_if_needed */
   4626	if (!strncmp(name, XATTR_NAME_CAPS, name_len))
   4627		return 0;
   4628
   4629	p = fs_path_alloc();
   4630	if (!p)
   4631		return -ENOMEM;
   4632
   4633	/*
   4634	 * This hack is needed because empty acls are stored as zero byte
   4635	 * data in xattrs. Problem with that is, that receiving these zero byte
   4636	 * acls will fail later. To fix this, we send a dummy acl list that
   4637	 * only contains the version number and no entries.
   4638	 */
   4639	if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
   4640	    !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
   4641		if (data_len == 0) {
   4642			dummy_acl.a_version =
   4643					cpu_to_le32(POSIX_ACL_XATTR_VERSION);
   4644			data = (char *)&dummy_acl;
   4645			data_len = sizeof(dummy_acl);
   4646		}
   4647	}
   4648
   4649	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
   4650	if (ret < 0)
   4651		goto out;
   4652
   4653	ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
   4654
   4655out:
   4656	fs_path_free(p);
   4657	return ret;
   4658}
   4659
   4660static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
   4661				   const char *name, int name_len,
   4662				   const char *data, int data_len, void *ctx)
   4663{
   4664	int ret;
   4665	struct send_ctx *sctx = ctx;
   4666	struct fs_path *p;
   4667
   4668	p = fs_path_alloc();
   4669	if (!p)
   4670		return -ENOMEM;
   4671
   4672	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
   4673	if (ret < 0)
   4674		goto out;
   4675
   4676	ret = send_remove_xattr(sctx, p, name, name_len);
   4677
   4678out:
   4679	fs_path_free(p);
   4680	return ret;
   4681}
   4682
   4683static int process_new_xattr(struct send_ctx *sctx)
   4684{
   4685	int ret = 0;
   4686
   4687	ret = iterate_dir_item(sctx->send_root, sctx->left_path,
   4688			       __process_new_xattr, sctx);
   4689
   4690	return ret;
   4691}
   4692
   4693static int process_deleted_xattr(struct send_ctx *sctx)
   4694{
   4695	return iterate_dir_item(sctx->parent_root, sctx->right_path,
   4696				__process_deleted_xattr, sctx);
   4697}
   4698
   4699struct find_xattr_ctx {
   4700	const char *name;
   4701	int name_len;
   4702	int found_idx;
   4703	char *found_data;
   4704	int found_data_len;
   4705};
   4706
   4707static int __find_xattr(int num, struct btrfs_key *di_key, const char *name,
   4708			int name_len, const char *data, int data_len, void *vctx)
   4709{
   4710	struct find_xattr_ctx *ctx = vctx;
   4711
   4712	if (name_len == ctx->name_len &&
   4713	    strncmp(name, ctx->name, name_len) == 0) {
   4714		ctx->found_idx = num;
   4715		ctx->found_data_len = data_len;
   4716		ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
   4717		if (!ctx->found_data)
   4718			return -ENOMEM;
   4719		return 1;
   4720	}
   4721	return 0;
   4722}
   4723
   4724static int find_xattr(struct btrfs_root *root,
   4725		      struct btrfs_path *path,
   4726		      struct btrfs_key *key,
   4727		      const char *name, int name_len,
   4728		      char **data, int *data_len)
   4729{
   4730	int ret;
   4731	struct find_xattr_ctx ctx;
   4732
   4733	ctx.name = name;
   4734	ctx.name_len = name_len;
   4735	ctx.found_idx = -1;
   4736	ctx.found_data = NULL;
   4737	ctx.found_data_len = 0;
   4738
   4739	ret = iterate_dir_item(root, path, __find_xattr, &ctx);
   4740	if (ret < 0)
   4741		return ret;
   4742
   4743	if (ctx.found_idx == -1)
   4744		return -ENOENT;
   4745	if (data) {
   4746		*data = ctx.found_data;
   4747		*data_len = ctx.found_data_len;
   4748	} else {
   4749		kfree(ctx.found_data);
   4750	}
   4751	return ctx.found_idx;
   4752}
   4753
   4754
   4755static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
   4756				       const char *name, int name_len,
   4757				       const char *data, int data_len,
   4758				       void *ctx)
   4759{
   4760	int ret;
   4761	struct send_ctx *sctx = ctx;
   4762	char *found_data = NULL;
   4763	int found_data_len  = 0;
   4764
   4765	ret = find_xattr(sctx->parent_root, sctx->right_path,
   4766			 sctx->cmp_key, name, name_len, &found_data,
   4767			 &found_data_len);
   4768	if (ret == -ENOENT) {
   4769		ret = __process_new_xattr(num, di_key, name, name_len, data,
   4770					  data_len, ctx);
   4771	} else if (ret >= 0) {
   4772		if (data_len != found_data_len ||
   4773		    memcmp(data, found_data, data_len)) {
   4774			ret = __process_new_xattr(num, di_key, name, name_len,
   4775						  data, data_len, ctx);
   4776		} else {
   4777			ret = 0;
   4778		}
   4779	}
   4780
   4781	kfree(found_data);
   4782	return ret;
   4783}
   4784
   4785static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
   4786					   const char *name, int name_len,
   4787					   const char *data, int data_len,
   4788					   void *ctx)
   4789{
   4790	int ret;
   4791	struct send_ctx *sctx = ctx;
   4792
   4793	ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
   4794			 name, name_len, NULL, NULL);
   4795	if (ret == -ENOENT)
   4796		ret = __process_deleted_xattr(num, di_key, name, name_len, data,
   4797					      data_len, ctx);
   4798	else if (ret >= 0)
   4799		ret = 0;
   4800
   4801	return ret;
   4802}
   4803
   4804static int process_changed_xattr(struct send_ctx *sctx)
   4805{
   4806	int ret = 0;
   4807
   4808	ret = iterate_dir_item(sctx->send_root, sctx->left_path,
   4809			__process_changed_new_xattr, sctx);
   4810	if (ret < 0)
   4811		goto out;
   4812	ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
   4813			__process_changed_deleted_xattr, sctx);
   4814
   4815out:
   4816	return ret;
   4817}
   4818
   4819static int process_all_new_xattrs(struct send_ctx *sctx)
   4820{
   4821	int ret = 0;
   4822	int iter_ret = 0;
   4823	struct btrfs_root *root;
   4824	struct btrfs_path *path;
   4825	struct btrfs_key key;
   4826	struct btrfs_key found_key;
   4827
   4828	path = alloc_path_for_send();
   4829	if (!path)
   4830		return -ENOMEM;
   4831
   4832	root = sctx->send_root;
   4833
   4834	key.objectid = sctx->cmp_key->objectid;
   4835	key.type = BTRFS_XATTR_ITEM_KEY;
   4836	key.offset = 0;
   4837	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
   4838		if (found_key.objectid != key.objectid ||
   4839		    found_key.type != key.type) {
   4840			ret = 0;
   4841			break;
   4842		}
   4843
   4844		ret = iterate_dir_item(root, path, __process_new_xattr, sctx);
   4845		if (ret < 0)
   4846			break;
   4847	}
   4848	/* Catch error found during iteration */
   4849	if (iter_ret < 0)
   4850		ret = iter_ret;
   4851
   4852	btrfs_free_path(path);
   4853	return ret;
   4854}
   4855
   4856static inline u64 max_send_read_size(const struct send_ctx *sctx)
   4857{
   4858	return sctx->send_max_size - SZ_16K;
   4859}
   4860
   4861static int put_data_header(struct send_ctx *sctx, u32 len)
   4862{
   4863	struct btrfs_tlv_header *hdr;
   4864
   4865	if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
   4866		return -EOVERFLOW;
   4867	hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
   4868	put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
   4869	put_unaligned_le16(len, &hdr->tlv_len);
   4870	sctx->send_size += sizeof(*hdr);
   4871	return 0;
   4872}
   4873
   4874static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
   4875{
   4876	struct btrfs_root *root = sctx->send_root;
   4877	struct btrfs_fs_info *fs_info = root->fs_info;
   4878	struct page *page;
   4879	pgoff_t index = offset >> PAGE_SHIFT;
   4880	pgoff_t last_index;
   4881	unsigned pg_offset = offset_in_page(offset);
   4882	int ret;
   4883
   4884	ret = put_data_header(sctx, len);
   4885	if (ret)
   4886		return ret;
   4887
   4888	last_index = (offset + len - 1) >> PAGE_SHIFT;
   4889
   4890	while (index <= last_index) {
   4891		unsigned cur_len = min_t(unsigned, len,
   4892					 PAGE_SIZE - pg_offset);
   4893
   4894		page = find_lock_page(sctx->cur_inode->i_mapping, index);
   4895		if (!page) {
   4896			page_cache_sync_readahead(sctx->cur_inode->i_mapping,
   4897						  &sctx->ra, NULL, index,
   4898						  last_index + 1 - index);
   4899
   4900			page = find_or_create_page(sctx->cur_inode->i_mapping,
   4901						   index, GFP_KERNEL);
   4902			if (!page) {
   4903				ret = -ENOMEM;
   4904				break;
   4905			}
   4906		}
   4907
   4908		if (PageReadahead(page))
   4909			page_cache_async_readahead(sctx->cur_inode->i_mapping,
   4910						   &sctx->ra, NULL, page_folio(page),
   4911						   index, last_index + 1 - index);
   4912
   4913		if (!PageUptodate(page)) {
   4914			btrfs_read_folio(NULL, page_folio(page));
   4915			lock_page(page);
   4916			if (!PageUptodate(page)) {
   4917				unlock_page(page);
   4918				btrfs_err(fs_info,
   4919			"send: IO error at offset %llu for inode %llu root %llu",
   4920					page_offset(page), sctx->cur_ino,
   4921					sctx->send_root->root_key.objectid);
   4922				put_page(page);
   4923				ret = -EIO;
   4924				break;
   4925			}
   4926		}
   4927
   4928		memcpy_from_page(sctx->send_buf + sctx->send_size, page,
   4929				 pg_offset, cur_len);
   4930		unlock_page(page);
   4931		put_page(page);
   4932		index++;
   4933		pg_offset = 0;
   4934		len -= cur_len;
   4935		sctx->send_size += cur_len;
   4936	}
   4937
   4938	return ret;
   4939}
   4940
   4941/*
   4942 * Read some bytes from the current inode/file and send a write command to
   4943 * user space.
   4944 */
   4945static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
   4946{
   4947	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
   4948	int ret = 0;
   4949	struct fs_path *p;
   4950
   4951	p = fs_path_alloc();
   4952	if (!p)
   4953		return -ENOMEM;
   4954
   4955	btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
   4956
   4957	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
   4958	if (ret < 0)
   4959		goto out;
   4960
   4961	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
   4962	if (ret < 0)
   4963		goto out;
   4964
   4965	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
   4966	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
   4967	ret = put_file_data(sctx, offset, len);
   4968	if (ret < 0)
   4969		goto out;
   4970
   4971	ret = send_cmd(sctx);
   4972
   4973tlv_put_failure:
   4974out:
   4975	fs_path_free(p);
   4976	return ret;
   4977}
   4978
   4979/*
   4980 * Send a clone command to user space.
   4981 */
   4982static int send_clone(struct send_ctx *sctx,
   4983		      u64 offset, u32 len,
   4984		      struct clone_root *clone_root)
   4985{
   4986	int ret = 0;
   4987	struct fs_path *p;
   4988	u64 gen;
   4989
   4990	btrfs_debug(sctx->send_root->fs_info,
   4991		    "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
   4992		    offset, len, clone_root->root->root_key.objectid,
   4993		    clone_root->ino, clone_root->offset);
   4994
   4995	p = fs_path_alloc();
   4996	if (!p)
   4997		return -ENOMEM;
   4998
   4999	ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
   5000	if (ret < 0)
   5001		goto out;
   5002
   5003	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
   5004	if (ret < 0)
   5005		goto out;
   5006
   5007	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
   5008	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
   5009	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
   5010
   5011	if (clone_root->root == sctx->send_root) {
   5012		ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
   5013				&gen, NULL, NULL, NULL, NULL);
   5014		if (ret < 0)
   5015			goto out;
   5016		ret = get_cur_path(sctx, clone_root->ino, gen, p);
   5017	} else {
   5018		ret = get_inode_path(clone_root->root, clone_root->ino, p);
   5019	}
   5020	if (ret < 0)
   5021		goto out;
   5022
   5023	/*
   5024	 * If the parent we're using has a received_uuid set then use that as
   5025	 * our clone source as that is what we will look for when doing a
   5026	 * receive.
   5027	 *
   5028	 * This covers the case that we create a snapshot off of a received
   5029	 * subvolume and then use that as the parent and try to receive on a
   5030	 * different host.
   5031	 */
   5032	if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
   5033		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
   5034			     clone_root->root->root_item.received_uuid);
   5035	else
   5036		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
   5037			     clone_root->root->root_item.uuid);
   5038	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
   5039		    btrfs_root_ctransid(&clone_root->root->root_item));
   5040	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
   5041	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
   5042			clone_root->offset);
   5043
   5044	ret = send_cmd(sctx);
   5045
   5046tlv_put_failure:
   5047out:
   5048	fs_path_free(p);
   5049	return ret;
   5050}
   5051
   5052/*
   5053 * Send an update extent command to user space.
   5054 */
   5055static int send_update_extent(struct send_ctx *sctx,
   5056			      u64 offset, u32 len)
   5057{
   5058	int ret = 0;
   5059	struct fs_path *p;
   5060
   5061	p = fs_path_alloc();
   5062	if (!p)
   5063		return -ENOMEM;
   5064
   5065	ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
   5066	if (ret < 0)
   5067		goto out;
   5068
   5069	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
   5070	if (ret < 0)
   5071		goto out;
   5072
   5073	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
   5074	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
   5075	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
   5076
   5077	ret = send_cmd(sctx);
   5078
   5079tlv_put_failure:
   5080out:
   5081	fs_path_free(p);
   5082	return ret;
   5083}
   5084
   5085static int send_hole(struct send_ctx *sctx, u64 end)
   5086{
   5087	struct fs_path *p = NULL;
   5088	u64 read_size = max_send_read_size(sctx);
   5089	u64 offset = sctx->cur_inode_last_extent;
   5090	int ret = 0;
   5091
   5092	/*
   5093	 * A hole that starts at EOF or beyond it. Since we do not yet support
   5094	 * fallocate (for extent preallocation and hole punching), sending a
   5095	 * write of zeroes starting at EOF or beyond would later require issuing
   5096	 * a truncate operation which would undo the write and achieve nothing.
   5097	 */
   5098	if (offset >= sctx->cur_inode_size)
   5099		return 0;
   5100
   5101	/*
   5102	 * Don't go beyond the inode's i_size due to prealloc extents that start
   5103	 * after the i_size.
   5104	 */
   5105	end = min_t(u64, end, sctx->cur_inode_size);
   5106
   5107	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
   5108		return send_update_extent(sctx, offset, end - offset);
   5109
   5110	p = fs_path_alloc();
   5111	if (!p)
   5112		return -ENOMEM;
   5113	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
   5114	if (ret < 0)
   5115		goto tlv_put_failure;
   5116	while (offset < end) {
   5117		u64 len = min(end - offset, read_size);
   5118
   5119		ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
   5120		if (ret < 0)
   5121			break;
   5122		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
   5123		TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
   5124		ret = put_data_header(sctx, len);
   5125		if (ret < 0)
   5126			break;
   5127		memset(sctx->send_buf + sctx->send_size, 0, len);
   5128		sctx->send_size += len;
   5129		ret = send_cmd(sctx);
   5130		if (ret < 0)
   5131			break;
   5132		offset += len;
   5133	}
   5134	sctx->cur_inode_next_write_offset = offset;
   5135tlv_put_failure:
   5136	fs_path_free(p);
   5137	return ret;
   5138}
   5139
   5140static int send_extent_data(struct send_ctx *sctx,
   5141			    const u64 offset,
   5142			    const u64 len)
   5143{
   5144	const u64 end = offset + len;
   5145	u64 read_size = max_send_read_size(sctx);
   5146	u64 sent = 0;
   5147
   5148	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
   5149		return send_update_extent(sctx, offset, len);
   5150
   5151	if (sctx->cur_inode == NULL) {
   5152		struct btrfs_root *root = sctx->send_root;
   5153
   5154		sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root);
   5155		if (IS_ERR(sctx->cur_inode)) {
   5156			int err = PTR_ERR(sctx->cur_inode);
   5157
   5158			sctx->cur_inode = NULL;
   5159			return err;
   5160		}
   5161		memset(&sctx->ra, 0, sizeof(struct file_ra_state));
   5162		file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
   5163
   5164		/*
   5165		 * It's very likely there are no pages from this inode in the page
   5166		 * cache, so after reading extents and sending their data, we clean
   5167		 * the page cache to avoid trashing the page cache (adding pressure
   5168		 * to the page cache and forcing eviction of other data more useful
   5169		 * for applications).
   5170		 *
   5171		 * We decide if we should clean the page cache simply by checking
   5172		 * if the inode's mapping nrpages is 0 when we first open it, and
   5173		 * not by using something like filemap_range_has_page() before
   5174		 * reading an extent because when we ask the readahead code to
   5175		 * read a given file range, it may (and almost always does) read
   5176		 * pages from beyond that range (see the documentation for
   5177		 * page_cache_sync_readahead()), so it would not be reliable,
   5178		 * because after reading the first extent future calls to
   5179		 * filemap_range_has_page() would return true because the readahead
   5180		 * on the previous extent resulted in reading pages of the current
   5181		 * extent as well.
   5182		 */
   5183		sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
   5184		sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
   5185	}
   5186
   5187	while (sent < len) {
   5188		u64 size = min(len - sent, read_size);
   5189		int ret;
   5190
   5191		ret = send_write(sctx, offset + sent, size);
   5192		if (ret < 0)
   5193			return ret;
   5194		sent += size;
   5195	}
   5196
   5197	if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) {
   5198		/*
   5199		 * Always operate only on ranges that are a multiple of the page
   5200		 * size. This is not only to prevent zeroing parts of a page in
   5201		 * the case of subpage sector size, but also to guarantee we evict
   5202		 * pages, as passing a range that is smaller than page size does
   5203		 * not evict the respective page (only zeroes part of its content).
   5204		 *
   5205		 * Always start from the end offset of the last range cleared.
   5206		 * This is because the readahead code may (and very often does)
   5207		 * reads pages beyond the range we request for readahead. So if
   5208		 * we have an extent layout like this:
   5209		 *
   5210		 *            [ extent A ] [ extent B ] [ extent C ]
   5211		 *
   5212		 * When we ask page_cache_sync_readahead() to read extent A, it
   5213		 * may also trigger reads for pages of extent B. If we are doing
   5214		 * an incremental send and extent B has not changed between the
   5215		 * parent and send snapshots, some or all of its pages may end
   5216		 * up being read and placed in the page cache. So when truncating
   5217		 * the page cache we always start from the end offset of the
   5218		 * previously processed extent up to the end of the current
   5219		 * extent.
   5220		 */
   5221		truncate_inode_pages_range(&sctx->cur_inode->i_data,
   5222					   sctx->page_cache_clear_start,
   5223					   end - 1);
   5224		sctx->page_cache_clear_start = end;
   5225	}
   5226
   5227	return 0;
   5228}
   5229
   5230/*
   5231 * Search for a capability xattr related to sctx->cur_ino. If the capability is
   5232 * found, call send_set_xattr function to emit it.
   5233 *
   5234 * Return 0 if there isn't a capability, or when the capability was emitted
   5235 * successfully, or < 0 if an error occurred.
   5236 */
   5237static int send_capabilities(struct send_ctx *sctx)
   5238{
   5239	struct fs_path *fspath = NULL;
   5240	struct btrfs_path *path;
   5241	struct btrfs_dir_item *di;
   5242	struct extent_buffer *leaf;
   5243	unsigned long data_ptr;
   5244	char *buf = NULL;
   5245	int buf_len;
   5246	int ret = 0;
   5247
   5248	path = alloc_path_for_send();
   5249	if (!path)
   5250		return -ENOMEM;
   5251
   5252	di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino,
   5253				XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0);
   5254	if (!di) {
   5255		/* There is no xattr for this inode */
   5256		goto out;
   5257	} else if (IS_ERR(di)) {
   5258		ret = PTR_ERR(di);
   5259		goto out;
   5260	}
   5261
   5262	leaf = path->nodes[0];
   5263	buf_len = btrfs_dir_data_len(leaf, di);
   5264
   5265	fspath = fs_path_alloc();
   5266	buf = kmalloc(buf_len, GFP_KERNEL);
   5267	if (!fspath || !buf) {
   5268		ret = -ENOMEM;
   5269		goto out;
   5270	}
   5271
   5272	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
   5273	if (ret < 0)
   5274		goto out;
   5275
   5276	data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
   5277	read_extent_buffer(leaf, buf, data_ptr, buf_len);
   5278
   5279	ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS,
   5280			strlen(XATTR_NAME_CAPS), buf, buf_len);
   5281out:
   5282	kfree(buf);
   5283	fs_path_free(fspath);
   5284	btrfs_free_path(path);
   5285	return ret;
   5286}
   5287
   5288static int clone_range(struct send_ctx *sctx,
   5289		       struct clone_root *clone_root,
   5290		       const u64 disk_byte,
   5291		       u64 data_offset,
   5292		       u64 offset,
   5293		       u64 len)
   5294{
   5295	struct btrfs_path *path;
   5296	struct btrfs_key key;
   5297	int ret;
   5298	u64 clone_src_i_size = 0;
   5299
   5300	/*
   5301	 * Prevent cloning from a zero offset with a length matching the sector
   5302	 * size because in some scenarios this will make the receiver fail.
   5303	 *
   5304	 * For example, if in the source filesystem the extent at offset 0
   5305	 * has a length of sectorsize and it was written using direct IO, then
   5306	 * it can never be an inline extent (even if compression is enabled).
   5307	 * Then this extent can be cloned in the original filesystem to a non
   5308	 * zero file offset, but it may not be possible to clone in the
   5309	 * destination filesystem because it can be inlined due to compression
   5310	 * on the destination filesystem (as the receiver's write operations are
   5311	 * always done using buffered IO). The same happens when the original
   5312	 * filesystem does not have compression enabled but the destination
   5313	 * filesystem has.
   5314	 */
   5315	if (clone_root->offset == 0 &&
   5316	    len == sctx->send_root->fs_info->sectorsize)
   5317		return send_extent_data(sctx, offset, len);
   5318
   5319	path = alloc_path_for_send();
   5320	if (!path)
   5321		return -ENOMEM;
   5322
   5323	/*
   5324	 * There are inodes that have extents that lie behind its i_size. Don't
   5325	 * accept clones from these extents.
   5326	 */
   5327	ret = __get_inode_info(clone_root->root, path, clone_root->ino,
   5328			       &clone_src_i_size, NULL, NULL, NULL, NULL, NULL);
   5329	btrfs_release_path(path);
   5330	if (ret < 0)
   5331		goto out;
   5332
   5333	/*
   5334	 * We can't send a clone operation for the entire range if we find
   5335	 * extent items in the respective range in the source file that
   5336	 * refer to different extents or if we find holes.
   5337	 * So check for that and do a mix of clone and regular write/copy
   5338	 * operations if needed.
   5339	 *
   5340	 * Example:
   5341	 *
   5342	 * mkfs.btrfs -f /dev/sda
   5343	 * mount /dev/sda /mnt
   5344	 * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
   5345	 * cp --reflink=always /mnt/foo /mnt/bar
   5346	 * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
   5347	 * btrfs subvolume snapshot -r /mnt /mnt/snap
   5348	 *
   5349	 * If when we send the snapshot and we are processing file bar (which
   5350	 * has a higher inode number than foo) we blindly send a clone operation
   5351	 * for the [0, 100K[ range from foo to bar, the receiver ends up getting
   5352	 * a file bar that matches the content of file foo - iow, doesn't match
   5353	 * the content from bar in the original filesystem.
   5354	 */
   5355	key.objectid = clone_root->ino;
   5356	key.type = BTRFS_EXTENT_DATA_KEY;
   5357	key.offset = clone_root->offset;
   5358	ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
   5359	if (ret < 0)
   5360		goto out;
   5361	if (ret > 0 && path->slots[0] > 0) {
   5362		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
   5363		if (key.objectid == clone_root->ino &&
   5364		    key.type == BTRFS_EXTENT_DATA_KEY)
   5365			path->slots[0]--;
   5366	}
   5367
   5368	while (true) {
   5369		struct extent_buffer *leaf = path->nodes[0];
   5370		int slot = path->slots[0];
   5371		struct btrfs_file_extent_item *ei;
   5372		u8 type;
   5373		u64 ext_len;
   5374		u64 clone_len;
   5375		u64 clone_data_offset;
   5376
   5377		if (slot >= btrfs_header_nritems(leaf)) {
   5378			ret = btrfs_next_leaf(clone_root->root, path);
   5379			if (ret < 0)
   5380				goto out;
   5381			else if (ret > 0)
   5382				break;
   5383			continue;
   5384		}
   5385
   5386		btrfs_item_key_to_cpu(leaf, &key, slot);
   5387
   5388		/*
   5389		 * We might have an implicit trailing hole (NO_HOLES feature
   5390		 * enabled). We deal with it after leaving this loop.
   5391		 */
   5392		if (key.objectid != clone_root->ino ||
   5393		    key.type != BTRFS_EXTENT_DATA_KEY)
   5394			break;
   5395
   5396		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
   5397		type = btrfs_file_extent_type(leaf, ei);
   5398		if (type == BTRFS_FILE_EXTENT_INLINE) {
   5399			ext_len = btrfs_file_extent_ram_bytes(leaf, ei);
   5400			ext_len = PAGE_ALIGN(ext_len);
   5401		} else {
   5402			ext_len = btrfs_file_extent_num_bytes(leaf, ei);
   5403		}
   5404
   5405		if (key.offset + ext_len <= clone_root->offset)
   5406			goto next;
   5407
   5408		if (key.offset > clone_root->offset) {
   5409			/* Implicit hole, NO_HOLES feature enabled. */
   5410			u64 hole_len = key.offset - clone_root->offset;
   5411
   5412			if (hole_len > len)
   5413				hole_len = len;
   5414			ret = send_extent_data(sctx, offset, hole_len);
   5415			if (ret < 0)
   5416				goto out;
   5417
   5418			len -= hole_len;
   5419			if (len == 0)
   5420				break;
   5421			offset += hole_len;
   5422			clone_root->offset += hole_len;
   5423			data_offset += hole_len;
   5424		}
   5425
   5426		if (key.offset >= clone_root->offset + len)
   5427			break;
   5428
   5429		if (key.offset >= clone_src_i_size)
   5430			break;
   5431
   5432		if (key.offset + ext_len > clone_src_i_size)
   5433			ext_len = clone_src_i_size - key.offset;
   5434
   5435		clone_data_offset = btrfs_file_extent_offset(leaf, ei);
   5436		if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
   5437			clone_root->offset = key.offset;
   5438			if (clone_data_offset < data_offset &&
   5439				clone_data_offset + ext_len > data_offset) {
   5440				u64 extent_offset;
   5441
   5442				extent_offset = data_offset - clone_data_offset;
   5443				ext_len -= extent_offset;
   5444				clone_data_offset += extent_offset;
   5445				clone_root->offset += extent_offset;
   5446			}
   5447		}
   5448
   5449		clone_len = min_t(u64, ext_len, len);
   5450
   5451		if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
   5452		    clone_data_offset == data_offset) {
   5453			const u64 src_end = clone_root->offset + clone_len;
   5454			const u64 sectorsize = SZ_64K;
   5455
   5456			/*
   5457			 * We can't clone the last block, when its size is not
   5458			 * sector size aligned, into the middle of a file. If we
   5459			 * do so, the receiver will get a failure (-EINVAL) when
   5460			 * trying to clone or will silently corrupt the data in
   5461			 * the destination file if it's on a kernel without the
   5462			 * fix introduced by commit ac765f83f1397646
   5463			 * ("Btrfs: fix data corruption due to cloning of eof
   5464			 * block).
   5465			 *
   5466			 * So issue a clone of the aligned down range plus a
   5467			 * regular write for the eof block, if we hit that case.
   5468			 *
   5469			 * Also, we use the maximum possible sector size, 64K,
   5470			 * because we don't know what's the sector size of the
   5471			 * filesystem that receives the stream, so we have to
   5472			 * assume the largest possible sector size.
   5473			 */
   5474			if (src_end == clone_src_i_size &&
   5475			    !IS_ALIGNED(src_end, sectorsize) &&
   5476			    offset + clone_len < sctx->cur_inode_size) {
   5477				u64 slen;
   5478
   5479				slen = ALIGN_DOWN(src_end - clone_root->offset,
   5480						  sectorsize);
   5481				if (slen > 0) {
   5482					ret = send_clone(sctx, offset, slen,
   5483							 clone_root);
   5484					if (ret < 0)
   5485						goto out;
   5486				}
   5487				ret = send_extent_data(sctx, offset + slen,
   5488						       clone_len - slen);
   5489			} else {
   5490				ret = send_clone(sctx, offset, clone_len,
   5491						 clone_root);
   5492			}
   5493		} else {
   5494			ret = send_extent_data(sctx, offset, clone_len);
   5495		}
   5496
   5497		if (ret < 0)
   5498			goto out;
   5499
   5500		len -= clone_len;
   5501		if (len == 0)
   5502			break;
   5503		offset += clone_len;
   5504		clone_root->offset += clone_len;
   5505
   5506		/*
   5507		 * If we are cloning from the file we are currently processing,
   5508		 * and using the send root as the clone root, we must stop once
   5509		 * the current clone offset reaches the current eof of the file
   5510		 * at the receiver, otherwise we would issue an invalid clone
   5511		 * operation (source range going beyond eof) and cause the
   5512		 * receiver to fail. So if we reach the current eof, bail out
   5513		 * and fallback to a regular write.
   5514		 */
   5515		if (clone_root->root == sctx->send_root &&
   5516		    clone_root->ino == sctx->cur_ino &&
   5517		    clone_root->offset >= sctx->cur_inode_next_write_offset)
   5518			break;
   5519
   5520		data_offset += clone_len;
   5521next:
   5522		path->slots[0]++;
   5523	}
   5524
   5525	if (len > 0)
   5526		ret = send_extent_data(sctx, offset, len);
   5527	else
   5528		ret = 0;
   5529out:
   5530	btrfs_free_path(path);
   5531	return ret;
   5532}
   5533
   5534static int send_write_or_clone(struct send_ctx *sctx,
   5535			       struct btrfs_path *path,
   5536			       struct btrfs_key *key,
   5537			       struct clone_root *clone_root)
   5538{
   5539	int ret = 0;
   5540	u64 offset = key->offset;
   5541	u64 end;
   5542	u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
   5543
   5544	end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
   5545	if (offset >= end)
   5546		return 0;
   5547
   5548	if (clone_root && IS_ALIGNED(end, bs)) {
   5549		struct btrfs_file_extent_item *ei;
   5550		u64 disk_byte;
   5551		u64 data_offset;
   5552
   5553		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
   5554				    struct btrfs_file_extent_item);
   5555		disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
   5556		data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
   5557		ret = clone_range(sctx, clone_root, disk_byte, data_offset,
   5558				  offset, end - offset);
   5559	} else {
   5560		ret = send_extent_data(sctx, offset, end - offset);
   5561	}
   5562	sctx->cur_inode_next_write_offset = end;
   5563	return ret;
   5564}
   5565
   5566static int is_extent_unchanged(struct send_ctx *sctx,
   5567			       struct btrfs_path *left_path,
   5568			       struct btrfs_key *ekey)
   5569{
   5570	int ret = 0;
   5571	struct btrfs_key key;
   5572	struct btrfs_path *path = NULL;
   5573	struct extent_buffer *eb;
   5574	int slot;
   5575	struct btrfs_key found_key;
   5576	struct btrfs_file_extent_item *ei;
   5577	u64 left_disknr;
   5578	u64 right_disknr;
   5579	u64 left_offset;
   5580	u64 right_offset;
   5581	u64 left_offset_fixed;
   5582	u64 left_len;
   5583	u64 right_len;
   5584	u64 left_gen;
   5585	u64 right_gen;
   5586	u8 left_type;
   5587	u8 right_type;
   5588
   5589	path = alloc_path_for_send();
   5590	if (!path)
   5591		return -ENOMEM;
   5592
   5593	eb = left_path->nodes[0];
   5594	slot = left_path->slots[0];
   5595	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
   5596	left_type = btrfs_file_extent_type(eb, ei);
   5597
   5598	if (left_type != BTRFS_FILE_EXTENT_REG) {
   5599		ret = 0;
   5600		goto out;
   5601	}
   5602	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
   5603	left_len = btrfs_file_extent_num_bytes(eb, ei);
   5604	left_offset = btrfs_file_extent_offset(eb, ei);
   5605	left_gen = btrfs_file_extent_generation(eb, ei);
   5606
   5607	/*
   5608	 * Following comments will refer to these graphics. L is the left
   5609	 * extents which we are checking at the moment. 1-8 are the right
   5610	 * extents that we iterate.
   5611	 *
   5612	 *       |-----L-----|
   5613	 * |-1-|-2a-|-3-|-4-|-5-|-6-|
   5614	 *
   5615	 *       |-----L-----|
   5616	 * |--1--|-2b-|...(same as above)
   5617	 *
   5618	 * Alternative situation. Happens on files where extents got split.
   5619	 *       |-----L-----|
   5620	 * |-----------7-----------|-6-|
   5621	 *
   5622	 * Alternative situation. Happens on files which got larger.
   5623	 *       |-----L-----|
   5624	 * |-8-|
   5625	 * Nothing follows after 8.
   5626	 */
   5627
   5628	key.objectid = ekey->objectid;
   5629	key.type = BTRFS_EXTENT_DATA_KEY;
   5630	key.offset = ekey->offset;
   5631	ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
   5632	if (ret < 0)
   5633		goto out;
   5634	if (ret) {
   5635		ret = 0;
   5636		goto out;
   5637	}
   5638
   5639	/*
   5640	 * Handle special case where the right side has no extents at all.
   5641	 */
   5642	eb = path->nodes[0];
   5643	slot = path->slots[0];
   5644	btrfs_item_key_to_cpu(eb, &found_key, slot);
   5645	if (found_key.objectid != key.objectid ||
   5646	    found_key.type != key.type) {
   5647		/* If we're a hole then just pretend nothing changed */
   5648		ret = (left_disknr) ? 0 : 1;
   5649		goto out;
   5650	}
   5651
   5652	/*
   5653	 * We're now on 2a, 2b or 7.
   5654	 */
   5655	key = found_key;
   5656	while (key.offset < ekey->offset + left_len) {
   5657		ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
   5658		right_type = btrfs_file_extent_type(eb, ei);
   5659		if (right_type != BTRFS_FILE_EXTENT_REG &&
   5660		    right_type != BTRFS_FILE_EXTENT_INLINE) {
   5661			ret = 0;
   5662			goto out;
   5663		}
   5664
   5665		if (right_type == BTRFS_FILE_EXTENT_INLINE) {
   5666			right_len = btrfs_file_extent_ram_bytes(eb, ei);
   5667			right_len = PAGE_ALIGN(right_len);
   5668		} else {
   5669			right_len = btrfs_file_extent_num_bytes(eb, ei);
   5670		}
   5671
   5672		/*
   5673		 * Are we at extent 8? If yes, we know the extent is changed.
   5674		 * This may only happen on the first iteration.
   5675		 */
   5676		if (found_key.offset + right_len <= ekey->offset) {
   5677			/* If we're a hole just pretend nothing changed */
   5678			ret = (left_disknr) ? 0 : 1;
   5679			goto out;
   5680		}
   5681
   5682		/*
   5683		 * We just wanted to see if when we have an inline extent, what
   5684		 * follows it is a regular extent (wanted to check the above
   5685		 * condition for inline extents too). This should normally not
   5686		 * happen but it's possible for example when we have an inline
   5687		 * compressed extent representing data with a size matching
   5688		 * the page size (currently the same as sector size).
   5689		 */
   5690		if (right_type == BTRFS_FILE_EXTENT_INLINE) {
   5691			ret = 0;
   5692			goto out;
   5693		}
   5694
   5695		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
   5696		right_offset = btrfs_file_extent_offset(eb, ei);
   5697		right_gen = btrfs_file_extent_generation(eb, ei);
   5698
   5699		left_offset_fixed = left_offset;
   5700		if (key.offset < ekey->offset) {
   5701			/* Fix the right offset for 2a and 7. */
   5702			right_offset += ekey->offset - key.offset;
   5703		} else {
   5704			/* Fix the left offset for all behind 2a and 2b */
   5705			left_offset_fixed += key.offset - ekey->offset;
   5706		}
   5707
   5708		/*
   5709		 * Check if we have the same extent.
   5710		 */
   5711		if (left_disknr != right_disknr ||
   5712		    left_offset_fixed != right_offset ||
   5713		    left_gen != right_gen) {
   5714			ret = 0;
   5715			goto out;
   5716		}
   5717
   5718		/*
   5719		 * Go to the next extent.
   5720		 */
   5721		ret = btrfs_next_item(sctx->parent_root, path);
   5722		if (ret < 0)
   5723			goto out;
   5724		if (!ret) {
   5725			eb = path->nodes[0];
   5726			slot = path->slots[0];
   5727			btrfs_item_key_to_cpu(eb, &found_key, slot);
   5728		}
   5729		if (ret || found_key.objectid != key.objectid ||
   5730		    found_key.type != key.type) {
   5731			key.offset += right_len;
   5732			break;
   5733		}
   5734		if (found_key.offset != key.offset + right_len) {
   5735			ret = 0;
   5736			goto out;
   5737		}
   5738		key = found_key;
   5739	}
   5740
   5741	/*
   5742	 * We're now behind the left extent (treat as unchanged) or at the end
   5743	 * of the right side (treat as changed).
   5744	 */
   5745	if (key.offset >= ekey->offset + left_len)
   5746		ret = 1;
   5747	else
   5748		ret = 0;
   5749
   5750
   5751out:
   5752	btrfs_free_path(path);
   5753	return ret;
   5754}
   5755
   5756static int get_last_extent(struct send_ctx *sctx, u64 offset)
   5757{
   5758	struct btrfs_path *path;
   5759	struct btrfs_root *root = sctx->send_root;
   5760	struct btrfs_key key;
   5761	int ret;
   5762
   5763	path = alloc_path_for_send();
   5764	if (!path)
   5765		return -ENOMEM;
   5766
   5767	sctx->cur_inode_last_extent = 0;
   5768
   5769	key.objectid = sctx->cur_ino;
   5770	key.type = BTRFS_EXTENT_DATA_KEY;
   5771	key.offset = offset;
   5772	ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
   5773	if (ret < 0)
   5774		goto out;
   5775	ret = 0;
   5776	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
   5777	if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
   5778		goto out;
   5779
   5780	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
   5781out:
   5782	btrfs_free_path(path);
   5783	return ret;
   5784}
   5785
   5786static int range_is_hole_in_parent(struct send_ctx *sctx,
   5787				   const u64 start,
   5788				   const u64 end)
   5789{
   5790	struct btrfs_path *path;
   5791	struct btrfs_key key;
   5792	struct btrfs_root *root = sctx->parent_root;
   5793	u64 search_start = start;
   5794	int ret;
   5795
   5796	path = alloc_path_for_send();
   5797	if (!path)
   5798		return -ENOMEM;
   5799
   5800	key.objectid = sctx->cur_ino;
   5801	key.type = BTRFS_EXTENT_DATA_KEY;
   5802	key.offset = search_start;
   5803	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   5804	if (ret < 0)
   5805		goto out;
   5806	if (ret > 0 && path->slots[0] > 0)
   5807		path->slots[0]--;
   5808
   5809	while (search_start < end) {
   5810		struct extent_buffer *leaf = path->nodes[0];
   5811		int slot = path->slots[0];
   5812		struct btrfs_file_extent_item *fi;
   5813		u64 extent_end;
   5814
   5815		if (slot >= btrfs_header_nritems(leaf)) {
   5816			ret = btrfs_next_leaf(root, path);
   5817			if (ret < 0)
   5818				goto out;
   5819			else if (ret > 0)
   5820				break;
   5821			continue;
   5822		}
   5823
   5824		btrfs_item_key_to_cpu(leaf, &key, slot);
   5825		if (key.objectid < sctx->cur_ino ||
   5826		    key.type < BTRFS_EXTENT_DATA_KEY)
   5827			goto next;
   5828		if (key.objectid > sctx->cur_ino ||
   5829		    key.type > BTRFS_EXTENT_DATA_KEY ||
   5830		    key.offset >= end)
   5831			break;
   5832
   5833		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
   5834		extent_end = btrfs_file_extent_end(path);
   5835		if (extent_end <= start)
   5836			goto next;
   5837		if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) {
   5838			search_start = extent_end;
   5839			goto next;
   5840		}
   5841		ret = 0;
   5842		goto out;
   5843next:
   5844		path->slots[0]++;
   5845	}
   5846	ret = 1;
   5847out:
   5848	btrfs_free_path(path);
   5849	return ret;
   5850}
   5851
   5852static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
   5853			   struct btrfs_key *key)
   5854{
   5855	int ret = 0;
   5856
   5857	if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
   5858		return 0;
   5859
   5860	if (sctx->cur_inode_last_extent == (u64)-1) {
   5861		ret = get_last_extent(sctx, key->offset - 1);
   5862		if (ret)
   5863			return ret;
   5864	}
   5865
   5866	if (path->slots[0] == 0 &&
   5867	    sctx->cur_inode_last_extent < key->offset) {
   5868		/*
   5869		 * We might have skipped entire leafs that contained only
   5870		 * file extent items for our current inode. These leafs have
   5871		 * a generation number smaller (older) than the one in the
   5872		 * current leaf and the leaf our last extent came from, and
   5873		 * are located between these 2 leafs.
   5874		 */
   5875		ret = get_last_extent(sctx, key->offset - 1);
   5876		if (ret)
   5877			return ret;
   5878	}
   5879
   5880	if (sctx->cur_inode_last_extent < key->offset) {
   5881		ret = range_is_hole_in_parent(sctx,
   5882					      sctx->cur_inode_last_extent,
   5883					      key->offset);
   5884		if (ret < 0)
   5885			return ret;
   5886		else if (ret == 0)
   5887			ret = send_hole(sctx, key->offset);
   5888		else
   5889			ret = 0;
   5890	}
   5891	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
   5892	return ret;
   5893}
   5894
   5895static int process_extent(struct send_ctx *sctx,
   5896			  struct btrfs_path *path,
   5897			  struct btrfs_key *key)
   5898{
   5899	struct clone_root *found_clone = NULL;
   5900	int ret = 0;
   5901
   5902	if (S_ISLNK(sctx->cur_inode_mode))
   5903		return 0;
   5904
   5905	if (sctx->parent_root && !sctx->cur_inode_new) {
   5906		ret = is_extent_unchanged(sctx, path, key);
   5907		if (ret < 0)
   5908			goto out;
   5909		if (ret) {
   5910			ret = 0;
   5911			goto out_hole;
   5912		}
   5913	} else {
   5914		struct btrfs_file_extent_item *ei;
   5915		u8 type;
   5916
   5917		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
   5918				    struct btrfs_file_extent_item);
   5919		type = btrfs_file_extent_type(path->nodes[0], ei);
   5920		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
   5921		    type == BTRFS_FILE_EXTENT_REG) {
   5922			/*
   5923			 * The send spec does not have a prealloc command yet,
   5924			 * so just leave a hole for prealloc'ed extents until
   5925			 * we have enough commands queued up to justify rev'ing
   5926			 * the send spec.
   5927			 */
   5928			if (type == BTRFS_FILE_EXTENT_PREALLOC) {
   5929				ret = 0;
   5930				goto out;
   5931			}
   5932
   5933			/* Have a hole, just skip it. */
   5934			if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) {
   5935				ret = 0;
   5936				goto out;
   5937			}
   5938		}
   5939	}
   5940
   5941	ret = find_extent_clone(sctx, path, key->objectid, key->offset,
   5942			sctx->cur_inode_size, &found_clone);
   5943	if (ret != -ENOENT && ret < 0)
   5944		goto out;
   5945
   5946	ret = send_write_or_clone(sctx, path, key, found_clone);
   5947	if (ret)
   5948		goto out;
   5949out_hole:
   5950	ret = maybe_send_hole(sctx, path, key);
   5951out:
   5952	return ret;
   5953}
   5954
   5955static int process_all_extents(struct send_ctx *sctx)
   5956{
   5957	int ret = 0;
   5958	int iter_ret = 0;
   5959	struct btrfs_root *root;
   5960	struct btrfs_path *path;
   5961	struct btrfs_key key;
   5962	struct btrfs_key found_key;
   5963
   5964	root = sctx->send_root;
   5965	path = alloc_path_for_send();
   5966	if (!path)
   5967		return -ENOMEM;
   5968
   5969	key.objectid = sctx->cmp_key->objectid;
   5970	key.type = BTRFS_EXTENT_DATA_KEY;
   5971	key.offset = 0;
   5972	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
   5973		if (found_key.objectid != key.objectid ||
   5974		    found_key.type != key.type) {
   5975			ret = 0;
   5976			break;
   5977		}
   5978
   5979		ret = process_extent(sctx, path, &found_key);
   5980		if (ret < 0)
   5981			break;
   5982	}
   5983	/* Catch error found during iteration */
   5984	if (iter_ret < 0)
   5985		ret = iter_ret;
   5986
   5987	btrfs_free_path(path);
   5988	return ret;
   5989}
   5990
   5991static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
   5992					   int *pending_move,
   5993					   int *refs_processed)
   5994{
   5995	int ret = 0;
   5996
   5997	if (sctx->cur_ino == 0)
   5998		goto out;
   5999	if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
   6000	    sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
   6001		goto out;
   6002	if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
   6003		goto out;
   6004
   6005	ret = process_recorded_refs(sctx, pending_move);
   6006	if (ret < 0)
   6007		goto out;
   6008
   6009	*refs_processed = 1;
   6010out:
   6011	return ret;
   6012}
   6013
   6014static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
   6015{
   6016	int ret = 0;
   6017	u64 left_mode;
   6018	u64 left_uid;
   6019	u64 left_gid;
   6020	u64 right_mode;
   6021	u64 right_uid;
   6022	u64 right_gid;
   6023	int need_chmod = 0;
   6024	int need_chown = 0;
   6025	int need_truncate = 1;
   6026	int pending_move = 0;
   6027	int refs_processed = 0;
   6028
   6029	if (sctx->ignore_cur_inode)
   6030		return 0;
   6031
   6032	ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
   6033					      &refs_processed);
   6034	if (ret < 0)
   6035		goto out;
   6036
   6037	/*
   6038	 * We have processed the refs and thus need to advance send_progress.
   6039	 * Now, calls to get_cur_xxx will take the updated refs of the current
   6040	 * inode into account.
   6041	 *
   6042	 * On the other hand, if our current inode is a directory and couldn't
   6043	 * be moved/renamed because its parent was renamed/moved too and it has
   6044	 * a higher inode number, we can only move/rename our current inode
   6045	 * after we moved/renamed its parent. Therefore in this case operate on
   6046	 * the old path (pre move/rename) of our current inode, and the
   6047	 * move/rename will be performed later.
   6048	 */
   6049	if (refs_processed && !pending_move)
   6050		sctx->send_progress = sctx->cur_ino + 1;
   6051
   6052	if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
   6053		goto out;
   6054	if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
   6055		goto out;
   6056
   6057	ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
   6058			&left_mode, &left_uid, &left_gid, NULL);
   6059	if (ret < 0)
   6060		goto out;
   6061
   6062	if (!sctx->parent_root || sctx->cur_inode_new) {
   6063		need_chown = 1;
   6064		if (!S_ISLNK(sctx->cur_inode_mode))
   6065			need_chmod = 1;
   6066		if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
   6067			need_truncate = 0;
   6068	} else {
   6069		u64 old_size;
   6070
   6071		ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
   6072				&old_size, NULL, &right_mode, &right_uid,
   6073				&right_gid, NULL);
   6074		if (ret < 0)
   6075			goto out;
   6076
   6077		if (left_uid != right_uid || left_gid != right_gid)
   6078			need_chown = 1;
   6079		if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
   6080			need_chmod = 1;
   6081		if ((old_size == sctx->cur_inode_size) ||
   6082		    (sctx->cur_inode_size > old_size &&
   6083		     sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
   6084			need_truncate = 0;
   6085	}
   6086
   6087	if (S_ISREG(sctx->cur_inode_mode)) {
   6088		if (need_send_hole(sctx)) {
   6089			if (sctx->cur_inode_last_extent == (u64)-1 ||
   6090			    sctx->cur_inode_last_extent <
   6091			    sctx->cur_inode_size) {
   6092				ret = get_last_extent(sctx, (u64)-1);
   6093				if (ret)
   6094					goto out;
   6095			}
   6096			if (sctx->cur_inode_last_extent <
   6097			    sctx->cur_inode_size) {
   6098				ret = send_hole(sctx, sctx->cur_inode_size);
   6099				if (ret)
   6100					goto out;
   6101			}
   6102		}
   6103		if (need_truncate) {
   6104			ret = send_truncate(sctx, sctx->cur_ino,
   6105					    sctx->cur_inode_gen,
   6106					    sctx->cur_inode_size);
   6107			if (ret < 0)
   6108				goto out;
   6109		}
   6110	}
   6111
   6112	if (need_chown) {
   6113		ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
   6114				left_uid, left_gid);
   6115		if (ret < 0)
   6116			goto out;
   6117	}
   6118	if (need_chmod) {
   6119		ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
   6120				left_mode);
   6121		if (ret < 0)
   6122			goto out;
   6123	}
   6124
   6125	ret = send_capabilities(sctx);
   6126	if (ret < 0)
   6127		goto out;
   6128
   6129	/*
   6130	 * If other directory inodes depended on our current directory
   6131	 * inode's move/rename, now do their move/rename operations.
   6132	 */
   6133	if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
   6134		ret = apply_children_dir_moves(sctx);
   6135		if (ret)
   6136			goto out;
   6137		/*
   6138		 * Need to send that every time, no matter if it actually
   6139		 * changed between the two trees as we have done changes to
   6140		 * the inode before. If our inode is a directory and it's
   6141		 * waiting to be moved/renamed, we will send its utimes when
   6142		 * it's moved/renamed, therefore we don't need to do it here.
   6143		 */
   6144		sctx->send_progress = sctx->cur_ino + 1;
   6145		ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
   6146		if (ret < 0)
   6147			goto out;
   6148	}
   6149
   6150out:
   6151	return ret;
   6152}
   6153
   6154struct parent_paths_ctx {
   6155	struct list_head *refs;
   6156	struct send_ctx *sctx;
   6157};
   6158
   6159static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name,
   6160			     void *ctx)
   6161{
   6162	struct parent_paths_ctx *ppctx = ctx;
   6163
   6164	return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx,
   6165			  ppctx->refs);
   6166}
   6167
   6168/*
   6169 * Issue unlink operations for all paths of the current inode found in the
   6170 * parent snapshot.
   6171 */
   6172static int btrfs_unlink_all_paths(struct send_ctx *sctx)
   6173{
   6174	LIST_HEAD(deleted_refs);
   6175	struct btrfs_path *path;
   6176	struct btrfs_root *root = sctx->parent_root;
   6177	struct btrfs_key key;
   6178	struct btrfs_key found_key;
   6179	struct parent_paths_ctx ctx;
   6180	int iter_ret = 0;
   6181	int ret;
   6182
   6183	path = alloc_path_for_send();
   6184	if (!path)
   6185		return -ENOMEM;
   6186
   6187	key.objectid = sctx->cur_ino;
   6188	key.type = BTRFS_INODE_REF_KEY;
   6189	key.offset = 0;
   6190
   6191	ctx.refs = &deleted_refs;
   6192	ctx.sctx = sctx;
   6193
   6194	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
   6195		if (found_key.objectid != key.objectid)
   6196			break;
   6197		if (found_key.type != key.type &&
   6198		    found_key.type != BTRFS_INODE_EXTREF_KEY)
   6199			break;
   6200
   6201		ret = iterate_inode_ref(root, path, &found_key, 1,
   6202					record_parent_ref, &ctx);
   6203		if (ret < 0)
   6204			goto out;
   6205	}
   6206	/* Catch error found during iteration */
   6207	if (iter_ret < 0) {
   6208		ret = iter_ret;
   6209		goto out;
   6210	}
   6211
   6212	while (!list_empty(&deleted_refs)) {
   6213		struct recorded_ref *ref;
   6214
   6215		ref = list_first_entry(&deleted_refs, struct recorded_ref, list);
   6216		ret = send_unlink(sctx, ref->full_path);
   6217		if (ret < 0)
   6218			goto out;
   6219		fs_path_free(ref->full_path);
   6220		list_del(&ref->list);
   6221		kfree(ref);
   6222	}
   6223	ret = 0;
   6224out:
   6225	btrfs_free_path(path);
   6226	if (ret)
   6227		__free_recorded_refs(&deleted_refs);
   6228	return ret;
   6229}
   6230
   6231static void close_current_inode(struct send_ctx *sctx)
   6232{
   6233	u64 i_size;
   6234
   6235	if (sctx->cur_inode == NULL)
   6236		return;
   6237
   6238	i_size = i_size_read(sctx->cur_inode);
   6239
   6240	/*
   6241	 * If we are doing an incremental send, we may have extents between the
   6242	 * last processed extent and the i_size that have not been processed
   6243	 * because they haven't changed but we may have read some of their pages
   6244	 * through readahead, see the comments at send_extent_data().
   6245	 */
   6246	if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
   6247		truncate_inode_pages_range(&sctx->cur_inode->i_data,
   6248					   sctx->page_cache_clear_start,
   6249					   round_up(i_size, PAGE_SIZE) - 1);
   6250
   6251	iput(sctx->cur_inode);
   6252	sctx->cur_inode = NULL;
   6253}
   6254
   6255static int changed_inode(struct send_ctx *sctx,
   6256			 enum btrfs_compare_tree_result result)
   6257{
   6258	int ret = 0;
   6259	struct btrfs_key *key = sctx->cmp_key;
   6260	struct btrfs_inode_item *left_ii = NULL;
   6261	struct btrfs_inode_item *right_ii = NULL;
   6262	u64 left_gen = 0;
   6263	u64 right_gen = 0;
   6264
   6265	close_current_inode(sctx);
   6266
   6267	sctx->cur_ino = key->objectid;
   6268	sctx->cur_inode_new_gen = 0;
   6269	sctx->cur_inode_last_extent = (u64)-1;
   6270	sctx->cur_inode_next_write_offset = 0;
   6271	sctx->ignore_cur_inode = false;
   6272
   6273	/*
   6274	 * Set send_progress to current inode. This will tell all get_cur_xxx
   6275	 * functions that the current inode's refs are not updated yet. Later,
   6276	 * when process_recorded_refs is finished, it is set to cur_ino + 1.
   6277	 */
   6278	sctx->send_progress = sctx->cur_ino;
   6279
   6280	if (result == BTRFS_COMPARE_TREE_NEW ||
   6281	    result == BTRFS_COMPARE_TREE_CHANGED) {
   6282		left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
   6283				sctx->left_path->slots[0],
   6284				struct btrfs_inode_item);
   6285		left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
   6286				left_ii);
   6287	} else {
   6288		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
   6289				sctx->right_path->slots[0],
   6290				struct btrfs_inode_item);
   6291		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
   6292				right_ii);
   6293	}
   6294	if (result == BTRFS_COMPARE_TREE_CHANGED) {
   6295		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
   6296				sctx->right_path->slots[0],
   6297				struct btrfs_inode_item);
   6298
   6299		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
   6300				right_ii);
   6301
   6302		/*
   6303		 * The cur_ino = root dir case is special here. We can't treat
   6304		 * the inode as deleted+reused because it would generate a
   6305		 * stream that tries to delete/mkdir the root dir.
   6306		 */
   6307		if (left_gen != right_gen &&
   6308		    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
   6309			sctx->cur_inode_new_gen = 1;
   6310	}
   6311
   6312	/*
   6313	 * Normally we do not find inodes with a link count of zero (orphans)
   6314	 * because the most common case is to create a snapshot and use it
   6315	 * for a send operation. However other less common use cases involve
   6316	 * using a subvolume and send it after turning it to RO mode just
   6317	 * after deleting all hard links of a file while holding an open
   6318	 * file descriptor against it or turning a RO snapshot into RW mode,
   6319	 * keep an open file descriptor against a file, delete it and then
   6320	 * turn the snapshot back to RO mode before using it for a send
   6321	 * operation. So if we find such cases, ignore the inode and all its
   6322	 * items completely if it's a new inode, or if it's a changed inode
   6323	 * make sure all its previous paths (from the parent snapshot) are all
   6324	 * unlinked and all other the inode items are ignored.
   6325	 */
   6326	if (result == BTRFS_COMPARE_TREE_NEW ||
   6327	    result == BTRFS_COMPARE_TREE_CHANGED) {
   6328		u32 nlinks;
   6329
   6330		nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
   6331		if (nlinks == 0) {
   6332			sctx->ignore_cur_inode = true;
   6333			if (result == BTRFS_COMPARE_TREE_CHANGED)
   6334				ret = btrfs_unlink_all_paths(sctx);
   6335			goto out;
   6336		}
   6337	}
   6338
   6339	if (result == BTRFS_COMPARE_TREE_NEW) {
   6340		sctx->cur_inode_gen = left_gen;
   6341		sctx->cur_inode_new = 1;
   6342		sctx->cur_inode_deleted = 0;
   6343		sctx->cur_inode_size = btrfs_inode_size(
   6344				sctx->left_path->nodes[0], left_ii);
   6345		sctx->cur_inode_mode = btrfs_inode_mode(
   6346				sctx->left_path->nodes[0], left_ii);
   6347		sctx->cur_inode_rdev = btrfs_inode_rdev(
   6348				sctx->left_path->nodes[0], left_ii);
   6349		if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
   6350			ret = send_create_inode_if_needed(sctx);
   6351	} else if (result == BTRFS_COMPARE_TREE_DELETED) {
   6352		sctx->cur_inode_gen = right_gen;
   6353		sctx->cur_inode_new = 0;
   6354		sctx->cur_inode_deleted = 1;
   6355		sctx->cur_inode_size = btrfs_inode_size(
   6356				sctx->right_path->nodes[0], right_ii);
   6357		sctx->cur_inode_mode = btrfs_inode_mode(
   6358				sctx->right_path->nodes[0], right_ii);
   6359	} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
   6360		/*
   6361		 * We need to do some special handling in case the inode was
   6362		 * reported as changed with a changed generation number. This
   6363		 * means that the original inode was deleted and new inode
   6364		 * reused the same inum. So we have to treat the old inode as
   6365		 * deleted and the new one as new.
   6366		 */
   6367		if (sctx->cur_inode_new_gen) {
   6368			/*
   6369			 * First, process the inode as if it was deleted.
   6370			 */
   6371			sctx->cur_inode_gen = right_gen;
   6372			sctx->cur_inode_new = 0;
   6373			sctx->cur_inode_deleted = 1;
   6374			sctx->cur_inode_size = btrfs_inode_size(
   6375					sctx->right_path->nodes[0], right_ii);
   6376			sctx->cur_inode_mode = btrfs_inode_mode(
   6377					sctx->right_path->nodes[0], right_ii);
   6378			ret = process_all_refs(sctx,
   6379					BTRFS_COMPARE_TREE_DELETED);
   6380			if (ret < 0)
   6381				goto out;
   6382
   6383			/*
   6384			 * Now process the inode as if it was new.
   6385			 */
   6386			sctx->cur_inode_gen = left_gen;
   6387			sctx->cur_inode_new = 1;
   6388			sctx->cur_inode_deleted = 0;
   6389			sctx->cur_inode_size = btrfs_inode_size(
   6390					sctx->left_path->nodes[0], left_ii);
   6391			sctx->cur_inode_mode = btrfs_inode_mode(
   6392					sctx->left_path->nodes[0], left_ii);
   6393			sctx->cur_inode_rdev = btrfs_inode_rdev(
   6394					sctx->left_path->nodes[0], left_ii);
   6395			ret = send_create_inode_if_needed(sctx);
   6396			if (ret < 0)
   6397				goto out;
   6398
   6399			ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
   6400			if (ret < 0)
   6401				goto out;
   6402			/*
   6403			 * Advance send_progress now as we did not get into
   6404			 * process_recorded_refs_if_needed in the new_gen case.
   6405			 */
   6406			sctx->send_progress = sctx->cur_ino + 1;
   6407
   6408			/*
   6409			 * Now process all extents and xattrs of the inode as if
   6410			 * they were all new.
   6411			 */
   6412			ret = process_all_extents(sctx);
   6413			if (ret < 0)
   6414				goto out;
   6415			ret = process_all_new_xattrs(sctx);
   6416			if (ret < 0)
   6417				goto out;
   6418		} else {
   6419			sctx->cur_inode_gen = left_gen;
   6420			sctx->cur_inode_new = 0;
   6421			sctx->cur_inode_new_gen = 0;
   6422			sctx->cur_inode_deleted = 0;
   6423			sctx->cur_inode_size = btrfs_inode_size(
   6424					sctx->left_path->nodes[0], left_ii);
   6425			sctx->cur_inode_mode = btrfs_inode_mode(
   6426					sctx->left_path->nodes[0], left_ii);
   6427		}
   6428	}
   6429
   6430out:
   6431	return ret;
   6432}
   6433
   6434/*
   6435 * We have to process new refs before deleted refs, but compare_trees gives us
   6436 * the new and deleted refs mixed. To fix this, we record the new/deleted refs
   6437 * first and later process them in process_recorded_refs.
   6438 * For the cur_inode_new_gen case, we skip recording completely because
   6439 * changed_inode did already initiate processing of refs. The reason for this is
   6440 * that in this case, compare_tree actually compares the refs of 2 different
   6441 * inodes. To fix this, process_all_refs is used in changed_inode to handle all
   6442 * refs of the right tree as deleted and all refs of the left tree as new.
   6443 */
   6444static int changed_ref(struct send_ctx *sctx,
   6445		       enum btrfs_compare_tree_result result)
   6446{
   6447	int ret = 0;
   6448
   6449	if (sctx->cur_ino != sctx->cmp_key->objectid) {
   6450		inconsistent_snapshot_error(sctx, result, "reference");
   6451		return -EIO;
   6452	}
   6453
   6454	if (!sctx->cur_inode_new_gen &&
   6455	    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
   6456		if (result == BTRFS_COMPARE_TREE_NEW)
   6457			ret = record_new_ref(sctx);
   6458		else if (result == BTRFS_COMPARE_TREE_DELETED)
   6459			ret = record_deleted_ref(sctx);
   6460		else if (result == BTRFS_COMPARE_TREE_CHANGED)
   6461			ret = record_changed_ref(sctx);
   6462	}
   6463
   6464	return ret;
   6465}
   6466
   6467/*
   6468 * Process new/deleted/changed xattrs. We skip processing in the
   6469 * cur_inode_new_gen case because changed_inode did already initiate processing
   6470 * of xattrs. The reason is the same as in changed_ref
   6471 */
   6472static int changed_xattr(struct send_ctx *sctx,
   6473			 enum btrfs_compare_tree_result result)
   6474{
   6475	int ret = 0;
   6476
   6477	if (sctx->cur_ino != sctx->cmp_key->objectid) {
   6478		inconsistent_snapshot_error(sctx, result, "xattr");
   6479		return -EIO;
   6480	}
   6481
   6482	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
   6483		if (result == BTRFS_COMPARE_TREE_NEW)
   6484			ret = process_new_xattr(sctx);
   6485		else if (result == BTRFS_COMPARE_TREE_DELETED)
   6486			ret = process_deleted_xattr(sctx);
   6487		else if (result == BTRFS_COMPARE_TREE_CHANGED)
   6488			ret = process_changed_xattr(sctx);
   6489	}
   6490
   6491	return ret;
   6492}
   6493
   6494/*
   6495 * Process new/deleted/changed extents. We skip processing in the
   6496 * cur_inode_new_gen case because changed_inode did already initiate processing
   6497 * of extents. The reason is the same as in changed_ref
   6498 */
   6499static int changed_extent(struct send_ctx *sctx,
   6500			  enum btrfs_compare_tree_result result)
   6501{
   6502	int ret = 0;
   6503
   6504	/*
   6505	 * We have found an extent item that changed without the inode item
   6506	 * having changed. This can happen either after relocation (where the
   6507	 * disk_bytenr of an extent item is replaced at
   6508	 * relocation.c:replace_file_extents()) or after deduplication into a
   6509	 * file in both the parent and send snapshots (where an extent item can
   6510	 * get modified or replaced with a new one). Note that deduplication
   6511	 * updates the inode item, but it only changes the iversion (sequence
   6512	 * field in the inode item) of the inode, so if a file is deduplicated
   6513	 * the same amount of times in both the parent and send snapshots, its
   6514	 * iversion becomes the same in both snapshots, whence the inode item is
   6515	 * the same on both snapshots.
   6516	 */
   6517	if (sctx->cur_ino != sctx->cmp_key->objectid)
   6518		return 0;
   6519
   6520	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
   6521		if (result != BTRFS_COMPARE_TREE_DELETED)
   6522			ret = process_extent(sctx, sctx->left_path,
   6523					sctx->cmp_key);
   6524	}
   6525
   6526	return ret;
   6527}
   6528
   6529static int dir_changed(struct send_ctx *sctx, u64 dir)
   6530{
   6531	u64 orig_gen, new_gen;
   6532	int ret;
   6533
   6534	ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL,
   6535			     NULL, NULL);
   6536	if (ret)
   6537		return ret;
   6538
   6539	ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL,
   6540			     NULL, NULL, NULL);
   6541	if (ret)
   6542		return ret;
   6543
   6544	return (orig_gen != new_gen) ? 1 : 0;
   6545}
   6546
   6547static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
   6548			struct btrfs_key *key)
   6549{
   6550	struct btrfs_inode_extref *extref;
   6551	struct extent_buffer *leaf;
   6552	u64 dirid = 0, last_dirid = 0;
   6553	unsigned long ptr;
   6554	u32 item_size;
   6555	u32 cur_offset = 0;
   6556	int ref_name_len;
   6557	int ret = 0;
   6558
   6559	/* Easy case, just check this one dirid */
   6560	if (key->type == BTRFS_INODE_REF_KEY) {
   6561		dirid = key->offset;
   6562
   6563		ret = dir_changed(sctx, dirid);
   6564		goto out;
   6565	}
   6566
   6567	leaf = path->nodes[0];
   6568	item_size = btrfs_item_size(leaf, path->slots[0]);
   6569	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
   6570	while (cur_offset < item_size) {
   6571		extref = (struct btrfs_inode_extref *)(ptr +
   6572						       cur_offset);
   6573		dirid = btrfs_inode_extref_parent(leaf, extref);
   6574		ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
   6575		cur_offset += ref_name_len + sizeof(*extref);
   6576		if (dirid == last_dirid)
   6577			continue;
   6578		ret = dir_changed(sctx, dirid);
   6579		if (ret)
   6580			break;
   6581		last_dirid = dirid;
   6582	}
   6583out:
   6584	return ret;
   6585}
   6586
   6587/*
   6588 * Updates compare related fields in sctx and simply forwards to the actual
   6589 * changed_xxx functions.
   6590 */
   6591static int changed_cb(struct btrfs_path *left_path,
   6592		      struct btrfs_path *right_path,
   6593		      struct btrfs_key *key,
   6594		      enum btrfs_compare_tree_result result,
   6595		      struct send_ctx *sctx)
   6596{
   6597	int ret = 0;
   6598
   6599	/*
   6600	 * We can not hold the commit root semaphore here. This is because in
   6601	 * the case of sending and receiving to the same filesystem, using a
   6602	 * pipe, could result in a deadlock:
   6603	 *
   6604	 * 1) The task running send blocks on the pipe because it's full;
   6605	 *
   6606	 * 2) The task running receive, which is the only consumer of the pipe,
   6607	 *    is waiting for a transaction commit (for example due to a space
   6608	 *    reservation when doing a write or triggering a transaction commit
   6609	 *    when creating a subvolume);
   6610	 *
   6611	 * 3) The transaction is waiting to write lock the commit root semaphore,
   6612	 *    but can not acquire it since it's being held at 1).
   6613	 *
   6614	 * Down this call chain we write to the pipe through kernel_write().
   6615	 * The same type of problem can also happen when sending to a file that
   6616	 * is stored in the same filesystem - when reserving space for a write
   6617	 * into the file, we can trigger a transaction commit.
   6618	 *
   6619	 * Our caller has supplied us with clones of leaves from the send and
   6620	 * parent roots, so we're safe here from a concurrent relocation and
   6621	 * further reallocation of metadata extents while we are here. Below we
   6622	 * also assert that the leaves are clones.
   6623	 */
   6624	lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);
   6625
   6626	/*
   6627	 * We always have a send root, so left_path is never NULL. We will not
   6628	 * have a leaf when we have reached the end of the send root but have
   6629	 * not yet reached the end of the parent root.
   6630	 */
   6631	if (left_path->nodes[0])
   6632		ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
   6633				&left_path->nodes[0]->bflags));
   6634	/*
   6635	 * When doing a full send we don't have a parent root, so right_path is
   6636	 * NULL. When doing an incremental send, we may have reached the end of
   6637	 * the parent root already, so we don't have a leaf at right_path.
   6638	 */
   6639	if (right_path && right_path->nodes[0])
   6640		ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
   6641				&right_path->nodes[0]->bflags));
   6642
   6643	if (result == BTRFS_COMPARE_TREE_SAME) {
   6644		if (key->type == BTRFS_INODE_REF_KEY ||
   6645		    key->type == BTRFS_INODE_EXTREF_KEY) {
   6646			ret = compare_refs(sctx, left_path, key);
   6647			if (!ret)
   6648				return 0;
   6649			if (ret < 0)
   6650				return ret;
   6651		} else if (key->type == BTRFS_EXTENT_DATA_KEY) {
   6652			return maybe_send_hole(sctx, left_path, key);
   6653		} else {
   6654			return 0;
   6655		}
   6656		result = BTRFS_COMPARE_TREE_CHANGED;
   6657		ret = 0;
   6658	}
   6659
   6660	sctx->left_path = left_path;
   6661	sctx->right_path = right_path;
   6662	sctx->cmp_key = key;
   6663
   6664	ret = finish_inode_if_needed(sctx, 0);
   6665	if (ret < 0)
   6666		goto out;
   6667
   6668	/* Ignore non-FS objects */
   6669	if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
   6670	    key->objectid == BTRFS_FREE_SPACE_OBJECTID)
   6671		goto out;
   6672
   6673	if (key->type == BTRFS_INODE_ITEM_KEY) {
   6674		ret = changed_inode(sctx, result);
   6675	} else if (!sctx->ignore_cur_inode) {
   6676		if (key->type == BTRFS_INODE_REF_KEY ||
   6677		    key->type == BTRFS_INODE_EXTREF_KEY)
   6678			ret = changed_ref(sctx, result);
   6679		else if (key->type == BTRFS_XATTR_ITEM_KEY)
   6680			ret = changed_xattr(sctx, result);
   6681		else if (key->type == BTRFS_EXTENT_DATA_KEY)
   6682			ret = changed_extent(sctx, result);
   6683	}
   6684
   6685out:
   6686	return ret;
   6687}
   6688
   6689static int search_key_again(const struct send_ctx *sctx,
   6690			    struct btrfs_root *root,
   6691			    struct btrfs_path *path,
   6692			    const struct btrfs_key *key)
   6693{
   6694	int ret;
   6695
   6696	if (!path->need_commit_sem)
   6697		lockdep_assert_held_read(&root->fs_info->commit_root_sem);
   6698
   6699	/*
   6700	 * Roots used for send operations are readonly and no one can add,
   6701	 * update or remove keys from them, so we should be able to find our
   6702	 * key again. The only exception is deduplication, which can operate on
   6703	 * readonly roots and add, update or remove keys to/from them - but at
   6704	 * the moment we don't allow it to run in parallel with send.
   6705	 */
   6706	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
   6707	ASSERT(ret <= 0);
   6708	if (ret > 0) {
   6709		btrfs_print_tree(path->nodes[path->lowest_level], false);
   6710		btrfs_err(root->fs_info,
   6711"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
   6712			  key->objectid, key->type, key->offset,
   6713			  (root == sctx->parent_root ? "parent" : "send"),
   6714			  root->root_key.objectid, path->lowest_level,
   6715			  path->slots[path->lowest_level]);
   6716		return -EUCLEAN;
   6717	}
   6718
   6719	return ret;
   6720}
   6721
   6722static int full_send_tree(struct send_ctx *sctx)
   6723{
   6724	int ret;
   6725	struct btrfs_root *send_root = sctx->send_root;
   6726	struct btrfs_key key;
   6727	struct btrfs_fs_info *fs_info = send_root->fs_info;
   6728	struct btrfs_path *path;
   6729
   6730	path = alloc_path_for_send();
   6731	if (!path)
   6732		return -ENOMEM;
   6733	path->reada = READA_FORWARD_ALWAYS;
   6734
   6735	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
   6736	key.type = BTRFS_INODE_ITEM_KEY;
   6737	key.offset = 0;
   6738
   6739	down_read(&fs_info->commit_root_sem);
   6740	sctx->last_reloc_trans = fs_info->last_reloc_trans;
   6741	up_read(&fs_info->commit_root_sem);
   6742
   6743	ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
   6744	if (ret < 0)
   6745		goto out;
   6746	if (ret)
   6747		goto out_finish;
   6748
   6749	while (1) {
   6750		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
   6751
   6752		ret = changed_cb(path, NULL, &key,
   6753				 BTRFS_COMPARE_TREE_NEW, sctx);
   6754		if (ret < 0)
   6755			goto out;
   6756
   6757		down_read(&fs_info->commit_root_sem);
   6758		if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
   6759			sctx->last_reloc_trans = fs_info->last_reloc_trans;
   6760			up_read(&fs_info->commit_root_sem);
   6761			/*
   6762			 * A transaction used for relocating a block group was
   6763			 * committed or is about to finish its commit. Release
   6764			 * our path (leaf) and restart the search, so that we
   6765			 * avoid operating on any file extent items that are
   6766			 * stale, with a disk_bytenr that reflects a pre
   6767			 * relocation value. This way we avoid as much as
   6768			 * possible to fallback to regular writes when checking
   6769			 * if we can clone file ranges.
   6770			 */
   6771			btrfs_release_path(path);
   6772			ret = search_key_again(sctx, send_root, path, &key);
   6773			if (ret < 0)
   6774				goto out;
   6775		} else {
   6776			up_read(&fs_info->commit_root_sem);
   6777		}
   6778
   6779		ret = btrfs_next_item(send_root, path);
   6780		if (ret < 0)
   6781			goto out;
   6782		if (ret) {
   6783			ret  = 0;
   6784			break;
   6785		}
   6786	}
   6787
   6788out_finish:
   6789	ret = finish_inode_if_needed(sctx, 1);
   6790
   6791out:
   6792	btrfs_free_path(path);
   6793	return ret;
   6794}
   6795
   6796static int replace_node_with_clone(struct btrfs_path *path, int level)
   6797{
   6798	struct extent_buffer *clone;
   6799
   6800	clone = btrfs_clone_extent_buffer(path->nodes[level]);
   6801	if (!clone)
   6802		return -ENOMEM;
   6803
   6804	free_extent_buffer(path->nodes[level]);
   6805	path->nodes[level] = clone;
   6806
   6807	return 0;
   6808}
   6809
   6810static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
   6811{
   6812	struct extent_buffer *eb;
   6813	struct extent_buffer *parent = path->nodes[*level];
   6814	int slot = path->slots[*level];
   6815	const int nritems = btrfs_header_nritems(parent);
   6816	u64 reada_max;
   6817	u64 reada_done = 0;
   6818
   6819	lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
   6820
   6821	BUG_ON(*level == 0);
   6822	eb = btrfs_read_node_slot(parent, slot);
   6823	if (IS_ERR(eb))
   6824		return PTR_ERR(eb);
   6825
   6826	/*
   6827	 * Trigger readahead for the next leaves we will process, so that it is
   6828	 * very likely that when we need them they are already in memory and we
   6829	 * will not block on disk IO. For nodes we only do readahead for one,
   6830	 * since the time window between processing nodes is typically larger.
   6831	 */
   6832	reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize);
   6833
   6834	for (slot++; slot < nritems && reada_done < reada_max; slot++) {
   6835		if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) {
   6836			btrfs_readahead_node_child(parent, slot);
   6837			reada_done += eb->fs_info->nodesize;
   6838		}
   6839	}
   6840
   6841	path->nodes[*level - 1] = eb;
   6842	path->slots[*level - 1] = 0;
   6843	(*level)--;
   6844
   6845	if (*level == 0)
   6846		return replace_node_with_clone(path, 0);
   6847
   6848	return 0;
   6849}
   6850
   6851static int tree_move_next_or_upnext(struct btrfs_path *path,
   6852				    int *level, int root_level)
   6853{
   6854	int ret = 0;
   6855	int nritems;
   6856	nritems = btrfs_header_nritems(path->nodes[*level]);
   6857
   6858	path->slots[*level]++;
   6859
   6860	while (path->slots[*level] >= nritems) {
   6861		if (*level == root_level) {
   6862			path->slots[*level] = nritems - 1;
   6863			return -1;
   6864		}
   6865
   6866		/* move upnext */
   6867		path->slots[*level] = 0;
   6868		free_extent_buffer(path->nodes[*level]);
   6869		path->nodes[*level] = NULL;
   6870		(*level)++;
   6871		path->slots[*level]++;
   6872
   6873		nritems = btrfs_header_nritems(path->nodes[*level]);
   6874		ret = 1;
   6875	}
   6876	return ret;
   6877}
   6878
   6879/*
   6880 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
   6881 * or down.
   6882 */
   6883static int tree_advance(struct btrfs_path *path,
   6884			int *level, int root_level,
   6885			int allow_down,
   6886			struct btrfs_key *key,
   6887			u64 reada_min_gen)
   6888{
   6889	int ret;
   6890
   6891	if (*level == 0 || !allow_down) {
   6892		ret = tree_move_next_or_upnext(path, level, root_level);
   6893	} else {
   6894		ret = tree_move_down(path, level, reada_min_gen);
   6895	}
   6896
   6897	/*
   6898	 * Even if we have reached the end of a tree, ret is -1, update the key
   6899	 * anyway, so that in case we need to restart due to a block group
   6900	 * relocation, we can assert that the last key of the root node still
   6901	 * exists in the tree.
   6902	 */
   6903	if (*level == 0)
   6904		btrfs_item_key_to_cpu(path->nodes[*level], key,
   6905				      path->slots[*level]);
   6906	else
   6907		btrfs_node_key_to_cpu(path->nodes[*level], key,
   6908				      path->slots[*level]);
   6909
   6910	return ret;
   6911}
   6912
   6913static int tree_compare_item(struct btrfs_path *left_path,
   6914			     struct btrfs_path *right_path,
   6915			     char *tmp_buf)
   6916{
   6917	int cmp;
   6918	int len1, len2;
   6919	unsigned long off1, off2;
   6920
   6921	len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]);
   6922	len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]);
   6923	if (len1 != len2)
   6924		return 1;
   6925
   6926	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
   6927	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
   6928				right_path->slots[0]);
   6929
   6930	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
   6931
   6932	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
   6933	if (cmp)
   6934		return 1;
   6935	return 0;
   6936}
   6937
   6938/*
   6939 * A transaction used for relocating a block group was committed or is about to
   6940 * finish its commit. Release our paths and restart the search, so that we are
   6941 * not using stale extent buffers:
   6942 *
   6943 * 1) For levels > 0, we are only holding references of extent buffers, without
   6944 *    any locks on them, which does not prevent them from having been relocated
   6945 *    and reallocated after the last time we released the commit root semaphore.
   6946 *    The exception are the root nodes, for which we always have a clone, see
   6947 *    the comment at btrfs_compare_trees();
   6948 *
   6949 * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
   6950 *    we are safe from the concurrent relocation and reallocation. However they
   6951 *    can have file extent items with a pre relocation disk_bytenr value, so we
   6952 *    restart the start from the current commit roots and clone the new leaves so
   6953 *    that we get the post relocation disk_bytenr values. Not doing so, could
   6954 *    make us clone the wrong data in case there are new extents using the old
   6955 *    disk_bytenr that happen to be shared.
   6956 */
   6957static int restart_after_relocation(struct btrfs_path *left_path,
   6958				    struct btrfs_path *right_path,
   6959				    const struct btrfs_key *left_key,
   6960				    const struct btrfs_key *right_key,
   6961				    int left_level,
   6962				    int right_level,
   6963				    const struct send_ctx *sctx)
   6964{
   6965	int root_level;
   6966	int ret;
   6967
   6968	lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);
   6969
   6970	btrfs_release_path(left_path);
   6971	btrfs_release_path(right_path);
   6972
   6973	/*
   6974	 * Since keys can not be added or removed to/from our roots because they
   6975	 * are readonly and we do not allow deduplication to run in parallel
   6976	 * (which can add, remove or change keys), the layout of the trees should
   6977	 * not change.
   6978	 */
   6979	left_path->lowest_level = left_level;
   6980	ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
   6981	if (ret < 0)
   6982		return ret;
   6983
   6984	right_path->lowest_level = right_level;
   6985	ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
   6986	if (ret < 0)
   6987		return ret;
   6988
   6989	/*
   6990	 * If the lowest level nodes are leaves, clone them so that they can be
   6991	 * safely used by changed_cb() while not under the protection of the
   6992	 * commit root semaphore, even if relocation and reallocation happens in
   6993	 * parallel.
   6994	 */
   6995	if (left_level == 0) {
   6996		ret = replace_node_with_clone(left_path, 0);
   6997		if (ret < 0)
   6998			return ret;
   6999	}
   7000
   7001	if (right_level == 0) {
   7002		ret = replace_node_with_clone(right_path, 0);
   7003		if (ret < 0)
   7004			return ret;
   7005	}
   7006
   7007	/*
   7008	 * Now clone the root nodes (unless they happen to be the leaves we have
   7009	 * already cloned). This is to protect against concurrent snapshotting of
   7010	 * the send and parent roots (see the comment at btrfs_compare_trees()).
   7011	 */
   7012	root_level = btrfs_header_level(sctx->send_root->commit_root);
   7013	if (root_level > 0) {
   7014		ret = replace_node_with_clone(left_path, root_level);
   7015		if (ret < 0)
   7016			return ret;
   7017	}
   7018
   7019	root_level = btrfs_header_level(sctx->parent_root->commit_root);
   7020	if (root_level > 0) {
   7021		ret = replace_node_with_clone(right_path, root_level);
   7022		if (ret < 0)
   7023			return ret;
   7024	}
   7025
   7026	return 0;
   7027}
   7028
   7029/*
   7030 * This function compares two trees and calls the provided callback for
   7031 * every changed/new/deleted item it finds.
   7032 * If shared tree blocks are encountered, whole subtrees are skipped, making
   7033 * the compare pretty fast on snapshotted subvolumes.
   7034 *
   7035 * This currently works on commit roots only. As commit roots are read only,
   7036 * we don't do any locking. The commit roots are protected with transactions.
   7037 * Transactions are ended and rejoined when a commit is tried in between.
   7038 *
   7039 * This function checks for modifications done to the trees while comparing.
   7040 * If it detects a change, it aborts immediately.
   7041 */
   7042static int btrfs_compare_trees(struct btrfs_root *left_root,
   7043			struct btrfs_root *right_root, struct send_ctx *sctx)
   7044{
   7045	struct btrfs_fs_info *fs_info = left_root->fs_info;
   7046	int ret;
   7047	int cmp;
   7048	struct btrfs_path *left_path = NULL;
   7049	struct btrfs_path *right_path = NULL;
   7050	struct btrfs_key left_key;
   7051	struct btrfs_key right_key;
   7052	char *tmp_buf = NULL;
   7053	int left_root_level;
   7054	int right_root_level;
   7055	int left_level;
   7056	int right_level;
   7057	int left_end_reached = 0;
   7058	int right_end_reached = 0;
   7059	int advance_left = 0;
   7060	int advance_right = 0;
   7061	u64 left_blockptr;
   7062	u64 right_blockptr;
   7063	u64 left_gen;
   7064	u64 right_gen;
   7065	u64 reada_min_gen;
   7066
   7067	left_path = btrfs_alloc_path();
   7068	if (!left_path) {
   7069		ret = -ENOMEM;
   7070		goto out;
   7071	}
   7072	right_path = btrfs_alloc_path();
   7073	if (!right_path) {
   7074		ret = -ENOMEM;
   7075		goto out;
   7076	}
   7077
   7078	tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
   7079	if (!tmp_buf) {
   7080		ret = -ENOMEM;
   7081		goto out;
   7082	}
   7083
   7084	left_path->search_commit_root = 1;
   7085	left_path->skip_locking = 1;
   7086	right_path->search_commit_root = 1;
   7087	right_path->skip_locking = 1;
   7088
   7089	/*
   7090	 * Strategy: Go to the first items of both trees. Then do
   7091	 *
   7092	 * If both trees are at level 0
   7093	 *   Compare keys of current items
   7094	 *     If left < right treat left item as new, advance left tree
   7095	 *       and repeat
   7096	 *     If left > right treat right item as deleted, advance right tree
   7097	 *       and repeat
   7098	 *     If left == right do deep compare of items, treat as changed if
   7099	 *       needed, advance both trees and repeat
   7100	 * If both trees are at the same level but not at level 0
   7101	 *   Compare keys of current nodes/leafs
   7102	 *     If left < right advance left tree and repeat
   7103	 *     If left > right advance right tree and repeat
   7104	 *     If left == right compare blockptrs of the next nodes/leafs
   7105	 *       If they match advance both trees but stay at the same level
   7106	 *         and repeat
   7107	 *       If they don't match advance both trees while allowing to go
   7108	 *         deeper and repeat
   7109	 * If tree levels are different
   7110	 *   Advance the tree that needs it and repeat
   7111	 *
   7112	 * Advancing a tree means:
   7113	 *   If we are at level 0, try to go to the next slot. If that's not
   7114	 *   possible, go one level up and repeat. Stop when we found a level
   7115	 *   where we could go to the next slot. We may at this point be on a
   7116	 *   node or a leaf.
   7117	 *
   7118	 *   If we are not at level 0 and not on shared tree blocks, go one
   7119	 *   level deeper.
   7120	 *
   7121	 *   If we are not at level 0 and on shared tree blocks, go one slot to
   7122	 *   the right if possible or go up and right.
   7123	 */
   7124
   7125	down_read(&fs_info->commit_root_sem);
   7126	left_level = btrfs_header_level(left_root->commit_root);
   7127	left_root_level = left_level;
   7128	/*
   7129	 * We clone the root node of the send and parent roots to prevent races
   7130	 * with snapshot creation of these roots. Snapshot creation COWs the
   7131	 * root node of a tree, so after the transaction is committed the old
   7132	 * extent can be reallocated while this send operation is still ongoing.
   7133	 * So we clone them, under the commit root semaphore, to be race free.
   7134	 */
   7135	left_path->nodes[left_level] =
   7136			btrfs_clone_extent_buffer(left_root->commit_root);
   7137	if (!left_path->nodes[left_level]) {
   7138		ret = -ENOMEM;
   7139		goto out_unlock;
   7140	}
   7141
   7142	right_level = btrfs_header_level(right_root->commit_root);
   7143	right_root_level = right_level;
   7144	right_path->nodes[right_level] =
   7145			btrfs_clone_extent_buffer(right_root->commit_root);
   7146	if (!right_path->nodes[right_level]) {
   7147		ret = -ENOMEM;
   7148		goto out_unlock;
   7149	}
   7150	/*
   7151	 * Our right root is the parent root, while the left root is the "send"
   7152	 * root. We know that all new nodes/leaves in the left root must have
   7153	 * a generation greater than the right root's generation, so we trigger
   7154	 * readahead for those nodes and leaves of the left root, as we know we
   7155	 * will need to read them at some point.
   7156	 */
   7157	reada_min_gen = btrfs_header_generation(right_root->commit_root);
   7158
   7159	if (left_level == 0)
   7160		btrfs_item_key_to_cpu(left_path->nodes[left_level],
   7161				&left_key, left_path->slots[left_level]);
   7162	else
   7163		btrfs_node_key_to_cpu(left_path->nodes[left_level],
   7164				&left_key, left_path->slots[left_level]);
   7165	if (right_level == 0)
   7166		btrfs_item_key_to_cpu(right_path->nodes[right_level],
   7167				&right_key, right_path->slots[right_level]);
   7168	else
   7169		btrfs_node_key_to_cpu(right_path->nodes[right_level],
   7170				&right_key, right_path->slots[right_level]);
   7171
   7172	sctx->last_reloc_trans = fs_info->last_reloc_trans;
   7173
   7174	while (1) {
   7175		if (need_resched() ||
   7176		    rwsem_is_contended(&fs_info->commit_root_sem)) {
   7177			up_read(&fs_info->commit_root_sem);
   7178			cond_resched();
   7179			down_read(&fs_info->commit_root_sem);
   7180		}
   7181
   7182		if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
   7183			ret = restart_after_relocation(left_path, right_path,
   7184						       &left_key, &right_key,
   7185						       left_level, right_level,
   7186						       sctx);
   7187			if (ret < 0)
   7188				goto out_unlock;
   7189			sctx->last_reloc_trans = fs_info->last_reloc_trans;
   7190		}
   7191
   7192		if (advance_left && !left_end_reached) {
   7193			ret = tree_advance(left_path, &left_level,
   7194					left_root_level,
   7195					advance_left != ADVANCE_ONLY_NEXT,
   7196					&left_key, reada_min_gen);
   7197			if (ret == -1)
   7198				left_end_reached = ADVANCE;
   7199			else if (ret < 0)
   7200				goto out_unlock;
   7201			advance_left = 0;
   7202		}
   7203		if (advance_right && !right_end_reached) {
   7204			ret = tree_advance(right_path, &right_level,
   7205					right_root_level,
   7206					advance_right != ADVANCE_ONLY_NEXT,
   7207					&right_key, reada_min_gen);
   7208			if (ret == -1)
   7209				right_end_reached = ADVANCE;
   7210			else if (ret < 0)
   7211				goto out_unlock;
   7212			advance_right = 0;
   7213		}
   7214
   7215		if (left_end_reached && right_end_reached) {
   7216			ret = 0;
   7217			goto out_unlock;
   7218		} else if (left_end_reached) {
   7219			if (right_level == 0) {
   7220				up_read(&fs_info->commit_root_sem);
   7221				ret = changed_cb(left_path, right_path,
   7222						&right_key,
   7223						BTRFS_COMPARE_TREE_DELETED,
   7224						sctx);
   7225				if (ret < 0)
   7226					goto out;
   7227				down_read(&fs_info->commit_root_sem);
   7228			}
   7229			advance_right = ADVANCE;
   7230			continue;
   7231		} else if (right_end_reached) {
   7232			if (left_level == 0) {
   7233				up_read(&fs_info->commit_root_sem);
   7234				ret = changed_cb(left_path, right_path,
   7235						&left_key,
   7236						BTRFS_COMPARE_TREE_NEW,
   7237						sctx);
   7238				if (ret < 0)
   7239					goto out;
   7240				down_read(&fs_info->commit_root_sem);
   7241			}
   7242			advance_left = ADVANCE;
   7243			continue;
   7244		}
   7245
   7246		if (left_level == 0 && right_level == 0) {
   7247			up_read(&fs_info->commit_root_sem);
   7248			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
   7249			if (cmp < 0) {
   7250				ret = changed_cb(left_path, right_path,
   7251						&left_key,
   7252						BTRFS_COMPARE_TREE_NEW,
   7253						sctx);
   7254				advance_left = ADVANCE;
   7255			} else if (cmp > 0) {
   7256				ret = changed_cb(left_path, right_path,
   7257						&right_key,
   7258						BTRFS_COMPARE_TREE_DELETED,
   7259						sctx);
   7260				advance_right = ADVANCE;
   7261			} else {
   7262				enum btrfs_compare_tree_result result;
   7263
   7264				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
   7265				ret = tree_compare_item(left_path, right_path,
   7266							tmp_buf);
   7267				if (ret)
   7268					result = BTRFS_COMPARE_TREE_CHANGED;
   7269				else
   7270					result = BTRFS_COMPARE_TREE_SAME;
   7271				ret = changed_cb(left_path, right_path,
   7272						 &left_key, result, sctx);
   7273				advance_left = ADVANCE;
   7274				advance_right = ADVANCE;
   7275			}
   7276
   7277			if (ret < 0)
   7278				goto out;
   7279			down_read(&fs_info->commit_root_sem);
   7280		} else if (left_level == right_level) {
   7281			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
   7282			if (cmp < 0) {
   7283				advance_left = ADVANCE;
   7284			} else if (cmp > 0) {
   7285				advance_right = ADVANCE;
   7286			} else {
   7287				left_blockptr = btrfs_node_blockptr(
   7288						left_path->nodes[left_level],
   7289						left_path->slots[left_level]);
   7290				right_blockptr = btrfs_node_blockptr(
   7291						right_path->nodes[right_level],
   7292						right_path->slots[right_level]);
   7293				left_gen = btrfs_node_ptr_generation(
   7294						left_path->nodes[left_level],
   7295						left_path->slots[left_level]);
   7296				right_gen = btrfs_node_ptr_generation(
   7297						right_path->nodes[right_level],
   7298						right_path->slots[right_level]);
   7299				if (left_blockptr == right_blockptr &&
   7300				    left_gen == right_gen) {
   7301					/*
   7302					 * As we're on a shared block, don't
   7303					 * allow to go deeper.
   7304					 */
   7305					advance_left = ADVANCE_ONLY_NEXT;
   7306					advance_right = ADVANCE_ONLY_NEXT;
   7307				} else {
   7308					advance_left = ADVANCE;
   7309					advance_right = ADVANCE;
   7310				}
   7311			}
   7312		} else if (left_level < right_level) {
   7313			advance_right = ADVANCE;
   7314		} else {
   7315			advance_left = ADVANCE;
   7316		}
   7317	}
   7318
   7319out_unlock:
   7320	up_read(&fs_info->commit_root_sem);
   7321out:
   7322	btrfs_free_path(left_path);
   7323	btrfs_free_path(right_path);
   7324	kvfree(tmp_buf);
   7325	return ret;
   7326}
   7327
   7328static int send_subvol(struct send_ctx *sctx)
   7329{
   7330	int ret;
   7331
   7332	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
   7333		ret = send_header(sctx);
   7334		if (ret < 0)
   7335			goto out;
   7336	}
   7337
   7338	ret = send_subvol_begin(sctx);
   7339	if (ret < 0)
   7340		goto out;
   7341
   7342	if (sctx->parent_root) {
   7343		ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
   7344		if (ret < 0)
   7345			goto out;
   7346		ret = finish_inode_if_needed(sctx, 1);
   7347		if (ret < 0)
   7348			goto out;
   7349	} else {
   7350		ret = full_send_tree(sctx);
   7351		if (ret < 0)
   7352			goto out;
   7353	}
   7354
   7355out:
   7356	free_recorded_refs(sctx);
   7357	return ret;
   7358}
   7359
   7360/*
   7361 * If orphan cleanup did remove any orphans from a root, it means the tree
   7362 * was modified and therefore the commit root is not the same as the current
   7363 * root anymore. This is a problem, because send uses the commit root and
   7364 * therefore can see inode items that don't exist in the current root anymore,
   7365 * and for example make calls to btrfs_iget, which will do tree lookups based
   7366 * on the current root and not on the commit root. Those lookups will fail,
   7367 * returning a -ESTALE error, and making send fail with that error. So make
   7368 * sure a send does not see any orphans we have just removed, and that it will
   7369 * see the same inodes regardless of whether a transaction commit happened
   7370 * before it started (meaning that the commit root will be the same as the
   7371 * current root) or not.
   7372 */
   7373static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
   7374{
   7375	int i;
   7376	struct btrfs_trans_handle *trans = NULL;
   7377
   7378again:
   7379	if (sctx->parent_root &&
   7380	    sctx->parent_root->node != sctx->parent_root->commit_root)
   7381		goto commit_trans;
   7382
   7383	for (i = 0; i < sctx->clone_roots_cnt; i++)
   7384		if (sctx->clone_roots[i].root->node !=
   7385		    sctx->clone_roots[i].root->commit_root)
   7386			goto commit_trans;
   7387
   7388	if (trans)
   7389		return btrfs_end_transaction(trans);
   7390
   7391	return 0;
   7392
   7393commit_trans:
   7394	/* Use any root, all fs roots will get their commit roots updated. */
   7395	if (!trans) {
   7396		trans = btrfs_join_transaction(sctx->send_root);
   7397		if (IS_ERR(trans))
   7398			return PTR_ERR(trans);
   7399		goto again;
   7400	}
   7401
   7402	return btrfs_commit_transaction(trans);
   7403}
   7404
   7405/*
   7406 * Make sure any existing dellaloc is flushed for any root used by a send
   7407 * operation so that we do not miss any data and we do not race with writeback
   7408 * finishing and changing a tree while send is using the tree. This could
   7409 * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
   7410 * a send operation then uses the subvolume.
   7411 * After flushing delalloc ensure_commit_roots_uptodate() must be called.
   7412 */
   7413static int flush_delalloc_roots(struct send_ctx *sctx)
   7414{
   7415	struct btrfs_root *root = sctx->parent_root;
   7416	int ret;
   7417	int i;
   7418
   7419	if (root) {
   7420		ret = btrfs_start_delalloc_snapshot(root, false);
   7421		if (ret)
   7422			return ret;
   7423		btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
   7424	}
   7425
   7426	for (i = 0; i < sctx->clone_roots_cnt; i++) {
   7427		root = sctx->clone_roots[i].root;
   7428		ret = btrfs_start_delalloc_snapshot(root, false);
   7429		if (ret)
   7430			return ret;
   7431		btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
   7432	}
   7433
   7434	return 0;
   7435}
   7436
   7437static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
   7438{
   7439	spin_lock(&root->root_item_lock);
   7440	root->send_in_progress--;
   7441	/*
   7442	 * Not much left to do, we don't know why it's unbalanced and
   7443	 * can't blindly reset it to 0.
   7444	 */
   7445	if (root->send_in_progress < 0)
   7446		btrfs_err(root->fs_info,
   7447			  "send_in_progress unbalanced %d root %llu",
   7448			  root->send_in_progress, root->root_key.objectid);
   7449	spin_unlock(&root->root_item_lock);
   7450}
   7451
   7452static void dedupe_in_progress_warn(const struct btrfs_root *root)
   7453{
   7454	btrfs_warn_rl(root->fs_info,
   7455"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
   7456		      root->root_key.objectid, root->dedupe_in_progress);
   7457}
   7458
   7459long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
   7460{
   7461	int ret = 0;
   7462	struct btrfs_root *send_root = BTRFS_I(inode)->root;
   7463	struct btrfs_fs_info *fs_info = send_root->fs_info;
   7464	struct btrfs_root *clone_root;
   7465	struct send_ctx *sctx = NULL;
   7466	u32 i;
   7467	u64 *clone_sources_tmp = NULL;
   7468	int clone_sources_to_rollback = 0;
   7469	size_t alloc_size;
   7470	int sort_clone_roots = 0;
   7471
   7472	if (!capable(CAP_SYS_ADMIN))
   7473		return -EPERM;
   7474
   7475	/*
   7476	 * The subvolume must remain read-only during send, protect against
   7477	 * making it RW. This also protects against deletion.
   7478	 */
   7479	spin_lock(&send_root->root_item_lock);
   7480	if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) {
   7481		dedupe_in_progress_warn(send_root);
   7482		spin_unlock(&send_root->root_item_lock);
   7483		return -EAGAIN;
   7484	}
   7485	send_root->send_in_progress++;
   7486	spin_unlock(&send_root->root_item_lock);
   7487
   7488	/*
   7489	 * Userspace tools do the checks and warn the user if it's
   7490	 * not RO.
   7491	 */
   7492	if (!btrfs_root_readonly(send_root)) {
   7493		ret = -EPERM;
   7494		goto out;
   7495	}
   7496
   7497	/*
   7498	 * Check that we don't overflow at later allocations, we request
   7499	 * clone_sources_count + 1 items, and compare to unsigned long inside
   7500	 * access_ok.
   7501	 */
   7502	if (arg->clone_sources_count >
   7503	    ULONG_MAX / sizeof(struct clone_root) - 1) {
   7504		ret = -EINVAL;
   7505		goto out;
   7506	}
   7507
   7508	if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
   7509		ret = -EINVAL;
   7510		goto out;
   7511	}
   7512
   7513	sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
   7514	if (!sctx) {
   7515		ret = -ENOMEM;
   7516		goto out;
   7517	}
   7518
   7519	INIT_LIST_HEAD(&sctx->new_refs);
   7520	INIT_LIST_HEAD(&sctx->deleted_refs);
   7521	xa_init_flags(&sctx->name_cache, GFP_KERNEL);
   7522	INIT_LIST_HEAD(&sctx->name_cache_list);
   7523
   7524	sctx->flags = arg->flags;
   7525
   7526	if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
   7527		if (arg->version > BTRFS_SEND_STREAM_VERSION) {
   7528			ret = -EPROTO;
   7529			goto out;
   7530		}
   7531		/* Zero means "use the highest version" */
   7532		sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
   7533	} else {
   7534		sctx->proto = 1;
   7535	}
   7536
   7537	sctx->send_filp = fget(arg->send_fd);
   7538	if (!sctx->send_filp) {
   7539		ret = -EBADF;
   7540		goto out;
   7541	}
   7542
   7543	sctx->send_root = send_root;
   7544	/*
   7545	 * Unlikely but possible, if the subvolume is marked for deletion but
   7546	 * is slow to remove the directory entry, send can still be started
   7547	 */
   7548	if (btrfs_root_dead(sctx->send_root)) {
   7549		ret = -EPERM;
   7550		goto out;
   7551	}
   7552
   7553	sctx->clone_roots_cnt = arg->clone_sources_count;
   7554
   7555	sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
   7556	sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
   7557	if (!sctx->send_buf) {
   7558		ret = -ENOMEM;
   7559		goto out;
   7560	}
   7561
   7562	sctx->pending_dir_moves = RB_ROOT;
   7563	sctx->waiting_dir_moves = RB_ROOT;
   7564	sctx->orphan_dirs = RB_ROOT;
   7565
   7566	sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
   7567				     arg->clone_sources_count + 1,
   7568				     GFP_KERNEL);
   7569	if (!sctx->clone_roots) {
   7570		ret = -ENOMEM;
   7571		goto out;
   7572	}
   7573
   7574	alloc_size = array_size(sizeof(*arg->clone_sources),
   7575				arg->clone_sources_count);
   7576
   7577	if (arg->clone_sources_count) {
   7578		clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
   7579		if (!clone_sources_tmp) {
   7580			ret = -ENOMEM;
   7581			goto out;
   7582		}
   7583
   7584		ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
   7585				alloc_size);
   7586		if (ret) {
   7587			ret = -EFAULT;
   7588			goto out;
   7589		}
   7590
   7591		for (i = 0; i < arg->clone_sources_count; i++) {
   7592			clone_root = btrfs_get_fs_root(fs_info,
   7593						clone_sources_tmp[i], true);
   7594			if (IS_ERR(clone_root)) {
   7595				ret = PTR_ERR(clone_root);
   7596				goto out;
   7597			}
   7598			spin_lock(&clone_root->root_item_lock);
   7599			if (!btrfs_root_readonly(clone_root) ||
   7600			    btrfs_root_dead(clone_root)) {
   7601				spin_unlock(&clone_root->root_item_lock);
   7602				btrfs_put_root(clone_root);
   7603				ret = -EPERM;
   7604				goto out;
   7605			}
   7606			if (clone_root->dedupe_in_progress) {
   7607				dedupe_in_progress_warn(clone_root);
   7608				spin_unlock(&clone_root->root_item_lock);
   7609				btrfs_put_root(clone_root);
   7610				ret = -EAGAIN;
   7611				goto out;
   7612			}
   7613			clone_root->send_in_progress++;
   7614			spin_unlock(&clone_root->root_item_lock);
   7615
   7616			sctx->clone_roots[i].root = clone_root;
   7617			clone_sources_to_rollback = i + 1;
   7618		}
   7619		kvfree(clone_sources_tmp);
   7620		clone_sources_tmp = NULL;
   7621	}
   7622
   7623	if (arg->parent_root) {
   7624		sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root,
   7625						      true);
   7626		if (IS_ERR(sctx->parent_root)) {
   7627			ret = PTR_ERR(sctx->parent_root);
   7628			goto out;
   7629		}
   7630
   7631		spin_lock(&sctx->parent_root->root_item_lock);
   7632		sctx->parent_root->send_in_progress++;
   7633		if (!btrfs_root_readonly(sctx->parent_root) ||
   7634				btrfs_root_dead(sctx->parent_root)) {
   7635			spin_unlock(&sctx->parent_root->root_item_lock);
   7636			ret = -EPERM;
   7637			goto out;
   7638		}
   7639		if (sctx->parent_root->dedupe_in_progress) {
   7640			dedupe_in_progress_warn(sctx->parent_root);
   7641			spin_unlock(&sctx->parent_root->root_item_lock);
   7642			ret = -EAGAIN;
   7643			goto out;
   7644		}
   7645		spin_unlock(&sctx->parent_root->root_item_lock);
   7646	}
   7647
   7648	/*
   7649	 * Clones from send_root are allowed, but only if the clone source
   7650	 * is behind the current send position. This is checked while searching
   7651	 * for possible clone sources.
   7652	 */
   7653	sctx->clone_roots[sctx->clone_roots_cnt++].root =
   7654		btrfs_grab_root(sctx->send_root);
   7655
   7656	/* We do a bsearch later */
   7657	sort(sctx->clone_roots, sctx->clone_roots_cnt,
   7658			sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
   7659			NULL);
   7660	sort_clone_roots = 1;
   7661
   7662	ret = flush_delalloc_roots(sctx);
   7663	if (ret)
   7664		goto out;
   7665
   7666	ret = ensure_commit_roots_uptodate(sctx);
   7667	if (ret)
   7668		goto out;
   7669
   7670	ret = send_subvol(sctx);
   7671	if (ret < 0)
   7672		goto out;
   7673
   7674	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
   7675		ret = begin_cmd(sctx, BTRFS_SEND_C_END);
   7676		if (ret < 0)
   7677			goto out;
   7678		ret = send_cmd(sctx);
   7679		if (ret < 0)
   7680			goto out;
   7681	}
   7682
   7683out:
   7684	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
   7685	while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
   7686		struct rb_node *n;
   7687		struct pending_dir_move *pm;
   7688
   7689		n = rb_first(&sctx->pending_dir_moves);
   7690		pm = rb_entry(n, struct pending_dir_move, node);
   7691		while (!list_empty(&pm->list)) {
   7692			struct pending_dir_move *pm2;
   7693
   7694			pm2 = list_first_entry(&pm->list,
   7695					       struct pending_dir_move, list);
   7696			free_pending_move(sctx, pm2);
   7697		}
   7698		free_pending_move(sctx, pm);
   7699	}
   7700
   7701	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
   7702	while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
   7703		struct rb_node *n;
   7704		struct waiting_dir_move *dm;
   7705
   7706		n = rb_first(&sctx->waiting_dir_moves);
   7707		dm = rb_entry(n, struct waiting_dir_move, node);
   7708		rb_erase(&dm->node, &sctx->waiting_dir_moves);
   7709		kfree(dm);
   7710	}
   7711
   7712	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
   7713	while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
   7714		struct rb_node *n;
   7715		struct orphan_dir_info *odi;
   7716
   7717		n = rb_first(&sctx->orphan_dirs);
   7718		odi = rb_entry(n, struct orphan_dir_info, node);
   7719		free_orphan_dir_info(sctx, odi);
   7720	}
   7721
   7722	if (sort_clone_roots) {
   7723		for (i = 0; i < sctx->clone_roots_cnt; i++) {
   7724			btrfs_root_dec_send_in_progress(
   7725					sctx->clone_roots[i].root);
   7726			btrfs_put_root(sctx->clone_roots[i].root);
   7727		}
   7728	} else {
   7729		for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
   7730			btrfs_root_dec_send_in_progress(
   7731					sctx->clone_roots[i].root);
   7732			btrfs_put_root(sctx->clone_roots[i].root);
   7733		}
   7734
   7735		btrfs_root_dec_send_in_progress(send_root);
   7736	}
   7737	if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
   7738		btrfs_root_dec_send_in_progress(sctx->parent_root);
   7739		btrfs_put_root(sctx->parent_root);
   7740	}
   7741
   7742	kvfree(clone_sources_tmp);
   7743
   7744	if (sctx) {
   7745		if (sctx->send_filp)
   7746			fput(sctx->send_filp);
   7747
   7748		kvfree(sctx->clone_roots);
   7749		kvfree(sctx->send_buf);
   7750
   7751		name_cache_free(sctx);
   7752
   7753		close_current_inode(sctx);
   7754
   7755		kfree(sctx);
   7756	}
   7757
   7758	return ret;
   7759}