cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

tree-mod-log.c (24048B)


      1// SPDX-License-Identifier: GPL-2.0
      2
      3#include "tree-mod-log.h"
      4#include "disk-io.h"
      5
      6struct tree_mod_root {
      7	u64 logical;
      8	u8 level;
      9};
     10
     11struct tree_mod_elem {
     12	struct rb_node node;
     13	u64 logical;
     14	u64 seq;
     15	enum btrfs_mod_log_op op;
     16
     17	/*
     18	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
     19	 * operations.
     20	 */
     21	int slot;
     22
     23	/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
     24	u64 generation;
     25
     26	/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
     27	struct btrfs_disk_key key;
     28	u64 blockptr;
     29
     30	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
     31	struct {
     32		int dst_slot;
     33		int nr_items;
     34	} move;
     35
     36	/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
     37	struct tree_mod_root old_root;
     38};
     39
     40/*
     41 * Pull a new tree mod seq number for our operation.
     42 */
     43static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
     44{
     45	return atomic64_inc_return(&fs_info->tree_mod_seq);
     46}
     47
     48/*
     49 * This adds a new blocker to the tree mod log's blocker list if the @elem
     50 * passed does not already have a sequence number set. So when a caller expects
     51 * to record tree modifications, it should ensure to set elem->seq to zero
     52 * before calling btrfs_get_tree_mod_seq.
     53 * Returns a fresh, unused tree log modification sequence number, even if no new
     54 * blocker was added.
     55 */
     56u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
     57			   struct btrfs_seq_list *elem)
     58{
     59	write_lock(&fs_info->tree_mod_log_lock);
     60	if (!elem->seq) {
     61		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
     62		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
     63		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
     64	}
     65	write_unlock(&fs_info->tree_mod_log_lock);
     66
     67	return elem->seq;
     68}
     69
     70void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
     71			    struct btrfs_seq_list *elem)
     72{
     73	struct rb_root *tm_root;
     74	struct rb_node *node;
     75	struct rb_node *next;
     76	struct tree_mod_elem *tm;
     77	u64 min_seq = BTRFS_SEQ_LAST;
     78	u64 seq_putting = elem->seq;
     79
     80	if (!seq_putting)
     81		return;
     82
     83	write_lock(&fs_info->tree_mod_log_lock);
     84	list_del(&elem->list);
     85	elem->seq = 0;
     86
     87	if (list_empty(&fs_info->tree_mod_seq_list)) {
     88		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
     89	} else {
     90		struct btrfs_seq_list *first;
     91
     92		first = list_first_entry(&fs_info->tree_mod_seq_list,
     93					 struct btrfs_seq_list, list);
     94		if (seq_putting > first->seq) {
     95			/*
     96			 * Blocker with lower sequence number exists, we cannot
     97			 * remove anything from the log.
     98			 */
     99			write_unlock(&fs_info->tree_mod_log_lock);
    100			return;
    101		}
    102		min_seq = first->seq;
    103	}
    104
    105	/*
    106	 * Anything that's lower than the lowest existing (read: blocked)
    107	 * sequence number can be removed from the tree.
    108	 */
    109	tm_root = &fs_info->tree_mod_log;
    110	for (node = rb_first(tm_root); node; node = next) {
    111		next = rb_next(node);
    112		tm = rb_entry(node, struct tree_mod_elem, node);
    113		if (tm->seq >= min_seq)
    114			continue;
    115		rb_erase(node, tm_root);
    116		kfree(tm);
    117	}
    118	write_unlock(&fs_info->tree_mod_log_lock);
    119}
    120
    121/*
    122 * Key order of the log:
    123 *       node/leaf start address -> sequence
    124 *
    125 * The 'start address' is the logical address of the *new* root node for root
    126 * replace operations, or the logical address of the affected block for all
    127 * other operations.
    128 */
    129static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
    130					struct tree_mod_elem *tm)
    131{
    132	struct rb_root *tm_root;
    133	struct rb_node **new;
    134	struct rb_node *parent = NULL;
    135	struct tree_mod_elem *cur;
    136
    137	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
    138
    139	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
    140
    141	tm_root = &fs_info->tree_mod_log;
    142	new = &tm_root->rb_node;
    143	while (*new) {
    144		cur = rb_entry(*new, struct tree_mod_elem, node);
    145		parent = *new;
    146		if (cur->logical < tm->logical)
    147			new = &((*new)->rb_left);
    148		else if (cur->logical > tm->logical)
    149			new = &((*new)->rb_right);
    150		else if (cur->seq < tm->seq)
    151			new = &((*new)->rb_left);
    152		else if (cur->seq > tm->seq)
    153			new = &((*new)->rb_right);
    154		else
    155			return -EEXIST;
    156	}
    157
    158	rb_link_node(&tm->node, parent, new);
    159	rb_insert_color(&tm->node, tm_root);
    160	return 0;
    161}
    162
    163/*
    164 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
    165 * returns false with the tree_mod_log_lock acquired. The caller must hold
    166 * this until all tree mod log insertions are recorded in the rb tree and then
    167 * write unlock fs_info::tree_mod_log_lock.
    168 */
    169static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
    170				    struct extent_buffer *eb)
    171{
    172	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
    173		return true;
    174	if (eb && btrfs_header_level(eb) == 0)
    175		return true;
    176
    177	write_lock(&fs_info->tree_mod_log_lock);
    178	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
    179		write_unlock(&fs_info->tree_mod_log_lock);
    180		return true;
    181	}
    182
    183	return false;
    184}
    185
    186/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
    187static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
    188				    struct extent_buffer *eb)
    189{
    190	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
    191		return false;
    192	if (eb && btrfs_header_level(eb) == 0)
    193		return false;
    194
    195	return true;
    196}
    197
    198static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
    199						 int slot,
    200						 enum btrfs_mod_log_op op,
    201						 gfp_t flags)
    202{
    203	struct tree_mod_elem *tm;
    204
    205	tm = kzalloc(sizeof(*tm), flags);
    206	if (!tm)
    207		return NULL;
    208
    209	tm->logical = eb->start;
    210	if (op != BTRFS_MOD_LOG_KEY_ADD) {
    211		btrfs_node_key(eb, &tm->key, slot);
    212		tm->blockptr = btrfs_node_blockptr(eb, slot);
    213	}
    214	tm->op = op;
    215	tm->slot = slot;
    216	tm->generation = btrfs_node_ptr_generation(eb, slot);
    217	RB_CLEAR_NODE(&tm->node);
    218
    219	return tm;
    220}
    221
    222int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
    223				  enum btrfs_mod_log_op op, gfp_t flags)
    224{
    225	struct tree_mod_elem *tm;
    226	int ret;
    227
    228	if (!tree_mod_need_log(eb->fs_info, eb))
    229		return 0;
    230
    231	tm = alloc_tree_mod_elem(eb, slot, op, flags);
    232	if (!tm)
    233		return -ENOMEM;
    234
    235	if (tree_mod_dont_log(eb->fs_info, eb)) {
    236		kfree(tm);
    237		return 0;
    238	}
    239
    240	ret = tree_mod_log_insert(eb->fs_info, tm);
    241	write_unlock(&eb->fs_info->tree_mod_log_lock);
    242	if (ret)
    243		kfree(tm);
    244
    245	return ret;
    246}
    247
    248int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
    249				   int dst_slot, int src_slot,
    250				   int nr_items)
    251{
    252	struct tree_mod_elem *tm = NULL;
    253	struct tree_mod_elem **tm_list = NULL;
    254	int ret = 0;
    255	int i;
    256	bool locked = false;
    257
    258	if (!tree_mod_need_log(eb->fs_info, eb))
    259		return 0;
    260
    261	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
    262	if (!tm_list)
    263		return -ENOMEM;
    264
    265	tm = kzalloc(sizeof(*tm), GFP_NOFS);
    266	if (!tm) {
    267		ret = -ENOMEM;
    268		goto free_tms;
    269	}
    270
    271	tm->logical = eb->start;
    272	tm->slot = src_slot;
    273	tm->move.dst_slot = dst_slot;
    274	tm->move.nr_items = nr_items;
    275	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
    276
    277	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
    278		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
    279				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
    280		if (!tm_list[i]) {
    281			ret = -ENOMEM;
    282			goto free_tms;
    283		}
    284	}
    285
    286	if (tree_mod_dont_log(eb->fs_info, eb))
    287		goto free_tms;
    288	locked = true;
    289
    290	/*
    291	 * When we override something during the move, we log these removals.
    292	 * This can only happen when we move towards the beginning of the
    293	 * buffer, i.e. dst_slot < src_slot.
    294	 */
    295	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
    296		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
    297		if (ret)
    298			goto free_tms;
    299	}
    300
    301	ret = tree_mod_log_insert(eb->fs_info, tm);
    302	if (ret)
    303		goto free_tms;
    304	write_unlock(&eb->fs_info->tree_mod_log_lock);
    305	kfree(tm_list);
    306
    307	return 0;
    308
    309free_tms:
    310	for (i = 0; i < nr_items; i++) {
    311		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
    312			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
    313		kfree(tm_list[i]);
    314	}
    315	if (locked)
    316		write_unlock(&eb->fs_info->tree_mod_log_lock);
    317	kfree(tm_list);
    318	kfree(tm);
    319
    320	return ret;
    321}
    322
    323static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
    324				       struct tree_mod_elem **tm_list,
    325				       int nritems)
    326{
    327	int i, j;
    328	int ret;
    329
    330	for (i = nritems - 1; i >= 0; i--) {
    331		ret = tree_mod_log_insert(fs_info, tm_list[i]);
    332		if (ret) {
    333			for (j = nritems - 1; j > i; j--)
    334				rb_erase(&tm_list[j]->node,
    335					 &fs_info->tree_mod_log);
    336			return ret;
    337		}
    338	}
    339
    340	return 0;
    341}
    342
    343int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
    344				   struct extent_buffer *new_root,
    345				   bool log_removal)
    346{
    347	struct btrfs_fs_info *fs_info = old_root->fs_info;
    348	struct tree_mod_elem *tm = NULL;
    349	struct tree_mod_elem **tm_list = NULL;
    350	int nritems = 0;
    351	int ret = 0;
    352	int i;
    353
    354	if (!tree_mod_need_log(fs_info, NULL))
    355		return 0;
    356
    357	if (log_removal && btrfs_header_level(old_root) > 0) {
    358		nritems = btrfs_header_nritems(old_root);
    359		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
    360				  GFP_NOFS);
    361		if (!tm_list) {
    362			ret = -ENOMEM;
    363			goto free_tms;
    364		}
    365		for (i = 0; i < nritems; i++) {
    366			tm_list[i] = alloc_tree_mod_elem(old_root, i,
    367			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
    368			if (!tm_list[i]) {
    369				ret = -ENOMEM;
    370				goto free_tms;
    371			}
    372		}
    373	}
    374
    375	tm = kzalloc(sizeof(*tm), GFP_NOFS);
    376	if (!tm) {
    377		ret = -ENOMEM;
    378		goto free_tms;
    379	}
    380
    381	tm->logical = new_root->start;
    382	tm->old_root.logical = old_root->start;
    383	tm->old_root.level = btrfs_header_level(old_root);
    384	tm->generation = btrfs_header_generation(old_root);
    385	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;
    386
    387	if (tree_mod_dont_log(fs_info, NULL))
    388		goto free_tms;
    389
    390	if (tm_list)
    391		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
    392	if (!ret)
    393		ret = tree_mod_log_insert(fs_info, tm);
    394
    395	write_unlock(&fs_info->tree_mod_log_lock);
    396	if (ret)
    397		goto free_tms;
    398	kfree(tm_list);
    399
    400	return ret;
    401
    402free_tms:
    403	if (tm_list) {
    404		for (i = 0; i < nritems; i++)
    405			kfree(tm_list[i]);
    406		kfree(tm_list);
    407	}
    408	kfree(tm);
    409
    410	return ret;
    411}
    412
    413static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
    414						   u64 start, u64 min_seq,
    415						   bool smallest)
    416{
    417	struct rb_root *tm_root;
    418	struct rb_node *node;
    419	struct tree_mod_elem *cur = NULL;
    420	struct tree_mod_elem *found = NULL;
    421
    422	read_lock(&fs_info->tree_mod_log_lock);
    423	tm_root = &fs_info->tree_mod_log;
    424	node = tm_root->rb_node;
    425	while (node) {
    426		cur = rb_entry(node, struct tree_mod_elem, node);
    427		if (cur->logical < start) {
    428			node = node->rb_left;
    429		} else if (cur->logical > start) {
    430			node = node->rb_right;
    431		} else if (cur->seq < min_seq) {
    432			node = node->rb_left;
    433		} else if (!smallest) {
    434			/* We want the node with the highest seq */
    435			if (found)
    436				BUG_ON(found->seq > cur->seq);
    437			found = cur;
    438			node = node->rb_left;
    439		} else if (cur->seq > min_seq) {
    440			/* We want the node with the smallest seq */
    441			if (found)
    442				BUG_ON(found->seq < cur->seq);
    443			found = cur;
    444			node = node->rb_right;
    445		} else {
    446			found = cur;
    447			break;
    448		}
    449	}
    450	read_unlock(&fs_info->tree_mod_log_lock);
    451
    452	return found;
    453}
    454
    455/*
    456 * This returns the element from the log with the smallest time sequence
    457 * value that's in the log (the oldest log item). Any element with a time
    458 * sequence lower than min_seq will be ignored.
    459 */
    460static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
    461							u64 start, u64 min_seq)
    462{
    463	return __tree_mod_log_search(fs_info, start, min_seq, true);
    464}
    465
    466/*
    467 * This returns the element from the log with the largest time sequence
    468 * value that's in the log (the most recent log item). Any element with
    469 * a time sequence lower than min_seq will be ignored.
    470 */
    471static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
    472						 u64 start, u64 min_seq)
    473{
    474	return __tree_mod_log_search(fs_info, start, min_seq, false);
    475}
    476
    477int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
    478			       struct extent_buffer *src,
    479			       unsigned long dst_offset,
    480			       unsigned long src_offset,
    481			       int nr_items)
    482{
    483	struct btrfs_fs_info *fs_info = dst->fs_info;
    484	int ret = 0;
    485	struct tree_mod_elem **tm_list = NULL;
    486	struct tree_mod_elem **tm_list_add, **tm_list_rem;
    487	int i;
    488	bool locked = false;
    489
    490	if (!tree_mod_need_log(fs_info, NULL))
    491		return 0;
    492
    493	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
    494		return 0;
    495
    496	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
    497			  GFP_NOFS);
    498	if (!tm_list)
    499		return -ENOMEM;
    500
    501	tm_list_add = tm_list;
    502	tm_list_rem = tm_list + nr_items;
    503	for (i = 0; i < nr_items; i++) {
    504		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
    505		    BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
    506		if (!tm_list_rem[i]) {
    507			ret = -ENOMEM;
    508			goto free_tms;
    509		}
    510
    511		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
    512						BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
    513		if (!tm_list_add[i]) {
    514			ret = -ENOMEM;
    515			goto free_tms;
    516		}
    517	}
    518
    519	if (tree_mod_dont_log(fs_info, NULL))
    520		goto free_tms;
    521	locked = true;
    522
    523	for (i = 0; i < nr_items; i++) {
    524		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
    525		if (ret)
    526			goto free_tms;
    527		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
    528		if (ret)
    529			goto free_tms;
    530	}
    531
    532	write_unlock(&fs_info->tree_mod_log_lock);
    533	kfree(tm_list);
    534
    535	return 0;
    536
    537free_tms:
    538	for (i = 0; i < nr_items * 2; i++) {
    539		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
    540			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
    541		kfree(tm_list[i]);
    542	}
    543	if (locked)
    544		write_unlock(&fs_info->tree_mod_log_lock);
    545	kfree(tm_list);
    546
    547	return ret;
    548}
    549
    550int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
    551{
    552	struct tree_mod_elem **tm_list = NULL;
    553	int nritems = 0;
    554	int i;
    555	int ret = 0;
    556
    557	if (!tree_mod_need_log(eb->fs_info, eb))
    558		return 0;
    559
    560	nritems = btrfs_header_nritems(eb);
    561	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
    562	if (!tm_list)
    563		return -ENOMEM;
    564
    565	for (i = 0; i < nritems; i++) {
    566		tm_list[i] = alloc_tree_mod_elem(eb, i,
    567		    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
    568		if (!tm_list[i]) {
    569			ret = -ENOMEM;
    570			goto free_tms;
    571		}
    572	}
    573
    574	if (tree_mod_dont_log(eb->fs_info, eb))
    575		goto free_tms;
    576
    577	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
    578	write_unlock(&eb->fs_info->tree_mod_log_lock);
    579	if (ret)
    580		goto free_tms;
    581	kfree(tm_list);
    582
    583	return 0;
    584
    585free_tms:
    586	for (i = 0; i < nritems; i++)
    587		kfree(tm_list[i]);
    588	kfree(tm_list);
    589
    590	return ret;
    591}
    592
    593/*
    594 * Returns the logical address of the oldest predecessor of the given root.
    595 * Entries older than time_seq are ignored.
    596 */
    597static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
    598						      u64 time_seq)
    599{
    600	struct tree_mod_elem *tm;
    601	struct tree_mod_elem *found = NULL;
    602	u64 root_logical = eb_root->start;
    603	bool looped = false;
    604
    605	if (!time_seq)
    606		return NULL;
    607
    608	/*
    609	 * The very last operation that's logged for a root is the replacement
    610	 * operation (if it is replaced at all). This has the logical address
    611	 * of the *new* root, making it the very first operation that's logged
    612	 * for this root.
    613	 */
    614	while (1) {
    615		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
    616						time_seq);
    617		if (!looped && !tm)
    618			return NULL;
    619		/*
    620		 * If there are no tree operation for the oldest root, we simply
    621		 * return it. This should only happen if that (old) root is at
    622		 * level 0.
    623		 */
    624		if (!tm)
    625			break;
    626
    627		/*
    628		 * If there's an operation that's not a root replacement, we
    629		 * found the oldest version of our root. Normally, we'll find a
    630		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
    631		 */
    632		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
    633			break;
    634
    635		found = tm;
    636		root_logical = tm->old_root.logical;
    637		looped = true;
    638	}
    639
    640	/* If there's no old root to return, return what we found instead */
    641	if (!found)
    642		found = tm;
    643
    644	return found;
    645}
    646
    647
    648/*
    649 * tm is a pointer to the first operation to rewind within eb. Then, all
    650 * previous operations will be rewound (until we reach something older than
    651 * time_seq).
    652 */
    653static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
    654				struct extent_buffer *eb,
    655				u64 time_seq,
    656				struct tree_mod_elem *first_tm)
    657{
    658	u32 n;
    659	struct rb_node *next;
    660	struct tree_mod_elem *tm = first_tm;
    661	unsigned long o_dst;
    662	unsigned long o_src;
    663	unsigned long p_size = sizeof(struct btrfs_key_ptr);
    664
    665	n = btrfs_header_nritems(eb);
    666	read_lock(&fs_info->tree_mod_log_lock);
    667	while (tm && tm->seq >= time_seq) {
    668		/*
    669		 * All the operations are recorded with the operator used for
    670		 * the modification. As we're going backwards, we do the
    671		 * opposite of each operation here.
    672		 */
    673		switch (tm->op) {
    674		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
    675			BUG_ON(tm->slot < n);
    676			fallthrough;
    677		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
    678		case BTRFS_MOD_LOG_KEY_REMOVE:
    679			btrfs_set_node_key(eb, &tm->key, tm->slot);
    680			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
    681			btrfs_set_node_ptr_generation(eb, tm->slot,
    682						      tm->generation);
    683			n++;
    684			break;
    685		case BTRFS_MOD_LOG_KEY_REPLACE:
    686			BUG_ON(tm->slot >= n);
    687			btrfs_set_node_key(eb, &tm->key, tm->slot);
    688			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
    689			btrfs_set_node_ptr_generation(eb, tm->slot,
    690						      tm->generation);
    691			break;
    692		case BTRFS_MOD_LOG_KEY_ADD:
    693			/* if a move operation is needed it's in the log */
    694			n--;
    695			break;
    696		case BTRFS_MOD_LOG_MOVE_KEYS:
    697			o_dst = btrfs_node_key_ptr_offset(tm->slot);
    698			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
    699			memmove_extent_buffer(eb, o_dst, o_src,
    700					      tm->move.nr_items * p_size);
    701			break;
    702		case BTRFS_MOD_LOG_ROOT_REPLACE:
    703			/*
    704			 * This operation is special. For roots, this must be
    705			 * handled explicitly before rewinding.
    706			 * For non-roots, this operation may exist if the node
    707			 * was a root: root A -> child B; then A gets empty and
    708			 * B is promoted to the new root. In the mod log, we'll
    709			 * have a root-replace operation for B, a tree block
    710			 * that is no root. We simply ignore that operation.
    711			 */
    712			break;
    713		}
    714		next = rb_next(&tm->node);
    715		if (!next)
    716			break;
    717		tm = rb_entry(next, struct tree_mod_elem, node);
    718		if (tm->logical != first_tm->logical)
    719			break;
    720	}
    721	read_unlock(&fs_info->tree_mod_log_lock);
    722	btrfs_set_header_nritems(eb, n);
    723}
    724
    725/*
    726 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
    727 * is returned. If rewind operations happen, a fresh buffer is returned. The
    728 * returned buffer is always read-locked. If the returned buffer is not the
    729 * input buffer, the lock on the input buffer is released and the input buffer
    730 * is freed (its refcount is decremented).
    731 */
    732struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
    733						struct btrfs_path *path,
    734						struct extent_buffer *eb,
    735						u64 time_seq)
    736{
    737	struct extent_buffer *eb_rewin;
    738	struct tree_mod_elem *tm;
    739
    740	if (!time_seq)
    741		return eb;
    742
    743	if (btrfs_header_level(eb) == 0)
    744		return eb;
    745
    746	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
    747	if (!tm)
    748		return eb;
    749
    750	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
    751		BUG_ON(tm->slot != 0);
    752		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
    753		if (!eb_rewin) {
    754			btrfs_tree_read_unlock(eb);
    755			free_extent_buffer(eb);
    756			return NULL;
    757		}
    758		btrfs_set_header_bytenr(eb_rewin, eb->start);
    759		btrfs_set_header_backref_rev(eb_rewin,
    760					     btrfs_header_backref_rev(eb));
    761		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
    762		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
    763	} else {
    764		eb_rewin = btrfs_clone_extent_buffer(eb);
    765		if (!eb_rewin) {
    766			btrfs_tree_read_unlock(eb);
    767			free_extent_buffer(eb);
    768			return NULL;
    769		}
    770	}
    771
    772	btrfs_tree_read_unlock(eb);
    773	free_extent_buffer(eb);
    774
    775	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
    776				       eb_rewin, btrfs_header_level(eb_rewin));
    777	btrfs_tree_read_lock(eb_rewin);
    778	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
    779	WARN_ON(btrfs_header_nritems(eb_rewin) >
    780		BTRFS_NODEPTRS_PER_BLOCK(fs_info));
    781
    782	return eb_rewin;
    783}
    784
    785/*
    786 * Rewind the state of @root's root node to the given @time_seq value.
    787 * If there are no changes, the current root->root_node is returned. If anything
    788 * changed in between, there's a fresh buffer allocated on which the rewind
    789 * operations are done. In any case, the returned buffer is read locked.
    790 * Returns NULL on error (with no locks held).
    791 */
    792struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
    793{
    794	struct btrfs_fs_info *fs_info = root->fs_info;
    795	struct tree_mod_elem *tm;
    796	struct extent_buffer *eb = NULL;
    797	struct extent_buffer *eb_root;
    798	u64 eb_root_owner = 0;
    799	struct extent_buffer *old;
    800	struct tree_mod_root *old_root = NULL;
    801	u64 old_generation = 0;
    802	u64 logical;
    803	int level;
    804
    805	eb_root = btrfs_read_lock_root_node(root);
    806	tm = tree_mod_log_oldest_root(eb_root, time_seq);
    807	if (!tm)
    808		return eb_root;
    809
    810	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
    811		old_root = &tm->old_root;
    812		old_generation = tm->generation;
    813		logical = old_root->logical;
    814		level = old_root->level;
    815	} else {
    816		logical = eb_root->start;
    817		level = btrfs_header_level(eb_root);
    818	}
    819
    820	tm = tree_mod_log_search(fs_info, logical, time_seq);
    821	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
    822		btrfs_tree_read_unlock(eb_root);
    823		free_extent_buffer(eb_root);
    824		old = read_tree_block(fs_info, logical, root->root_key.objectid,
    825				      0, level, NULL);
    826		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
    827			if (!IS_ERR(old))
    828				free_extent_buffer(old);
    829			btrfs_warn(fs_info,
    830				   "failed to read tree block %llu from get_old_root",
    831				   logical);
    832		} else {
    833			struct tree_mod_elem *tm2;
    834
    835			btrfs_tree_read_lock(old);
    836			eb = btrfs_clone_extent_buffer(old);
    837			/*
    838			 * After the lookup for the most recent tree mod operation
    839			 * above and before we locked and cloned the extent buffer
    840			 * 'old', a new tree mod log operation may have been added.
    841			 * So lookup for a more recent one to make sure the number
    842			 * of mod log operations we replay is consistent with the
    843			 * number of items we have in the cloned extent buffer,
    844			 * otherwise we can hit a BUG_ON when rewinding the extent
    845			 * buffer.
    846			 */
    847			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
    848			btrfs_tree_read_unlock(old);
    849			free_extent_buffer(old);
    850			ASSERT(tm2);
    851			ASSERT(tm2 == tm || tm2->seq > tm->seq);
    852			if (!tm2 || tm2->seq < tm->seq) {
    853				free_extent_buffer(eb);
    854				return NULL;
    855			}
    856			tm = tm2;
    857		}
    858	} else if (old_root) {
    859		eb_root_owner = btrfs_header_owner(eb_root);
    860		btrfs_tree_read_unlock(eb_root);
    861		free_extent_buffer(eb_root);
    862		eb = alloc_dummy_extent_buffer(fs_info, logical);
    863	} else {
    864		eb = btrfs_clone_extent_buffer(eb_root);
    865		btrfs_tree_read_unlock(eb_root);
    866		free_extent_buffer(eb_root);
    867	}
    868
    869	if (!eb)
    870		return NULL;
    871	if (old_root) {
    872		btrfs_set_header_bytenr(eb, eb->start);
    873		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
    874		btrfs_set_header_owner(eb, eb_root_owner);
    875		btrfs_set_header_level(eb, old_root->level);
    876		btrfs_set_header_generation(eb, old_generation);
    877	}
    878	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
    879				       btrfs_header_level(eb));
    880	btrfs_tree_read_lock(eb);
    881	if (tm)
    882		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
    883	else
    884		WARN_ON(btrfs_header_level(eb) != 0);
    885	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
    886
    887	return eb;
    888}
    889
    890int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
    891{
    892	struct tree_mod_elem *tm;
    893	int level;
    894	struct extent_buffer *eb_root = btrfs_root_node(root);
    895
    896	tm = tree_mod_log_oldest_root(eb_root, time_seq);
    897	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
    898		level = tm->old_root.level;
    899	else
    900		level = btrfs_header_level(eb_root);
    901
    902	free_extent_buffer(eb_root);
    903
    904	return level;
    905}
    906
    907/*
    908 * Return the lowest sequence number in the tree modification log.
    909 *
    910 * Return the sequence number of the oldest tree modification log user, which
    911 * corresponds to the lowest sequence number of all existing users. If there are
    912 * no users it returns 0.
    913 */
    914u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
    915{
    916	u64 ret = 0;
    917
    918	read_lock(&fs_info->tree_mod_log_lock);
    919	if (!list_empty(&fs_info->tree_mod_seq_list)) {
    920		struct btrfs_seq_list *elem;
    921
    922		elem = list_first_entry(&fs_info->tree_mod_seq_list,
    923					struct btrfs_seq_list, list);
    924		ret = elem->seq;
    925	}
    926	read_unlock(&fs_info->tree_mod_log_lock);
    927
    928	return ret;
    929}