cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

qgroup.c (114967B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (C) 2011 STRATO.  All rights reserved.
      4 */
      5
      6#include <linux/sched.h>
      7#include <linux/pagemap.h>
      8#include <linux/writeback.h>
      9#include <linux/blkdev.h>
     10#include <linux/rbtree.h>
     11#include <linux/slab.h>
     12#include <linux/workqueue.h>
     13#include <linux/btrfs.h>
     14#include <linux/sched/mm.h>
     15
     16#include "ctree.h"
     17#include "transaction.h"
     18#include "disk-io.h"
     19#include "locking.h"
     20#include "ulist.h"
     21#include "backref.h"
     22#include "extent_io.h"
     23#include "qgroup.h"
     24#include "block-group.h"
     25#include "sysfs.h"
     26#include "tree-mod-log.h"
     27
     28/*
     29 * Helpers to access qgroup reservation
     30 *
     31 * Callers should ensure the lock context and type are valid
     32 */
     33
     34static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
     35{
     36	u64 ret = 0;
     37	int i;
     38
     39	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
     40		ret += qgroup->rsv.values[i];
     41
     42	return ret;
     43}
     44
     45#ifdef CONFIG_BTRFS_DEBUG
     46static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
     47{
     48	if (type == BTRFS_QGROUP_RSV_DATA)
     49		return "data";
     50	if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
     51		return "meta_pertrans";
     52	if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
     53		return "meta_prealloc";
     54	return NULL;
     55}
     56#endif
     57
     58static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
     59			   struct btrfs_qgroup *qgroup, u64 num_bytes,
     60			   enum btrfs_qgroup_rsv_type type)
     61{
     62	trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
     63	qgroup->rsv.values[type] += num_bytes;
     64}
     65
     66static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
     67			       struct btrfs_qgroup *qgroup, u64 num_bytes,
     68			       enum btrfs_qgroup_rsv_type type)
     69{
     70	trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
     71	if (qgroup->rsv.values[type] >= num_bytes) {
     72		qgroup->rsv.values[type] -= num_bytes;
     73		return;
     74	}
     75#ifdef CONFIG_BTRFS_DEBUG
     76	WARN_RATELIMIT(1,
     77		"qgroup %llu %s reserved space underflow, have %llu to free %llu",
     78		qgroup->qgroupid, qgroup_rsv_type_str(type),
     79		qgroup->rsv.values[type], num_bytes);
     80#endif
     81	qgroup->rsv.values[type] = 0;
     82}
     83
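For illustration, a minimal self-contained userspace sketch (illustrative names, not kernel code) of the per-type reservation bookkeeping above, including the clamp-on-underflow behaviour of qgroup_rsv_release():

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the BTRFS_QGROUP_RSV_* types. */
enum demo_rsv_type {
	DEMO_RSV_DATA,
	DEMO_RSV_META_PERTRANS,
	DEMO_RSV_META_PREALLOC,
	DEMO_RSV_LAST,
};

struct demo_rsv {
	uint64_t values[DEMO_RSV_LAST];
};

static void demo_rsv_add(struct demo_rsv *rsv, enum demo_rsv_type type,
			 uint64_t bytes)
{
	rsv->values[type] += bytes;
}

/* Clamp at zero instead of wrapping, mirroring qgroup_rsv_release(). */
static void demo_rsv_release(struct demo_rsv *rsv, enum demo_rsv_type type,
			     uint64_t bytes)
{
	if (rsv->values[type] >= bytes) {
		rsv->values[type] -= bytes;
		return;
	}
	fprintf(stderr, "underflow: have %llu, freeing %llu\n",
		(unsigned long long)rsv->values[type],
		(unsigned long long)bytes);
	rsv->values[type] = 0;
}

int main(void)
{
	struct demo_rsv rsv = { { 0 } };

	demo_rsv_add(&rsv, DEMO_RSV_DATA, 4096);
	demo_rsv_release(&rsv, DEMO_RSV_DATA, 8192);	/* clamps to 0 */
	printf("data rsv: %llu\n",
	       (unsigned long long)rsv.values[DEMO_RSV_DATA]);
	return 0;
}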
     84static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
     85				     struct btrfs_qgroup *dest,
     86				     struct btrfs_qgroup *src)
     87{
     88	int i;
     89
     90	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
     91		qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
     92}
     93
     94static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
     95					 struct btrfs_qgroup *dest,
     96					  struct btrfs_qgroup *src)
     97{
     98	int i;
     99
    100	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
    101		qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
    102}
    103
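/*
 * The old/new refcnt helpers below store their counters biased by the
 * accounting sequence number: any value below the current @seq is stale from
 * a previous accounting round and reads back as zero, so advancing the
 * sequence implicitly resets every counter without walking all qgroups.
 */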
    104static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
    105					   int mod)
    106{
    107	if (qg->old_refcnt < seq)
    108		qg->old_refcnt = seq;
    109	qg->old_refcnt += mod;
    110}
    111
    112static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
    113					   int mod)
    114{
    115	if (qg->new_refcnt < seq)
    116		qg->new_refcnt = seq;
    117	qg->new_refcnt += mod;
    118}
    119
    120static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
    121{
    122	if (qg->old_refcnt < seq)
    123		return 0;
    124	return qg->old_refcnt - seq;
    125}
    126
    127static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
    128{
    129	if (qg->new_refcnt < seq)
    130		return 0;
    131	return qg->new_refcnt - seq;
    132}
    133
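A minimal, self-contained userspace sketch (illustrative names, not kernel API) of the sequence-biased counter trick used by the refcnt helpers above; it shows that advancing the sequence number reads back as a reset without clearing anything:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct demo_qgroup {
	uint64_t refcnt;	/* stored biased by the current sequence number */
};

static void demo_update_refcnt(struct demo_qgroup *qg, uint64_t seq, int mod)
{
	if (qg->refcnt < seq)		/* stale value from an older round */
		qg->refcnt = seq;	/* implicit reset to 0 for this round */
	qg->refcnt += mod;
}

static uint64_t demo_get_refcnt(const struct demo_qgroup *qg, uint64_t seq)
{
	if (qg->refcnt < seq)		/* never touched in this round */
		return 0;
	return qg->refcnt - seq;
}

int main(void)
{
	struct demo_qgroup qg = { 0 };
	uint64_t seq = 100;

	demo_update_refcnt(&qg, seq, 2);
	assert(demo_get_refcnt(&qg, seq) == 2);

	seq = 200;	/* next accounting round: just advance the sequence */
	assert(demo_get_refcnt(&qg, seq) == 0);

	printf("sequence-biased counter demo OK\n");
	return 0;
}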
    134/*
    135 * glue structure to represent the relations between qgroups.
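 * One such entry is allocated per parent/member relation: next_group links it
 * into member->groups and next_member links it into parent->members, so the
 * same entry sits on both lists.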
    136 */
    137struct btrfs_qgroup_list {
    138	struct list_head next_group;
    139	struct list_head next_member;
    140	struct btrfs_qgroup *group;
    141	struct btrfs_qgroup *member;
    142};
    143
    144static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
    145{
    146	return (u64)(uintptr_t)qg;
    147}
    148
    149static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
    150{
    151	return (struct btrfs_qgroup *)(uintptr_t)n->aux;
    152}
    153
    154static int
    155qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
    156		   int init_flags);
    157static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
    158
    159/* must be called with qgroup_ioctl_lock held */
    160static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
    161					   u64 qgroupid)
    162{
    163	struct rb_node *n = fs_info->qgroup_tree.rb_node;
    164	struct btrfs_qgroup *qgroup;
    165
    166	while (n) {
    167		qgroup = rb_entry(n, struct btrfs_qgroup, node);
    168		if (qgroup->qgroupid < qgroupid)
    169			n = n->rb_left;
    170		else if (qgroup->qgroupid > qgroupid)
    171			n = n->rb_right;
    172		else
    173			return qgroup;
    174	}
    175	return NULL;
    176}
    177
    178/* must be called with qgroup_lock held */
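/*
 * Returns the existing qgroup if @qgroupid is already present in the tree.
 * The allocation uses GFP_ATOMIC since callers hold the qgroup spinlock.
 */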
    179static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
    180					  u64 qgroupid)
    181{
    182	struct rb_node **p = &fs_info->qgroup_tree.rb_node;
    183	struct rb_node *parent = NULL;
    184	struct btrfs_qgroup *qgroup;
    185
    186	while (*p) {
    187		parent = *p;
    188		qgroup = rb_entry(parent, struct btrfs_qgroup, node);
    189
    190		if (qgroup->qgroupid < qgroupid)
    191			p = &(*p)->rb_left;
    192		else if (qgroup->qgroupid > qgroupid)
    193			p = &(*p)->rb_right;
    194		else
    195			return qgroup;
    196	}
    197
    198	qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
    199	if (!qgroup)
    200		return ERR_PTR(-ENOMEM);
    201
    202	qgroup->qgroupid = qgroupid;
    203	INIT_LIST_HEAD(&qgroup->groups);
    204	INIT_LIST_HEAD(&qgroup->members);
    205	INIT_LIST_HEAD(&qgroup->dirty);
    206
    207	rb_link_node(&qgroup->node, parent, p);
    208	rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
    209
    210	return qgroup;
    211}
    212
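/*
 * Unlink @qgroup from the dirty list and tear down all of its parent and
 * member relations, freeing the glue entries.  The qgroup itself is not
 * freed here.
 */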
    213static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
    214			    struct btrfs_qgroup *qgroup)
    215{
    216	struct btrfs_qgroup_list *list;
    217
    218	list_del(&qgroup->dirty);
    219	while (!list_empty(&qgroup->groups)) {
    220		list = list_first_entry(&qgroup->groups,
    221					struct btrfs_qgroup_list, next_group);
    222		list_del(&list->next_group);
    223		list_del(&list->next_member);
    224		kfree(list);
    225	}
    226
    227	while (!list_empty(&qgroup->members)) {
    228		list = list_first_entry(&qgroup->members,
    229					struct btrfs_qgroup_list, next_member);
    230		list_del(&list->next_group);
    231		list_del(&list->next_member);
    232		kfree(list);
    233	}
    234}
    235
    236/* must be called with qgroup_lock held */
    237static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
    238{
    239	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
    240
    241	if (!qgroup)
    242		return -ENOENT;
    243
    244	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
    245	__del_qgroup_rb(fs_info, qgroup);
    246	return 0;
    247}
    248
    249/*
    250 * Add relation specified by two qgroups.
    251 *
    252 * Must be called with qgroup_lock held.
    253 *
    254 * Return: 0        on success
    255 *         -ENOENT  if one of the qgroups is NULL
    256 *         <0       other errors
    257 */
    258static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
    259{
    260	struct btrfs_qgroup_list *list;
    261
    262	if (!member || !parent)
    263		return -ENOENT;
    264
    265	list = kzalloc(sizeof(*list), GFP_ATOMIC);
    266	if (!list)
    267		return -ENOMEM;
    268
    269	list->group = parent;
    270	list->member = member;
    271	list_add_tail(&list->next_group, &member->groups);
    272	list_add_tail(&list->next_member, &parent->members);
    273
    274	return 0;
    275}
    276
    277/*
     278 * Add relation specified by two qgroup ids.
    279 *
    280 * Must be called with qgroup_lock held.
    281 *
    282 * Return: 0        on success
    283 *         -ENOENT  if one of the ids does not exist
    284 *         <0       other errors
    285 */
    286static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid)
    287{
    288	struct btrfs_qgroup *member;
    289	struct btrfs_qgroup *parent;
    290
    291	member = find_qgroup_rb(fs_info, memberid);
    292	parent = find_qgroup_rb(fs_info, parentid);
    293
    294	return __add_relation_rb(member, parent);
    295}
    296
    297/* Must be called with qgroup_lock held */
    298static int del_relation_rb(struct btrfs_fs_info *fs_info,
    299			   u64 memberid, u64 parentid)
    300{
    301	struct btrfs_qgroup *member;
    302	struct btrfs_qgroup *parent;
    303	struct btrfs_qgroup_list *list;
    304
    305	member = find_qgroup_rb(fs_info, memberid);
    306	parent = find_qgroup_rb(fs_info, parentid);
    307	if (!member || !parent)
    308		return -ENOENT;
    309
    310	list_for_each_entry(list, &member->groups, next_group) {
    311		if (list->group == parent) {
    312			list_del(&list->next_group);
    313			list_del(&list->next_member);
    314			kfree(list);
    315			return 0;
    316		}
    317	}
    318	return -ENOENT;
    319}
    320
    321#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
    322int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
    323			       u64 rfer, u64 excl)
    324{
    325	struct btrfs_qgroup *qgroup;
    326
    327	qgroup = find_qgroup_rb(fs_info, qgroupid);
    328	if (!qgroup)
    329		return -EINVAL;
    330	if (qgroup->rfer != rfer || qgroup->excl != excl)
    331		return -EINVAL;
    332	return 0;
    333}
    334#endif
    335
    336/*
    337 * The full config is read in one go, only called from open_ctree()
    338 * It doesn't use any locking, as at this point we're still single-threaded
    339 */
    340int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
    341{
    342	struct btrfs_key key;
    343	struct btrfs_key found_key;
    344	struct btrfs_root *quota_root = fs_info->quota_root;
    345	struct btrfs_path *path = NULL;
    346	struct extent_buffer *l;
    347	int slot;
    348	int ret = 0;
    349	u64 flags = 0;
    350	u64 rescan_progress = 0;
    351
    352	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
    353		return 0;
    354
    355	fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
    356	if (!fs_info->qgroup_ulist) {
    357		ret = -ENOMEM;
    358		goto out;
    359	}
    360
    361	path = btrfs_alloc_path();
    362	if (!path) {
    363		ret = -ENOMEM;
    364		goto out;
    365	}
    366
    367	ret = btrfs_sysfs_add_qgroups(fs_info);
    368	if (ret < 0)
    369		goto out;
    370	/* default this to quota off, in case no status key is found */
    371	fs_info->qgroup_flags = 0;
    372
    373	/*
    374	 * pass 1: read status, all qgroup infos and limits
    375	 */
    376	key.objectid = 0;
    377	key.type = 0;
    378	key.offset = 0;
    379	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
    380	if (ret)
    381		goto out;
    382
    383	while (1) {
    384		struct btrfs_qgroup *qgroup;
    385
    386		slot = path->slots[0];
    387		l = path->nodes[0];
    388		btrfs_item_key_to_cpu(l, &found_key, slot);
    389
    390		if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
    391			struct btrfs_qgroup_status_item *ptr;
    392
    393			ptr = btrfs_item_ptr(l, slot,
    394					     struct btrfs_qgroup_status_item);
    395
    396			if (btrfs_qgroup_status_version(l, ptr) !=
    397			    BTRFS_QGROUP_STATUS_VERSION) {
    398				btrfs_err(fs_info,
    399				 "old qgroup version, quota disabled");
    400				goto out;
    401			}
    402			if (btrfs_qgroup_status_generation(l, ptr) !=
    403			    fs_info->generation) {
    404				flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
    405				btrfs_err(fs_info,
    406					"qgroup generation mismatch, marked as inconsistent");
    407			}
    408			fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
    409									  ptr);
    410			rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
    411			goto next1;
    412		}
    413
    414		if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
    415		    found_key.type != BTRFS_QGROUP_LIMIT_KEY)
    416			goto next1;
    417
    418		qgroup = find_qgroup_rb(fs_info, found_key.offset);
    419		if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
    420		    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
    421			btrfs_err(fs_info, "inconsistent qgroup config");
    422			flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
    423		}
    424		if (!qgroup) {
    425			qgroup = add_qgroup_rb(fs_info, found_key.offset);
    426			if (IS_ERR(qgroup)) {
    427				ret = PTR_ERR(qgroup);
    428				goto out;
    429			}
    430		}
    431		ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
    432		if (ret < 0)
    433			goto out;
    434
    435		switch (found_key.type) {
    436		case BTRFS_QGROUP_INFO_KEY: {
    437			struct btrfs_qgroup_info_item *ptr;
    438
    439			ptr = btrfs_item_ptr(l, slot,
    440					     struct btrfs_qgroup_info_item);
    441			qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
    442			qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
    443			qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
    444			qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
    445			/* generation currently unused */
    446			break;
    447		}
    448		case BTRFS_QGROUP_LIMIT_KEY: {
    449			struct btrfs_qgroup_limit_item *ptr;
    450
    451			ptr = btrfs_item_ptr(l, slot,
    452					     struct btrfs_qgroup_limit_item);
    453			qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
    454			qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
    455			qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
    456			qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
    457			qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
    458			break;
    459		}
    460		}
    461next1:
    462		ret = btrfs_next_item(quota_root, path);
    463		if (ret < 0)
    464			goto out;
    465		if (ret)
    466			break;
    467	}
    468	btrfs_release_path(path);
    469
    470	/*
    471	 * pass 2: read all qgroup relations
    472	 */
    473	key.objectid = 0;
    474	key.type = BTRFS_QGROUP_RELATION_KEY;
    475	key.offset = 0;
    476	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
    477	if (ret)
    478		goto out;
    479	while (1) {
    480		slot = path->slots[0];
    481		l = path->nodes[0];
    482		btrfs_item_key_to_cpu(l, &found_key, slot);
    483
    484		if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
    485			goto next2;
    486
    487		if (found_key.objectid > found_key.offset) {
    488			/* parent <- member, not needed to build config */
    489			/* FIXME should we omit the key completely? */
    490			goto next2;
    491		}
    492
    493		ret = add_relation_rb(fs_info, found_key.objectid,
    494				      found_key.offset);
    495		if (ret == -ENOENT) {
    496			btrfs_warn(fs_info,
    497				"orphan qgroup relation 0x%llx->0x%llx",
    498				found_key.objectid, found_key.offset);
    499			ret = 0;	/* ignore the error */
    500		}
    501		if (ret)
    502			goto out;
    503next2:
    504		ret = btrfs_next_item(quota_root, path);
    505		if (ret < 0)
    506			goto out;
    507		if (ret)
    508			break;
    509	}
    510out:
    511	btrfs_free_path(path);
    512	fs_info->qgroup_flags |= flags;
    513	if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
    514		clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
    515	else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
    516		 ret >= 0)
    517		ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
    518
    519	if (ret < 0) {
    520		ulist_free(fs_info->qgroup_ulist);
    521		fs_info->qgroup_ulist = NULL;
    522		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
    523		btrfs_sysfs_del_qgroups(fs_info);
    524	}
    525
    526	return ret < 0 ? ret : 0;
    527}
    528
    529/*
    530 * Called in close_ctree() when quota is still enabled.  This verifies we don't
    531 * leak some reserved space.
    532 *
    533 * Return false if no reserved space is left.
    534 * Return true if some reserved space is leaked.
    535 */
    536bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
    537{
    538	struct rb_node *node;
    539	bool ret = false;
    540
    541	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
    542		return ret;
    543	/*
    544	 * Since we're unmounting, there is no race and no need to grab qgroup
     545	 * lock. And here we iterate in sorted order rather than post-order to
     546	 * provide a more user-friendly result.
    547	 */
    548	for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
    549		struct btrfs_qgroup *qgroup;
    550		int i;
    551
    552		qgroup = rb_entry(node, struct btrfs_qgroup, node);
    553		for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
    554			if (qgroup->rsv.values[i]) {
    555				ret = true;
    556				btrfs_warn(fs_info,
    557		"qgroup %hu/%llu has unreleased space, type %d rsv %llu",
    558				   btrfs_qgroup_level(qgroup->qgroupid),
    559				   btrfs_qgroup_subvolid(qgroup->qgroupid),
    560				   i, qgroup->rsv.values[i]);
    561			}
    562		}
    563	}
    564	return ret;
    565}
    566
    567/*
    568 * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
     569 * first two are in single-threaded paths. And for the third one, we have set
    570 * quota_root to be null with qgroup_lock held before, so it is safe to clean
    571 * up the in-memory structures without qgroup_lock held.
    572 */
    573void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
    574{
    575	struct rb_node *n;
    576	struct btrfs_qgroup *qgroup;
    577
    578	while ((n = rb_first(&fs_info->qgroup_tree))) {
    579		qgroup = rb_entry(n, struct btrfs_qgroup, node);
    580		rb_erase(n, &fs_info->qgroup_tree);
    581		__del_qgroup_rb(fs_info, qgroup);
    582		btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
    583		kfree(qgroup);
    584	}
    585	/*
    586	 * We call btrfs_free_qgroup_config() when unmounting
    587	 * filesystem and disabling quota, so we set qgroup_ulist
    588	 * to be null here to avoid double free.
    589	 */
    590	ulist_free(fs_info->qgroup_ulist);
    591	fs_info->qgroup_ulist = NULL;
    592	btrfs_sysfs_del_qgroups(fs_info);
    593}
    594
    595static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
    596				    u64 dst)
    597{
    598	int ret;
    599	struct btrfs_root *quota_root = trans->fs_info->quota_root;
    600	struct btrfs_path *path;
    601	struct btrfs_key key;
    602
    603	path = btrfs_alloc_path();
    604	if (!path)
    605		return -ENOMEM;
    606
    607	key.objectid = src;
    608	key.type = BTRFS_QGROUP_RELATION_KEY;
    609	key.offset = dst;
    610
    611	ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
    612
    613	btrfs_mark_buffer_dirty(path->nodes[0]);
    614
    615	btrfs_free_path(path);
    616	return ret;
    617}
    618
    619static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
    620				    u64 dst)
    621{
    622	int ret;
    623	struct btrfs_root *quota_root = trans->fs_info->quota_root;
    624	struct btrfs_path *path;
    625	struct btrfs_key key;
    626
    627	path = btrfs_alloc_path();
    628	if (!path)
    629		return -ENOMEM;
    630
    631	key.objectid = src;
    632	key.type = BTRFS_QGROUP_RELATION_KEY;
    633	key.offset = dst;
    634
    635	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
    636	if (ret < 0)
    637		goto out;
    638
    639	if (ret > 0) {
    640		ret = -ENOENT;
    641		goto out;
    642	}
    643
    644	ret = btrfs_del_item(trans, quota_root, path);
    645out:
    646	btrfs_free_path(path);
    647	return ret;
    648}
    649
    650static int add_qgroup_item(struct btrfs_trans_handle *trans,
    651			   struct btrfs_root *quota_root, u64 qgroupid)
    652{
    653	int ret;
    654	struct btrfs_path *path;
    655	struct btrfs_qgroup_info_item *qgroup_info;
    656	struct btrfs_qgroup_limit_item *qgroup_limit;
    657	struct extent_buffer *leaf;
    658	struct btrfs_key key;
    659
    660	if (btrfs_is_testing(quota_root->fs_info))
    661		return 0;
    662
    663	path = btrfs_alloc_path();
    664	if (!path)
    665		return -ENOMEM;
    666
    667	key.objectid = 0;
    668	key.type = BTRFS_QGROUP_INFO_KEY;
    669	key.offset = qgroupid;
    670
    671	/*
    672	 * Avoid a transaction abort by catching -EEXIST here. In that
    673	 * case, we proceed by re-initializing the existing structure
    674	 * on disk.
    675	 */
    676
    677	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
    678				      sizeof(*qgroup_info));
    679	if (ret && ret != -EEXIST)
    680		goto out;
    681
    682	leaf = path->nodes[0];
    683	qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
    684				 struct btrfs_qgroup_info_item);
    685	btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
    686	btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
    687	btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
    688	btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
    689	btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
    690
    691	btrfs_mark_buffer_dirty(leaf);
    692
    693	btrfs_release_path(path);
    694
    695	key.type = BTRFS_QGROUP_LIMIT_KEY;
    696	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
    697				      sizeof(*qgroup_limit));
    698	if (ret && ret != -EEXIST)
    699		goto out;
    700
    701	leaf = path->nodes[0];
    702	qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
    703				  struct btrfs_qgroup_limit_item);
    704	btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
    705	btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
    706	btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
    707	btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
    708	btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
    709
    710	btrfs_mark_buffer_dirty(leaf);
    711
    712	ret = 0;
    713out:
    714	btrfs_free_path(path);
    715	return ret;
    716}
    717
    718static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
    719{
    720	int ret;
    721	struct btrfs_root *quota_root = trans->fs_info->quota_root;
    722	struct btrfs_path *path;
    723	struct btrfs_key key;
    724
    725	path = btrfs_alloc_path();
    726	if (!path)
    727		return -ENOMEM;
    728
    729	key.objectid = 0;
    730	key.type = BTRFS_QGROUP_INFO_KEY;
    731	key.offset = qgroupid;
    732	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
    733	if (ret < 0)
    734		goto out;
    735
    736	if (ret > 0) {
    737		ret = -ENOENT;
    738		goto out;
    739	}
    740
    741	ret = btrfs_del_item(trans, quota_root, path);
    742	if (ret)
    743		goto out;
    744
    745	btrfs_release_path(path);
    746
    747	key.type = BTRFS_QGROUP_LIMIT_KEY;
    748	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
    749	if (ret < 0)
    750		goto out;
    751
    752	if (ret > 0) {
    753		ret = -ENOENT;
    754		goto out;
    755	}
    756
    757	ret = btrfs_del_item(trans, quota_root, path);
    758
    759out:
    760	btrfs_free_path(path);
    761	return ret;
    762}
    763
    764static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
    765				    struct btrfs_qgroup *qgroup)
    766{
    767	struct btrfs_root *quota_root = trans->fs_info->quota_root;
    768	struct btrfs_path *path;
    769	struct btrfs_key key;
    770	struct extent_buffer *l;
    771	struct btrfs_qgroup_limit_item *qgroup_limit;
    772	int ret;
    773	int slot;
    774
    775	key.objectid = 0;
    776	key.type = BTRFS_QGROUP_LIMIT_KEY;
    777	key.offset = qgroup->qgroupid;
    778
    779	path = btrfs_alloc_path();
    780	if (!path)
    781		return -ENOMEM;
    782
    783	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
    784	if (ret > 0)
    785		ret = -ENOENT;
    786
    787	if (ret)
    788		goto out;
    789
    790	l = path->nodes[0];
    791	slot = path->slots[0];
    792	qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
    793	btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
    794	btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
    795	btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
    796	btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
    797	btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
    798
    799	btrfs_mark_buffer_dirty(l);
    800
    801out:
    802	btrfs_free_path(path);
    803	return ret;
    804}
    805
    806static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
    807				   struct btrfs_qgroup *qgroup)
    808{
    809	struct btrfs_fs_info *fs_info = trans->fs_info;
    810	struct btrfs_root *quota_root = fs_info->quota_root;
    811	struct btrfs_path *path;
    812	struct btrfs_key key;
    813	struct extent_buffer *l;
    814	struct btrfs_qgroup_info_item *qgroup_info;
    815	int ret;
    816	int slot;
    817
    818	if (btrfs_is_testing(fs_info))
    819		return 0;
    820
    821	key.objectid = 0;
    822	key.type = BTRFS_QGROUP_INFO_KEY;
    823	key.offset = qgroup->qgroupid;
    824
    825	path = btrfs_alloc_path();
    826	if (!path)
    827		return -ENOMEM;
    828
    829	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
    830	if (ret > 0)
    831		ret = -ENOENT;
    832
    833	if (ret)
    834		goto out;
    835
    836	l = path->nodes[0];
    837	slot = path->slots[0];
    838	qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
    839	btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
    840	btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
    841	btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
    842	btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
    843	btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
    844
    845	btrfs_mark_buffer_dirty(l);
    846
    847out:
    848	btrfs_free_path(path);
    849	return ret;
    850}
    851
    852static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
    853{
    854	struct btrfs_fs_info *fs_info = trans->fs_info;
    855	struct btrfs_root *quota_root = fs_info->quota_root;
    856	struct btrfs_path *path;
    857	struct btrfs_key key;
    858	struct extent_buffer *l;
    859	struct btrfs_qgroup_status_item *ptr;
    860	int ret;
    861	int slot;
    862
    863	key.objectid = 0;
    864	key.type = BTRFS_QGROUP_STATUS_KEY;
    865	key.offset = 0;
    866
    867	path = btrfs_alloc_path();
    868	if (!path)
    869		return -ENOMEM;
    870
    871	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
    872	if (ret > 0)
    873		ret = -ENOENT;
    874
    875	if (ret)
    876		goto out;
    877
    878	l = path->nodes[0];
    879	slot = path->slots[0];
    880	ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
    881	btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
    882	btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
    883	btrfs_set_qgroup_status_rescan(l, ptr,
    884				fs_info->qgroup_rescan_progress.objectid);
    885
    886	btrfs_mark_buffer_dirty(l);
    887
    888out:
    889	btrfs_free_path(path);
    890	return ret;
    891}
    892
    893/*
    894 * called with qgroup_lock held
    895 */
    896static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
    897				  struct btrfs_root *root)
    898{
    899	struct btrfs_path *path;
    900	struct btrfs_key key;
    901	struct extent_buffer *leaf = NULL;
    902	int ret;
    903	int nr = 0;
    904
    905	path = btrfs_alloc_path();
    906	if (!path)
    907		return -ENOMEM;
    908
    909	key.objectid = 0;
    910	key.offset = 0;
    911	key.type = 0;
    912
    913	while (1) {
    914		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    915		if (ret < 0)
    916			goto out;
    917		leaf = path->nodes[0];
    918		nr = btrfs_header_nritems(leaf);
    919		if (!nr)
    920			break;
    921		/*
    922		 * delete the leaf one by one
    923		 * since the whole tree is going
    924		 * to be deleted.
    925		 */
    926		path->slots[0] = 0;
    927		ret = btrfs_del_items(trans, root, path, 0, nr);
    928		if (ret)
    929			goto out;
    930
    931		btrfs_release_path(path);
    932	}
    933	ret = 0;
    934out:
    935	btrfs_free_path(path);
    936	return ret;
    937}
    938
    939int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
    940{
    941	struct btrfs_root *quota_root;
    942	struct btrfs_root *tree_root = fs_info->tree_root;
    943	struct btrfs_path *path = NULL;
    944	struct btrfs_qgroup_status_item *ptr;
    945	struct extent_buffer *leaf;
    946	struct btrfs_key key;
    947	struct btrfs_key found_key;
    948	struct btrfs_qgroup *qgroup = NULL;
    949	struct btrfs_trans_handle *trans = NULL;
    950	struct ulist *ulist = NULL;
    951	int ret = 0;
    952	int slot;
    953
    954	/*
    955	 * We need to have subvol_sem write locked, to prevent races between
    956	 * concurrent tasks trying to enable quotas, because we will unlock
    957	 * and relock qgroup_ioctl_lock before setting fs_info->quota_root
    958	 * and before setting BTRFS_FS_QUOTA_ENABLED.
    959	 */
    960	lockdep_assert_held_write(&fs_info->subvol_sem);
    961
    962	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
    963		btrfs_err(fs_info,
    964			  "qgroups are currently unsupported in extent tree v2");
    965		return -EINVAL;
    966	}
    967
    968	mutex_lock(&fs_info->qgroup_ioctl_lock);
    969	if (fs_info->quota_root)
    970		goto out;
    971
    972	ulist = ulist_alloc(GFP_KERNEL);
    973	if (!ulist) {
    974		ret = -ENOMEM;
    975		goto out;
    976	}
    977
    978	ret = btrfs_sysfs_add_qgroups(fs_info);
    979	if (ret < 0)
    980		goto out;
    981
    982	/*
    983	 * Unlock qgroup_ioctl_lock before starting the transaction. This is to
    984	 * avoid lock acquisition inversion problems (reported by lockdep) between
    985	 * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
    986	 * start a transaction.
    987	 * After we started the transaction lock qgroup_ioctl_lock again and
    988	 * check if someone else created the quota root in the meanwhile. If so,
    989	 * just return success and release the transaction handle.
    990	 *
    991	 * Also we don't need to worry about someone else calling
    992	 * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
    993	 * that function returns 0 (success) when the sysfs entries already exist.
    994	 */
    995	mutex_unlock(&fs_info->qgroup_ioctl_lock);
    996
    997	/*
    998	 * 1 for quota root item
    999	 * 1 for BTRFS_QGROUP_STATUS item
   1000	 *
    1001	 * Yet we also need 2*n items for the QGROUP_INFO/QGROUP_LIMIT items
    1002	 * per subvolume. However, those are not currently reserved since it
   1003	 * would be a lot of overkill.
   1004	 */
   1005	trans = btrfs_start_transaction(tree_root, 2);
   1006
   1007	mutex_lock(&fs_info->qgroup_ioctl_lock);
   1008	if (IS_ERR(trans)) {
   1009		ret = PTR_ERR(trans);
   1010		trans = NULL;
   1011		goto out;
   1012	}
   1013
   1014	if (fs_info->quota_root)
   1015		goto out;
   1016
   1017	fs_info->qgroup_ulist = ulist;
   1018	ulist = NULL;
   1019
   1020	/*
   1021	 * initially create the quota tree
   1022	 */
   1023	quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
   1024	if (IS_ERR(quota_root)) {
   1025		ret =  PTR_ERR(quota_root);
   1026		btrfs_abort_transaction(trans, ret);
   1027		goto out;
   1028	}
   1029
   1030	path = btrfs_alloc_path();
   1031	if (!path) {
   1032		ret = -ENOMEM;
   1033		btrfs_abort_transaction(trans, ret);
   1034		goto out_free_root;
   1035	}
   1036
   1037	key.objectid = 0;
   1038	key.type = BTRFS_QGROUP_STATUS_KEY;
   1039	key.offset = 0;
   1040
   1041	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
   1042				      sizeof(*ptr));
   1043	if (ret) {
   1044		btrfs_abort_transaction(trans, ret);
   1045		goto out_free_path;
   1046	}
   1047
   1048	leaf = path->nodes[0];
   1049	ptr = btrfs_item_ptr(leaf, path->slots[0],
   1050				 struct btrfs_qgroup_status_item);
   1051	btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
   1052	btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
   1053	fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
   1054				BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
   1055	btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
   1056	btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
   1057
   1058	btrfs_mark_buffer_dirty(leaf);
   1059
   1060	key.objectid = 0;
   1061	key.type = BTRFS_ROOT_REF_KEY;
   1062	key.offset = 0;
   1063
   1064	btrfs_release_path(path);
   1065	ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
   1066	if (ret > 0)
   1067		goto out_add_root;
   1068	if (ret < 0) {
   1069		btrfs_abort_transaction(trans, ret);
   1070		goto out_free_path;
   1071	}
   1072
   1073	while (1) {
   1074		slot = path->slots[0];
   1075		leaf = path->nodes[0];
   1076		btrfs_item_key_to_cpu(leaf, &found_key, slot);
   1077
   1078		if (found_key.type == BTRFS_ROOT_REF_KEY) {
   1079
   1080			/* Release locks on tree_root before we access quota_root */
   1081			btrfs_release_path(path);
   1082
   1083			ret = add_qgroup_item(trans, quota_root,
   1084					      found_key.offset);
   1085			if (ret) {
   1086				btrfs_abort_transaction(trans, ret);
   1087				goto out_free_path;
   1088			}
   1089
   1090			qgroup = add_qgroup_rb(fs_info, found_key.offset);
   1091			if (IS_ERR(qgroup)) {
   1092				ret = PTR_ERR(qgroup);
   1093				btrfs_abort_transaction(trans, ret);
   1094				goto out_free_path;
   1095			}
   1096			ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
   1097			if (ret < 0) {
   1098				btrfs_abort_transaction(trans, ret);
   1099				goto out_free_path;
   1100			}
   1101			ret = btrfs_search_slot_for_read(tree_root, &found_key,
   1102							 path, 1, 0);
   1103			if (ret < 0) {
   1104				btrfs_abort_transaction(trans, ret);
   1105				goto out_free_path;
   1106			}
   1107			if (ret > 0) {
   1108				/*
   1109				 * Shouldn't happen, but in case it does we
   1110				 * don't need to do the btrfs_next_item, just
   1111				 * continue.
   1112				 */
   1113				continue;
   1114			}
   1115		}
   1116		ret = btrfs_next_item(tree_root, path);
   1117		if (ret < 0) {
   1118			btrfs_abort_transaction(trans, ret);
   1119			goto out_free_path;
   1120		}
   1121		if (ret)
   1122			break;
   1123	}
   1124
   1125out_add_root:
   1126	btrfs_release_path(path);
   1127	ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
   1128	if (ret) {
   1129		btrfs_abort_transaction(trans, ret);
   1130		goto out_free_path;
   1131	}
   1132
   1133	qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
   1134	if (IS_ERR(qgroup)) {
   1135		ret = PTR_ERR(qgroup);
   1136		btrfs_abort_transaction(trans, ret);
   1137		goto out_free_path;
   1138	}
   1139	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
   1140	if (ret < 0) {
   1141		btrfs_abort_transaction(trans, ret);
   1142		goto out_free_path;
   1143	}
   1144
   1145	mutex_unlock(&fs_info->qgroup_ioctl_lock);
   1146	/*
   1147	 * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
    1148	 * a deadlock with tasks concurrently doing other qgroup operations, such as
   1149	 * adding/removing qgroups or adding/deleting qgroup relations for example,
   1150	 * because all qgroup operations first start or join a transaction and then
   1151	 * lock the qgroup_ioctl_lock mutex.
   1152	 * We are safe from a concurrent task trying to enable quotas, by calling
   1153	 * this function, since we are serialized by fs_info->subvol_sem.
   1154	 */
   1155	ret = btrfs_commit_transaction(trans);
   1156	trans = NULL;
   1157	mutex_lock(&fs_info->qgroup_ioctl_lock);
   1158	if (ret)
   1159		goto out_free_path;
   1160
   1161	/*
   1162	 * Set quota enabled flag after committing the transaction, to avoid
   1163	 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
   1164	 * creation.
   1165	 */
   1166	spin_lock(&fs_info->qgroup_lock);
   1167	fs_info->quota_root = quota_root;
   1168	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
   1169	spin_unlock(&fs_info->qgroup_lock);
   1170
   1171	ret = qgroup_rescan_init(fs_info, 0, 1);
   1172	if (!ret) {
   1173	        qgroup_rescan_zero_tracking(fs_info);
   1174		fs_info->qgroup_rescan_running = true;
   1175	        btrfs_queue_work(fs_info->qgroup_rescan_workers,
   1176	                         &fs_info->qgroup_rescan_work);
   1177	}
   1178
   1179out_free_path:
   1180	btrfs_free_path(path);
   1181out_free_root:
   1182	if (ret)
   1183		btrfs_put_root(quota_root);
   1184out:
   1185	if (ret) {
   1186		ulist_free(fs_info->qgroup_ulist);
   1187		fs_info->qgroup_ulist = NULL;
   1188		btrfs_sysfs_del_qgroups(fs_info);
   1189	}
   1190	mutex_unlock(&fs_info->qgroup_ioctl_lock);
   1191	if (ret && trans)
   1192		btrfs_end_transaction(trans);
   1193	else if (trans)
   1194		ret = btrfs_end_transaction(trans);
   1195	ulist_free(ulist);
   1196	return ret;
   1197}
   1198
   1199int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
   1200{
   1201	struct btrfs_root *quota_root;
   1202	struct btrfs_trans_handle *trans = NULL;
   1203	int ret = 0;
   1204
   1205	/*
   1206	 * We need to have subvol_sem write locked, to prevent races between
   1207	 * concurrent tasks trying to disable quotas, because we will unlock
   1208	 * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
   1209	 */
   1210	lockdep_assert_held_write(&fs_info->subvol_sem);
   1211
   1212	mutex_lock(&fs_info->qgroup_ioctl_lock);
   1213	if (!fs_info->quota_root)
   1214		goto out;
   1215
   1216	/*
   1217	 * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
   1218	 * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
   1219	 * to lock that mutex while holding a transaction handle and the rescan
   1220	 * worker needs to commit a transaction.
   1221	 */
   1222	mutex_unlock(&fs_info->qgroup_ioctl_lock);
   1223
   1224	/*
   1225	 * Request qgroup rescan worker to complete and wait for it. This wait
   1226	 * must be done before transaction start for quota disable since it may
   1227	 * deadlock with transaction by the qgroup rescan worker.
   1228	 */
   1229	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
   1230	btrfs_qgroup_wait_for_completion(fs_info, false);
   1231
   1232	/*
   1233	 * 1 For the root item
   1234	 *
   1235	 * We should also reserve enough items for the quota tree deletion in
   1236	 * btrfs_clean_quota_tree but this is not done.
   1237	 *
   1238	 * Also, we must always start a transaction without holding the mutex
   1239	 * qgroup_ioctl_lock, see btrfs_quota_enable().
   1240	 */
   1241	trans = btrfs_start_transaction(fs_info->tree_root, 1);
   1242
   1243	mutex_lock(&fs_info->qgroup_ioctl_lock);
   1244	if (IS_ERR(trans)) {
   1245		ret = PTR_ERR(trans);
   1246		trans = NULL;
   1247		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
   1248		goto out;
   1249	}
   1250
   1251	if (!fs_info->quota_root)
   1252		goto out;
   1253
   1254	spin_lock(&fs_info->qgroup_lock);
   1255	quota_root = fs_info->quota_root;
   1256	fs_info->quota_root = NULL;
   1257	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
   1258	spin_unlock(&fs_info->qgroup_lock);
   1259
   1260	btrfs_free_qgroup_config(fs_info);
   1261
   1262	ret = btrfs_clean_quota_tree(trans, quota_root);
   1263	if (ret) {
   1264		btrfs_abort_transaction(trans, ret);
   1265		goto out;
   1266	}
   1267
   1268	ret = btrfs_del_root(trans, &quota_root->root_key);
   1269	if (ret) {
   1270		btrfs_abort_transaction(trans, ret);
   1271		goto out;
   1272	}
   1273
   1274	list_del(&quota_root->dirty_list);
   1275
   1276	btrfs_tree_lock(quota_root->node);
   1277	btrfs_clean_tree_block(quota_root->node);
   1278	btrfs_tree_unlock(quota_root->node);
   1279	btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
   1280			      quota_root->node, 0, 1);
   1281
   1282	btrfs_put_root(quota_root);
   1283
   1284out:
   1285	mutex_unlock(&fs_info->qgroup_ioctl_lock);
   1286	if (ret && trans)
   1287		btrfs_end_transaction(trans);
   1288	else if (trans)
   1289		ret = btrfs_end_transaction(trans);
   1290
   1291	return ret;
   1292}
   1293
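/*
 * Queue @qgroup on fs_info->dirty_qgroups (at most once) so that its on-disk
 * items get updated at the next qgroup write-out.
 */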
   1294static void qgroup_dirty(struct btrfs_fs_info *fs_info,
   1295			 struct btrfs_qgroup *qgroup)
   1296{
   1297	if (list_empty(&qgroup->dirty))
   1298		list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
   1299}
   1300
   1301/*
    1302 * The easy accounting case: we're updating a qgroup relationship whose child
    1303 * qgroup only has exclusive extents.
   1304 *
   1305 * In this case, all exclusive extents will also be exclusive for parent, so
   1306 * excl/rfer just get added/removed.
   1307 *
    1308 * The same applies to qgroup reservation space, which should also be added
    1309 * to / removed from the parent.
    1310 * Otherwise, when the child later releases its reserved space, the parent
    1311 * would underflow its reservation (for the relation-adding case).
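 *
 * For example (illustrative numbers): if child qgroup 0/257 has
 * rfer == excl == 1 MiB and gets assigned to parent 1/100, then 1/100 and all
 * of its ancestors gain 1 MiB of both rfer and excl; removing the relation
 * subtracts the same amount.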
   1312 *
   1313 * Caller should hold fs_info->qgroup_lock.
   1314 */
   1315static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
   1316				    struct ulist *tmp, u64 ref_root,
   1317				    struct btrfs_qgroup *src, int sign)
   1318{
   1319	struct btrfs_qgroup *qgroup;
   1320	struct btrfs_qgroup_list *glist;
   1321	struct ulist_node *unode;
   1322	struct ulist_iterator uiter;
   1323	u64 num_bytes = src->excl;
   1324	int ret = 0;
   1325
   1326	qgroup = find_qgroup_rb(fs_info, ref_root);
   1327	if (!qgroup)
   1328		goto out;
   1329
   1330	qgroup->rfer += sign * num_bytes;
   1331	qgroup->rfer_cmpr += sign * num_bytes;
   1332
   1333	WARN_ON(sign < 0 && qgroup->excl < num_bytes);
   1334	qgroup->excl += sign * num_bytes;
   1335	qgroup->excl_cmpr += sign * num_bytes;
   1336
   1337	if (sign > 0)
   1338		qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
   1339	else
   1340		qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
   1341
   1342	qgroup_dirty(fs_info, qgroup);
   1343
   1344	/* Get all of the parent groups that contain this qgroup */
   1345	list_for_each_entry(glist, &qgroup->groups, next_group) {
   1346		ret = ulist_add(tmp, glist->group->qgroupid,
   1347				qgroup_to_aux(glist->group), GFP_ATOMIC);
   1348		if (ret < 0)
   1349			goto out;
   1350	}
   1351
   1352	/* Iterate all of the parents and adjust their reference counts */
   1353	ULIST_ITER_INIT(&uiter);
   1354	while ((unode = ulist_next(tmp, &uiter))) {
   1355		qgroup = unode_aux_to_qgroup(unode);
   1356		qgroup->rfer += sign * num_bytes;
   1357		qgroup->rfer_cmpr += sign * num_bytes;
   1358		WARN_ON(sign < 0 && qgroup->excl < num_bytes);
   1359		qgroup->excl += sign * num_bytes;
   1360		if (sign > 0)
   1361			qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
   1362		else
   1363			qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
   1364		qgroup->excl_cmpr += sign * num_bytes;
   1365		qgroup_dirty(fs_info, qgroup);
   1366
   1367		/* Add any parents of the parents */
   1368		list_for_each_entry(glist, &qgroup->groups, next_group) {
   1369			ret = ulist_add(tmp, glist->group->qgroupid,
   1370					qgroup_to_aux(glist->group), GFP_ATOMIC);
   1371			if (ret < 0)
   1372				goto out;
   1373		}
   1374	}
   1375	ret = 0;
   1376out:
   1377	return ret;
   1378}
   1379
   1380
   1381/*
   1382 * Quick path for updating qgroup with only excl refs.
   1383 *
    1384 * In that case, just updating all parents will be enough.
    1385 * Otherwise we need to do a full rescan.
   1386 * Caller should also hold fs_info->qgroup_lock.
   1387 *
   1388 * Return 0 for quick update, return >0 for need to full rescan
   1389 * and mark INCONSISTENT flag.
   1390 * Return < 0 for other error.
   1391 */
   1392static int quick_update_accounting(struct btrfs_fs_info *fs_info,
   1393				   struct ulist *tmp, u64 src, u64 dst,
   1394				   int sign)
   1395{
   1396	struct btrfs_qgroup *qgroup;
   1397	int ret = 1;
   1398	int err = 0;
   1399
   1400	qgroup = find_qgroup_rb(fs_info, src);
   1401	if (!qgroup)
   1402		goto out;
   1403	if (qgroup->excl == qgroup->rfer) {
   1404		ret = 0;
   1405		err = __qgroup_excl_accounting(fs_info, tmp, dst,
   1406					       qgroup, sign);
   1407		if (err < 0) {
   1408			ret = err;
   1409			goto out;
   1410		}
   1411	}
   1412out:
   1413	if (ret)
   1414		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
   1415	return ret;
   1416}
   1417
   1418int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
   1419			      u64 dst)
   1420{
   1421	struct btrfs_fs_info *fs_info = trans->fs_info;
   1422	struct btrfs_qgroup *parent;
   1423	struct btrfs_qgroup *member;
   1424	struct btrfs_qgroup_list *list;
   1425	struct ulist *tmp;
   1426	unsigned int nofs_flag;
   1427	int ret = 0;
   1428
   1429	/* Check the level of src and dst first */
   1430	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
   1431		return -EINVAL;
   1432
   1433	/* We hold a transaction handle open, must do a NOFS allocation. */
   1434	nofs_flag = memalloc_nofs_save();
   1435	tmp = ulist_alloc(GFP_KERNEL);
   1436	memalloc_nofs_restore(nofs_flag);
   1437	if (!tmp)
   1438		return -ENOMEM;
   1439
   1440	mutex_lock(&fs_info->qgroup_ioctl_lock);
   1441	if (!fs_info->quota_root) {
   1442		ret = -ENOTCONN;
   1443		goto out;
   1444	}
   1445	member = find_qgroup_rb(fs_info, src);
   1446	parent = find_qgroup_rb(fs_info, dst);
   1447	if (!member || !parent) {
   1448		ret = -EINVAL;
   1449		goto out;
   1450	}
   1451
    1452	/* Check if such a qgroup relation already exists */
   1453	list_for_each_entry(list, &member->groups, next_group) {
   1454		if (list->group == parent) {
   1455			ret = -EEXIST;
   1456			goto out;
   1457		}
   1458	}
   1459
   1460	ret = add_qgroup_relation_item(trans, src, dst);
   1461	if (ret)
   1462		goto out;
   1463
   1464	ret = add_qgroup_relation_item(trans, dst, src);
   1465	if (ret) {
   1466		del_qgroup_relation_item(trans, src, dst);
   1467		goto out;
   1468	}
   1469
   1470	spin_lock(&fs_info->qgroup_lock);
   1471	ret = __add_relation_rb(member, parent);
   1472	if (ret < 0) {
   1473		spin_unlock(&fs_info->qgroup_lock);
   1474		goto out;
   1475	}
   1476	ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
   1477	spin_unlock(&fs_info->qgroup_lock);
   1478out:
   1479	mutex_unlock(&fs_info->qgroup_ioctl_lock);
   1480	ulist_free(tmp);
   1481	return ret;
   1482}
   1483
   1484static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
   1485				 u64 dst)
   1486{
   1487	struct btrfs_fs_info *fs_info = trans->fs_info;
   1488	struct btrfs_qgroup *parent;
   1489	struct btrfs_qgroup *member;
   1490	struct btrfs_qgroup_list *list;
   1491	struct ulist *tmp;
   1492	bool found = false;
   1493	unsigned int nofs_flag;
   1494	int ret = 0;
   1495	int ret2;
   1496
   1497	/* We hold a transaction handle open, must do a NOFS allocation. */
   1498	nofs_flag = memalloc_nofs_save();
   1499	tmp = ulist_alloc(GFP_KERNEL);
   1500	memalloc_nofs_restore(nofs_flag);
   1501	if (!tmp)
   1502		return -ENOMEM;
   1503
   1504	if (!fs_info->quota_root) {
   1505		ret = -ENOTCONN;
   1506		goto out;
   1507	}
   1508
   1509	member = find_qgroup_rb(fs_info, src);
   1510	parent = find_qgroup_rb(fs_info, dst);
   1511	/*
    1512	 * If the parent/member pair doesn't exist, only try to delete the dead
    1513	 * relation items.
   1514	 */
   1515	if (!member || !parent)
   1516		goto delete_item;
   1517
    1518	/* Check if such a qgroup relation already exists */
   1519	list_for_each_entry(list, &member->groups, next_group) {
   1520		if (list->group == parent) {
   1521			found = true;
   1522			break;
   1523		}
   1524	}
   1525
   1526delete_item:
   1527	ret = del_qgroup_relation_item(trans, src, dst);
   1528	if (ret < 0 && ret != -ENOENT)
   1529		goto out;
   1530	ret2 = del_qgroup_relation_item(trans, dst, src);
   1531	if (ret2 < 0 && ret2 != -ENOENT)
   1532		goto out;
   1533
   1534	/* At least one deletion succeeded, return 0 */
   1535	if (!ret || !ret2)
   1536		ret = 0;
   1537
   1538	if (found) {
   1539		spin_lock(&fs_info->qgroup_lock);
   1540		del_relation_rb(fs_info, src, dst);
   1541		ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
   1542		spin_unlock(&fs_info->qgroup_lock);
   1543	}
   1544out:
   1545	ulist_free(tmp);
   1546	return ret;
   1547}
   1548
   1549int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
   1550			      u64 dst)
   1551{
   1552	struct btrfs_fs_info *fs_info = trans->fs_info;
   1553	int ret = 0;
   1554
   1555	mutex_lock(&fs_info->qgroup_ioctl_lock);
   1556	ret = __del_qgroup_relation(trans, src, dst);
   1557	mutex_unlock(&fs_info->qgroup_ioctl_lock);
   1558
   1559	return ret;
   1560}
   1561
   1562int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
   1563{
   1564	struct btrfs_fs_info *fs_info = trans->fs_info;
   1565	struct btrfs_root *quota_root;
   1566	struct btrfs_qgroup *qgroup;
   1567	int ret = 0;
   1568
   1569	mutex_lock(&fs_info->qgroup_ioctl_lock);
   1570	if (!fs_info->quota_root) {
   1571		ret = -ENOTCONN;
   1572		goto out;
   1573	}
   1574	quota_root = fs_info->quota_root;
   1575	qgroup = find_qgroup_rb(fs_info, qgroupid);
   1576	if (qgroup) {
   1577		ret = -EEXIST;
   1578		goto out;
   1579	}
   1580
   1581	ret = add_qgroup_item(trans, quota_root, qgroupid);
   1582	if (ret)
   1583		goto out;
   1584
   1585	spin_lock(&fs_info->qgroup_lock);
   1586	qgroup = add_qgroup_rb(fs_info, qgroupid);
   1587	spin_unlock(&fs_info->qgroup_lock);
   1588
   1589	if (IS_ERR(qgroup)) {
   1590		ret = PTR_ERR(qgroup);
   1591		goto out;
   1592	}
   1593	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
   1594out:
   1595	mutex_unlock(&fs_info->qgroup_ioctl_lock);
   1596	return ret;
   1597}
   1598
   1599int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
   1600{
   1601	struct btrfs_fs_info *fs_info = trans->fs_info;
   1602	struct btrfs_qgroup *qgroup;
   1603	struct btrfs_qgroup_list *list;
   1604	int ret = 0;
   1605
   1606	mutex_lock(&fs_info->qgroup_ioctl_lock);
   1607	if (!fs_info->quota_root) {
   1608		ret = -ENOTCONN;
   1609		goto out;
   1610	}
   1611
   1612	qgroup = find_qgroup_rb(fs_info, qgroupid);
   1613	if (!qgroup) {
   1614		ret = -ENOENT;
   1615		goto out;
   1616	}
   1617
   1618	/* Check if there are no children of this qgroup */
   1619	if (!list_empty(&qgroup->members)) {
   1620		ret = -EBUSY;
   1621		goto out;
   1622	}
   1623
   1624	ret = del_qgroup_item(trans, qgroupid);
   1625	if (ret && ret != -ENOENT)
   1626		goto out;
   1627
   1628	while (!list_empty(&qgroup->groups)) {
   1629		list = list_first_entry(&qgroup->groups,
   1630					struct btrfs_qgroup_list, next_group);
   1631		ret = __del_qgroup_relation(trans, qgroupid,
   1632					    list->group->qgroupid);
   1633		if (ret)
   1634			goto out;
   1635	}
   1636
   1637	spin_lock(&fs_info->qgroup_lock);
   1638	del_qgroup_rb(fs_info, qgroupid);
   1639	spin_unlock(&fs_info->qgroup_lock);
   1640
   1641	/*
   1642	 * Remove the qgroup from sysfs now without holding the qgroup_lock
   1643	 * spinlock, since the sysfs_remove_group() function needs to take
   1644	 * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
   1645	 */
   1646	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
   1647	kfree(qgroup);
   1648out:
   1649	mutex_unlock(&fs_info->qgroup_ioctl_lock);
   1650	return ret;
   1651}
   1652
   1653int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
   1654		       struct btrfs_qgroup_limit *limit)
   1655{
   1656	struct btrfs_fs_info *fs_info = trans->fs_info;
   1657	struct btrfs_qgroup *qgroup;
   1658	int ret = 0;
   1659	/* Sometimes we would want to clear the limit on this qgroup.
   1660	 * To meet this requirement, we treat the -1 as a special value
    1661	 * which tells the kernel to clear the limit on this qgroup.
   1662	 */
   1663	const u64 CLEAR_VALUE = -1;
   1664
   1665	mutex_lock(&fs_info->qgroup_ioctl_lock);
   1666	if (!fs_info->quota_root) {
   1667		ret = -ENOTCONN;
   1668		goto out;
   1669	}
   1670
   1671	qgroup = find_qgroup_rb(fs_info, qgroupid);
   1672	if (!qgroup) {
   1673		ret = -ENOENT;
   1674		goto out;
   1675	}
   1676
   1677	spin_lock(&fs_info->qgroup_lock);
   1678	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
   1679		if (limit->max_rfer == CLEAR_VALUE) {
   1680			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
   1681			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
   1682			qgroup->max_rfer = 0;
   1683		} else {
   1684			qgroup->max_rfer = limit->max_rfer;
   1685		}
   1686	}
   1687	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
   1688		if (limit->max_excl == CLEAR_VALUE) {
   1689			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
   1690			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
   1691			qgroup->max_excl = 0;
   1692		} else {
   1693			qgroup->max_excl = limit->max_excl;
   1694		}
   1695	}
   1696	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
   1697		if (limit->rsv_rfer == CLEAR_VALUE) {
   1698			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
   1699			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
   1700			qgroup->rsv_rfer = 0;
   1701		} else {
   1702			qgroup->rsv_rfer = limit->rsv_rfer;
   1703		}
   1704	}
   1705	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
   1706		if (limit->rsv_excl == CLEAR_VALUE) {
   1707			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
   1708			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
   1709			qgroup->rsv_excl = 0;
   1710		} else {
   1711			qgroup->rsv_excl = limit->rsv_excl;
   1712		}
   1713	}
   1714	qgroup->lim_flags |= limit->flags;
   1715
   1716	spin_unlock(&fs_info->qgroup_lock);
   1717
   1718	ret = update_qgroup_limit_item(trans, qgroup);
   1719	if (ret) {
   1720		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
   1721		btrfs_info(fs_info, "unable to update quota limit for %llu",
   1722		       qgroupid);
   1723	}
   1724
   1725out:
   1726	mutex_unlock(&fs_info->qgroup_ioctl_lock);
   1727	return ret;
   1728}
   1729
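/*
 * Insert @record into the transaction's dirty_extent_root rbtree, keyed by
 * bytenr.  Returns 1 if a record for that bytenr already exists (the caller
 * then frees its own copy), 0 if the record was inserted.  Caller must hold
 * delayed_refs->lock.
 */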
   1730int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
   1731				struct btrfs_delayed_ref_root *delayed_refs,
   1732				struct btrfs_qgroup_extent_record *record)
   1733{
   1734	struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
   1735	struct rb_node *parent_node = NULL;
   1736	struct btrfs_qgroup_extent_record *entry;
   1737	u64 bytenr = record->bytenr;
   1738
   1739	lockdep_assert_held(&delayed_refs->lock);
   1740	trace_btrfs_qgroup_trace_extent(fs_info, record);
   1741
   1742	while (*p) {
   1743		parent_node = *p;
   1744		entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
   1745				 node);
   1746		if (bytenr < entry->bytenr) {
   1747			p = &(*p)->rb_left;
   1748		} else if (bytenr > entry->bytenr) {
   1749			p = &(*p)->rb_right;
   1750		} else {
   1751			if (record->data_rsv && !entry->data_rsv) {
   1752				entry->data_rsv = record->data_rsv;
   1753				entry->data_rsv_refroot =
   1754					record->data_rsv_refroot;
   1755			}
   1756			return 1;
   1757		}
   1758	}
   1759
   1760	rb_link_node(&record->node, parent_node, p);
   1761	rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
   1762	return 0;
   1763}
   1764
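        /*
         * Post-processing after a qgroup extent record has been inserted:
         * look up all roots referring to qrecord->bytenr in the commit roots
         * and store them as the record's old_roots, to be consumed later by
         * btrfs_qgroup_account_extents().  A failed lookup only marks the
         * quota state inconsistent instead of failing the transaction.
         */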
   1765int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
   1766				   struct btrfs_qgroup_extent_record *qrecord)
   1767{
   1768	struct ulist *old_root;
   1769	u64 bytenr = qrecord->bytenr;
   1770	int ret;
   1771
   1772	/*
   1773	 * We are always called in a context where we are already holding a
   1774	 * transaction handle. Often we are called when adding a data delayed
   1775	 * reference from btrfs_truncate_inode_items() (truncating or unlinking),
    1776	 * in which case we will be holding a write lock on an extent buffer from a
   1777	 * subvolume tree. In this case we can't allow btrfs_find_all_roots() to
   1778	 * acquire fs_info->commit_root_sem, because that is a higher level lock
   1779	 * that must be acquired before locking any extent buffers.
   1780	 *
   1781	 * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
   1782	 * but we can't pass it a non-NULL transaction handle, because otherwise
   1783	 * it would not use commit roots and would lock extent buffers, causing
   1784	 * a deadlock if it ends up trying to read lock the same extent buffer
   1785	 * that was previously write locked at btrfs_truncate_inode_items().
   1786	 *
   1787	 * So pass a NULL transaction handle to btrfs_find_all_roots() and
   1788	 * explicitly tell it to not acquire the commit_root_sem - if we are
   1789	 * holding a transaction handle we don't need its protection.
   1790	 */
   1791	ASSERT(trans != NULL);
   1792
   1793	ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
   1794				   true);
   1795	if (ret < 0) {
   1796		trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
   1797		btrfs_warn(trans->fs_info,
   1798"error accounting new delayed refs extent (err code: %d), quota inconsistent",
   1799			ret);
   1800		return 0;
   1801	}
   1802
   1803	/*
   1804	 * Here we don't need to get the lock of
    1805	 * trans->transaction->delayed_refs, since the inserted qrecord won't
    1806	 * be deleted, only qrecord->node may be modified (by a new qrecord insert).
   1807	 *
   1808	 * So modifying qrecord->old_roots is safe here
   1809	 */
   1810	qrecord->old_roots = old_root;
   1811	return 0;
   1812}
   1813
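        /*
         * Allocate and queue a qgroup extent record for [bytenr, bytenr +
         * num_bytes) in the current transaction's delayed refs, then resolve
         * its old roots via btrfs_qgroup_trace_extent_post().  If the extent
         * is already recorded, the freshly allocated record is freed again.
         */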
   1814int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
   1815			      u64 num_bytes, gfp_t gfp_flag)
   1816{
   1817	struct btrfs_fs_info *fs_info = trans->fs_info;
   1818	struct btrfs_qgroup_extent_record *record;
   1819	struct btrfs_delayed_ref_root *delayed_refs;
   1820	int ret;
   1821
   1822	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
   1823	    || bytenr == 0 || num_bytes == 0)
   1824		return 0;
   1825	record = kzalloc(sizeof(*record), gfp_flag);
   1826	if (!record)
   1827		return -ENOMEM;
   1828
   1829	delayed_refs = &trans->transaction->delayed_refs;
   1830	record->bytenr = bytenr;
   1831	record->num_bytes = num_bytes;
   1832	record->old_roots = NULL;
   1833
   1834	spin_lock(&delayed_refs->lock);
   1835	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
   1836	spin_unlock(&delayed_refs->lock);
   1837	if (ret > 0) {
   1838		kfree(record);
   1839		return 0;
   1840	}
   1841	return btrfs_qgroup_trace_extent_post(trans, record);
   1842}
   1843
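        /*
         * Trace all regular/prealloc file extents in leaf @eb so they get
         * qgroup accounted.  Inline extents and holes (disk_bytenr == 0) are
         * skipped.
         */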
   1844int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
   1845				  struct extent_buffer *eb)
   1846{
   1847	struct btrfs_fs_info *fs_info = trans->fs_info;
   1848	int nr = btrfs_header_nritems(eb);
   1849	int i, extent_type, ret;
   1850	struct btrfs_key key;
   1851	struct btrfs_file_extent_item *fi;
   1852	u64 bytenr, num_bytes;
   1853
   1854	/* We can be called directly from walk_up_proc() */
   1855	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
   1856		return 0;
   1857
   1858	for (i = 0; i < nr; i++) {
   1859		btrfs_item_key_to_cpu(eb, &key, i);
   1860
   1861		if (key.type != BTRFS_EXTENT_DATA_KEY)
   1862			continue;
   1863
   1864		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
    1865		/* Filter out non-qgroup-accountable extents */
   1866		extent_type = btrfs_file_extent_type(eb, fi);
   1867
   1868		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
   1869			continue;
   1870
   1871		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
   1872		if (!bytenr)
   1873			continue;
   1874
   1875		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
   1876
   1877		ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
   1878						GFP_NOFS);
   1879		if (ret)
   1880			return ret;
   1881	}
   1882	cond_resched();
   1883	return 0;
   1884}
   1885
   1886/*
   1887 * Walk up the tree from the bottom, freeing leaves and any interior
   1888 * nodes which have had all slots visited. If a node (leaf or
    1889 * interior) is freed, the node above it will have its slot
   1890 * incremented. The root node will never be freed.
   1891 *
   1892 * At the end of this function, we should have a path which has all
   1893 * slots incremented to the next position for a search. If we need to
   1894 * read a new node it will be NULL and the node above it will have the
   1895 * correct slot selected for a later read.
   1896 *
    1897 * If we increment the root node's slot counter past the number of
   1898 * elements, 1 is returned to signal completion of the search.
   1899 */
   1900static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
   1901{
   1902	int level = 0;
   1903	int nr, slot;
   1904	struct extent_buffer *eb;
   1905
   1906	if (root_level == 0)
   1907		return 1;
   1908
   1909	while (level <= root_level) {
   1910		eb = path->nodes[level];
   1911		nr = btrfs_header_nritems(eb);
   1912		path->slots[level]++;
   1913		slot = path->slots[level];
   1914		if (slot >= nr || level == 0) {
   1915			/*
    1916			 * Don't free the root - we will detect this
    1917			 * condition after our loop and return a
    1918			 * positive value for the caller to stop walking the tree.
   1919			 */
   1920			if (level != root_level) {
   1921				btrfs_tree_unlock_rw(eb, path->locks[level]);
   1922				path->locks[level] = 0;
   1923
   1924				free_extent_buffer(eb);
   1925				path->nodes[level] = NULL;
   1926				path->slots[level] = 0;
   1927			}
   1928		} else {
   1929			/*
   1930			 * We have a valid slot to walk back down
   1931			 * from. Stop here so caller can process these
   1932			 * new nodes.
   1933			 */
   1934			break;
   1935		}
   1936
   1937		level++;
   1938	}
   1939
   1940	eb = path->nodes[root_level];
   1941	if (path->slots[root_level] >= btrfs_header_nritems(eb))
   1942		return 1;
   1943
   1944	return 0;
   1945}
   1946
   1947/*
   1948 * Helper function to trace a subtree tree block swap.
   1949 *
   1950 * The swap will happen in highest tree block, but there may be a lot of
   1951 * tree blocks involved.
   1952 *
   1953 * For example:
   1954 *  OO = Old tree blocks
   1955 *  NN = New tree blocks allocated during balance
   1956 *
   1957 *           File tree (257)                  Reloc tree for 257
   1958 * L2              OO                                NN
   1959 *               /    \                            /    \
   1960 * L1          OO      OO (a)                    OO      NN (a)
   1961 *            / \     / \                       / \     / \
   1962 * L0       OO   OO OO   OO                   OO   OO NN   NN
   1963 *                  (b)  (c)                          (b)  (c)
   1964 *
   1965 * When calling qgroup_trace_extent_swap(), we will pass:
   1966 * @src_eb = OO(a)
   1967 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
   1968 * @dst_level = 0
   1969 * @root_level = 1
   1970 *
   1971 * In that case, qgroup_trace_extent_swap() will search from OO(a) to
   1972 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
   1973 *
   1974 * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
   1975 *
   1976 * 1) Tree search from @src_eb
    1977 *    It acts as a simplified btrfs_search_slot().
   1978 *    The key for search can be extracted from @dst_path->nodes[dst_level]
   1979 *    (first key).
   1980 *
   1981 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
   1982 *    NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
   1983 *    They should be marked during previous (@dst_level = 1) iteration.
   1984 *
   1985 * 3) Mark file extents in leaves dirty
    1986 *    We don't have a good way to pick out only the new file extents,
    1987 *    so we still follow the old method of scanning all file extents in
    1988 *    the leaf.
   1989 *
    1990 * This function frees us from keeping two paths, thus later we only need
    1991 * to care about how to iterate all new tree blocks in the reloc tree.
   1992 */
   1993static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
   1994				    struct extent_buffer *src_eb,
   1995				    struct btrfs_path *dst_path,
   1996				    int dst_level, int root_level,
   1997				    bool trace_leaf)
   1998{
   1999	struct btrfs_key key;
   2000	struct btrfs_path *src_path;
   2001	struct btrfs_fs_info *fs_info = trans->fs_info;
   2002	u32 nodesize = fs_info->nodesize;
   2003	int cur_level = root_level;
   2004	int ret;
   2005
   2006	BUG_ON(dst_level > root_level);
   2007	/* Level mismatch */
   2008	if (btrfs_header_level(src_eb) != root_level)
   2009		return -EINVAL;
   2010
   2011	src_path = btrfs_alloc_path();
   2012	if (!src_path) {
   2013		ret = -ENOMEM;
   2014		goto out;
   2015	}
   2016
   2017	if (dst_level)
   2018		btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
   2019	else
   2020		btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
   2021
   2022	/* For src_path */
   2023	atomic_inc(&src_eb->refs);
   2024	src_path->nodes[root_level] = src_eb;
   2025	src_path->slots[root_level] = dst_path->slots[root_level];
   2026	src_path->locks[root_level] = 0;
   2027
   2028	/* A simplified version of btrfs_search_slot() */
   2029	while (cur_level >= dst_level) {
   2030		struct btrfs_key src_key;
   2031		struct btrfs_key dst_key;
   2032
   2033		if (src_path->nodes[cur_level] == NULL) {
   2034			struct extent_buffer *eb;
   2035			int parent_slot;
   2036
   2037			eb = src_path->nodes[cur_level + 1];
   2038			parent_slot = src_path->slots[cur_level + 1];
   2039
   2040			eb = btrfs_read_node_slot(eb, parent_slot);
   2041			if (IS_ERR(eb)) {
   2042				ret = PTR_ERR(eb);
   2043				goto out;
   2044			}
   2045
   2046			src_path->nodes[cur_level] = eb;
   2047
   2048			btrfs_tree_read_lock(eb);
   2049			src_path->locks[cur_level] = BTRFS_READ_LOCK;
   2050		}
   2051
   2052		src_path->slots[cur_level] = dst_path->slots[cur_level];
   2053		if (cur_level) {
   2054			btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
   2055					&dst_key, dst_path->slots[cur_level]);
   2056			btrfs_node_key_to_cpu(src_path->nodes[cur_level],
   2057					&src_key, src_path->slots[cur_level]);
   2058		} else {
   2059			btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
   2060					&dst_key, dst_path->slots[cur_level]);
   2061			btrfs_item_key_to_cpu(src_path->nodes[cur_level],
   2062					&src_key, src_path->slots[cur_level]);
   2063		}
   2064		/* Content mismatch, something went wrong */
   2065		if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
   2066			ret = -ENOENT;
   2067			goto out;
   2068		}
   2069		cur_level--;
   2070	}
   2071
   2072	/*
   2073	 * Now both @dst_path and @src_path have been populated, record the tree
   2074	 * blocks for qgroup accounting.
   2075	 */
   2076	ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
   2077			nodesize, GFP_NOFS);
   2078	if (ret < 0)
   2079		goto out;
   2080	ret = btrfs_qgroup_trace_extent(trans,
   2081			dst_path->nodes[dst_level]->start,
   2082			nodesize, GFP_NOFS);
   2083	if (ret < 0)
   2084		goto out;
   2085
   2086	/* Record leaf file extents */
   2087	if (dst_level == 0 && trace_leaf) {
   2088		ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
   2089		if (ret < 0)
   2090			goto out;
   2091		ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
   2092	}
   2093out:
   2094	btrfs_free_path(src_path);
   2095	return ret;
   2096}
   2097
   2098/*
   2099 * Helper function to do recursive generation-aware depth-first search, to
   2100 * locate all new tree blocks in a subtree of reloc tree.
   2101 *
   2102 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
   2103 *         reloc tree
   2104 * L2         NN (a)
   2105 *          /    \
   2106 * L1    OO        NN (b)
   2107 *      /  \      /  \
   2108 * L0  OO  OO    OO  NN
   2109 *               (c) (d)
   2110 * If we pass:
   2111 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
   2112 * @cur_level = 1
   2113 * @root_level = 1
   2114 *
    2115 * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace
    2116 * the above tree blocks along with their counterparts in the file tree.
    2117 * During the search, old tree blocks OO(c) will be skipped as the tree block
    2118 * swap won't affect OO(c).
   2119 */
   2120static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
   2121					   struct extent_buffer *src_eb,
   2122					   struct btrfs_path *dst_path,
   2123					   int cur_level, int root_level,
   2124					   u64 last_snapshot, bool trace_leaf)
   2125{
   2126	struct btrfs_fs_info *fs_info = trans->fs_info;
   2127	struct extent_buffer *eb;
   2128	bool need_cleanup = false;
   2129	int ret = 0;
   2130	int i;
   2131
   2132	/* Level sanity check */
   2133	if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
   2134	    root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
   2135	    root_level < cur_level) {
   2136		btrfs_err_rl(fs_info,
   2137			"%s: bad levels, cur_level=%d root_level=%d",
   2138			__func__, cur_level, root_level);
   2139		return -EUCLEAN;
   2140	}
   2141
   2142	/* Read the tree block if needed */
   2143	if (dst_path->nodes[cur_level] == NULL) {
   2144		int parent_slot;
   2145		u64 child_gen;
   2146
   2147		/*
   2148		 * dst_path->nodes[root_level] must be initialized before
   2149		 * calling this function.
   2150		 */
   2151		if (cur_level == root_level) {
   2152			btrfs_err_rl(fs_info,
   2153	"%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
   2154				__func__, root_level, root_level, cur_level);
   2155			return -EUCLEAN;
   2156		}
   2157
   2158		/*
   2159		 * We need to get child blockptr/gen from parent before we can
   2160		 * read it.
    2161		 */
   2162		eb = dst_path->nodes[cur_level + 1];
   2163		parent_slot = dst_path->slots[cur_level + 1];
   2164		child_gen = btrfs_node_ptr_generation(eb, parent_slot);
   2165
   2166		/* This node is old, no need to trace */
   2167		if (child_gen < last_snapshot)
   2168			goto out;
   2169
   2170		eb = btrfs_read_node_slot(eb, parent_slot);
   2171		if (IS_ERR(eb)) {
   2172			ret = PTR_ERR(eb);
   2173			goto out;
   2174		}
   2175
   2176		dst_path->nodes[cur_level] = eb;
   2177		dst_path->slots[cur_level] = 0;
   2178
   2179		btrfs_tree_read_lock(eb);
   2180		dst_path->locks[cur_level] = BTRFS_READ_LOCK;
   2181		need_cleanup = true;
   2182	}
   2183
    2184	/* Now record this tree block and its counterpart for qgroups */
   2185	ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
   2186				       root_level, trace_leaf);
   2187	if (ret < 0)
   2188		goto cleanup;
   2189
   2190	eb = dst_path->nodes[cur_level];
   2191
   2192	if (cur_level > 0) {
   2193		/* Iterate all child tree blocks */
   2194		for (i = 0; i < btrfs_header_nritems(eb); i++) {
   2195			/* Skip old tree blocks as they won't be swapped */
   2196			if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
   2197				continue;
   2198			dst_path->slots[cur_level] = i;
   2199
   2200			/* Recursive call (at most 7 times) */
   2201			ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
   2202					dst_path, cur_level - 1, root_level,
   2203					last_snapshot, trace_leaf);
   2204			if (ret < 0)
   2205				goto cleanup;
   2206		}
   2207	}
   2208
   2209cleanup:
   2210	if (need_cleanup) {
   2211		/* Clean up */
   2212		btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
   2213				     dst_path->locks[cur_level]);
   2214		free_extent_buffer(dst_path->nodes[cur_level]);
   2215		dst_path->nodes[cur_level] = NULL;
   2216		dst_path->slots[cur_level] = 0;
   2217		dst_path->locks[cur_level] = 0;
   2218	}
   2219out:
   2220	return ret;
   2221}
   2222
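        /*
         * Trace both subtrees involved in a tree block swap done by balance:
         * set up @dst_path rooted at @dst_eb and let
         * qgroup_trace_new_subtree_blocks() walk all tree blocks newer than
         * @last_snapshot, tracing them together with their counterparts
         * under @src_eb.
         */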
   2223static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
   2224				struct extent_buffer *src_eb,
   2225				struct extent_buffer *dst_eb,
   2226				u64 last_snapshot, bool trace_leaf)
   2227{
   2228	struct btrfs_fs_info *fs_info = trans->fs_info;
   2229	struct btrfs_path *dst_path = NULL;
   2230	int level;
   2231	int ret;
   2232
   2233	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
   2234		return 0;
   2235
   2236	/* Wrong parameter order */
   2237	if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
   2238		btrfs_err_rl(fs_info,
   2239		"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
   2240			     btrfs_header_generation(src_eb),
   2241			     btrfs_header_generation(dst_eb));
   2242		return -EUCLEAN;
   2243	}
   2244
   2245	if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
   2246		ret = -EIO;
   2247		goto out;
   2248	}
   2249
   2250	level = btrfs_header_level(dst_eb);
   2251	dst_path = btrfs_alloc_path();
   2252	if (!dst_path) {
   2253		ret = -ENOMEM;
   2254		goto out;
   2255	}
   2256	/* For dst_path */
   2257	atomic_inc(&dst_eb->refs);
   2258	dst_path->nodes[level] = dst_eb;
   2259	dst_path->slots[level] = 0;
   2260	dst_path->locks[level] = 0;
   2261
    2262	/* Do the generation-aware depth-first search */
   2263	ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
   2264					      level, last_snapshot, trace_leaf);
   2265	if (ret < 0)
   2266		goto out;
   2267	ret = 0;
   2268
   2269out:
   2270	btrfs_free_path(dst_path);
   2271	if (ret < 0)
   2272		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
   2273	return ret;
   2274}
   2275
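        /*
         * Walk the whole subtree rooted at @root_eb, reading in missing tree
         * blocks as needed, and trace every tree block plus all file extent
         * items in the leaves for qgroup accounting.
         */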
   2276int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
   2277			       struct extent_buffer *root_eb,
   2278			       u64 root_gen, int root_level)
   2279{
   2280	struct btrfs_fs_info *fs_info = trans->fs_info;
   2281	int ret = 0;
   2282	int level;
   2283	struct extent_buffer *eb = root_eb;
   2284	struct btrfs_path *path = NULL;
   2285
   2286	BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
   2287	BUG_ON(root_eb == NULL);
   2288
   2289	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
   2290		return 0;
   2291
   2292	if (!extent_buffer_uptodate(root_eb)) {
   2293		ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL);
   2294		if (ret)
   2295			goto out;
   2296	}
   2297
   2298	if (root_level == 0) {
   2299		ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
   2300		goto out;
   2301	}
   2302
   2303	path = btrfs_alloc_path();
   2304	if (!path)
   2305		return -ENOMEM;
   2306
   2307	/*
   2308	 * Walk down the tree.  Missing extent blocks are filled in as
   2309	 * we go. Metadata is accounted every time we read a new
   2310	 * extent block.
   2311	 *
   2312	 * When we reach a leaf, we account for file extent items in it,
   2313	 * walk back up the tree (adjusting slot pointers as we go)
   2314	 * and restart the search process.
   2315	 */
   2316	atomic_inc(&root_eb->refs);	/* For path */
   2317	path->nodes[root_level] = root_eb;
   2318	path->slots[root_level] = 0;
   2319	path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
   2320walk_down:
   2321	level = root_level;
   2322	while (level >= 0) {
   2323		if (path->nodes[level] == NULL) {
   2324			int parent_slot;
   2325			u64 child_bytenr;
   2326
   2327			/*
   2328			 * We need to get child blockptr from parent before we
   2329			 * can read it.
    2330			 */
   2331			eb = path->nodes[level + 1];
   2332			parent_slot = path->slots[level + 1];
   2333			child_bytenr = btrfs_node_blockptr(eb, parent_slot);
   2334
   2335			eb = btrfs_read_node_slot(eb, parent_slot);
   2336			if (IS_ERR(eb)) {
   2337				ret = PTR_ERR(eb);
   2338				goto out;
   2339			}
   2340
   2341			path->nodes[level] = eb;
   2342			path->slots[level] = 0;
   2343
   2344			btrfs_tree_read_lock(eb);
   2345			path->locks[level] = BTRFS_READ_LOCK;
   2346
   2347			ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
   2348							fs_info->nodesize,
   2349							GFP_NOFS);
   2350			if (ret)
   2351				goto out;
   2352		}
   2353
   2354		if (level == 0) {
   2355			ret = btrfs_qgroup_trace_leaf_items(trans,
   2356							    path->nodes[level]);
   2357			if (ret)
   2358				goto out;
   2359
   2360			/* Nonzero return here means we completed our search */
   2361			ret = adjust_slots_upwards(path, root_level);
   2362			if (ret)
   2363				break;
   2364
   2365			/* Restart search with new slots */
   2366			goto walk_down;
   2367		}
   2368
   2369		level--;
   2370	}
   2371
   2372	ret = 0;
   2373out:
   2374	btrfs_free_path(path);
   2375
   2376	return ret;
   2377}
   2378
   2379#define UPDATE_NEW	0
   2380#define UPDATE_OLD	1
   2381/*
    2382 * Walk all of the roots that point to the bytenr and adjust their refcnts.
   2383 */
   2384static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
   2385				struct ulist *roots, struct ulist *tmp,
   2386				struct ulist *qgroups, u64 seq, int update_old)
   2387{
   2388	struct ulist_node *unode;
   2389	struct ulist_iterator uiter;
   2390	struct ulist_node *tmp_unode;
   2391	struct ulist_iterator tmp_uiter;
   2392	struct btrfs_qgroup *qg;
   2393	int ret = 0;
   2394
   2395	if (!roots)
   2396		return 0;
   2397	ULIST_ITER_INIT(&uiter);
   2398	while ((unode = ulist_next(roots, &uiter))) {
   2399		qg = find_qgroup_rb(fs_info, unode->val);
   2400		if (!qg)
   2401			continue;
   2402
   2403		ulist_reinit(tmp);
   2404		ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
   2405				GFP_ATOMIC);
   2406		if (ret < 0)
   2407			return ret;
   2408		ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
   2409		if (ret < 0)
   2410			return ret;
   2411		ULIST_ITER_INIT(&tmp_uiter);
   2412		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
   2413			struct btrfs_qgroup_list *glist;
   2414
   2415			qg = unode_aux_to_qgroup(tmp_unode);
   2416			if (update_old)
   2417				btrfs_qgroup_update_old_refcnt(qg, seq, 1);
   2418			else
   2419				btrfs_qgroup_update_new_refcnt(qg, seq, 1);
   2420			list_for_each_entry(glist, &qg->groups, next_group) {
   2421				ret = ulist_add(qgroups, glist->group->qgroupid,
   2422						qgroup_to_aux(glist->group),
   2423						GFP_ATOMIC);
   2424				if (ret < 0)
   2425					return ret;
   2426				ret = ulist_add(tmp, glist->group->qgroupid,
   2427						qgroup_to_aux(glist->group),
   2428						GFP_ATOMIC);
   2429				if (ret < 0)
   2430					return ret;
   2431			}
   2432		}
   2433	}
   2434	return 0;
   2435}
   2436
   2437/*
   2438 * Update qgroup rfer/excl counters.
    2439 * Rfer update is easy, the code explains itself.
   2440 *
   2441 * Excl update is tricky, the update is split into 2 parts.
   2442 * Part 1: Possible exclusive <-> sharing detect:
   2443 *	|	A	|	!A	|
   2444 *  -------------------------------------
   2445 *  B	|	*	|	-	|
   2446 *  -------------------------------------
   2447 *  !B	|	+	|	**	|
   2448 *  -------------------------------------
   2449 *
   2450 * Conditions:
   2451 * A:	cur_old_roots < nr_old_roots	(not exclusive before)
   2452 * !A:	cur_old_roots == nr_old_roots	(possible exclusive before)
   2453 * B:	cur_new_roots < nr_new_roots	(not exclusive now)
   2454 * !B:	cur_new_roots == nr_new_roots	(possible exclusive now)
   2455 *
   2456 * Results:
   2457 * +: Possible sharing -> exclusive	-: Possible exclusive -> sharing
   2458 * *: Definitely not changed.		**: Possible unchanged.
   2459 *
    2460 * For the !A and !B conditions, the exception is the cur_old/new_roots == 0
    2461 * case.
    2462 *
    2463 * To make the logic clear, we first use conditions A and B to split the
    2464 * combinations into 4 results.
    2465 *
    2466 * Then, for results "+" and "-", check the old/new_roots == 0 case, as in
    2467 * those only one variant may be 0.
    2468 *
    2469 * Lastly, check result **; since there are 2 variants that may be 0, split
    2470 * them again (2x2).  This time we don't need to consider anything else, as
    2471 * the code and logic are easy to follow now.
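         *
         * For example, with nr_old_roots = 1 and nr_new_roots = 2, a qgroup
         * with cur_old_count == 1 and cur_new_count == 1 falls in the "-"
         * cell (!A, B): it may have gone from exclusive to shared, so excl
         * is decreased by num_bytes while rfer is left untouched.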
   2472 */
   2473static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
   2474				  struct ulist *qgroups,
   2475				  u64 nr_old_roots,
   2476				  u64 nr_new_roots,
   2477				  u64 num_bytes, u64 seq)
   2478{
   2479	struct ulist_node *unode;
   2480	struct ulist_iterator uiter;
   2481	struct btrfs_qgroup *qg;
   2482	u64 cur_new_count, cur_old_count;
   2483
   2484	ULIST_ITER_INIT(&uiter);
   2485	while ((unode = ulist_next(qgroups, &uiter))) {
   2486		bool dirty = false;
   2487
   2488		qg = unode_aux_to_qgroup(unode);
   2489		cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
   2490		cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
   2491
   2492		trace_qgroup_update_counters(fs_info, qg, cur_old_count,
   2493					     cur_new_count);
   2494
   2495		/* Rfer update part */
   2496		if (cur_old_count == 0 && cur_new_count > 0) {
   2497			qg->rfer += num_bytes;
   2498			qg->rfer_cmpr += num_bytes;
   2499			dirty = true;
   2500		}
   2501		if (cur_old_count > 0 && cur_new_count == 0) {
   2502			qg->rfer -= num_bytes;
   2503			qg->rfer_cmpr -= num_bytes;
   2504			dirty = true;
   2505		}
   2506
   2507		/* Excl update part */
   2508		/* Exclusive/none -> shared case */
   2509		if (cur_old_count == nr_old_roots &&
   2510		    cur_new_count < nr_new_roots) {
   2511			/* Exclusive -> shared */
   2512			if (cur_old_count != 0) {
   2513				qg->excl -= num_bytes;
   2514				qg->excl_cmpr -= num_bytes;
   2515				dirty = true;
   2516			}
   2517		}
   2518
   2519		/* Shared -> exclusive/none case */
   2520		if (cur_old_count < nr_old_roots &&
   2521		    cur_new_count == nr_new_roots) {
   2522			/* Shared->exclusive */
   2523			if (cur_new_count != 0) {
   2524				qg->excl += num_bytes;
   2525				qg->excl_cmpr += num_bytes;
   2526				dirty = true;
   2527			}
   2528		}
   2529
   2530		/* Exclusive/none -> exclusive/none case */
   2531		if (cur_old_count == nr_old_roots &&
   2532		    cur_new_count == nr_new_roots) {
   2533			if (cur_old_count == 0) {
   2534				/* None -> exclusive/none */
   2535
   2536				if (cur_new_count != 0) {
   2537					/* None -> exclusive */
   2538					qg->excl += num_bytes;
   2539					qg->excl_cmpr += num_bytes;
   2540					dirty = true;
   2541				}
   2542				/* None -> none, nothing changed */
   2543			} else {
   2544				/* Exclusive -> exclusive/none */
   2545
   2546				if (cur_new_count == 0) {
   2547					/* Exclusive -> none */
   2548					qg->excl -= num_bytes;
   2549					qg->excl_cmpr -= num_bytes;
   2550					dirty = true;
   2551				}
   2552				/* Exclusive -> exclusive, nothing changed */
   2553			}
   2554		}
   2555
   2556		if (dirty)
   2557			qgroup_dirty(fs_info, qg);
   2558	}
   2559	return 0;
   2560}
   2561
   2562/*
    2563 * Check if @roots is potentially a list of fs tree roots
    2564 *
    2565 * Return 0 if it is definitely not a fs/subvol tree roots ulist
    2566 * Return 1 if there are possible fs/subvol tree roots in the list (an empty
    2567 *          list counts as well)
   2568 */
   2569static int maybe_fs_roots(struct ulist *roots)
   2570{
   2571	struct ulist_node *unode;
   2572	struct ulist_iterator uiter;
   2573
   2574	/* Empty one, still possible for fs roots */
   2575	if (!roots || roots->nnodes == 0)
   2576		return 1;
   2577
   2578	ULIST_ITER_INIT(&uiter);
   2579	unode = ulist_next(roots, &uiter);
   2580	if (!unode)
   2581		return 1;
   2582
   2583	/*
   2584	 * If it contains fs tree roots, then it must belong to fs/subvol
   2585	 * trees.
   2586	 * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
   2587	 */
   2588	return is_fstree(unode->val);
   2589}
   2590
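        /*
         * Account one extent of @num_bytes at @bytenr: compute the old and
         * new refcount of every affected qgroup from @old_roots/@new_roots
         * and update their rfer/excl counters accordingly.  Both ulists are
         * always freed before returning, even on early exit.
         */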
   2591int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
   2592				u64 num_bytes, struct ulist *old_roots,
   2593				struct ulist *new_roots)
   2594{
   2595	struct btrfs_fs_info *fs_info = trans->fs_info;
   2596	struct ulist *qgroups = NULL;
   2597	struct ulist *tmp = NULL;
   2598	u64 seq;
   2599	u64 nr_new_roots = 0;
   2600	u64 nr_old_roots = 0;
   2601	int ret = 0;
   2602
   2603	/*
   2604	 * If quotas get disabled meanwhile, the resources need to be freed and
   2605	 * we can't just exit here.
   2606	 */
   2607	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
   2608		goto out_free;
   2609
   2610	if (new_roots) {
   2611		if (!maybe_fs_roots(new_roots))
   2612			goto out_free;
   2613		nr_new_roots = new_roots->nnodes;
   2614	}
   2615	if (old_roots) {
   2616		if (!maybe_fs_roots(old_roots))
   2617			goto out_free;
   2618		nr_old_roots = old_roots->nnodes;
   2619	}
   2620
   2621	/* Quick exit, either not fs tree roots, or won't affect any qgroup */
   2622	if (nr_old_roots == 0 && nr_new_roots == 0)
   2623		goto out_free;
   2624
   2625	BUG_ON(!fs_info->quota_root);
   2626
   2627	trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
   2628					num_bytes, nr_old_roots, nr_new_roots);
   2629
   2630	qgroups = ulist_alloc(GFP_NOFS);
   2631	if (!qgroups) {
   2632		ret = -ENOMEM;
   2633		goto out_free;
   2634	}
   2635	tmp = ulist_alloc(GFP_NOFS);
   2636	if (!tmp) {
   2637		ret = -ENOMEM;
   2638		goto out_free;
   2639	}
   2640
   2641	mutex_lock(&fs_info->qgroup_rescan_lock);
   2642	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
   2643		if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
   2644			mutex_unlock(&fs_info->qgroup_rescan_lock);
   2645			ret = 0;
   2646			goto out_free;
   2647		}
   2648	}
   2649	mutex_unlock(&fs_info->qgroup_rescan_lock);
   2650
   2651	spin_lock(&fs_info->qgroup_lock);
   2652	seq = fs_info->qgroup_seq;
   2653
   2654	/* Update old refcnts using old_roots */
   2655	ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
   2656				   UPDATE_OLD);
   2657	if (ret < 0)
   2658		goto out;
   2659
   2660	/* Update new refcnts using new_roots */
   2661	ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
   2662				   UPDATE_NEW);
   2663	if (ret < 0)
   2664		goto out;
   2665
   2666	qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
   2667			       num_bytes, seq);
   2668
   2669	/*
   2670	 * Bump qgroup_seq to avoid seq overlap
   2671	 */
   2672	fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
   2673out:
   2674	spin_unlock(&fs_info->qgroup_lock);
   2675out_free:
   2676	ulist_free(tmp);
   2677	ulist_free(qgroups);
   2678	ulist_free(old_roots);
   2679	ulist_free(new_roots);
   2680	return ret;
   2681}
   2682
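        /*
         * Process all dirty extent records queued in this transaction: find
         * the new roots of each extent, account it via
         * btrfs_qgroup_account_extent() and release the record.  The root in
         * delayed_refs->qgroup_to_skip, if any, is ignored.
         */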
   2683int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
   2684{
   2685	struct btrfs_fs_info *fs_info = trans->fs_info;
   2686	struct btrfs_qgroup_extent_record *record;
   2687	struct btrfs_delayed_ref_root *delayed_refs;
   2688	struct ulist *new_roots = NULL;
   2689	struct rb_node *node;
   2690	u64 num_dirty_extents = 0;
   2691	u64 qgroup_to_skip;
   2692	int ret = 0;
   2693
   2694	delayed_refs = &trans->transaction->delayed_refs;
   2695	qgroup_to_skip = delayed_refs->qgroup_to_skip;
   2696	while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
   2697		record = rb_entry(node, struct btrfs_qgroup_extent_record,
   2698				  node);
   2699
   2700		num_dirty_extents++;
   2701		trace_btrfs_qgroup_account_extents(fs_info, record);
   2702
   2703		if (!ret) {
   2704			/*
   2705			 * Old roots should be searched when inserting qgroup
   2706			 * extent record
   2707			 */
   2708			if (WARN_ON(!record->old_roots)) {
   2709				/* Search commit root to find old_roots */
   2710				ret = btrfs_find_all_roots(NULL, fs_info,
   2711						record->bytenr, 0,
   2712						&record->old_roots, false);
   2713				if (ret < 0)
   2714					goto cleanup;
   2715			}
   2716
   2717			/* Free the reserved data space */
   2718			btrfs_qgroup_free_refroot(fs_info,
   2719					record->data_rsv_refroot,
   2720					record->data_rsv,
   2721					BTRFS_QGROUP_RSV_DATA);
   2722			/*
   2723			 * Use BTRFS_SEQ_LAST as time_seq to do special search,
    2724			 * which doesn't lock the tree or delayed_refs and searches the
   2725			 * current root. It's safe inside commit_transaction().
   2726			 */
   2727			ret = btrfs_find_all_roots(trans, fs_info,
   2728			   record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
   2729			if (ret < 0)
   2730				goto cleanup;
   2731			if (qgroup_to_skip) {
   2732				ulist_del(new_roots, qgroup_to_skip, 0);
   2733				ulist_del(record->old_roots, qgroup_to_skip,
   2734					  0);
   2735			}
   2736			ret = btrfs_qgroup_account_extent(trans, record->bytenr,
   2737							  record->num_bytes,
   2738							  record->old_roots,
   2739							  new_roots);
   2740			record->old_roots = NULL;
   2741			new_roots = NULL;
   2742		}
   2743cleanup:
   2744		ulist_free(record->old_roots);
   2745		ulist_free(new_roots);
   2746		new_roots = NULL;
   2747		rb_erase(node, &delayed_refs->dirty_extent_root);
   2748		kfree(record);
   2749
   2750	}
   2751	trace_qgroup_num_dirty_extents(fs_info, trans->transid,
   2752				       num_dirty_extents);
   2753	return ret;
   2754}
   2755
   2756/*
    2757 * Called from commit_transaction(). Writes all changed qgroups to disk.
   2758 */
   2759int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
   2760{
   2761	struct btrfs_fs_info *fs_info = trans->fs_info;
   2762	int ret = 0;
   2763
   2764	if (!fs_info->quota_root)
   2765		return ret;
   2766
   2767	spin_lock(&fs_info->qgroup_lock);
   2768	while (!list_empty(&fs_info->dirty_qgroups)) {
   2769		struct btrfs_qgroup *qgroup;
   2770		qgroup = list_first_entry(&fs_info->dirty_qgroups,
   2771					  struct btrfs_qgroup, dirty);
   2772		list_del_init(&qgroup->dirty);
   2773		spin_unlock(&fs_info->qgroup_lock);
   2774		ret = update_qgroup_info_item(trans, qgroup);
   2775		if (ret)
   2776			fs_info->qgroup_flags |=
   2777					BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
   2778		ret = update_qgroup_limit_item(trans, qgroup);
   2779		if (ret)
   2780			fs_info->qgroup_flags |=
   2781					BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
   2782		spin_lock(&fs_info->qgroup_lock);
   2783	}
   2784	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
   2785		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
   2786	else
   2787		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
   2788	spin_unlock(&fs_info->qgroup_lock);
   2789
   2790	ret = update_qgroup_status_item(trans);
   2791	if (ret)
   2792		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
   2793
   2794	return ret;
   2795}
   2796
   2797/*
   2798 * Copy the accounting information between qgroups. This is necessary
   2799 * when a snapshot or a subvolume is created. Throwing an error will
   2800 * cause a transaction abort so we take extra care here to only error
   2801 * when a readonly fs is a reasonable outcome.
   2802 */
   2803int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
   2804			 u64 objectid, struct btrfs_qgroup_inherit *inherit)
   2805{
   2806	int ret = 0;
   2807	int i;
   2808	u64 *i_qgroups;
   2809	bool committing = false;
   2810	struct btrfs_fs_info *fs_info = trans->fs_info;
   2811	struct btrfs_root *quota_root;
   2812	struct btrfs_qgroup *srcgroup;
   2813	struct btrfs_qgroup *dstgroup;
   2814	bool need_rescan = false;
   2815	u32 level_size = 0;
   2816	u64 nums;
   2817
   2818	/*
   2819	 * There are only two callers of this function.
   2820	 *
   2821	 * One in create_subvol() in the ioctl context, which needs to hold
   2822	 * the qgroup_ioctl_lock.
   2823	 *
    2824	 * The other one is in create_pending_snapshot(), where no other qgroup
    2825	 * code can modify the fs as they all need to either start a new trans
    2826	 * or hold a trans handle, thus we don't need to hold
    2827	 * qgroup_ioctl_lock.
    2828	 * This avoids a long and complex lock chain and makes lockdep happy.
   2829	 */
   2830	spin_lock(&fs_info->trans_lock);
   2831	if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
   2832		committing = true;
   2833	spin_unlock(&fs_info->trans_lock);
   2834
   2835	if (!committing)
   2836		mutex_lock(&fs_info->qgroup_ioctl_lock);
   2837	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
   2838		goto out;
   2839
   2840	quota_root = fs_info->quota_root;
   2841	if (!quota_root) {
   2842		ret = -EINVAL;
   2843		goto out;
   2844	}
   2845
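	/*
	 * The qgroup ids to operate on are laid out right after the inherit
	 * structure: first inherit->num_qgroups ids of groups the new qgroup
	 * is added to, then num_ref_copies pairs of (src, dst) ids whose rfer
	 * numbers are copied, then num_excl_copies pairs for excl numbers.
	 */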
   2846	if (inherit) {
   2847		i_qgroups = (u64 *)(inherit + 1);
   2848		nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
   2849		       2 * inherit->num_excl_copies;
   2850		for (i = 0; i < nums; ++i) {
   2851			srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
   2852
   2853			/*
   2854			 * Zero out invalid groups so we can ignore
   2855			 * them later.
   2856			 */
   2857			if (!srcgroup ||
   2858			    ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
   2859				*i_qgroups = 0ULL;
   2860
   2861			++i_qgroups;
   2862		}
   2863	}
   2864
   2865	/*
   2866	 * create a tracking group for the subvol itself
   2867	 */
   2868	ret = add_qgroup_item(trans, quota_root, objectid);
   2869	if (ret)
   2870		goto out;
   2871
   2872	/*
   2873	 * add qgroup to all inherited groups
   2874	 */
   2875	if (inherit) {
   2876		i_qgroups = (u64 *)(inherit + 1);
   2877		for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
   2878			if (*i_qgroups == 0)
   2879				continue;
   2880			ret = add_qgroup_relation_item(trans, objectid,
   2881						       *i_qgroups);
   2882			if (ret && ret != -EEXIST)
   2883				goto out;
   2884			ret = add_qgroup_relation_item(trans, *i_qgroups,
   2885						       objectid);
   2886			if (ret && ret != -EEXIST)
   2887				goto out;
   2888		}
   2889		ret = 0;
   2890	}
   2891
   2892
   2893	spin_lock(&fs_info->qgroup_lock);
   2894
   2895	dstgroup = add_qgroup_rb(fs_info, objectid);
   2896	if (IS_ERR(dstgroup)) {
   2897		ret = PTR_ERR(dstgroup);
   2898		goto unlock;
   2899	}
   2900
   2901	if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
   2902		dstgroup->lim_flags = inherit->lim.flags;
   2903		dstgroup->max_rfer = inherit->lim.max_rfer;
   2904		dstgroup->max_excl = inherit->lim.max_excl;
   2905		dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
   2906		dstgroup->rsv_excl = inherit->lim.rsv_excl;
   2907
   2908		ret = update_qgroup_limit_item(trans, dstgroup);
   2909		if (ret) {
   2910			fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
   2911			btrfs_info(fs_info,
   2912				   "unable to update quota limit for %llu",
   2913				   dstgroup->qgroupid);
   2914			goto unlock;
   2915		}
   2916	}
   2917
   2918	if (srcid) {
   2919		srcgroup = find_qgroup_rb(fs_info, srcid);
   2920		if (!srcgroup)
   2921			goto unlock;
   2922
   2923		/*
   2924		 * We call inherit after we clone the root in order to make sure
   2925		 * our counts don't go crazy, so at this point the only
   2926		 * difference between the two roots should be the root node.
   2927		 */
   2928		level_size = fs_info->nodesize;
   2929		dstgroup->rfer = srcgroup->rfer;
   2930		dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
   2931		dstgroup->excl = level_size;
   2932		dstgroup->excl_cmpr = level_size;
   2933		srcgroup->excl = level_size;
   2934		srcgroup->excl_cmpr = level_size;
   2935
   2936		/* inherit the limit info */
   2937		dstgroup->lim_flags = srcgroup->lim_flags;
   2938		dstgroup->max_rfer = srcgroup->max_rfer;
   2939		dstgroup->max_excl = srcgroup->max_excl;
   2940		dstgroup->rsv_rfer = srcgroup->rsv_rfer;
   2941		dstgroup->rsv_excl = srcgroup->rsv_excl;
   2942
   2943		qgroup_dirty(fs_info, dstgroup);
   2944		qgroup_dirty(fs_info, srcgroup);
   2945	}
   2946
   2947	if (!inherit)
   2948		goto unlock;
   2949
   2950	i_qgroups = (u64 *)(inherit + 1);
   2951	for (i = 0; i < inherit->num_qgroups; ++i) {
   2952		if (*i_qgroups) {
   2953			ret = add_relation_rb(fs_info, objectid, *i_qgroups);
   2954			if (ret)
   2955				goto unlock;
   2956		}
   2957		++i_qgroups;
   2958
   2959		/*
   2960		 * If we're doing a snapshot, and adding the snapshot to a new
   2961		 * qgroup, the numbers are guaranteed to be incorrect.
   2962		 */
   2963		if (srcid)
   2964			need_rescan = true;
   2965	}
   2966
   2967	for (i = 0; i <  inherit->num_ref_copies; ++i, i_qgroups += 2) {
   2968		struct btrfs_qgroup *src;
   2969		struct btrfs_qgroup *dst;
   2970
   2971		if (!i_qgroups[0] || !i_qgroups[1])
   2972			continue;
   2973
   2974		src = find_qgroup_rb(fs_info, i_qgroups[0]);
   2975		dst = find_qgroup_rb(fs_info, i_qgroups[1]);
   2976
   2977		if (!src || !dst) {
   2978			ret = -EINVAL;
   2979			goto unlock;
   2980		}
   2981
   2982		dst->rfer = src->rfer - level_size;
   2983		dst->rfer_cmpr = src->rfer_cmpr - level_size;
   2984
   2985		/* Manually tweaking numbers certainly needs a rescan */
   2986		need_rescan = true;
   2987	}
   2988	for (i = 0; i <  inherit->num_excl_copies; ++i, i_qgroups += 2) {
   2989		struct btrfs_qgroup *src;
   2990		struct btrfs_qgroup *dst;
   2991
   2992		if (!i_qgroups[0] || !i_qgroups[1])
   2993			continue;
   2994
   2995		src = find_qgroup_rb(fs_info, i_qgroups[0]);
   2996		dst = find_qgroup_rb(fs_info, i_qgroups[1]);
   2997
   2998		if (!src || !dst) {
   2999			ret = -EINVAL;
   3000			goto unlock;
   3001		}
   3002
   3003		dst->excl = src->excl + level_size;
   3004		dst->excl_cmpr = src->excl_cmpr + level_size;
   3005		need_rescan = true;
   3006	}
   3007
   3008unlock:
   3009	spin_unlock(&fs_info->qgroup_lock);
   3010	if (!ret)
   3011		ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
   3012out:
   3013	if (!committing)
   3014		mutex_unlock(&fs_info->qgroup_ioctl_lock);
   3015	if (need_rescan)
   3016		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
   3017	return ret;
   3018}
   3019
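        /*
         * Return true if adding @num_bytes of reservation to @qg stays within
         * its configured max_rfer/max_excl limits, counting current usage
         * plus already reserved space.
         */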
   3020static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
   3021{
   3022	if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
   3023	    qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
   3024		return false;
   3025
   3026	if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
   3027	    qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
   3028		return false;
   3029
   3030	return true;
   3031}
   3032
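        /*
         * Reserve @num_bytes of @type for the qgroup of @root and all of its
         * parent qgroups.  If @enforce is true and any qgroup in the chain
         * would exceed its limits, nothing is reserved and -EDQUOT is
         * returned.
         */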
   3033static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
   3034			  enum btrfs_qgroup_rsv_type type)
   3035{
   3036	struct btrfs_qgroup *qgroup;
   3037	struct btrfs_fs_info *fs_info = root->fs_info;
   3038	u64 ref_root = root->root_key.objectid;
   3039	int ret = 0;
   3040	struct ulist_node *unode;
   3041	struct ulist_iterator uiter;
   3042
   3043	if (!is_fstree(ref_root))
   3044		return 0;
   3045
   3046	if (num_bytes == 0)
   3047		return 0;
   3048
   3049	if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
   3050	    capable(CAP_SYS_RESOURCE))
   3051		enforce = false;
   3052
   3053	spin_lock(&fs_info->qgroup_lock);
   3054	if (!fs_info->quota_root)
   3055		goto out;
   3056
   3057	qgroup = find_qgroup_rb(fs_info, ref_root);
   3058	if (!qgroup)
   3059		goto out;
   3060
   3061	/*
    3062	 * In a first step, we check all affected qgroups to see whether any
    3063	 * limits would be exceeded.
   3064	 */
   3065	ulist_reinit(fs_info->qgroup_ulist);
   3066	ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
   3067			qgroup_to_aux(qgroup), GFP_ATOMIC);
   3068	if (ret < 0)
   3069		goto out;
   3070	ULIST_ITER_INIT(&uiter);
   3071	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
   3072		struct btrfs_qgroup *qg;
   3073		struct btrfs_qgroup_list *glist;
   3074
   3075		qg = unode_aux_to_qgroup(unode);
   3076
   3077		if (enforce && !qgroup_check_limits(qg, num_bytes)) {
   3078			ret = -EDQUOT;
   3079			goto out;
   3080		}
   3081
   3082		list_for_each_entry(glist, &qg->groups, next_group) {
   3083			ret = ulist_add(fs_info->qgroup_ulist,
   3084					glist->group->qgroupid,
   3085					qgroup_to_aux(glist->group), GFP_ATOMIC);
   3086			if (ret < 0)
   3087				goto out;
   3088		}
   3089	}
   3090	ret = 0;
   3091	/*
   3092	 * no limits exceeded, now record the reservation into all qgroups
   3093	 */
   3094	ULIST_ITER_INIT(&uiter);
   3095	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
   3096		struct btrfs_qgroup *qg;
   3097
   3098		qg = unode_aux_to_qgroup(unode);
   3099
   3100		qgroup_rsv_add(fs_info, qg, num_bytes, type);
   3101	}
   3102
   3103out:
   3104	spin_unlock(&fs_info->qgroup_lock);
   3105	return ret;
   3106}
   3107
   3108/*
    3109 * Free @num_bytes of reserved space with @type for a qgroup (normally a
    3110 * level 0 qgroup).
    3111 *
    3112 * Will handle all higher level qgroups too.
   3113 *
   3114 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
   3115 * This special case is only used for META_PERTRANS type.
   3116 */
   3117void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
   3118			       u64 ref_root, u64 num_bytes,
   3119			       enum btrfs_qgroup_rsv_type type)
   3120{
   3121	struct btrfs_qgroup *qgroup;
   3122	struct ulist_node *unode;
   3123	struct ulist_iterator uiter;
   3124	int ret = 0;
   3125
   3126	if (!is_fstree(ref_root))
   3127		return;
   3128
   3129	if (num_bytes == 0)
   3130		return;
   3131
   3132	if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
   3133		WARN(1, "%s: Invalid type to free", __func__);
   3134		return;
   3135	}
   3136	spin_lock(&fs_info->qgroup_lock);
   3137
   3138	if (!fs_info->quota_root)
   3139		goto out;
   3140
   3141	qgroup = find_qgroup_rb(fs_info, ref_root);
   3142	if (!qgroup)
   3143		goto out;
   3144
   3145	if (num_bytes == (u64)-1)
   3146		/*
   3147		 * We're freeing all pertrans rsv, get reserved value from
   3148		 * level 0 qgroup as real num_bytes to free.
   3149		 */
   3150		num_bytes = qgroup->rsv.values[type];
   3151
   3152	ulist_reinit(fs_info->qgroup_ulist);
   3153	ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
   3154			qgroup_to_aux(qgroup), GFP_ATOMIC);
   3155	if (ret < 0)
   3156		goto out;
   3157	ULIST_ITER_INIT(&uiter);
   3158	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
   3159		struct btrfs_qgroup *qg;
   3160		struct btrfs_qgroup_list *glist;
   3161
   3162		qg = unode_aux_to_qgroup(unode);
   3163
   3164		qgroup_rsv_release(fs_info, qg, num_bytes, type);
   3165
   3166		list_for_each_entry(glist, &qg->groups, next_group) {
   3167			ret = ulist_add(fs_info->qgroup_ulist,
   3168					glist->group->qgroupid,
   3169					qgroup_to_aux(glist->group), GFP_ATOMIC);
   3170			if (ret < 0)
   3171				goto out;
   3172		}
   3173	}
   3174
   3175out:
   3176	spin_unlock(&fs_info->qgroup_lock);
   3177}
   3178
   3179/*
    3180 * Check if the leaf is the last leaf, which means all node pointers
    3181 * are at their last position.
   3182 */
   3183static bool is_last_leaf(struct btrfs_path *path)
   3184{
   3185	int i;
   3186
   3187	for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
   3188		if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
   3189			return false;
   3190	}
   3191	return true;
   3192}
   3193
   3194/*
    3195 * Returns < 0 on error, 0 when more leaves are to be scanned.
    3196 * Returns 1 when done.
   3197 */
   3198static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
   3199			      struct btrfs_path *path)
   3200{
   3201	struct btrfs_fs_info *fs_info = trans->fs_info;
   3202	struct btrfs_root *extent_root;
   3203	struct btrfs_key found;
   3204	struct extent_buffer *scratch_leaf = NULL;
   3205	struct ulist *roots = NULL;
   3206	u64 num_bytes;
   3207	bool done;
   3208	int slot;
   3209	int ret;
   3210
   3211	mutex_lock(&fs_info->qgroup_rescan_lock);
   3212	extent_root = btrfs_extent_root(fs_info,
   3213				fs_info->qgroup_rescan_progress.objectid);
   3214	ret = btrfs_search_slot_for_read(extent_root,
   3215					 &fs_info->qgroup_rescan_progress,
   3216					 path, 1, 0);
   3217
   3218	btrfs_debug(fs_info,
   3219		"current progress key (%llu %u %llu), search_slot ret %d",
   3220		fs_info->qgroup_rescan_progress.objectid,
   3221		fs_info->qgroup_rescan_progress.type,
   3222		fs_info->qgroup_rescan_progress.offset, ret);
   3223
   3224	if (ret) {
   3225		/*
   3226		 * The rescan is about to end, we will not be scanning any
   3227		 * further blocks. We cannot unset the RESCAN flag here, because
   3228		 * we want to commit the transaction if everything went well.
   3229		 * To make the live accounting work in this phase, we set our
   3230		 * scan progress pointer such that every real extent objectid
   3231		 * will be smaller.
   3232		 */
   3233		fs_info->qgroup_rescan_progress.objectid = (u64)-1;
   3234		btrfs_release_path(path);
   3235		mutex_unlock(&fs_info->qgroup_rescan_lock);
   3236		return ret;
   3237	}
   3238	done = is_last_leaf(path);
   3239
   3240	btrfs_item_key_to_cpu(path->nodes[0], &found,
   3241			      btrfs_header_nritems(path->nodes[0]) - 1);
   3242	fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
   3243
   3244	scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
   3245	if (!scratch_leaf) {
   3246		ret = -ENOMEM;
   3247		mutex_unlock(&fs_info->qgroup_rescan_lock);
   3248		goto out;
   3249	}
   3250	slot = path->slots[0];
   3251	btrfs_release_path(path);
   3252	mutex_unlock(&fs_info->qgroup_rescan_lock);
   3253
   3254	for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
   3255		btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
   3256		if (found.type != BTRFS_EXTENT_ITEM_KEY &&
   3257		    found.type != BTRFS_METADATA_ITEM_KEY)
   3258			continue;
   3259		if (found.type == BTRFS_METADATA_ITEM_KEY)
   3260			num_bytes = fs_info->nodesize;
   3261		else
   3262			num_bytes = found.offset;
   3263
   3264		ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
   3265					   &roots, false);
   3266		if (ret < 0)
   3267			goto out;
   3268		/* For rescan, just pass old_roots as NULL */
   3269		ret = btrfs_qgroup_account_extent(trans, found.objectid,
   3270						  num_bytes, NULL, roots);
   3271		if (ret < 0)
   3272			goto out;
   3273	}
   3274out:
   3275	if (scratch_leaf)
   3276		free_extent_buffer(scratch_leaf);
   3277
   3278	if (done && !ret) {
   3279		ret = 1;
   3280		fs_info->qgroup_rescan_progress.objectid = (u64)-1;
   3281	}
   3282	return ret;
   3283}
   3284
   3285static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
   3286{
   3287	return btrfs_fs_closing(fs_info) ||
   3288		test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
   3289		!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
   3290}
   3291
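        /*
         * Rescan worker: repeatedly start a transaction and scan one extent
         * tree leaf via qgroup_rescan_leaf() until done, stopped or an error
         * occurs, then update the on-disk status item and clear the RESCAN
         * flag (and, on success, the INCONSISTENT flag).
         */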
   3292static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
   3293{
   3294	struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
   3295						     qgroup_rescan_work);
   3296	struct btrfs_path *path;
   3297	struct btrfs_trans_handle *trans = NULL;
   3298	int err = -ENOMEM;
   3299	int ret = 0;
   3300	bool stopped = false;
   3301
   3302	path = btrfs_alloc_path();
   3303	if (!path)
   3304		goto out;
   3305	/*
    3306	 * Rescan should only search the commit root, and any later difference
    3307	 * should be recorded by the qgroup tracing code.
   3308	 */
   3309	path->search_commit_root = 1;
   3310	path->skip_locking = 1;
   3311
   3312	err = 0;
   3313	while (!err && !(stopped = rescan_should_stop(fs_info))) {
   3314		trans = btrfs_start_transaction(fs_info->fs_root, 0);
   3315		if (IS_ERR(trans)) {
   3316			err = PTR_ERR(trans);
   3317			break;
   3318		}
   3319
   3320		err = qgroup_rescan_leaf(trans, path);
   3321
   3322		if (err > 0)
   3323			btrfs_commit_transaction(trans);
   3324		else
   3325			btrfs_end_transaction(trans);
   3326	}
   3327
   3328out:
   3329	btrfs_free_path(path);
   3330
   3331	mutex_lock(&fs_info->qgroup_rescan_lock);
   3332	if (err > 0 &&
   3333	    fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
   3334		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
   3335	} else if (err < 0 || stopped) {
   3336		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
   3337	}
   3338	mutex_unlock(&fs_info->qgroup_rescan_lock);
   3339
   3340	/*
   3341	 * only update status, since the previous part has already updated the
   3342	 * qgroup info.
   3343	 */
   3344	trans = btrfs_start_transaction(fs_info->quota_root, 1);
   3345	if (IS_ERR(trans)) {
   3346		err = PTR_ERR(trans);
   3347		trans = NULL;
   3348		btrfs_err(fs_info,
   3349			  "fail to start transaction for status update: %d",
   3350			  err);
   3351	}
   3352
   3353	mutex_lock(&fs_info->qgroup_rescan_lock);
   3354	if (!stopped)
   3355		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
   3356	if (trans) {
   3357		ret = update_qgroup_status_item(trans);
   3358		if (ret < 0) {
   3359			err = ret;
   3360			btrfs_err(fs_info, "fail to update qgroup status: %d",
   3361				  err);
   3362		}
   3363	}
   3364	fs_info->qgroup_rescan_running = false;
   3365	complete_all(&fs_info->qgroup_rescan_completion);
   3366	mutex_unlock(&fs_info->qgroup_rescan_lock);
   3367
   3368	if (!trans)
   3369		return;
   3370
   3371	btrfs_end_transaction(trans);
   3372
   3373	if (stopped) {
   3374		btrfs_info(fs_info, "qgroup scan paused");
   3375	} else if (err >= 0) {
   3376		btrfs_info(fs_info, "qgroup scan completed%s",
   3377			err > 0 ? " (inconsistency flag cleared)" : "");
   3378	} else {
   3379		btrfs_err(fs_info, "qgroup scan failed with %d", err);
   3380	}
   3381}
   3382
   3383/*
   3384 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
   3385 * memory required for the rescan context.
   3386 */
   3387static int
   3388qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
   3389		   int init_flags)
   3390{
   3391	int ret = 0;
   3392
   3393	if (!init_flags) {
   3394		/* we're resuming qgroup rescan at mount time */
   3395		if (!(fs_info->qgroup_flags &
   3396		      BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
   3397			btrfs_warn(fs_info,
   3398			"qgroup rescan init failed, qgroup rescan is not queued");
   3399			ret = -EINVAL;
   3400		} else if (!(fs_info->qgroup_flags &
   3401			     BTRFS_QGROUP_STATUS_FLAG_ON)) {
   3402			btrfs_warn(fs_info,
   3403			"qgroup rescan init failed, qgroup is not enabled");
   3404			ret = -EINVAL;
   3405		}
   3406
   3407		if (ret)
   3408			return ret;
   3409	}
   3410
   3411	mutex_lock(&fs_info->qgroup_rescan_lock);
   3412
   3413	if (init_flags) {
   3414		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
   3415			btrfs_warn(fs_info,
   3416				   "qgroup rescan is already in progress");
   3417			ret = -EINPROGRESS;
   3418		} else if (!(fs_info->qgroup_flags &
   3419			     BTRFS_QGROUP_STATUS_FLAG_ON)) {
   3420			btrfs_warn(fs_info,
   3421			"qgroup rescan init failed, qgroup is not enabled");
   3422			ret = -EINVAL;
   3423		} else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
   3424			/* Quota disable is in progress */
   3425			ret = -EBUSY;
   3426		}
   3427
   3428		if (ret) {
   3429			mutex_unlock(&fs_info->qgroup_rescan_lock);
   3430			return ret;
   3431		}
   3432		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
   3433	}
   3434
   3435	memset(&fs_info->qgroup_rescan_progress, 0,
   3436		sizeof(fs_info->qgroup_rescan_progress));
   3437	fs_info->qgroup_rescan_progress.objectid = progress_objectid;
   3438	init_completion(&fs_info->qgroup_rescan_completion);
   3439	mutex_unlock(&fs_info->qgroup_rescan_lock);
   3440
   3441	btrfs_init_work(&fs_info->qgroup_rescan_work,
   3442			btrfs_qgroup_rescan_worker, NULL, NULL);
   3443	return 0;
   3444}
   3445
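/*
 * Reset all qgroup tracking counters (referenced/exclusive and their
 * compressed variants) and mark every qgroup dirty, so the following rescan
 * starts from a clean state.
 */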
   3446static void
   3447qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
   3448{
   3449	struct rb_node *n;
   3450	struct btrfs_qgroup *qgroup;
   3451
   3452	spin_lock(&fs_info->qgroup_lock);
   3453	/* clear all current qgroup tracking information */
   3454	for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
   3455		qgroup = rb_entry(n, struct btrfs_qgroup, node);
   3456		qgroup->rfer = 0;
   3457		qgroup->rfer_cmpr = 0;
   3458		qgroup->excl = 0;
   3459		qgroup->excl_cmpr = 0;
   3460		qgroup_dirty(fs_info, qgroup);
   3461	}
   3462	spin_unlock(&fs_info->qgroup_lock);
   3463}
   3464
   3465int
   3466btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
   3467{
   3468	int ret = 0;
   3469	struct btrfs_trans_handle *trans;
   3470
   3471	ret = qgroup_rescan_init(fs_info, 0, 1);
   3472	if (ret)
   3473		return ret;
   3474
   3475	/*
   3476	 * We have set the rescan_progress to 0, which means no more
   3477	 * delayed refs will be accounted by btrfs_qgroup_account_ref.
   3478	 * However, an in-flight btrfs_qgroup_account_ref may already be past
   3479	 * its call to btrfs_find_all_roots, in which case it would still do
   3480	 * the accounting.
   3481	 * To solve this, we're committing the transaction, which will
   3482	 * ensure we run all delayed refs and only after that, we are
   3483	 * going to clear all tracking information for a clean start.
   3484	 */
   3485
   3486	trans = btrfs_join_transaction(fs_info->fs_root);
   3487	if (IS_ERR(trans)) {
   3488		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
   3489		return PTR_ERR(trans);
   3490	}
   3491	ret = btrfs_commit_transaction(trans);
   3492	if (ret) {
   3493		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
   3494		return ret;
   3495	}
   3496
   3497	qgroup_rescan_zero_tracking(fs_info);
   3498
   3499	mutex_lock(&fs_info->qgroup_rescan_lock);
   3500	fs_info->qgroup_rescan_running = true;
   3501	btrfs_queue_work(fs_info->qgroup_rescan_workers,
   3502			 &fs_info->qgroup_rescan_work);
   3503	mutex_unlock(&fs_info->qgroup_rescan_lock);
   3504
   3505	return 0;
   3506}
   3507
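/*
 * Wait for a running qgroup rescan worker to finish.
 *
 * Returns 0 if no rescan is in progress or it completed, or a negative errno
 * if @interruptible is set and the wait was interrupted.
 */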
   3508int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
   3509				     bool interruptible)
   3510{
   3511	int running;
   3512	int ret = 0;
   3513
   3514	mutex_lock(&fs_info->qgroup_rescan_lock);
   3515	running = fs_info->qgroup_rescan_running;
   3516	mutex_unlock(&fs_info->qgroup_rescan_lock);
   3517
   3518	if (!running)
   3519		return 0;
   3520
   3521	if (interruptible)
   3522		ret = wait_for_completion_interruptible(
   3523					&fs_info->qgroup_rescan_completion);
   3524	else
   3525		wait_for_completion(&fs_info->qgroup_rescan_completion);
   3526
   3527	return ret;
   3528}
   3529
   3530/*
   3531 * This is only called from open_ctree where we're still single threaded, thus
   3532 * locking is omitted here.
   3533 */
   3534void
   3535btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
   3536{
   3537	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
   3538		mutex_lock(&fs_info->qgroup_rescan_lock);
   3539		fs_info->qgroup_rescan_running = true;
   3540		btrfs_queue_work(fs_info->qgroup_rescan_workers,
   3541				 &fs_info->qgroup_rescan_work);
   3542		mutex_unlock(&fs_info->qgroup_rescan_lock);
   3543	}
   3544}
   3545
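/*
 * Iterate an rbtree starting at @start, caching the next node before the loop
 * body runs so the current node may be removed safely.
 */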
   3546#define rbtree_iterate_from_safe(node, next, start)				\
   3547       for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
   3548
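/*
 * Roll back the EXTENT_QGROUP_RESERVED bits for the part of @reserved that
 * overlaps [@start, @start + @len) and shrink reserved->bytes_changed
 * accordingly.  Used to undo a partially successful qgroup_reserve_data().
 */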
   3549static int qgroup_unreserve_range(struct btrfs_inode *inode,
   3550				  struct extent_changeset *reserved, u64 start,
   3551				  u64 len)
   3552{
   3553	struct rb_node *node;
   3554	struct rb_node *next;
   3555	struct ulist_node *entry;
   3556	int ret = 0;
   3557
   3558	node = reserved->range_changed.root.rb_node;
   3559	if (!node)
   3560		return 0;
   3561	while (node) {
   3562		entry = rb_entry(node, struct ulist_node, rb_node);
   3563		if (entry->val < start)
   3564			node = node->rb_right;
   3565		else
   3566			node = node->rb_left;
   3567	}
   3568
   3569	if (entry->val > start && rb_prev(&entry->rb_node))
   3570		entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
   3571				 rb_node);
   3572
   3573	rbtree_iterate_from_safe(node, next, &entry->rb_node) {
   3574		u64 entry_start;
   3575		u64 entry_end;
   3576		u64 entry_len;
   3577		int clear_ret;
   3578
   3579		entry = rb_entry(node, struct ulist_node, rb_node);
   3580		entry_start = entry->val;
   3581		entry_end = entry->aux;
   3582		entry_len = entry_end - entry_start + 1;
   3583
   3584		if (entry_start >= start + len)
   3585			break;
   3586		if (entry_start + entry_len <= start)
   3587			continue;
   3588		/*
   3589		 * Now the entry overlaps [start, start + len), revert the
   3590		 * EXTENT_QGROUP_RESERVED bit.
   3591		 */
   3592		clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
   3593					      entry_end, EXTENT_QGROUP_RESERVED);
   3594		if (!ret && clear_ret < 0)
   3595			ret = clear_ret;
   3596
   3597		ulist_del(&reserved->range_changed, entry->val, entry->aux);
   3598		if (likely(reserved->bytes_changed >= entry_len)) {
   3599			reserved->bytes_changed -= entry_len;
   3600		} else {
   3601			WARN_ON(1);
   3602			reserved->bytes_changed = 0;
   3603		}
   3604	}
   3605
   3606	return ret;
   3607}
   3608
   3609/*
   3610 * Try to free some space for qgroup.
   3611 *
   3612 * For qgroup, there are only 3 ways to free qgroup space:
   3613 * - Flush nodatacow write
   3614 *   Any nodatacow write will free its reserved data space at run_delalloc_range().
   3615 *   In theory, we should only flush nodatacow inodes, but it's not yet
   3616 *   possible, so we need to flush the whole root.
   3617 *
   3618 * - Wait for ordered extents
   3619 *   When ordered extents are finished, their reserved metadata is finally
   3620 *   converted to per_trans status, which can be freed by a later
   3621 *   transaction commit.
   3622 *
   3623 * - Commit transaction
   3624 *   This would free the meta_per_trans space.
   3625 *   In theory this shouldn't provide much space, but at this point every
   3626 *   extra byte of qgroup space helps.
   3627 */
   3628static int try_flush_qgroup(struct btrfs_root *root)
   3629{
   3630	struct btrfs_trans_handle *trans;
   3631	int ret;
   3632
   3633	/* Can't hold an open transaction or we run the risk of deadlocking. */
   3634	ASSERT(current->journal_info == NULL);
   3635	if (WARN_ON(current->journal_info))
   3636		return 0;
   3637
   3638	/*
   3639	 * We don't want to run flush again and again, so if there is a running
   3640	 * one, we won't try to start a new flush, but exit directly.
   3641	 */
   3642	if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
   3643		wait_event(root->qgroup_flush_wait,
   3644			!test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
   3645		return 0;
   3646	}
   3647
   3648	ret = btrfs_start_delalloc_snapshot(root, true);
   3649	if (ret < 0)
   3650		goto out;
   3651	btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
   3652
   3653	trans = btrfs_join_transaction(root);
   3654	if (IS_ERR(trans)) {
   3655		ret = PTR_ERR(trans);
   3656		goto out;
   3657	}
   3658
   3659	ret = btrfs_commit_transaction(trans);
   3660out:
   3661	clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
   3662	wake_up(&root->qgroup_flush_wait);
   3663	return ret;
   3664}
   3665
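/*
 * Mark [@start, @start + @len) in the inode io_tree as EXTENT_QGROUP_RESERVED
 * and charge only the newly covered bytes against the qgroups.  If the qgroup
 * reservation fails, the newly set bits are cleared again via
 * qgroup_unreserve_range().
 */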
   3666static int qgroup_reserve_data(struct btrfs_inode *inode,
   3667			struct extent_changeset **reserved_ret, u64 start,
   3668			u64 len)
   3669{
   3670	struct btrfs_root *root = inode->root;
   3671	struct extent_changeset *reserved;
   3672	bool new_reserved = false;
   3673	u64 orig_reserved;
   3674	u64 to_reserve;
   3675	int ret;
   3676
   3677	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
   3678	    !is_fstree(root->root_key.objectid) || len == 0)
   3679		return 0;
   3680
   3681	/* @reserved parameter is mandatory for qgroup */
   3682	if (WARN_ON(!reserved_ret))
   3683		return -EINVAL;
   3684	if (!*reserved_ret) {
   3685		new_reserved = true;
   3686		*reserved_ret = extent_changeset_alloc();
   3687		if (!*reserved_ret)
   3688			return -ENOMEM;
   3689	}
   3690	reserved = *reserved_ret;
   3691	/* Record already reserved space */
   3692	orig_reserved = reserved->bytes_changed;
   3693	ret = set_record_extent_bits(&inode->io_tree, start,
   3694			start + len - 1, EXTENT_QGROUP_RESERVED, reserved);
   3695
   3696	/* Newly reserved space */
   3697	to_reserve = reserved->bytes_changed - orig_reserved;
   3698	trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
   3699					to_reserve, QGROUP_RESERVE);
   3700	if (ret < 0)
   3701		goto out;
   3702	ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
   3703	if (ret < 0)
   3704		goto cleanup;
   3705
   3706	return ret;
   3707
   3708cleanup:
   3709	qgroup_unreserve_range(inode, reserved, start, len);
   3710out:
   3711	if (new_reserved) {
   3712		extent_changeset_free(reserved);
   3713		*reserved_ret = NULL;
   3714	}
   3715	return ret;
   3716}
   3717
   3718/*
   3719 * Reserve qgroup space for range [start, start + len).
   3720 *
   3721 * This function will either reserve space from related qgroups or do nothing
   3722 * if the range is already reserved.
   3723 *
   3724 * Return 0 for successful reservation
   3725 * Return <0 for error (including -EDQUOT)
   3726 *
   3727 * NOTE: This function may sleep for memory allocation, dirty page flushing and
   3728 *	 commit transaction. So caller should not hold any dirty page locked.
   3729 */
   3730int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
   3731			struct extent_changeset **reserved_ret, u64 start,
   3732			u64 len)
   3733{
   3734	int ret;
   3735
   3736	ret = qgroup_reserve_data(inode, reserved_ret, start, len);
   3737	if (ret <= 0 && ret != -EDQUOT)
   3738		return ret;
   3739
   3740	ret = try_flush_qgroup(inode->root);
   3741	if (ret < 0)
   3742		return ret;
   3743	return qgroup_reserve_data(inode, reserved_ret, start, len);
   3744}
   3745
   3746/* Free ranges specified by @reserved, normally in error path */
   3747static int qgroup_free_reserved_data(struct btrfs_inode *inode,
   3748			struct extent_changeset *reserved, u64 start, u64 len)
   3749{
   3750	struct btrfs_root *root = inode->root;
   3751	struct ulist_node *unode;
   3752	struct ulist_iterator uiter;
   3753	struct extent_changeset changeset;
   3754	int freed = 0;
   3755	int ret;
   3756
   3757	extent_changeset_init(&changeset);
   3758	len = round_up(start + len, root->fs_info->sectorsize);
   3759	start = round_down(start, root->fs_info->sectorsize);
   3760
   3761	ULIST_ITER_INIT(&uiter);
   3762	while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
   3763		u64 range_start = unode->val;
   3764		/* unode->aux is the inclusive end */
   3765		u64 range_len = unode->aux - range_start + 1;
   3766		u64 free_start;
   3767		u64 free_len;
   3768
   3769		extent_changeset_release(&changeset);
   3770
   3771		/* Only free range in range [start, start + len) */
   3772		if (range_start >= start + len ||
   3773		    range_start + range_len <= start)
   3774			continue;
   3775		free_start = max(range_start, start);
   3776		free_len = min(start + len, range_start + range_len) -
   3777			   free_start;
   3778		/*
   3779		 * TODO: Also modify reserved->ranges_reserved to reflect
   3780		 * the modification.
   3781		 *
   3782		 * However, as long as we free qgroup reserved space according to
   3783		 * EXTENT_QGROUP_RESERVED, we won't double free.
   3784		 * So there is no need to rush.
   3785		 */
   3786		ret = clear_record_extent_bits(&inode->io_tree, free_start,
   3787				free_start + free_len - 1,
   3788				EXTENT_QGROUP_RESERVED, &changeset);
   3789		if (ret < 0)
   3790			goto out;
   3791		freed += changeset.bytes_changed;
   3792	}
   3793	btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
   3794				  BTRFS_QGROUP_RSV_DATA);
   3795	ret = freed;
   3796out:
   3797	extent_changeset_release(&changeset);
   3798	return ret;
   3799}
   3800
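/*
 * Common helper to release or free a data reservation.
 *
 * Clears EXTENT_QGROUP_RESERVED on [@start, @start + @len) and, if @free is
 * set, also returns the cleared bytes to the qgroups (limited to the ranges
 * recorded in @reserved when that is given).
 */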
   3801static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
   3802			struct extent_changeset *reserved, u64 start, u64 len,
   3803			int free)
   3804{
   3805	struct extent_changeset changeset;
   3806	int trace_op = QGROUP_RELEASE;
   3807	int ret;
   3808
   3809	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
   3810		return 0;
   3811
   3812	/* In release case, we shouldn't have @reserved */
   3813	WARN_ON(!free && reserved);
   3814	if (free && reserved)
   3815		return qgroup_free_reserved_data(inode, reserved, start, len);
   3816	extent_changeset_init(&changeset);
   3817	ret = clear_record_extent_bits(&inode->io_tree, start, start + len - 1,
   3818				       EXTENT_QGROUP_RESERVED, &changeset);
   3819	if (ret < 0)
   3820		goto out;
   3821
   3822	if (free)
   3823		trace_op = QGROUP_FREE;
   3824	trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
   3825					changeset.bytes_changed, trace_op);
   3826	if (free)
   3827		btrfs_qgroup_free_refroot(inode->root->fs_info,
   3828				inode->root->root_key.objectid,
   3829				changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
   3830	ret = changeset.bytes_changed;
   3831out:
   3832	extent_changeset_release(&changeset);
   3833	return ret;
   3834}
   3835
   3836/*
   3837 * Free a reserved space range from io_tree and related qgroups
   3838 *
   3839 * Should be called when a range of pages gets invalidated before reaching
   3840 * disk, or for the error cleanup case.
   3841 * If @reserved is given, only the reserved ranges in [@start, @start + @len)
   3842 * will be freed.
   3843 *
   3844 * For data written to disk, use btrfs_qgroup_release_data().
   3845 *
   3846 * NOTE: This function may sleep for memory allocation.
   3847 */
   3848int btrfs_qgroup_free_data(struct btrfs_inode *inode,
   3849			struct extent_changeset *reserved, u64 start, u64 len)
   3850{
   3851	return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
   3852}
   3853
   3854/*
   3855 * Release a reserved space range from io_tree only.
   3856 *
   3857 * Should be called when a range of pages gets written to disk and the
   3858 * corresponding FILE_EXTENT item is inserted into the corresponding root.
   3859 *
   3860 * Since the new qgroup accounting framework only updates qgroup numbers at
   3861 * commit_transaction() time, the reserved space shouldn't be freed from the
   3862 * related qgroups.
   3863 *
   3864 * But we should release the range from the io_tree, to allow further writes
   3865 * to be COWed.
   3866 *
   3867 * NOTE: This function may sleep for memory allocation.
   3868 */
   3869int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
   3870{
   3871	return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
   3872}
   3873
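/*
 * Track metadata reservations per root so that later frees can be clamped to
 * what was actually reserved (see sub_root_meta_rsv()), e.g. across a quota
 * disable/enable cycle.
 */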
   3874static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
   3875			      enum btrfs_qgroup_rsv_type type)
   3876{
   3877	if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
   3878	    type != BTRFS_QGROUP_RSV_META_PERTRANS)
   3879		return;
   3880	if (num_bytes == 0)
   3881		return;
   3882
   3883	spin_lock(&root->qgroup_meta_rsv_lock);
   3884	if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
   3885		root->qgroup_meta_rsv_prealloc += num_bytes;
   3886	else
   3887		root->qgroup_meta_rsv_pertrans += num_bytes;
   3888	spin_unlock(&root->qgroup_meta_rsv_lock);
   3889}
   3890
   3891static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
   3892			     enum btrfs_qgroup_rsv_type type)
   3893{
   3894	if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
   3895	    type != BTRFS_QGROUP_RSV_META_PERTRANS)
   3896		return 0;
   3897	if (num_bytes == 0)
   3898		return 0;
   3899
   3900	spin_lock(&root->qgroup_meta_rsv_lock);
   3901	if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
   3902		num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
   3903				  num_bytes);
   3904		root->qgroup_meta_rsv_prealloc -= num_bytes;
   3905	} else {
   3906		num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
   3907				  num_bytes);
   3908		root->qgroup_meta_rsv_pertrans -= num_bytes;
   3909	}
   3910	spin_unlock(&root->qgroup_meta_rsv_lock);
   3911	return num_bytes;
   3912}
   3913
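/*
 * Reserve @num_bytes of metadata space of the given rsv @type for @root.
 * No flushing is attempted on -EDQUOT; see __btrfs_qgroup_reserve_meta() for
 * the flushing variant.
 */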
   3914int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
   3915			      enum btrfs_qgroup_rsv_type type, bool enforce)
   3916{
   3917	struct btrfs_fs_info *fs_info = root->fs_info;
   3918	int ret;
   3919
   3920	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
   3921	    !is_fstree(root->root_key.objectid) || num_bytes == 0)
   3922		return 0;
   3923
   3924	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
   3925	trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
   3926	ret = qgroup_reserve(root, num_bytes, enforce, type);
   3927	if (ret < 0)
   3928		return ret;
   3929	/*
   3930	 * Record what we have reserved into root.
   3931	 *
   3932	 * This avoids underflow after a quota disable->enable cycle: in that
   3933	 * case we may try to free space we never reserved (since quota was
   3934	 * disabled), so recording the reservation in the root ensures a later
   3935	 * release won't underflow this number.
   3936	 */
   3937	add_root_meta_rsv(root, num_bytes, type);
   3938	return ret;
   3939}
   3940
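/*
 * Same as btrfs_qgroup_reserve_meta(), but on -EDQUOT it flushes qgroup space
 * (delalloc, ordered extents, transaction commit) once and retries, unless
 * @noflush is set.
 */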
   3941int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
   3942				enum btrfs_qgroup_rsv_type type, bool enforce,
   3943				bool noflush)
   3944{
   3945	int ret;
   3946
   3947	ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
   3948	if ((ret <= 0 && ret != -EDQUOT) || noflush)
   3949		return ret;
   3950
   3951	ret = try_flush_qgroup(root);
   3952	if (ret < 0)
   3953		return ret;
   3954	return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
   3955}
   3956
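/* Free all META_PERTRANS reserved space tracked for @root. */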
   3957void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
   3958{
   3959	struct btrfs_fs_info *fs_info = root->fs_info;
   3960
   3961	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
   3962	    !is_fstree(root->root_key.objectid))
   3963		return;
   3964
   3965	/* TODO: Update trace point to handle such free */
   3966	trace_qgroup_meta_free_all_pertrans(root);
   3967	/* Special value -1 means to free all reserved space */
   3968	btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
   3969				  BTRFS_QGROUP_RSV_META_PERTRANS);
   3970}
   3971
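/*
 * Free @num_bytes of metadata reservation of the given rsv @type, clamped to
 * what this root has actually recorded, to avoid underflowing the counters.
 */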
   3972void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
   3973			      enum btrfs_qgroup_rsv_type type)
   3974{
   3975	struct btrfs_fs_info *fs_info = root->fs_info;
   3976
   3977	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
   3978	    !is_fstree(root->root_key.objectid))
   3979		return;
   3980
   3981	/*
   3982	 * Reservation for META_PREALLOC can happen before quota is enabled,
   3983	 * which can lead to underflow.
   3984	 * Here we ensure we will only free what we really have reserved.
   3985	 */
   3986	num_bytes = sub_root_meta_rsv(root, num_bytes, type);
   3987	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
   3988	trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
   3989	btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
   3990				  num_bytes, type);
   3991}
   3992
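/*
 * Convert @num_bytes of reserved space from META_PREALLOC to META_PERTRANS
 * for @ref_root and all of its parent qgroups.
 */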
   3993static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
   3994				int num_bytes)
   3995{
   3996	struct btrfs_qgroup *qgroup;
   3997	struct ulist_node *unode;
   3998	struct ulist_iterator uiter;
   3999	int ret = 0;
   4000
   4001	if (num_bytes == 0)
   4002		return;
   4003	if (!fs_info->quota_root)
   4004		return;
   4005
   4006	spin_lock(&fs_info->qgroup_lock);
   4007	qgroup = find_qgroup_rb(fs_info, ref_root);
   4008	if (!qgroup)
   4009		goto out;
   4010	ulist_reinit(fs_info->qgroup_ulist);
   4011	ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
   4012		       qgroup_to_aux(qgroup), GFP_ATOMIC);
   4013	if (ret < 0)
   4014		goto out;
   4015	ULIST_ITER_INIT(&uiter);
   4016	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
   4017		struct btrfs_qgroup *qg;
   4018		struct btrfs_qgroup_list *glist;
   4019
   4020		qg = unode_aux_to_qgroup(unode);
   4021
   4022		qgroup_rsv_release(fs_info, qg, num_bytes,
   4023				BTRFS_QGROUP_RSV_META_PREALLOC);
   4024		qgroup_rsv_add(fs_info, qg, num_bytes,
   4025				BTRFS_QGROUP_RSV_META_PERTRANS);
   4026		list_for_each_entry(glist, &qg->groups, next_group) {
   4027			ret = ulist_add(fs_info->qgroup_ulist,
   4028					glist->group->qgroupid,
   4029					qgroup_to_aux(glist->group), GFP_ATOMIC);
   4030			if (ret < 0)
   4031				goto out;
   4032		}
   4033	}
   4034out:
   4035	spin_unlock(&fs_info->qgroup_lock);
   4036}
   4037
   4038void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
   4039{
   4040	struct btrfs_fs_info *fs_info = root->fs_info;
   4041
   4042	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
   4043	    !is_fstree(root->root_key.objectid))
   4044		return;
   4045	/* Same as btrfs_qgroup_free_meta_prealloc() */
   4046	num_bytes = sub_root_meta_rsv(root, num_bytes,
   4047				      BTRFS_QGROUP_RSV_META_PREALLOC);
   4048	trace_qgroup_meta_convert(root, num_bytes);
   4049	qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
   4050}
   4051
   4052/*
   4053 * Check for leaked qgroup reserved space, normally at inode destruction
   4054 * time.
   4055 */
   4056void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
   4057{
   4058	struct extent_changeset changeset;
   4059	struct ulist_node *unode;
   4060	struct ulist_iterator iter;
   4061	int ret;
   4062
   4063	extent_changeset_init(&changeset);
   4064	ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
   4065			EXTENT_QGROUP_RESERVED, &changeset);
   4066
   4067	WARN_ON(ret < 0);
   4068	if (WARN_ON(changeset.bytes_changed)) {
   4069		ULIST_ITER_INIT(&iter);
   4070		while ((unode = ulist_next(&changeset.range_changed, &iter))) {
   4071			btrfs_warn(inode->root->fs_info,
   4072		"leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
   4073				btrfs_ino(inode), unode->val, unode->aux);
   4074		}
   4075		btrfs_qgroup_free_refroot(inode->root->fs_info,
   4076				inode->root->root_key.objectid,
   4077				changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
   4078
   4079	}
   4080	extent_changeset_release(&changeset);
   4081}
   4082
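/* Initialize the per-level rbtrees and lock used to track swapped subtree blocks. */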
   4083void btrfs_qgroup_init_swapped_blocks(
   4084	struct btrfs_qgroup_swapped_blocks *swapped_blocks)
   4085{
   4086	int i;
   4087
   4088	spin_lock_init(&swapped_blocks->lock);
   4089	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
   4090		swapped_blocks->blocks[i] = RB_ROOT;
   4091	swapped_blocks->swapped = false;
   4092}
   4093
   4094/*
   4095 * Delete all swapped block records of @root.
   4096 * Every record here means we skipped a full subtree scan for qgroup.
   4097 *
   4098 * Gets called when committing one transaction.
   4099 */
   4100void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
   4101{
   4102	struct btrfs_qgroup_swapped_blocks *swapped_blocks;
   4103	int i;
   4104
   4105	swapped_blocks = &root->swapped_blocks;
   4106
   4107	spin_lock(&swapped_blocks->lock);
   4108	if (!swapped_blocks->swapped)
   4109		goto out;
   4110	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
   4111		struct rb_root *cur_root = &swapped_blocks->blocks[i];
   4112		struct btrfs_qgroup_swapped_block *entry;
   4113		struct btrfs_qgroup_swapped_block *next;
   4114
   4115		rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
   4116						     node)
   4117			kfree(entry);
   4118		swapped_blocks->blocks[i] = RB_ROOT;
   4119	}
   4120	swapped_blocks->swapped = false;
   4121out:
   4122	spin_unlock(&swapped_blocks->lock);
   4123}
   4124
   4125/*
   4126 * Add a record of the swapped subtree roots into @subvol_root.
   4127 *
   4128 * @subvol_root:	tree root of the subvolume tree that got swapped
   4129 * @bg:			block group under balance
   4130 * @subvol_parent/slot:	pointer to the subtree root in subvolume tree
   4131 * @reloc_parent/slot:	pointer to the subtree root in reloc tree
   4132 *			BOTH POINTERS ARE BEFORE TREE SWAP
   4133 * @last_snapshot:	last snapshot generation of the subvolume tree
   4134 */
   4135int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
   4136		struct btrfs_root *subvol_root,
   4137		struct btrfs_block_group *bg,
   4138		struct extent_buffer *subvol_parent, int subvol_slot,
   4139		struct extent_buffer *reloc_parent, int reloc_slot,
   4140		u64 last_snapshot)
   4141{
   4142	struct btrfs_fs_info *fs_info = subvol_root->fs_info;
   4143	struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
   4144	struct btrfs_qgroup_swapped_block *block;
   4145	struct rb_node **cur;
   4146	struct rb_node *parent = NULL;
   4147	int level = btrfs_header_level(subvol_parent) - 1;
   4148	int ret = 0;
   4149
   4150	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
   4151		return 0;
   4152
   4153	if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
   4154	    btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
   4155		btrfs_err_rl(fs_info,
   4156		"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
   4157			__func__,
   4158			btrfs_node_ptr_generation(subvol_parent, subvol_slot),
   4159			btrfs_node_ptr_generation(reloc_parent, reloc_slot));
   4160		return -EUCLEAN;
   4161	}
   4162
   4163	block = kmalloc(sizeof(*block), GFP_NOFS);
   4164	if (!block) {
   4165		ret = -ENOMEM;
   4166		goto out;
   4167	}
   4168
   4169	/*
   4170	 * @reloc_parent/slot is still before swap, while @block is going to
   4171	 * record the bytenr after swap, so we do the swap here.
   4172	 */
   4173	block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
   4174	block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
   4175							     reloc_slot);
   4176	block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
   4177	block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
   4178							    subvol_slot);
   4179	block->last_snapshot = last_snapshot;
   4180	block->level = level;
   4181
   4182	/*
   4183	 * If we have bg == NULL, we're called from btrfs_recover_relocation(),
   4184	 * where no one else can modify tree blocks, thus the qgroup will not
   4185	 * change no matter the value of trace_leaf.
   4186	 */
   4187	if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
   4188		block->trace_leaf = true;
   4189	else
   4190		block->trace_leaf = false;
   4191	btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
   4192
   4193	/* Insert @block into @blocks */
   4194	spin_lock(&blocks->lock);
   4195	cur = &blocks->blocks[level].rb_node;
   4196	while (*cur) {
   4197		struct btrfs_qgroup_swapped_block *entry;
   4198
   4199		parent = *cur;
   4200		entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
   4201				 node);
   4202
   4203		if (entry->subvol_bytenr < block->subvol_bytenr) {
   4204			cur = &(*cur)->rb_left;
   4205		} else if (entry->subvol_bytenr > block->subvol_bytenr) {
   4206			cur = &(*cur)->rb_right;
   4207		} else {
   4208			if (entry->subvol_generation !=
   4209					block->subvol_generation ||
   4210			    entry->reloc_bytenr != block->reloc_bytenr ||
   4211			    entry->reloc_generation !=
   4212					block->reloc_generation) {
   4213				/*
   4214				 * Duplicated but mismatched entry found.
   4215				 * Shouldn't happen.
   4216				 *
   4217				 * Marking qgroup inconsistent should be enough
   4218				 * for end users.
   4219				 */
   4220				WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
   4221				ret = -EEXIST;
   4222			}
   4223			kfree(block);
   4224			goto out_unlock;
   4225		}
   4226	}
   4227	rb_link_node(&block->node, parent, cur);
   4228	rb_insert_color(&block->node, &blocks->blocks[level]);
   4229	blocks->swapped = true;
   4230out_unlock:
   4231	spin_unlock(&blocks->lock);
   4232out:
   4233	if (ret < 0)
   4234		fs_info->qgroup_flags |=
   4235			BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
   4236	return ret;
   4237}
   4238
   4239/*
   4240 * Check if the tree block is a subtree root, and if so do the needed
   4241 * delayed subtree trace for qgroup.
   4242 *
   4243 * This is called during btrfs_cow_block().
   4244 */
   4245int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
   4246					 struct btrfs_root *root,
   4247					 struct extent_buffer *subvol_eb)
   4248{
   4249	struct btrfs_fs_info *fs_info = root->fs_info;
   4250	struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
   4251	struct btrfs_qgroup_swapped_block *block;
   4252	struct extent_buffer *reloc_eb = NULL;
   4253	struct rb_node *node;
   4254	bool found = false;
   4255	bool swapped = false;
   4256	int level = btrfs_header_level(subvol_eb);
   4257	int ret = 0;
   4258	int i;
   4259
   4260	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
   4261		return 0;
   4262	if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
   4263		return 0;
   4264
   4265	spin_lock(&blocks->lock);
   4266	if (!blocks->swapped) {
   4267		spin_unlock(&blocks->lock);
   4268		return 0;
   4269	}
   4270	node = blocks->blocks[level].rb_node;
   4271
   4272	while (node) {
   4273		block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
   4274		if (block->subvol_bytenr < subvol_eb->start) {
   4275			node = node->rb_left;
   4276		} else if (block->subvol_bytenr > subvol_eb->start) {
   4277			node = node->rb_right;
   4278		} else {
   4279			found = true;
   4280			break;
   4281		}
   4282	}
   4283	if (!found) {
   4284		spin_unlock(&blocks->lock);
   4285		goto out;
   4286	}
   4287	/* Found one, remove it from @blocks first and update blocks->swapped */
   4288	rb_erase(&block->node, &blocks->blocks[level]);
   4289	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
   4290		if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
   4291			swapped = true;
   4292			break;
   4293		}
   4294	}
   4295	blocks->swapped = swapped;
   4296	spin_unlock(&blocks->lock);
   4297
   4298	/* Read out reloc subtree root */
   4299	reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0,
   4300				   block->reloc_generation, block->level,
   4301				   &block->first_key);
   4302	if (IS_ERR(reloc_eb)) {
   4303		ret = PTR_ERR(reloc_eb);
   4304		reloc_eb = NULL;
   4305		goto free_out;
   4306	}
   4307	if (!extent_buffer_uptodate(reloc_eb)) {
   4308		ret = -EIO;
   4309		goto free_out;
   4310	}
   4311
   4312	ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
   4313			block->last_snapshot, block->trace_leaf);
   4314free_out:
   4315	kfree(block);
   4316	free_extent_buffer(reloc_eb);
   4317out:
   4318	if (ret < 0) {
   4319		btrfs_err_rl(fs_info,
   4320			     "failed to account subtree at bytenr %llu: %d",
   4321			     subvol_eb->start, ret);
   4322		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
   4323	}
   4324	return ret;
   4325}
   4326
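/*
 * Free every qgroup extent record still attached to @trans, including their
 * cached old_roots lists.
 */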
   4327void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
   4328{
   4329	struct btrfs_qgroup_extent_record *entry;
   4330	struct btrfs_qgroup_extent_record *next;
   4331	struct rb_root *root;
   4332
   4333	root = &trans->delayed_refs.dirty_extent_root;
   4334	rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
   4335		ulist_free(entry->old_roots);
   4336		kfree(entry);
   4337	}
   4338}