From 012e513e1bfeb39e1ce393f5a444f772ea27e954 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 30 Aug 2017 22:36:48 +0800 Subject: btrfs: declare TRACE_DEFINE_ENUM for each of show_flush_state enum So that perf can show the state symbol. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index dc1d0df91e0b..77437f545c63 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -28,6 +28,13 @@ struct btrfs_qgroup_extent_record; struct btrfs_qgroup; struct prelim_ref; +TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS_NR); +TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS); +TRACE_DEFINE_ENUM(FLUSH_DELALLOC); +TRACE_DEFINE_ENUM(FLUSH_DELALLOC_WAIT); +TRACE_DEFINE_ENUM(ALLOC_CHUNK); +TRACE_DEFINE_ENUM(COMMIT_TRANS); + #define show_ref_type(type) \ __print_symbolic(type, \ { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \ -- cgit v1.2.3-71-gd317 From 40c3c40947324d9f40bf47830c92c59a9bbadf4a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 23 Aug 2017 16:57:58 +0900 Subject: btrfs: Add sanity check for EXTENT_DATA when reading out leaf Add extra checks for item with EXTENT_DATA type. This checks the following thing: 0) Key offset All key offsets must be aligned to sectorsize. Inline extent must have 0 for key offset. 1) Item size Uncompressed inline file extent size must match item size. (Compressed inline file extent has no information about its on-disk size.) Regular/preallocated file extent size must be a fixed value. 2) Every member of regular file extent item Including alignment for bytenr and offset, possible value for compression/encryption/type. 3) Type/compression/encode must be one of the valid values. This should be the most comprehensive and strict check in the context of btrfs_item for EXTENT_DATA. Signed-off-by: Qu Wenruo Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba [ switch to BTRFS_FILE_EXTENT_TYPES, similar to what BTRFS_COMPRESS_TYPES does ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 103 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/btrfs_tree.h | 1 + 2 files changed, 104 insertions(+) (limited to 'include') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d1770b3e0385..b863d41f7d0a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -549,6 +549,100 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, btrfs_header_level(eb) == 0 ? "leaf" : "node", \ reason, btrfs_header_bytenr(eb), root->objectid, slot) +static int check_extent_data_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_file_extent_item *fi; + u32 sectorsize = root->fs_info->sectorsize; + u32 item_size = btrfs_item_size_nr(leaf, slot); + + if (!IS_ALIGNED(key->offset, sectorsize)) { + CORRUPT("unaligned key offset for file extent", + leaf, root, slot); + return -EUCLEAN; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { + CORRUPT("invalid file extent type", leaf, root, slot); + return -EUCLEAN; + } + + /* + * Support for new compression/encrption must introduce incompat flag, + * and must be caught in open_ctree(). + */ + if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { + CORRUPT("invalid file extent compression", leaf, root, slot); + return -EUCLEAN; + } + if (btrfs_file_extent_encryption(leaf, fi)) { + CORRUPT("invalid file extent encryption", leaf, root, slot); + return -EUCLEAN; + } + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { + /* Inline extent must have 0 as key offset */ + if (key->offset) { + CORRUPT("inline extent has non-zero key offset", + leaf, root, slot); + return -EUCLEAN; + } + + /* Compressed inline extent has no on-disk size, skip it */ + if (btrfs_file_extent_compression(leaf, fi) != + BTRFS_COMPRESS_NONE) + return 0; + + /* Uncompressed inline extent size must match item size */ + if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi)) { + CORRUPT("plaintext inline extent has invalid size", + leaf, root, slot); + return -EUCLEAN; + } + return 0; + } + + /* Regular or preallocated extent has fixed item size */ + if (item_size != sizeof(*fi)) { + CORRUPT( + "regluar or preallocated extent data item size is invalid", + leaf, root, slot); + return -EUCLEAN; + } + if (!IS_ALIGNED(btrfs_file_extent_ram_bytes(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_disk_bytenr(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_disk_num_bytes(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_offset(leaf, fi), sectorsize) || + !IS_ALIGNED(btrfs_file_extent_num_bytes(leaf, fi), sectorsize)) { + CORRUPT( + "regular or preallocated extent data item has unaligned value", + leaf, root, slot); + return -EUCLEAN; + } + + return 0; +} + +/* + * Common point to switch the item-specific validation. + */ +static int check_leaf_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + int ret = 0; + + switch (key->type) { + case BTRFS_EXTENT_DATA_KEY: + ret = check_extent_data_item(root, leaf, key, slot); + break; + } + return ret; +} + static noinline int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) { @@ -605,9 +699,13 @@ static noinline int check_leaf(struct btrfs_root *root, * 1) key order * 2) item offset and size * No overlap, no hole, all inside the leaf. + * 3) item content + * If possible, do comprehensive sanity check. + * NOTE: All checks must only rely on the item data itself. */ for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; + int ret; btrfs_item_key_to_cpu(leaf, &key, slot); @@ -650,6 +748,11 @@ static noinline int check_leaf(struct btrfs_root *root, return -EUCLEAN; } + /* Check if the item size and content meet other criteria */ + ret = check_leaf_item(root, leaf, &key, slot); + if (ret < 0) + return ret; + prev_key.objectid = key.objectid; prev_key.type = key.type; prev_key.offset = key.offset; diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 10689e1fdf11..3142645a27f5 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -732,6 +732,7 @@ struct btrfs_balance_item { #define BTRFS_FILE_EXTENT_INLINE 0 #define BTRFS_FILE_EXTENT_REG 1 #define BTRFS_FILE_EXTENT_PREALLOC 2 +#define BTRFS_FILE_EXTENT_TYPES 2 struct btrfs_file_extent_item { /* -- cgit v1.2.3-71-gd317 From d278850eff3053ef166cf64c16f798dfe36278a2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Sep 2017 15:43:57 -0400 Subject: btrfs: remove delayed_ref_node from ref_head This is just excessive information in the ref_head, and makes the code complicated. It is a relic from when we had the heads and the refs in the same tree, which is no longer the case. With this removal I've cleaned up a bunch of the cruft around this old assumption as well. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 4 +- fs/btrfs/delayed-ref.c | 126 +++++++++++++++++++------------------------ fs/btrfs/delayed-ref.h | 49 ++++++----------- fs/btrfs/disk-io.c | 12 ++--- fs/btrfs/extent-tree.c | 90 ++++++++++++------------------- include/trace/events/btrfs.h | 13 ++--- 6 files changed, 119 insertions(+), 175 deletions(-) (limited to 'include') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b517ef1477ea..33cba1abf8b6 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1178,7 +1178,7 @@ again: head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -1189,7 +1189,7 @@ again: */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); goto again; } spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 93ffa898df6d..b9b41c838da4 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -96,15 +96,15 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, u64 bytenr; ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); - bytenr = ins->node.bytenr; + bytenr = ins->bytenr; while (*p) { parent_node = *p; entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, href_node); - if (bytenr < entry->node.bytenr) + if (bytenr < entry->bytenr) p = &(*p)->rb_left; - else if (bytenr > entry->node.bytenr) + else if (bytenr > entry->bytenr) p = &(*p)->rb_right; else return entry; @@ -133,15 +133,15 @@ find_ref_head(struct rb_root *root, u64 bytenr, while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - if (bytenr < entry->node.bytenr) + if (bytenr < entry->bytenr) n = n->rb_left; - else if (bytenr > entry->node.bytenr) + else if (bytenr > entry->bytenr) n = n->rb_right; else return entry; } if (entry && return_bigger) { - if (bytenr > entry->node.bytenr) { + if (bytenr > entry->bytenr) { n = rb_next(&entry->href_node); if (!n) n = rb_first(root); @@ -164,17 +164,17 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, if (mutex_trylock(&head->mutex)) return 0; - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); spin_lock(&delayed_refs->lock); - if (!head->node.in_tree) { + if (RB_EMPTY_NODE(&head->href_node)) { mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return -EAGAIN; } - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return 0; } @@ -183,15 +183,10 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { - if (btrfs_delayed_ref_is_head(ref)) { - head = btrfs_delayed_node_to_head(ref); - rb_erase(&head->href_node, &delayed_refs->href_root); - } else { - assert_spin_locked(&head->lock); - list_del(&ref->list); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); - } + assert_spin_locked(&head->lock); + list_del(&ref->list); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); ref->in_tree = 0; btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); @@ -380,8 +375,8 @@ again: head->processing = 1; WARN_ON(delayed_refs->num_heads_ready == 0); delayed_refs->num_heads_ready--; - delayed_refs->run_delayed_start = head->node.bytenr + - head->node.num_bytes; + delayed_refs->run_delayed_start = head->bytenr + + head->num_bytes; return head; } @@ -469,20 +464,16 @@ add_tail: */ static noinline void update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update, + struct btrfs_delayed_ref_head *existing, + struct btrfs_delayed_ref_head *update, int *old_ref_mod_ret) { - struct btrfs_delayed_ref_head *existing_ref; - struct btrfs_delayed_ref_head *ref; int old_ref_mod; - existing_ref = btrfs_delayed_node_to_head(existing); - ref = btrfs_delayed_node_to_head(update); - BUG_ON(existing_ref->is_data != ref->is_data); + BUG_ON(existing->is_data != update->is_data); - spin_lock(&existing_ref->lock); - if (ref->must_insert_reserved) { + spin_lock(&existing->lock); + if (update->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref * entries were processed, we can end up @@ -490,7 +481,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * the must_insert_reserved flag set. * Set it again here */ - existing_ref->must_insert_reserved = ref->must_insert_reserved; + existing->must_insert_reserved = update->must_insert_reserved; /* * update the num_bytes so we make sure the accounting @@ -500,22 +491,22 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, } - if (ref->extent_op) { - if (!existing_ref->extent_op) { - existing_ref->extent_op = ref->extent_op; + if (update->extent_op) { + if (!existing->extent_op) { + existing->extent_op = update->extent_op; } else { - if (ref->extent_op->update_key) { - memcpy(&existing_ref->extent_op->key, - &ref->extent_op->key, - sizeof(ref->extent_op->key)); - existing_ref->extent_op->update_key = true; + if (update->extent_op->update_key) { + memcpy(&existing->extent_op->key, + &update->extent_op->key, + sizeof(update->extent_op->key)); + existing->extent_op->update_key = true; } - if (ref->extent_op->update_flags) { - existing_ref->extent_op->flags_to_set |= - ref->extent_op->flags_to_set; - existing_ref->extent_op->update_flags = true; + if (update->extent_op->update_flags) { + existing->extent_op->flags_to_set |= + update->extent_op->flags_to_set; + existing->extent_op->update_flags = true; } - btrfs_free_delayed_extent_op(ref->extent_op); + btrfs_free_delayed_extent_op(update->extent_op); } } /* @@ -523,23 +514,23 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * only need the lock for this case cause we could be processing it * currently, for refs we just added we know we're a-ok. */ - old_ref_mod = existing_ref->total_ref_mod; + old_ref_mod = existing->total_ref_mod; if (old_ref_mod_ret) *old_ref_mod_ret = old_ref_mod; existing->ref_mod += update->ref_mod; - existing_ref->total_ref_mod += update->ref_mod; + existing->total_ref_mod += update->ref_mod; /* * If we are going to from a positive ref mod to a negative or vice * versa we need to make sure to adjust pending_csums accordingly. */ - if (existing_ref->is_data) { - if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0) + if (existing->is_data) { + if (existing->total_ref_mod >= 0 && old_ref_mod < 0) delayed_refs->pending_csums -= existing->num_bytes; - if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0) + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) delayed_refs->pending_csums += existing->num_bytes; } - spin_unlock(&existing_ref->lock); + spin_unlock(&existing->lock); } /* @@ -550,14 +541,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, + struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, int action, int is_data, int *qrecord_inserted_ret, int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_ref_head *existing; - struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; int must_insert_reserved = 0; @@ -593,26 +583,21 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; - /* first set the basic ref node struct up */ - refcount_set(&ref->refs, 1); - ref->bytenr = bytenr; - ref->num_bytes = num_bytes; - ref->ref_mod = count_mod; - ref->type = 0; - ref->action = 0; - ref->is_head = 1; - ref->in_tree = 1; - ref->seq = 0; - - head_ref = btrfs_delayed_node_to_head(ref); + refcount_set(&head_ref->refs, 1); + head_ref->bytenr = bytenr; + head_ref->num_bytes = num_bytes; + head_ref->ref_mod = count_mod; head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; INIT_LIST_HEAD(&head_ref->ref_list); INIT_LIST_HEAD(&head_ref->ref_add_list); + RB_CLEAR_NODE(&head_ref->href_node); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; head_ref->qgroup_reserved = 0; head_ref->qgroup_ref_root = 0; + spin_lock_init(&head_ref->lock); + mutex_init(&head_ref->mutex); /* Record qgroup extent info if provided */ if (qrecord) { @@ -632,17 +617,14 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord_inserted = 1; } - spin_lock_init(&head_ref->lock); - mutex_init(&head_ref->mutex); - - trace_add_delayed_ref_head(fs_info, ref, head_ref, action); + trace_add_delayed_ref_head(fs_info, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { WARN_ON(ref_root && reserved && existing->qgroup_ref_root && existing->qgroup_reserved); - update_existing_head_ref(delayed_refs, &existing->node, ref, + update_existing_head_ref(delayed_refs, existing, head_ref, old_ref_mod); /* * we've updated the existing ref, free the newly @@ -821,7 +803,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, + head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, 0, 0, action, 0, &qrecord_inserted, old_ref_mod, new_ref_mod); @@ -888,7 +870,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, + head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, ref_root, reserved, action, 1, &qrecord_inserted, old_ref_mod, new_ref_mod); @@ -920,7 +902,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, + add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr, num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data, NULL, NULL, NULL); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index ce88e4ac5276..1ce11858d727 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -26,15 +26,6 @@ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ -/* - * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the - * same ref_node structure. - * Ref_head is in a higher logic level than tree/data ref, and duplicated - * bytenr/num_bytes in ref_node is really a waste or memory, they should be - * referred from ref_head. - * This gets more disgusting after we use list to store tree/data ref in - * ref_head. Must clean this mess up later. - */ struct btrfs_delayed_ref_node { /*data/tree ref use list, stored in ref_head->ref_list. */ struct list_head list; @@ -91,8 +82,9 @@ struct btrfs_delayed_extent_op { * reference count modifications we've queued up. */ struct btrfs_delayed_ref_head { - struct btrfs_delayed_ref_node node; - + u64 bytenr; + u64 num_bytes; + refcount_t refs; /* * the mutex is held while running the refs, and it is also * held when checking the sum of reference modifications. @@ -115,6 +107,14 @@ struct btrfs_delayed_ref_head { */ int total_ref_mod; + /* + * This is the current outstanding mod references for this bytenr. This + * is used with lookup_extent_info to get an accurate reference count + * for a bytenr, so it is adjusted as delayed refs are run so that any + * on disk reference count + ref_mod is accurate. + */ + int ref_mod; + /* * For qgroup reserved space freeing. * @@ -234,15 +234,18 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) case BTRFS_SHARED_DATA_REF_KEY: kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); break; - case 0: - kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); - break; default: BUG(); } } } +static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head) +{ + if (refcount_dec_and_test(&head->refs)) + kmem_cache_free(btrfs_delayed_ref_head_cachep, head); +} + int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, @@ -282,36 +285,18 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq); -/* - * a node might live in a head or a regular ref, this lets you - * test for the proper type to use. - */ -static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) -{ - return node->is_head; -} - /* * helper functions to cast a node into its container */ static inline struct btrfs_delayed_tree_ref * btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) { - WARN_ON(btrfs_delayed_ref_is_head(node)); return container_of(node, struct btrfs_delayed_tree_ref, node); } static inline struct btrfs_delayed_data_ref * btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) { - WARN_ON(btrfs_delayed_ref_is_head(node)); return container_of(node, struct btrfs_delayed_data_ref, node); } - -static inline struct btrfs_delayed_ref_head * -btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) -{ - WARN_ON(!btrfs_delayed_ref_is_head(node)); - return container_of(node, struct btrfs_delayed_ref_head, node); -} #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f971d5680e6f..69ce738c00d0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4121,12 +4121,12 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); spin_lock(&delayed_refs->lock); continue; } @@ -4147,16 +4147,16 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (head->processing == 0) delayed_refs->num_heads_ready--; atomic_dec(&delayed_refs->num_entries); - head->node.in_tree = 0; rb_erase(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); if (pin_bytes) - btrfs_pin_extent(fs_info, head->node.bytenr, - head->node.num_bytes, 1); - btrfs_put_delayed_ref(&head->node); + btrfs_pin_extent(fs_info, head->bytenr, + head->num_bytes, 1); + btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 628ae71094f2..423d89145bac 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -912,7 +912,7 @@ search_again: head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -923,7 +923,7 @@ search_again: */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); goto search_again; } spin_lock(&head->lock); @@ -932,7 +932,7 @@ search_again: else BUG_ON(num_refs == 0); - num_refs += head->node.ref_mod; + num_refs += head->ref_mod; spin_unlock(&head->lock); mutex_unlock(&head->mutex); } @@ -2337,7 +2337,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_key key; @@ -2359,14 +2359,14 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - key.objectid = node->bytenr; + key.objectid = head->bytenr; if (metadata) { key.type = BTRFS_METADATA_ITEM_KEY; key.offset = extent_op->level; } else { key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = node->num_bytes; + key.offset = head->num_bytes; } again: @@ -2383,17 +2383,17 @@ again: path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == node->bytenr && + if (key.objectid == head->bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == node->num_bytes) + key.offset == head->num_bytes) ret = 0; } if (ret > 0) { btrfs_release_path(path); metadata = 0; - key.objectid = node->bytenr; - key.offset = node->num_bytes; + key.objectid = head->bytenr; + key.offset = head->num_bytes; key.type = BTRFS_EXTENT_ITEM_KEY; goto again; } @@ -2562,7 +2562,7 @@ static int cleanup_extent_op(struct btrfs_trans_handle *trans, return 0; } spin_unlock(&head->lock); - ret = run_delayed_extent_op(trans, fs_info, &head->node, extent_op); + ret = run_delayed_extent_op(trans, fs_info, head, extent_op); btrfs_free_delayed_extent_op(extent_op); return ret ? ret : 1; } @@ -2597,39 +2597,37 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); return 1; } - head->node.in_tree = 0; delayed_refs->num_heads--; rb_erase(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); spin_unlock(&delayed_refs->lock); spin_unlock(&head->lock); atomic_dec(&delayed_refs->num_entries); - trace_run_delayed_ref_head(fs_info, &head->node, head, - head->node.action); + trace_run_delayed_ref_head(fs_info, head, 0); if (head->total_ref_mod < 0) { struct btrfs_block_group_cache *cache; - cache = btrfs_lookup_block_group(fs_info, head->node.bytenr); + cache = btrfs_lookup_block_group(fs_info, head->bytenr); ASSERT(cache); percpu_counter_add(&cache->space_info->total_bytes_pinned, - -head->node.num_bytes); + -head->num_bytes); btrfs_put_block_group(cache); if (head->is_data) { spin_lock(&delayed_refs->lock); - delayed_refs->pending_csums -= head->node.num_bytes; + delayed_refs->pending_csums -= head->num_bytes; spin_unlock(&delayed_refs->lock); } } if (head->must_insert_reserved) { - btrfs_pin_extent(fs_info, head->node.bytenr, - head->node.num_bytes, 1); + btrfs_pin_extent(fs_info, head->bytenr, + head->num_bytes, 1); if (head->is_data) { - ret = btrfs_del_csums(trans, fs_info, - head->node.bytenr, - head->node.num_bytes); + ret = btrfs_del_csums(trans, fs_info, head->bytenr, + head->num_bytes); } } @@ -2637,7 +2635,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, head->qgroup_reserved); btrfs_delayed_ref_unlock(head); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return 0; } @@ -2751,10 +2749,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, switch (ref->action) { case BTRFS_ADD_DELAYED_REF: case BTRFS_ADD_DELAYED_EXTENT: - locked_ref->node.ref_mod -= ref->ref_mod; + locked_ref->ref_mod -= ref->ref_mod; break; case BTRFS_DROP_DELAYED_REF: - locked_ref->node.ref_mod += ref->ref_mod; + locked_ref->ref_mod += ref->ref_mod; break; default: WARN_ON(1); @@ -3087,33 +3085,16 @@ again: spin_unlock(&delayed_refs->lock); goto out; } + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); - while (node) { - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - if (btrfs_delayed_ref_is_head(&head->node)) { - struct btrfs_delayed_ref_node *ref; - - ref = &head->node; - refcount_inc(&ref->refs); - - spin_unlock(&delayed_refs->lock); - /* - * Mutex was contended, block until it's - * released and try again - */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); + /* Mutex was contended, block until it's released and retry. */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - cond_resched(); - goto again; - } else { - WARN_ON(1); - } - node = rb_next(node); - } - spin_unlock(&delayed_refs->lock); + btrfs_put_delayed_ref_head(head); cond_resched(); goto again; } @@ -3171,7 +3152,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, } if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -3182,7 +3163,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return -EAGAIN; } spin_unlock(&delayed_refs->lock); @@ -7235,9 +7216,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, * at this point we have a head with no other entries. Go * ahead and process it. */ - head->node.in_tree = 0; rb_erase(&head->href_node, &delayed_refs->href_root); - + RB_CLEAR_NODE(&head->href_node); atomic_dec(&delayed_refs->num_entries); /* @@ -7256,7 +7236,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, ret = 1; mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return ret; out: spin_unlock(&head->lock); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 77437f545c63..bfe2f23b578c 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -798,11 +798,10 @@ DEFINE_EVENT(btrfs_delayed_data_ref, run_delayed_data_ref, DECLARE_EVENT_CLASS(btrfs_delayed_ref_head, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_delayed_ref_node *ref, const struct btrfs_delayed_ref_head *head_ref, int action), - TP_ARGS(fs_info, ref, head_ref, action), + TP_ARGS(fs_info, head_ref, action), TP_STRUCT__entry_btrfs( __field( u64, bytenr ) @@ -812,8 +811,8 @@ DECLARE_EVENT_CLASS(btrfs_delayed_ref_head, ), TP_fast_assign_btrfs(fs_info, - __entry->bytenr = ref->bytenr; - __entry->num_bytes = ref->num_bytes; + __entry->bytenr = head_ref->bytenr; + __entry->num_bytes = head_ref->num_bytes; __entry->action = action; __entry->is_data = head_ref->is_data; ), @@ -828,21 +827,19 @@ DECLARE_EVENT_CLASS(btrfs_delayed_ref_head, DEFINE_EVENT(btrfs_delayed_ref_head, add_delayed_ref_head, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_delayed_ref_node *ref, const struct btrfs_delayed_ref_head *head_ref, int action), - TP_ARGS(fs_info, ref, head_ref, action) + TP_ARGS(fs_info, head_ref, action) ); DEFINE_EVENT(btrfs_delayed_ref_head, run_delayed_ref_head, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_delayed_ref_node *ref, const struct btrfs_delayed_ref_head *head_ref, int action), - TP_ARGS(fs_info, ref, head_ref, action) + TP_ARGS(fs_info, head_ref, action) ); #define show_chunk_type(type) \ -- cgit v1.2.3-71-gd317 From d24a67b2d997c860a42516076f3315c2ad2d2884 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Fri, 22 Sep 2017 13:58:46 -0400 Subject: btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2 Now that check_extent_in_eb()'s extent offset filter can be turned off, we need a way to do it from userspace. Add a 'flags' field to the btrfs_logical_ino_args structure to disable extent offset filtering, taking the place of one of the existing reserved[] fields. Previous versions of LOGICAL_INO neglected to check whether any of the reserved fields have non-zero values. Assigning meaning to those fields now may change the behavior of existing programs that left these fields uninitialized. The lack of a zero check also means that new programs have no way to know whether the kernel is honoring the flags field. To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can use the same argument layout as LOGICAL_INO, but shorten the reserved[] array by one element and turn it into the 'flags' field. The V2 ioctl explicitly checks that reserved fields and unsupported flag bits are zero so that userspace can negotiate future feature bits as they are defined. Since the memory layouts of the two ioctls' arguments are compatible, there is no need for a separate function for logical_to_ino_v2 (contrast with tree_search_v2 vs tree_search where the layout and code are quite different). A version parameter and an 'if' statement will suffice. Now that we have a flags field in logical_ino_args, add a flag BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want, and pass it down the stack to iterate_inodes_from_logical. Motivation and background, copied from the patchset cover letter: Suppose we have a file with one extent: root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a root@tester:~# sync Split the extent by overwriting it in the middle: root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a We should now have 3 extent refs to 2 extents, with one block unreachable. The extent tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 2 [...] item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53 extent refs 2 gen 29 flags DATA extent data backref root 5 objectid 261 offset 0 count 2 [...] item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53 extent refs 1 gen 30 flags DATA extent data backref root 5 objectid 261 offset 8192 count 1 [...] and the ref tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 5 [...] item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 0 nr 8192 ram 73728 extent compression(none) item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53 extent data disk byte 1103175680 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression(none) item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 12288 nr 61440 ram 73728 extent compression(none) [...] There are two references to the same extent with different, non-overlapping byte offsets: [------------------72K extent at 1103101952----------------------] [--8K----------------|--4K unreachable----|--60K-----------------] ^ ^ | | [--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--] | v [-----4K extent-----] at 1103175680 We want to find all of the references to extent bytenr 1103101952. Without the patch (and without running btrfs-debug-tree), we have to do it with 18 LOGICAL_INO calls: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO inode 261 offset 0 root 5 root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode inode 261 offset 0 root 5 inode 261 offset 4096 root 5 <- same extent ref as offset 0 (offset 8192 returns empty set, not reachable) inode 261 offset 12288 root 5 inode 261 offset 16384 root 5 \ inode 261 offset 20480 root 5 | inode 261 offset 24576 root 5 | inode 261 offset 28672 root 5 | inode 261 offset 32768 root 5 | inode 261 offset 36864 root 5 \ inode 261 offset 40960 root 5 > all the same extent ref as offset 12288. inode 261 offset 45056 root 5 / More processing required in userspace inode 261 offset 49152 root 5 | to figure out these are all duplicates. inode 261 offset 53248 root 5 | inode 261 offset 57344 root 5 | inode 261 offset 61440 root 5 | inode 261 offset 65536 root 5 | inode 261 offset 69632 root 5 / In the worst case the extents are 128MB long, and we have to do 32768 iterations of the loop to find one 4K extent ref. With the patch, we just use one call to map all refs to the extent at once: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO_V2 inode 261 offset 0 root 5 inode 261 offset 12288 root 5 The TREE_SEARCH ioctl allows userspace to retrieve the offset and extent bytenr fields easily once the root, inode and offset are known. This is sufficient information to build a complete map of the extent and all of its references. Userspace can use this information to make better choices to dedup or defrag. Signed-off-by: Zygo Blaxell Reviewed-by: Hans van Kranenburg Tested-by: Hans van Kranenburg [ copy background and motivation from cover letter ] Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 26 +++++++++++++++++++++++--- include/uapi/linux/btrfs.h | 8 +++++++- 2 files changed, 30 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2497a5d45d9c..fa9996ab3da6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4530,13 +4530,14 @@ static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) } static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, - void __user *arg) + void __user *arg, int version) { int ret = 0; int size; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; + bool ignore_offset; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4545,6 +4546,22 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, if (IS_ERR(loi)) return PTR_ERR(loi); + if (version == 1) { + ignore_offset = false; + } else { + /* All reserved bits must be 0 for now */ + if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) { + ret = -EINVAL; + goto out_loi; + } + /* Only accept flags we have defined so far */ + if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) { + ret = -EINVAL; + goto out_loi; + } + ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET; + } + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -4560,7 +4577,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, - build_ino_list, inodes, false); + build_ino_list, inodes, ignore_offset); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -4574,6 +4591,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, out: btrfs_free_path(path); kvfree(inodes); +out_loi: kfree(loi); return ret; @@ -5575,7 +5593,9 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_INO_PATHS: return btrfs_ioctl_ino_to_path(root, argp); case BTRFS_IOC_LOGICAL_INO: - return btrfs_ioctl_logical_to_ino(fs_info, argp); + return btrfs_ioctl_logical_to_ino(fs_info, argp, 1); + case BTRFS_IOC_LOGICAL_INO_V2: + return btrfs_ioctl_logical_to_ino(fs_info, argp, 2); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(fs_info, argp); case BTRFS_IOC_SYNC: { diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 378230c163d5..99bb7988e6fe 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -608,10 +608,14 @@ struct btrfs_ioctl_ino_path_args { struct btrfs_ioctl_logical_ino_args { __u64 logical; /* in */ __u64 size; /* in */ - __u64 reserved[4]; + __u64 reserved[3]; /* must be 0 for now */ + __u64 flags; /* in, v2 only */ /* struct btrfs_data_container *inodes; out */ __u64 inodes; }; +/* Return every ref to the extent, not just those containing logical block. + * Requires logical == extent bytenr. */ +#define BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET (1ULL << 0) enum btrfs_dev_stat_values { /* disk I/O failure stats */ @@ -835,5 +839,7 @@ enum btrfs_err_code { struct btrfs_ioctl_feature_flags[3]) #define BTRFS_IOC_RM_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 58, \ struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_LOGICAL_INO_V2 _IOWR(BTRFS_IOCTL_MAGIC, 59, \ + struct btrfs_ioctl_logical_ino_args) #endif /* _UAPI_LINUX_BTRFS_H */ -- cgit v1.2.3-71-gd317 From dd48d4072e0cdac51edcbff66342fe2f21b5b588 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 19 Oct 2017 14:15:56 -0400 Subject: btrfs: add tracepoints for outstanding extents mods This is handy for tracing problems with modifying the outstanding extents counters. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 ++ include/trace/events/btrfs.h | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index bed88811f686..427c8738a3bd 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -274,6 +274,8 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, inode->outstanding_extents += mod; if (btrfs_is_free_space_inode(inode)) return; + trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode), + mod); } static inline void btrfs_mod_reserved_extents(struct btrfs_inode *inode, int mod) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index bfe2f23b578c..5848ae7845da 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1695,6 +1695,27 @@ DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_insert, TP_ARGS(fs_info, oldref, newref, tree_size) ); +TRACE_EVENT(btrfs_inode_mod_outstanding_extents, + TP_PROTO(struct btrfs_root *root, u64 ino, int mod), + + TP_ARGS(root, ino, mod), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( u64, ino ) + __field( int, mod ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->objectid; + __entry->ino = ino; + __entry->mod = mod; + ), + + TP_printk_btrfs("root=%llu(%s) ino=%llu mod=%d", + show_root_type(__entry->root_objectid), + (unsigned long long)__entry->ino, __entry->mod) +); #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ -- cgit v1.2.3-71-gd317