cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

refcounttree.c (123112B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * refcounttree.c
      4 *
      5 * Copyright (C) 2009 Oracle.  All rights reserved.
      6 */
      7
      8#include <linux/sort.h>
      9#include <cluster/masklog.h>
     10#include "ocfs2.h"
     11#include "inode.h"
     12#include "alloc.h"
     13#include "suballoc.h"
     14#include "journal.h"
     15#include "uptodate.h"
     16#include "super.h"
     17#include "buffer_head_io.h"
     18#include "blockcheck.h"
     19#include "refcounttree.h"
     20#include "sysfile.h"
     21#include "dlmglue.h"
     22#include "extent_map.h"
     23#include "aops.h"
     24#include "xattr.h"
     25#include "namei.h"
     26#include "ocfs2_trace.h"
     27#include "file.h"
     28
     29#include <linux/bio.h>
     30#include <linux/blkdev.h>
     31#include <linux/slab.h>
     32#include <linux/writeback.h>
     33#include <linux/pagevec.h>
     34#include <linux/swap.h>
     35#include <linux/security.h>
     36#include <linux/fsnotify.h>
     37#include <linux/quotaops.h>
     38#include <linux/namei.h>
     39#include <linux/mount.h>
     40#include <linux/posix_acl.h>
     41
     42struct ocfs2_cow_context {
     43	struct inode *inode;
     44	u32 cow_start;
     45	u32 cow_len;
     46	struct ocfs2_extent_tree data_et;
     47	struct ocfs2_refcount_tree *ref_tree;
     48	struct buffer_head *ref_root_bh;
     49	struct ocfs2_alloc_context *meta_ac;
     50	struct ocfs2_alloc_context *data_ac;
     51	struct ocfs2_cached_dealloc_ctxt dealloc;
     52	void *cow_object;
     53	struct ocfs2_post_refcount *post_refcount;
     54	int extra_credits;
     55	int (*get_clusters)(struct ocfs2_cow_context *context,
     56			    u32 v_cluster, u32 *p_cluster,
     57			    u32 *num_clusters,
     58			    unsigned int *extent_flags);
     59	int (*cow_duplicate_clusters)(handle_t *handle,
     60				      struct inode *inode,
     61				      u32 cpos, u32 old_cluster,
     62				      u32 new_cluster, u32 new_len);
     63};
     64
     65static inline struct ocfs2_refcount_tree *
     66cache_info_to_refcount(struct ocfs2_caching_info *ci)
     67{
     68	return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
     69}
     70
     71static int ocfs2_validate_refcount_block(struct super_block *sb,
     72					 struct buffer_head *bh)
     73{
     74	int rc;
     75	struct ocfs2_refcount_block *rb =
     76		(struct ocfs2_refcount_block *)bh->b_data;
     77
     78	trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr);
     79
     80	BUG_ON(!buffer_uptodate(bh));
     81
     82	/*
     83	 * If the ecc fails, we return the error but otherwise
     84	 * leave the filesystem running.  We know any error is
     85	 * local to this block.
     86	 */
     87	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
     88	if (rc) {
     89		mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
     90		     (unsigned long long)bh->b_blocknr);
     91		return rc;
     92	}
     93
     94
     95	if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
     96		rc = ocfs2_error(sb,
     97				 "Refcount block #%llu has bad signature %.*s\n",
     98				 (unsigned long long)bh->b_blocknr, 7,
     99				 rb->rf_signature);
    100		goto out;
    101	}
    102
    103	if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
    104		rc = ocfs2_error(sb,
    105				 "Refcount block #%llu has an invalid rf_blkno of %llu\n",
    106				 (unsigned long long)bh->b_blocknr,
    107				 (unsigned long long)le64_to_cpu(rb->rf_blkno));
    108		goto out;
    109	}
    110
    111	if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
    112		rc = ocfs2_error(sb,
    113				 "Refcount block #%llu has an invalid rf_fs_generation of #%u\n",
    114				 (unsigned long long)bh->b_blocknr,
    115				 le32_to_cpu(rb->rf_fs_generation));
    116		goto out;
    117	}
    118out:
    119	return rc;
    120}
    121
    122static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
    123				     u64 rb_blkno,
    124				     struct buffer_head **bh)
    125{
    126	int rc;
    127	struct buffer_head *tmp = *bh;
    128
    129	rc = ocfs2_read_block(ci, rb_blkno, &tmp,
    130			      ocfs2_validate_refcount_block);
    131
    132	/* If ocfs2_read_block() got us a new bh, pass it up. */
    133	if (!rc && !*bh)
    134		*bh = tmp;
    135
    136	return rc;
    137}
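/*
 * A minimal caller sketch (hypothetical, not from this file) of the
 * bh-passing convention above: pass *bh == NULL and let
 * ocfs2_read_block() allocate the buffer, or pass an existing bh to
 * be filled in.
 *
 *	struct buffer_head *bh = NULL;
 *	int ret;
 *
 *	ret = ocfs2_read_refcount_block(ci, rb_blkno, &bh);
 *	if (ret)
 *		return ret;
 *	... use bh->b_data as a struct ocfs2_refcount_block ...
 *	brelse(bh);
 */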
    138
    139static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
    140{
    141	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
    142
    143	return rf->rf_blkno;
    144}
    145
    146static struct super_block *
    147ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
    148{
    149	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
    150
    151	return rf->rf_sb;
    152}
    153
    154static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
    155__acquires(&rf->rf_lock)
    156{
    157	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
    158
    159	spin_lock(&rf->rf_lock);
    160}
    161
    162static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
    163__releases(&rf->rf_lock)
    164{
    165	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
    166
    167	spin_unlock(&rf->rf_lock);
    168}
    169
    170static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
    171{
    172	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
    173
    174	mutex_lock(&rf->rf_io_mutex);
    175}
    176
    177static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
    178{
    179	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
    180
    181	mutex_unlock(&rf->rf_io_mutex);
    182}
    183
    184static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
    185	.co_owner		= ocfs2_refcount_cache_owner,
    186	.co_get_super		= ocfs2_refcount_cache_get_super,
    187	.co_cache_lock		= ocfs2_refcount_cache_lock,
    188	.co_cache_unlock	= ocfs2_refcount_cache_unlock,
    189	.co_io_lock		= ocfs2_refcount_cache_io_lock,
    190	.co_io_unlock		= ocfs2_refcount_cache_io_unlock,
    191};
    192
    193static struct ocfs2_refcount_tree *
    194ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
    195{
    196	struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
    197	struct ocfs2_refcount_tree *tree = NULL;
    198
    199	while (n) {
    200		tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
    201
    202		if (blkno < tree->rf_blkno)
    203			n = n->rb_left;
    204		else if (blkno > tree->rf_blkno)
    205			n = n->rb_right;
    206		else
    207			return tree;
    208	}
    209
    210	return NULL;
    211}
    212
    213/* osb_lock is already locked. */
    214static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
    215				       struct ocfs2_refcount_tree *new)
    216{
    217	u64 rf_blkno = new->rf_blkno;
    218	struct rb_node *parent = NULL;
    219	struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
    220	struct ocfs2_refcount_tree *tmp;
    221
    222	while (*p) {
    223		parent = *p;
    224
    225		tmp = rb_entry(parent, struct ocfs2_refcount_tree,
    226			       rf_node);
    227
    228		if (rf_blkno < tmp->rf_blkno)
    229			p = &(*p)->rb_left;
    230		else if (rf_blkno > tmp->rf_blkno)
    231			p = &(*p)->rb_right;
    232		else {
    233			/* This should never happen! */
    234			mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
    235			     (unsigned long long)rf_blkno);
    236			BUG();
    237		}
    238	}
    239
    240	rb_link_node(&new->rf_node, parent, p);
    241	rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
    242}
    243
    244static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
    245{
    246	ocfs2_metadata_cache_exit(&tree->rf_ci);
    247	ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
    248	ocfs2_lock_res_free(&tree->rf_lockres);
    249	kfree(tree);
    250}
    251
    252static inline void
    253ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
    254					struct ocfs2_refcount_tree *tree)
    255{
    256	rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
    257	if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
    258		osb->osb_ref_tree_lru = NULL;
    259}
    260
    261static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
    262					struct ocfs2_refcount_tree *tree)
    263{
    264	spin_lock(&osb->osb_lock);
    265	ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
    266	spin_unlock(&osb->osb_lock);
    267}
    268
    269static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
    270{
    271	struct ocfs2_refcount_tree *tree =
    272		container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
    273
    274	ocfs2_free_refcount_tree(tree);
    275}
    276
    277static inline void
    278ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
    279{
    280	kref_get(&tree->rf_getcnt);
    281}
    282
    283static inline void
    284ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
    285{
    286	kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
    287}
    288
    289static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
    290					       struct super_block *sb)
    291{
    292	ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
    293	mutex_init(&new->rf_io_mutex);
    294	new->rf_sb = sb;
    295	spin_lock_init(&new->rf_lock);
    296}
    297
    298static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
    299					struct ocfs2_refcount_tree *new,
    300					u64 rf_blkno, u32 generation)
    301{
    302	init_rwsem(&new->rf_sem);
    303	ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
    304				     rf_blkno, generation);
    305}
    306
    307static struct ocfs2_refcount_tree*
    308ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
    309{
    310	struct ocfs2_refcount_tree *new;
    311
    312	new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
    313	if (!new)
    314		return NULL;
    315
    316	new->rf_blkno = rf_blkno;
    317	kref_init(&new->rf_getcnt);
    318	ocfs2_init_refcount_tree_ci(new, osb->sb);
    319
    320	return new;
    321}
    322
    323static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
    324				   struct ocfs2_refcount_tree **ret_tree)
    325{
    326	int ret = 0;
    327	struct ocfs2_refcount_tree *tree, *new = NULL;
    328	struct buffer_head *ref_root_bh = NULL;
    329	struct ocfs2_refcount_block *ref_rb;
    330
    331	spin_lock(&osb->osb_lock);
    332	if (osb->osb_ref_tree_lru &&
    333	    osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
    334		tree = osb->osb_ref_tree_lru;
    335	else
    336		tree = ocfs2_find_refcount_tree(osb, rf_blkno);
    337	if (tree)
    338		goto out;
    339
    340	spin_unlock(&osb->osb_lock);
    341
    342	new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
    343	if (!new) {
    344		ret = -ENOMEM;
    345		mlog_errno(ret);
    346		return ret;
    347	}
    348	/*
     349	 * We need the generation to create the refcount tree lock, and since
     350	 * it isn't changed during tree modification, we are safe to read it
     351	 * here without protection.
     352	 * We also have to purge the cache after we create the lock, since the
     353	 * refcount block may contain stale data. It can only be trusted once
     354	 * we hold the refcount lock.
    355	 */
    356	ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
    357	if (ret) {
    358		mlog_errno(ret);
    359		ocfs2_metadata_cache_exit(&new->rf_ci);
    360		kfree(new);
    361		return ret;
    362	}
    363
    364	ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
    365	new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
    366	ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
    367				      new->rf_generation);
    368	ocfs2_metadata_cache_purge(&new->rf_ci);
    369
    370	spin_lock(&osb->osb_lock);
    371	tree = ocfs2_find_refcount_tree(osb, rf_blkno);
    372	if (tree)
    373		goto out;
    374
    375	ocfs2_insert_refcount_tree(osb, new);
    376
    377	tree = new;
    378	new = NULL;
    379
    380out:
    381	*ret_tree = tree;
    382
    383	osb->osb_ref_tree_lru = tree;
    384
    385	spin_unlock(&osb->osb_lock);
    386
    387	if (new)
    388		ocfs2_free_refcount_tree(new);
    389
    390	brelse(ref_root_bh);
    391	return ret;
    392}
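/*
 * The function above follows the usual "allocate outside the lock,
 * re-check under the lock" pattern: kzalloc() and the disk read may
 * sleep, so they run with osb_lock dropped, and the rb-tree is
 * searched again before inserting so that a tree created by a racing
 * thread wins and our copy is freed. A condensed sketch of the
 * pattern, with hypothetical helpers:
 *
 *	spin_lock(&lock);
 *	obj = find(key);
 *	spin_unlock(&lock);
 *	if (!obj) {
 *		new = alloc_and_init(key);	(may sleep)
 *		spin_lock(&lock);
 *		obj = find(key);		(re-check for a racer)
 *		if (!obj) {
 *			insert(new);
 *			obj = new;
 *			new = NULL;
 *		}
 *		spin_unlock(&lock);
 *		if (new)
 *			free_it(new);		(we lost the race)
 *	}
 */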
    393
    394static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
    395{
    396	int ret;
    397	struct buffer_head *di_bh = NULL;
    398	struct ocfs2_dinode *di;
    399
    400	ret = ocfs2_read_inode_block(inode, &di_bh);
    401	if (ret) {
    402		mlog_errno(ret);
    403		goto out;
    404	}
    405
    406	BUG_ON(!ocfs2_is_refcount_inode(inode));
    407
    408	di = (struct ocfs2_dinode *)di_bh->b_data;
    409	*ref_blkno = le64_to_cpu(di->i_refcount_loc);
    410	brelse(di_bh);
    411out:
    412	return ret;
    413}
    414
    415static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
    416				      struct ocfs2_refcount_tree *tree, int rw)
    417{
    418	int ret;
    419
    420	ret = ocfs2_refcount_lock(tree, rw);
    421	if (ret) {
    422		mlog_errno(ret);
    423		goto out;
    424	}
    425
    426	if (rw)
    427		down_write(&tree->rf_sem);
    428	else
    429		down_read(&tree->rf_sem);
    430
    431out:
    432	return ret;
    433}
    434
    435/*
     436 * Lock the refcount tree pointed to by ref_blkno and return the tree.
     437 * In most cases, we lock the tree and read the refcount block anyway,
     438 * so read it here if the caller really needs it.
     439 *
     440 * If the tree has been re-created by another node, free the
     441 * old one and re-create it.
    442 */
    443int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
    444			     u64 ref_blkno, int rw,
    445			     struct ocfs2_refcount_tree **ret_tree,
    446			     struct buffer_head **ref_bh)
    447{
    448	int ret, delete_tree = 0;
    449	struct ocfs2_refcount_tree *tree = NULL;
    450	struct buffer_head *ref_root_bh = NULL;
    451	struct ocfs2_refcount_block *rb;
    452
    453again:
    454	ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
    455	if (ret) {
    456		mlog_errno(ret);
    457		return ret;
    458	}
    459
    460	ocfs2_refcount_tree_get(tree);
    461
    462	ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
    463	if (ret) {
    464		mlog_errno(ret);
    465		ocfs2_refcount_tree_put(tree);
    466		goto out;
    467	}
    468
    469	ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
    470					&ref_root_bh);
    471	if (ret) {
    472		mlog_errno(ret);
    473		ocfs2_unlock_refcount_tree(osb, tree, rw);
    474		goto out;
    475	}
    476
    477	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
    478	/*
    479	 * If the refcount block has been freed and re-created, we may need
    480	 * to recreate the refcount tree also.
    481	 *
    482	 * Here we just remove the tree from the rb-tree, and the last
    483	 * kref holder will unlock and delete this refcount_tree.
    484	 * Then we goto "again" and ocfs2_get_refcount_tree will create
    485	 * the new refcount tree for us.
    486	 */
    487	if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
    488		if (!tree->rf_removed) {
    489			ocfs2_erase_refcount_tree_from_list(osb, tree);
    490			tree->rf_removed = 1;
    491			delete_tree = 1;
    492		}
    493
    494		ocfs2_unlock_refcount_tree(osb, tree, rw);
    495		/*
    496		 * We get an extra reference when we create the refcount
    497		 * tree, so another put will destroy it.
    498		 */
    499		if (delete_tree)
    500			ocfs2_refcount_tree_put(tree);
    501		brelse(ref_root_bh);
    502		ref_root_bh = NULL;
    503		goto again;
    504	}
    505
    506	*ret_tree = tree;
    507	if (ref_bh) {
    508		*ref_bh = ref_root_bh;
    509		ref_root_bh = NULL;
    510	}
    511out:
    512	brelse(ref_root_bh);
    513	return ret;
    514}
    515
    516void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
    517				struct ocfs2_refcount_tree *tree, int rw)
    518{
    519	if (rw)
    520		up_write(&tree->rf_sem);
    521	else
    522		up_read(&tree->rf_sem);
    523
    524	ocfs2_refcount_unlock(tree, rw);
    525	ocfs2_refcount_tree_put(tree);
    526}
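/*
 * A minimal lock/unlock pairing sketch for the two functions above
 * (hypothetical caller): take the tree write-locked, modify it, then
 * release in the reverse order.
 *
 *	struct ocfs2_refcount_tree *tree;
 *	struct buffer_head *ref_root_bh = NULL;
 *	int ret;
 *
 *	ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1,
 *				       &tree, &ref_root_bh);
 *	if (ret)
 *		return ret;
 *	... modify the refcount tree under the write lock ...
 *	ocfs2_unlock_refcount_tree(osb, tree, 1);
 *	brelse(ref_root_bh);
 */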
    527
    528void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
    529{
    530	struct rb_node *node;
    531	struct ocfs2_refcount_tree *tree;
    532	struct rb_root *root = &osb->osb_rf_lock_tree;
    533
    534	while ((node = rb_last(root)) != NULL) {
    535		tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
    536
    537		trace_ocfs2_purge_refcount_trees(
    538				(unsigned long long) tree->rf_blkno);
    539
    540		rb_erase(&tree->rf_node, root);
    541		ocfs2_free_refcount_tree(tree);
    542	}
    543}
    544
    545/*
    546 * Create a refcount tree for an inode.
    547 * We take for granted that the inode is already locked.
    548 */
    549static int ocfs2_create_refcount_tree(struct inode *inode,
    550				      struct buffer_head *di_bh)
    551{
    552	int ret;
    553	handle_t *handle = NULL;
    554	struct ocfs2_alloc_context *meta_ac = NULL;
    555	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
    556	struct ocfs2_inode_info *oi = OCFS2_I(inode);
    557	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    558	struct buffer_head *new_bh = NULL;
    559	struct ocfs2_refcount_block *rb;
    560	struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
    561	u16 suballoc_bit_start;
    562	u32 num_got;
    563	u64 suballoc_loc, first_blkno;
    564
    565	BUG_ON(ocfs2_is_refcount_inode(inode));
    566
    567	trace_ocfs2_create_refcount_tree(
    568		(unsigned long long)oi->ip_blkno);
    569
    570	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
    571	if (ret) {
    572		mlog_errno(ret);
    573		goto out;
    574	}
    575
    576	handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
    577	if (IS_ERR(handle)) {
    578		ret = PTR_ERR(handle);
    579		mlog_errno(ret);
    580		goto out;
    581	}
    582
    583	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
    584				      OCFS2_JOURNAL_ACCESS_WRITE);
    585	if (ret) {
    586		mlog_errno(ret);
    587		goto out_commit;
    588	}
    589
    590	ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
    591				   &suballoc_bit_start, &num_got,
    592				   &first_blkno);
    593	if (ret) {
    594		mlog_errno(ret);
    595		goto out_commit;
    596	}
    597
    598	new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
    599	if (!new_tree) {
    600		ret = -ENOMEM;
    601		mlog_errno(ret);
    602		goto out_commit;
    603	}
    604
    605	new_bh = sb_getblk(inode->i_sb, first_blkno);
    606	if (!new_bh) {
    607		ret = -ENOMEM;
    608		mlog_errno(ret);
    609		goto out_commit;
    610	}
    611	ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
    612
    613	ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
    614				      OCFS2_JOURNAL_ACCESS_CREATE);
    615	if (ret) {
    616		mlog_errno(ret);
    617		goto out_commit;
    618	}
    619
    620	/* Initialize ocfs2_refcount_block. */
    621	rb = (struct ocfs2_refcount_block *)new_bh->b_data;
    622	memset(rb, 0, inode->i_sb->s_blocksize);
    623	strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
    624	rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
    625	rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
    626	rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
    627	rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
    628	rb->rf_blkno = cpu_to_le64(first_blkno);
    629	rb->rf_count = cpu_to_le32(1);
    630	rb->rf_records.rl_count =
    631			cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
    632	spin_lock(&osb->osb_lock);
    633	rb->rf_generation = osb->s_next_generation++;
    634	spin_unlock(&osb->osb_lock);
    635
    636	ocfs2_journal_dirty(handle, new_bh);
    637
    638	spin_lock(&oi->ip_lock);
    639	oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
    640	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
    641	di->i_refcount_loc = cpu_to_le64(first_blkno);
    642	spin_unlock(&oi->ip_lock);
    643
    644	trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno);
    645
    646	ocfs2_journal_dirty(handle, di_bh);
    647
    648	/*
    649	 * We have to init the tree lock here since it will use
    650	 * the generation number to create it.
    651	 */
    652	new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
    653	ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
    654				      new_tree->rf_generation);
    655
    656	spin_lock(&osb->osb_lock);
    657	tree = ocfs2_find_refcount_tree(osb, first_blkno);
    658
    659	/*
    660	 * We've just created a new refcount tree in this block.  If
    661	 * we found a refcount tree on the ocfs2_super, it must be
    662	 * one we just deleted.  We free the old tree before
    663	 * inserting the new tree.
    664	 */
    665	BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
    666	if (tree)
    667		ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
    668	ocfs2_insert_refcount_tree(osb, new_tree);
    669	spin_unlock(&osb->osb_lock);
    670	new_tree = NULL;
    671	if (tree)
    672		ocfs2_refcount_tree_put(tree);
    673
    674out_commit:
    675	ocfs2_commit_trans(osb, handle);
    676
    677out:
    678	if (new_tree) {
    679		ocfs2_metadata_cache_exit(&new_tree->rf_ci);
    680		kfree(new_tree);
    681	}
    682
    683	brelse(new_bh);
    684	if (meta_ac)
    685		ocfs2_free_alloc_context(meta_ac);
    686
    687	return ret;
    688}
    689
    690static int ocfs2_set_refcount_tree(struct inode *inode,
    691				   struct buffer_head *di_bh,
    692				   u64 refcount_loc)
    693{
    694	int ret;
    695	handle_t *handle = NULL;
    696	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
    697	struct ocfs2_inode_info *oi = OCFS2_I(inode);
    698	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    699	struct buffer_head *ref_root_bh = NULL;
    700	struct ocfs2_refcount_block *rb;
    701	struct ocfs2_refcount_tree *ref_tree;
    702
    703	BUG_ON(ocfs2_is_refcount_inode(inode));
    704
    705	ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
    706				       &ref_tree, &ref_root_bh);
    707	if (ret) {
    708		mlog_errno(ret);
    709		return ret;
    710	}
    711
    712	handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
    713	if (IS_ERR(handle)) {
    714		ret = PTR_ERR(handle);
    715		mlog_errno(ret);
    716		goto out;
    717	}
    718
    719	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
    720				      OCFS2_JOURNAL_ACCESS_WRITE);
    721	if (ret) {
    722		mlog_errno(ret);
    723		goto out_commit;
    724	}
    725
    726	ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
    727				      OCFS2_JOURNAL_ACCESS_WRITE);
    728	if (ret) {
    729		mlog_errno(ret);
    730		goto out_commit;
    731	}
    732
    733	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
    734	le32_add_cpu(&rb->rf_count, 1);
    735
    736	ocfs2_journal_dirty(handle, ref_root_bh);
    737
    738	spin_lock(&oi->ip_lock);
    739	oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
    740	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
    741	di->i_refcount_loc = cpu_to_le64(refcount_loc);
    742	spin_unlock(&oi->ip_lock);
    743	ocfs2_journal_dirty(handle, di_bh);
    744
    745out_commit:
    746	ocfs2_commit_trans(osb, handle);
    747out:
    748	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
    749	brelse(ref_root_bh);
    750
    751	return ret;
    752}
    753
    754int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
    755{
    756	int ret, delete_tree = 0;
    757	handle_t *handle = NULL;
    758	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
    759	struct ocfs2_inode_info *oi = OCFS2_I(inode);
    760	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    761	struct ocfs2_refcount_block *rb;
    762	struct inode *alloc_inode = NULL;
    763	struct buffer_head *alloc_bh = NULL;
    764	struct buffer_head *blk_bh = NULL;
    765	struct ocfs2_refcount_tree *ref_tree;
    766	int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
    767	u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
    768	u16 bit = 0;
    769
    770	if (!ocfs2_is_refcount_inode(inode))
    771		return 0;
    772
    773	BUG_ON(!ref_blkno);
    774	ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
    775	if (ret) {
    776		mlog_errno(ret);
    777		return ret;
    778	}
    779
    780	rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
    781
    782	/*
    783	 * If we are the last user, we need to free the block.
     784	 * So lock the allocator ahead of time.
    785	 */
    786	if (le32_to_cpu(rb->rf_count) == 1) {
    787		blk = le64_to_cpu(rb->rf_blkno);
    788		bit = le16_to_cpu(rb->rf_suballoc_bit);
    789		if (rb->rf_suballoc_loc)
    790			bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
    791		else
    792			bg_blkno = ocfs2_which_suballoc_group(blk, bit);
    793
    794		alloc_inode = ocfs2_get_system_file_inode(osb,
    795					EXTENT_ALLOC_SYSTEM_INODE,
    796					le16_to_cpu(rb->rf_suballoc_slot));
    797		if (!alloc_inode) {
    798			ret = -ENOMEM;
    799			mlog_errno(ret);
    800			goto out;
    801		}
    802		inode_lock(alloc_inode);
    803
    804		ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
    805		if (ret) {
    806			mlog_errno(ret);
    807			goto out_mutex;
    808		}
    809
    810		credits += OCFS2_SUBALLOC_FREE;
    811	}
    812
    813	handle = ocfs2_start_trans(osb, credits);
    814	if (IS_ERR(handle)) {
    815		ret = PTR_ERR(handle);
    816		mlog_errno(ret);
    817		goto out_unlock;
    818	}
    819
    820	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
    821				      OCFS2_JOURNAL_ACCESS_WRITE);
    822	if (ret) {
    823		mlog_errno(ret);
    824		goto out_commit;
    825	}
    826
    827	ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
    828				      OCFS2_JOURNAL_ACCESS_WRITE);
    829	if (ret) {
    830		mlog_errno(ret);
    831		goto out_commit;
    832	}
    833
    834	spin_lock(&oi->ip_lock);
    835	oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
    836	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
    837	di->i_refcount_loc = 0;
    838	spin_unlock(&oi->ip_lock);
    839	ocfs2_journal_dirty(handle, di_bh);
    840
     841	le32_add_cpu(&rb->rf_count, -1);
    842	ocfs2_journal_dirty(handle, blk_bh);
    843
    844	if (!rb->rf_count) {
    845		delete_tree = 1;
    846		ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
    847		ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
    848					       alloc_bh, bit, bg_blkno, 1);
    849		if (ret)
    850			mlog_errno(ret);
    851	}
    852
    853out_commit:
    854	ocfs2_commit_trans(osb, handle);
    855out_unlock:
    856	if (alloc_inode) {
    857		ocfs2_inode_unlock(alloc_inode, 1);
    858		brelse(alloc_bh);
    859	}
    860out_mutex:
    861	if (alloc_inode) {
    862		inode_unlock(alloc_inode);
    863		iput(alloc_inode);
    864	}
    865out:
    866	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
    867	if (delete_tree)
    868		ocfs2_refcount_tree_put(ref_tree);
    869	brelse(blk_bh);
    870
    871	return ret;
    872}
    873
    874static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
    875					  struct buffer_head *ref_leaf_bh,
    876					  u64 cpos, unsigned int len,
    877					  struct ocfs2_refcount_rec *ret_rec,
    878					  int *index)
    879{
    880	int i = 0;
    881	struct ocfs2_refcount_block *rb =
    882		(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
    883	struct ocfs2_refcount_rec *rec = NULL;
    884
    885	for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
    886		rec = &rb->rf_records.rl_recs[i];
    887
    888		if (le64_to_cpu(rec->r_cpos) +
    889		    le32_to_cpu(rec->r_clusters) <= cpos)
    890			continue;
    891		else if (le64_to_cpu(rec->r_cpos) > cpos)
    892			break;
    893
     894		/* ok, cpos falls in this rec. Just return. */
    895		if (ret_rec)
    896			*ret_rec = *rec;
    897		goto out;
    898	}
    899
    900	if (ret_rec) {
     901		/* We've hit a hole here, so fake the rec. */
    902		ret_rec->r_cpos = cpu_to_le64(cpos);
    903		ret_rec->r_refcount = 0;
    904		if (i < le16_to_cpu(rb->rf_records.rl_used) &&
    905		    le64_to_cpu(rec->r_cpos) < cpos + len)
    906			ret_rec->r_clusters =
    907				cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
    908		else
    909			ret_rec->r_clusters = cpu_to_le32(len);
    910	}
    911
    912out:
    913	*index = i;
    914}
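/*
 * A worked example of the hole case above, with hypothetical numbers:
 * given records { r_cpos = 0, r_clusters = 4 } and
 * { r_cpos = 10, r_clusters = 4 }, a lookup with cpos = 4, len = 10
 * finds no record containing cpos 4 and fakes
 * { r_cpos = 4, r_clusters = 6, r_refcount = 0 }: the faked length is
 * clamped to the start of the next record (10 - 4 = 6), not to len.
 */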
    915
    916/*
     917 * Try to remove the refcount tree. The mechanism is:
     918 * 1) Check whether i_clusters == 0; if not, exit.
     919 * 2) Check whether we have i_xattr_loc in the dinode; if yes, exit.
     920 * 3) Check whether we have inline xattrs stored outside; if yes, exit.
     921 * 4) Remove the tree.
    922 */
    923int ocfs2_try_remove_refcount_tree(struct inode *inode,
    924				   struct buffer_head *di_bh)
    925{
    926	int ret;
    927	struct ocfs2_inode_info *oi = OCFS2_I(inode);
    928	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
    929
    930	down_write(&oi->ip_xattr_sem);
    931	down_write(&oi->ip_alloc_sem);
    932
    933	if (oi->ip_clusters)
    934		goto out;
    935
    936	if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc)
    937		goto out;
    938
    939	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
    940	    ocfs2_has_inline_xattr_value_outside(inode, di))
    941		goto out;
    942
    943	ret = ocfs2_remove_refcount_tree(inode, di_bh);
    944	if (ret)
    945		mlog_errno(ret);
    946out:
    947	up_write(&oi->ip_alloc_sem);
    948	up_write(&oi->ip_xattr_sem);
    949	return 0;
    950}
    951
    952/*
    953 * Find the end range for a leaf refcount block indicated by
    954 * el->l_recs[index].e_blkno.
    955 */
    956static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
    957				       struct buffer_head *ref_root_bh,
    958				       struct ocfs2_extent_block *eb,
    959				       struct ocfs2_extent_list *el,
    960				       int index,  u32 *cpos_end)
    961{
    962	int ret, i, subtree_root;
    963	u32 cpos;
    964	u64 blkno;
    965	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
    966	struct ocfs2_path *left_path = NULL, *right_path = NULL;
    967	struct ocfs2_extent_tree et;
    968	struct ocfs2_extent_list *tmp_el;
    969
    970	if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
    971		/*
     972		 * We have an extent rec after index, so just use the e_cpos
    973		 * of the next extent rec.
    974		 */
    975		*cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
    976		return 0;
    977	}
    978
    979	if (!eb || !eb->h_next_leaf_blk) {
    980		/*
     981		 * This is the last extent rec, so any higher cpos must
     982		 * be stored in this leaf refcount block.
    983		 */
    984		*cpos_end = UINT_MAX;
    985		return 0;
    986	}
    987
    988	/*
    989	 * If the extent block isn't the last one, we have to find
    990	 * the subtree root between this extent block and the next
    991	 * leaf extent block and get the corresponding e_cpos from
     992 * the subtree root. Otherwise we may corrupt the b-tree.
    993	 */
    994	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
    995
    996	left_path = ocfs2_new_path_from_et(&et);
    997	if (!left_path) {
    998		ret = -ENOMEM;
    999		mlog_errno(ret);
   1000		goto out;
   1001	}
   1002
   1003	cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
   1004	ret = ocfs2_find_path(ci, left_path, cpos);
   1005	if (ret) {
   1006		mlog_errno(ret);
   1007		goto out;
   1008	}
   1009
   1010	right_path = ocfs2_new_path_from_path(left_path);
   1011	if (!right_path) {
   1012		ret = -ENOMEM;
   1013		mlog_errno(ret);
   1014		goto out;
   1015	}
   1016
   1017	ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
   1018	if (ret) {
   1019		mlog_errno(ret);
   1020		goto out;
   1021	}
   1022
   1023	ret = ocfs2_find_path(ci, right_path, cpos);
   1024	if (ret) {
   1025		mlog_errno(ret);
   1026		goto out;
   1027	}
   1028
   1029	subtree_root = ocfs2_find_subtree_root(&et, left_path,
   1030					       right_path);
   1031
   1032	tmp_el = left_path->p_node[subtree_root].el;
   1033	blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
   1034	for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) {
   1035		if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
   1036			*cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
   1037			break;
   1038		}
   1039	}
   1040
   1041	BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec));
   1042
   1043out:
   1044	ocfs2_free_path(left_path);
   1045	ocfs2_free_path(right_path);
   1046	return ret;
   1047}
   1048
   1049/*
   1050 * Given a cpos and len, try to find the refcount record which contains cpos.
   1051 * 1. If cpos can be found in one refcount record, return the record.
    1052 * 2. If cpos can't be found, return a fake record which starts at cpos
    1053 *    and ends at a value between cpos+len and the start of the next record.
   1054 *    This fake record has r_refcount = 0.
   1055 */
   1056static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
   1057				  struct buffer_head *ref_root_bh,
   1058				  u64 cpos, unsigned int len,
   1059				  struct ocfs2_refcount_rec *ret_rec,
   1060				  int *index,
   1061				  struct buffer_head **ret_bh)
   1062{
   1063	int ret = 0, i, found;
   1064	u32 low_cpos, cpos_end;
   1065	struct ocfs2_extent_list *el;
   1066	struct ocfs2_extent_rec *rec = NULL;
   1067	struct ocfs2_extent_block *eb = NULL;
   1068	struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
   1069	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
   1070	struct ocfs2_refcount_block *rb =
   1071			(struct ocfs2_refcount_block *)ref_root_bh->b_data;
   1072
   1073	if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
   1074		ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
   1075					      ret_rec, index);
   1076		*ret_bh = ref_root_bh;
   1077		get_bh(ref_root_bh);
   1078		return 0;
   1079	}
   1080
   1081	el = &rb->rf_list;
   1082	low_cpos = cpos & OCFS2_32BIT_POS_MASK;
   1083
   1084	if (el->l_tree_depth) {
   1085		ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
   1086		if (ret) {
   1087			mlog_errno(ret);
   1088			goto out;
   1089		}
   1090
   1091		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
   1092		el = &eb->h_list;
   1093
   1094		if (el->l_tree_depth) {
   1095			ret = ocfs2_error(sb,
    1096					  "refcount tree %llu has non-zero tree depth in leaf btree block %llu\n",
   1097					  (unsigned long long)ocfs2_metadata_cache_owner(ci),
   1098					  (unsigned long long)eb_bh->b_blocknr);
   1099			goto out;
   1100		}
   1101	}
   1102
   1103	found = 0;
   1104	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
   1105		rec = &el->l_recs[i];
   1106
   1107		if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
   1108			found = 1;
   1109			break;
   1110		}
   1111	}
   1112
   1113	if (found) {
   1114		ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
   1115						  eb, el, i, &cpos_end);
   1116		if (ret) {
   1117			mlog_errno(ret);
   1118			goto out;
   1119		}
   1120
   1121		if (cpos_end < low_cpos + len)
   1122			len = cpos_end - low_cpos;
   1123	}
   1124
   1125	ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
   1126					&ref_leaf_bh);
   1127	if (ret) {
   1128		mlog_errno(ret);
   1129		goto out;
   1130	}
   1131
   1132	ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
   1133				      ret_rec, index);
   1134	*ret_bh = ref_leaf_bh;
   1135out:
   1136	brelse(eb_bh);
   1137	return ret;
   1138}
   1139
   1140enum ocfs2_ref_rec_contig {
   1141	REF_CONTIG_NONE = 0,
   1142	REF_CONTIG_LEFT,
   1143	REF_CONTIG_RIGHT,
   1144	REF_CONTIG_LEFTRIGHT,
   1145};
   1146
   1147static enum ocfs2_ref_rec_contig
   1148	ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
   1149				    int index)
   1150{
   1151	if ((rb->rf_records.rl_recs[index].r_refcount ==
   1152	    rb->rf_records.rl_recs[index + 1].r_refcount) &&
   1153	    (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
   1154	    le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
   1155	    le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
   1156		return REF_CONTIG_RIGHT;
   1157
   1158	return REF_CONTIG_NONE;
   1159}
   1160
   1161static enum ocfs2_ref_rec_contig
   1162	ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
   1163				  int index)
   1164{
   1165	enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
   1166
   1167	if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
   1168		ret = ocfs2_refcount_rec_adjacent(rb, index);
   1169
   1170	if (index > 0) {
   1171		enum ocfs2_ref_rec_contig tmp;
   1172
   1173		tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
   1174
   1175		if (tmp == REF_CONTIG_RIGHT) {
   1176			if (ret == REF_CONTIG_RIGHT)
   1177				ret = REF_CONTIG_LEFTRIGHT;
   1178			else
   1179				ret = REF_CONTIG_LEFT;
   1180		}
   1181	}
   1182
   1183	return ret;
   1184}
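/*
 * Example, with hypothetical records: for rl_recs[] =
 * { cpos 0, 4 clusters, refcount 2 }, { cpos 4, 4 clusters, refcount 2 },
 * { cpos 8, 4 clusters, refcount 3 }, index 1 merges with its left
 * neighbour only (same refcount and 0 + 4 == 4), so
 * ocfs2_refcount_rec_contig() returns REF_CONTIG_LEFT; the right
 * neighbour has a different refcount and is not contiguous.
 */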
   1185
   1186static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
   1187					   int index)
   1188{
   1189	BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
   1190	       rb->rf_records.rl_recs[index+1].r_refcount);
   1191
   1192	le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
   1193		     le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
   1194
   1195	if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
   1196		memmove(&rb->rf_records.rl_recs[index + 1],
   1197			&rb->rf_records.rl_recs[index + 2],
   1198			sizeof(struct ocfs2_refcount_rec) *
   1199			(le16_to_cpu(rb->rf_records.rl_used) - index - 2));
   1200
   1201	memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
   1202	       0, sizeof(struct ocfs2_refcount_rec));
   1203	le16_add_cpu(&rb->rf_records.rl_used, -1);
   1204}
   1205
   1206/*
   1207 * Merge the refcount rec if we are contiguous with the adjacent recs.
   1208 */
   1209static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
   1210				     int index)
   1211{
   1212	enum ocfs2_ref_rec_contig contig =
   1213				ocfs2_refcount_rec_contig(rb, index);
   1214
   1215	if (contig == REF_CONTIG_NONE)
   1216		return;
   1217
   1218	if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
   1219		BUG_ON(index == 0);
   1220		index--;
   1221	}
   1222
   1223	ocfs2_rotate_refcount_rec_left(rb, index);
   1224
   1225	if (contig == REF_CONTIG_LEFTRIGHT)
   1226		ocfs2_rotate_refcount_rec_left(rb, index);
   1227}
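/*
 * Continuing the example above: if all three records shared
 * refcount 2, ocfs2_refcount_rec_contig() would return
 * REF_CONTIG_LEFTRIGHT for index 1; the merge then steps index back
 * to 0 and rotates left twice, collapsing the three records into a
 * single { cpos 0, 12 clusters, refcount 2 } with rl_used reduced
 * from 3 to 1.
 */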
   1228
   1229/*
    1230 * Change the refcount indexed by "index" in ref_leaf_bh.
   1231 * If refcount reaches 0, remove it.
   1232 */
   1233static int ocfs2_change_refcount_rec(handle_t *handle,
   1234				     struct ocfs2_caching_info *ci,
   1235				     struct buffer_head *ref_leaf_bh,
   1236				     int index, int merge, int change)
   1237{
   1238	int ret;
   1239	struct ocfs2_refcount_block *rb =
   1240			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
   1241	struct ocfs2_refcount_list *rl = &rb->rf_records;
   1242	struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
   1243
   1244	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
   1245				      OCFS2_JOURNAL_ACCESS_WRITE);
   1246	if (ret) {
   1247		mlog_errno(ret);
   1248		goto out;
   1249	}
   1250
   1251	trace_ocfs2_change_refcount_rec(
   1252		(unsigned long long)ocfs2_metadata_cache_owner(ci),
   1253		index, le32_to_cpu(rec->r_refcount), change);
   1254	le32_add_cpu(&rec->r_refcount, change);
   1255
   1256	if (!rec->r_refcount) {
   1257		if (index != le16_to_cpu(rl->rl_used) - 1) {
   1258			memmove(rec, rec + 1,
   1259				(le16_to_cpu(rl->rl_used) - index - 1) *
   1260				sizeof(struct ocfs2_refcount_rec));
   1261			memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
   1262			       0, sizeof(struct ocfs2_refcount_rec));
   1263		}
   1264
   1265		le16_add_cpu(&rl->rl_used, -1);
   1266	} else if (merge)
   1267		ocfs2_refcount_rec_merge(rb, index);
   1268
   1269	ocfs2_journal_dirty(handle, ref_leaf_bh);
   1270out:
   1271	return ret;
   1272}
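/*
 * Example of the removal path above, with hypothetical numbers: with
 * rl_used = 3 and change = -1 driving rl_recs[1].r_refcount to 0, the
 * memmove() shifts rl_recs[2] down into slot 1, the now-duplicate
 * last slot is zeroed, and rl_used drops to 2.
 */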
   1273
   1274static int ocfs2_expand_inline_ref_root(handle_t *handle,
   1275					struct ocfs2_caching_info *ci,
   1276					struct buffer_head *ref_root_bh,
   1277					struct buffer_head **ref_leaf_bh,
   1278					struct ocfs2_alloc_context *meta_ac)
   1279{
   1280	int ret;
   1281	u16 suballoc_bit_start;
   1282	u32 num_got;
   1283	u64 suballoc_loc, blkno;
   1284	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
   1285	struct buffer_head *new_bh = NULL;
   1286	struct ocfs2_refcount_block *new_rb;
   1287	struct ocfs2_refcount_block *root_rb =
   1288			(struct ocfs2_refcount_block *)ref_root_bh->b_data;
   1289
   1290	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
   1291				      OCFS2_JOURNAL_ACCESS_WRITE);
   1292	if (ret) {
   1293		mlog_errno(ret);
   1294		goto out;
   1295	}
   1296
   1297	ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
   1298				   &suballoc_bit_start, &num_got,
   1299				   &blkno);
   1300	if (ret) {
   1301		mlog_errno(ret);
   1302		goto out;
   1303	}
   1304
   1305	new_bh = sb_getblk(sb, blkno);
   1306	if (new_bh == NULL) {
   1307		ret = -ENOMEM;
   1308		mlog_errno(ret);
   1309		goto out;
   1310	}
   1311	ocfs2_set_new_buffer_uptodate(ci, new_bh);
   1312
   1313	ret = ocfs2_journal_access_rb(handle, ci, new_bh,
   1314				      OCFS2_JOURNAL_ACCESS_CREATE);
   1315	if (ret) {
   1316		mlog_errno(ret);
   1317		goto out;
   1318	}
   1319
   1320	/*
   1321	 * Initialize ocfs2_refcount_block.
    1322	 * It should contain the same information as the old root,
    1323	 * so just memcpy it and change the corresponding fields.
   1324	 */
   1325	memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
   1326
   1327	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
   1328	new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
   1329	new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
   1330	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
   1331	new_rb->rf_blkno = cpu_to_le64(blkno);
   1332	new_rb->rf_cpos = cpu_to_le32(0);
   1333	new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
   1334	new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
   1335	ocfs2_journal_dirty(handle, new_bh);
   1336
   1337	/* Now change the root. */
   1338	memset(&root_rb->rf_list, 0, sb->s_blocksize -
   1339	       offsetof(struct ocfs2_refcount_block, rf_list));
   1340	root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
   1341	root_rb->rf_clusters = cpu_to_le32(1);
   1342	root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
   1343	root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
   1344	root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
   1345	root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
   1346
   1347	ocfs2_journal_dirty(handle, ref_root_bh);
   1348
   1349	trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno,
   1350		le16_to_cpu(new_rb->rf_records.rl_used));
   1351
   1352	*ref_leaf_bh = new_bh;
   1353	new_bh = NULL;
   1354out:
   1355	brelse(new_bh);
   1356	return ret;
   1357}
   1358
   1359static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
   1360					   struct ocfs2_refcount_rec *next)
   1361{
   1362	if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
   1363		ocfs2_get_ref_rec_low_cpos(next))
   1364		return 1;
   1365
   1366	return 0;
   1367}
   1368
   1369static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
   1370{
   1371	const struct ocfs2_refcount_rec *l = a, *r = b;
   1372	u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
   1373	u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
   1374
   1375	if (l_cpos > r_cpos)
   1376		return 1;
   1377	if (l_cpos < r_cpos)
   1378		return -1;
   1379	return 0;
   1380}
   1381
   1382static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
   1383{
   1384	const struct ocfs2_refcount_rec *l = a, *r = b;
   1385	u64 l_cpos = le64_to_cpu(l->r_cpos);
   1386	u64 r_cpos = le64_to_cpu(r->r_cpos);
   1387
   1388	if (l_cpos > r_cpos)
   1389		return 1;
   1390	if (l_cpos < r_cpos)
   1391		return -1;
   1392	return 0;
   1393}
   1394
   1395static void swap_refcount_rec(void *a, void *b, int size)
   1396{
   1397	struct ocfs2_refcount_rec *l = a, *r = b;
   1398
   1399	swap(*l, *r);
   1400}
   1401
   1402/*
    1403 * The refcount recs are ordered by their 64-bit cpos,
    1404 * but we will use the low 32 bits as the e_cpos in the b-tree,
    1405 * so we need to make sure that this pos doesn't intersect with the others.
    1406 *
    1407 * Note: the refcount recs are already sorted by their low 32-bit cpos,
    1408 *       so just try the middle pos first; we will exit when we find
    1409 *       a good position.
   1410 */
   1411static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
   1412					 u32 *split_pos, int *split_index)
   1413{
   1414	int num_used = le16_to_cpu(rl->rl_used);
   1415	int delta, middle = num_used / 2;
   1416
   1417	for (delta = 0; delta < middle; delta++) {
   1418		/* Let's check delta earlier than middle */
   1419		if (ocfs2_refcount_rec_no_intersect(
   1420					&rl->rl_recs[middle - delta - 1],
   1421					&rl->rl_recs[middle - delta])) {
   1422			*split_index = middle - delta;
   1423			break;
   1424		}
   1425
   1426		/* For even counts, don't walk off the end */
   1427		if ((middle + delta + 1) == num_used)
   1428			continue;
   1429
   1430		/* Now try delta past middle */
   1431		if (ocfs2_refcount_rec_no_intersect(
   1432					&rl->rl_recs[middle + delta],
   1433					&rl->rl_recs[middle + delta + 1])) {
   1434			*split_index = middle + delta + 1;
   1435			break;
   1436		}
   1437	}
   1438
   1439	if (delta >= middle)
   1440		return -ENOSPC;
   1441
   1442	*split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
   1443	return 0;
   1444}
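/*
 * Example, with hypothetical (low cpos, clusters) pairs: for
 * rl_recs[] = (0,4) (4,4) (6,4) (10,4), num_used = 4 and middle = 2.
 * At delta = 0, recs 1 and 2 intersect in low-cpos space (4 + 4 > 6),
 * but recs 2 and 3 do not (6 + 4 <= 10), so the search settles on
 * *split_index = 3 and *split_pos = 10.
 */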
   1445
   1446static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
   1447					    struct buffer_head *new_bh,
   1448					    u32 *split_cpos)
   1449{
   1450	int split_index = 0, num_moved, ret;
   1451	u32 cpos = 0;
   1452	struct ocfs2_refcount_block *rb =
   1453			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
   1454	struct ocfs2_refcount_list *rl = &rb->rf_records;
   1455	struct ocfs2_refcount_block *new_rb =
   1456			(struct ocfs2_refcount_block *)new_bh->b_data;
   1457	struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
   1458
   1459	trace_ocfs2_divide_leaf_refcount_block(
   1460		(unsigned long long)ref_leaf_bh->b_blocknr,
   1461		le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used));
   1462
   1463	/*
   1464	 * XXX: Improvement later.
    1465	 * If we know all the high 32 bits of the cpos are the same, there is no need to sort.
   1466	 *
   1467	 * In order to make the whole process safe, we do:
   1468	 * 1. sort the entries by their low 32 bit cpos first so that we can
   1469	 *    find the split cpos easily.
   1470	 * 2. call ocfs2_insert_extent to insert the new refcount block.
   1471	 * 3. move the refcount rec to the new block.
   1472	 * 4. sort the entries by their 64 bit cpos.
   1473	 * 5. dirty the new_rb and rb.
   1474	 */
   1475	sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
   1476	     sizeof(struct ocfs2_refcount_rec),
   1477	     cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
   1478
   1479	ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
   1480	if (ret) {
   1481		mlog_errno(ret);
   1482		return ret;
   1483	}
   1484
   1485	new_rb->rf_cpos = cpu_to_le32(cpos);
   1486
   1487	/* move refcount records starting from split_index to the new block. */
   1488	num_moved = le16_to_cpu(rl->rl_used) - split_index;
   1489	memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
   1490	       num_moved * sizeof(struct ocfs2_refcount_rec));
   1491
    1492	/* ok, remove the entries we just moved over to the other block. */
   1493	memset(&rl->rl_recs[split_index], 0,
   1494	       num_moved * sizeof(struct ocfs2_refcount_rec));
   1495
   1496	/* change old and new rl_used accordingly. */
   1497	le16_add_cpu(&rl->rl_used, -num_moved);
   1498	new_rl->rl_used = cpu_to_le16(num_moved);
   1499
   1500	sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
   1501	     sizeof(struct ocfs2_refcount_rec),
   1502	     cmp_refcount_rec_by_cpos, swap_refcount_rec);
   1503
   1504	sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
   1505	     sizeof(struct ocfs2_refcount_rec),
   1506	     cmp_refcount_rec_by_cpos, swap_refcount_rec);
   1507
   1508	*split_cpos = cpos;
   1509	return 0;
   1510}
   1511
   1512static int ocfs2_new_leaf_refcount_block(handle_t *handle,
   1513					 struct ocfs2_caching_info *ci,
   1514					 struct buffer_head *ref_root_bh,
   1515					 struct buffer_head *ref_leaf_bh,
   1516					 struct ocfs2_alloc_context *meta_ac)
   1517{
   1518	int ret;
   1519	u16 suballoc_bit_start;
   1520	u32 num_got, new_cpos;
   1521	u64 suballoc_loc, blkno;
   1522	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
   1523	struct ocfs2_refcount_block *root_rb =
   1524			(struct ocfs2_refcount_block *)ref_root_bh->b_data;
   1525	struct buffer_head *new_bh = NULL;
   1526	struct ocfs2_refcount_block *new_rb;
   1527	struct ocfs2_extent_tree ref_et;
   1528
   1529	BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
   1530
   1531	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
   1532				      OCFS2_JOURNAL_ACCESS_WRITE);
   1533	if (ret) {
   1534		mlog_errno(ret);
   1535		goto out;
   1536	}
   1537
   1538	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
   1539				      OCFS2_JOURNAL_ACCESS_WRITE);
   1540	if (ret) {
   1541		mlog_errno(ret);
   1542		goto out;
   1543	}
   1544
   1545	ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
   1546				   &suballoc_bit_start, &num_got,
   1547				   &blkno);
   1548	if (ret) {
   1549		mlog_errno(ret);
   1550		goto out;
   1551	}
   1552
   1553	new_bh = sb_getblk(sb, blkno);
   1554	if (new_bh == NULL) {
   1555		ret = -ENOMEM;
   1556		mlog_errno(ret);
   1557		goto out;
   1558	}
   1559	ocfs2_set_new_buffer_uptodate(ci, new_bh);
   1560
   1561	ret = ocfs2_journal_access_rb(handle, ci, new_bh,
   1562				      OCFS2_JOURNAL_ACCESS_CREATE);
   1563	if (ret) {
   1564		mlog_errno(ret);
   1565		goto out;
   1566	}
   1567
   1568	/* Initialize ocfs2_refcount_block. */
   1569	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
   1570	memset(new_rb, 0, sb->s_blocksize);
   1571	strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
   1572	new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
   1573	new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
   1574	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
   1575	new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
   1576	new_rb->rf_blkno = cpu_to_le64(blkno);
   1577	new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
   1578	new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
   1579	new_rb->rf_records.rl_count =
   1580				cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
   1581	new_rb->rf_generation = root_rb->rf_generation;
   1582
   1583	ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
   1584	if (ret) {
   1585		mlog_errno(ret);
   1586		goto out;
   1587	}
   1588
   1589	ocfs2_journal_dirty(handle, ref_leaf_bh);
   1590	ocfs2_journal_dirty(handle, new_bh);
   1591
   1592	ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
   1593
   1594	trace_ocfs2_new_leaf_refcount_block(
   1595			(unsigned long long)new_bh->b_blocknr, new_cpos);
   1596
   1597	/* Insert the new leaf block with the specific offset cpos. */
   1598	ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
   1599				  1, 0, meta_ac);
   1600	if (ret)
   1601		mlog_errno(ret);
   1602
   1603out:
   1604	brelse(new_bh);
   1605	return ret;
   1606}
   1607
   1608static int ocfs2_expand_refcount_tree(handle_t *handle,
   1609				      struct ocfs2_caching_info *ci,
   1610				      struct buffer_head *ref_root_bh,
   1611				      struct buffer_head *ref_leaf_bh,
   1612				      struct ocfs2_alloc_context *meta_ac)
   1613{
   1614	int ret;
   1615	struct buffer_head *expand_bh = NULL;
   1616
   1617	if (ref_root_bh == ref_leaf_bh) {
   1618		/*
   1619		 * the old root bh hasn't been expanded to a b-tree,
   1620		 * so expand it first.
   1621		 */
   1622		ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
   1623						   &expand_bh, meta_ac);
   1624		if (ret) {
   1625			mlog_errno(ret);
   1626			goto out;
   1627		}
   1628	} else {
   1629		expand_bh = ref_leaf_bh;
   1630		get_bh(expand_bh);
   1631	}
   1632
   1633
    1634	/* Now add a new refcount block into the tree. */
   1635	ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
   1636					    expand_bh, meta_ac);
   1637	if (ret)
   1638		mlog_errno(ret);
   1639out:
   1640	brelse(expand_bh);
   1641	return ret;
   1642}
   1643
   1644/*
    1645 * Adjust the extent rec in the b-tree representing ref_leaf_bh.
    1646 *
    1647 * Only called when we have inserted a new refcount rec at index 0,
    1648 * which means ocfs2_extent_rec.e_cpos may need to change.
   1649 */
   1650static int ocfs2_adjust_refcount_rec(handle_t *handle,
   1651				     struct ocfs2_caching_info *ci,
   1652				     struct buffer_head *ref_root_bh,
   1653				     struct buffer_head *ref_leaf_bh,
   1654				     struct ocfs2_refcount_rec *rec)
   1655{
   1656	int ret = 0, i;
   1657	u32 new_cpos, old_cpos;
   1658	struct ocfs2_path *path = NULL;
   1659	struct ocfs2_extent_tree et;
   1660	struct ocfs2_refcount_block *rb =
   1661		(struct ocfs2_refcount_block *)ref_root_bh->b_data;
   1662	struct ocfs2_extent_list *el;
   1663
   1664	if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
   1665		goto out;
   1666
   1667	rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
   1668	old_cpos = le32_to_cpu(rb->rf_cpos);
   1669	new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
   1670	if (old_cpos <= new_cpos)
   1671		goto out;
   1672
   1673	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
   1674
   1675	path = ocfs2_new_path_from_et(&et);
   1676	if (!path) {
   1677		ret = -ENOMEM;
   1678		mlog_errno(ret);
   1679		goto out;
   1680	}
   1681
   1682	ret = ocfs2_find_path(ci, path, old_cpos);
   1683	if (ret) {
   1684		mlog_errno(ret);
   1685		goto out;
   1686	}
   1687
   1688	/*
   1689	 * 2 more credits, one for the leaf refcount block, one for
    1690	 * the extent block that contains the extent rec.
   1691	 */
   1692	ret = ocfs2_extend_trans(handle, 2);
   1693	if (ret < 0) {
   1694		mlog_errno(ret);
   1695		goto out;
   1696	}
   1697
   1698	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
   1699				      OCFS2_JOURNAL_ACCESS_WRITE);
   1700	if (ret < 0) {
   1701		mlog_errno(ret);
   1702		goto out;
   1703	}
   1704
   1705	ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
   1706				      OCFS2_JOURNAL_ACCESS_WRITE);
   1707	if (ret < 0) {
   1708		mlog_errno(ret);
   1709		goto out;
   1710	}
   1711
   1712	/* change the leaf extent block first. */
   1713	el = path_leaf_el(path);
   1714
   1715	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
   1716		if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
   1717			break;
   1718
   1719	BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
   1720
   1721	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
   1722
   1723	/* change the r_cpos in the leaf block. */
   1724	rb->rf_cpos = cpu_to_le32(new_cpos);
   1725
   1726	ocfs2_journal_dirty(handle, path_leaf_bh(path));
   1727	ocfs2_journal_dirty(handle, ref_leaf_bh);
   1728
   1729out:
   1730	ocfs2_free_path(path);
   1731	return ret;
   1732}
   1733
   1734static int ocfs2_insert_refcount_rec(handle_t *handle,
   1735				     struct ocfs2_caching_info *ci,
   1736				     struct buffer_head *ref_root_bh,
   1737				     struct buffer_head *ref_leaf_bh,
   1738				     struct ocfs2_refcount_rec *rec,
   1739				     int index, int merge,
   1740				     struct ocfs2_alloc_context *meta_ac)
   1741{
   1742	int ret;
   1743	struct ocfs2_refcount_block *rb =
   1744			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
   1745	struct ocfs2_refcount_list *rf_list = &rb->rf_records;
   1746	struct buffer_head *new_bh = NULL;
   1747
   1748	BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
   1749
   1750	if (rf_list->rl_used == rf_list->rl_count) {
   1751		u64 cpos = le64_to_cpu(rec->r_cpos);
   1752		u32 len = le32_to_cpu(rec->r_clusters);
   1753
   1754		ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
   1755						 ref_leaf_bh, meta_ac);
   1756		if (ret) {
   1757			mlog_errno(ret);
   1758			goto out;
   1759		}
   1760
   1761		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
   1762					     cpos, len, NULL, &index,
   1763					     &new_bh);
   1764		if (ret) {
   1765			mlog_errno(ret);
   1766			goto out;
   1767		}
   1768
   1769		ref_leaf_bh = new_bh;
   1770		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
   1771		rf_list = &rb->rf_records;
   1772	}
   1773
   1774	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
   1775				      OCFS2_JOURNAL_ACCESS_WRITE);
   1776	if (ret) {
   1777		mlog_errno(ret);
   1778		goto out;
   1779	}
   1780
   1781	if (index < le16_to_cpu(rf_list->rl_used))
   1782		memmove(&rf_list->rl_recs[index + 1],
   1783			&rf_list->rl_recs[index],
   1784			(le16_to_cpu(rf_list->rl_used) - index) *
   1785			 sizeof(struct ocfs2_refcount_rec));
   1786
   1787	trace_ocfs2_insert_refcount_rec(
   1788		(unsigned long long)ref_leaf_bh->b_blocknr, index,
   1789		(unsigned long long)le64_to_cpu(rec->r_cpos),
   1790		le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount));
   1791
   1792	rf_list->rl_recs[index] = *rec;
   1793
   1794	le16_add_cpu(&rf_list->rl_used, 1);
   1795
   1796	if (merge)
   1797		ocfs2_refcount_rec_merge(rb, index);
   1798
   1799	ocfs2_journal_dirty(handle, ref_leaf_bh);
   1800
   1801	if (index == 0) {
   1802		ret = ocfs2_adjust_refcount_rec(handle, ci,
   1803						ref_root_bh,
   1804						ref_leaf_bh, rec);
   1805		if (ret)
   1806			mlog_errno(ret);
   1807	}
   1808out:
   1809	brelse(new_bh);
   1810	return ret;
   1811}
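
/*
 * Editor's note: a minimal user-space sketch (not ocfs2 code, kept
 * under #if 0) of the insertion idiom used above: the tail of the
 * sorted record array is shifted one slot right with memmove() before
 * the new entry is written at "index".  All names are hypothetical.
 */
#if 0
#include <assert.h>
#include <string.h>

static void insert_at(int *a, int *used, int cap, int idx, int v)
{
	assert(*used < cap && idx <= *used);
	if (idx < *used)	/* shift a[idx..used-1] right by one */
		memmove(&a[idx + 1], &a[idx],
			(*used - idx) * sizeof(a[0]));
	a[idx] = v;
	(*used)++;
}
#endif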
   1812
   1813/*
   1814 * Split the refcount_rec indexed by "index" in ref_leaf_bh.
    1815 * This is much simpler than our b-tree code.
    1816 * split_rec is the new refcount rec we want to insert.
    1817 * If split_rec->r_refcount > 0, we are changing the refcount (in case we
    1818 * increase the refcount or decrease a refcount to non-zero).
    1819 * If split_rec->r_refcount == 0, we are punching a hole in the current
    1820 * refcount rec (in case we decrease a refcount to zero).
   1821 */
   1822static int ocfs2_split_refcount_rec(handle_t *handle,
   1823				    struct ocfs2_caching_info *ci,
   1824				    struct buffer_head *ref_root_bh,
   1825				    struct buffer_head *ref_leaf_bh,
   1826				    struct ocfs2_refcount_rec *split_rec,
   1827				    int index, int merge,
   1828				    struct ocfs2_alloc_context *meta_ac,
   1829				    struct ocfs2_cached_dealloc_ctxt *dealloc)
   1830{
   1831	int ret, recs_need;
   1832	u32 len;
   1833	struct ocfs2_refcount_block *rb =
   1834			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
   1835	struct ocfs2_refcount_list *rf_list = &rb->rf_records;
   1836	struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
   1837	struct ocfs2_refcount_rec *tail_rec = NULL;
   1838	struct buffer_head *new_bh = NULL;
   1839
   1840	BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
   1841
   1842	trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos),
   1843		le32_to_cpu(orig_rec->r_clusters),
   1844		le32_to_cpu(orig_rec->r_refcount),
   1845		le64_to_cpu(split_rec->r_cpos),
   1846		le32_to_cpu(split_rec->r_clusters),
   1847		le32_to_cpu(split_rec->r_refcount));
   1848
   1849	/*
   1850	 * If we just need to split the header or tail clusters,
    1851	 * no more recs are needed and the split alone is OK.
    1852	 * Otherwise we need at least one new rec.
   1853	 */
   1854	if (!split_rec->r_refcount &&
   1855	    (split_rec->r_cpos == orig_rec->r_cpos ||
   1856	     le64_to_cpu(split_rec->r_cpos) +
   1857	     le32_to_cpu(split_rec->r_clusters) ==
   1858	     le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
   1859		recs_need = 0;
   1860	else
   1861		recs_need = 1;
   1862
   1863	/*
    1864	 * We need one more rec if we split in the middle and the new rec has
   1865	 * some refcount in it.
   1866	 */
   1867	if (split_rec->r_refcount &&
   1868	    (split_rec->r_cpos != orig_rec->r_cpos &&
   1869	     le64_to_cpu(split_rec->r_cpos) +
   1870	     le32_to_cpu(split_rec->r_clusters) !=
   1871	     le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
   1872		recs_need++;
   1873
    1874	/* If the leaf block doesn't have enough records, expand it. */
   1875	if (le16_to_cpu(rf_list->rl_used) + recs_need >
   1876					 le16_to_cpu(rf_list->rl_count)) {
   1877		struct ocfs2_refcount_rec tmp_rec;
   1878		u64 cpos = le64_to_cpu(orig_rec->r_cpos);
   1879		len = le32_to_cpu(orig_rec->r_clusters);
   1880		ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
   1881						 ref_leaf_bh, meta_ac);
   1882		if (ret) {
   1883			mlog_errno(ret);
   1884			goto out;
   1885		}
   1886
   1887		/*
   1888		 * We have to re-get it since now cpos may be moved to
   1889		 * another leaf block.
   1890		 */
   1891		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
   1892					     cpos, len, &tmp_rec, &index,
   1893					     &new_bh);
   1894		if (ret) {
   1895			mlog_errno(ret);
   1896			goto out;
   1897		}
   1898
   1899		ref_leaf_bh = new_bh;
   1900		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
   1901		rf_list = &rb->rf_records;
   1902		orig_rec = &rf_list->rl_recs[index];
   1903	}
   1904
   1905	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
   1906				      OCFS2_JOURNAL_ACCESS_WRITE);
   1907	if (ret) {
   1908		mlog_errno(ret);
   1909		goto out;
   1910	}
   1911
   1912	/*
    1913	 * We have calculated how many new records we need and stored
    1914	 * the number in recs_need, so make enough space first by moving
    1915	 * the records after "index" to the end.
   1916	 */
   1917	if (index != le16_to_cpu(rf_list->rl_used) - 1)
   1918		memmove(&rf_list->rl_recs[index + 1 + recs_need],
   1919			&rf_list->rl_recs[index + 1],
   1920			(le16_to_cpu(rf_list->rl_used) - index - 1) *
   1921			 sizeof(struct ocfs2_refcount_rec));
   1922
   1923	len = (le64_to_cpu(orig_rec->r_cpos) +
   1924	      le32_to_cpu(orig_rec->r_clusters)) -
   1925	      (le64_to_cpu(split_rec->r_cpos) +
   1926	      le32_to_cpu(split_rec->r_clusters));
   1927
   1928	/*
    1929	 * If we have "len", then we will split off the tail and move it
   1930	 * to the end of the space we have just spared.
   1931	 */
   1932	if (len) {
   1933		tail_rec = &rf_list->rl_recs[index + recs_need];
   1934
   1935		memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
   1936		le64_add_cpu(&tail_rec->r_cpos,
   1937			     le32_to_cpu(tail_rec->r_clusters) - len);
   1938		tail_rec->r_clusters = cpu_to_le32(len);
   1939	}
   1940
   1941	/*
   1942	 * If the split pos isn't the same as the original one, we need to
   1943	 * split in the head.
   1944	 *
    1945	 * Note: we may have split_rec.r_refcount = 0, recs_need = 0 and
    1946	 * len > 0, which means we just cut the head from orig_rec; in that
    1947	 * case orig_rec was already modified above (it now holds the tail),
    1948	 * so checking r_cpos alone would mislead, hence tail_rec != orig_rec.
   1949	 */
   1950	if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
   1951		len = le64_to_cpu(split_rec->r_cpos) -
   1952		      le64_to_cpu(orig_rec->r_cpos);
   1953		orig_rec->r_clusters = cpu_to_le32(len);
   1954		index++;
   1955	}
   1956
   1957	le16_add_cpu(&rf_list->rl_used, recs_need);
   1958
   1959	if (split_rec->r_refcount) {
   1960		rf_list->rl_recs[index] = *split_rec;
   1961		trace_ocfs2_split_refcount_rec_insert(
   1962			(unsigned long long)ref_leaf_bh->b_blocknr, index,
   1963			(unsigned long long)le64_to_cpu(split_rec->r_cpos),
   1964			le32_to_cpu(split_rec->r_clusters),
   1965			le32_to_cpu(split_rec->r_refcount));
   1966
   1967		if (merge)
   1968			ocfs2_refcount_rec_merge(rb, index);
   1969	}
   1970
   1971	ocfs2_journal_dirty(handle, ref_leaf_bh);
   1972
   1973out:
   1974	brelse(new_bh);
   1975	return ret;
   1976}
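
/*
 * Editor's note: a user-space sketch (not ocfs2 code, kept under
 * #if 0) of the record budget computed above.  Splitting
 * [s_cpos, s_cpos + s_len) out of [o_cpos, o_cpos + o_len) leaves a
 * head of (s_cpos - o_cpos) clusters and a tail of
 * (o_cpos + o_len) - (s_cpos + s_len) clusters, so a middle split
 * with a live refcount needs two extra records.  Values are made up.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t o_cpos = 100, s_cpos = 110;
	uint32_t o_len = 50, s_len = 20, refcount = 1;
	uint32_t head = (uint32_t)(s_cpos - o_cpos);		/* 10 */
	uint32_t tail = (uint32_t)((o_cpos + o_len) -
				   (s_cpos + s_len));		/* 20 */
	int recs_need = (!refcount && (!head || !tail)) ? 0 : 1;

	if (refcount && head && tail)
		recs_need++;	/* middle split: tail piece + new rec */
	printf("head=%u tail=%u recs_need=%d\n", head, tail, recs_need);
	return 0;
}
#endif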
   1977
   1978static int __ocfs2_increase_refcount(handle_t *handle,
   1979				     struct ocfs2_caching_info *ci,
   1980				     struct buffer_head *ref_root_bh,
   1981				     u64 cpos, u32 len, int merge,
   1982				     struct ocfs2_alloc_context *meta_ac,
   1983				     struct ocfs2_cached_dealloc_ctxt *dealloc)
   1984{
   1985	int ret = 0, index;
   1986	struct buffer_head *ref_leaf_bh = NULL;
   1987	struct ocfs2_refcount_rec rec;
   1988	unsigned int set_len = 0;
   1989
   1990	trace_ocfs2_increase_refcount_begin(
   1991	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
   1992	     (unsigned long long)cpos, len);
   1993
   1994	while (len) {
   1995		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
   1996					     cpos, len, &rec, &index,
   1997					     &ref_leaf_bh);
   1998		if (ret) {
   1999			mlog_errno(ret);
   2000			goto out;
   2001		}
   2002
   2003		set_len = le32_to_cpu(rec.r_clusters);
   2004
   2005		/*
   2006		 * Here we may meet with 3 situations:
   2007		 *
   2008		 * 1. If we find an already existing record, and the length
   2009		 *    is the same, cool, we just need to increase the r_refcount
   2010		 *    and it is OK.
   2011		 * 2. If we find a hole, just insert it with r_refcount = 1.
   2012		 * 3. If we are in the middle of one extent record, split
   2013		 *    it.
   2014		 */
   2015		if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
   2016		    set_len <= len) {
   2017			trace_ocfs2_increase_refcount_change(
   2018				(unsigned long long)cpos, set_len,
   2019				le32_to_cpu(rec.r_refcount));
   2020			ret = ocfs2_change_refcount_rec(handle, ci,
   2021							ref_leaf_bh, index,
   2022							merge, 1);
   2023			if (ret) {
   2024				mlog_errno(ret);
   2025				goto out;
   2026			}
   2027		} else if (!rec.r_refcount) {
   2028			rec.r_refcount = cpu_to_le32(1);
   2029
   2030			trace_ocfs2_increase_refcount_insert(
   2031			     (unsigned long long)le64_to_cpu(rec.r_cpos),
   2032			     set_len);
   2033			ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
   2034							ref_leaf_bh,
   2035							&rec, index,
   2036							merge, meta_ac);
   2037			if (ret) {
   2038				mlog_errno(ret);
   2039				goto out;
   2040			}
   2041		} else  {
   2042			set_len = min((u64)(cpos + len),
   2043				      le64_to_cpu(rec.r_cpos) + set_len) - cpos;
   2044			rec.r_cpos = cpu_to_le64(cpos);
   2045			rec.r_clusters = cpu_to_le32(set_len);
   2046			le32_add_cpu(&rec.r_refcount, 1);
   2047
   2048			trace_ocfs2_increase_refcount_split(
   2049			     (unsigned long long)le64_to_cpu(rec.r_cpos),
   2050			     set_len, le32_to_cpu(rec.r_refcount));
   2051			ret = ocfs2_split_refcount_rec(handle, ci,
   2052						       ref_root_bh, ref_leaf_bh,
   2053						       &rec, index, merge,
   2054						       meta_ac, dealloc);
   2055			if (ret) {
   2056				mlog_errno(ret);
   2057				goto out;
   2058			}
   2059		}
   2060
   2061		cpos += set_len;
   2062		len -= set_len;
   2063		brelse(ref_leaf_bh);
   2064		ref_leaf_bh = NULL;
   2065	}
   2066
   2067out:
   2068	brelse(ref_leaf_bh);
   2069	return ret;
   2070}
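
/*
 * Editor's note: a user-space sketch (not ocfs2 code, kept under
 * #if 0) of the clipping step in the loop above: the request
 * [cpos, cpos + len) is clipped against the record that covers cpos,
 * and the loop then advances by the clipped length.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

static uint32_t clip_len(uint64_t cpos, uint32_t len,
			 uint64_t r_cpos, uint32_t r_len)
{
	uint64_t end = cpos + len, r_end = r_cpos + r_len;

	return (uint32_t)((end < r_end ? end : r_end) - cpos);
}

int main(void)
{
	/* request [10, 40) against a record covering [0, 25) -> 15 */
	printf("%u\n", clip_len(10, 30, 0, 25));
	return 0;
}
#endif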
   2071
   2072static int ocfs2_remove_refcount_extent(handle_t *handle,
   2073				struct ocfs2_caching_info *ci,
   2074				struct buffer_head *ref_root_bh,
   2075				struct buffer_head *ref_leaf_bh,
   2076				struct ocfs2_alloc_context *meta_ac,
   2077				struct ocfs2_cached_dealloc_ctxt *dealloc)
   2078{
   2079	int ret;
   2080	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
   2081	struct ocfs2_refcount_block *rb =
   2082			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
   2083	struct ocfs2_extent_tree et;
   2084
   2085	BUG_ON(rb->rf_records.rl_used);
   2086
   2087	trace_ocfs2_remove_refcount_extent(
   2088		(unsigned long long)ocfs2_metadata_cache_owner(ci),
   2089		(unsigned long long)ref_leaf_bh->b_blocknr,
   2090		le32_to_cpu(rb->rf_cpos));
   2091
   2092	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
   2093	ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
   2094				  1, meta_ac, dealloc);
   2095	if (ret) {
   2096		mlog_errno(ret);
   2097		goto out;
   2098	}
   2099
   2100	ocfs2_remove_from_cache(ci, ref_leaf_bh);
   2101
   2102	/*
   2103	 * add the freed block to the dealloc so that it will be freed
   2104	 * when we run dealloc.
   2105	 */
   2106	ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
   2107					le16_to_cpu(rb->rf_suballoc_slot),
   2108					le64_to_cpu(rb->rf_suballoc_loc),
   2109					le64_to_cpu(rb->rf_blkno),
   2110					le16_to_cpu(rb->rf_suballoc_bit));
   2111	if (ret) {
   2112		mlog_errno(ret);
   2113		goto out;
   2114	}
   2115
   2116	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
   2117				      OCFS2_JOURNAL_ACCESS_WRITE);
   2118	if (ret) {
   2119		mlog_errno(ret);
   2120		goto out;
   2121	}
   2122
   2123	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
   2124
   2125	le32_add_cpu(&rb->rf_clusters, -1);
   2126
   2127	/*
   2128	 * check whether we need to restore the root refcount block if
    2129	 * there is no leaf extent block at all.
   2130	 */
   2131	if (!rb->rf_list.l_next_free_rec) {
   2132		BUG_ON(rb->rf_clusters);
   2133
   2134		trace_ocfs2_restore_refcount_block(
   2135		     (unsigned long long)ref_root_bh->b_blocknr);
   2136
   2137		rb->rf_flags = 0;
   2138		rb->rf_parent = 0;
   2139		rb->rf_cpos = 0;
   2140		memset(&rb->rf_records, 0, sb->s_blocksize -
   2141		       offsetof(struct ocfs2_refcount_block, rf_records));
   2142		rb->rf_records.rl_count =
   2143				cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
   2144	}
   2145
   2146	ocfs2_journal_dirty(handle, ref_root_bh);
   2147
   2148out:
   2149	return ret;
   2150}
   2151
   2152int ocfs2_increase_refcount(handle_t *handle,
   2153			    struct ocfs2_caching_info *ci,
   2154			    struct buffer_head *ref_root_bh,
   2155			    u64 cpos, u32 len,
   2156			    struct ocfs2_alloc_context *meta_ac,
   2157			    struct ocfs2_cached_dealloc_ctxt *dealloc)
   2158{
   2159	return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
   2160					 cpos, len, 1,
   2161					 meta_ac, dealloc);
   2162}
   2163
   2164static int ocfs2_decrease_refcount_rec(handle_t *handle,
   2165				struct ocfs2_caching_info *ci,
   2166				struct buffer_head *ref_root_bh,
   2167				struct buffer_head *ref_leaf_bh,
   2168				int index, u64 cpos, unsigned int len,
   2169				struct ocfs2_alloc_context *meta_ac,
   2170				struct ocfs2_cached_dealloc_ctxt *dealloc)
   2171{
   2172	int ret;
   2173	struct ocfs2_refcount_block *rb =
   2174			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
   2175	struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
   2176
   2177	BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
   2178	BUG_ON(cpos + len >
   2179	       le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
   2180
   2181	trace_ocfs2_decrease_refcount_rec(
   2182		(unsigned long long)ocfs2_metadata_cache_owner(ci),
   2183		(unsigned long long)cpos, len);
   2184
   2185	if (cpos == le64_to_cpu(rec->r_cpos) &&
   2186	    len == le32_to_cpu(rec->r_clusters))
   2187		ret = ocfs2_change_refcount_rec(handle, ci,
   2188						ref_leaf_bh, index, 1, -1);
   2189	else {
   2190		struct ocfs2_refcount_rec split = *rec;
   2191		split.r_cpos = cpu_to_le64(cpos);
   2192		split.r_clusters = cpu_to_le32(len);
   2193
   2194		le32_add_cpu(&split.r_refcount, -1);
   2195
   2196		ret = ocfs2_split_refcount_rec(handle, ci,
   2197					       ref_root_bh, ref_leaf_bh,
   2198					       &split, index, 1,
   2199					       meta_ac, dealloc);
   2200	}
   2201
   2202	if (ret) {
   2203		mlog_errno(ret);
   2204		goto out;
   2205	}
   2206
   2207	/* Remove the leaf refcount block if it contains no refcount record. */
   2208	if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
   2209		ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
   2210						   ref_leaf_bh, meta_ac,
   2211						   dealloc);
   2212		if (ret)
   2213			mlog_errno(ret);
   2214	}
   2215
   2216out:
   2217	return ret;
   2218}
   2219
   2220static int __ocfs2_decrease_refcount(handle_t *handle,
   2221				     struct ocfs2_caching_info *ci,
   2222				     struct buffer_head *ref_root_bh,
   2223				     u64 cpos, u32 len,
   2224				     struct ocfs2_alloc_context *meta_ac,
   2225				     struct ocfs2_cached_dealloc_ctxt *dealloc,
   2226				     int delete)
   2227{
   2228	int ret = 0, index = 0;
   2229	struct ocfs2_refcount_rec rec;
   2230	unsigned int r_count = 0, r_len;
   2231	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
   2232	struct buffer_head *ref_leaf_bh = NULL;
   2233
   2234	trace_ocfs2_decrease_refcount(
   2235		(unsigned long long)ocfs2_metadata_cache_owner(ci),
   2236		(unsigned long long)cpos, len, delete);
   2237
   2238	while (len) {
   2239		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
   2240					     cpos, len, &rec, &index,
   2241					     &ref_leaf_bh);
   2242		if (ret) {
   2243			mlog_errno(ret);
   2244			goto out;
   2245		}
   2246
   2247		r_count = le32_to_cpu(rec.r_refcount);
   2248		BUG_ON(r_count == 0);
   2249		if (!delete)
   2250			BUG_ON(r_count > 1);
   2251
   2252		r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
   2253			      le32_to_cpu(rec.r_clusters)) - cpos;
   2254
   2255		ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
   2256						  ref_leaf_bh, index,
   2257						  cpos, r_len,
   2258						  meta_ac, dealloc);
   2259		if (ret) {
   2260			mlog_errno(ret);
   2261			goto out;
   2262		}
   2263
   2264		if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
   2265			ret = ocfs2_cache_cluster_dealloc(dealloc,
   2266					  ocfs2_clusters_to_blocks(sb, cpos),
   2267							  r_len);
   2268			if (ret) {
   2269				mlog_errno(ret);
   2270				goto out;
   2271			}
   2272		}
   2273
   2274		cpos += r_len;
   2275		len -= r_len;
   2276		brelse(ref_leaf_bh);
   2277		ref_leaf_bh = NULL;
   2278	}
   2279
   2280out:
   2281	brelse(ref_leaf_bh);
   2282	return ret;
   2283}
   2284
   2285/* Caller must hold refcount tree lock. */
   2286int ocfs2_decrease_refcount(struct inode *inode,
   2287			    handle_t *handle, u32 cpos, u32 len,
   2288			    struct ocfs2_alloc_context *meta_ac,
   2289			    struct ocfs2_cached_dealloc_ctxt *dealloc,
   2290			    int delete)
   2291{
   2292	int ret;
   2293	u64 ref_blkno;
   2294	struct buffer_head *ref_root_bh = NULL;
   2295	struct ocfs2_refcount_tree *tree;
   2296
   2297	BUG_ON(!ocfs2_is_refcount_inode(inode));
   2298
   2299	ret = ocfs2_get_refcount_block(inode, &ref_blkno);
   2300	if (ret) {
   2301		mlog_errno(ret);
   2302		goto out;
   2303	}
   2304
   2305	ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
   2306	if (ret) {
   2307		mlog_errno(ret);
   2308		goto out;
   2309	}
   2310
   2311	ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
   2312					&ref_root_bh);
   2313	if (ret) {
   2314		mlog_errno(ret);
   2315		goto out;
   2316	}
   2317
   2318	ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
   2319					cpos, len, meta_ac, dealloc, delete);
   2320	if (ret)
   2321		mlog_errno(ret);
   2322out:
   2323	brelse(ref_root_bh);
   2324	return ret;
   2325}
   2326
   2327/*
   2328 * Mark the already-existing extent at cpos as refcounted for len clusters.
   2329 * This adds the refcount extent flag.
   2330 *
   2331 * If the existing extent is larger than the request, initiate a
   2332 * split. An attempt will be made at merging with adjacent extents.
   2333 *
   2334 * The caller is responsible for passing down meta_ac if we'll need it.
   2335 */
   2336static int ocfs2_mark_extent_refcounted(struct inode *inode,
   2337				struct ocfs2_extent_tree *et,
   2338				handle_t *handle, u32 cpos,
   2339				u32 len, u32 phys,
   2340				struct ocfs2_alloc_context *meta_ac,
   2341				struct ocfs2_cached_dealloc_ctxt *dealloc)
   2342{
   2343	int ret;
   2344
   2345	trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno,
   2346					   cpos, len, phys);
   2347
   2348	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
   2349		ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
   2350				  inode->i_ino);
   2351		goto out;
   2352	}
   2353
   2354	ret = ocfs2_change_extent_flag(handle, et, cpos,
   2355				       len, phys, meta_ac, dealloc,
   2356				       OCFS2_EXT_REFCOUNTED, 0);
   2357	if (ret)
   2358		mlog_errno(ret);
   2359
   2360out:
   2361	return ret;
   2362}
   2363
   2364/*
   2365 * Given some contiguous physical clusters, calculate what we need
   2366 * for modifying their refcount.
   2367 */
   2368static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
   2369					    struct ocfs2_caching_info *ci,
   2370					    struct buffer_head *ref_root_bh,
   2371					    u64 start_cpos,
   2372					    u32 clusters,
   2373					    int *meta_add,
   2374					    int *credits)
   2375{
   2376	int ret = 0, index, ref_blocks = 0, recs_add = 0;
   2377	u64 cpos = start_cpos;
   2378	struct ocfs2_refcount_block *rb;
   2379	struct ocfs2_refcount_rec rec;
   2380	struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
   2381	u32 len;
   2382
   2383	while (clusters) {
   2384		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
   2385					     cpos, clusters, &rec,
   2386					     &index, &ref_leaf_bh);
   2387		if (ret) {
   2388			mlog_errno(ret);
   2389			goto out;
   2390		}
   2391
   2392		if (ref_leaf_bh != prev_bh) {
   2393			/*
    2394			 * Now we encounter a new leaf block, so check
    2395			 * whether we need to extend the previous leaf.
   2396			 */
   2397			if (prev_bh) {
   2398				rb = (struct ocfs2_refcount_block *)
   2399							prev_bh->b_data;
   2400
   2401				if (le16_to_cpu(rb->rf_records.rl_used) +
   2402				    recs_add >
   2403				    le16_to_cpu(rb->rf_records.rl_count))
   2404					ref_blocks++;
   2405			}
   2406
   2407			recs_add = 0;
   2408			*credits += 1;
   2409			brelse(prev_bh);
   2410			prev_bh = ref_leaf_bh;
   2411			get_bh(prev_bh);
   2412		}
   2413
   2414		trace_ocfs2_calc_refcount_meta_credits_iterate(
   2415				recs_add, (unsigned long long)cpos, clusters,
   2416				(unsigned long long)le64_to_cpu(rec.r_cpos),
   2417				le32_to_cpu(rec.r_clusters),
   2418				le32_to_cpu(rec.r_refcount), index);
   2419
   2420		len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
   2421			  le32_to_cpu(rec.r_clusters)) - cpos;
   2422		/*
   2423		 * We record all the records which will be inserted to the
   2424		 * same refcount block, so that we can tell exactly whether
   2425		 * we need a new refcount block or not.
   2426		 *
    2427		 * If we will insert a new one, this is easy and only happens
    2428		 * while adding the refcounted flag to the extent, so we don't
    2429		 * have a chance of splitting. We just need one record.
    2430		 *
    2431		 * If the refcount rec already exists, that would be a little
    2432		 * complicated. We may have to:
    2433		 * 1) split at the beginning if the start pos isn't aligned.
    2434		 *    we need 1 more record in this case.
    2435		 * 2) split at the end if the end pos isn't aligned.
    2436		 *    we need 1 more record in this case.
    2437		 * 3) split in the middle because of file system fragmentation.
    2438		 *    we need 2 more records in this case (we can't detect this
    2439		 *    beforehand, so always think of the worst case).
   2440		 */
   2441		if (rec.r_refcount) {
   2442			recs_add += 2;
   2443			/* Check whether we need a split at the beginning. */
   2444			if (cpos == start_cpos &&
   2445			    cpos != le64_to_cpu(rec.r_cpos))
   2446				recs_add++;
   2447
   2448			/* Check whether we need a split in the end. */
   2449			if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
   2450			    le32_to_cpu(rec.r_clusters))
   2451				recs_add++;
   2452		} else
   2453			recs_add++;
   2454
   2455		brelse(ref_leaf_bh);
   2456		ref_leaf_bh = NULL;
   2457		clusters -= len;
   2458		cpos += len;
   2459	}
   2460
   2461	if (prev_bh) {
   2462		rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
   2463
   2464		if (le16_to_cpu(rb->rf_records.rl_used) + recs_add >
   2465		    le16_to_cpu(rb->rf_records.rl_count))
   2466			ref_blocks++;
   2467
   2468		*credits += 1;
   2469	}
   2470
   2471	if (!ref_blocks)
   2472		goto out;
   2473
   2474	*meta_add += ref_blocks;
   2475	*credits += ref_blocks;
   2476
   2477	/*
    2478	 * So we may need to insert ref_blocks new blocks into the tree.
    2479	 * That also means we need to change the b-tree and add that number
    2480	 * of records since we never merge them.
    2481	 * We need one more block for expansion since the newly created leaf
    2482	 * block is also full and needs a split.
   2483	 */
   2484	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
   2485	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
   2486		struct ocfs2_extent_tree et;
   2487
   2488		ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
   2489		*meta_add += ocfs2_extend_meta_needed(et.et_root_el);
   2490		*credits += ocfs2_calc_extend_credits(sb,
   2491						      et.et_root_el);
   2492	} else {
   2493		*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
   2494		*meta_add += 1;
   2495	}
   2496
   2497out:
   2498
   2499	trace_ocfs2_calc_refcount_meta_credits(
   2500		(unsigned long long)start_cpos, clusters,
   2501		*meta_add, *credits);
   2502	brelse(ref_leaf_bh);
   2503	brelse(prev_bh);
   2504	return ret;
   2505}
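
/*
 * Editor's note: a user-space sketch (not ocfs2 code, kept under
 * #if 0) of the worst-case budget above: each already-refcounted
 * record may cost 2 extra records (a middle split), plus one head
 * split and one tail split over the whole range, while each hole
 * costs a single new record.  The numbers are hypothetical.
 */
#if 0
#include <stdio.h>

int main(void)
{
	int shared_recs = 3, holes = 1;
	int head_split = 1, tail_split = 1;	/* both ends unaligned */
	int recs_add = 2 * shared_recs + holes + head_split + tail_split;

	printf("budget: %d records\n", recs_add);	/* 9 */
	return 0;
}
#endif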
   2506
   2507/*
    2508 * For a refcount tree, we will decrease the refcount of some
    2509 * contiguous clusters, so just go through the tree to see how many
    2510 * blocks we are going to touch and whether we need new blocks.
    2511 *
    2512 * Normally the refcount blocks storing these refcounts should be
    2513 * contiguous as well, so we can get the number easily.
    2514 * We will at most split 2 refcount records and add 2 more
    2515 * refcount blocks, so just check it in a rough way.
   2516 *
   2517 * Caller must hold refcount tree lock.
   2518 */
   2519int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
   2520					  u64 refcount_loc,
   2521					  u64 phys_blkno,
   2522					  u32 clusters,
   2523					  int *credits,
   2524					  int *ref_blocks)
   2525{
   2526	int ret;
   2527	struct buffer_head *ref_root_bh = NULL;
   2528	struct ocfs2_refcount_tree *tree;
   2529	u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
   2530
   2531	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
   2532		ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
   2533				  inode->i_ino);
   2534		goto out;
   2535	}
   2536
   2537	BUG_ON(!ocfs2_is_refcount_inode(inode));
   2538
   2539	ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
   2540				      refcount_loc, &tree);
   2541	if (ret) {
   2542		mlog_errno(ret);
   2543		goto out;
   2544	}
   2545
   2546	ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
   2547					&ref_root_bh);
   2548	if (ret) {
   2549		mlog_errno(ret);
   2550		goto out;
   2551	}
   2552
   2553	ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
   2554					       &tree->rf_ci,
   2555					       ref_root_bh,
   2556					       start_cpos, clusters,
   2557					       ref_blocks, credits);
   2558	if (ret) {
   2559		mlog_errno(ret);
   2560		goto out;
   2561	}
   2562
   2563	trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits);
   2564
   2565out:
   2566	brelse(ref_root_bh);
   2567	return ret;
   2568}
   2569
   2570#define	MAX_CONTIG_BYTES	1048576
   2571
   2572static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
   2573{
   2574	return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
   2575}
   2576
   2577static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
   2578{
   2579	return ~(ocfs2_cow_contig_clusters(sb) - 1);
   2580}
   2581
   2582/*
   2583 * Given an extent that starts at 'start' and an I/O that starts at 'cpos',
   2584 * find an offset (start + (n * contig_clusters)) that is closest to cpos
   2585 * while still being less than or equal to it.
   2586 *
   2587 * The goal is to break the extent at a multiple of contig_clusters.
   2588 */
   2589static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
   2590						 unsigned int start,
   2591						 unsigned int cpos)
   2592{
   2593	BUG_ON(start > cpos);
   2594
   2595	return start + ((cpos - start) & ocfs2_cow_contig_mask(sb));
   2596}
   2597
   2598/*
   2599 * Given a cluster count of len, pad it out so that it is a multiple
   2600 * of contig_clusters.
   2601 */
   2602static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
   2603						  unsigned int len)
   2604{
   2605	unsigned int padded =
   2606		(len + (ocfs2_cow_contig_clusters(sb) - 1)) &
   2607		ocfs2_cow_contig_mask(sb);
   2608
   2609	/* Did we wrap? */
   2610	if (padded < len)
   2611		padded = UINT_MAX;
   2612
   2613	return padded;
   2614}
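
/*
 * Editor's note: a user-space sketch (not ocfs2 code, kept under
 * #if 0) of the alignment math above, assuming 4 KiB clusters so
 * that MAX_CONTIG_BYTES = 1 MiB gives contig_clusters = 256:
 * align_start rounds (cpos - start) down to a multiple of 256 and
 * align_length rounds a length up to one.
 */
#if 0
#include <stdio.h>

#define CONTIG	256u	/* hypothetical contig_clusters, 4K clusters */

int main(void)
{
	unsigned int start = 100, cpos = 700, len = 300;
	unsigned int mask = ~(CONTIG - 1);
	unsigned int astart = start + ((cpos - start) & mask);
	unsigned int alen = (len + CONTIG - 1) & mask;

	printf("astart=%u alen=%u\n", astart, alen);	/* 612, 512 */
	return 0;
}
#endif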
   2615
   2616/*
    2617 * Calculate the start and number of virtual clusters we need to CoW.
    2618 *
    2619 * cpos is the virtual start cluster position at which we want to CoW
    2620 * in a file and write_len is the cluster length.
    2621 * max_cpos is the place where we want to stop CoW intentionally.
    2622 *
    2623 * Normally we start CoW from the beginning of the extent record containing cpos.
   2624 * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
   2625 * get good I/O from the resulting extent tree.
   2626 */
   2627static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
   2628					   struct ocfs2_extent_list *el,
   2629					   u32 cpos,
   2630					   u32 write_len,
   2631					   u32 max_cpos,
   2632					   u32 *cow_start,
   2633					   u32 *cow_len)
   2634{
   2635	int ret = 0;
   2636	int tree_height = le16_to_cpu(el->l_tree_depth), i;
   2637	struct buffer_head *eb_bh = NULL;
   2638	struct ocfs2_extent_block *eb = NULL;
   2639	struct ocfs2_extent_rec *rec;
   2640	unsigned int want_clusters, rec_end = 0;
   2641	int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
   2642	int leaf_clusters;
   2643
   2644	BUG_ON(cpos + write_len > max_cpos);
   2645
   2646	if (tree_height > 0) {
   2647		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
   2648		if (ret) {
   2649			mlog_errno(ret);
   2650			goto out;
   2651		}
   2652
   2653		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
   2654		el = &eb->h_list;
   2655
   2656		if (el->l_tree_depth) {
   2657			ret = ocfs2_error(inode->i_sb,
   2658					  "Inode %lu has non zero tree depth in leaf block %llu\n",
   2659					  inode->i_ino,
   2660					  (unsigned long long)eb_bh->b_blocknr);
   2661			goto out;
   2662		}
   2663	}
   2664
   2665	*cow_len = 0;
   2666	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
   2667		rec = &el->l_recs[i];
   2668
   2669		if (ocfs2_is_empty_extent(rec)) {
   2670			mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
   2671					"index %d\n", inode->i_ino, i);
   2672			continue;
   2673		}
   2674
   2675		if (le32_to_cpu(rec->e_cpos) +
   2676		    le16_to_cpu(rec->e_leaf_clusters) <= cpos)
   2677			continue;
   2678
   2679		if (*cow_len == 0) {
   2680			/*
   2681			 * We should find a refcounted record in the
   2682			 * first pass.
   2683			 */
   2684			BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
   2685			*cow_start = le32_to_cpu(rec->e_cpos);
   2686		}
   2687
   2688		/*
   2689		 * If we encounter a hole, a non-refcounted record or
   2690		 * pass the max_cpos, stop the search.
   2691		 */
   2692		if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
   2693		    (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) ||
   2694		    (max_cpos <= le32_to_cpu(rec->e_cpos)))
   2695			break;
   2696
   2697		leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
   2698		rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
   2699		if (rec_end > max_cpos) {
   2700			rec_end = max_cpos;
   2701			leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
   2702		}
   2703
   2704		/*
   2705		 * How many clusters do we actually need from
   2706		 * this extent?  First we see how many we actually
   2707		 * need to complete the write.  If that's smaller
   2708		 * than contig_clusters, we try for contig_clusters.
   2709		 */
   2710		if (!*cow_len)
   2711			want_clusters = write_len;
   2712		else
   2713			want_clusters = (cpos + write_len) -
   2714				(*cow_start + *cow_len);
   2715		if (want_clusters < contig_clusters)
   2716			want_clusters = contig_clusters;
   2717
   2718		/*
   2719		 * If the write does not cover the whole extent, we
   2720		 * need to calculate how we're going to split the extent.
   2721		 * We try to do it on contig_clusters boundaries.
   2722		 *
   2723		 * Any extent smaller than contig_clusters will be
   2724		 * CoWed in its entirety.
   2725		 */
   2726		if (leaf_clusters <= contig_clusters)
   2727			*cow_len += leaf_clusters;
   2728		else if (*cow_len || (*cow_start == cpos)) {
   2729			/*
   2730			 * This extent needs to be CoW'd from its
   2731			 * beginning, so all we have to do is compute
   2732			 * how many clusters to grab.  We align
   2733			 * want_clusters to the edge of contig_clusters
   2734			 * to get better I/O.
   2735			 */
   2736			want_clusters = ocfs2_cow_align_length(inode->i_sb,
   2737							       want_clusters);
   2738
   2739			if (leaf_clusters < want_clusters)
   2740				*cow_len += leaf_clusters;
   2741			else
   2742				*cow_len += want_clusters;
   2743		} else if ((*cow_start + contig_clusters) >=
   2744			   (cpos + write_len)) {
   2745			/*
   2746			 * Breaking off contig_clusters at the front
   2747			 * of the extent will cover our write.  That's
   2748			 * easy.
   2749			 */
   2750			*cow_len = contig_clusters;
   2751		} else if ((rec_end - cpos) <= contig_clusters) {
   2752			/*
   2753			 * Breaking off contig_clusters at the tail of
   2754			 * this extent will cover cpos.
   2755			 */
   2756			*cow_start = rec_end - contig_clusters;
   2757			*cow_len = contig_clusters;
   2758		} else if ((rec_end - cpos) <= want_clusters) {
   2759			/*
   2760			 * While we can't fit the entire write in this
   2761			 * extent, we know that the write goes from cpos
   2762			 * to the end of the extent.  Break that off.
   2763			 * We try to break it at some multiple of
   2764			 * contig_clusters from the front of the extent.
   2765			 * Failing that (ie, cpos is within
   2766			 * contig_clusters of the front), we'll CoW the
   2767			 * entire extent.
   2768			 */
   2769			*cow_start = ocfs2_cow_align_start(inode->i_sb,
   2770							   *cow_start, cpos);
   2771			*cow_len = rec_end - *cow_start;
   2772		} else {
   2773			/*
   2774			 * Ok, the entire write lives in the middle of
   2775			 * this extent.  Let's try to slice the extent up
   2776			 * nicely.  Optimally, our CoW region starts at
   2777			 * m*contig_clusters from the beginning of the
   2778			 * extent and goes for n*contig_clusters,
   2779			 * covering the entire write.
   2780			 */
   2781			*cow_start = ocfs2_cow_align_start(inode->i_sb,
   2782							   *cow_start, cpos);
   2783
   2784			want_clusters = (cpos + write_len) - *cow_start;
   2785			want_clusters = ocfs2_cow_align_length(inode->i_sb,
   2786							       want_clusters);
   2787			if (*cow_start + want_clusters <= rec_end)
   2788				*cow_len = want_clusters;
   2789			else
   2790				*cow_len = rec_end - *cow_start;
   2791		}
   2792
   2793		/* Have we covered our entire write yet? */
   2794		if ((*cow_start + *cow_len) >= (cpos + write_len))
   2795			break;
   2796
   2797		/*
   2798		 * If we reach the end of the extent block and don't get enough
   2799		 * clusters, continue with the next extent block if possible.
   2800		 */
   2801		if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
   2802		    eb && eb->h_next_leaf_blk) {
   2803			brelse(eb_bh);
   2804			eb_bh = NULL;
   2805
   2806			ret = ocfs2_read_extent_block(INODE_CACHE(inode),
   2807					       le64_to_cpu(eb->h_next_leaf_blk),
   2808					       &eb_bh);
   2809			if (ret) {
   2810				mlog_errno(ret);
   2811				goto out;
   2812			}
   2813
   2814			eb = (struct ocfs2_extent_block *) eb_bh->b_data;
   2815			el = &eb->h_list;
   2816			i = -1;
   2817		}
   2818	}
   2819
   2820out:
   2821	brelse(eb_bh);
   2822	return ret;
   2823}
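
/*
 * Editor's note: a user-space sketch (not ocfs2 code, kept under
 * #if 0) of the middle-of-extent case above.  With a refcounted
 * extent [0, 1000), a write [300, 310) and contig_clusters = 256,
 * the CoW window comes out as [256, 512): it covers the write and
 * starts/ends on contig_clusters boundaries inside the extent.
 */
#if 0
#include <stdio.h>

#define CONTIG	256u	/* hypothetical contig_clusters */

int main(void)
{
	unsigned int rec_start = 0, rec_end = 1000;	/* extent */
	unsigned int cpos = 300, wlen = 10;		/* write */
	unsigned int mask = ~(CONTIG - 1);
	unsigned int cow_start = rec_start + ((cpos - rec_start) & mask);
	unsigned int want = (cpos + wlen) - cow_start;
	unsigned int cow_len;

	want = (want + CONTIG - 1) & mask;
	cow_len = (cow_start + want <= rec_end) ?
			want : rec_end - cow_start;
	printf("CoW [%u, %u)\n", cow_start, cow_start + cow_len);
	return 0;
}
#endif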
   2824
   2825/*
    2826 * Prepare meta_ac, data_ac and calculate credits when we want to add
    2827 * num_clusters in the data tree "et" and change the refcount for the old
    2828 * clusters (starting from p_cluster) in the refcount tree.
    2829 *
    2830 * Note:
    2831 * 1. Since we may split the old tree, we will need at most num_clusters + 2
    2832 *    more new leaf records.
    2833 * 2. In some cases we may not need to reserve new clusters (e.g. reflink),
    2834 *    so just give data_ac = NULL.
   2835 */
   2836static int ocfs2_lock_refcount_allocators(struct super_block *sb,
   2837					u32 p_cluster, u32 num_clusters,
   2838					struct ocfs2_extent_tree *et,
   2839					struct ocfs2_caching_info *ref_ci,
   2840					struct buffer_head *ref_root_bh,
   2841					struct ocfs2_alloc_context **meta_ac,
   2842					struct ocfs2_alloc_context **data_ac,
   2843					int *credits)
   2844{
   2845	int ret = 0, meta_add = 0;
   2846	int num_free_extents = ocfs2_num_free_extents(et);
   2847
   2848	if (num_free_extents < 0) {
   2849		ret = num_free_extents;
   2850		mlog_errno(ret);
   2851		goto out;
   2852	}
   2853
   2854	if (num_free_extents < num_clusters + 2)
   2855		meta_add =
   2856			ocfs2_extend_meta_needed(et->et_root_el);
   2857
   2858	*credits += ocfs2_calc_extend_credits(sb, et->et_root_el);
   2859
   2860	ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
   2861					       p_cluster, num_clusters,
   2862					       &meta_add, credits);
   2863	if (ret) {
   2864		mlog_errno(ret);
   2865		goto out;
   2866	}
   2867
   2868	trace_ocfs2_lock_refcount_allocators(meta_add, *credits);
   2869	ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
   2870						meta_ac);
   2871	if (ret) {
   2872		mlog_errno(ret);
   2873		goto out;
   2874	}
   2875
   2876	if (data_ac) {
   2877		ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
   2878					     data_ac);
   2879		if (ret)
   2880			mlog_errno(ret);
   2881	}
   2882
   2883out:
   2884	if (ret) {
   2885		if (*meta_ac) {
   2886			ocfs2_free_alloc_context(*meta_ac);
   2887			*meta_ac = NULL;
   2888		}
   2889	}
   2890
   2891	return ret;
   2892}
   2893
   2894static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
   2895{
   2896	BUG_ON(buffer_dirty(bh));
   2897
   2898	clear_buffer_mapped(bh);
   2899
   2900	return 0;
   2901}
   2902
   2903int ocfs2_duplicate_clusters_by_page(handle_t *handle,
   2904				     struct inode *inode,
   2905				     u32 cpos, u32 old_cluster,
   2906				     u32 new_cluster, u32 new_len)
   2907{
   2908	int ret = 0, partial;
   2909	struct super_block *sb = inode->i_sb;
   2910	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
   2911	struct page *page;
   2912	pgoff_t page_index;
   2913	unsigned int from, to;
   2914	loff_t offset, end, map_end;
   2915	struct address_space *mapping = inode->i_mapping;
   2916
   2917	trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
   2918					       new_cluster, new_len);
   2919
   2920	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
   2921	end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
   2922	/*
    2923	 * We only duplicate pages until we reach the page that contains i_size - 1.
   2924	 * So trim 'end' to i_size.
   2925	 */
   2926	if (end > i_size_read(inode))
   2927		end = i_size_read(inode);
   2928
   2929	while (offset < end) {
   2930		page_index = offset >> PAGE_SHIFT;
   2931		map_end = ((loff_t)page_index + 1) << PAGE_SHIFT;
   2932		if (map_end > end)
   2933			map_end = end;
   2934
   2935		/* from, to is the offset within the page. */
   2936		from = offset & (PAGE_SIZE - 1);
   2937		to = PAGE_SIZE;
   2938		if (map_end & (PAGE_SIZE - 1))
   2939			to = map_end & (PAGE_SIZE - 1);
   2940
   2941retry:
   2942		page = find_or_create_page(mapping, page_index, GFP_NOFS);
   2943		if (!page) {
   2944			ret = -ENOMEM;
   2945			mlog_errno(ret);
   2946			break;
   2947		}
   2948
   2949		/*
    2950		 * In case PAGE_SIZE <= CLUSTER_SIZE, we cannot handle a dirty
    2951		 * page here, so write it back first.
   2952		 */
   2953		if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) {
   2954			if (PageDirty(page)) {
   2955				/*
				 * write_one_page() will unlock the page on return
   2957				 */
   2958				ret = write_one_page(page);
   2959				goto retry;
   2960			}
   2961		}
   2962
   2963		if (!PageUptodate(page)) {
   2964			struct folio *folio = page_folio(page);
   2965
   2966			ret = block_read_full_folio(folio, ocfs2_get_block);
   2967			if (ret) {
   2968				mlog_errno(ret);
   2969				goto unlock;
   2970			}
   2971			folio_lock(folio);
   2972		}
   2973
   2974		if (page_has_buffers(page)) {
   2975			ret = walk_page_buffers(handle, page_buffers(page),
   2976						from, to, &partial,
   2977						ocfs2_clear_cow_buffer);
   2978			if (ret) {
   2979				mlog_errno(ret);
   2980				goto unlock;
   2981			}
   2982		}
   2983
   2984		ocfs2_map_and_dirty_page(inode,
   2985					 handle, from, to,
   2986					 page, 0, &new_block);
   2987		mark_page_accessed(page);
   2988unlock:
   2989		unlock_page(page);
   2990		put_page(page);
   2991		page = NULL;
   2992		offset = map_end;
   2993		if (ret)
   2994			break;
   2995	}
   2996
   2997	return ret;
   2998}
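
/*
 * Editor's note: a user-space sketch (not ocfs2 code, kept under
 * #if 0) of the page-walking arithmetic above: the byte range
 * [offset, end) is visited one page at a time, with "from"/"to"
 * giving the byte range inside each page.
 */
#if 0
#include <stdio.h>

#define PG	4096ul	/* hypothetical PAGE_SIZE */

int main(void)
{
	unsigned long offset = 5000, end = 13000;

	while (offset < end) {
		unsigned long idx = offset / PG;
		unsigned long map_end = (idx + 1) * PG;
		unsigned long from, to;

		if (map_end > end)
			map_end = end;
		from = offset % PG;
		to = map_end % PG ? map_end % PG : PG;
		printf("page %lu: [%lu, %lu)\n", idx, from, to);
		offset = map_end;
	}
	return 0;
}
#endif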
   2999
   3000int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
   3001				    struct inode *inode,
   3002				    u32 cpos, u32 old_cluster,
   3003				    u32 new_cluster, u32 new_len)
   3004{
   3005	int ret = 0;
   3006	struct super_block *sb = inode->i_sb;
   3007	struct ocfs2_caching_info *ci = INODE_CACHE(inode);
   3008	int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
   3009	u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
   3010	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
   3011	struct ocfs2_super *osb = OCFS2_SB(sb);
   3012	struct buffer_head *old_bh = NULL;
   3013	struct buffer_head *new_bh = NULL;
   3014
   3015	trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
   3016					       new_cluster, new_len);
   3017
   3018	for (i = 0; i < blocks; i++, old_block++, new_block++) {
   3019		new_bh = sb_getblk(osb->sb, new_block);
   3020		if (new_bh == NULL) {
   3021			ret = -ENOMEM;
   3022			mlog_errno(ret);
   3023			break;
   3024		}
   3025
   3026		ocfs2_set_new_buffer_uptodate(ci, new_bh);
   3027
   3028		ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
   3029		if (ret) {
   3030			mlog_errno(ret);
   3031			break;
   3032		}
   3033
   3034		ret = ocfs2_journal_access(handle, ci, new_bh,
   3035					   OCFS2_JOURNAL_ACCESS_CREATE);
   3036		if (ret) {
   3037			mlog_errno(ret);
   3038			break;
   3039		}
   3040
   3041		memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
   3042		ocfs2_journal_dirty(handle, new_bh);
   3043
   3044		brelse(new_bh);
   3045		brelse(old_bh);
   3046		new_bh = NULL;
   3047		old_bh = NULL;
   3048	}
   3049
   3050	brelse(new_bh);
   3051	brelse(old_bh);
   3052	return ret;
   3053}
   3054
   3055static int ocfs2_clear_ext_refcount(handle_t *handle,
   3056				    struct ocfs2_extent_tree *et,
   3057				    u32 cpos, u32 p_cluster, u32 len,
   3058				    unsigned int ext_flags,
   3059				    struct ocfs2_alloc_context *meta_ac,
   3060				    struct ocfs2_cached_dealloc_ctxt *dealloc)
   3061{
   3062	int ret, index;
   3063	struct ocfs2_extent_rec replace_rec;
   3064	struct ocfs2_path *path = NULL;
   3065	struct ocfs2_extent_list *el;
   3066	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
   3067	u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
   3068
   3069	trace_ocfs2_clear_ext_refcount((unsigned long long)ino,
   3070				       cpos, len, p_cluster, ext_flags);
   3071
   3072	memset(&replace_rec, 0, sizeof(replace_rec));
   3073	replace_rec.e_cpos = cpu_to_le32(cpos);
   3074	replace_rec.e_leaf_clusters = cpu_to_le16(len);
   3075	replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
   3076								   p_cluster));
   3077	replace_rec.e_flags = ext_flags;
   3078	replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
   3079
   3080	path = ocfs2_new_path_from_et(et);
   3081	if (!path) {
   3082		ret = -ENOMEM;
   3083		mlog_errno(ret);
   3084		goto out;
   3085	}
   3086
   3087	ret = ocfs2_find_path(et->et_ci, path, cpos);
   3088	if (ret) {
   3089		mlog_errno(ret);
   3090		goto out;
   3091	}
   3092
   3093	el = path_leaf_el(path);
   3094
   3095	index = ocfs2_search_extent_list(el, cpos);
   3096	if (index == -1) {
   3097		ret = ocfs2_error(sb,
   3098				  "Inode %llu has an extent at cpos %u which can no longer be found\n",
   3099				  (unsigned long long)ino, cpos);
   3100		goto out;
   3101	}
   3102
   3103	ret = ocfs2_split_extent(handle, et, path, index,
   3104				 &replace_rec, meta_ac, dealloc);
   3105	if (ret)
   3106		mlog_errno(ret);
   3107
   3108out:
   3109	ocfs2_free_path(path);
   3110	return ret;
   3111}
   3112
   3113static int ocfs2_replace_clusters(handle_t *handle,
   3114				  struct ocfs2_cow_context *context,
   3115				  u32 cpos, u32 old,
   3116				  u32 new, u32 len,
   3117				  unsigned int ext_flags)
   3118{
   3119	int ret;
   3120	struct ocfs2_caching_info *ci = context->data_et.et_ci;
   3121	u64 ino = ocfs2_metadata_cache_owner(ci);
   3122
   3123	trace_ocfs2_replace_clusters((unsigned long long)ino,
   3124				     cpos, old, new, len, ext_flags);
   3125
    3126	/* If the old clusters are unwritten, there is no need to duplicate. */
   3127	if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
   3128		ret = context->cow_duplicate_clusters(handle, context->inode,
   3129						      cpos, old, new, len);
   3130		if (ret) {
   3131			mlog_errno(ret);
   3132			goto out;
   3133		}
   3134	}
   3135
   3136	ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
   3137				       cpos, new, len, ext_flags,
   3138				       context->meta_ac, &context->dealloc);
   3139	if (ret)
   3140		mlog_errno(ret);
   3141out:
   3142	return ret;
   3143}
   3144
   3145int ocfs2_cow_sync_writeback(struct super_block *sb,
   3146			     struct inode *inode,
   3147			     u32 cpos, u32 num_clusters)
   3148{
   3149	int ret = 0;
   3150	loff_t offset, end, map_end;
   3151	pgoff_t page_index;
   3152	struct page *page;
   3153
   3154	if (ocfs2_should_order_data(inode))
   3155		return 0;
   3156
   3157	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
   3158	end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
   3159
   3160	ret = filemap_fdatawrite_range(inode->i_mapping,
   3161				       offset, end - 1);
   3162	if (ret < 0) {
   3163		mlog_errno(ret);
   3164		return ret;
   3165	}
   3166
   3167	while (offset < end) {
   3168		page_index = offset >> PAGE_SHIFT;
   3169		map_end = ((loff_t)page_index + 1) << PAGE_SHIFT;
   3170		if (map_end > end)
   3171			map_end = end;
   3172
   3173		page = find_or_create_page(inode->i_mapping,
   3174					   page_index, GFP_NOFS);
   3175		BUG_ON(!page);
   3176
   3177		wait_on_page_writeback(page);
   3178		if (PageError(page)) {
   3179			ret = -EIO;
   3180			mlog_errno(ret);
   3181		} else
   3182			mark_page_accessed(page);
   3183
   3184		unlock_page(page);
   3185		put_page(page);
   3186		page = NULL;
   3187		offset = map_end;
   3188		if (ret)
   3189			break;
   3190	}
   3191
   3192	return ret;
   3193}
   3194
   3195static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
   3196				 u32 v_cluster, u32 *p_cluster,
   3197				 u32 *num_clusters,
   3198				 unsigned int *extent_flags)
   3199{
   3200	return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
   3201				  num_clusters, extent_flags);
   3202}
   3203
   3204static int ocfs2_make_clusters_writable(struct super_block *sb,
   3205					struct ocfs2_cow_context *context,
   3206					u32 cpos, u32 p_cluster,
   3207					u32 num_clusters, unsigned int e_flags)
   3208{
   3209	int ret, delete, index, credits =  0;
   3210	u32 new_bit, new_len, orig_num_clusters;
   3211	unsigned int set_len;
   3212	struct ocfs2_super *osb = OCFS2_SB(sb);
   3213	handle_t *handle;
   3214	struct buffer_head *ref_leaf_bh = NULL;
   3215	struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
   3216	struct ocfs2_refcount_rec rec;
   3217
   3218	trace_ocfs2_make_clusters_writable(cpos, p_cluster,
   3219					   num_clusters, e_flags);
   3220
   3221	ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
   3222					     &context->data_et,
   3223					     ref_ci,
   3224					     context->ref_root_bh,
   3225					     &context->meta_ac,
   3226					     &context->data_ac, &credits);
   3227	if (ret) {
   3228		mlog_errno(ret);
   3229		return ret;
   3230	}
   3231
   3232	if (context->post_refcount)
   3233		credits += context->post_refcount->credits;
   3234
   3235	credits += context->extra_credits;
   3236	handle = ocfs2_start_trans(osb, credits);
   3237	if (IS_ERR(handle)) {
   3238		ret = PTR_ERR(handle);
   3239		mlog_errno(ret);
   3240		goto out;
   3241	}
   3242
   3243	orig_num_clusters = num_clusters;
   3244
   3245	while (num_clusters) {
   3246		ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
   3247					     p_cluster, num_clusters,
   3248					     &rec, &index, &ref_leaf_bh);
   3249		if (ret) {
   3250			mlog_errno(ret);
   3251			goto out_commit;
   3252		}
   3253
   3254		BUG_ON(!rec.r_refcount);
   3255		set_len = min((u64)p_cluster + num_clusters,
   3256			      le64_to_cpu(rec.r_cpos) +
   3257			      le32_to_cpu(rec.r_clusters)) - p_cluster;
   3258
   3259		/*
    3260		 * There are two different situations here:
    3261		 * 1. If refcount == 1, remove the flag and don't COW.
    3262		 * 2. If refcount > 1, allocate clusters.
    3263		 *    Here we may not be able to allocate set_len clusters at
    3264		 *    once, so continue until we reach num_clusters.
   3265		 */
   3266		if (le32_to_cpu(rec.r_refcount) == 1) {
   3267			delete = 0;
   3268			ret = ocfs2_clear_ext_refcount(handle,
   3269						       &context->data_et,
   3270						       cpos, p_cluster,
   3271						       set_len, e_flags,
   3272						       context->meta_ac,
   3273						       &context->dealloc);
   3274			if (ret) {
   3275				mlog_errno(ret);
   3276				goto out_commit;
   3277			}
   3278		} else {
   3279			delete = 1;
   3280
   3281			ret = __ocfs2_claim_clusters(handle,
   3282						     context->data_ac,
   3283						     1, set_len,
   3284						     &new_bit, &new_len);
   3285			if (ret) {
   3286				mlog_errno(ret);
   3287				goto out_commit;
   3288			}
   3289
   3290			ret = ocfs2_replace_clusters(handle, context,
   3291						     cpos, p_cluster, new_bit,
   3292						     new_len, e_flags);
   3293			if (ret) {
   3294				mlog_errno(ret);
   3295				goto out_commit;
   3296			}
   3297			set_len = new_len;
   3298		}
   3299
   3300		ret = __ocfs2_decrease_refcount(handle, ref_ci,
   3301						context->ref_root_bh,
   3302						p_cluster, set_len,
   3303						context->meta_ac,
   3304						&context->dealloc, delete);
   3305		if (ret) {
   3306			mlog_errno(ret);
   3307			goto out_commit;
   3308		}
   3309
   3310		cpos += set_len;
   3311		p_cluster += set_len;
   3312		num_clusters -= set_len;
   3313		brelse(ref_leaf_bh);
   3314		ref_leaf_bh = NULL;
   3315	}
   3316
   3317	/* handle any post_cow action. */
   3318	if (context->post_refcount && context->post_refcount->func) {
   3319		ret = context->post_refcount->func(context->inode, handle,
   3320						context->post_refcount->para);
   3321		if (ret) {
   3322			mlog_errno(ret);
   3323			goto out_commit;
   3324		}
   3325	}
   3326
   3327	/*
   3328	 * Here we should write the new page out first if we are
   3329	 * in write-back mode.
   3330	 */
   3331	if (context->get_clusters == ocfs2_di_get_clusters) {
   3332		ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
   3333					       orig_num_clusters);
   3334		if (ret)
   3335			mlog_errno(ret);
   3336	}
   3337
   3338out_commit:
   3339	ocfs2_commit_trans(osb, handle);
   3340
   3341out:
   3342	if (context->data_ac) {
   3343		ocfs2_free_alloc_context(context->data_ac);
   3344		context->data_ac = NULL;
   3345	}
   3346	if (context->meta_ac) {
   3347		ocfs2_free_alloc_context(context->meta_ac);
   3348		context->meta_ac = NULL;
   3349	}
   3350	brelse(ref_leaf_bh);
   3351
   3352	return ret;
   3353}
   3354
   3355static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
   3356{
   3357	int ret = 0;
   3358	struct inode *inode = context->inode;
   3359	u32 cow_start = context->cow_start, cow_len = context->cow_len;
   3360	u32 p_cluster, num_clusters;
   3361	unsigned int ext_flags;
   3362	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
   3363
   3364	if (!ocfs2_refcount_tree(osb)) {
   3365		return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
   3366				   inode->i_ino);
   3367	}
   3368
   3369	ocfs2_init_dealloc_ctxt(&context->dealloc);
   3370
   3371	while (cow_len) {
   3372		ret = context->get_clusters(context, cow_start, &p_cluster,
   3373					    &num_clusters, &ext_flags);
   3374		if (ret) {
   3375			mlog_errno(ret);
   3376			break;
   3377		}
   3378
   3379		BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
   3380
   3381		if (cow_len < num_clusters)
   3382			num_clusters = cow_len;
   3383
   3384		ret = ocfs2_make_clusters_writable(inode->i_sb, context,
   3385						   cow_start, p_cluster,
   3386						   num_clusters, ext_flags);
   3387		if (ret) {
   3388			mlog_errno(ret);
   3389			break;
   3390		}
   3391
   3392		cow_len -= num_clusters;
   3393		cow_start += num_clusters;
   3394	}
   3395
   3396	if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
   3397		ocfs2_schedule_truncate_log_flush(osb, 1);
   3398		ocfs2_run_deallocs(osb, &context->dealloc);
   3399	}
   3400
   3401	return ret;
   3402}
   3403
   3404/*
   3405 * Starting at cpos, try to CoW write_len clusters.  Don't CoW
   3406 * past max_cpos.  This will stop when it runs into a hole or an
   3407 * unrefcounted extent.
   3408 */
   3409static int ocfs2_refcount_cow_hunk(struct inode *inode,
   3410				   struct buffer_head *di_bh,
   3411				   u32 cpos, u32 write_len, u32 max_cpos)
   3412{
   3413	int ret;
   3414	u32 cow_start = 0, cow_len = 0;
   3415	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
   3416	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
   3417	struct buffer_head *ref_root_bh = NULL;
   3418	struct ocfs2_refcount_tree *ref_tree;
   3419	struct ocfs2_cow_context *context = NULL;
   3420
   3421	BUG_ON(!ocfs2_is_refcount_inode(inode));
   3422
   3423	ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
   3424					      cpos, write_len, max_cpos,
   3425					      &cow_start, &cow_len);
   3426	if (ret) {
   3427		mlog_errno(ret);
   3428		goto out;
   3429	}
   3430
   3431	trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno,
   3432				      cpos, write_len, max_cpos,
   3433				      cow_start, cow_len);
   3434
   3435	BUG_ON(cow_len == 0);
   3436
   3437	context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
   3438	if (!context) {
   3439		ret = -ENOMEM;
   3440		mlog_errno(ret);
   3441		goto out;
   3442	}
   3443
   3444	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
   3445				       1, &ref_tree, &ref_root_bh);
   3446	if (ret) {
   3447		mlog_errno(ret);
   3448		goto out;
   3449	}
   3450
   3451	context->inode = inode;
   3452	context->cow_start = cow_start;
   3453	context->cow_len = cow_len;
   3454	context->ref_tree = ref_tree;
   3455	context->ref_root_bh = ref_root_bh;
   3456	context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
   3457	context->get_clusters = ocfs2_di_get_clusters;
   3458
   3459	ocfs2_init_dinode_extent_tree(&context->data_et,
   3460				      INODE_CACHE(inode), di_bh);
   3461
   3462	ret = ocfs2_replace_cow(context);
   3463	if (ret)
   3464		mlog_errno(ret);
   3465
   3466	/*
   3467	 * Truncate the extent map here: no matter whether we hit an
   3468	 * error above, the cached extent map can no longer be
   3469	 * trusted.
   3470	 */
   3471	ocfs2_extent_map_trunc(inode, cow_start);
   3472
   3473	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
   3474	brelse(ref_root_bh);
   3475out:
   3476	kfree(context);
   3477	return ret;
   3478}
   3479
   3480/*
   3481 * CoW any and all clusters between cpos and cpos+write_len.
   3482 * Don't CoW past max_cpos.  If this returns successfully, all
   3483 * clusters between cpos and cpos+write_len are safe to modify.
   3484 */
   3485int ocfs2_refcount_cow(struct inode *inode,
   3486		       struct buffer_head *di_bh,
   3487		       u32 cpos, u32 write_len, u32 max_cpos)
   3488{
   3489	int ret = 0;
   3490	u32 p_cluster, num_clusters;
   3491	unsigned int ext_flags;
   3492
   3493	while (write_len) {
   3494		ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
   3495					 &num_clusters, &ext_flags);
   3496		if (ret) {
   3497			mlog_errno(ret);
   3498			break;
   3499		}
   3500
   3501		if (write_len < num_clusters)
   3502			num_clusters = write_len;
   3503
   3504		if (ext_flags & OCFS2_EXT_REFCOUNTED) {
   3505			ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
   3506						      num_clusters, max_cpos);
   3507			if (ret) {
   3508				mlog_errno(ret);
   3509				break;
   3510			}
   3511		}
   3512
   3513		write_len -= num_clusters;
   3514		cpos += num_clusters;
   3515	}
   3516
   3517	return ret;
   3518}
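/*
 * Illustrative sketch (editor's addition, compiled out): one way a
 * write path could use ocfs2_refcount_cow() to make a byte range safe
 * to overwrite in place.  The helper name and the UINT_MAX bound are
 * assumptions for illustration only; the real callers live elsewhere
 * in ocfs2.
 */
#if 0
static int example_cow_for_write(struct inode *inode,
				 struct buffer_head *di_bh,
				 loff_t pos, size_t count)
{
	u32 cpos = ocfs2_bytes_to_clusters(inode->i_sb, pos);
	u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb,
						pos + count) - cpos;

	/* On success, [cpos, cpos + clusters) is safe to modify. */
	return ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
}
#endif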
   3519
   3520static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
   3521					  u32 v_cluster, u32 *p_cluster,
   3522					  u32 *num_clusters,
   3523					  unsigned int *extent_flags)
   3524{
   3525	struct inode *inode = context->inode;
   3526	struct ocfs2_xattr_value_root *xv = context->cow_object;
   3527
   3528	return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
   3529					num_clusters, &xv->xr_list,
   3530					extent_flags);
   3531}
   3532
   3533/*
   3534 * Given an xattr value root, calculate the maximum meta/credits we need
   3535 * for the refcount tree change if we truncate it to 0.
   3536 */
   3537int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
   3538				       struct ocfs2_caching_info *ref_ci,
   3539				       struct buffer_head *ref_root_bh,
   3540				       struct ocfs2_xattr_value_root *xv,
   3541				       int *meta_add, int *credits)
   3542{
   3543	int ret = 0, index, ref_blocks = 0;
   3544	u32 p_cluster, num_clusters;
   3545	u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
   3546	struct ocfs2_refcount_block *rb;
   3547	struct ocfs2_refcount_rec rec;
   3548	struct buffer_head *ref_leaf_bh = NULL;
   3549
   3550	while (cpos < clusters) {
   3551		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
   3552					       &num_clusters, &xv->xr_list,
   3553					       NULL);
   3554		if (ret) {
   3555			mlog_errno(ret);
   3556			goto out;
   3557		}
   3558
   3559		cpos += num_clusters;
   3560
   3561		while (num_clusters) {
   3562			ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
   3563						     p_cluster, num_clusters,
   3564						     &rec, &index,
   3565						     &ref_leaf_bh);
   3566			if (ret) {
   3567				mlog_errno(ret);
   3568				goto out;
   3569			}
   3570
   3571			BUG_ON(!rec.r_refcount);
   3572
   3573			rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
   3574
   3575			/*
   3576			 * We don't know whether the other clusters are in
   3577			 * this refcount block, so assume the worst case:
   3578			 * all of them live in this block and each one
   3579			 * splits a refcount rec, so in total we may need
   3580			 * clusters * 2 new refcount recs.
   3581			 */
   3582			if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
   3583			    le16_to_cpu(rb->rf_records.rl_count))
   3584				ref_blocks++;
   3585
   3586			*credits += 1;
   3587			brelse(ref_leaf_bh);
   3588			ref_leaf_bh = NULL;
   3589
   3590			if (num_clusters <= le32_to_cpu(rec.r_clusters))
   3591				break;
   3592			else
   3593				num_clusters -= le32_to_cpu(rec.r_clusters);
   3594			p_cluster += num_clusters;
   3595		}
   3596	}
   3597
   3598	*meta_add += ref_blocks;
   3599	if (!ref_blocks)
   3600		goto out;
   3601
   3602	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
   3603	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
   3604		*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
   3605	else {
   3606		struct ocfs2_extent_tree et;
   3607
   3608		ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
   3609		*credits += ocfs2_calc_extend_credits(inode->i_sb,
   3610						      et.et_root_el);
   3611	}
   3612
   3613out:
   3614	brelse(ref_leaf_bh);
   3615	return ret;
   3616}
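/*
 * Illustrative sketch (editor's addition, compiled out): a
 * hypothetical consumer of the estimate above, pairing
 * ocfs2_refcounted_xattr_delete_need() with the reservation and
 * transaction helpers already used in this file.  The real consumer
 * is presumably the xattr removal path in xattr.c.
 */
#if 0
static int example_reserve_for_xattr_delete(struct inode *inode,
		struct ocfs2_caching_info *ref_ci,
		struct buffer_head *ref_root_bh,
		struct ocfs2_xattr_value_root *xv,
		struct ocfs2_alloc_context **meta_ac,
		handle_t **handle)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	int meta_add = 0, credits = 0, ret;

	ret = ocfs2_refcounted_xattr_delete_need(inode, ref_ci,
						 ref_root_bh, xv,
						 &meta_add, &credits);
	if (ret)
		return ret;

	if (meta_add) {
		ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
							meta_ac);
		if (ret)
			return ret;
	}

	*handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(*handle)) {
		ret = PTR_ERR(*handle);
		if (*meta_ac) {
			ocfs2_free_alloc_context(*meta_ac);
			*meta_ac = NULL;
		}
		return ret;
	}
	return 0;
}
#endif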
   3617
   3618/*
   3619 * Do CoW for xattr.
   3620 */
   3621int ocfs2_refcount_cow_xattr(struct inode *inode,
   3622			     struct ocfs2_dinode *di,
   3623			     struct ocfs2_xattr_value_buf *vb,
   3624			     struct ocfs2_refcount_tree *ref_tree,
   3625			     struct buffer_head *ref_root_bh,
   3626			     u32 cpos, u32 write_len,
   3627			     struct ocfs2_post_refcount *post)
   3628{
   3629	int ret;
   3630	struct ocfs2_xattr_value_root *xv = vb->vb_xv;
   3631	struct ocfs2_cow_context *context = NULL;
   3632	u32 cow_start, cow_len;
   3633
   3634	BUG_ON(!ocfs2_is_refcount_inode(inode));
   3635
   3636	ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
   3637					      cpos, write_len, UINT_MAX,
   3638					      &cow_start, &cow_len);
   3639	if (ret) {
   3640		mlog_errno(ret);
   3641		goto out;
   3642	}
   3643
   3644	BUG_ON(cow_len == 0);
   3645
   3646	context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
   3647	if (!context) {
   3648		ret = -ENOMEM;
   3649		mlog_errno(ret);
   3650		goto out;
   3651	}
   3652
   3653	context->inode = inode;
   3654	context->cow_start = cow_start;
   3655	context->cow_len = cow_len;
   3656	context->ref_tree = ref_tree;
   3657	context->ref_root_bh = ref_root_bh;
   3658	context->cow_object = xv;
   3659
   3660	context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
   3661	/* duplicate_clusters_by_jbd journals each block it copies, so reserve one credit per block in the CoW range. */
   3662	context->extra_credits =
   3663		ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
   3664	context->get_clusters = ocfs2_xattr_value_get_clusters;
   3665	context->post_refcount = post;
   3666
   3667	ocfs2_init_xattr_value_extent_tree(&context->data_et,
   3668					   INODE_CACHE(inode), vb);
   3669
   3670	ret = ocfs2_replace_cow(context);
   3671	if (ret)
   3672		mlog_errno(ret);
   3673
   3674out:
   3675	kfree(context);
   3676	return ret;
   3677}
   3678
   3679/*
   3680 * Insert a new extent into the refcount tree and mark an extent
   3681 * record as refcounted in the dinode tree.
   3682 */
   3683int ocfs2_add_refcount_flag(struct inode *inode,
   3684			    struct ocfs2_extent_tree *data_et,
   3685			    struct ocfs2_caching_info *ref_ci,
   3686			    struct buffer_head *ref_root_bh,
   3687			    u32 cpos, u32 p_cluster, u32 num_clusters,
   3688			    struct ocfs2_cached_dealloc_ctxt *dealloc,
   3689			    struct ocfs2_post_refcount *post)
   3690{
   3691	int ret;
   3692	handle_t *handle;
   3693	int credits = 1, ref_blocks = 0;
   3694	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
   3695	struct ocfs2_alloc_context *meta_ac = NULL;
   3696
   3697	/* We need to be able to handle at least an extent tree split. */
   3698	ref_blocks = ocfs2_extend_meta_needed(data_et->et_root_el);
   3699
   3700	ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
   3701					       ref_ci, ref_root_bh,
   3702					       p_cluster, num_clusters,
   3703					       &ref_blocks, &credits);
   3704	if (ret) {
   3705		mlog_errno(ret);
   3706		goto out;
   3707	}
   3708
   3709	trace_ocfs2_add_refcount_flag(ref_blocks, credits);
   3710
   3711	if (ref_blocks) {
   3712		ret = ocfs2_reserve_new_metadata_blocks(osb,
   3713							ref_blocks, &meta_ac);
   3714		if (ret) {
   3715			mlog_errno(ret);
   3716			goto out;
   3717		}
   3718	}
   3719
   3720	if (post)
   3721		credits += post->credits;
   3722
   3723	handle = ocfs2_start_trans(osb, credits);
   3724	if (IS_ERR(handle)) {
   3725		ret = PTR_ERR(handle);
   3726		mlog_errno(ret);
   3727		goto out;
   3728	}
   3729
   3730	ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
   3731					   cpos, num_clusters, p_cluster,
   3732					   meta_ac, dealloc);
   3733	if (ret) {
   3734		mlog_errno(ret);
   3735		goto out_commit;
   3736	}
   3737
   3738	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
   3739					p_cluster, num_clusters, 0,
   3740					meta_ac, dealloc);
   3741	if (ret) {
   3742		mlog_errno(ret);
   3743		goto out_commit;
   3744	}
   3745
   3746	if (post && post->func) {
   3747		ret = post->func(inode, handle, post->para);
   3748		if (ret)
   3749			mlog_errno(ret);
   3750	}
   3751
   3752out_commit:
   3753	ocfs2_commit_trans(osb, handle);
   3754out:
   3755	if (meta_ac)
   3756		ocfs2_free_alloc_context(meta_ac);
   3757	return ret;
   3758}
   3759
   3760static int ocfs2_change_ctime(struct inode *inode,
   3761			      struct buffer_head *di_bh)
   3762{
   3763	int ret;
   3764	handle_t *handle;
   3765	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
   3766
   3767	handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
   3768				   OCFS2_INODE_UPDATE_CREDITS);
   3769	if (IS_ERR(handle)) {
   3770		ret = PTR_ERR(handle);
   3771		mlog_errno(ret);
   3772		goto out;
   3773	}
   3774
   3775	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
   3776				      OCFS2_JOURNAL_ACCESS_WRITE);
   3777	if (ret) {
   3778		mlog_errno(ret);
   3779		goto out_commit;
   3780	}
   3781
   3782	inode->i_ctime = current_time(inode);
   3783	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
   3784	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
   3785
   3786	ocfs2_journal_dirty(handle, di_bh);
   3787
   3788out_commit:
   3789	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
   3790out:
   3791	return ret;
   3792}
   3793
   3794static int ocfs2_attach_refcount_tree(struct inode *inode,
   3795				      struct buffer_head *di_bh)
   3796{
   3797	int ret, data_changed = 0;
   3798	struct buffer_head *ref_root_bh = NULL;
   3799	struct ocfs2_inode_info *oi = OCFS2_I(inode);
   3800	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
   3801	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
   3802	struct ocfs2_refcount_tree *ref_tree;
   3803	unsigned int ext_flags;
   3804	loff_t size;
   3805	u32 cpos, num_clusters, clusters, p_cluster;
   3806	struct ocfs2_cached_dealloc_ctxt dealloc;
   3807	struct ocfs2_extent_tree di_et;
   3808
   3809	ocfs2_init_dealloc_ctxt(&dealloc);
   3810
   3811	if (!ocfs2_is_refcount_inode(inode)) {
   3812		ret = ocfs2_create_refcount_tree(inode, di_bh);
   3813		if (ret) {
   3814			mlog_errno(ret);
   3815			goto out;
   3816		}
   3817	}
   3818
   3819	BUG_ON(!di->i_refcount_loc);
   3820	ret = ocfs2_lock_refcount_tree(osb,
   3821				       le64_to_cpu(di->i_refcount_loc), 1,
   3822				       &ref_tree, &ref_root_bh);
   3823	if (ret) {
   3824		mlog_errno(ret);
   3825		goto out;
   3826	}
   3827
   3828	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
   3829		goto attach_xattr;
   3830
   3831	ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
   3832
   3833	size = i_size_read(inode);
   3834	clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
   3835
   3836	cpos = 0;
   3837	while (cpos < clusters) {
   3838		ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
   3839					 &num_clusters, &ext_flags);
   3840		if (ret) {
   3841			mlog_errno(ret);
   3842			goto unlock;
   3843		}
   3844		if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
   3845			ret = ocfs2_add_refcount_flag(inode, &di_et,
   3846						      &ref_tree->rf_ci,
   3847						      ref_root_bh, cpos,
   3848						      p_cluster, num_clusters,
   3849						      &dealloc, NULL);
   3850			if (ret) {
   3851				mlog_errno(ret);
   3852				goto unlock;
   3853			}
   3854
   3855			data_changed = 1;
   3856		}
   3857		cpos += num_clusters;
   3858	}
   3859
   3860attach_xattr:
   3861	if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
   3862		ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
   3863						       &ref_tree->rf_ci,
   3864						       ref_root_bh,
   3865						       &dealloc);
   3866		if (ret) {
   3867			mlog_errno(ret);
   3868			goto unlock;
   3869		}
   3870	}
   3871
   3872	if (data_changed) {
   3873		ret = ocfs2_change_ctime(inode, di_bh);
   3874		if (ret)
   3875			mlog_errno(ret);
   3876	}
   3877
   3878unlock:
   3879	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
   3880	brelse(ref_root_bh);
   3881
   3882	if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
   3883		ocfs2_schedule_truncate_log_flush(osb, 1);
   3884		ocfs2_run_deallocs(osb, &dealloc);
   3885	}
   3886out:
   3887	/*
   3888	 * Empty the extent map so that we may get the right extent
   3889	 * record from the disk.
   3890	 */
   3891	ocfs2_extent_map_trunc(inode, 0);
   3892
   3893	return ret;
   3894}
   3895
   3896static int ocfs2_add_refcounted_extent(struct inode *inode,
   3897				   struct ocfs2_extent_tree *et,
   3898				   struct ocfs2_caching_info *ref_ci,
   3899				   struct buffer_head *ref_root_bh,
   3900				   u32 cpos, u32 p_cluster, u32 num_clusters,
   3901				   unsigned int ext_flags,
   3902				   struct ocfs2_cached_dealloc_ctxt *dealloc)
   3903{
   3904	int ret;
   3905	handle_t *handle;
   3906	int credits = 0;
   3907	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
   3908	struct ocfs2_alloc_context *meta_ac = NULL;
   3909
   3910	ret = ocfs2_lock_refcount_allocators(inode->i_sb,
   3911					     p_cluster, num_clusters,
   3912					     et, ref_ci,
   3913					     ref_root_bh, &meta_ac,
   3914					     NULL, &credits);
   3915	if (ret) {
   3916		mlog_errno(ret);
   3917		goto out;
   3918	}
   3919
   3920	handle = ocfs2_start_trans(osb, credits);
   3921	if (IS_ERR(handle)) {
   3922		ret = PTR_ERR(handle);
   3923		mlog_errno(ret);
   3924		goto out;
   3925	}
   3926
   3927	ret = ocfs2_insert_extent(handle, et, cpos,
   3928			ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
   3929			num_clusters, ext_flags, meta_ac);
   3930	if (ret) {
   3931		mlog_errno(ret);
   3932		goto out_commit;
   3933	}
   3934
   3935	ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
   3936				      p_cluster, num_clusters,
   3937				      meta_ac, dealloc);
   3938	if (ret) {
   3939		mlog_errno(ret);
   3940		goto out_commit;
   3941	}
   3942
   3943	ret = dquot_alloc_space_nodirty(inode,
   3944		ocfs2_clusters_to_bytes(osb->sb, num_clusters));
   3945	if (ret)
   3946		mlog_errno(ret);
   3947
   3948out_commit:
   3949	ocfs2_commit_trans(osb, handle);
   3950out:
   3951	if (meta_ac)
   3952		ocfs2_free_alloc_context(meta_ac);
   3953	return ret;
   3954}
   3955
   3956static int ocfs2_duplicate_inline_data(struct inode *s_inode,
   3957				       struct buffer_head *s_bh,
   3958				       struct inode *t_inode,
   3959				       struct buffer_head *t_bh)
   3960{
   3961	int ret;
   3962	handle_t *handle;
   3963	struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
   3964	struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
   3965	struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;
   3966
   3967	BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
   3968
   3969	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
   3970	if (IS_ERR(handle)) {
   3971		ret = PTR_ERR(handle);
   3972		mlog_errno(ret);
   3973		goto out;
   3974	}
   3975
   3976	ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
   3977				      OCFS2_JOURNAL_ACCESS_WRITE);
   3978	if (ret) {
   3979		mlog_errno(ret);
   3980		goto out_commit;
   3981	}
   3982
   3983	t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
   3984	memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
   3985	       le16_to_cpu(s_di->id2.i_data.id_count));
   3986	spin_lock(&OCFS2_I(t_inode)->ip_lock);
   3987	OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
   3988	t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
   3989	spin_unlock(&OCFS2_I(t_inode)->ip_lock);
   3990
   3991	ocfs2_journal_dirty(handle, t_bh);
   3992
   3993out_commit:
   3994	ocfs2_commit_trans(osb, handle);
   3995out:
   3996	return ret;
   3997}
   3998
   3999static int ocfs2_duplicate_extent_list(struct inode *s_inode,
   4000				struct inode *t_inode,
   4001				struct buffer_head *t_bh,
   4002				struct ocfs2_caching_info *ref_ci,
   4003				struct buffer_head *ref_root_bh,
   4004				struct ocfs2_cached_dealloc_ctxt *dealloc)
   4005{
   4006	int ret = 0;
   4007	u32 p_cluster, num_clusters, clusters, cpos;
   4008	loff_t size;
   4009	unsigned int ext_flags;
   4010	struct ocfs2_extent_tree et;
   4011
   4012	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
   4013
   4014	size = i_size_read(s_inode);
   4015	clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
   4016
   4017	cpos = 0;
   4018	while (cpos < clusters) {
   4019		ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
   4020					 &num_clusters, &ext_flags);
   4021		if (ret) {
   4022			mlog_errno(ret);
   4023			goto out;
   4024		}
   4025		if (p_cluster) {
   4026			ret = ocfs2_add_refcounted_extent(t_inode, &et,
   4027							  ref_ci, ref_root_bh,
   4028							  cpos, p_cluster,
   4029							  num_clusters,
   4030							  ext_flags,
   4031							  dealloc);
   4032			if (ret) {
   4033				mlog_errno(ret);
   4034				goto out;
   4035			}
   4036		}
   4037
   4038		cpos += num_clusters;
   4039	}
   4040
   4041out:
   4042	return ret;
   4043}
   4044
   4045/*
   4046 * Change the new file's attributes to match the source.
   4047 *
   4048 * reflink creates a snapshot of a file, which means the attributes
   4049 * must be identical except for three fields: nlink, ino, and ctime.
   4050 */
   4051static int ocfs2_complete_reflink(struct inode *s_inode,
   4052				  struct buffer_head *s_bh,
   4053				  struct inode *t_inode,
   4054				  struct buffer_head *t_bh,
   4055				  bool preserve)
   4056{
   4057	int ret;
   4058	handle_t *handle;
   4059	struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
   4060	struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
   4061	loff_t size = i_size_read(s_inode);
   4062
   4063	handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
   4064				   OCFS2_INODE_UPDATE_CREDITS);
   4065	if (IS_ERR(handle)) {
   4066		ret = PTR_ERR(handle);
   4067		mlog_errno(ret);
   4068		return ret;
   4069	}
   4070
   4071	ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
   4072				      OCFS2_JOURNAL_ACCESS_WRITE);
   4073	if (ret) {
   4074		mlog_errno(ret);
   4075		goto out_commit;
   4076	}
   4077
   4078	spin_lock(&OCFS2_I(t_inode)->ip_lock);
   4079	OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
   4080	OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
   4081	OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
   4082	spin_unlock(&OCFS2_I(t_inode)->ip_lock);
   4083	i_size_write(t_inode, size);
   4084	t_inode->i_blocks = s_inode->i_blocks;
   4085
   4086	di->i_xattr_inline_size = s_di->i_xattr_inline_size;
   4087	di->i_clusters = s_di->i_clusters;
   4088	di->i_size = s_di->i_size;
   4089	di->i_dyn_features = s_di->i_dyn_features;
   4090	di->i_attr = s_di->i_attr;
   4091
   4092	if (preserve) {
   4093		t_inode->i_uid = s_inode->i_uid;
   4094		t_inode->i_gid = s_inode->i_gid;
   4095		t_inode->i_mode = s_inode->i_mode;
   4096		di->i_uid = s_di->i_uid;
   4097		di->i_gid = s_di->i_gid;
   4098		di->i_mode = s_di->i_mode;
   4099
   4100		/*
   4101		 * Update the timestamps: mtime should appear
   4102		 * identical to the source, while ctime is set to
   4103		 * the current time.
   4104		 */
   4105		t_inode->i_ctime = current_time(t_inode);
   4106
   4107		di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
   4108		di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
   4109
   4110		t_inode->i_mtime = s_inode->i_mtime;
   4111		di->i_mtime = s_di->i_mtime;
   4112		di->i_mtime_nsec = s_di->i_mtime_nsec;
   4113	}
   4114
   4115	ocfs2_journal_dirty(handle, t_bh);
   4116
   4117out_commit:
   4118	ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
   4119	return ret;
   4120}
   4121
   4122static int ocfs2_create_reflink_node(struct inode *s_inode,
   4123				     struct buffer_head *s_bh,
   4124				     struct inode *t_inode,
   4125				     struct buffer_head *t_bh,
   4126				     bool preserve)
   4127{
   4128	int ret;
   4129	struct buffer_head *ref_root_bh = NULL;
   4130	struct ocfs2_cached_dealloc_ctxt dealloc;
   4131	struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
   4132	struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
   4133	struct ocfs2_refcount_tree *ref_tree;
   4134
   4135	ocfs2_init_dealloc_ctxt(&dealloc);
   4136
   4137	ret = ocfs2_set_refcount_tree(t_inode, t_bh,
   4138				      le64_to_cpu(di->i_refcount_loc));
   4139	if (ret) {
   4140		mlog_errno(ret);
   4141		goto out;
   4142	}
   4143
   4144	if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
   4145		ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
   4146						  t_inode, t_bh);
   4147		if (ret)
   4148			mlog_errno(ret);
   4149		goto out;
   4150	}
   4151
   4152	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
   4153				       1, &ref_tree, &ref_root_bh);
   4154	if (ret) {
   4155		mlog_errno(ret);
   4156		goto out;
   4157	}
   4158
   4159	ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
   4160					  &ref_tree->rf_ci, ref_root_bh,
   4161					  &dealloc);
   4162	if (ret) {
   4163		mlog_errno(ret);
   4164		goto out_unlock_refcount;
   4165	}
   4166
   4167out_unlock_refcount:
   4168	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
   4169	brelse(ref_root_bh);
   4170out:
   4171	if (ocfs2_dealloc_has_cluster(&dealloc)) {
   4172		ocfs2_schedule_truncate_log_flush(osb, 1);
   4173		ocfs2_run_deallocs(osb, &dealloc);
   4174	}
   4175
   4176	return ret;
   4177}
   4178
   4179static int __ocfs2_reflink(struct dentry *old_dentry,
   4180			   struct buffer_head *old_bh,
   4181			   struct inode *new_inode,
   4182			   bool preserve)
   4183{
   4184	int ret;
   4185	struct inode *inode = d_inode(old_dentry);
   4186	struct buffer_head *new_bh = NULL;
   4187
   4188	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
   4189		ret = -EINVAL;
   4190		mlog_errno(ret);
   4191		goto out;
   4192	}
   4193
   4194	ret = filemap_fdatawrite(inode->i_mapping);
   4195	if (ret) {
   4196		mlog_errno(ret);
   4197		goto out;
   4198	}
   4199
   4200	ret = ocfs2_attach_refcount_tree(inode, old_bh);
   4201	if (ret) {
   4202		mlog_errno(ret);
   4203		goto out;
   4204	}
   4205
   4206	inode_lock_nested(new_inode, I_MUTEX_CHILD);
   4207	ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
   4208				      OI_LS_REFLINK_TARGET);
   4209	if (ret) {
   4210		mlog_errno(ret);
   4211		goto out_unlock;
   4212	}
   4213
   4214	ret = ocfs2_create_reflink_node(inode, old_bh,
   4215					new_inode, new_bh, preserve);
   4216	if (ret) {
   4217		mlog_errno(ret);
   4218		goto inode_unlock;
   4219	}
   4220
   4221	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
   4222		ret = ocfs2_reflink_xattrs(inode, old_bh,
   4223					   new_inode, new_bh,
   4224					   preserve);
   4225		if (ret) {
   4226			mlog_errno(ret);
   4227			goto inode_unlock;
   4228		}
   4229	}
   4230
   4231	ret = ocfs2_complete_reflink(inode, old_bh,
   4232				     new_inode, new_bh, preserve);
   4233	if (ret)
   4234		mlog_errno(ret);
   4235
   4236inode_unlock:
   4237	ocfs2_inode_unlock(new_inode, 1);
   4238	brelse(new_bh);
   4239out_unlock:
   4240	inode_unlock(new_inode);
   4241out:
   4242	if (!ret) {
   4243		ret = filemap_fdatawait(inode->i_mapping);
   4244		if (ret)
   4245			mlog_errno(ret);
   4246	}
   4247	return ret;
   4248}
   4249
   4250static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
   4251			 struct dentry *new_dentry, bool preserve)
   4252{
   4253	int error, had_lock;
   4254	struct inode *inode = d_inode(old_dentry);
   4255	struct buffer_head *old_bh = NULL;
   4256	struct inode *new_orphan_inode = NULL;
   4257	struct ocfs2_lock_holder oh;
   4258
   4259	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
   4260		return -EOPNOTSUPP;
   4261
   4262
   4263	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
   4264					     &new_orphan_inode);
   4265	if (error) {
   4266		mlog_errno(error);
   4267		goto out;
   4268	}
   4269
   4270	error = ocfs2_rw_lock(inode, 1);
   4271	if (error) {
   4272		mlog_errno(error);
   4273		goto out;
   4274	}
   4275
   4276	error = ocfs2_inode_lock(inode, &old_bh, 1);
   4277	if (error) {
   4278		mlog_errno(error);
   4279		ocfs2_rw_unlock(inode, 1);
   4280		goto out;
   4281	}
   4282
   4283	down_write(&OCFS2_I(inode)->ip_xattr_sem);
   4284	down_write(&OCFS2_I(inode)->ip_alloc_sem);
   4285	error = __ocfs2_reflink(old_dentry, old_bh,
   4286				new_orphan_inode, preserve);
   4287	up_write(&OCFS2_I(inode)->ip_alloc_sem);
   4288	up_write(&OCFS2_I(inode)->ip_xattr_sem);
   4289
   4290	ocfs2_inode_unlock(inode, 1);
   4291	ocfs2_rw_unlock(inode, 1);
   4292	brelse(old_bh);
   4293
   4294	if (error) {
   4295		mlog_errno(error);
   4296		goto out;
   4297	}
   4298
   4299	had_lock = ocfs2_inode_lock_tracker(new_orphan_inode, NULL, 1,
   4300					    &oh);
   4301	if (had_lock < 0) {
   4302		error = had_lock;
   4303		mlog_errno(error);
   4304		goto out;
   4305	}
   4306
   4307	/* If security isn't preserved, re-initialize the security and ACL. */
   4308	if (!preserve) {
   4309		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
   4310						    &new_dentry->d_name);
   4311		if (error)
   4312			mlog_errno(error);
   4313	}
   4314	if (!error) {
   4315		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
   4316						       new_dentry);
   4317		if (error)
   4318			mlog_errno(error);
   4319	}
   4320	ocfs2_inode_unlock_tracker(new_orphan_inode, 1, &oh, had_lock);
   4321
   4322out:
   4323	if (new_orphan_inode) {
   4324		/*
   4325		 * We must open_unlock the inode whether we succeeded
   4326		 * or not, so that other nodes can delete it later.
   4327		 */
   4328		ocfs2_open_unlock(new_orphan_inode);
   4329		if (error)
   4330			iput(new_orphan_inode);
   4331	}
   4332
   4333	return error;
   4334}
   4335
   4336/*
   4337 * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
   4338 * sys_reflink().  This will go away when vfs_reflink() exists in
   4339 * fs/namei.c.
   4340 */
   4341
   4342/* copied from may_create in VFS. */
   4343static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
   4344{
   4345	if (d_really_is_positive(child))
   4346		return -EEXIST;
   4347	if (IS_DEADDIR(dir))
   4348		return -ENOENT;
   4349	return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
   4350}
   4351
   4352/**
   4353 * ocfs2_vfs_reflink - Create a reference-counted link
   4354 *
   4355 * @old_dentry:        source dentry + inode
   4356 * @dir:       directory to create the target
   4357 * @new_dentry:        target dentry
   4358 * @preserve:  if true, preserve all file attributes
   4359 */
   4360static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
   4361			     struct dentry *new_dentry, bool preserve)
   4362{
   4363	struct inode *inode = d_inode(old_dentry);
   4364	int error;
   4365
   4366	if (!inode)
   4367		return -ENOENT;
   4368
   4369	error = ocfs2_may_create(dir, new_dentry);
   4370	if (error)
   4371		return error;
   4372
   4373	if (dir->i_sb != inode->i_sb)
   4374		return -EXDEV;
   4375
   4376	/*
   4377	 * A reflink to an append-only or immutable file cannot be created.
   4378	 */
   4379	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
   4380		return -EPERM;
   4381
   4382	/* Only regular files can be reflinked. */
   4383	if (!S_ISREG(inode->i_mode))
   4384		return -EPERM;
   4385
   4386	/*
   4387 * If the caller wants to preserve ownership, they must have the
   4388 * rights to do so.
   4389	 */
   4390	if (preserve) {
   4391		if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_CHOWN))
   4392			return -EPERM;
   4393		if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
   4394			return -EPERM;
   4395	}
   4396
   4397	/*
   4398	 * If the caller is modifying any aspect of the attributes, they
   4399	 * are not creating a snapshot.  They need read permission on the
   4400	 * file.
   4401	 */
   4402	if (!preserve) {
   4403		error = inode_permission(&init_user_ns, inode, MAY_READ);
   4404		if (error)
   4405			return error;
   4406	}
   4407
   4408	inode_lock(inode);
   4409	error = dquot_initialize(dir);
   4410	if (!error)
   4411		error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
   4412	inode_unlock(inode);
   4413	if (!error)
   4414		fsnotify_create(dir, new_dentry);
   4415	return error;
   4416}
   4417/*
   4418 * Most of this code is copied from sys_linkat.
   4419 */
   4420int ocfs2_reflink_ioctl(struct inode *inode,
   4421			const char __user *oldname,
   4422			const char __user *newname,
   4423			bool preserve)
   4424{
   4425	struct dentry *new_dentry;
   4426	struct path old_path, new_path;
   4427	int error;
   4428
   4429	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
   4430		return -EOPNOTSUPP;
   4431
   4432	error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
   4433	if (error) {
   4434		mlog_errno(error);
   4435		return error;
   4436	}
   4437
   4438	new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0);
   4439	error = PTR_ERR(new_dentry);
   4440	if (IS_ERR(new_dentry)) {
   4441		mlog_errno(error);
   4442		goto out;
   4443	}
   4444
   4445	error = -EXDEV;
   4446	if (old_path.mnt != new_path.mnt) {
   4447		mlog_errno(error);
   4448		goto out_dput;
   4449	}
   4450
   4451	error = ocfs2_vfs_reflink(old_path.dentry,
   4452				  d_inode(new_path.dentry),
   4453				  new_dentry, preserve);
   4454out_dput:
   4455	done_path_create(&new_path, new_dentry);
   4456out:
   4457	path_put(&old_path);
   4458
   4459	return error;
   4460}
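/*
 * Illustrative sketch (editor's addition, compiled out): how userspace
 * reaches ocfs2_reflink_ioctl().  This assumes the OCFS2_IOC_REFLINK
 * definition and struct reflink_arguments (old_path/new_path/preserve
 * as __u64 fields) from ocfs2_ioctl.h; check that header before
 * relying on the exact layout.
 */
#if 0
#include <stdint.h>
#include <sys/ioctl.h>

static int example_reflink(int fd, const char *oldpath,
			   const char *newpath, int preserve)
{
	struct reflink_arguments args = {
		.old_path = (uintptr_t)oldpath,
		.new_path = (uintptr_t)newpath,
		.preserve = preserve,
	};

	/* fd may refer to any file on the mounted ocfs2 volume. */
	return ioctl(fd, OCFS2_IOC_REFLINK, &args);
}
#endif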
   4461
   4462/* Update destination inode size, if necessary. */
   4463int ocfs2_reflink_update_dest(struct inode *dest,
   4464			      struct buffer_head *d_bh,
   4465			      loff_t newlen)
   4466{
   4467	handle_t *handle;
   4468	int ret;
   4469
   4470	dest->i_blocks = ocfs2_inode_sector_count(dest);
   4471
   4472	if (newlen <= i_size_read(dest))
   4473		return 0;
   4474
   4475	handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
   4476				   OCFS2_INODE_UPDATE_CREDITS);
   4477	if (IS_ERR(handle)) {
   4478		ret = PTR_ERR(handle);
   4479		mlog_errno(ret);
   4480		return ret;
   4481	}
   4482
   4483	/* Extend i_size if needed. */
   4484	spin_lock(&OCFS2_I(dest)->ip_lock);
   4485	if (newlen > i_size_read(dest))
   4486		i_size_write(dest, newlen);
   4487	spin_unlock(&OCFS2_I(dest)->ip_lock);
   4488	dest->i_ctime = dest->i_mtime = current_time(dest);
   4489
   4490	ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
   4491	if (ret) {
   4492		mlog_errno(ret);
   4493		goto out_commit;
   4494	}
   4495
   4496out_commit:
   4497	ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
   4498	return ret;
   4499}
   4500
   4501/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
   4502static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode,
   4503					 struct buffer_head *s_bh,
   4504					 loff_t pos_in,
   4505					 struct inode *t_inode,
   4506					 struct buffer_head *t_bh,
   4507					 loff_t pos_out,
   4508					 loff_t len,
   4509					 struct ocfs2_cached_dealloc_ctxt *dealloc)
   4510{
   4511	struct ocfs2_extent_tree s_et;
   4512	struct ocfs2_extent_tree t_et;
   4513	struct ocfs2_dinode *dis;
   4514	struct buffer_head *ref_root_bh = NULL;
   4515	struct ocfs2_refcount_tree *ref_tree;
   4516	struct ocfs2_super *osb;
   4517	loff_t remapped_bytes = 0;
   4518	loff_t pstart, plen;
   4519	u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0;
   4520	unsigned int ext_flags;
   4521	int ret = 0;
   4522
   4523	osb = OCFS2_SB(s_inode->i_sb);
   4524	dis = (struct ocfs2_dinode *)s_bh->b_data;
   4525	ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
   4526	ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
   4527
   4528	spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
   4529	tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
   4530	slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
   4531
   4532	while (spos < slast) {
   4533		if (fatal_signal_pending(current)) {
   4534			ret = -EINTR;
   4535			goto out;
   4536		}
   4537
   4538		/* Look up the extent. */
   4539		ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
   4540					 &num_clusters, &ext_flags);
   4541		if (ret) {
   4542			mlog_errno(ret);
   4543			goto out;
   4544		}
   4545
   4546		num_clusters = min_t(u32, num_clusters, slast - spos);
   4547
   4548		/* Punch out the dest range. */
   4549		pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
   4550		plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
   4551		ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
   4552		if (ret) {
   4553			mlog_errno(ret);
   4554			goto out;
   4555		}
   4556
   4557		if (p_cluster == 0)
   4558			goto next_loop;
   4559
   4560		/* Lock the refcount btree... */
   4561		ret = ocfs2_lock_refcount_tree(osb,
   4562					       le64_to_cpu(dis->i_refcount_loc),
   4563					       1, &ref_tree, &ref_root_bh);
   4564		if (ret) {
   4565			mlog_errno(ret);
   4566			goto out;
   4567		}
   4568
   4569		/* Mark s_inode's extent as refcounted. */
   4570		if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
   4571			ret = ocfs2_add_refcount_flag(s_inode, &s_et,
   4572						      &ref_tree->rf_ci,
   4573						      ref_root_bh, spos,
   4574						      p_cluster, num_clusters,
   4575						      dealloc, NULL);
   4576			if (ret) {
   4577				mlog_errno(ret);
   4578				goto out_unlock_refcount;
   4579			}
   4580		}
   4581
   4582		/* Map in the new extent. */
   4583		ext_flags |= OCFS2_EXT_REFCOUNTED;
   4584		ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
   4585						  &ref_tree->rf_ci,
   4586						  ref_root_bh,
   4587						  tpos, p_cluster,
   4588						  num_clusters,
   4589						  ext_flags,
   4590						  dealloc);
   4591		if (ret) {
   4592			mlog_errno(ret);
   4593			goto out_unlock_refcount;
   4594		}
   4595
   4596		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
   4597		brelse(ref_root_bh);
   4598next_loop:
   4599		spos += num_clusters;
   4600		tpos += num_clusters;
   4601		remapped_clus += num_clusters;
   4602	}
   4603
   4604	goto out;
   4605out_unlock_refcount:
   4606	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
   4607	brelse(ref_root_bh);
   4608out:
   4609	remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus);
   4610	remapped_bytes = min_t(loff_t, len, remapped_bytes);
   4611
   4612	return remapped_bytes > 0 ? remapped_bytes : ret;
   4613}
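/*
 * Note (editor's addition): ocfs2_reflink_remap_extent() above takes
 * and drops the refcount tree lock once per source extent rather than
 * holding it across the whole range, and on failure it reports the
 * bytes already remapped (if any) instead of unwinding them.
 */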
   4614
   4615/* Set up refcount tree and remap s_inode to t_inode. */
   4616loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
   4617				  struct buffer_head *s_bh,
   4618				  loff_t pos_in,
   4619				  struct inode *t_inode,
   4620				  struct buffer_head *t_bh,
   4621				  loff_t pos_out,
   4622				  loff_t len)
   4623{
   4624	struct ocfs2_cached_dealloc_ctxt dealloc;
   4625	struct ocfs2_super *osb;
   4626	struct ocfs2_dinode *dis;
   4627	struct ocfs2_dinode *dit;
   4628	loff_t ret;
   4629
   4630	osb = OCFS2_SB(s_inode->i_sb);
   4631	dis = (struct ocfs2_dinode *)s_bh->b_data;
   4632	dit = (struct ocfs2_dinode *)t_bh->b_data;
   4633	ocfs2_init_dealloc_ctxt(&dealloc);
   4634
   4635	/*
   4636	 * If we're reflinking the entire file and the source is inline
   4637	 * data, just copy the contents.
   4638	 */
   4639	if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
   4640	    i_size_read(t_inode) <= len &&
   4641	    (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
   4642		ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
   4643		if (ret)
   4644			mlog_errno(ret);
   4645		goto out;
   4646	}
   4647
   4648	/*
   4649	 * If the two inodes already belong to different refcount trees,
   4650	 * bail out, because we don't know how (and don't want) to merge
   4651	 * refcount trees.
   4652	 */
   4653	ret = -EOPNOTSUPP;
   4654	if (ocfs2_is_refcount_inode(s_inode) &&
   4655	    ocfs2_is_refcount_inode(t_inode) &&
   4656	    le64_to_cpu(dis->i_refcount_loc) !=
   4657	    le64_to_cpu(dit->i_refcount_loc))
   4658		goto out;
   4659
   4660	/* Neither inode has a refcount tree.  Add one to s_inode. */
   4661	if (!ocfs2_is_refcount_inode(s_inode) &&
   4662	    !ocfs2_is_refcount_inode(t_inode)) {
   4663		ret = ocfs2_create_refcount_tree(s_inode, s_bh);
   4664		if (ret) {
   4665			mlog_errno(ret);
   4666			goto out;
   4667		}
   4668	}
   4669
   4670	/* Ensure that both inodes end up with the same refcount tree. */
   4671	if (!ocfs2_is_refcount_inode(s_inode)) {
   4672		ret = ocfs2_set_refcount_tree(s_inode, s_bh,
   4673					      le64_to_cpu(dit->i_refcount_loc));
   4674		if (ret) {
   4675			mlog_errno(ret);
   4676			goto out;
   4677		}
   4678	}
   4679	if (!ocfs2_is_refcount_inode(t_inode)) {
   4680		ret = ocfs2_set_refcount_tree(t_inode, t_bh,
   4681					      le64_to_cpu(dis->i_refcount_loc));
   4682		if (ret) {
   4683			mlog_errno(ret);
   4684			goto out;
   4685		}
   4686	}
   4687
   4688	/* Turn off inline data in the dest file. */
   4689	if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
   4690		ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh);
   4691		if (ret) {
   4692			mlog_errno(ret);
   4693			goto out;
   4694		}
   4695	}
   4696
   4697	/* Actually remap extents now. */
   4698	ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
   4699					 pos_out, len, &dealloc);
   4700	if (ret < 0) {
   4701		mlog_errno(ret);
   4702		goto out;
   4703	}
   4704
   4705out:
   4706	if (ocfs2_dealloc_has_cluster(&dealloc)) {
   4707		ocfs2_schedule_truncate_log_flush(osb, 1);
   4708		ocfs2_run_deallocs(osb, &dealloc);
   4709	}
   4710
   4711	return ret;
   4712}
   4713
   4714/* Lock both inodes and grab a buffer head for each. */
   4715int ocfs2_reflink_inodes_lock(struct inode *s_inode,
   4716			      struct buffer_head **bh_s,
   4717			      struct inode *t_inode,
   4718			      struct buffer_head **bh_t)
   4719{
   4720	struct inode *inode1 = s_inode;
   4721	struct inode *inode2 = t_inode;
   4722	struct ocfs2_inode_info *oi1;
   4723	struct ocfs2_inode_info *oi2;
   4724	struct buffer_head *bh1 = NULL;
   4725	struct buffer_head *bh2 = NULL;
   4726	bool same_inode = (s_inode == t_inode);
   4727	bool need_swap = (inode1->i_ino > inode2->i_ino);
   4728	int status;
   4729
   4730	/* First grab the VFS and rw locks. */
   4731	lock_two_nondirectories(s_inode, t_inode);
   4732	if (need_swap)
   4733		swap(inode1, inode2);
   4734
   4735	status = ocfs2_rw_lock(inode1, 1);
   4736	if (status) {
   4737		mlog_errno(status);
   4738		goto out_i1;
   4739	}
   4740	if (!same_inode) {
   4741		status = ocfs2_rw_lock(inode2, 1);
   4742		if (status) {
   4743			mlog_errno(status);
   4744			goto out_i2;
   4745		}
   4746	}
   4747
   4748	/* Now go for the cluster locks */
   4749	oi1 = OCFS2_I(inode1);
   4750	oi2 = OCFS2_I(inode2);
   4751
   4752	trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
   4753				(unsigned long long)oi2->ip_blkno);
   4754
   4755	/* We always want to lock the one with the lower lockid first. */
   4756	if (oi1->ip_blkno > oi2->ip_blkno)
   4757		mlog_errno(-ENOLCK);
   4758
   4759	/* lock id1 */
   4760	status = ocfs2_inode_lock_nested(inode1, &bh1, 1,
   4761					 OI_LS_REFLINK_TARGET);
   4762	if (status < 0) {
   4763		if (status != -ENOENT)
   4764			mlog_errno(status);
   4765		goto out_rw2;
   4766	}
   4767
   4768	/* lock id2 */
   4769	if (!same_inode) {
   4770		status = ocfs2_inode_lock_nested(inode2, &bh2, 1,
   4771						 OI_LS_REFLINK_TARGET);
   4772		if (status < 0) {
   4773			if (status != -ENOENT)
   4774				mlog_errno(status);
   4775			goto out_cl1;
   4776		}
   4777	} else {
   4778		bh2 = bh1;
   4779	}
   4780
   4781	/*
   4782	 * If we swapped inode order above, we have to swap the buffer heads
   4783	 * before passing them back to the caller.
   4784	 */
   4785	if (need_swap)
   4786		swap(bh1, bh2);
   4787	*bh_s = bh1;
   4788	*bh_t = bh2;
   4789
   4790	trace_ocfs2_double_lock_end(
   4791			(unsigned long long)oi1->ip_blkno,
   4792			(unsigned long long)oi2->ip_blkno);
   4793
   4794	return 0;
   4795
   4796out_cl1:
   4797	ocfs2_inode_unlock(inode1, 1);
   4798	brelse(bh1);
   4799out_rw2:
   4800	ocfs2_rw_unlock(inode2, 1);
   4801out_i2:
   4802	ocfs2_rw_unlock(inode1, 1);
   4803out_i1:
   4804	unlock_two_nondirectories(s_inode, t_inode);
   4805	return status;
   4806}
   4807
   4808/* Unlock both inodes and release buffers. */
   4809void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
   4810				 struct buffer_head *s_bh,
   4811				 struct inode *t_inode,
   4812				 struct buffer_head *t_bh)
   4813{
   4814	ocfs2_inode_unlock(s_inode, 1);
   4815	ocfs2_rw_unlock(s_inode, 1);
   4816	brelse(s_bh);
   4817	if (s_inode != t_inode) {
   4818		ocfs2_inode_unlock(t_inode, 1);
   4819		ocfs2_rw_unlock(t_inode, 1);
   4820		brelse(t_bh);
   4821	}
   4822	unlock_two_nondirectories(s_inode, t_inode);
   4823}
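/*
 * Illustrative sketch (editor's addition, compiled out): how the
 * remap pieces above compose.  This mirrors the shape of the real
 * remap_file_range caller in file.c, but quota handling, page cache
 * flushing and the generic range checks are deliberately omitted.
 */
#if 0
static loff_t example_remap(struct inode *src, struct inode *dest,
			    loff_t pos_in, loff_t pos_out, loff_t len)
{
	struct buffer_head *s_bh = NULL, *t_bh = NULL;
	loff_t remapped;
	int ret;

	ret = ocfs2_reflink_inodes_lock(src, &s_bh, dest, &t_bh);
	if (ret)
		return ret;

	remapped = ocfs2_reflink_remap_blocks(src, s_bh, pos_in,
					      dest, t_bh, pos_out, len);
	if (remapped > 0)
		ret = ocfs2_reflink_update_dest(dest, t_bh,
						pos_out + remapped);
	else
		ret = remapped;

	ocfs2_reflink_inodes_unlock(src, s_bh, dest, t_bh);
	return ret < 0 ? ret : remapped;
}
#endif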