cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

move_extents.c (25704B)
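move_extents.c implements the OCFS2_IOC_MOVE_EXT ioctl, which moves and defragments the extents of regular files on an ocfs2 volume. A userspace sketch of invoking it follows the listing.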


// SPDX-License-Identifier: GPL-2.0-only
/*
 * move_extents.c
 *
 * Copyright (C) 2011 Oracle.  All rights reserved.
 */
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/mount.h>
#include <linux/swap.h>

#include <cluster/masklog.h>

#include "ocfs2.h"
#include "ocfs2_ioctl.h"

#include "alloc.h"
#include "localalloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "journal.h"
#include "suballoc.h"
#include "uptodate.h"
#include "super.h"
#include "dir.h"
#include "buffer_head_io.h"
#include "sysfile.h"
#include "refcounttree.h"
#include "move_extents.h"

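/*
 * Locking order in this file, outermost first, as taken by
 * ocfs2_move_extents() and the per-extent helpers below:
 *
 *   inode_lock(inode) -> ocfs2_rw_lock() -> ocfs2_inode_lock()
 *     -> ip_alloc_sem -> inode_lock(gb_inode) -> ocfs2_inode_lock(gb_inode)
 *     -> inode_lock(tl_inode) -> journal handle
 *
 * The refcount tree lock, when needed, is taken before the metadata
 * allocator is reserved.
 */
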
struct ocfs2_move_extents_context {
        struct inode *inode;
        struct file *file;
        int auto_defrag;        /* set for OCFS2_MOVE_EXT_FL_AUTO_DEFRAG */
        int partial;            /* set for OCFS2_MOVE_EXT_FL_PART_DEFRAG */
        int credits;
        u32 new_phys_cpos;      /* physical cluster the extent was moved to */
        u32 clusters_moved;
        u64 refcount_loc;
        struct ocfs2_move_extents *range;       /* the userspace request */
        struct ocfs2_extent_tree et;
        struct ocfs2_alloc_context *meta_ac;
        struct ocfs2_alloc_context *data_ac;
        struct ocfs2_cached_dealloc_ctxt dealloc;
};
     48
     49static int __ocfs2_move_extent(handle_t *handle,
     50			       struct ocfs2_move_extents_context *context,
     51			       u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
     52			       int ext_flags)
     53{
     54	int ret = 0, index;
     55	struct inode *inode = context->inode;
     56	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
     57	struct ocfs2_extent_rec *rec, replace_rec;
     58	struct ocfs2_path *path = NULL;
     59	struct ocfs2_extent_list *el;
     60	u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
     61	u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
     62
     63	ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos,
     64					       p_cpos, new_p_cpos, len);
     65	if (ret) {
     66		mlog_errno(ret);
     67		goto out;
     68	}
     69
     70	memset(&replace_rec, 0, sizeof(replace_rec));
     71	replace_rec.e_cpos = cpu_to_le32(cpos);
     72	replace_rec.e_leaf_clusters = cpu_to_le16(len);
     73	replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
     74								   new_p_cpos));
     75
     76	path = ocfs2_new_path_from_et(&context->et);
     77	if (!path) {
     78		ret = -ENOMEM;
     79		mlog_errno(ret);
     80		goto out;
     81	}
     82
     83	ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
     84	if (ret) {
     85		mlog_errno(ret);
     86		goto out;
     87	}
     88
     89	el = path_leaf_el(path);
     90
     91	index = ocfs2_search_extent_list(el, cpos);
     92	if (index == -1) {
     93		ret = ocfs2_error(inode->i_sb,
     94				  "Inode %llu has an extent at cpos %u which can no longer be found\n",
     95				  (unsigned long long)ino, cpos);
     96		goto out;
     97	}
     98
     99	rec = &el->l_recs[index];
    100
        BUG_ON(ext_flags != rec->e_flags);
        /*
         * After moving/defragging to the new location, the extent is no
         * longer going to be refcounted.
         */
        replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;

        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
                                      context->et.et_root_bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        ret = ocfs2_split_extent(handle, &context->et, path, index,
                                 &replace_rec, context->meta_ac,
                                 &context->dealloc);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        ocfs2_journal_dirty(handle, context->et.et_root_bh);

        context->new_phys_cpos = new_p_cpos;

        /*
         * Do we need to append the old clusters to the truncate log?
         */
        if (old_blkno) {
                if (ext_flags & OCFS2_EXT_REFCOUNTED)
                        ret = ocfs2_decrease_refcount(inode, handle,
                                        ocfs2_blocks_to_clusters(osb->sb,
                                                                 old_blkno),
                                        len, context->meta_ac,
                                        &context->dealloc, 1);
                else
                        ret = ocfs2_truncate_log_append(osb, handle,
                                                        old_blkno, len);
        }

        ocfs2_update_inode_fsync_trans(handle, inode, 0);
out:
        ocfs2_free_path(path);
        return ret;
}

/*
 * Lock the allocator and reserve the appropriate number of bits for
 * metadata blocks.
 */
static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode,
                                        struct ocfs2_extent_tree *et,
                                        u32 clusters_to_move,
                                        u32 extents_to_split,
                                        struct ocfs2_alloc_context **meta_ac,
                                        int extra_blocks,
                                        int *credits)
{
        int ret, num_free_extents;
        unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

        num_free_extents = ocfs2_num_free_extents(et);
        if (num_free_extents < 0) {
                ret = num_free_extents;
                mlog_errno(ret);
                goto out;
        }

        if (!num_free_extents ||
            (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
                extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);

        ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);

        mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
             extra_blocks, clusters_to_move, *credits);
out:
        if (ret) {
                if (*meta_ac) {
                        ocfs2_free_alloc_context(*meta_ac);
                        *meta_ac = NULL;
                }
        }

        return ret;
}

/*
 * Use a single journal handle to guarantee data consistency in case a
 * crash happens anywhere.
 *
 * XXX: defrag can end up finishing only part of the requested extent,
 * when not enough contiguous clusters can be found in the allocator.
 */
static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
                               u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
{
        int ret, credits = 0, extra_blocks = 0, partial = context->partial;
        handle_t *handle;
        struct inode *inode = context->inode;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct inode *tl_inode = osb->osb_tl_inode;
        struct ocfs2_refcount_tree *ref_tree = NULL;
        u32 new_phys_cpos, new_len;
        u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
        int need_free = 0;

        if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
                BUG_ON(!ocfs2_is_refcount_inode(inode));
                BUG_ON(!context->refcount_loc);

                ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
                                               &ref_tree, NULL);
                if (ret) {
                        mlog_errno(ret);
                        return ret;
                }

                ret = ocfs2_prepare_refcount_change_for_del(inode,
                                                        context->refcount_loc,
                                                        phys_blkno,
                                                        *len,
                                                        &credits,
                                                        &extra_blocks);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
        }

        ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
                                                *len, 1,
                                                &context->meta_ac,
                                                extra_blocks, &credits);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        /*
         * Should we be using the allocation reservation strategy here?
         *
         * if (context->data_ac)
         *      context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
         */

        inode_lock(tl_inode);

        if (ocfs2_truncate_log_needs_flush(osb)) {
                ret = __ocfs2_flush_truncate_log(osb);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out_unlock_mutex;
                }
        }

        /*
         * Make sure ocfs2_reserve_clusters is called after
         * __ocfs2_flush_truncate_log; otherwise we may deadlock on the
         * global bitmap.
         */
        ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac);
        if (ret) {
                mlog_errno(ret);
                goto out_unlock_mutex;
        }

        handle = ocfs2_start_trans(osb, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out_unlock_mutex;
        }

        ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
                                     &new_phys_cpos, &new_len);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
        }

        /*
         * Allowing partial extent moves has pros and cons: it makes the
         * defragmentation as a whole less likely to fail, but it may leave
         * the fs even more fragmented after moving. Let userspace make the
         * call here.
         */
        if (new_len != *len) {
                mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
                if (!partial) {
                        context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
                        ret = -ENOSPC;
                        need_free = 1;
                        goto out_commit;
                }
        }

        mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
             phys_cpos, new_phys_cpos);

        ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
                                  new_phys_cpos, ext_flags);
        if (ret)
                mlog_errno(ret);

        if (partial && (new_len != *len))
                *len = new_len;

        /*
         * Here we should write the new page out first if we are
         * in write-back mode.
         */
        ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
        if (ret)
                mlog_errno(ret);

out_commit:
        if (need_free && context->data_ac) {
                struct ocfs2_alloc_context *data_ac = context->data_ac;

                if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL)
                        ocfs2_free_local_alloc_bits(osb, handle, data_ac,
                                        new_phys_cpos, new_len);
                else
                        ocfs2_free_clusters(handle,
                                        data_ac->ac_inode,
                                        data_ac->ac_bh,
                                        ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos),
                                        new_len);
        }

        ocfs2_commit_trans(osb, handle);

out_unlock_mutex:
        inode_unlock(tl_inode);

        if (context->data_ac) {
                ocfs2_free_alloc_context(context->data_ac);
                context->data_ac = NULL;
        }

        if (context->meta_ac) {
                ocfs2_free_alloc_context(context->meta_ac);
                context->meta_ac = NULL;
        }

out:
        if (ref_tree)
                ocfs2_unlock_refcount_tree(osb, ref_tree, 1);

        return ret;
}

/*
 * Find the victim alloc group where #vict_blkno fits.
 */
static int ocfs2_find_victim_alloc_group(struct inode *inode,
                                         u64 vict_blkno,
                                         int type, int slot,
                                         int *vict_bit,
                                         struct buffer_head **ret_bh)
{
        int ret, i, bits_per_unit = 0;
        u64 blkno;
        char namebuf[40];

        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
        struct ocfs2_chain_list *cl;
        struct ocfs2_chain_rec *rec;
        struct ocfs2_dinode *ac_dinode;
        struct ocfs2_group_desc *bg;

        ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
        ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
                                         strlen(namebuf), &blkno);
        if (ret) {
                ret = -ENOENT;
                goto out;
        }

        ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
        cl = &(ac_dinode->id2.i_chain);
        rec = &(cl->cl_recs[0]);

        if (type == GLOBAL_BITMAP_SYSTEM_INODE)
                bits_per_unit = osb->s_clustersize_bits -
                                        inode->i_sb->s_blocksize_bits;
        /*
         * 'vict_blkno' was out of the valid range.
         */
        if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
            (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
                                bits_per_unit))) {
                ret = -EINVAL;
                goto out;
        }

        for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {

                rec = &(cl->cl_recs[i]);
                if (!rec)
                        continue;

                bg = NULL;

                do {
                        if (!bg)
                                blkno = le64_to_cpu(rec->c_blkno);
                        else
                                blkno = le64_to_cpu(bg->bg_next_group);

                        if (gd_bh) {
                                brelse(gd_bh);
                                gd_bh = NULL;
                        }

                        ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }

                        bg = (struct ocfs2_group_desc *)gd_bh->b_data;

                        if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
                                                le16_to_cpu(bg->bg_bits))) {

                                *ret_bh = gd_bh;
                                *vict_bit = (vict_blkno - blkno) >>
                                                        bits_per_unit;
                                mlog(0, "find the victim group: #%llu, "
                                     "total_bits: %u, vict_bit: %u\n",
                                     blkno, le16_to_cpu(bg->bg_bits),
                                     *vict_bit);
                                goto out;
                        }

                } while (le64_to_cpu(bg->bg_next_group));
        }

        ret = -EINVAL;
out:
        brelse(ac_bh);

        /*
         * caller has to release the gd_bh properly.
         */
        return ret;
}

/*
 * XXX: helper to validate and adjust moving goal.
 */
static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
                                               struct ocfs2_move_extents *range)
{
        int ret, goal_bit = 0;

        struct buffer_head *gd_bh = NULL;
        struct ocfs2_group_desc *bg;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        int c_to_b = 1 << (osb->s_clustersize_bits -
                                        inode->i_sb->s_blocksize_bits);

        /*
         * make goal become cluster aligned.
         */
        range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
                                                      range->me_goal);
        /*
         * validate goal sits within global_bitmap, and return the victim
         * group desc
         */
        ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
                                            GLOBAL_BITMAP_SYSTEM_INODE,
                                            OCFS2_INVALID_SLOT,
                                            &goal_bit, &gd_bh);
        if (ret)
                goto out;

        bg = (struct ocfs2_group_desc *)gd_bh->b_data;

        /*
         * The moving goal is not allowed to start with a group desc block
         * (#0 blk); fall back to the next cluster.
         */
        if (range->me_goal == le64_to_cpu(bg->bg_blkno))
                range->me_goal += c_to_b;

        /*
         * The movement must not cross group boundaries.
         */
        if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
                                                                range->me_len) {
                ret = -EINVAL;
                goto out;
        }
        /*
         * more exact validations/adjustments will be performed later during
         * moving operation for each extent range.
         */
        mlog(0, "extents get ready to be moved to #%llu block\n",
             range->me_goal);

out:
        brelse(gd_bh);

        return ret;
}

static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
                                    int *goal_bit, u32 move_len, u32 max_hop,
                                    u32 *phys_cpos)
{
        int i, used, last_free_bits = 0, base_bit = *goal_bit;
        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
        u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
                                                 le64_to_cpu(gd->bg_blkno));

        for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {

                used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
                if (used) {
                        /*
                         * we even tried searching the free chunk by jumping
                         * a 'max_hop' distance, but still failed.
                         */
                        if ((i - base_bit) > max_hop) {
                                *phys_cpos = 0;
                                break;
                        }

                        if (last_free_bits)
                                last_free_bits = 0;

                        continue;
                } else
                        last_free_bits++;

                if (last_free_bits == move_len) {
                        *goal_bit = i;
                        *phys_cpos = base_cpos + i;
                        break;
                }
        }

        mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
}

static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
                             u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
                             u32 len, int ext_flags)
{
        int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
        handle_t *handle;
        struct inode *inode = context->inode;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct inode *tl_inode = osb->osb_tl_inode;
        struct inode *gb_inode = NULL;
        struct buffer_head *gb_bh = NULL;
        struct buffer_head *gd_bh = NULL;
        struct ocfs2_group_desc *gd;
        struct ocfs2_refcount_tree *ref_tree = NULL;
        u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
                                                    context->range->me_threshold);
        u64 phys_blkno, new_phys_blkno;

        phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);

        if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
                BUG_ON(!ocfs2_is_refcount_inode(inode));
                BUG_ON(!context->refcount_loc);

                ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
                                               &ref_tree, NULL);
                if (ret) {
                        mlog_errno(ret);
                        return ret;
                }

                ret = ocfs2_prepare_refcount_change_for_del(inode,
                                                        context->refcount_loc,
                                                        phys_blkno,
                                                        len,
                                                        &credits,
                                                        &extra_blocks);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
        }

        ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
                                                len, 1,
                                                &context->meta_ac,
                                                extra_blocks, &credits);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        /*
         * need to count 2 extra credits for global_bitmap inode and
         * group descriptor.
         */
        credits += OCFS2_INODE_UPDATE_CREDITS + 1;

        /*
         * ocfs2_move_extent() didn't reserve any clusters in
         * ocfs2_lock_meta_allocator_move_extents(), but we still need to
         * lock the global_bitmap.
         */
        gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
                                               OCFS2_INVALID_SLOT);
        if (!gb_inode) {
                mlog(ML_ERROR, "unable to get global_bitmap inode\n");
                ret = -EIO;
                goto out;
        }

        inode_lock(gb_inode);

        ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
        if (ret) {
                mlog_errno(ret);
                goto out_unlock_gb_mutex;
        }

        inode_lock(tl_inode);

        handle = ocfs2_start_trans(osb, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out_unlock_tl_inode;
        }

        new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
        ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
                                            GLOBAL_BITMAP_SYSTEM_INODE,
                                            OCFS2_INVALID_SLOT,
                                            &goal_bit, &gd_bh);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
        }

        /*
         * Probe the victim cluster group to find a proper region to fit the
         * wanted movement; it will even perform a best-effort attempt by
         * compromising to a threshold around the goal.
         */
        ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
                                new_phys_cpos);
        if (!*new_phys_cpos) {
                ret = -ENOSPC;
                goto out_commit;
        }

        ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
                                  *new_phys_cpos, ext_flags);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
        }

        gd = (struct ocfs2_group_desc *)gd_bh->b_data;
        ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
                                               le16_to_cpu(gd->bg_chain));
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
        }

        ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
                                         goal_bit, len);
        if (ret) {
                ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
                                               le16_to_cpu(gd->bg_chain));
                mlog_errno(ret);
        }

        /*
         * Here we should write the new page out first if we are
         * in write-back mode.
         */
        ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
        if (ret)
                mlog_errno(ret);

out_commit:
        ocfs2_commit_trans(osb, handle);
        brelse(gd_bh);

out_unlock_tl_inode:
        inode_unlock(tl_inode);

        ocfs2_inode_unlock(gb_inode, 1);
out_unlock_gb_mutex:
        inode_unlock(gb_inode);
        brelse(gb_bh);
        iput(gb_inode);

out:
        if (context->meta_ac) {
                ocfs2_free_alloc_context(context->meta_ac);
                context->meta_ac = NULL;
        }

        if (ref_tree)
                ocfs2_unlock_refcount_tree(osb, ref_tree, 1);

        return ret;
}

/*
 * Helper to calculate the defragging length in one run according to the
 * threshold.
 */
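/*
 * Example, with a threshold of 8 clusters: extents of 3, 4 and 5 clusters
 * arrive in turn. The first two accumulate (len_defraged becomes 3, then 7);
 * the third would cross the threshold, so it is split and only 8 - 7 = 1 of
 * its clusters is moved, completing one 8-cluster run, and len_defraged
 * resets for the next run. A lone 9-cluster extent with nothing accumulated
 * yet is skipped outright, since it already meets the threshold.
 */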
static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
                                         u32 threshold, int *skip)
{
        if ((*alloc_size + *len_defraged) < threshold) {
                /*
                 * proceed defragmentation until we meet the thresh
                 */
                *len_defraged += *alloc_size;
        } else if (*len_defraged == 0) {
                /*
                 * XXX: skip a large extent.
                 */
                *skip = 1;
        } else {
                /*
                 * split this extent to coalesce with former pieces as
                 * to reach the threshold.
                 *
                 * we're done here with one cycle of defragmentation
                 * in a size of 'thresh', resetting 'len_defraged'
                 * forces a new defragmentation.
                 */
                *alloc_size = threshold - *len_defraged;
                *len_defraged = 0;
        }
}

static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
                                struct ocfs2_move_extents_context *context)
{
        int ret = 0, flags, do_defrag, skip = 0;
        u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
        u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;

        struct inode *inode = context->inode;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        struct ocfs2_move_extents *range = context->range;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

        if ((i_size_read(inode) == 0) || (range->me_len == 0))
                return 0;

        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
                return 0;

        context->refcount_loc = le64_to_cpu(di->i_refcount_loc);

        ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
        ocfs2_init_dealloc_ctxt(&context->dealloc);

        /*
         * TO-DO XXX:
         *
         * - xattr extents.
         */

        do_defrag = context->auto_defrag;

        /*
         * Extent moving happens in units of clusters; for the sake of
         * simplicity, we may ignore the two partial clusters that
         * 'byte_start' and 'byte_start + len' fall within.
         */
        move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
        len_to_move = (range->me_start + range->me_len) >>
                                                osb->s_clustersize_bits;
        if (len_to_move >= move_start)
                len_to_move -= move_start;
        else
                len_to_move = 0;

        if (do_defrag) {
                defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
                if (defrag_thresh <= 1)
                        goto done;
        } else
                new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
                                                         range->me_goal);

        mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
             "thresh: %u\n",
             (unsigned long long)OCFS2_I(inode)->ip_blkno,
             (unsigned long long)range->me_start,
             (unsigned long long)range->me_len,
             move_start, len_to_move, defrag_thresh);

        cpos = move_start;
        while (len_to_move) {
                ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
                                         &flags);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }

                if (alloc_size > len_to_move)
                        alloc_size = len_to_move;

                /*
                 * XXX: how to deal with a hole:
                 *
                 * - skip the hole of course
                 * - force a new defragmentation
                 */
                if (!phys_cpos) {
                        if (do_defrag)
                                len_defraged = 0;

                        goto next;
                }

                if (do_defrag) {
                        ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
                                                     defrag_thresh, &skip);
                        /*
                         * skip large extents
                         */
                        if (skip) {
                                skip = 0;
                                goto next;
                        }

                        mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
                             "alloc_size: %u, len_defraged: %u\n",
                             cpos, phys_cpos, alloc_size, len_defraged);

                        ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
                                                  &alloc_size, flags);
                } else {
                        ret = ocfs2_move_extent(context, cpos, phys_cpos,
                                                &new_phys_cpos, alloc_size,
                                                flags);

                        new_phys_cpos += alloc_size;
                }

                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
                }

                context->clusters_moved += alloc_size;
next:
                cpos += alloc_size;
                len_to_move -= alloc_size;
        }

done:
        range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;

out:
        range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
                                                      context->clusters_moved);
        range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
                                                       context->new_phys_cpos);

        ocfs2_schedule_truncate_log_flush(osb, 1);
        ocfs2_run_deallocs(osb, &context->dealloc);

        return ret;
}

static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
{
        int status;
        handle_t *handle;
        struct inode *inode = context->inode;
        struct ocfs2_dinode *di;
        struct buffer_head *di_bh = NULL;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
                return -EROFS;

        inode_lock(inode);

        /*
         * This prevents concurrent writes from other nodes
         */
        status = ocfs2_rw_lock(inode, 1);
        if (status) {
                mlog_errno(status);
                goto out;
        }

        status = ocfs2_inode_lock(inode, &di_bh, 1);
        if (status) {
                mlog_errno(status);
                goto out_rw_unlock;
        }

        /*
         * remember that ip_xattr_sem also needs to be held if necessary
         */
        down_write(&OCFS2_I(inode)->ip_alloc_sem);

        status = __ocfs2_move_extents_range(di_bh, context);

        up_write(&OCFS2_I(inode)->ip_alloc_sem);
        if (status) {
                mlog_errno(status);
                goto out_inode_unlock;
        }

        /*
         * We update ctime for these changes
         */
        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                mlog_errno(status);
                goto out_inode_unlock;
        }

        status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status) {
                mlog_errno(status);
                goto out_commit;
        }

        di = (struct ocfs2_dinode *)di_bh->b_data;
        inode->i_ctime = current_time(inode);
        di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
        di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
        ocfs2_update_inode_fsync_trans(handle, inode, 0);

        ocfs2_journal_dirty(handle, di_bh);

out_commit:
        ocfs2_commit_trans(osb, handle);

out_inode_unlock:
        brelse(di_bh);
        ocfs2_inode_unlock(inode, 1);
out_rw_unlock:
        ocfs2_rw_unlock(inode, 1);
out:
        inode_unlock(inode);

        return status;
}

int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
{
        int status;

        struct inode *inode = file_inode(filp);
        struct ocfs2_move_extents range;
        struct ocfs2_move_extents_context *context;

        if (!argp)
                return -EINVAL;

        status = mnt_want_write_file(filp);
        if (status)
                return status;

        if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) {
                status = -EPERM;
                goto out_drop;
        }

        if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
                status = -EPERM;
                goto out_drop;
        }

        context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
        if (!context) {
                status = -ENOMEM;
                mlog_errno(status);
                goto out_drop;
        }

        context->inode = inode;
        context->file = filp;

        if (copy_from_user(&range, argp, sizeof(range))) {
                status = -EFAULT;
                goto out_free;
        }

        if (range.me_start > i_size_read(inode)) {
                status = -EINVAL;
                goto out_free;
        }

        if (range.me_start + range.me_len > i_size_read(inode))
                range.me_len = i_size_read(inode) - range.me_start;

        context->range = &range;

        if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
                context->auto_defrag = 1;
                /*
                 * The default threshold for defragmentation is 1M, since
                 * our maximum cluster size is 1M as well.
                 */
                if (!range.me_threshold)
                        range.me_threshold = 1024 * 1024;

                if (range.me_threshold > i_size_read(inode))
                        range.me_threshold = i_size_read(inode);

                if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
                        context->partial = 1;
        } else {
                /*
                 * First make a best-effort attempt to validate and adjust
                 * the goal (a physical address in blocks); this cannot
                 * guarantee that the later operation will always succeed,
                 * since the global_bitmap may change a bit over time.
                 */

                status = ocfs2_validate_and_adjust_move_goal(inode, &range);
                if (status)
                        goto out_copy;
        }

        status = ocfs2_move_extents(context);
        if (status)
                mlog_errno(status);
out_copy:
        /*
         * The movement or defragmentation may end up partially completed;
         * that is why we return the finished length and new_offset to
         * userspace even if a failure happens somewhere.
         */
        if (copy_to_user(argp, &range, sizeof(range)))
                status = -EFAULT;

out_free:
        kfree(context);
out_drop:
        mnt_drop_write_file(filp);

        return status;
}
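
For reference, here is a minimal userspace sketch of driving this path via the
OCFS2_IOC_MOVE_EXT ioctl. The struct layout, flag values, and ioctl number are
mirrored from ocfs2_ioctl.h in this tree; treat them as assumptions and check
them against your installed headers before relying on this.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

/* Mirror of struct ocfs2_move_extents (all values are in bytes). */
struct ocfs2_move_extents {
        uint64_t me_start;      /* in: virtual start in the file */
        uint64_t me_len;        /* in: length of the extents to move */
        uint64_t me_goal;       /* in: physical goal (move mode only) */
        uint64_t me_threshold;  /* in: defrag threshold / max hop */
        uint64_t me_flags;      /* in/out: OCFS2_MOVE_EXT_FL_* */
        uint64_t me_moved_len;  /* out: bytes actually moved/defragged */
        uint64_t me_new_offset; /* out: resulting physical location */
        uint32_t me_reserved[2];
};

#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG  0x00000001
#define OCFS2_MOVE_EXT_FL_PART_DEFRAG  0x00000002
#define OCFS2_MOVE_EXT_FL_COMPLETE     0x00000004
#define OCFS2_IOC_MOVE_EXT  _IOW('o', 6, struct ocfs2_move_extents)

int main(int argc, char **argv)
{
        struct ocfs2_move_extents range;
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <file-on-ocfs2>\n", argv[0]);
                return 1;
        }

        /* The handler requires a regular file opened for writing. */
        fd = open(argv[1], O_RDWR);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        memset(&range, 0, sizeof(range));
        range.me_start = 0;
        range.me_len = UINT64_MAX;      /* clamped to i_size by the kernel */
        range.me_flags = OCFS2_MOVE_EXT_FL_AUTO_DEFRAG |
                         OCFS2_MOVE_EXT_FL_PART_DEFRAG;
        /* me_threshold == 0 lets the kernel pick the 1M default. */

        if (ioctl(fd, OCFS2_IOC_MOVE_EXT, &range) < 0)
                perror("OCFS2_IOC_MOVE_EXT");

        /* Partial progress is reported even on failure. */
        printf("moved %llu bytes, complete: %s\n",
               (unsigned long long)range.me_moved_len,
               (range.me_flags & OCFS2_MOVE_EXT_FL_COMPLETE) ? "yes" : "no");

        close(fd);
        return 0;
}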