cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

extents.c (169663B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
      4 * Written by Alex Tomas <alex@clusterfs.com>
      5 *
      6 * Architecture independence:
      7 *   Copyright (c) 2005, Bull S.A.
      8 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
      9 */
     10
     11/*
     12 * Extents support for EXT4
     13 *
     14 * TODO:
     15 *   - ext4*_error() should be used in some situations
     16 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
     17 *   - smart tree reduction
     18 */
     19
     20#include <linux/fs.h>
     21#include <linux/time.h>
     22#include <linux/jbd2.h>
     23#include <linux/highuid.h>
     24#include <linux/pagemap.h>
     25#include <linux/quotaops.h>
     26#include <linux/string.h>
     27#include <linux/slab.h>
     28#include <linux/uaccess.h>
     29#include <linux/fiemap.h>
     30#include <linux/iomap.h>
     31#include <linux/sched/mm.h>
     32#include "ext4_jbd2.h"
     33#include "ext4_extents.h"
     34#include "xattr.h"
     35
     36#include <trace/events/ext4.h>
     37
     38/*
     39 * used by extent splitting.
     40 */
     41#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
     42					due to ENOSPC */
     43#define EXT4_EXT_MARK_UNWRIT1	0x2  /* mark first half unwritten */
     44#define EXT4_EXT_MARK_UNWRIT2	0x4  /* mark second half unwritten */
     45
     46#define EXT4_EXT_DATA_VALID1	0x8  /* first half contains valid data */
     47#define EXT4_EXT_DATA_VALID2	0x10 /* second half contains valid data */
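        /*
         * Editor's note (not part of the upstream source): these split flags
         * are OR-ed together by callers of ext4_split_extent_at().  For
         * example, ext4_force_split_extent_at() below passes
         * EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2 when the extent being
         * split is unwritten, so that both halves remain marked unwritten.
         */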
     48
     49static __le32 ext4_extent_block_csum(struct inode *inode,
     50				     struct ext4_extent_header *eh)
     51{
     52	struct ext4_inode_info *ei = EXT4_I(inode);
     53	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
     54	__u32 csum;
     55
     56	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
     57			   EXT4_EXTENT_TAIL_OFFSET(eh));
     58	return cpu_to_le32(csum);
     59}
     60
     61static int ext4_extent_block_csum_verify(struct inode *inode,
     62					 struct ext4_extent_header *eh)
     63{
     64	struct ext4_extent_tail *et;
     65
     66	if (!ext4_has_metadata_csum(inode->i_sb))
     67		return 1;
     68
     69	et = find_ext4_extent_tail(eh);
     70	if (et->et_checksum != ext4_extent_block_csum(inode, eh))
     71		return 0;
     72	return 1;
     73}
     74
     75static void ext4_extent_block_csum_set(struct inode *inode,
     76				       struct ext4_extent_header *eh)
     77{
     78	struct ext4_extent_tail *et;
     79
     80	if (!ext4_has_metadata_csum(inode->i_sb))
     81		return;
     82
     83	et = find_ext4_extent_tail(eh);
     84	et->et_checksum = ext4_extent_block_csum(inode, eh);
     85}
     86
     87static int ext4_split_extent_at(handle_t *handle,
     88			     struct inode *inode,
     89			     struct ext4_ext_path **ppath,
     90			     ext4_lblk_t split,
     91			     int split_flag,
     92			     int flags);
     93
     94static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
     95{
     96	/*
     97	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
     98	 * moment, get_block can be called only for blocks inside i_size since
      99	 * the page cache has already been dropped and writes are blocked by
    100	 * i_rwsem. So we can safely drop the i_data_sem here.
    101	 */
    102	BUG_ON(EXT4_JOURNAL(inode) == NULL);
    103	ext4_discard_preallocations(inode, 0);
    104	up_write(&EXT4_I(inode)->i_data_sem);
    105	*dropped = 1;
    106	return 0;
    107}
    108
    109/*
    110 * Make sure 'handle' has at least 'check_cred' credits. If not, restart
    111 * transaction with 'restart_cred' credits. The function drops i_data_sem
    112 * when restarting transaction and gets it after transaction is restarted.
    113 *
    114 * The function returns 0 on success, 1 if transaction had to be restarted,
    115 * and < 0 in case of fatal error.
    116 */
    117int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
    118				int check_cred, int restart_cred,
    119				int revoke_cred)
    120{
    121	int ret;
    122	int dropped = 0;
    123
    124	ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
    125		revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
    126	if (dropped)
    127		down_write(&EXT4_I(inode)->i_data_sem);
    128	return ret;
    129}
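        /*
         * Editor's note (not part of the upstream source): per the comment
         * above, a caller holding i_data_sem would typically use a
         * hypothetical sketch like the one below, treating < 0 as a fatal
         * error, 1 as "transaction restarted, i_data_sem was dropped and
         * re-taken, so cached tree state may be stale", and 0 as "no restart
         * was needed" (credits and revoke_credits are placeholder values):
         *
         *	err = ext4_datasem_ensure_credits(handle, inode, credits,
         *					  credits, revoke_credits);
         *	if (err < 0)
         *		return err;
         */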
    130
    131/*
    132 * could return:
    133 *  - EROFS
    134 *  - ENOMEM
    135 */
    136static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
    137				struct ext4_ext_path *path)
    138{
    139	int err = 0;
    140
    141	if (path->p_bh) {
    142		/* path points to block */
    143		BUFFER_TRACE(path->p_bh, "get_write_access");
    144		err = ext4_journal_get_write_access(handle, inode->i_sb,
    145						    path->p_bh, EXT4_JTR_NONE);
    146		/*
    147		 * The extent buffer's verified bit will be set again in
    148		 * __ext4_ext_dirty(). We could leave an inconsistent
     149		 * buffer if the extents updating procedure breaks off due
     150		 * to some error, so force it to be checked again.
    151		 */
    152		if (!err)
    153			clear_buffer_verified(path->p_bh);
    154	}
    155	/* path points to leaf/index in inode body */
    156	/* we use in-core data, no need to protect them */
    157	return err;
    158}
    159
    160/*
    161 * could return:
    162 *  - EROFS
    163 *  - ENOMEM
    164 *  - EIO
    165 */
    166static int __ext4_ext_dirty(const char *where, unsigned int line,
    167			    handle_t *handle, struct inode *inode,
    168			    struct ext4_ext_path *path)
    169{
    170	int err;
    171
    172	WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
    173	if (path->p_bh) {
    174		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
    175		/* path points to block */
    176		err = __ext4_handle_dirty_metadata(where, line, handle,
    177						   inode, path->p_bh);
    178		/* Extents updating done, re-set verified flag */
    179		if (!err)
    180			set_buffer_verified(path->p_bh);
    181	} else {
    182		/* path points to leaf/index in inode body */
    183		err = ext4_mark_inode_dirty(handle, inode);
    184	}
    185	return err;
    186}
    187
    188#define ext4_ext_dirty(handle, inode, path) \
    189		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
    190
    191static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
    192			      struct ext4_ext_path *path,
    193			      ext4_lblk_t block)
    194{
    195	if (path) {
    196		int depth = path->p_depth;
    197		struct ext4_extent *ex;
    198
    199		/*
    200		 * Try to predict block placement assuming that we are
    201		 * filling in a file which will eventually be
    202		 * non-sparse --- i.e., in the case of libbfd writing
     203		 * an ELF object's sections out-of-order but in a way
     204		 * that eventually results in a contiguous object or
    205		 * executable file, or some database extending a table
    206		 * space file.  However, this is actually somewhat
    207		 * non-ideal if we are writing a sparse file such as
    208		 * qemu or KVM writing a raw image file that is going
    209		 * to stay fairly sparse, since it will end up
    210		 * fragmenting the file system's free space.  Maybe we
     211		 * should have some heuristics or some way to allow
     212		 * userspace to pass a hint to the file system,
    213		 * especially if the latter case turns out to be
    214		 * common.
    215		 */
    216		ex = path[depth].p_ext;
    217		if (ex) {
    218			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
    219			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
    220
    221			if (block > ext_block)
    222				return ext_pblk + (block - ext_block);
    223			else
    224				return ext_pblk - (ext_block - block);
    225		}
    226
     227		/* it looks like the index is empty;
     228		 * try to find the starting block from the index block itself */
    229		if (path[depth].p_bh)
    230			return path[depth].p_bh->b_blocknr;
    231	}
    232
    233	/* OK. use inode's group */
    234	return ext4_inode_to_goal_block(inode);
    235}
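        /*
         * Editor's note (illustrative, not part of the upstream source): a
         * worked example of the goal calculation above.  If the closest
         * extent in the path starts at logical block 100 and physical block
         * 5000, a request for logical block 110 yields
         * goal = 5000 + (110 - 100) = 5010, i.e. the physical block that
         * keeps the file contiguous on disk.
         */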
    236
    237/*
    238 * Allocation for a meta data block
    239 */
    240static ext4_fsblk_t
    241ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
    242			struct ext4_ext_path *path,
    243			struct ext4_extent *ex, int *err, unsigned int flags)
    244{
    245	ext4_fsblk_t goal, newblock;
    246
    247	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
    248	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
    249					NULL, err);
    250	return newblock;
    251}
    252
    253static inline int ext4_ext_space_block(struct inode *inode, int check)
    254{
    255	int size;
    256
    257	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
    258			/ sizeof(struct ext4_extent);
    259#ifdef AGGRESSIVE_TEST
    260	if (!check && size > 6)
    261		size = 6;
    262#endif
    263	return size;
    264}
    265
    266static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
    267{
    268	int size;
    269
    270	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
    271			/ sizeof(struct ext4_extent_idx);
    272#ifdef AGGRESSIVE_TEST
    273	if (!check && size > 5)
    274		size = 5;
    275#endif
    276	return size;
    277}
    278
    279static inline int ext4_ext_space_root(struct inode *inode, int check)
    280{
    281	int size;
    282
    283	size = sizeof(EXT4_I(inode)->i_data);
    284	size -= sizeof(struct ext4_extent_header);
    285	size /= sizeof(struct ext4_extent);
    286#ifdef AGGRESSIVE_TEST
    287	if (!check && size > 3)
    288		size = 3;
    289#endif
    290	return size;
    291}
    292
    293static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
    294{
    295	int size;
    296
    297	size = sizeof(EXT4_I(inode)->i_data);
    298	size -= sizeof(struct ext4_extent_header);
    299	size /= sizeof(struct ext4_extent_idx);
    300#ifdef AGGRESSIVE_TEST
    301	if (!check && size > 4)
    302		size = 4;
    303#endif
    304	return size;
    305}
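        /*
         * Editor's note (illustrative, not part of the upstream source):
         * assuming the usual 12-byte extent header and 12-byte extent/index
         * records, the four helpers above come out to roughly
         * (4096 - 12) / 12 = 340 entries per 4 KiB tree block and
         * (60 - 12) / 12 = 4 entries in the inode's i_data root, ignoring the
         * smaller AGGRESSIVE_TEST limits.
         */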
    306
    307static inline int
    308ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
    309			   struct ext4_ext_path **ppath, ext4_lblk_t lblk,
    310			   int nofail)
    311{
    312	struct ext4_ext_path *path = *ppath;
    313	int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
    314	int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
    315
    316	if (nofail)
    317		flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
    318
    319	return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
    320			EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
    321			flags);
    322}
    323
    324static int
    325ext4_ext_max_entries(struct inode *inode, int depth)
    326{
    327	int max;
    328
    329	if (depth == ext_depth(inode)) {
    330		if (depth == 0)
    331			max = ext4_ext_space_root(inode, 1);
    332		else
    333			max = ext4_ext_space_root_idx(inode, 1);
    334	} else {
    335		if (depth == 0)
    336			max = ext4_ext_space_block(inode, 1);
    337		else
    338			max = ext4_ext_space_block_idx(inode, 1);
    339	}
    340
    341	return max;
    342}
    343
    344static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
    345{
    346	ext4_fsblk_t block = ext4_ext_pblock(ext);
    347	int len = ext4_ext_get_actual_len(ext);
    348	ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
    349
    350	/*
    351	 * We allow neither:
    352	 *  - zero length
    353	 *  - overflow/wrap-around
    354	 */
    355	if (lblock + len <= lblock)
    356		return 0;
    357	return ext4_inode_block_valid(inode, block, len);
    358}
    359
    360static int ext4_valid_extent_idx(struct inode *inode,
    361				struct ext4_extent_idx *ext_idx)
    362{
    363	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
    364
    365	return ext4_inode_block_valid(inode, block, 1);
    366}
    367
    368static int ext4_valid_extent_entries(struct inode *inode,
    369				     struct ext4_extent_header *eh,
    370				     ext4_lblk_t lblk, ext4_fsblk_t *pblk,
    371				     int depth)
    372{
    373	unsigned short entries;
    374	ext4_lblk_t lblock = 0;
    375	ext4_lblk_t cur = 0;
    376
    377	if (eh->eh_entries == 0)
    378		return 1;
    379
    380	entries = le16_to_cpu(eh->eh_entries);
    381
    382	if (depth == 0) {
    383		/* leaf entries */
    384		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
    385
    386		/*
     387		 * The logical block in the first entry should be equal to
    388		 * the number in the index block.
    389		 */
    390		if (depth != ext_depth(inode) &&
    391		    lblk != le32_to_cpu(ext->ee_block))
    392			return 0;
    393		while (entries) {
    394			if (!ext4_valid_extent(inode, ext))
    395				return 0;
    396
    397			/* Check for overlapping extents */
    398			lblock = le32_to_cpu(ext->ee_block);
    399			if (lblock < cur) {
    400				*pblk = ext4_ext_pblock(ext);
    401				return 0;
    402			}
    403			cur = lblock + ext4_ext_get_actual_len(ext);
    404			ext++;
    405			entries--;
    406		}
    407	} else {
    408		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
    409
    410		/*
     411		 * The logical block in the first entry should be equal to
    412		 * the number in the parent index block.
    413		 */
    414		if (depth != ext_depth(inode) &&
    415		    lblk != le32_to_cpu(ext_idx->ei_block))
    416			return 0;
    417		while (entries) {
    418			if (!ext4_valid_extent_idx(inode, ext_idx))
    419				return 0;
    420
    421			/* Check for overlapping index extents */
    422			lblock = le32_to_cpu(ext_idx->ei_block);
    423			if (lblock < cur) {
    424				*pblk = ext4_idx_pblock(ext_idx);
    425				return 0;
    426			}
    427			ext_idx++;
    428			entries--;
    429			cur = lblock + 1;
    430		}
    431	}
    432	return 1;
    433}
    434
    435static int __ext4_ext_check(const char *function, unsigned int line,
    436			    struct inode *inode, struct ext4_extent_header *eh,
    437			    int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk)
    438{
    439	const char *error_msg;
    440	int max = 0, err = -EFSCORRUPTED;
    441
    442	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
    443		error_msg = "invalid magic";
    444		goto corrupted;
    445	}
    446	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
    447		error_msg = "unexpected eh_depth";
    448		goto corrupted;
    449	}
    450	if (unlikely(eh->eh_max == 0)) {
    451		error_msg = "invalid eh_max";
    452		goto corrupted;
    453	}
    454	max = ext4_ext_max_entries(inode, depth);
    455	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
    456		error_msg = "too large eh_max";
    457		goto corrupted;
    458	}
    459	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
    460		error_msg = "invalid eh_entries";
    461		goto corrupted;
    462	}
    463	if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
    464		error_msg = "invalid extent entries";
    465		goto corrupted;
    466	}
    467	if (unlikely(depth > 32)) {
    468		error_msg = "too large eh_depth";
    469		goto corrupted;
    470	}
    471	/* Verify checksum on non-root extent tree nodes */
    472	if (ext_depth(inode) != depth &&
    473	    !ext4_extent_block_csum_verify(inode, eh)) {
    474		error_msg = "extent tree corrupted";
    475		err = -EFSBADCRC;
    476		goto corrupted;
    477	}
    478	return 0;
    479
    480corrupted:
    481	ext4_error_inode_err(inode, function, line, 0, -err,
    482			     "pblk %llu bad header/extent: %s - magic %x, "
    483			     "entries %u, max %u(%u), depth %u(%u)",
    484			     (unsigned long long) pblk, error_msg,
    485			     le16_to_cpu(eh->eh_magic),
    486			     le16_to_cpu(eh->eh_entries),
    487			     le16_to_cpu(eh->eh_max),
    488			     max, le16_to_cpu(eh->eh_depth), depth);
    489	return err;
    490}
    491
    492#define ext4_ext_check(inode, eh, depth, pblk)			\
    493	__ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0)
    494
    495int ext4_ext_check_inode(struct inode *inode)
    496{
    497	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
    498}
    499
    500static void ext4_cache_extents(struct inode *inode,
    501			       struct ext4_extent_header *eh)
    502{
    503	struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
    504	ext4_lblk_t prev = 0;
    505	int i;
    506
    507	for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
    508		unsigned int status = EXTENT_STATUS_WRITTEN;
    509		ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
    510		int len = ext4_ext_get_actual_len(ex);
    511
    512		if (prev && (prev != lblk))
    513			ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
    514					     EXTENT_STATUS_HOLE);
    515
    516		if (ext4_ext_is_unwritten(ex))
    517			status = EXTENT_STATUS_UNWRITTEN;
    518		ext4_es_cache_extent(inode, lblk, len,
    519				     ext4_ext_pblock(ex), status);
    520		prev = lblk + len;
    521	}
    522}
    523
    524static struct buffer_head *
    525__read_extent_tree_block(const char *function, unsigned int line,
    526			 struct inode *inode, struct ext4_extent_idx *idx,
    527			 int depth, int flags)
    528{
    529	struct buffer_head		*bh;
    530	int				err;
    531	gfp_t				gfp_flags = __GFP_MOVABLE | GFP_NOFS;
    532	ext4_fsblk_t			pblk;
    533
    534	if (flags & EXT4_EX_NOFAIL)
    535		gfp_flags |= __GFP_NOFAIL;
    536
    537	pblk = ext4_idx_pblock(idx);
    538	bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags);
    539	if (unlikely(!bh))
    540		return ERR_PTR(-ENOMEM);
    541
    542	if (!bh_uptodate_or_lock(bh)) {
    543		trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
    544		err = ext4_read_bh(bh, 0, NULL);
    545		if (err < 0)
    546			goto errout;
    547	}
    548	if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
    549		return bh;
    550	err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh),
    551			       depth, pblk, le32_to_cpu(idx->ei_block));
    552	if (err)
    553		goto errout;
    554	set_buffer_verified(bh);
    555	/*
    556	 * If this is a leaf block, cache all of its entries
    557	 */
    558	if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
    559		struct ext4_extent_header *eh = ext_block_hdr(bh);
    560		ext4_cache_extents(inode, eh);
    561	}
    562	return bh;
    563errout:
    564	put_bh(bh);
    565	return ERR_PTR(err);
    566
    567}
    568
    569#define read_extent_tree_block(inode, idx, depth, flags)		\
    570	__read_extent_tree_block(__func__, __LINE__, (inode), (idx),	\
    571				 (depth), (flags))
    572
    573/*
    574 * This function is called to cache a file's extent information in the
    575 * extent status tree
    576 */
    577int ext4_ext_precache(struct inode *inode)
    578{
    579	struct ext4_inode_info *ei = EXT4_I(inode);
    580	struct ext4_ext_path *path = NULL;
    581	struct buffer_head *bh;
    582	int i = 0, depth, ret = 0;
    583
    584	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
    585		return 0;	/* not an extent-mapped inode */
    586
    587	down_read(&ei->i_data_sem);
    588	depth = ext_depth(inode);
    589
    590	/* Don't cache anything if there are no external extent blocks */
    591	if (!depth) {
    592		up_read(&ei->i_data_sem);
    593		return ret;
    594	}
    595
    596	path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
    597		       GFP_NOFS);
    598	if (path == NULL) {
    599		up_read(&ei->i_data_sem);
    600		return -ENOMEM;
    601	}
    602
    603	path[0].p_hdr = ext_inode_hdr(inode);
    604	ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
    605	if (ret)
    606		goto out;
    607	path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
    608	while (i >= 0) {
    609		/*
    610		 * If this is a leaf block or we've reached the end of
    611		 * the index block, go up
    612		 */
    613		if ((i == depth) ||
    614		    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
    615			brelse(path[i].p_bh);
    616			path[i].p_bh = NULL;
    617			i--;
    618			continue;
    619		}
    620		bh = read_extent_tree_block(inode, path[i].p_idx++,
    621					    depth - i - 1,
    622					    EXT4_EX_FORCE_CACHE);
    623		if (IS_ERR(bh)) {
    624			ret = PTR_ERR(bh);
    625			break;
    626		}
    627		i++;
    628		path[i].p_bh = bh;
    629		path[i].p_hdr = ext_block_hdr(bh);
    630		path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
    631	}
    632	ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
    633out:
    634	up_read(&ei->i_data_sem);
    635	ext4_ext_drop_refs(path);
    636	kfree(path);
    637	return ret;
    638}
    639
    640#ifdef EXT_DEBUG
    641static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
    642{
    643	int k, l = path->p_depth;
    644
    645	ext_debug(inode, "path:");
    646	for (k = 0; k <= l; k++, path++) {
    647		if (path->p_idx) {
    648			ext_debug(inode, "  %d->%llu",
    649				  le32_to_cpu(path->p_idx->ei_block),
    650				  ext4_idx_pblock(path->p_idx));
    651		} else if (path->p_ext) {
    652			ext_debug(inode, "  %d:[%d]%d:%llu ",
    653				  le32_to_cpu(path->p_ext->ee_block),
    654				  ext4_ext_is_unwritten(path->p_ext),
    655				  ext4_ext_get_actual_len(path->p_ext),
    656				  ext4_ext_pblock(path->p_ext));
    657		} else
    658			ext_debug(inode, "  []");
    659	}
    660	ext_debug(inode, "\n");
    661}
    662
    663static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
    664{
    665	int depth = ext_depth(inode);
    666	struct ext4_extent_header *eh;
    667	struct ext4_extent *ex;
    668	int i;
    669
    670	if (!path)
    671		return;
    672
    673	eh = path[depth].p_hdr;
    674	ex = EXT_FIRST_EXTENT(eh);
    675
    676	ext_debug(inode, "Displaying leaf extents\n");
    677
    678	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
    679		ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
    680			  ext4_ext_is_unwritten(ex),
    681			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
    682	}
    683	ext_debug(inode, "\n");
    684}
    685
    686static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
    687			ext4_fsblk_t newblock, int level)
    688{
    689	int depth = ext_depth(inode);
    690	struct ext4_extent *ex;
    691
    692	if (depth != level) {
    693		struct ext4_extent_idx *idx;
    694		idx = path[level].p_idx;
    695		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
    696			ext_debug(inode, "%d: move %d:%llu in new index %llu\n",
    697				  level, le32_to_cpu(idx->ei_block),
    698				  ext4_idx_pblock(idx), newblock);
    699			idx++;
    700		}
    701
    702		return;
    703	}
    704
    705	ex = path[depth].p_ext;
    706	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
    707		ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n",
    708				le32_to_cpu(ex->ee_block),
    709				ext4_ext_pblock(ex),
    710				ext4_ext_is_unwritten(ex),
    711				ext4_ext_get_actual_len(ex),
    712				newblock);
    713		ex++;
    714	}
    715}
    716
    717#else
    718#define ext4_ext_show_path(inode, path)
    719#define ext4_ext_show_leaf(inode, path)
    720#define ext4_ext_show_move(inode, path, newblock, level)
    721#endif
    722
    723void ext4_ext_drop_refs(struct ext4_ext_path *path)
    724{
    725	int depth, i;
    726
    727	if (!path)
    728		return;
    729	depth = path->p_depth;
    730	for (i = 0; i <= depth; i++, path++) {
    731		brelse(path->p_bh);
    732		path->p_bh = NULL;
    733	}
    734}
    735
    736/*
    737 * ext4_ext_binsearch_idx:
    738 * binary search for the closest index of the given block
    739 * the header must be checked before calling this
    740 */
    741static void
    742ext4_ext_binsearch_idx(struct inode *inode,
    743			struct ext4_ext_path *path, ext4_lblk_t block)
    744{
    745	struct ext4_extent_header *eh = path->p_hdr;
    746	struct ext4_extent_idx *r, *l, *m;
    747
    748
    749	ext_debug(inode, "binsearch for %u(idx):  ", block);
    750
    751	l = EXT_FIRST_INDEX(eh) + 1;
    752	r = EXT_LAST_INDEX(eh);
    753	while (l <= r) {
    754		m = l + (r - l) / 2;
    755		ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
    756			  le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block),
    757			  r, le32_to_cpu(r->ei_block));
    758
    759		if (block < le32_to_cpu(m->ei_block))
    760			r = m - 1;
    761		else
    762			l = m + 1;
    763	}
    764
    765	path->p_idx = l - 1;
    766	ext_debug(inode, "  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
    767		  ext4_idx_pblock(path->p_idx));
    768
    769#ifdef CHECK_BINSEARCH
    770	{
    771		struct ext4_extent_idx *chix, *ix;
    772		int k;
    773
    774		chix = ix = EXT_FIRST_INDEX(eh);
    775		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
    776			if (k != 0 && le32_to_cpu(ix->ei_block) <=
    777			    le32_to_cpu(ix[-1].ei_block)) {
    778				printk(KERN_DEBUG "k=%d, ix=0x%p, "
    779				       "first=0x%p\n", k,
    780				       ix, EXT_FIRST_INDEX(eh));
    781				printk(KERN_DEBUG "%u <= %u\n",
    782				       le32_to_cpu(ix->ei_block),
    783				       le32_to_cpu(ix[-1].ei_block));
    784			}
    785			BUG_ON(k && le32_to_cpu(ix->ei_block)
    786					   <= le32_to_cpu(ix[-1].ei_block));
    787			if (block < le32_to_cpu(ix->ei_block))
    788				break;
    789			chix = ix;
    790		}
    791		BUG_ON(chix != path->p_idx);
    792	}
    793#endif
    794
    795}
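        /*
         * Editor's note (not part of the upstream source): the search above
         * starts at EXT_FIRST_INDEX(eh) + 1 so that l - 1 is always a valid
         * entry.  When the loop ends, l points just past the last entry whose
         * ei_block does not exceed the requested block, so p_idx = l - 1 is
         * the closest index at or before it (falling back to the first index
         * when every later entry starts beyond the block).
         * ext4_ext_binsearch() below uses the same scheme for leaf extents.
         */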
    796
    797/*
    798 * ext4_ext_binsearch:
    799 * binary search for closest extent of the given block
    800 * the header must be checked before calling this
    801 */
    802static void
    803ext4_ext_binsearch(struct inode *inode,
    804		struct ext4_ext_path *path, ext4_lblk_t block)
    805{
    806	struct ext4_extent_header *eh = path->p_hdr;
    807	struct ext4_extent *r, *l, *m;
    808
    809	if (eh->eh_entries == 0) {
    810		/*
    811		 * this leaf is empty:
    812		 * we get such a leaf in split/add case
    813		 */
    814		return;
    815	}
    816
    817	ext_debug(inode, "binsearch for %u:  ", block);
    818
    819	l = EXT_FIRST_EXTENT(eh) + 1;
    820	r = EXT_LAST_EXTENT(eh);
    821
    822	while (l <= r) {
    823		m = l + (r - l) / 2;
    824		ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
    825			  le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block),
    826			  r, le32_to_cpu(r->ee_block));
    827
    828		if (block < le32_to_cpu(m->ee_block))
    829			r = m - 1;
    830		else
    831			l = m + 1;
    832	}
    833
    834	path->p_ext = l - 1;
    835	ext_debug(inode, "  -> %d:%llu:[%d]%d ",
    836			le32_to_cpu(path->p_ext->ee_block),
    837			ext4_ext_pblock(path->p_ext),
    838			ext4_ext_is_unwritten(path->p_ext),
    839			ext4_ext_get_actual_len(path->p_ext));
    840
    841#ifdef CHECK_BINSEARCH
    842	{
    843		struct ext4_extent *chex, *ex;
    844		int k;
    845
    846		chex = ex = EXT_FIRST_EXTENT(eh);
    847		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
    848			BUG_ON(k && le32_to_cpu(ex->ee_block)
    849					  <= le32_to_cpu(ex[-1].ee_block));
    850			if (block < le32_to_cpu(ex->ee_block))
    851				break;
    852			chex = ex;
    853		}
    854		BUG_ON(chex != path->p_ext);
    855	}
    856#endif
    857
    858}
    859
    860void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
    861{
    862	struct ext4_extent_header *eh;
    863
    864	eh = ext_inode_hdr(inode);
    865	eh->eh_depth = 0;
    866	eh->eh_entries = 0;
    867	eh->eh_magic = EXT4_EXT_MAGIC;
    868	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
    869	eh->eh_generation = 0;
    870	ext4_mark_inode_dirty(handle, inode);
    871}
    872
    873struct ext4_ext_path *
    874ext4_find_extent(struct inode *inode, ext4_lblk_t block,
    875		 struct ext4_ext_path **orig_path, int flags)
    876{
    877	struct ext4_extent_header *eh;
    878	struct buffer_head *bh;
    879	struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
    880	short int depth, i, ppos = 0;
    881	int ret;
    882	gfp_t gfp_flags = GFP_NOFS;
    883
    884	if (flags & EXT4_EX_NOFAIL)
    885		gfp_flags |= __GFP_NOFAIL;
    886
    887	eh = ext_inode_hdr(inode);
    888	depth = ext_depth(inode);
    889	if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) {
    890		EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d",
    891				 depth);
    892		ret = -EFSCORRUPTED;
    893		goto err;
    894	}
    895
    896	if (path) {
    897		ext4_ext_drop_refs(path);
    898		if (depth > path[0].p_maxdepth) {
    899			kfree(path);
    900			*orig_path = path = NULL;
    901		}
    902	}
    903	if (!path) {
    904		/* account possible depth increase */
    905		path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
    906				gfp_flags);
    907		if (unlikely(!path))
    908			return ERR_PTR(-ENOMEM);
    909		path[0].p_maxdepth = depth + 1;
    910	}
    911	path[0].p_hdr = eh;
    912	path[0].p_bh = NULL;
    913
    914	i = depth;
    915	if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
    916		ext4_cache_extents(inode, eh);
    917	/* walk through the tree */
    918	while (i) {
    919		ext_debug(inode, "depth %d: num %d, max %d\n",
    920			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
    921
    922		ext4_ext_binsearch_idx(inode, path + ppos, block);
    923		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
    924		path[ppos].p_depth = i;
    925		path[ppos].p_ext = NULL;
    926
    927		bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
    928		if (IS_ERR(bh)) {
    929			ret = PTR_ERR(bh);
    930			goto err;
    931		}
    932
    933		eh = ext_block_hdr(bh);
    934		ppos++;
    935		path[ppos].p_bh = bh;
    936		path[ppos].p_hdr = eh;
    937	}
    938
    939	path[ppos].p_depth = i;
    940	path[ppos].p_ext = NULL;
    941	path[ppos].p_idx = NULL;
    942
    943	/* find extent */
    944	ext4_ext_binsearch(inode, path + ppos, block);
    945	/* if not an empty leaf */
    946	if (path[ppos].p_ext)
    947		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
    948
    949	ext4_ext_show_path(inode, path);
    950
    951	return path;
    952
    953err:
    954	ext4_ext_drop_refs(path);
    955	kfree(path);
    956	if (orig_path)
    957		*orig_path = NULL;
    958	return ERR_PTR(ret);
    959}
    960
    961/*
    962 * ext4_ext_insert_index:
    963 * insert new index [@logical;@ptr] into the block at @curp;
    964 * check where to insert: before @curp or after @curp
    965 */
    966static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
    967				 struct ext4_ext_path *curp,
    968				 int logical, ext4_fsblk_t ptr)
    969{
    970	struct ext4_extent_idx *ix;
    971	int len, err;
    972
    973	err = ext4_ext_get_access(handle, inode, curp);
    974	if (err)
    975		return err;
    976
    977	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
    978		EXT4_ERROR_INODE(inode,
    979				 "logical %d == ei_block %d!",
    980				 logical, le32_to_cpu(curp->p_idx->ei_block));
    981		return -EFSCORRUPTED;
    982	}
    983
    984	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
    985			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
    986		EXT4_ERROR_INODE(inode,
    987				 "eh_entries %d >= eh_max %d!",
    988				 le16_to_cpu(curp->p_hdr->eh_entries),
    989				 le16_to_cpu(curp->p_hdr->eh_max));
    990		return -EFSCORRUPTED;
    991	}
    992
    993	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
    994		/* insert after */
    995		ext_debug(inode, "insert new index %d after: %llu\n",
    996			  logical, ptr);
    997		ix = curp->p_idx + 1;
    998	} else {
    999		/* insert before */
   1000		ext_debug(inode, "insert new index %d before: %llu\n",
   1001			  logical, ptr);
   1002		ix = curp->p_idx;
   1003	}
   1004
   1005	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
   1006	BUG_ON(len < 0);
   1007	if (len > 0) {
   1008		ext_debug(inode, "insert new index %d: "
   1009				"move %d indices from 0x%p to 0x%p\n",
   1010				logical, len, ix, ix + 1);
   1011		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
   1012	}
   1013
   1014	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
   1015		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
   1016		return -EFSCORRUPTED;
   1017	}
   1018
   1019	ix->ei_block = cpu_to_le32(logical);
   1020	ext4_idx_store_pblock(ix, ptr);
   1021	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
   1022
   1023	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
   1024		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
   1025		return -EFSCORRUPTED;
   1026	}
   1027
   1028	err = ext4_ext_dirty(handle, inode, curp);
   1029	ext4_std_error(inode->i_sb, err);
   1030
   1031	return err;
   1032}
   1033
   1034/*
   1035 * ext4_ext_split:
   1036 * inserts new subtree into the path, using free index entry
   1037 * at depth @at:
   1038 * - allocates all needed blocks (new leaf and all intermediate index blocks)
   1039 * - makes decision where to split
   1040 * - moves remaining extents and index entries (right to the split point)
   1041 *   into the newly allocated blocks
   1042 * - initializes subtree
   1043 */
   1044static int ext4_ext_split(handle_t *handle, struct inode *inode,
   1045			  unsigned int flags,
   1046			  struct ext4_ext_path *path,
   1047			  struct ext4_extent *newext, int at)
   1048{
   1049	struct buffer_head *bh = NULL;
   1050	int depth = ext_depth(inode);
   1051	struct ext4_extent_header *neh;
   1052	struct ext4_extent_idx *fidx;
   1053	int i = at, k, m, a;
   1054	ext4_fsblk_t newblock, oldblock;
   1055	__le32 border;
   1056	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
   1057	gfp_t gfp_flags = GFP_NOFS;
   1058	int err = 0;
   1059	size_t ext_size = 0;
   1060
   1061	if (flags & EXT4_EX_NOFAIL)
   1062		gfp_flags |= __GFP_NOFAIL;
   1063
   1064	/* make decision: where to split? */
   1065	/* FIXME: now decision is simplest: at current extent */
   1066
   1067	/* if current leaf will be split, then we should use
   1068	 * border from split point */
   1069	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
   1070		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
   1071		return -EFSCORRUPTED;
   1072	}
   1073	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
   1074		border = path[depth].p_ext[1].ee_block;
   1075		ext_debug(inode, "leaf will be split."
   1076				" next leaf starts at %d\n",
   1077				  le32_to_cpu(border));
   1078	} else {
   1079		border = newext->ee_block;
   1080		ext_debug(inode, "leaf will be added."
   1081				" next leaf starts at %d\n",
   1082				le32_to_cpu(border));
   1083	}
   1084
   1085	/*
    1086	 * If an error occurs, we stop processing and mark the
    1087	 * filesystem read-only. The index won't be inserted and
    1088	 * the tree will remain in a consistent state. The next
    1089	 * mount will repair the buffers too.
   1090	 */
   1091
   1092	/*
   1093	 * Get array to track all allocated blocks.
    1094	 * We need this to handle errors and to free the
    1095	 * allocated blocks if anything goes wrong.
   1096	 */
   1097	ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags);
   1098	if (!ablocks)
   1099		return -ENOMEM;
   1100
   1101	/* allocate all needed blocks */
   1102	ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at);
   1103	for (a = 0; a < depth - at; a++) {
   1104		newblock = ext4_ext_new_meta_block(handle, inode, path,
   1105						   newext, &err, flags);
   1106		if (newblock == 0)
   1107			goto cleanup;
   1108		ablocks[a] = newblock;
   1109	}
   1110
   1111	/* initialize new leaf */
   1112	newblock = ablocks[--a];
   1113	if (unlikely(newblock == 0)) {
   1114		EXT4_ERROR_INODE(inode, "newblock == 0!");
   1115		err = -EFSCORRUPTED;
   1116		goto cleanup;
   1117	}
   1118	bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
   1119	if (unlikely(!bh)) {
   1120		err = -ENOMEM;
   1121		goto cleanup;
   1122	}
   1123	lock_buffer(bh);
   1124
   1125	err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
   1126					     EXT4_JTR_NONE);
   1127	if (err)
   1128		goto cleanup;
   1129
   1130	neh = ext_block_hdr(bh);
   1131	neh->eh_entries = 0;
   1132	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
   1133	neh->eh_magic = EXT4_EXT_MAGIC;
   1134	neh->eh_depth = 0;
   1135	neh->eh_generation = 0;
   1136
   1137	/* move remainder of path[depth] to the new leaf */
   1138	if (unlikely(path[depth].p_hdr->eh_entries !=
   1139		     path[depth].p_hdr->eh_max)) {
   1140		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
   1141				 path[depth].p_hdr->eh_entries,
   1142				 path[depth].p_hdr->eh_max);
   1143		err = -EFSCORRUPTED;
   1144		goto cleanup;
   1145	}
   1146	/* start copy from next extent */
   1147	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
   1148	ext4_ext_show_move(inode, path, newblock, depth);
   1149	if (m) {
   1150		struct ext4_extent *ex;
   1151		ex = EXT_FIRST_EXTENT(neh);
   1152		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
   1153		le16_add_cpu(&neh->eh_entries, m);
   1154	}
   1155
   1156	/* zero out unused area in the extent block */
   1157	ext_size = sizeof(struct ext4_extent_header) +
   1158		sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
   1159	memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
   1160	ext4_extent_block_csum_set(inode, neh);
   1161	set_buffer_uptodate(bh);
   1162	unlock_buffer(bh);
   1163
   1164	err = ext4_handle_dirty_metadata(handle, inode, bh);
   1165	if (err)
   1166		goto cleanup;
   1167	brelse(bh);
   1168	bh = NULL;
   1169
   1170	/* correct old leaf */
   1171	if (m) {
   1172		err = ext4_ext_get_access(handle, inode, path + depth);
   1173		if (err)
   1174			goto cleanup;
   1175		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
   1176		err = ext4_ext_dirty(handle, inode, path + depth);
   1177		if (err)
   1178			goto cleanup;
   1179
   1180	}
   1181
   1182	/* create intermediate indexes */
   1183	k = depth - at - 1;
   1184	if (unlikely(k < 0)) {
   1185		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
   1186		err = -EFSCORRUPTED;
   1187		goto cleanup;
   1188	}
   1189	if (k)
   1190		ext_debug(inode, "create %d intermediate indices\n", k);
   1191	/* insert new index into current index block */
   1192	/* current depth stored in i var */
   1193	i = depth - 1;
   1194	while (k--) {
   1195		oldblock = newblock;
   1196		newblock = ablocks[--a];
   1197		bh = sb_getblk(inode->i_sb, newblock);
   1198		if (unlikely(!bh)) {
   1199			err = -ENOMEM;
   1200			goto cleanup;
   1201		}
   1202		lock_buffer(bh);
   1203
   1204		err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
   1205						     EXT4_JTR_NONE);
   1206		if (err)
   1207			goto cleanup;
   1208
   1209		neh = ext_block_hdr(bh);
   1210		neh->eh_entries = cpu_to_le16(1);
   1211		neh->eh_magic = EXT4_EXT_MAGIC;
   1212		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
   1213		neh->eh_depth = cpu_to_le16(depth - i);
   1214		neh->eh_generation = 0;
   1215		fidx = EXT_FIRST_INDEX(neh);
   1216		fidx->ei_block = border;
   1217		ext4_idx_store_pblock(fidx, oldblock);
   1218
   1219		ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n",
   1220				i, newblock, le32_to_cpu(border), oldblock);
   1221
   1222		/* move remainder of path[i] to the new index block */
   1223		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
   1224					EXT_LAST_INDEX(path[i].p_hdr))) {
   1225			EXT4_ERROR_INODE(inode,
   1226					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
   1227					 le32_to_cpu(path[i].p_ext->ee_block));
   1228			err = -EFSCORRUPTED;
   1229			goto cleanup;
   1230		}
   1231		/* start copy indexes */
   1232		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
   1233		ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx,
   1234				EXT_MAX_INDEX(path[i].p_hdr));
   1235		ext4_ext_show_move(inode, path, newblock, i);
   1236		if (m) {
   1237			memmove(++fidx, path[i].p_idx,
   1238				sizeof(struct ext4_extent_idx) * m);
   1239			le16_add_cpu(&neh->eh_entries, m);
   1240		}
   1241		/* zero out unused area in the extent block */
   1242		ext_size = sizeof(struct ext4_extent_header) +
   1243		   (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
   1244		memset(bh->b_data + ext_size, 0,
   1245			inode->i_sb->s_blocksize - ext_size);
   1246		ext4_extent_block_csum_set(inode, neh);
   1247		set_buffer_uptodate(bh);
   1248		unlock_buffer(bh);
   1249
   1250		err = ext4_handle_dirty_metadata(handle, inode, bh);
   1251		if (err)
   1252			goto cleanup;
   1253		brelse(bh);
   1254		bh = NULL;
   1255
   1256		/* correct old index */
   1257		if (m) {
   1258			err = ext4_ext_get_access(handle, inode, path + i);
   1259			if (err)
   1260				goto cleanup;
   1261			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
   1262			err = ext4_ext_dirty(handle, inode, path + i);
   1263			if (err)
   1264				goto cleanup;
   1265		}
   1266
   1267		i--;
   1268	}
   1269
   1270	/* insert new index */
   1271	err = ext4_ext_insert_index(handle, inode, path + at,
   1272				    le32_to_cpu(border), newblock);
   1273
   1274cleanup:
   1275	if (bh) {
   1276		if (buffer_locked(bh))
   1277			unlock_buffer(bh);
   1278		brelse(bh);
   1279	}
   1280
   1281	if (err) {
   1282		/* free all allocated blocks in error case */
   1283		for (i = 0; i < depth; i++) {
   1284			if (!ablocks[i])
   1285				continue;
   1286			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
   1287					 EXT4_FREE_BLOCKS_METADATA);
   1288		}
   1289	}
   1290	kfree(ablocks);
   1291
   1292	return err;
   1293}
   1294
   1295/*
   1296 * ext4_ext_grow_indepth:
   1297 * implements tree growing procedure:
   1298 * - allocates new block
   1299 * - moves top-level data (index block or leaf) into the new block
   1300 * - initializes new top-level, creating index that points to the
   1301 *   just created block
   1302 */
   1303static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
   1304				 unsigned int flags)
   1305{
   1306	struct ext4_extent_header *neh;
   1307	struct buffer_head *bh;
   1308	ext4_fsblk_t newblock, goal = 0;
   1309	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
   1310	int err = 0;
   1311	size_t ext_size = 0;
   1312
   1313	/* Try to prepend new index to old one */
   1314	if (ext_depth(inode))
   1315		goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
   1316	if (goal > le32_to_cpu(es->s_first_data_block)) {
   1317		flags |= EXT4_MB_HINT_TRY_GOAL;
   1318		goal--;
   1319	} else
   1320		goal = ext4_inode_to_goal_block(inode);
   1321	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
   1322					NULL, &err);
   1323	if (newblock == 0)
   1324		return err;
   1325
   1326	bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
   1327	if (unlikely(!bh))
   1328		return -ENOMEM;
   1329	lock_buffer(bh);
   1330
   1331	err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
   1332					     EXT4_JTR_NONE);
   1333	if (err) {
   1334		unlock_buffer(bh);
   1335		goto out;
   1336	}
   1337
   1338	ext_size = sizeof(EXT4_I(inode)->i_data);
   1339	/* move top-level index/leaf into new block */
   1340	memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
   1341	/* zero out unused area in the extent block */
   1342	memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
   1343
   1344	/* set size of new block */
   1345	neh = ext_block_hdr(bh);
   1346	/* old root could have indexes or leaves
    1347	 * so calculate eh_max the right way */
   1348	if (ext_depth(inode))
   1349		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
   1350	else
   1351		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
   1352	neh->eh_magic = EXT4_EXT_MAGIC;
   1353	ext4_extent_block_csum_set(inode, neh);
   1354	set_buffer_uptodate(bh);
   1355	set_buffer_verified(bh);
   1356	unlock_buffer(bh);
   1357
   1358	err = ext4_handle_dirty_metadata(handle, inode, bh);
   1359	if (err)
   1360		goto out;
   1361
   1362	/* Update top-level index: num,max,pointer */
   1363	neh = ext_inode_hdr(inode);
   1364	neh->eh_entries = cpu_to_le16(1);
   1365	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
   1366	if (neh->eh_depth == 0) {
   1367		/* Root extent block becomes index block */
   1368		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
   1369		EXT_FIRST_INDEX(neh)->ei_block =
   1370			EXT_FIRST_EXTENT(neh)->ee_block;
   1371	}
   1372	ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n",
   1373		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
   1374		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
   1375		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
   1376
   1377	le16_add_cpu(&neh->eh_depth, 1);
   1378	err = ext4_mark_inode_dirty(handle, inode);
   1379out:
   1380	brelse(bh);
   1381
   1382	return err;
   1383}
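        /*
         * Editor's note (not part of the upstream source): after this
         * function returns, the old root's entries live in the newly
         * allocated block and the inode's i_data holds a single index entry
         * pointing at it, with ei_block taken from the first entry and
         * eh_depth incremented by one.  The caller
         * (ext4_ext_create_new_leaf()) then re-reads the path with
         * ext4_find_extent() and retries the insert.
         */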
   1384
   1385/*
   1386 * ext4_ext_create_new_leaf:
    1387 * finds an empty index and adds a new leaf.
    1388 * if no free index is found, then it requests growing the tree in depth.
   1389 */
   1390static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
   1391				    unsigned int mb_flags,
   1392				    unsigned int gb_flags,
   1393				    struct ext4_ext_path **ppath,
   1394				    struct ext4_extent *newext)
   1395{
   1396	struct ext4_ext_path *path = *ppath;
   1397	struct ext4_ext_path *curp;
   1398	int depth, i, err = 0;
   1399
   1400repeat:
   1401	i = depth = ext_depth(inode);
   1402
    1403	/* walk up the tree and look for a free index entry */
   1404	curp = path + depth;
   1405	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
   1406		i--;
   1407		curp--;
   1408	}
   1409
    1410	/* we use an already allocated block for the index block,
    1411	 * so subsequent data blocks should be contiguous */
   1412	if (EXT_HAS_FREE_INDEX(curp)) {
    1413		/* if we found an index with a free entry, then use that
    1414		 * entry: create the needed subtree and add the new leaf */
   1415		err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
   1416		if (err)
   1417			goto out;
   1418
   1419		/* refill path */
   1420		path = ext4_find_extent(inode,
   1421				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
   1422				    ppath, gb_flags);
   1423		if (IS_ERR(path))
   1424			err = PTR_ERR(path);
   1425	} else {
   1426		/* tree is full, time to grow in depth */
   1427		err = ext4_ext_grow_indepth(handle, inode, mb_flags);
   1428		if (err)
   1429			goto out;
   1430
   1431		/* refill path */
   1432		path = ext4_find_extent(inode,
   1433				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
   1434				    ppath, gb_flags);
   1435		if (IS_ERR(path)) {
   1436			err = PTR_ERR(path);
   1437			goto out;
   1438		}
   1439
   1440		/*
    1441		 * only the first grow (depth 0 -> 1) produces free space;
   1442		 * in all other cases we have to split the grown tree
   1443		 */
   1444		depth = ext_depth(inode);
   1445		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
   1446			/* now we need to split */
   1447			goto repeat;
   1448		}
   1449	}
   1450
   1451out:
   1452	return err;
   1453}
   1454
   1455/*
    1456 * search for the closest allocated block to the left of *logical
    1457 * and return it at @logical + its physical address at @phys.
    1458 * if *logical is the smallest allocated block, the function
    1459 * returns 0 at @phys.
    1460 * the return value is 0 (success) or an error code.
   1461 */
   1462static int ext4_ext_search_left(struct inode *inode,
   1463				struct ext4_ext_path *path,
   1464				ext4_lblk_t *logical, ext4_fsblk_t *phys)
   1465{
   1466	struct ext4_extent_idx *ix;
   1467	struct ext4_extent *ex;
   1468	int depth, ee_len;
   1469
   1470	if (unlikely(path == NULL)) {
   1471		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
   1472		return -EFSCORRUPTED;
   1473	}
   1474	depth = path->p_depth;
   1475	*phys = 0;
   1476
   1477	if (depth == 0 && path->p_ext == NULL)
   1478		return 0;
   1479
    1480	/* usually the extent in the path covers blocks smaller
    1481	 * than *logical, but it can be that the extent is the
   1482	 * first one in the file */
   1483
   1484	ex = path[depth].p_ext;
   1485	ee_len = ext4_ext_get_actual_len(ex);
   1486	if (*logical < le32_to_cpu(ex->ee_block)) {
   1487		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
   1488			EXT4_ERROR_INODE(inode,
   1489					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
   1490					 *logical, le32_to_cpu(ex->ee_block));
   1491			return -EFSCORRUPTED;
   1492		}
   1493		while (--depth >= 0) {
   1494			ix = path[depth].p_idx;
   1495			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
   1496				EXT4_ERROR_INODE(inode,
   1497				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
   1498				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
   1499				  le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block),
   1500				  depth);
   1501				return -EFSCORRUPTED;
   1502			}
   1503		}
   1504		return 0;
   1505	}
   1506
   1507	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
   1508		EXT4_ERROR_INODE(inode,
   1509				 "logical %d < ee_block %d + ee_len %d!",
   1510				 *logical, le32_to_cpu(ex->ee_block), ee_len);
   1511		return -EFSCORRUPTED;
   1512	}
   1513
   1514	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
   1515	*phys = ext4_ext_pblock(ex) + ee_len - 1;
   1516	return 0;
   1517}
   1518
   1519/*
    1520 * Search for the closest allocated block to the right of *logical
    1521 * and return it at @logical + its physical address at @phys.
    1522 * If no such block exists, return 0 and set @phys to 0. We return
    1523 * 1 when we found an allocated block and ret_ex is valid.
    1524 * Otherwise return a (< 0) error code.
   1525 */
   1526static int ext4_ext_search_right(struct inode *inode,
   1527				 struct ext4_ext_path *path,
   1528				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
   1529				 struct ext4_extent *ret_ex)
   1530{
   1531	struct buffer_head *bh = NULL;
   1532	struct ext4_extent_header *eh;
   1533	struct ext4_extent_idx *ix;
   1534	struct ext4_extent *ex;
   1535	int depth;	/* Note, NOT eh_depth; depth from top of tree */
   1536	int ee_len;
   1537
   1538	if (unlikely(path == NULL)) {
   1539		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
   1540		return -EFSCORRUPTED;
   1541	}
   1542	depth = path->p_depth;
   1543	*phys = 0;
   1544
   1545	if (depth == 0 && path->p_ext == NULL)
   1546		return 0;
   1547
    1548	/* usually the extent in the path covers blocks smaller
    1549	 * than *logical, but it can be that the extent is the
   1550	 * first one in the file */
   1551
   1552	ex = path[depth].p_ext;
   1553	ee_len = ext4_ext_get_actual_len(ex);
   1554	if (*logical < le32_to_cpu(ex->ee_block)) {
   1555		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
   1556			EXT4_ERROR_INODE(inode,
   1557					 "first_extent(path[%d].p_hdr) != ex",
   1558					 depth);
   1559			return -EFSCORRUPTED;
   1560		}
   1561		while (--depth >= 0) {
   1562			ix = path[depth].p_idx;
   1563			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
   1564				EXT4_ERROR_INODE(inode,
   1565						 "ix != EXT_FIRST_INDEX *logical %d!",
   1566						 *logical);
   1567				return -EFSCORRUPTED;
   1568			}
   1569		}
   1570		goto found_extent;
   1571	}
   1572
   1573	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
   1574		EXT4_ERROR_INODE(inode,
   1575				 "logical %d < ee_block %d + ee_len %d!",
   1576				 *logical, le32_to_cpu(ex->ee_block), ee_len);
   1577		return -EFSCORRUPTED;
   1578	}
   1579
   1580	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
   1581		/* next allocated block in this leaf */
   1582		ex++;
   1583		goto found_extent;
   1584	}
   1585
   1586	/* go up and search for index to the right */
   1587	while (--depth >= 0) {
   1588		ix = path[depth].p_idx;
   1589		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
   1590			goto got_index;
   1591	}
   1592
   1593	/* we've gone up to the root and found no index to the right */
   1594	return 0;
   1595
   1596got_index:
   1597	/* we've found index to the right, let's
   1598	 * follow it and find the closest allocated
   1599	 * block to the right */
   1600	ix++;
   1601	while (++depth < path->p_depth) {
   1602		/* subtract from p_depth to get proper eh_depth */
   1603		bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
   1604		if (IS_ERR(bh))
   1605			return PTR_ERR(bh);
   1606		eh = ext_block_hdr(bh);
   1607		ix = EXT_FIRST_INDEX(eh);
   1608		put_bh(bh);
   1609	}
   1610
   1611	bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
   1612	if (IS_ERR(bh))
   1613		return PTR_ERR(bh);
   1614	eh = ext_block_hdr(bh);
   1615	ex = EXT_FIRST_EXTENT(eh);
   1616found_extent:
   1617	*logical = le32_to_cpu(ex->ee_block);
   1618	*phys = ext4_ext_pblock(ex);
   1619	if (ret_ex)
   1620		*ret_ex = *ex;
   1621	if (bh)
   1622		put_bh(bh);
   1623	return 1;
   1624}
   1625
   1626/*
   1627 * ext4_ext_next_allocated_block:
    1628 * returns the allocated block in the subsequent extent or EXT_MAX_BLOCKS.
    1629 * NOTE: it treats the block number from an index entry as an
    1630 * allocated block. Thus, index entries have to be consistent
   1631 * with leaves.
   1632 */
   1633ext4_lblk_t
   1634ext4_ext_next_allocated_block(struct ext4_ext_path *path)
   1635{
   1636	int depth;
   1637
   1638	BUG_ON(path == NULL);
   1639	depth = path->p_depth;
   1640
   1641	if (depth == 0 && path->p_ext == NULL)
   1642		return EXT_MAX_BLOCKS;
   1643
   1644	while (depth >= 0) {
   1645		struct ext4_ext_path *p = &path[depth];
   1646
   1647		if (depth == path->p_depth) {
   1648			/* leaf */
   1649			if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr))
   1650				return le32_to_cpu(p->p_ext[1].ee_block);
   1651		} else {
   1652			/* index */
   1653			if (p->p_idx != EXT_LAST_INDEX(p->p_hdr))
   1654				return le32_to_cpu(p->p_idx[1].ei_block);
   1655		}
   1656		depth--;
   1657	}
   1658
   1659	return EXT_MAX_BLOCKS;
   1660}
   1661
   1662/*
   1663 * ext4_ext_next_leaf_block:
   1664 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
   1665 */
   1666static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
   1667{
   1668	int depth;
   1669
   1670	BUG_ON(path == NULL);
   1671	depth = path->p_depth;
   1672
    1673	/* a zero-depth tree has no leaf blocks at all */
   1674	if (depth == 0)
   1675		return EXT_MAX_BLOCKS;
   1676
   1677	/* go to index block */
   1678	depth--;
   1679
   1680	while (depth >= 0) {
   1681		if (path[depth].p_idx !=
   1682				EXT_LAST_INDEX(path[depth].p_hdr))
   1683			return (ext4_lblk_t)
   1684				le32_to_cpu(path[depth].p_idx[1].ei_block);
   1685		depth--;
   1686	}
   1687
   1688	return EXT_MAX_BLOCKS;
   1689}
   1690
   1691/*
   1692 * ext4_ext_correct_indexes:
    1693 * if a leaf gets modified and the modified extent is the first in the leaf,
    1694 * then we have to correct all indexes above it.
    1695 * TODO: do we need to correct the tree in all cases?
   1696 */
   1697static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
   1698				struct ext4_ext_path *path)
   1699{
   1700	struct ext4_extent_header *eh;
   1701	int depth = ext_depth(inode);
   1702	struct ext4_extent *ex;
   1703	__le32 border;
   1704	int k, err = 0;
   1705
   1706	eh = path[depth].p_hdr;
   1707	ex = path[depth].p_ext;
   1708
   1709	if (unlikely(ex == NULL || eh == NULL)) {
   1710		EXT4_ERROR_INODE(inode,
   1711				 "ex %p == NULL or eh %p == NULL", ex, eh);
   1712		return -EFSCORRUPTED;
   1713	}
   1714
   1715	if (depth == 0) {
   1716		/* there is no tree at all */
   1717		return 0;
   1718	}
   1719
   1720	if (ex != EXT_FIRST_EXTENT(eh)) {
    1721		/* we only correct the tree if the first extent in the leaf was modified */
   1722		return 0;
   1723	}
   1724
   1725	/*
   1726	 * TODO: we need correction if border is smaller than current one
   1727	 */
   1728	k = depth - 1;
   1729	border = path[depth].p_ext->ee_block;
   1730	err = ext4_ext_get_access(handle, inode, path + k);
   1731	if (err)
   1732		return err;
   1733	path[k].p_idx->ei_block = border;
   1734	err = ext4_ext_dirty(handle, inode, path + k);
   1735	if (err)
   1736		return err;
   1737
   1738	while (k--) {
   1739		/* change all left-side indexes */
   1740		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
   1741			break;
   1742		err = ext4_ext_get_access(handle, inode, path + k);
   1743		if (err)
   1744			break;
   1745		path[k].p_idx->ei_block = border;
   1746		err = ext4_ext_dirty(handle, inode, path + k);
   1747		if (err)
   1748			break;
   1749	}
   1750
   1751	return err;
   1752}
   1753
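       /*
        * Two extents can be merged only if they have the same unwritten
        * state, are logically and physically contiguous, and the combined
        * length stays within the per-extent limit.
        */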
   1754static int ext4_can_extents_be_merged(struct inode *inode,
   1755				      struct ext4_extent *ex1,
   1756				      struct ext4_extent *ex2)
   1757{
   1758	unsigned short ext1_ee_len, ext2_ee_len;
   1759
   1760	if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
   1761		return 0;
   1762
   1763	ext1_ee_len = ext4_ext_get_actual_len(ex1);
   1764	ext2_ee_len = ext4_ext_get_actual_len(ex2);
   1765
   1766	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
   1767			le32_to_cpu(ex2->ee_block))
   1768		return 0;
   1769
   1770	if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
   1771		return 0;
   1772
   1773	if (ext4_ext_is_unwritten(ex1) &&
   1774	    ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
   1775		return 0;
   1776#ifdef AGGRESSIVE_TEST
   1777	if (ext1_ee_len >= 4)
   1778		return 0;
   1779#endif
   1780
   1781	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
   1782		return 1;
   1783	return 0;
   1784}
   1785
   1786/*
   1787 * This function tries to merge the "ex" extent to the next extent in the tree.
   1788 * It always tries to merge towards the right. If you want to merge towards
   1789 * the left, pass "ex - 1" as the argument instead of "ex".
   1790 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
   1791 * 1 if they got merged.
   1792 */
   1793static int ext4_ext_try_to_merge_right(struct inode *inode,
   1794				 struct ext4_ext_path *path,
   1795				 struct ext4_extent *ex)
   1796{
   1797	struct ext4_extent_header *eh;
   1798	unsigned int depth, len;
   1799	int merge_done = 0, unwritten;
   1800
   1801	depth = ext_depth(inode);
   1802	BUG_ON(path[depth].p_hdr == NULL);
   1803	eh = path[depth].p_hdr;
   1804
   1805	while (ex < EXT_LAST_EXTENT(eh)) {
   1806		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
   1807			break;
   1808		/* merge with next extent! */
   1809		unwritten = ext4_ext_is_unwritten(ex);
   1810		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
   1811				+ ext4_ext_get_actual_len(ex + 1));
   1812		if (unwritten)
   1813			ext4_ext_mark_unwritten(ex);
   1814
   1815		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
   1816			len = (EXT_LAST_EXTENT(eh) - ex - 1)
   1817				* sizeof(struct ext4_extent);
   1818			memmove(ex + 1, ex + 2, len);
   1819		}
   1820		le16_add_cpu(&eh->eh_entries, -1);
   1821		merge_done = 1;
   1822		WARN_ON(eh->eh_entries == 0);
   1823		if (!eh->eh_entries)
   1824			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
   1825	}
   1826
   1827	return merge_done;
   1828}
   1829
   1830/*
   1831 * This function does a very simple check to see if we can collapse
   1832 * an extent tree with a single extent tree leaf block into the inode.
   1833 */
   1834static void ext4_ext_try_to_merge_up(handle_t *handle,
   1835				     struct inode *inode,
   1836				     struct ext4_ext_path *path)
   1837{
   1838	size_t s;
   1839	unsigned max_root = ext4_ext_space_root(inode, 0);
   1840	ext4_fsblk_t blk;
   1841
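       	/*
       	 * Collapse is only possible for a depth-1 tree whose single leaf
       	 * block has few enough entries to fit in the inode's root.
       	 */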
   1842	if ((path[0].p_depth != 1) ||
   1843	    (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
   1844	    (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
   1845		return;
   1846
   1847	/*
   1848	 * We need to modify the block allocation bitmap and the block
   1849	 * group descriptor to release the extent tree block.  If we
   1850	 * can't get the journal credits, give up.
   1851	 */
   1852	if (ext4_journal_extend(handle, 2,
   1853			ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
   1854		return;
   1855
   1856	/*
   1857	 * Copy the extent data up to the inode
   1858	 */
   1859	blk = ext4_idx_pblock(path[0].p_idx);
   1860	s = le16_to_cpu(path[1].p_hdr->eh_entries) *
   1861		sizeof(struct ext4_extent_idx);
   1862	s += sizeof(struct ext4_extent_header);
   1863
   1864	path[1].p_maxdepth = path[0].p_maxdepth;
   1865	memcpy(path[0].p_hdr, path[1].p_hdr, s);
   1866	path[0].p_depth = 0;
   1867	path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
   1868		(path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
   1869	path[0].p_hdr->eh_max = cpu_to_le16(max_root);
   1870
   1871	brelse(path[1].p_bh);
   1872	ext4_free_blocks(handle, inode, NULL, blk, 1,
   1873			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
   1874}
   1875
   1876/*
   1877 * This function tries to merge the @ex extent to neighbours in the tree, then
   1878 * tries to collapse the extent tree into the inode.
   1879 */
   1880static void ext4_ext_try_to_merge(handle_t *handle,
   1881				  struct inode *inode,
   1882				  struct ext4_ext_path *path,
   1883				  struct ext4_extent *ex)
   1884{
   1885	struct ext4_extent_header *eh;
   1886	unsigned int depth;
   1887	int merge_done = 0;
   1888
   1889	depth = ext_depth(inode);
   1890	BUG_ON(path[depth].p_hdr == NULL);
   1891	eh = path[depth].p_hdr;
   1892
   1893	if (ex > EXT_FIRST_EXTENT(eh))
   1894		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
   1895
   1896	if (!merge_done)
   1897		(void) ext4_ext_try_to_merge_right(inode, path, ex);
   1898
   1899	ext4_ext_try_to_merge_up(handle, inode, path);
   1900}
   1901
   1902/*
   1903 * check if a portion of the "newext" extent overlaps with an
   1904 * existing extent.
   1905 *
   1906 * If there is an overlap discovered, it updates the length of the newext
   1907 * such that there will be no overlap, and then returns 1.
   1908 * If there is no overlap found, it returns 0.
   1909 */
   1910static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
   1911					   struct inode *inode,
   1912					   struct ext4_extent *newext,
   1913					   struct ext4_ext_path *path)
   1914{
   1915	ext4_lblk_t b1, b2;
   1916	unsigned int depth, len1;
   1917	unsigned int ret = 0;
   1918
   1919	b1 = le32_to_cpu(newext->ee_block);
   1920	len1 = ext4_ext_get_actual_len(newext);
   1921	depth = ext_depth(inode);
   1922	if (!path[depth].p_ext)
   1923		goto out;
   1924	b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
   1925
   1926	/*
   1927	 * get the next allocated block if the extent in the path
   1928	 * is before the requested block(s)
   1929	 */
   1930	if (b2 < b1) {
   1931		b2 = ext4_ext_next_allocated_block(path);
   1932		if (b2 == EXT_MAX_BLOCKS)
   1933			goto out;
   1934		b2 = EXT4_LBLK_CMASK(sbi, b2);
   1935	}
   1936
   1937	/* check for wrap through zero on extent logical start block*/
   1938	if (b1 + len1 < b1) {
   1939		len1 = EXT_MAX_BLOCKS - b1;
   1940		newext->ee_len = cpu_to_le16(len1);
   1941		ret = 1;
   1942	}
   1943
   1944	/* check for overlap */
   1945	if (b1 + len1 > b2) {
   1946		newext->ee_len = cpu_to_le16(b2 - b1);
   1947		ret = 1;
   1948	}
   1949out:
   1950	return ret;
   1951}
   1952
   1953/*
   1954 * ext4_ext_insert_extent:
   1955 * tries to merge requested extent into the existing extent or
   1956 * inserts requested extent as new one into the tree,
   1957 * creating new leaf in the no-space case.
   1958 */
   1959int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
   1960				struct ext4_ext_path **ppath,
   1961				struct ext4_extent *newext, int gb_flags)
   1962{
   1963	struct ext4_ext_path *path = *ppath;
   1964	struct ext4_extent_header *eh;
   1965	struct ext4_extent *ex, *fex;
   1966	struct ext4_extent *nearex; /* nearest extent */
   1967	struct ext4_ext_path *npath = NULL;
   1968	int depth, len, err;
   1969	ext4_lblk_t next;
   1970	int mb_flags = 0, unwritten;
   1971
   1972	if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
   1973		mb_flags |= EXT4_MB_DELALLOC_RESERVED;
   1974	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
   1975		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
   1976		return -EFSCORRUPTED;
   1977	}
   1978	depth = ext_depth(inode);
   1979	ex = path[depth].p_ext;
   1980	eh = path[depth].p_hdr;
   1981	if (unlikely(path[depth].p_hdr == NULL)) {
   1982		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
   1983		return -EFSCORRUPTED;
   1984	}
   1985
   1986	/* try to insert block into found extent and return */
   1987	if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
   1988
   1989		/*
   1990		 * Try to see whether we should rather test the extent on
   1991		 * right from ex, or from the left of ex. This is because
   1992		 * ext4_find_extent() can return either extent on the
   1993		 * left, or on the right from the searched position. This
   1994		 * will make merging more effective.
   1995		 */
   1996		if (ex < EXT_LAST_EXTENT(eh) &&
   1997		    (le32_to_cpu(ex->ee_block) +
   1998		    ext4_ext_get_actual_len(ex) <
   1999		    le32_to_cpu(newext->ee_block))) {
   2000			ex += 1;
   2001			goto prepend;
   2002		} else if ((ex > EXT_FIRST_EXTENT(eh)) &&
   2003			   (le32_to_cpu(newext->ee_block) +
   2004			   ext4_ext_get_actual_len(newext) <
   2005			   le32_to_cpu(ex->ee_block)))
   2006			ex -= 1;
   2007
   2008		/* Try to append newex to the ex */
   2009		if (ext4_can_extents_be_merged(inode, ex, newext)) {
   2010			ext_debug(inode, "append [%d]%d block to %u:[%d]%d"
   2011				  "(from %llu)\n",
   2012				  ext4_ext_is_unwritten(newext),
   2013				  ext4_ext_get_actual_len(newext),
   2014				  le32_to_cpu(ex->ee_block),
   2015				  ext4_ext_is_unwritten(ex),
   2016				  ext4_ext_get_actual_len(ex),
   2017				  ext4_ext_pblock(ex));
   2018			err = ext4_ext_get_access(handle, inode,
   2019						  path + depth);
   2020			if (err)
   2021				return err;
   2022			unwritten = ext4_ext_is_unwritten(ex);
   2023			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
   2024					+ ext4_ext_get_actual_len(newext));
   2025			if (unwritten)
   2026				ext4_ext_mark_unwritten(ex);
   2027			nearex = ex;
   2028			goto merge;
   2029		}
   2030
   2031prepend:
   2032		/* Try to prepend newex to the ex */
   2033		if (ext4_can_extents_be_merged(inode, newext, ex)) {
   2034			ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d"
   2035				  "(from %llu)\n",
   2036				  le32_to_cpu(newext->ee_block),
   2037				  ext4_ext_is_unwritten(newext),
   2038				  ext4_ext_get_actual_len(newext),
   2039				  le32_to_cpu(ex->ee_block),
   2040				  ext4_ext_is_unwritten(ex),
   2041				  ext4_ext_get_actual_len(ex),
   2042				  ext4_ext_pblock(ex));
   2043			err = ext4_ext_get_access(handle, inode,
   2044						  path + depth);
   2045			if (err)
   2046				return err;
   2047
   2048			unwritten = ext4_ext_is_unwritten(ex);
   2049			ex->ee_block = newext->ee_block;
   2050			ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
   2051			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
   2052					+ ext4_ext_get_actual_len(newext));
   2053			if (unwritten)
   2054				ext4_ext_mark_unwritten(ex);
   2055			nearex = ex;
   2056			goto merge;
   2057		}
   2058	}
   2059
   2060	depth = ext_depth(inode);
   2061	eh = path[depth].p_hdr;
   2062	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
   2063		goto has_space;
   2064
   2065	/* probably next leaf has space for us? */
   2066	fex = EXT_LAST_EXTENT(eh);
   2067	next = EXT_MAX_BLOCKS;
   2068	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
   2069		next = ext4_ext_next_leaf_block(path);
   2070	if (next != EXT_MAX_BLOCKS) {
   2071		ext_debug(inode, "next leaf block - %u\n", next);
   2072		BUG_ON(npath != NULL);
   2073		npath = ext4_find_extent(inode, next, NULL, gb_flags);
   2074		if (IS_ERR(npath))
   2075			return PTR_ERR(npath);
   2076		BUG_ON(npath->p_depth != path->p_depth);
   2077		eh = npath[depth].p_hdr;
   2078		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
   2079			ext_debug(inode, "next leaf isn't full(%d)\n",
   2080				  le16_to_cpu(eh->eh_entries));
   2081			path = npath;
   2082			goto has_space;
   2083		}
   2084		ext_debug(inode, "next leaf has no free space(%d,%d)\n",
   2085			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
   2086	}
   2087
   2088	/*
   2089	 * There is no free space in the found leaf.
   2090	 * We're going to add a new leaf to the tree.
   2091	 */
   2092	if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
   2093		mb_flags |= EXT4_MB_USE_RESERVED;
   2094	err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
   2095				       ppath, newext);
   2096	if (err)
   2097		goto cleanup;
   2098	depth = ext_depth(inode);
   2099	eh = path[depth].p_hdr;
   2100
   2101has_space:
   2102	nearex = path[depth].p_ext;
   2103
   2104	err = ext4_ext_get_access(handle, inode, path + depth);
   2105	if (err)
   2106		goto cleanup;
   2107
   2108	if (!nearex) {
   2109		/* there is no extent in this leaf, create first one */
   2110		ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n",
   2111				le32_to_cpu(newext->ee_block),
   2112				ext4_ext_pblock(newext),
   2113				ext4_ext_is_unwritten(newext),
   2114				ext4_ext_get_actual_len(newext));
   2115		nearex = EXT_FIRST_EXTENT(eh);
   2116	} else {
   2117		if (le32_to_cpu(newext->ee_block)
   2118			   > le32_to_cpu(nearex->ee_block)) {
   2119			/* Insert after */
   2120			ext_debug(inode, "insert %u:%llu:[%d]%d before: "
   2121					"nearest %p\n",
   2122					le32_to_cpu(newext->ee_block),
   2123					ext4_ext_pblock(newext),
   2124					ext4_ext_is_unwritten(newext),
   2125					ext4_ext_get_actual_len(newext),
   2126					nearex);
   2127			nearex++;
   2128		} else {
   2129			/* Insert before */
   2130			BUG_ON(newext->ee_block == nearex->ee_block);
   2131			ext_debug(inode, "insert %u:%llu:[%d]%d after: "
   2132					"nearest %p\n",
   2133					le32_to_cpu(newext->ee_block),
   2134					ext4_ext_pblock(newext),
   2135					ext4_ext_is_unwritten(newext),
   2136					ext4_ext_get_actual_len(newext),
   2137					nearex);
   2138		}
   2139		len = EXT_LAST_EXTENT(eh) - nearex + 1;
   2140		if (len > 0) {
   2141			ext_debug(inode, "insert %u:%llu:[%d]%d: "
   2142					"move %d extents from 0x%p to 0x%p\n",
   2143					le32_to_cpu(newext->ee_block),
   2144					ext4_ext_pblock(newext),
   2145					ext4_ext_is_unwritten(newext),
   2146					ext4_ext_get_actual_len(newext),
   2147					len, nearex, nearex + 1);
   2148			memmove(nearex + 1, nearex,
   2149				len * sizeof(struct ext4_extent));
   2150		}
   2151	}
   2152
   2153	le16_add_cpu(&eh->eh_entries, 1);
   2154	path[depth].p_ext = nearex;
   2155	nearex->ee_block = newext->ee_block;
   2156	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
   2157	nearex->ee_len = newext->ee_len;
   2158
   2159merge:
   2160	/* try to merge extents */
   2161	if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
   2162		ext4_ext_try_to_merge(handle, inode, path, nearex);
   2163
   2164
   2165	/* time to correct all indexes above */
   2166	err = ext4_ext_correct_indexes(handle, inode, path);
   2167	if (err)
   2168		goto cleanup;
   2169
   2170	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
   2171
   2172cleanup:
   2173	ext4_ext_drop_refs(npath);
   2174	kfree(npath);
   2175	return err;
   2176}
   2177
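       /*
        * Report extents for the range [block, block + num) using only the
        * extent status cache; the scan stops at the first block not present
        * in the cache.  Each cached extent is translated into a fiemap entry.
        */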
   2178static int ext4_fill_es_cache_info(struct inode *inode,
   2179				   ext4_lblk_t block, ext4_lblk_t num,
   2180				   struct fiemap_extent_info *fieinfo)
   2181{
   2182	ext4_lblk_t next, end = block + num - 1;
   2183	struct extent_status es;
   2184	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
   2185	unsigned int flags;
   2186	int err;
   2187
   2188	while (block <= end) {
   2189		next = 0;
   2190		flags = 0;
   2191		if (!ext4_es_lookup_extent(inode, block, &next, &es))
   2192			break;
   2193		if (ext4_es_is_unwritten(&es))
   2194			flags |= FIEMAP_EXTENT_UNWRITTEN;
   2195		if (ext4_es_is_delayed(&es))
   2196			flags |= (FIEMAP_EXTENT_DELALLOC |
   2197				  FIEMAP_EXTENT_UNKNOWN);
   2198		if (ext4_es_is_hole(&es))
   2199			flags |= EXT4_FIEMAP_EXTENT_HOLE;
   2200		if (next == 0)
   2201			flags |= FIEMAP_EXTENT_LAST;
   2202		if (flags & (FIEMAP_EXTENT_DELALLOC|
   2203			     EXT4_FIEMAP_EXTENT_HOLE))
   2204			es.es_pblk = 0;
   2205		else
   2206			es.es_pblk = ext4_es_pblock(&es);
   2207		err = fiemap_fill_next_extent(fieinfo,
   2208				(__u64)es.es_lblk << blksize_bits,
   2209				(__u64)es.es_pblk << blksize_bits,
   2210				(__u64)es.es_len << blksize_bits,
   2211				flags);
   2212		if (next == 0)
   2213			break;
   2214		block = next;
   2215		if (err < 0)
   2216			return err;
   2217		if (err == 1)
   2218			return 0;
   2219	}
   2220	return 0;
   2221}
   2222
   2223
   2224/*
   2225 * ext4_ext_determine_hole - determine hole around given block
   2226 * @inode:	inode we lookup in
   2227 * @path:	path in extent tree to @lblk
   2228 * @lblk:	pointer to logical block around which we want to determine hole
   2229 *
   2230 * Determine hole length (and start if easily possible) around given logical
   2231 * block. We don't try too hard to find the beginning of the hole but @path
   2232 * actually points to extent before @lblk, we provide it.
   2233 *
   2234 * The function returns the length of a hole starting at @lblk. We update @lblk
   2235 * to the beginning of the hole if we managed to find it.
   2236 */
   2237static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
   2238					   struct ext4_ext_path *path,
   2239					   ext4_lblk_t *lblk)
   2240{
   2241	int depth = ext_depth(inode);
   2242	struct ext4_extent *ex;
   2243	ext4_lblk_t len;
   2244
   2245	ex = path[depth].p_ext;
   2246	if (ex == NULL) {
   2247		/* there is no extent yet, so gap is [0;-] */
   2248		*lblk = 0;
   2249		len = EXT_MAX_BLOCKS;
   2250	} else if (*lblk < le32_to_cpu(ex->ee_block)) {
   2251		len = le32_to_cpu(ex->ee_block) - *lblk;
   2252	} else if (*lblk >= le32_to_cpu(ex->ee_block)
   2253			+ ext4_ext_get_actual_len(ex)) {
   2254		ext4_lblk_t next;
   2255
   2256		*lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
   2257		next = ext4_ext_next_allocated_block(path);
   2258		BUG_ON(next == *lblk);
   2259		len = next - *lblk;
   2260	} else {
   2261		BUG();
   2262	}
   2263	return len;
   2264}
   2265
   2266/*
   2267 * ext4_ext_put_gap_in_cache:
   2268 * calculate boundaries of the gap that the requested block fits into
   2269 * and cache this gap
   2270 */
   2271static void
   2272ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
   2273			  ext4_lblk_t hole_len)
   2274{
   2275	struct extent_status es;
   2276
   2277	ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
   2278				  hole_start + hole_len - 1, &es);
   2279	if (es.es_len) {
   2280		/* Is there a delayed extent containing lblock? */
   2281		if (es.es_lblk <= hole_start)
   2282			return;
   2283		hole_len = min(es.es_lblk - hole_start, hole_len);
   2284	}
   2285	ext_debug(inode, " -> %u:%u\n", hole_start, hole_len);
   2286	ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
   2287			      EXTENT_STATUS_HOLE);
   2288}
   2289
   2290/*
   2291 * ext4_ext_rm_idx:
   2292 * removes index from the index block.
   2293 */
   2294static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
   2295			struct ext4_ext_path *path, int depth)
   2296{
   2297	int err;
   2298	ext4_fsblk_t leaf;
   2299
   2300	/* free index block */
   2301	depth--;
   2302	path = path + depth;
   2303	leaf = ext4_idx_pblock(path->p_idx);
   2304	if (unlikely(path->p_hdr->eh_entries == 0)) {
   2305		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
   2306		return -EFSCORRUPTED;
   2307	}
   2308	err = ext4_ext_get_access(handle, inode, path);
   2309	if (err)
   2310		return err;
   2311
   2312	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
   2313		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
   2314		len *= sizeof(struct ext4_extent_idx);
   2315		memmove(path->p_idx, path->p_idx + 1, len);
   2316	}
   2317
   2318	le16_add_cpu(&path->p_hdr->eh_entries, -1);
   2319	err = ext4_ext_dirty(handle, inode, path);
   2320	if (err)
   2321		return err;
   2322	ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf);
   2323	trace_ext4_ext_rm_idx(inode, leaf);
   2324
   2325	ext4_free_blocks(handle, inode, NULL, leaf, 1,
   2326			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
   2327
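       	/*
       	 * If the removed index entry was the first in its block, propagate
       	 * the new first logical block up to the parent index levels.
       	 */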
   2328	while (--depth >= 0) {
   2329		if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
   2330			break;
   2331		path--;
   2332		err = ext4_ext_get_access(handle, inode, path);
   2333		if (err)
   2334			break;
   2335		path->p_idx->ei_block = (path+1)->p_idx->ei_block;
   2336		err = ext4_ext_dirty(handle, inode, path);
   2337		if (err)
   2338			break;
   2339	}
   2340	return err;
   2341}
   2342
   2343/*
   2344 * ext4_ext_calc_credits_for_single_extent:
   2345 * This routine returns the max. credits needed to insert an extent
   2346 * into the extent tree.
   2347 * When passing the actual path, the caller should calculate credits
   2348 * under i_data_sem.
   2349 */
   2350int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
   2351						struct ext4_ext_path *path)
   2352{
   2353	if (path) {
   2354		int depth = ext_depth(inode);
   2355		int ret = 0;
   2356
   2357		/* probably there is space in leaf? */
   2358		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
   2359				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
   2360
   2361			/*
   2362			 *  There is some space in the leaf, so no
   2363			 *  need to account for a leaf block credit.
   2364			 *
   2365			 *  Bitmaps, block group descriptor blocks
   2366			 *  and other metadata blocks still need to be
   2367			 *  accounted for.
   2368			 */
   2369			/* 1 bitmap, 1 block group descriptor */
   2370			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
   2371			return ret;
   2372		}
   2373	}
   2374
   2375	return ext4_chunk_trans_blocks(inode, nrblocks);
   2376}
   2377
   2378/*
   2379 * How many index/leaf blocks need to be changed/allocated to add @extents extents?
   2380 *
   2381 * If we add a single extent, then in the worst case each tree level
   2382 * index/leaf needs to be changed in case the tree splits.
   2383 *
   2384 * If more extents are inserted, they could cause the whole tree to split more
   2385 * than once, but this is really rare.
   2386 */
   2387int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
   2388{
   2389	int index;
   2390	int depth;
   2391
   2392	/* If we are converting the inline data, only one is needed here. */
   2393	if (ext4_has_inline_data(inode))
   2394		return 1;
   2395
   2396	depth = ext_depth(inode);
   2397
   2398	if (extents <= 1)
   2399		index = depth * 2;
   2400	else
   2401		index = depth * 3;
   2402
   2403	return index;
   2404}
   2405
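       /*
        * Blocks freed from directories, symlinks and EA inodes are metadata
        * and must be forgotten in the journal; with data journaling, regular
        * data blocks need the forget treatment as well.
        */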
   2406static inline int get_default_free_blocks_flags(struct inode *inode)
   2407{
   2408	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
   2409	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
   2410		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
   2411	else if (ext4_should_journal_data(inode))
   2412		return EXT4_FREE_BLOCKS_FORGET;
   2413	return 0;
   2414}
   2415
   2416/*
   2417 * ext4_rereserve_cluster - increment the reserved cluster count when
   2418 *                          freeing a cluster with a pending reservation
   2419 *
   2420 * @inode - file containing the cluster
   2421 * @lblk - logical block in cluster to be reserved
   2422 *
   2423 * Increments the reserved cluster count and adjusts quota in a bigalloc
   2424 * file system when freeing a partial cluster containing at least one
   2425 * delayed and unwritten block.  A partial cluster meeting that
   2426 * requirement will have a pending reservation.  If so, the
   2427 * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
   2428 * defer reserved and allocated space accounting to a subsequent call
   2429 * to this function.
   2430 */
   2431static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
   2432{
   2433	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   2434	struct ext4_inode_info *ei = EXT4_I(inode);
   2435
   2436	dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));
   2437
   2438	spin_lock(&ei->i_block_reservation_lock);
   2439	ei->i_reserved_data_blocks++;
   2440	percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
   2441	spin_unlock(&ei->i_block_reservation_lock);
   2442
   2443	percpu_counter_add(&sbi->s_freeclusters_counter, 1);
   2444	ext4_remove_pending(inode, lblk);
   2445}
   2446
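       /*
        * Free the on-disk blocks backing the logical range [from, to] of
        * extent @ex (tail removal only) and track bigalloc partial-cluster
        * state in @partial across calls.
        */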
   2447static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
   2448			      struct ext4_extent *ex,
   2449			      struct partial_cluster *partial,
   2450			      ext4_lblk_t from, ext4_lblk_t to)
   2451{
   2452	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   2453	unsigned short ee_len = ext4_ext_get_actual_len(ex);
   2454	ext4_fsblk_t last_pblk, pblk;
   2455	ext4_lblk_t num;
   2456	int flags;
   2457
   2458	/* only extent tail removal is allowed */
   2459	if (from < le32_to_cpu(ex->ee_block) ||
   2460	    to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
   2461		ext4_error(sbi->s_sb,
   2462			   "strange request: removal(2) %u-%u from %u:%u",
   2463			   from, to, le32_to_cpu(ex->ee_block), ee_len);
   2464		return 0;
   2465	}
   2466
   2467#ifdef EXTENTS_STATS
   2468	spin_lock(&sbi->s_ext_stats_lock);
   2469	sbi->s_ext_blocks += ee_len;
   2470	sbi->s_ext_extents++;
   2471	if (ee_len < sbi->s_ext_min)
   2472		sbi->s_ext_min = ee_len;
   2473	if (ee_len > sbi->s_ext_max)
   2474		sbi->s_ext_max = ee_len;
   2475	if (ext_depth(inode) > sbi->s_depth_max)
   2476		sbi->s_depth_max = ext_depth(inode);
   2477	spin_unlock(&sbi->s_ext_stats_lock);
   2478#endif
   2479
   2480	trace_ext4_remove_blocks(inode, ex, from, to, partial);
   2481
   2482	/*
   2483	 * if we have a partial cluster, and it's different from the
   2484	 * cluster of the last block in the extent, we free it
   2485	 */
   2486	last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
   2487
   2488	if (partial->state != initial &&
   2489	    partial->pclu != EXT4_B2C(sbi, last_pblk)) {
   2490		if (partial->state == tofree) {
   2491			flags = get_default_free_blocks_flags(inode);
   2492			if (ext4_is_pending(inode, partial->lblk))
   2493				flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
   2494			ext4_free_blocks(handle, inode, NULL,
   2495					 EXT4_C2B(sbi, partial->pclu),
   2496					 sbi->s_cluster_ratio, flags);
   2497			if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
   2498				ext4_rereserve_cluster(inode, partial->lblk);
   2499		}
   2500		partial->state = initial;
   2501	}
   2502
   2503	num = le32_to_cpu(ex->ee_block) + ee_len - from;
   2504	pblk = ext4_ext_pblock(ex) + ee_len - num;
   2505
   2506	/*
   2507	 * We free the partial cluster at the end of the extent (if any),
   2508	 * unless the cluster is used by another extent (partial_cluster
   2509	 * state is nofree).  If a partial cluster exists here, it must be
   2510	 * shared with the last block in the extent.
   2511	 */
   2512	flags = get_default_free_blocks_flags(inode);
   2513
   2514	/* partial, left end cluster aligned, right end unaligned */
   2515	if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
   2516	    (EXT4_LBLK_CMASK(sbi, to) >= from) &&
   2517	    (partial->state != nofree)) {
   2518		if (ext4_is_pending(inode, to))
   2519			flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
   2520		ext4_free_blocks(handle, inode, NULL,
   2521				 EXT4_PBLK_CMASK(sbi, last_pblk),
   2522				 sbi->s_cluster_ratio, flags);
   2523		if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
   2524			ext4_rereserve_cluster(inode, to);
   2525		partial->state = initial;
   2526		flags = get_default_free_blocks_flags(inode);
   2527	}
   2528
   2529	flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
   2530
   2531	/*
   2532	 * For bigalloc file systems, we never free a partial cluster
   2533	 * at the beginning of the extent.  Instead, we check to see if we
   2534	 * need to free it on a subsequent call to ext4_remove_blocks,
   2535	 * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
   2536	 */
   2537	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
   2538	ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
   2539
   2540	/* reset the partial cluster if we've freed past it */
   2541	if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
   2542		partial->state = initial;
   2543
   2544	/*
   2545	 * If we've freed the entire extent but the beginning is not left
   2546	 * cluster aligned and is not marked as ineligible for freeing we
   2547	 * record the partial cluster at the beginning of the extent.  It
   2548	 * wasn't freed by the preceding ext4_free_blocks() call, and we
   2549	 * need to look farther to the left to determine if it's to be freed
   2550	 * (not shared with another extent). Else, reset the partial
   2551	 * cluster - we're either done freeing or the beginning of the
   2552	 * extent is left cluster aligned.
   2553	 */
   2554	if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
   2555		if (partial->state == initial) {
   2556			partial->pclu = EXT4_B2C(sbi, pblk);
   2557			partial->lblk = from;
   2558			partial->state = tofree;
   2559		}
   2560	} else {
   2561		partial->state = initial;
   2562	}
   2563
   2564	return 0;
   2565}
   2566
   2567/*
   2568 * ext4_ext_rm_leaf() Removes the extents associated with the
   2569 * blocks appearing between "start" and "end".  Both "start"
   2570 * and "end" must appear in the same extent or EIO is returned.
   2571 *
   2572 * @handle: The journal handle
   2573 * @inode:  The file's inode
   2574 * @path:   The path to the leaf
   2575 * @partial: The cluster which we'll have to free if all extents
   2576 *           have been released from it.  However, if its state is
   2577 *           nofree, it's a cluster just to the right of the
   2578 *           punched region and it must not be freed.
   2579 * @start:  The first block to remove
   2580 * @end:   The last block to remove
   2581 */
   2582static int
   2583ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
   2584		 struct ext4_ext_path *path,
   2585		 struct partial_cluster *partial,
   2586		 ext4_lblk_t start, ext4_lblk_t end)
   2587{
   2588	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   2589	int err = 0, correct_index = 0;
   2590	int depth = ext_depth(inode), credits, revoke_credits;
   2591	struct ext4_extent_header *eh;
   2592	ext4_lblk_t a, b;
   2593	unsigned num;
   2594	ext4_lblk_t ex_ee_block;
   2595	unsigned short ex_ee_len;
   2596	unsigned unwritten = 0;
   2597	struct ext4_extent *ex;
   2598	ext4_fsblk_t pblk;
   2599
   2600	/* the header must be checked already in ext4_ext_remove_space() */
   2601	ext_debug(inode, "truncate since %u in leaf to %u\n", start, end);
   2602	if (!path[depth].p_hdr)
   2603		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
   2604	eh = path[depth].p_hdr;
   2605	if (unlikely(path[depth].p_hdr == NULL)) {
   2606		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
   2607		return -EFSCORRUPTED;
   2608	}
   2609	/* find where to start removing */
   2610	ex = path[depth].p_ext;
   2611	if (!ex)
   2612		ex = EXT_LAST_EXTENT(eh);
   2613
   2614	ex_ee_block = le32_to_cpu(ex->ee_block);
   2615	ex_ee_len = ext4_ext_get_actual_len(ex);
   2616
   2617	trace_ext4_ext_rm_leaf(inode, start, ex, partial);
   2618
   2619	while (ex >= EXT_FIRST_EXTENT(eh) &&
   2620			ex_ee_block + ex_ee_len > start) {
   2621
   2622		if (ext4_ext_is_unwritten(ex))
   2623			unwritten = 1;
   2624		else
   2625			unwritten = 0;
   2626
   2627		ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block,
   2628			  unwritten, ex_ee_len);
   2629		path[depth].p_ext = ex;
   2630
   2631		a = ex_ee_block > start ? ex_ee_block : start;
   2632		b = ex_ee_block+ex_ee_len - 1 < end ?
   2633			ex_ee_block+ex_ee_len - 1 : end;
   2634
   2635		ext_debug(inode, "  border %u:%u\n", a, b);
   2636
   2637		/* If this extent is beyond the end of the hole, skip it */
   2638		if (end < ex_ee_block) {
   2639			/*
   2640			 * We're going to skip this extent and move to another,
   2641			 * so note that its first cluster is in use to avoid
   2642			 * freeing it when removing blocks.  Eventually, the
   2643			 * right edge of the truncated/punched region will
   2644			 * be just to the left.
   2645			 */
   2646			if (sbi->s_cluster_ratio > 1) {
   2647				pblk = ext4_ext_pblock(ex);
   2648				partial->pclu = EXT4_B2C(sbi, pblk);
   2649				partial->state = nofree;
   2650			}
   2651			ex--;
   2652			ex_ee_block = le32_to_cpu(ex->ee_block);
   2653			ex_ee_len = ext4_ext_get_actual_len(ex);
   2654			continue;
   2655		} else if (b != ex_ee_block + ex_ee_len - 1) {
   2656			EXT4_ERROR_INODE(inode,
   2657					 "can not handle truncate %u:%u "
   2658					 "on extent %u:%u",
   2659					 start, end, ex_ee_block,
   2660					 ex_ee_block + ex_ee_len - 1);
   2661			err = -EFSCORRUPTED;
   2662			goto out;
   2663		} else if (a != ex_ee_block) {
   2664			/* remove tail of the extent */
   2665			num = a - ex_ee_block;
   2666		} else {
   2667			/* remove whole extent: excellent! */
   2668			num = 0;
   2669		}
   2670		/*
   2671		 * 3 for leaf, sb, and inode plus 2 (bmap and group
   2672		 * descriptor) for each block group; assume two block
   2673		 * groups plus ex_ee_len/blocks_per_block_group for
   2674		 * the worst case
   2675		 */
   2676		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
   2677		if (ex == EXT_FIRST_EXTENT(eh)) {
   2678			correct_index = 1;
   2679			credits += (ext_depth(inode)) + 1;
   2680		}
   2681		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
   2682		/*
   2683		 * We may end up freeing some index blocks and data from the
   2684		 * punched range. Note that partial clusters are accounted for
   2685		 * by ext4_free_data_revoke_credits().
   2686		 */
   2687		revoke_credits =
   2688			ext4_free_metadata_revoke_credits(inode->i_sb,
   2689							  ext_depth(inode)) +
   2690			ext4_free_data_revoke_credits(inode, b - a + 1);
   2691
   2692		err = ext4_datasem_ensure_credits(handle, inode, credits,
   2693						  credits, revoke_credits);
   2694		if (err) {
   2695			if (err > 0)
   2696				err = -EAGAIN;
   2697			goto out;
   2698		}
   2699
   2700		err = ext4_ext_get_access(handle, inode, path + depth);
   2701		if (err)
   2702			goto out;
   2703
   2704		err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
   2705		if (err)
   2706			goto out;
   2707
   2708		if (num == 0)
   2709			/* this extent is removed; mark slot entirely unused */
   2710			ext4_ext_store_pblock(ex, 0);
   2711
   2712		ex->ee_len = cpu_to_le16(num);
   2713		/*
   2714		 * Do not mark unwritten if all the blocks in the
   2715		 * extent have been removed.
   2716		 */
   2717		if (unwritten && num)
   2718			ext4_ext_mark_unwritten(ex);
   2719		/*
   2720		 * If the extent was completely released,
   2721		 * we need to remove it from the leaf
   2722		 */
   2723		if (num == 0) {
   2724			if (end != EXT_MAX_BLOCKS - 1) {
   2725				/*
   2726				 * For hole punching, we need to scoot all the
   2727				 * extents up when an extent is removed so that
   2728				 * we don't have blank extents in the middle
   2729				 */
   2730				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
   2731					sizeof(struct ext4_extent));
   2732
   2733				/* Now get rid of the one at the end */
   2734				memset(EXT_LAST_EXTENT(eh), 0,
   2735					sizeof(struct ext4_extent));
   2736			}
   2737			le16_add_cpu(&eh->eh_entries, -1);
   2738		}
   2739
   2740		err = ext4_ext_dirty(handle, inode, path + depth);
   2741		if (err)
   2742			goto out;
   2743
   2744		ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num,
   2745				ext4_ext_pblock(ex));
   2746		ex--;
   2747		ex_ee_block = le32_to_cpu(ex->ee_block);
   2748		ex_ee_len = ext4_ext_get_actual_len(ex);
   2749	}
   2750
   2751	if (correct_index && eh->eh_entries)
   2752		err = ext4_ext_correct_indexes(handle, inode, path);
   2753
   2754	/*
   2755	 * If there's a partial cluster and at least one extent remains in
   2756	 * the leaf, free the partial cluster if it isn't shared with the
   2757	 * current extent.  If it is shared with the current extent
   2758	 * we reset the partial cluster because we've reached the start of the
   2759	 * truncated/punched region and we're done removing blocks.
   2760	 */
   2761	if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
   2762		pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
   2763		if (partial->pclu != EXT4_B2C(sbi, pblk)) {
   2764			int flags = get_default_free_blocks_flags(inode);
   2765
   2766			if (ext4_is_pending(inode, partial->lblk))
   2767				flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
   2768			ext4_free_blocks(handle, inode, NULL,
   2769					 EXT4_C2B(sbi, partial->pclu),
   2770					 sbi->s_cluster_ratio, flags);
   2771			if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
   2772				ext4_rereserve_cluster(inode, partial->lblk);
   2773		}
   2774		partial->state = initial;
   2775	}
   2776
   2777	/* if this leaf is free, then we should
   2778	 * remove it from index block above */
   2779	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
   2780		err = ext4_ext_rm_idx(handle, inode, path, depth);
   2781
   2782out:
   2783	return err;
   2784}
   2785
   2786/*
   2787 * ext4_ext_more_to_rm:
   2788 * returns 1 if current index has to be freed (even partial)
   2789 */
   2790static int
   2791ext4_ext_more_to_rm(struct ext4_ext_path *path)
   2792{
   2793	BUG_ON(path->p_idx == NULL);
   2794
   2795	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
   2796		return 0;
   2797
   2798	/*
   2799	 * if a truncate on a deeper level happened, it wasn't partial,
   2800	 * so we have to consider the current index for truncation
   2801	 */
   2802	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
   2803		return 0;
   2804	return 1;
   2805}
   2806
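       /*
        * Remove the extents covering the logical block range [start, end]
        * from the inode's extent tree, freeing the underlying blocks and any
        * index blocks that become empty.
        */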
   2807int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
   2808			  ext4_lblk_t end)
   2809{
   2810	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   2811	int depth = ext_depth(inode);
   2812	struct ext4_ext_path *path = NULL;
   2813	struct partial_cluster partial;
   2814	handle_t *handle;
   2815	int i = 0, err = 0;
   2816
   2817	partial.pclu = 0;
   2818	partial.lblk = 0;
   2819	partial.state = initial;
   2820
   2821	ext_debug(inode, "truncate since %u to %u\n", start, end);
   2822
   2823	/* probably the first extent we're going to free will be the last in its block */
   2824	handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
   2825			depth + 1,
   2826			ext4_free_metadata_revoke_credits(inode->i_sb, depth));
   2827	if (IS_ERR(handle))
   2828		return PTR_ERR(handle);
   2829
   2830again:
   2831	trace_ext4_ext_remove_space(inode, start, end, depth);
   2832
   2833	/*
   2834	 * Check if we are removing extents inside the extent tree. If that
   2835	 * is the case, we are going to punch a hole inside the extent tree
   2836	 * so we have to check whether we need to split the extent covering
   2837	 * the last block to remove so we can easily remove the part of it
   2838	 * in ext4_ext_rm_leaf().
   2839	 */
   2840	if (end < EXT_MAX_BLOCKS - 1) {
   2841		struct ext4_extent *ex;
   2842		ext4_lblk_t ee_block, ex_end, lblk;
   2843		ext4_fsblk_t pblk;
   2844
   2845		/* find extent for or closest extent to this block */
   2846		path = ext4_find_extent(inode, end, NULL,
   2847					EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
   2848		if (IS_ERR(path)) {
   2849			ext4_journal_stop(handle);
   2850			return PTR_ERR(path);
   2851		}
   2852		depth = ext_depth(inode);
   2853		/* A leaf may be missing only if the inode has no blocks at all */
   2854		ex = path[depth].p_ext;
   2855		if (!ex) {
   2856			if (depth) {
   2857				EXT4_ERROR_INODE(inode,
   2858						 "path[%d].p_hdr == NULL",
   2859						 depth);
   2860				err = -EFSCORRUPTED;
   2861			}
   2862			goto out;
   2863		}
   2864
   2865		ee_block = le32_to_cpu(ex->ee_block);
   2866		ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
   2867
   2868		/*
   2869		 * See if the last block is inside the extent, if so split
   2870		 * the extent at 'end' block so we can easily remove the
   2871		 * tail of the first part of the split extent in
   2872		 * ext4_ext_rm_leaf().
   2873		 */
   2874		if (end >= ee_block && end < ex_end) {
   2875
   2876			/*
   2877			 * If we're going to split the extent, note that
   2878			 * the cluster containing the block after 'end' is
   2879			 * in use to avoid freeing it when removing blocks.
   2880			 */
   2881			if (sbi->s_cluster_ratio > 1) {
   2882				pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
   2883				partial.pclu = EXT4_B2C(sbi, pblk);
   2884				partial.state = nofree;
   2885			}
   2886
   2887			/*
   2888			 * Split the extent in two so that 'end' is the last
   2889			 * block in the first new extent. Also we should not
   2890			 * fail removing space due to ENOSPC so try to use
   2891			 * reserved block if that happens.
   2892			 */
   2893			err = ext4_force_split_extent_at(handle, inode, &path,
   2894							 end + 1, 1);
   2895			if (err < 0)
   2896				goto out;
   2897
   2898		} else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
   2899			   partial.state == initial) {
   2900			/*
   2901			 * If we're punching, there's an extent to the right.
   2902			 * If the partial cluster hasn't been set, set it to
   2903			 * that extent's first cluster and its state to nofree
   2904			 * so it won't be freed should it contain blocks to be
   2905			 * removed. If it's already set (tofree/nofree), we're
   2906			 * retrying and keep the original partial cluster info
   2907			 * so a cluster marked tofree as a result of earlier
   2908			 * extent removal is not lost.
   2909			 */
   2910			lblk = ex_end + 1;
   2911			err = ext4_ext_search_right(inode, path, &lblk, &pblk,
   2912						    NULL);
   2913			if (err < 0)
   2914				goto out;
   2915			if (pblk) {
   2916				partial.pclu = EXT4_B2C(sbi, pblk);
   2917				partial.state = nofree;
   2918			}
   2919		}
   2920	}
   2921	/*
   2922	 * We start scanning from right side, freeing all the blocks
   2923	 * after i_size and walking into the tree depth-wise.
   2924	 */
   2925	depth = ext_depth(inode);
   2926	if (path) {
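       		/*
       		 * Reuse the path found for the split above; prime p_block at
       		 * each interior level so ext4_ext_more_to_rm() keeps descending
       		 * along this path.
       		 */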
   2927		int k = i = depth;
   2928		while (--k > 0)
   2929			path[k].p_block =
   2930				le16_to_cpu(path[k].p_hdr->eh_entries)+1;
   2931	} else {
   2932		path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
   2933			       GFP_NOFS | __GFP_NOFAIL);
   2934		if (path == NULL) {
   2935			ext4_journal_stop(handle);
   2936			return -ENOMEM;
   2937		}
   2938		path[0].p_maxdepth = path[0].p_depth = depth;
   2939		path[0].p_hdr = ext_inode_hdr(inode);
   2940		i = 0;
   2941
   2942		if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
   2943			err = -EFSCORRUPTED;
   2944			goto out;
   2945		}
   2946	}
   2947	err = 0;
   2948
   2949	while (i >= 0 && err == 0) {
   2950		if (i == depth) {
   2951			/* this is leaf block */
   2952			err = ext4_ext_rm_leaf(handle, inode, path,
   2953					       &partial, start, end);
   2954			/* root level has p_bh == NULL, brelse() eats this */
   2955			brelse(path[i].p_bh);
   2956			path[i].p_bh = NULL;
   2957			i--;
   2958			continue;
   2959		}
   2960
   2961		/* this is index block */
   2962		if (!path[i].p_hdr) {
   2963			ext_debug(inode, "initialize header\n");
   2964			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
   2965		}
   2966
   2967		if (!path[i].p_idx) {
   2968			/* this level hasn't been touched yet */
   2969			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
   2970			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
   2971			ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n",
   2972				  path[i].p_hdr,
   2973				  le16_to_cpu(path[i].p_hdr->eh_entries));
   2974		} else {
   2975			/* we were already here, see at next index */
   2976			path[i].p_idx--;
   2977		}
   2978
   2979		ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n",
   2980				i, EXT_FIRST_INDEX(path[i].p_hdr),
   2981				path[i].p_idx);
   2982		if (ext4_ext_more_to_rm(path + i)) {
   2983			struct buffer_head *bh;
   2984			/* go to the next level */
   2985			ext_debug(inode, "move to level %d (block %llu)\n",
   2986				  i + 1, ext4_idx_pblock(path[i].p_idx));
   2987			memset(path + i + 1, 0, sizeof(*path));
   2988			bh = read_extent_tree_block(inode, path[i].p_idx,
   2989						    depth - i - 1,
   2990						    EXT4_EX_NOCACHE);
   2991			if (IS_ERR(bh)) {
   2992				/* should we reset i_size? */
   2993				err = PTR_ERR(bh);
   2994				break;
   2995			}
   2996			/* Yield here to deal with large extent trees.
   2997			 * Should be a no-op if we did IO above. */
   2998			cond_resched();
   2999			if (WARN_ON(i + 1 > depth)) {
   3000				err = -EFSCORRUPTED;
   3001				break;
   3002			}
   3003			path[i + 1].p_bh = bh;
   3004
   3005			/* save actual number of indexes since this
   3006			 * number is changed at the next iteration */
   3007			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
   3008			i++;
   3009		} else {
   3010			/* we finished processing this index, go up */
   3011			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
   3012			/* index is empty, remove it;
   3013			 * the handle must already be prepared by the
   3014			 * leaf truncation above */
   3015				err = ext4_ext_rm_idx(handle, inode, path, i);
   3016			}
   3017			/* root level has p_bh == NULL, brelse() eats this */
   3018			brelse(path[i].p_bh);
   3019			path[i].p_bh = NULL;
   3020			i--;
   3021			ext_debug(inode, "return to level %d\n", i);
   3022		}
   3023	}
   3024
   3025	trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
   3026					 path->p_hdr->eh_entries);
   3027
   3028	/*
   3029	 * if there's a partial cluster and we have removed the first extent
   3030	 * in the file, then we also free the partial cluster, if any
   3031	 */
   3032	if (partial.state == tofree && err == 0) {
   3033		int flags = get_default_free_blocks_flags(inode);
   3034
   3035		if (ext4_is_pending(inode, partial.lblk))
   3036			flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
   3037		ext4_free_blocks(handle, inode, NULL,
   3038				 EXT4_C2B(sbi, partial.pclu),
   3039				 sbi->s_cluster_ratio, flags);
   3040		if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
   3041			ext4_rereserve_cluster(inode, partial.lblk);
   3042		partial.state = initial;
   3043	}
   3044
   3045	/* TODO: flexible tree reduction should be here */
   3046	if (path->p_hdr->eh_entries == 0) {
   3047		/*
   3048		 * truncate to zero freed all the tree,
   3049		 * so we need to correct eh_depth
   3050		 */
   3051		err = ext4_ext_get_access(handle, inode, path);
   3052		if (err == 0) {
   3053			ext_inode_hdr(inode)->eh_depth = 0;
   3054			ext_inode_hdr(inode)->eh_max =
   3055				cpu_to_le16(ext4_ext_space_root(inode, 0));
   3056			err = ext4_ext_dirty(handle, inode, path);
   3057		}
   3058	}
   3059out:
   3060	ext4_ext_drop_refs(path);
   3061	kfree(path);
   3062	path = NULL;
   3063	if (err == -EAGAIN)
   3064		goto again;
   3065	ext4_journal_stop(handle);
   3066
   3067	return err;
   3068}
   3069
   3070/*
   3071 * called at mount time
   3072 */
   3073void ext4_ext_init(struct super_block *sb)
   3074{
   3075	/*
   3076	 * possible initialization would be here
   3077	 */
   3078
   3079	if (ext4_has_feature_extents(sb)) {
   3080#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
   3081		printk(KERN_INFO "EXT4-fs: file extents enabled"
   3082#ifdef AGGRESSIVE_TEST
   3083		       ", aggressive tests"
   3084#endif
   3085#ifdef CHECK_BINSEARCH
   3086		       ", check binsearch"
   3087#endif
   3088#ifdef EXTENTS_STATS
   3089		       ", stats"
   3090#endif
   3091		       "\n");
   3092#endif
   3093#ifdef EXTENTS_STATS
   3094		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
   3095		EXT4_SB(sb)->s_ext_min = 1 << 30;
   3096		EXT4_SB(sb)->s_ext_max = 0;
   3097#endif
   3098	}
   3099}
   3100
   3101/*
   3102 * called at umount time
   3103 */
   3104void ext4_ext_release(struct super_block *sb)
   3105{
   3106	if (!ext4_has_feature_extents(sb))
   3107		return;
   3108
   3109#ifdef EXTENTS_STATS
   3110	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
   3111		struct ext4_sb_info *sbi = EXT4_SB(sb);
   3112		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
   3113			sbi->s_ext_blocks, sbi->s_ext_extents,
   3114			sbi->s_ext_blocks / sbi->s_ext_extents);
   3115		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
   3116			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
   3117	}
   3118#endif
   3119}
   3120
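       /*
        * After zeroing out the extent's blocks, record the range as written
        * in the extent status tree.
        */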
   3121static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
   3122{
   3123	ext4_lblk_t  ee_block;
   3124	ext4_fsblk_t ee_pblock;
   3125	unsigned int ee_len;
   3126
   3127	ee_block  = le32_to_cpu(ex->ee_block);
   3128	ee_len    = ext4_ext_get_actual_len(ex);
   3129	ee_pblock = ext4_ext_pblock(ex);
   3130
   3131	if (ee_len == 0)
   3132		return 0;
   3133
   3134	return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
   3135				     EXTENT_STATUS_WRITTEN);
   3136}
   3137
   3138/* FIXME!! we need to try to merge to left or right after zero-out  */
   3139static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
   3140{
   3141	ext4_fsblk_t ee_pblock;
   3142	unsigned int ee_len;
   3143
   3144	ee_len    = ext4_ext_get_actual_len(ex);
   3145	ee_pblock = ext4_ext_pblock(ex);
   3146	return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
   3147				  ee_len);
   3148}
   3149
   3150/*
   3151 * ext4_split_extent_at() splits an extent at given block.
   3152 *
   3153 * @handle: the journal handle
   3154 * @inode: the file inode
   3155 * @path: the path to the extent
   3156 * @split: the logical block where the extent is split.
   3157 * @split_flag: indicates if the extent could be zeroed out if the split fails, and
   3158 *		 the states (initialized or unwritten) of the new extents.
   3159 * @flags: flags used to insert new extent to extent tree.
   3160 *
   3161 *
   3162 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
   3163 * of which are determined by split_flag.
   3164 *
   3165 * There are two cases:
   3166 *  a> the extent is split into two extents.
   3167 *  b> no split is needed, and the extent is just marked.
   3168 *
   3169 * return 0 on success.
   3170 */
   3171static int ext4_split_extent_at(handle_t *handle,
   3172			     struct inode *inode,
   3173			     struct ext4_ext_path **ppath,
   3174			     ext4_lblk_t split,
   3175			     int split_flag,
   3176			     int flags)
   3177{
   3178	struct ext4_ext_path *path = *ppath;
   3179	ext4_fsblk_t newblock;
   3180	ext4_lblk_t ee_block;
   3181	struct ext4_extent *ex, newex, orig_ex, zero_ex;
   3182	struct ext4_extent *ex2 = NULL;
   3183	unsigned int ee_len, depth;
   3184	int err = 0;
   3185
   3186	BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
   3187	       (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
   3188
   3189	ext_debug(inode, "logical block %llu\n", (unsigned long long)split);
   3190
   3191	ext4_ext_show_leaf(inode, path);
   3192
   3193	depth = ext_depth(inode);
   3194	ex = path[depth].p_ext;
   3195	ee_block = le32_to_cpu(ex->ee_block);
   3196	ee_len = ext4_ext_get_actual_len(ex);
   3197	newblock = split - ee_block + ext4_ext_pblock(ex);
   3198
   3199	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
   3200	BUG_ON(!ext4_ext_is_unwritten(ex) &&
   3201	       split_flag & (EXT4_EXT_MAY_ZEROOUT |
   3202			     EXT4_EXT_MARK_UNWRIT1 |
   3203			     EXT4_EXT_MARK_UNWRIT2));
   3204
   3205	err = ext4_ext_get_access(handle, inode, path + depth);
   3206	if (err)
   3207		goto out;
   3208
   3209	if (split == ee_block) {
   3210		/*
   3211		 * case b: block @split is the block that the extent begins with,
   3212		 * so we just change the state of the extent, and splitting
   3213		 * is not needed.
   3214		 */
   3215		if (split_flag & EXT4_EXT_MARK_UNWRIT2)
   3216			ext4_ext_mark_unwritten(ex);
   3217		else
   3218			ext4_ext_mark_initialized(ex);
   3219
   3220		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
   3221			ext4_ext_try_to_merge(handle, inode, path, ex);
   3222
   3223		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
   3224		goto out;
   3225	}
   3226
   3227	/* case a */
   3228	memcpy(&orig_ex, ex, sizeof(orig_ex));
   3229	ex->ee_len = cpu_to_le16(split - ee_block);
   3230	if (split_flag & EXT4_EXT_MARK_UNWRIT1)
   3231		ext4_ext_mark_unwritten(ex);
   3232
   3233	/*
   3234	 * path may lead to a new leaf, not to the original leaf any more,
   3235	 * after ext4_ext_insert_extent() returns.
   3236	 */
   3237	err = ext4_ext_dirty(handle, inode, path + depth);
   3238	if (err)
   3239		goto fix_extent_len;
   3240
   3241	ex2 = &newex;
   3242	ex2->ee_block = cpu_to_le32(split);
   3243	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
   3244	ext4_ext_store_pblock(ex2, newblock);
   3245	if (split_flag & EXT4_EXT_MARK_UNWRIT2)
   3246		ext4_ext_mark_unwritten(ex2);
   3247
   3248	err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
   3249	if (err != -ENOSPC && err != -EDQUOT)
   3250		goto out;
   3251
   3252	if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
   3253		if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
   3254			if (split_flag & EXT4_EXT_DATA_VALID1) {
   3255				err = ext4_ext_zeroout(inode, ex2);
   3256				zero_ex.ee_block = ex2->ee_block;
   3257				zero_ex.ee_len = cpu_to_le16(
   3258						ext4_ext_get_actual_len(ex2));
   3259				ext4_ext_store_pblock(&zero_ex,
   3260						      ext4_ext_pblock(ex2));
   3261			} else {
   3262				err = ext4_ext_zeroout(inode, ex);
   3263				zero_ex.ee_block = ex->ee_block;
   3264				zero_ex.ee_len = cpu_to_le16(
   3265						ext4_ext_get_actual_len(ex));
   3266				ext4_ext_store_pblock(&zero_ex,
   3267						      ext4_ext_pblock(ex));
   3268			}
   3269		} else {
   3270			err = ext4_ext_zeroout(inode, &orig_ex);
   3271			zero_ex.ee_block = orig_ex.ee_block;
   3272			zero_ex.ee_len = cpu_to_le16(
   3273						ext4_ext_get_actual_len(&orig_ex));
   3274			ext4_ext_store_pblock(&zero_ex,
   3275					      ext4_ext_pblock(&orig_ex));
   3276		}
   3277
   3278		if (!err) {
   3279			/* update the extent length and mark as initialized */
   3280			ex->ee_len = cpu_to_le16(ee_len);
   3281			ext4_ext_try_to_merge(handle, inode, path, ex);
   3282			err = ext4_ext_dirty(handle, inode, path + path->p_depth);
   3283			if (!err)
   3284				/* update extent status tree */
   3285				err = ext4_zeroout_es(inode, &zero_ex);
   3286			/* If we failed at this point, we don't know exactly what
   3287			 * state the extent tree is in, so don't try to fix the
   3288			 * length of the original extent as it may do even more
   3289			 * damage.
   3290			 */
   3291			goto out;
   3292		}
   3293	}
   3294
   3295fix_extent_len:
   3296	ex->ee_len = orig_ex.ee_len;
   3297	/*
   3298	 * Ignore ext4_ext_dirty return value since we are already in error path
   3299	 * and err is a non-zero error code.
   3300	 */
   3301	ext4_ext_dirty(handle, inode, path + path->p_depth);
   3302	return err;
   3303out:
   3304	ext4_ext_show_leaf(inode, path);
   3305	return err;
   3306}
   3307
   3308/*
   3309 * ext4_split_extent() splits an extent and marks the extent which is covered
   3310 * by @map as split_flag indicates
   3311 *
   3312 * It may result in splitting the extent into multiple extents (up to three)
   3313 * There are three possibilities:
   3314 *   a> There is no split required
   3315 *   b> Splits in two extents: Split is happening at either end of the extent
   3316 *   c> Splits in three extents: Someone is splitting in the middle of the extent
   3317 *
   3318 */
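/*
 * Worked example (hypothetical block numbers, a sketch of the control flow
 * rather than of an exact on-disk layout): suppose the extent covers logical
 * blocks [100, 120) and @map requests [104, 110).
 *   - The map end (110) lies before the extent end (120), so the first call
 *     to ext4_split_extent_at() splits at 110, leaving [100, 110) and
 *     [110, 120).
 *   - The map start (104) lies after the extent start (100), so the second
 *     call splits the first piece at 104, leaving [100, 104), [104, 110) and
 *     [110, 120).
 * The middle piece is the one covered by @map (case c above); a request
 * touching either end of the extent needs only one split (case b).
 */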
   3319static int ext4_split_extent(handle_t *handle,
   3320			      struct inode *inode,
   3321			      struct ext4_ext_path **ppath,
   3322			      struct ext4_map_blocks *map,
   3323			      int split_flag,
   3324			      int flags)
   3325{
   3326	struct ext4_ext_path *path = *ppath;
   3327	ext4_lblk_t ee_block;
   3328	struct ext4_extent *ex;
   3329	unsigned int ee_len, depth;
   3330	int err = 0;
   3331	int unwritten;
   3332	int split_flag1, flags1;
   3333	int allocated = map->m_len;
   3334
   3335	depth = ext_depth(inode);
   3336	ex = path[depth].p_ext;
   3337	ee_block = le32_to_cpu(ex->ee_block);
   3338	ee_len = ext4_ext_get_actual_len(ex);
   3339	unwritten = ext4_ext_is_unwritten(ex);
   3340
   3341	if (map->m_lblk + map->m_len < ee_block + ee_len) {
   3342		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
   3343		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
   3344		if (unwritten)
   3345			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
   3346				       EXT4_EXT_MARK_UNWRIT2;
   3347		if (split_flag & EXT4_EXT_DATA_VALID2)
   3348			split_flag1 |= EXT4_EXT_DATA_VALID1;
   3349		err = ext4_split_extent_at(handle, inode, ppath,
   3350				map->m_lblk + map->m_len, split_flag1, flags1);
   3351		if (err)
   3352			goto out;
   3353	} else {
   3354		allocated = ee_len - (map->m_lblk - ee_block);
   3355	}
   3356	/*
    3357	 * Updating the path is required because the previous ext4_split_extent_at()
    3358	 * may result in a split of the original leaf or an extent zeroout.
   3359	 */
   3360	path = ext4_find_extent(inode, map->m_lblk, ppath, flags);
   3361	if (IS_ERR(path))
   3362		return PTR_ERR(path);
   3363	depth = ext_depth(inode);
   3364	ex = path[depth].p_ext;
   3365	if (!ex) {
   3366		EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
   3367				 (unsigned long) map->m_lblk);
   3368		return -EFSCORRUPTED;
   3369	}
   3370	unwritten = ext4_ext_is_unwritten(ex);
   3371
   3372	if (map->m_lblk >= ee_block) {
   3373		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
   3374		if (unwritten) {
   3375			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
   3376			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
   3377						     EXT4_EXT_MARK_UNWRIT2);
   3378		}
   3379		err = ext4_split_extent_at(handle, inode, ppath,
   3380				map->m_lblk, split_flag1, flags);
   3381		if (err)
   3382			goto out;
   3383	}
   3384
   3385	ext4_ext_show_leaf(inode, path);
   3386out:
   3387	return err ? err : allocated;
   3388}
   3389
   3390/*
   3391 * This function is called by ext4_ext_map_blocks() if someone tries to write
   3392 * to an unwritten extent. It may result in splitting the unwritten
   3393 * extent into multiple extents (up to three - one initialized and two
   3394 * unwritten).
   3395 * There are three possibilities:
   3396 *   a> There is no split required: Entire extent should be initialized
   3397 *   b> Splits in two extents: Write is happening at either end of the extent
    3398 *   c> Splits in three extents: Someone is writing in the middle of the extent
   3399 *
   3400 * Pre-conditions:
   3401 *  - The extent pointed to by 'path' is unwritten.
   3402 *  - The extent pointed to by 'path' contains a superset
   3403 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
   3404 *
   3405 * Post-conditions on success:
    3406 *  - the returned value is the number of blocks beyond map->m_lblk
   3407 *    that are allocated and initialized.
   3408 *    It is guaranteed to be >= map->m_len.
   3409 */
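/*
 * Illustrative scenario (hypothetical numbers): the unwritten extent 'ex'
 * covers logical blocks [200, 216) at physical blocks [5000, 5016) and the
 * write maps [200, 204).  If the extent to its left in the same leaf is
 * initialized, ends at logical block 200 / physical block 5000 and is short
 * enough (conditions C1-C4 below), the four newly written blocks are simply
 * transferred to it: 'ex' shrinks to [204, 216) at [5004, 5016), the left
 * neighbour grows by four blocks, and no insert or merge is needed.
 */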
   3410static int ext4_ext_convert_to_initialized(handle_t *handle,
   3411					   struct inode *inode,
   3412					   struct ext4_map_blocks *map,
   3413					   struct ext4_ext_path **ppath,
   3414					   int flags)
   3415{
   3416	struct ext4_ext_path *path = *ppath;
   3417	struct ext4_sb_info *sbi;
   3418	struct ext4_extent_header *eh;
   3419	struct ext4_map_blocks split_map;
   3420	struct ext4_extent zero_ex1, zero_ex2;
   3421	struct ext4_extent *ex, *abut_ex;
   3422	ext4_lblk_t ee_block, eof_block;
   3423	unsigned int ee_len, depth, map_len = map->m_len;
   3424	int allocated = 0, max_zeroout = 0;
   3425	int err = 0;
   3426	int split_flag = EXT4_EXT_DATA_VALID2;
   3427
   3428	ext_debug(inode, "logical block %llu, max_blocks %u\n",
   3429		  (unsigned long long)map->m_lblk, map_len);
   3430
   3431	sbi = EXT4_SB(inode->i_sb);
   3432	eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
   3433			>> inode->i_sb->s_blocksize_bits;
   3434	if (eof_block < map->m_lblk + map_len)
   3435		eof_block = map->m_lblk + map_len;
   3436
   3437	depth = ext_depth(inode);
   3438	eh = path[depth].p_hdr;
   3439	ex = path[depth].p_ext;
   3440	ee_block = le32_to_cpu(ex->ee_block);
   3441	ee_len = ext4_ext_get_actual_len(ex);
   3442	zero_ex1.ee_len = 0;
   3443	zero_ex2.ee_len = 0;
   3444
   3445	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
   3446
   3447	/* Pre-conditions */
   3448	BUG_ON(!ext4_ext_is_unwritten(ex));
   3449	BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
   3450
   3451	/*
   3452	 * Attempt to transfer newly initialized blocks from the currently
   3453	 * unwritten extent to its neighbor. This is much cheaper
   3454	 * than an insertion followed by a merge as those involve costly
   3455	 * memmove() calls. Transferring to the left is the common case in
   3456	 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
   3457	 * followed by append writes.
   3458	 *
   3459	 * Limitations of the current logic:
   3460	 *  - L1: we do not deal with writes covering the whole extent.
   3461	 *    This would require removing the extent if the transfer
   3462	 *    is possible.
   3463	 *  - L2: we only attempt to merge with an extent stored in the
   3464	 *    same extent tree node.
   3465	 */
   3466	if ((map->m_lblk == ee_block) &&
   3467		/* See if we can merge left */
   3468		(map_len < ee_len) &&		/*L1*/
   3469		(ex > EXT_FIRST_EXTENT(eh))) {	/*L2*/
   3470		ext4_lblk_t prev_lblk;
   3471		ext4_fsblk_t prev_pblk, ee_pblk;
   3472		unsigned int prev_len;
   3473
   3474		abut_ex = ex - 1;
   3475		prev_lblk = le32_to_cpu(abut_ex->ee_block);
   3476		prev_len = ext4_ext_get_actual_len(abut_ex);
   3477		prev_pblk = ext4_ext_pblock(abut_ex);
   3478		ee_pblk = ext4_ext_pblock(ex);
   3479
   3480		/*
   3481		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
   3482		 * upon those conditions:
   3483		 * - C1: abut_ex is initialized,
   3484		 * - C2: abut_ex is logically abutting ex,
   3485		 * - C3: abut_ex is physically abutting ex,
   3486		 * - C4: abut_ex can receive the additional blocks without
   3487		 *   overflowing the (initialized) length limit.
   3488		 */
   3489		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/
   3490			((prev_lblk + prev_len) == ee_block) &&		/*C2*/
   3491			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
   3492			(prev_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
   3493			err = ext4_ext_get_access(handle, inode, path + depth);
   3494			if (err)
   3495				goto out;
   3496
   3497			trace_ext4_ext_convert_to_initialized_fastpath(inode,
   3498				map, ex, abut_ex);
   3499
   3500			/* Shift the start of ex by 'map_len' blocks */
   3501			ex->ee_block = cpu_to_le32(ee_block + map_len);
   3502			ext4_ext_store_pblock(ex, ee_pblk + map_len);
   3503			ex->ee_len = cpu_to_le16(ee_len - map_len);
   3504			ext4_ext_mark_unwritten(ex); /* Restore the flag */
   3505
   3506			/* Extend abut_ex by 'map_len' blocks */
   3507			abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
   3508
   3509			/* Result: number of initialized blocks past m_lblk */
   3510			allocated = map_len;
   3511		}
   3512	} else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
   3513		   (map_len < ee_len) &&	/*L1*/
   3514		   ex < EXT_LAST_EXTENT(eh)) {	/*L2*/
   3515		/* See if we can merge right */
   3516		ext4_lblk_t next_lblk;
   3517		ext4_fsblk_t next_pblk, ee_pblk;
   3518		unsigned int next_len;
   3519
   3520		abut_ex = ex + 1;
   3521		next_lblk = le32_to_cpu(abut_ex->ee_block);
   3522		next_len = ext4_ext_get_actual_len(abut_ex);
   3523		next_pblk = ext4_ext_pblock(abut_ex);
   3524		ee_pblk = ext4_ext_pblock(ex);
   3525
   3526		/*
   3527		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
   3528		 * upon those conditions:
   3529		 * - C1: abut_ex is initialized,
   3530		 * - C2: abut_ex is logically abutting ex,
   3531		 * - C3: abut_ex is physically abutting ex,
   3532		 * - C4: abut_ex can receive the additional blocks without
   3533		 *   overflowing the (initialized) length limit.
   3534		 */
   3535		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/
   3536		    ((map->m_lblk + map_len) == next_lblk) &&		/*C2*/
   3537		    ((ee_pblk + ee_len) == next_pblk) &&		/*C3*/
   3538		    (next_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
   3539			err = ext4_ext_get_access(handle, inode, path + depth);
   3540			if (err)
   3541				goto out;
   3542
   3543			trace_ext4_ext_convert_to_initialized_fastpath(inode,
   3544				map, ex, abut_ex);
   3545
   3546			/* Shift the start of abut_ex by 'map_len' blocks */
   3547			abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
   3548			ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
   3549			ex->ee_len = cpu_to_le16(ee_len - map_len);
   3550			ext4_ext_mark_unwritten(ex); /* Restore the flag */
   3551
   3552			/* Extend abut_ex by 'map_len' blocks */
   3553			abut_ex->ee_len = cpu_to_le16(next_len + map_len);
   3554
   3555			/* Result: number of initialized blocks past m_lblk */
   3556			allocated = map_len;
   3557		}
   3558	}
   3559	if (allocated) {
   3560		/* Mark the block containing both extents as dirty */
   3561		err = ext4_ext_dirty(handle, inode, path + depth);
   3562
   3563		/* Update path to point to the right extent */
   3564		path[depth].p_ext = abut_ex;
   3565		goto out;
   3566	} else
   3567		allocated = ee_len - (map->m_lblk - ee_block);
   3568
   3569	WARN_ON(map->m_lblk < ee_block);
   3570	/*
   3571	 * It is safe to convert extent to initialized via explicit
   3572	 * zeroout only if extent is fully inside i_size or new_size.
   3573	 */
   3574	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
   3575
   3576	if (EXT4_EXT_MAY_ZEROOUT & split_flag)
   3577		max_zeroout = sbi->s_extent_max_zeroout_kb >>
   3578			(inode->i_sb->s_blocksize_bits - 10);
   3579
   3580	/*
   3581	 * five cases:
   3582	 * 1. split the extent into three extents.
   3583	 * 2. split the extent into two extents, zeroout the head of the first
   3584	 *    extent.
   3585	 * 3. split the extent into two extents, zeroout the tail of the second
   3586	 *    extent.
    3587	 * 4. split the extent into two extents without zeroout.
   3588	 * 5. no splitting needed, just possibly zeroout the head and / or the
   3589	 *    tail of the extent.
   3590	 */
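	/*
	 * Example of the max_zeroout limit computed above (hypothetical
	 * values): with s_extent_max_zeroout_kb = 32 and 4 KiB blocks
	 * (s_blocksize_bits = 12), max_zeroout = 32 >> (12 - 10) = 8, so the
	 * zeroout shortcuts in cases 2, 3 and 5 are only taken when the
	 * region involved stays within 8 blocks; anything larger falls back
	 * to splitting the extent.
	 */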
   3591	split_map.m_lblk = map->m_lblk;
   3592	split_map.m_len = map->m_len;
   3593
   3594	if (max_zeroout && (allocated > split_map.m_len)) {
   3595		if (allocated <= max_zeroout) {
   3596			/* case 3 or 5 */
   3597			zero_ex1.ee_block =
   3598				 cpu_to_le32(split_map.m_lblk +
   3599					     split_map.m_len);
   3600			zero_ex1.ee_len =
   3601				cpu_to_le16(allocated - split_map.m_len);
   3602			ext4_ext_store_pblock(&zero_ex1,
   3603				ext4_ext_pblock(ex) + split_map.m_lblk +
   3604				split_map.m_len - ee_block);
   3605			err = ext4_ext_zeroout(inode, &zero_ex1);
   3606			if (err)
   3607				goto fallback;
   3608			split_map.m_len = allocated;
   3609		}
   3610		if (split_map.m_lblk - ee_block + split_map.m_len <
   3611								max_zeroout) {
   3612			/* case 2 or 5 */
   3613			if (split_map.m_lblk != ee_block) {
   3614				zero_ex2.ee_block = ex->ee_block;
   3615				zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
   3616							ee_block);
   3617				ext4_ext_store_pblock(&zero_ex2,
   3618						      ext4_ext_pblock(ex));
   3619				err = ext4_ext_zeroout(inode, &zero_ex2);
   3620				if (err)
   3621					goto fallback;
   3622			}
   3623
   3624			split_map.m_len += split_map.m_lblk - ee_block;
   3625			split_map.m_lblk = ee_block;
   3626			allocated = map->m_len;
   3627		}
   3628	}
   3629
   3630fallback:
   3631	err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
   3632				flags);
   3633	if (err > 0)
   3634		err = 0;
   3635out:
   3636	/* If we have gotten a failure, don't zero out status tree */
   3637	if (!err) {
   3638		err = ext4_zeroout_es(inode, &zero_ex1);
   3639		if (!err)
   3640			err = ext4_zeroout_es(inode, &zero_ex2);
   3641	}
   3642	return err ? err : allocated;
   3643}
   3644
   3645/*
    3646 * This function is called by ext4_ext_map_blocks() from
    3647 * ext4_get_blocks_dio_write() when DIO needs to write
    3648 * to an unwritten extent.
    3649 *
    3650 * Writing to an unwritten extent may result in splitting the unwritten
    3651 * extent into multiple initialized/unwritten extents (up to three).
    3652 * There are three possibilities:
    3653 *   a> There is no split required: Entire extent should be unwritten
    3654 *   b> Splits in two extents: Write is happening at either end of the extent
    3655 *   c> Splits in three extents: Someone is writing in the middle of the extent
    3656 *
    3657 * This works the same way in the case of initialized -> unwritten conversion.
    3658 *
    3659 * One or more index blocks may be needed if the extent tree grows after
    3660 * the unwritten extent is split. To prevent ENOSPC from occurring at IO
    3661 * completion time, we need to split the unwritten extent before the DIO
    3662 * is submitted. The unwritten extent handled at this time will be split
    3663 * into at most three unwritten extents. After IO completes, the part
    3664 * that was filled will be converted to initialized by the end_io callback
    3665 * via ext4_convert_unwritten_extents().
    3666 *
    3667 * Returns the size of the unwritten extent to be written on success.
   3668 */
   3669static int ext4_split_convert_extents(handle_t *handle,
   3670					struct inode *inode,
   3671					struct ext4_map_blocks *map,
   3672					struct ext4_ext_path **ppath,
   3673					int flags)
   3674{
   3675	struct ext4_ext_path *path = *ppath;
   3676	ext4_lblk_t eof_block;
   3677	ext4_lblk_t ee_block;
   3678	struct ext4_extent *ex;
   3679	unsigned int ee_len;
   3680	int split_flag = 0, depth;
   3681
   3682	ext_debug(inode, "logical block %llu, max_blocks %u\n",
   3683		  (unsigned long long)map->m_lblk, map->m_len);
   3684
   3685	eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
   3686			>> inode->i_sb->s_blocksize_bits;
   3687	if (eof_block < map->m_lblk + map->m_len)
   3688		eof_block = map->m_lblk + map->m_len;
   3689	/*
   3690	 * It is safe to convert extent to initialized via explicit
   3691	 * zeroout only if extent is fully inside i_size or new_size.
   3692	 */
   3693	depth = ext_depth(inode);
   3694	ex = path[depth].p_ext;
   3695	ee_block = le32_to_cpu(ex->ee_block);
   3696	ee_len = ext4_ext_get_actual_len(ex);
   3697
   3698	/* Convert to unwritten */
   3699	if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
   3700		split_flag |= EXT4_EXT_DATA_VALID1;
   3701	/* Convert to initialized */
   3702	} else if (flags & EXT4_GET_BLOCKS_CONVERT) {
   3703		split_flag |= ee_block + ee_len <= eof_block ?
   3704			      EXT4_EXT_MAY_ZEROOUT : 0;
   3705		split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
   3706	}
   3707	flags |= EXT4_GET_BLOCKS_PRE_IO;
   3708	return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
   3709}
   3710
   3711static int ext4_convert_unwritten_extents_endio(handle_t *handle,
   3712						struct inode *inode,
   3713						struct ext4_map_blocks *map,
   3714						struct ext4_ext_path **ppath)
   3715{
   3716	struct ext4_ext_path *path = *ppath;
   3717	struct ext4_extent *ex;
   3718	ext4_lblk_t ee_block;
   3719	unsigned int ee_len;
   3720	int depth;
   3721	int err = 0;
   3722
   3723	depth = ext_depth(inode);
   3724	ex = path[depth].p_ext;
   3725	ee_block = le32_to_cpu(ex->ee_block);
   3726	ee_len = ext4_ext_get_actual_len(ex);
   3727
   3728	ext_debug(inode, "logical block %llu, max_blocks %u\n",
   3729		  (unsigned long long)ee_block, ee_len);
   3730
    3731	/* If the extent is larger than requested, it is a clear sign that we still
    3732	 * have some extent state machine issues left. So an extent split is still
    3733	 * required.
    3734	 * TODO: Once all related issues are fixed, this situation should be
    3735	 * illegal.
   3736	 */
   3737	if (ee_block != map->m_lblk || ee_len > map->m_len) {
   3738#ifdef CONFIG_EXT4_DEBUG
   3739		ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
   3740			     " len %u; IO logical block %llu, len %u",
   3741			     inode->i_ino, (unsigned long long)ee_block, ee_len,
   3742			     (unsigned long long)map->m_lblk, map->m_len);
   3743#endif
   3744		err = ext4_split_convert_extents(handle, inode, map, ppath,
   3745						 EXT4_GET_BLOCKS_CONVERT);
   3746		if (err < 0)
   3747			return err;
   3748		path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
   3749		if (IS_ERR(path))
   3750			return PTR_ERR(path);
   3751		depth = ext_depth(inode);
   3752		ex = path[depth].p_ext;
   3753	}
   3754
   3755	err = ext4_ext_get_access(handle, inode, path + depth);
   3756	if (err)
   3757		goto out;
   3758	/* first mark the extent as initialized */
   3759	ext4_ext_mark_initialized(ex);
   3760
   3761	/* note: ext4_ext_correct_indexes() isn't needed here because
   3762	 * borders are not changed
   3763	 */
   3764	ext4_ext_try_to_merge(handle, inode, path, ex);
   3765
   3766	/* Mark modified extent as dirty */
   3767	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
   3768out:
   3769	ext4_ext_show_leaf(inode, path);
   3770	return err;
   3771}
   3772
   3773static int
   3774convert_initialized_extent(handle_t *handle, struct inode *inode,
   3775			   struct ext4_map_blocks *map,
   3776			   struct ext4_ext_path **ppath,
   3777			   unsigned int *allocated)
   3778{
   3779	struct ext4_ext_path *path = *ppath;
   3780	struct ext4_extent *ex;
   3781	ext4_lblk_t ee_block;
   3782	unsigned int ee_len;
   3783	int depth;
   3784	int err = 0;
   3785
   3786	/*
    3787	 * Make sure that the extent is no bigger than we support with
    3788	 * unwritten extents.
   3789	 */
   3790	if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
   3791		map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
   3792
   3793	depth = ext_depth(inode);
   3794	ex = path[depth].p_ext;
   3795	ee_block = le32_to_cpu(ex->ee_block);
   3796	ee_len = ext4_ext_get_actual_len(ex);
   3797
   3798	ext_debug(inode, "logical block %llu, max_blocks %u\n",
   3799		  (unsigned long long)ee_block, ee_len);
   3800
   3801	if (ee_block != map->m_lblk || ee_len > map->m_len) {
   3802		err = ext4_split_convert_extents(handle, inode, map, ppath,
   3803				EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
   3804		if (err < 0)
   3805			return err;
   3806		path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
   3807		if (IS_ERR(path))
   3808			return PTR_ERR(path);
   3809		depth = ext_depth(inode);
   3810		ex = path[depth].p_ext;
   3811		if (!ex) {
   3812			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
   3813					 (unsigned long) map->m_lblk);
   3814			return -EFSCORRUPTED;
   3815		}
   3816	}
   3817
   3818	err = ext4_ext_get_access(handle, inode, path + depth);
   3819	if (err)
   3820		return err;
   3821	/* first mark the extent as unwritten */
   3822	ext4_ext_mark_unwritten(ex);
   3823
   3824	/* note: ext4_ext_correct_indexes() isn't needed here because
   3825	 * borders are not changed
   3826	 */
   3827	ext4_ext_try_to_merge(handle, inode, path, ex);
   3828
   3829	/* Mark modified extent as dirty */
   3830	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
   3831	if (err)
   3832		return err;
   3833	ext4_ext_show_leaf(inode, path);
   3834
   3835	ext4_update_inode_fsync_trans(handle, inode, 1);
   3836
   3837	map->m_flags |= EXT4_MAP_UNWRITTEN;
   3838	if (*allocated > map->m_len)
   3839		*allocated = map->m_len;
   3840	map->m_len = *allocated;
   3841	return 0;
   3842}
   3843
   3844static int
   3845ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
   3846			struct ext4_map_blocks *map,
   3847			struct ext4_ext_path **ppath, int flags,
   3848			unsigned int allocated, ext4_fsblk_t newblock)
   3849{
   3850	struct ext4_ext_path __maybe_unused *path = *ppath;
   3851	int ret = 0;
   3852	int err = 0;
   3853
   3854	ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n",
   3855		  (unsigned long long)map->m_lblk, map->m_len, flags,
   3856		  allocated);
   3857	ext4_ext_show_leaf(inode, path);
   3858
   3859	/*
   3860	 * When writing into unwritten space, we should not fail to
   3861	 * allocate metadata blocks for the new extent block if needed.
   3862	 */
   3863	flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
   3864
   3865	trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
   3866						    allocated, newblock);
   3867
   3868	/* get_block() before submitting IO, split the extent */
   3869	if (flags & EXT4_GET_BLOCKS_PRE_IO) {
   3870		ret = ext4_split_convert_extents(handle, inode, map, ppath,
   3871					 flags | EXT4_GET_BLOCKS_CONVERT);
   3872		if (ret < 0) {
   3873			err = ret;
   3874			goto out2;
   3875		}
   3876		/*
   3877		 * shouldn't get a 0 return when splitting an extent unless
   3878		 * m_len is 0 (bug) or extent has been corrupted
   3879		 */
   3880		if (unlikely(ret == 0)) {
   3881			EXT4_ERROR_INODE(inode,
   3882					 "unexpected ret == 0, m_len = %u",
   3883					 map->m_len);
   3884			err = -EFSCORRUPTED;
   3885			goto out2;
   3886		}
   3887		map->m_flags |= EXT4_MAP_UNWRITTEN;
   3888		goto out;
   3889	}
   3890	/* IO end_io complete, convert the filled extent to written */
   3891	if (flags & EXT4_GET_BLOCKS_CONVERT) {
   3892		err = ext4_convert_unwritten_extents_endio(handle, inode, map,
   3893							   ppath);
   3894		if (err < 0)
   3895			goto out2;
   3896		ext4_update_inode_fsync_trans(handle, inode, 1);
   3897		goto map_out;
   3898	}
   3899	/* buffered IO cases */
   3900	/*
   3901	 * repeat fallocate creation request
   3902	 * we already have an unwritten extent
   3903	 */
   3904	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
   3905		map->m_flags |= EXT4_MAP_UNWRITTEN;
   3906		goto map_out;
   3907	}
   3908
   3909	/* buffered READ or buffered write_begin() lookup */
   3910	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
   3911		/*
   3912		 * We have blocks reserved already.  We
   3913		 * return allocated blocks so that delalloc
   3914		 * won't do block reservation for us.  But
   3915		 * the buffer head will be unmapped so that
   3916		 * a read from the block returns 0s.
   3917		 */
   3918		map->m_flags |= EXT4_MAP_UNWRITTEN;
   3919		goto out1;
   3920	}
   3921
   3922	/*
   3923	 * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1.
   3924	 * For buffered writes, at writepage time, etc.  Convert a
   3925	 * discovered unwritten extent to written.
   3926	 */
   3927	ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
   3928	if (ret < 0) {
   3929		err = ret;
   3930		goto out2;
   3931	}
   3932	ext4_update_inode_fsync_trans(handle, inode, 1);
   3933	/*
   3934	 * shouldn't get a 0 return when converting an unwritten extent
   3935	 * unless m_len is 0 (bug) or extent has been corrupted
   3936	 */
   3937	if (unlikely(ret == 0)) {
   3938		EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u",
   3939				 map->m_len);
   3940		err = -EFSCORRUPTED;
   3941		goto out2;
   3942	}
   3943
   3944out:
   3945	allocated = ret;
   3946	map->m_flags |= EXT4_MAP_NEW;
   3947map_out:
   3948	map->m_flags |= EXT4_MAP_MAPPED;
   3949out1:
   3950	map->m_pblk = newblock;
   3951	if (allocated > map->m_len)
   3952		allocated = map->m_len;
   3953	map->m_len = allocated;
   3954	ext4_ext_show_leaf(inode, path);
   3955out2:
   3956	return err ? err : allocated;
   3957}
   3958
   3959/*
   3960 * get_implied_cluster_alloc - check to see if the requested
   3961 * allocation (in the map structure) overlaps with a cluster already
   3962 * allocated in an extent.
   3963 *	@sb	The filesystem superblock structure
   3964 *	@map	The requested lblk->pblk mapping
   3965 *	@ex	The extent structure which might contain an implied
   3966 *			cluster allocation
   3967 *
   3968 * This function is called by ext4_ext_map_blocks() after we failed to
   3969 * find blocks that were already in the inode's extent tree.  Hence,
   3970 * we know that the beginning of the requested region cannot overlap
   3971 * the extent from the inode's extent tree.  There are three cases we
   3972 * want to catch.  The first is this case:
   3973 *
   3974 *		 |--- cluster # N--|
   3975 *    |--- extent ---|	|---- requested region ---|
   3976 *			|==========|
   3977 *
   3978 * The second case that we need to test for is this one:
   3979 *
   3980 *   |--------- cluster # N ----------------|
   3981 *	   |--- requested region --|   |------- extent ----|
   3982 *	   |=======================|
   3983 *
   3984 * The third case is when the requested region lies between two extents
   3985 * within the same cluster:
   3986 *          |------------- cluster # N-------------|
   3987 * |----- ex -----|                  |---- ex_right ----|
   3988 *                  |------ requested region ------|
   3989 *                  |================|
   3990 *
    3991 * In each of the above cases, we need to set map->m_pblk and
    3992 * map->m_len so that they correspond to the extent labelled as
   3993 * "|====|" from cluster #N, since it is already in use for data in
   3994 * cluster EXT4_B2C(sbi, map->m_lblk).	We will then return 1 to
   3995 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
   3996 * as a new "allocated" block region.  Otherwise, we will return 0 and
   3997 * ext4_ext_map_blocks() will then allocate one or more new clusters
   3998 * by calling ext4_mb_new_blocks().
   3999 */
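/*
 * Worked example (hypothetical bigalloc geometry, 16 blocks per cluster):
 * an extent covering logical blocks [100, 116) spans clusters 6 and 7
 * (100 >> 4 = 6, 115 >> 4 = 7).  A request starting at logical block 120
 * falls in cluster 7 (120 >> 4 = 7) at offset 8 within the cluster, so the
 * first case above applies: m_pblk is set to offset 8 within the physical
 * cluster that already backs the extent's last block, m_len is clamped to
 * the 8 blocks left in the cluster (assuming no closer extent to the right),
 * and 1 is returned so the caller treats the region as already allocated.
 */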
   4000static int get_implied_cluster_alloc(struct super_block *sb,
   4001				     struct ext4_map_blocks *map,
   4002				     struct ext4_extent *ex,
   4003				     struct ext4_ext_path *path)
   4004{
   4005	struct ext4_sb_info *sbi = EXT4_SB(sb);
   4006	ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
   4007	ext4_lblk_t ex_cluster_start, ex_cluster_end;
   4008	ext4_lblk_t rr_cluster_start;
   4009	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
   4010	ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
   4011	unsigned short ee_len = ext4_ext_get_actual_len(ex);
   4012
   4013	/* The extent passed in that we are trying to match */
   4014	ex_cluster_start = EXT4_B2C(sbi, ee_block);
   4015	ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
   4016
   4017	/* The requested region passed into ext4_map_blocks() */
   4018	rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
   4019
   4020	if ((rr_cluster_start == ex_cluster_end) ||
   4021	    (rr_cluster_start == ex_cluster_start)) {
   4022		if (rr_cluster_start == ex_cluster_end)
   4023			ee_start += ee_len - 1;
   4024		map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
   4025		map->m_len = min(map->m_len,
   4026				 (unsigned) sbi->s_cluster_ratio - c_offset);
   4027		/*
   4028		 * Check for and handle this case:
   4029		 *
   4030		 *   |--------- cluster # N-------------|
   4031		 *		       |------- extent ----|
   4032		 *	   |--- requested region ---|
   4033		 *	   |===========|
   4034		 */
   4035
   4036		if (map->m_lblk < ee_block)
   4037			map->m_len = min(map->m_len, ee_block - map->m_lblk);
   4038
   4039		/*
   4040		 * Check for the case where there is already another allocated
   4041		 * block to the right of 'ex' but before the end of the cluster.
   4042		 *
   4043		 *          |------------- cluster # N-------------|
   4044		 * |----- ex -----|                  |---- ex_right ----|
   4045		 *                  |------ requested region ------|
   4046		 *                  |================|
   4047		 */
   4048		if (map->m_lblk > ee_block) {
   4049			ext4_lblk_t next = ext4_ext_next_allocated_block(path);
   4050			map->m_len = min(map->m_len, next - map->m_lblk);
   4051		}
   4052
   4053		trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
   4054		return 1;
   4055	}
   4056
   4057	trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
   4058	return 0;
   4059}
   4060
   4061
   4062/*
   4063 * Block allocation/map/preallocation routine for extents based files
   4064 *
   4065 *
    4066 * Needs to be called with
    4067 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system blocks
    4068 * (i.e., create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem).
   4069 *
   4070 * return > 0, number of blocks already mapped/allocated
   4071 *          if create == 0 and these are pre-allocated blocks
   4072 *          	buffer head is unmapped
   4073 *          otherwise blocks are mapped
   4074 *
    4075 * return = 0, if plain lookup failed (blocks have not been allocated)
   4076 *          buffer head is unmapped
   4077 *
   4078 * return < 0, error case.
   4079 */
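/*
 * Example (hypothetical numbers): if the tree already holds an initialized
 * extent covering logical blocks [196, 212) at physical block 8000, a lookup
 * for m_lblk = 200, m_len = 8 finds the extent, sets m_pblk = 8004 and
 * m_len = 8, and returns 8.  A lookup that lands in a hole with create == 0
 * instead returns 0 after recording the hole in the extent status cache.
 */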
   4080int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
   4081			struct ext4_map_blocks *map, int flags)
   4082{
   4083	struct ext4_ext_path *path = NULL;
   4084	struct ext4_extent newex, *ex, ex2;
   4085	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   4086	ext4_fsblk_t newblock = 0, pblk;
   4087	int err = 0, depth, ret;
   4088	unsigned int allocated = 0, offset = 0;
   4089	unsigned int allocated_clusters = 0;
   4090	struct ext4_allocation_request ar;
   4091	ext4_lblk_t cluster_offset;
   4092
   4093	ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len);
   4094	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
   4095
   4096	/* find extent for this block */
   4097	path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
   4098	if (IS_ERR(path)) {
   4099		err = PTR_ERR(path);
   4100		path = NULL;
   4101		goto out;
   4102	}
   4103
   4104	depth = ext_depth(inode);
   4105
   4106	/*
   4107	 * consistent leaf must not be empty;
   4108	 * this situation is possible, though, _during_ tree modification;
   4109	 * this is why assert can't be put in ext4_find_extent()
   4110	 */
   4111	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
   4112		EXT4_ERROR_INODE(inode, "bad extent address "
   4113				 "lblock: %lu, depth: %d pblock %lld",
   4114				 (unsigned long) map->m_lblk, depth,
   4115				 path[depth].p_block);
   4116		err = -EFSCORRUPTED;
   4117		goto out;
   4118	}
   4119
   4120	ex = path[depth].p_ext;
   4121	if (ex) {
   4122		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
   4123		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
   4124		unsigned short ee_len;
   4125
   4126
   4127		/*
   4128		 * unwritten extents are treated as holes, except that
   4129		 * we split out initialized portions during a write.
   4130		 */
   4131		ee_len = ext4_ext_get_actual_len(ex);
   4132
   4133		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
   4134
   4135		/* if found extent covers block, simply return it */
   4136		if (in_range(map->m_lblk, ee_block, ee_len)) {
   4137			newblock = map->m_lblk - ee_block + ee_start;
   4138			/* number of remaining blocks in the extent */
   4139			allocated = ee_len - (map->m_lblk - ee_block);
   4140			ext_debug(inode, "%u fit into %u:%d -> %llu\n",
   4141				  map->m_lblk, ee_block, ee_len, newblock);
   4142
   4143			/*
   4144			 * If the extent is initialized check whether the
   4145			 * caller wants to convert it to unwritten.
   4146			 */
   4147			if ((!ext4_ext_is_unwritten(ex)) &&
   4148			    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
   4149				err = convert_initialized_extent(handle,
   4150					inode, map, &path, &allocated);
   4151				goto out;
   4152			} else if (!ext4_ext_is_unwritten(ex)) {
   4153				map->m_flags |= EXT4_MAP_MAPPED;
   4154				map->m_pblk = newblock;
   4155				if (allocated > map->m_len)
   4156					allocated = map->m_len;
   4157				map->m_len = allocated;
   4158				ext4_ext_show_leaf(inode, path);
   4159				goto out;
   4160			}
   4161
   4162			ret = ext4_ext_handle_unwritten_extents(
   4163				handle, inode, map, &path, flags,
   4164				allocated, newblock);
   4165			if (ret < 0)
   4166				err = ret;
   4167			else
   4168				allocated = ret;
   4169			goto out;
   4170		}
   4171	}
   4172
   4173	/*
   4174	 * requested block isn't allocated yet;
    4175	 * we can't allocate blocks if the create flag is zero
   4176	 */
   4177	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
   4178		ext4_lblk_t hole_start, hole_len;
   4179
   4180		hole_start = map->m_lblk;
   4181		hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
   4182		/*
    4183		 * put the just-found gap into the cache to speed up
    4184		 * subsequent requests
   4185		 */
   4186		ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
   4187
   4188		/* Update hole_len to reflect hole size after map->m_lblk */
   4189		if (hole_start != map->m_lblk)
   4190			hole_len -= map->m_lblk - hole_start;
   4191		map->m_pblk = 0;
   4192		map->m_len = min_t(unsigned int, map->m_len, hole_len);
   4193
   4194		goto out;
   4195	}
   4196
   4197	/*
   4198	 * Okay, we need to do block allocation.
   4199	 */
   4200	newex.ee_block = cpu_to_le32(map->m_lblk);
   4201	cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
   4202
   4203	/*
   4204	 * If we are doing bigalloc, check to see if the extent returned
   4205	 * by ext4_find_extent() implies a cluster we can use.
   4206	 */
   4207	if (cluster_offset && ex &&
   4208	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
   4209		ar.len = allocated = map->m_len;
   4210		newblock = map->m_pblk;
   4211		goto got_allocated_blocks;
   4212	}
   4213
   4214	/* find neighbour allocated blocks */
   4215	ar.lleft = map->m_lblk;
   4216	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
   4217	if (err)
   4218		goto out;
   4219	ar.lright = map->m_lblk;
   4220	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
   4221	if (err < 0)
   4222		goto out;
   4223
   4224	/* Check if the extent after searching to the right implies a
   4225	 * cluster we can use. */
   4226	if ((sbi->s_cluster_ratio > 1) && err &&
   4227	    get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) {
   4228		ar.len = allocated = map->m_len;
   4229		newblock = map->m_pblk;
   4230		goto got_allocated_blocks;
   4231	}
   4232
   4233	/*
   4234	 * See if request is beyond maximum number of blocks we can have in
   4235	 * a single extent. For an initialized extent this limit is
   4236	 * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
   4237	 * EXT_UNWRITTEN_MAX_LEN.
   4238	 */
   4239	if (map->m_len > EXT_INIT_MAX_LEN &&
   4240	    !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
   4241		map->m_len = EXT_INIT_MAX_LEN;
   4242	else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
   4243		 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
   4244		map->m_len = EXT_UNWRITTEN_MAX_LEN;
   4245
   4246	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
   4247	newex.ee_len = cpu_to_le16(map->m_len);
   4248	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
   4249	if (err)
   4250		allocated = ext4_ext_get_actual_len(&newex);
   4251	else
   4252		allocated = map->m_len;
   4253
   4254	/* allocate new block */
   4255	ar.inode = inode;
   4256	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
   4257	ar.logical = map->m_lblk;
   4258	/*
   4259	 * We calculate the offset from the beginning of the cluster
   4260	 * for the logical block number, since when we allocate a
   4261	 * physical cluster, the physical block should start at the
   4262	 * same offset from the beginning of the cluster.  This is
   4263	 * needed so that future calls to get_implied_cluster_alloc()
   4264	 * work correctly.
   4265	 */
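	/*
	 * Example (hypothetical bigalloc geometry, 16 blocks per cluster,
	 * assuming the full request survived the overlap check): for
	 * m_lblk = 120 and 8 blocks to allocate, offset = 120 & 15 = 8, so
	 * ar.len requests ceil((8 + 8) / 16) = 1 cluster and ar.goal /
	 * ar.logical are pulled back by 8 blocks to the cluster boundary;
	 * after allocation ar.len is converted back to 16 - 8 = 8 blocks and
	 * the new extent starts at newblock + 8.
	 */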
   4266	offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
   4267	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
   4268	ar.goal -= offset;
   4269	ar.logical -= offset;
   4270	if (S_ISREG(inode->i_mode))
   4271		ar.flags = EXT4_MB_HINT_DATA;
   4272	else
   4273		/* disable in-core preallocation for non-regular files */
   4274		ar.flags = 0;
   4275	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
   4276		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
   4277	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
   4278		ar.flags |= EXT4_MB_DELALLOC_RESERVED;
   4279	if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
   4280		ar.flags |= EXT4_MB_USE_RESERVED;
   4281	newblock = ext4_mb_new_blocks(handle, &ar, &err);
   4282	if (!newblock)
   4283		goto out;
   4284	allocated_clusters = ar.len;
   4285	ar.len = EXT4_C2B(sbi, ar.len) - offset;
   4286	ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n",
   4287		  ar.goal, newblock, ar.len, allocated);
   4288	if (ar.len > allocated)
   4289		ar.len = allocated;
   4290
   4291got_allocated_blocks:
   4292	/* try to insert new extent into found leaf and return */
   4293	pblk = newblock + offset;
   4294	ext4_ext_store_pblock(&newex, pblk);
   4295	newex.ee_len = cpu_to_le16(ar.len);
   4296	/* Mark unwritten */
   4297	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
   4298		ext4_ext_mark_unwritten(&newex);
   4299		map->m_flags |= EXT4_MAP_UNWRITTEN;
   4300	}
   4301
   4302	err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags);
   4303	if (err) {
   4304		if (allocated_clusters) {
   4305			int fb_flags = 0;
   4306
   4307			/*
   4308			 * free data blocks we just allocated.
   4309			 * not a good idea to call discard here directly,
    4310			 * but otherwise we'd need to call it on every free().
   4311			 */
   4312			ext4_discard_preallocations(inode, 0);
   4313			if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
   4314				fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
   4315			ext4_free_blocks(handle, inode, NULL, newblock,
   4316					 EXT4_C2B(sbi, allocated_clusters),
   4317					 fb_flags);
   4318		}
   4319		goto out;
   4320	}
   4321
   4322	/*
   4323	 * Reduce the reserved cluster count to reflect successful deferred
   4324	 * allocation of delayed allocated clusters or direct allocation of
   4325	 * clusters discovered to be delayed allocated.  Once allocated, a
   4326	 * cluster is not included in the reserved count.
   4327	 */
   4328	if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) {
   4329		if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
   4330			/*
   4331			 * When allocating delayed allocated clusters, simply
   4332			 * reduce the reserved cluster count and claim quota
   4333			 */
   4334			ext4_da_update_reserve_space(inode, allocated_clusters,
   4335							1);
   4336		} else {
   4337			ext4_lblk_t lblk, len;
   4338			unsigned int n;
   4339
   4340			/*
   4341			 * When allocating non-delayed allocated clusters
   4342			 * (from fallocate, filemap, DIO, or clusters
   4343			 * allocated when delalloc has been disabled by
   4344			 * ext4_nonda_switch), reduce the reserved cluster
   4345			 * count by the number of allocated clusters that
   4346			 * have previously been delayed allocated.  Quota
   4347			 * has been claimed by ext4_mb_new_blocks() above,
   4348			 * so release the quota reservations made for any
   4349			 * previously delayed allocated clusters.
   4350			 */
   4351			lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
   4352			len = allocated_clusters << sbi->s_cluster_bits;
   4353			n = ext4_es_delayed_clu(inode, lblk, len);
   4354			if (n > 0)
   4355				ext4_da_update_reserve_space(inode, (int) n, 0);
   4356		}
   4357	}
   4358
   4359	/*
   4360	 * Cache the extent and update transaction to commit on fdatasync only
   4361	 * when it is _not_ an unwritten extent.
   4362	 */
   4363	if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
   4364		ext4_update_inode_fsync_trans(handle, inode, 1);
   4365	else
   4366		ext4_update_inode_fsync_trans(handle, inode, 0);
   4367
   4368	map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED);
   4369	map->m_pblk = pblk;
   4370	map->m_len = ar.len;
   4371	allocated = map->m_len;
   4372	ext4_ext_show_leaf(inode, path);
   4373out:
   4374	ext4_ext_drop_refs(path);
   4375	kfree(path);
   4376
   4377	trace_ext4_ext_map_blocks_exit(inode, flags, map,
   4378				       err ? err : allocated);
   4379	return err ? err : allocated;
   4380}
   4381
   4382int ext4_ext_truncate(handle_t *handle, struct inode *inode)
   4383{
   4384	struct super_block *sb = inode->i_sb;
   4385	ext4_lblk_t last_block;
   4386	int err = 0;
   4387
   4388	/*
   4389	 * TODO: optimization is possible here.
   4390	 * Probably we need not scan at all,
   4391	 * because page truncation is enough.
   4392	 */
   4393
   4394	/* we have to know where to truncate from in crash case */
   4395	EXT4_I(inode)->i_disksize = inode->i_size;
   4396	err = ext4_mark_inode_dirty(handle, inode);
   4397	if (err)
   4398		return err;
   4399
   4400	last_block = (inode->i_size + sb->s_blocksize - 1)
   4401			>> EXT4_BLOCK_SIZE_BITS(sb);
   4402retry:
   4403	err = ext4_es_remove_extent(inode, last_block,
   4404				    EXT_MAX_BLOCKS - last_block);
   4405	if (err == -ENOMEM) {
   4406		memalloc_retry_wait(GFP_ATOMIC);
   4407		goto retry;
   4408	}
   4409	if (err)
   4410		return err;
   4411retry_remove_space:
   4412	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
   4413	if (err == -ENOMEM) {
   4414		memalloc_retry_wait(GFP_ATOMIC);
   4415		goto retry_remove_space;
   4416	}
   4417	return err;
   4418}
   4419
   4420static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
   4421				  ext4_lblk_t len, loff_t new_size,
   4422				  int flags)
   4423{
   4424	struct inode *inode = file_inode(file);
   4425	handle_t *handle;
   4426	int ret = 0, ret2 = 0, ret3 = 0;
   4427	int retries = 0;
   4428	int depth = 0;
   4429	struct ext4_map_blocks map;
   4430	unsigned int credits;
   4431	loff_t epos;
   4432
   4433	BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
   4434	map.m_lblk = offset;
   4435	map.m_len = len;
   4436	/*
   4437	 * Don't normalize the request if it can fit in one extent so
   4438	 * that it doesn't get unnecessarily split into multiple
   4439	 * extents.
   4440	 */
   4441	if (len <= EXT_UNWRITTEN_MAX_LEN)
   4442		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
   4443
   4444	/*
   4445	 * credits to insert 1 extent into extent tree
   4446	 */
   4447	credits = ext4_chunk_trans_blocks(inode, len);
   4448	depth = ext_depth(inode);
   4449
   4450retry:
   4451	while (len) {
   4452		/*
   4453		 * Recalculate credits when extent tree depth changes.
   4454		 */
   4455		if (depth != ext_depth(inode)) {
   4456			credits = ext4_chunk_trans_blocks(inode, len);
   4457			depth = ext_depth(inode);
   4458		}
   4459
   4460		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
   4461					    credits);
   4462		if (IS_ERR(handle)) {
   4463			ret = PTR_ERR(handle);
   4464			break;
   4465		}
   4466		ret = ext4_map_blocks(handle, inode, &map, flags);
   4467		if (ret <= 0) {
   4468			ext4_debug("inode #%lu: block %u: len %u: "
   4469				   "ext4_ext_map_blocks returned %d",
   4470				   inode->i_ino, map.m_lblk,
   4471				   map.m_len, ret);
   4472			ext4_mark_inode_dirty(handle, inode);
   4473			ext4_journal_stop(handle);
   4474			break;
   4475		}
   4476		/*
   4477		 * allow a full retry cycle for any remaining allocations
   4478		 */
   4479		retries = 0;
   4480		map.m_lblk += ret;
   4481		map.m_len = len = len - ret;
   4482		epos = (loff_t)map.m_lblk << inode->i_blkbits;
   4483		inode->i_ctime = current_time(inode);
   4484		if (new_size) {
   4485			if (epos > new_size)
   4486				epos = new_size;
   4487			if (ext4_update_inode_size(inode, epos) & 0x1)
   4488				inode->i_mtime = inode->i_ctime;
   4489		}
   4490		ret2 = ext4_mark_inode_dirty(handle, inode);
   4491		ext4_update_inode_fsync_trans(handle, inode, 1);
   4492		ret3 = ext4_journal_stop(handle);
   4493		ret2 = ret3 ? ret3 : ret2;
   4494		if (unlikely(ret2))
   4495			break;
   4496	}
   4497	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
   4498		goto retry;
   4499
   4500	return ret > 0 ? ret2 : ret;
   4501}
   4502
   4503static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
   4504
   4505static int ext4_insert_range(struct file *file, loff_t offset, loff_t len);
   4506
   4507static long ext4_zero_range(struct file *file, loff_t offset,
   4508			    loff_t len, int mode)
   4509{
   4510	struct inode *inode = file_inode(file);
   4511	struct address_space *mapping = file->f_mapping;
   4512	handle_t *handle = NULL;
   4513	unsigned int max_blocks;
   4514	loff_t new_size = 0;
   4515	int ret = 0;
   4516	int flags;
   4517	int credits;
   4518	int partial_begin, partial_end;
   4519	loff_t start, end;
   4520	ext4_lblk_t lblk;
   4521	unsigned int blkbits = inode->i_blkbits;
   4522
   4523	trace_ext4_zero_range(inode, offset, len, mode);
   4524
   4525	/* Call ext4_force_commit to flush all data in case of data=journal. */
   4526	if (ext4_should_journal_data(inode)) {
   4527		ret = ext4_force_commit(inode->i_sb);
   4528		if (ret)
   4529			return ret;
   4530	}
   4531
   4532	/*
    4533	 * Round up the offset. This is not fallocate; we need to zero out
    4534	 * blocks, so convert the interior block-aligned part of the range to
    4535	 * unwritten and possibly manually zero out the unaligned parts of the
    4536	 * range.
   4537	 */
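	/*
	 * Example of the alignment below (hypothetical request, 4 KiB
	 * blocks): offset = 5000, len = 20000 gives start = 8192,
	 * end = 24576, partial_begin = 904, partial_end = 424, lblk = 2 and
	 * max_blocks = 4.  Blocks 2-5 are converted to unwritten extents,
	 * while the unaligned bytes at each end (5000-8191 and 24576-24999)
	 * are zeroed through the page cache further down.
	 */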
   4538	start = round_up(offset, 1 << blkbits);
   4539	end = round_down((offset + len), 1 << blkbits);
   4540
   4541	if (start < offset || end > offset + len)
   4542		return -EINVAL;
   4543	partial_begin = offset & ((1 << blkbits) - 1);
   4544	partial_end = (offset + len) & ((1 << blkbits) - 1);
   4545
   4546	lblk = start >> blkbits;
   4547	max_blocks = (end >> blkbits);
   4548	if (max_blocks < lblk)
   4549		max_blocks = 0;
   4550	else
   4551		max_blocks -= lblk;
   4552
   4553	inode_lock(inode);
   4554
   4555	/*
   4556	 * Indirect files do not support unwritten extents
   4557	 */
   4558	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
   4559		ret = -EOPNOTSUPP;
   4560		goto out_mutex;
   4561	}
   4562
   4563	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
   4564	    (offset + len > inode->i_size ||
   4565	     offset + len > EXT4_I(inode)->i_disksize)) {
   4566		new_size = offset + len;
   4567		ret = inode_newsize_ok(inode, new_size);
   4568		if (ret)
   4569			goto out_mutex;
   4570	}
   4571
   4572	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
   4573
    4574	/* Wait for all existing DIO workers; newcomers will block on i_rwsem */
   4575	inode_dio_wait(inode);
   4576
   4577	ret = file_modified(file);
   4578	if (ret)
   4579		goto out_mutex;
   4580
   4581	/* Preallocate the range including the unaligned edges */
   4582	if (partial_begin || partial_end) {
   4583		ret = ext4_alloc_file_blocks(file,
   4584				round_down(offset, 1 << blkbits) >> blkbits,
   4585				(round_up((offset + len), 1 << blkbits) -
   4586				 round_down(offset, 1 << blkbits)) >> blkbits,
   4587				new_size, flags);
   4588		if (ret)
   4589			goto out_mutex;
   4590
   4591	}
   4592
   4593	/* Zero range excluding the unaligned edges */
   4594	if (max_blocks > 0) {
   4595		flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
   4596			  EXT4_EX_NOCACHE);
   4597
   4598		/*
   4599		 * Prevent page faults from reinstantiating pages we have
   4600		 * released from page cache.
   4601		 */
   4602		filemap_invalidate_lock(mapping);
   4603
   4604		ret = ext4_break_layouts(inode);
   4605		if (ret) {
   4606			filemap_invalidate_unlock(mapping);
   4607			goto out_mutex;
   4608		}
   4609
   4610		ret = ext4_update_disksize_before_punch(inode, offset, len);
   4611		if (ret) {
   4612			filemap_invalidate_unlock(mapping);
   4613			goto out_mutex;
   4614		}
   4615		/* Now release the pages and zero block aligned part of pages */
   4616		truncate_pagecache_range(inode, start, end - 1);
   4617		inode->i_mtime = inode->i_ctime = current_time(inode);
   4618
   4619		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
   4620					     flags);
   4621		filemap_invalidate_unlock(mapping);
   4622		if (ret)
   4623			goto out_mutex;
   4624	}
   4625	if (!partial_begin && !partial_end)
   4626		goto out_mutex;
   4627
   4628	/*
    4629	 * In the worst case we have to write out two nonadjacent unwritten
    4630	 * blocks and update the inode.
   4631	 */
   4632	credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
   4633	if (ext4_should_journal_data(inode))
   4634		credits += 2;
   4635	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
   4636	if (IS_ERR(handle)) {
   4637		ret = PTR_ERR(handle);
   4638		ext4_std_error(inode->i_sb, ret);
   4639		goto out_mutex;
   4640	}
   4641
   4642	inode->i_mtime = inode->i_ctime = current_time(inode);
   4643	if (new_size)
   4644		ext4_update_inode_size(inode, new_size);
   4645	ret = ext4_mark_inode_dirty(handle, inode);
   4646	if (unlikely(ret))
   4647		goto out_handle;
   4648	/* Zero out partial block at the edges of the range */
   4649	ret = ext4_zero_partial_blocks(handle, inode, offset, len);
   4650	if (ret >= 0)
   4651		ext4_update_inode_fsync_trans(handle, inode, 1);
   4652
   4653	if (file->f_flags & O_SYNC)
   4654		ext4_handle_sync(handle);
   4655
   4656out_handle:
   4657	ext4_journal_stop(handle);
   4658out_mutex:
   4659	inode_unlock(inode);
   4660	return ret;
   4661}
   4662
   4663/*
    4664 * Preallocate space for a file. This implements ext4's fallocate file
    4665 * operation, which gets called from the sys_fallocate system call.
    4666 * For block-mapped files, posix_fallocate should fall back to the method
    4667 * of writing zeroes to the required new blocks (the same behavior that is
    4668 * expected for file systems which do not support the fallocate() system call).
   4669 */
   4670long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
   4671{
   4672	struct inode *inode = file_inode(file);
   4673	loff_t new_size = 0;
   4674	unsigned int max_blocks;
   4675	int ret = 0;
   4676	int flags;
   4677	ext4_lblk_t lblk;
   4678	unsigned int blkbits = inode->i_blkbits;
   4679
   4680	/*
   4681	 * Encrypted inodes can't handle collapse range or insert
   4682	 * range since we would need to re-encrypt blocks with a
   4683	 * different IV or XTS tweak (which are based on the logical
   4684	 * block number).
   4685	 */
   4686	if (IS_ENCRYPTED(inode) &&
   4687	    (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
   4688		return -EOPNOTSUPP;
   4689
   4690	/* Return error if mode is not supported */
   4691	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
   4692		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
   4693		     FALLOC_FL_INSERT_RANGE))
   4694		return -EOPNOTSUPP;
   4695
   4696	inode_lock(inode);
   4697	ret = ext4_convert_inline_data(inode);
   4698	inode_unlock(inode);
   4699	if (ret)
   4700		goto exit;
   4701
   4702	if (mode & FALLOC_FL_PUNCH_HOLE) {
   4703		ret = ext4_punch_hole(file, offset, len);
   4704		goto exit;
   4705	}
   4706
   4707	if (mode & FALLOC_FL_COLLAPSE_RANGE) {
   4708		ret = ext4_collapse_range(file, offset, len);
   4709		goto exit;
   4710	}
   4711
   4712	if (mode & FALLOC_FL_INSERT_RANGE) {
   4713		ret = ext4_insert_range(file, offset, len);
   4714		goto exit;
   4715	}
   4716
   4717	if (mode & FALLOC_FL_ZERO_RANGE) {
   4718		ret = ext4_zero_range(file, offset, len, mode);
   4719		goto exit;
   4720	}
   4721	trace_ext4_fallocate_enter(inode, offset, len, mode);
   4722	lblk = offset >> blkbits;
   4723
   4724	max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
   4725	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
   4726
   4727	inode_lock(inode);
   4728
   4729	/*
    4730	 * We only support preallocation for extent-based files
   4731	 */
   4732	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
   4733		ret = -EOPNOTSUPP;
   4734		goto out;
   4735	}
   4736
   4737	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
   4738	    (offset + len > inode->i_size ||
   4739	     offset + len > EXT4_I(inode)->i_disksize)) {
   4740		new_size = offset + len;
   4741		ret = inode_newsize_ok(inode, new_size);
   4742		if (ret)
   4743			goto out;
   4744	}
   4745
    4746	/* Wait for all existing DIO workers; newcomers will block on i_rwsem */
   4747	inode_dio_wait(inode);
   4748
   4749	ret = file_modified(file);
   4750	if (ret)
   4751		goto out;
   4752
   4753	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
   4754	if (ret)
   4755		goto out;
   4756
   4757	if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
   4758		ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
   4759					EXT4_I(inode)->i_sync_tid);
   4760	}
   4761out:
   4762	inode_unlock(inode);
   4763	trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
   4764exit:
   4765	return ret;
   4766}
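/*
 * Illustrative user-space sketch (hypothetical file name and sizes) of how
 * the modes handled by ext4_fallocate() are exercised through the
 * fallocate(2) system call; wrapped in #if 0 so it stays out of the build.
 */
#if 0
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Preallocate 1 MiB of unwritten extents without changing i_size. */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20))
		perror("FALLOC_FL_KEEP_SIZE");
	/* Zero a range; block-aligned parts become unwritten extents. */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 4096, 16 * 4096))
		perror("FALLOC_FL_ZERO_RANGE");
	/* Punch a hole; PUNCH_HOLE must be combined with KEEP_SIZE. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4096, 4096))
		perror("FALLOC_FL_PUNCH_HOLE");
	close(fd);
	return 0;
}
#endif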
   4767
   4768/*
    4769 * This function converts a range of blocks to written extents.
    4770 * The caller of this function will pass the start offset and the size;
    4771 * all unwritten extents within this range will be converted to
    4772 * written extents.
    4773 *
    4774 * This function is called from the direct IO end io callback
    4775 * function, to convert the fallocated extents after IO is completed.
    4776 * Returns 0 on success.
   4777 */
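/*
 * Example (hypothetical request, 4 KiB blocks): a DIO completion for
 * offset = 1000, len = 10000 covers bytes 1000-10999, i.e. logical blocks
 * 0-2, so max_blocks is 3; the loop below keeps calling ext4_map_blocks()
 * with EXT4_GET_BLOCKS_IO_CONVERT_EXT until those blocks have been
 * converted to written extents.
 */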
   4778int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
   4779				   loff_t offset, ssize_t len)
   4780{
   4781	unsigned int max_blocks;
   4782	int ret = 0, ret2 = 0, ret3 = 0;
   4783	struct ext4_map_blocks map;
   4784	unsigned int blkbits = inode->i_blkbits;
   4785	unsigned int credits = 0;
   4786
   4787	map.m_lblk = offset >> blkbits;
   4788	max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
   4789
   4790	if (!handle) {
   4791		/*
   4792		 * credits to insert 1 extent into extent tree
   4793		 */
   4794		credits = ext4_chunk_trans_blocks(inode, max_blocks);
   4795	}
   4796	while (ret >= 0 && ret < max_blocks) {
   4797		map.m_lblk += ret;
   4798		map.m_len = (max_blocks -= ret);
   4799		if (credits) {
   4800			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
   4801						    credits);
   4802			if (IS_ERR(handle)) {
   4803				ret = PTR_ERR(handle);
   4804				break;
   4805			}
   4806		}
   4807		ret = ext4_map_blocks(handle, inode, &map,
   4808				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
   4809		if (ret <= 0)
   4810			ext4_warning(inode->i_sb,
   4811				     "inode #%lu: block %u: len %u: "
   4812				     "ext4_ext_map_blocks returned %d",
   4813				     inode->i_ino, map.m_lblk,
   4814				     map.m_len, ret);
   4815		ret2 = ext4_mark_inode_dirty(handle, inode);
   4816		if (credits) {
   4817			ret3 = ext4_journal_stop(handle);
   4818			if (unlikely(ret3))
   4819				ret2 = ret3;
   4820		}
   4821
   4822		if (ret <= 0 || ret2)
   4823			break;
   4824	}
   4825	return ret > 0 ? ret2 : ret;
   4826}
   4827
   4828int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
   4829{
   4830	int ret = 0, err = 0;
   4831	struct ext4_io_end_vec *io_end_vec;
   4832
   4833	/*
   4834	 * This is somewhat ugly but the idea is clear: When transaction is
    4835	 * reserved, everything goes into it. Otherwise we'd rather start several
   4836	 * smaller transactions for conversion of each extent separately.
   4837	 */
   4838	if (handle) {
   4839		handle = ext4_journal_start_reserved(handle,
   4840						     EXT4_HT_EXT_CONVERT);
   4841		if (IS_ERR(handle))
   4842			return PTR_ERR(handle);
   4843	}
   4844
   4845	list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
   4846		ret = ext4_convert_unwritten_extents(handle, io_end->inode,
   4847						     io_end_vec->offset,
   4848						     io_end_vec->size);
   4849		if (ret)
   4850			break;
   4851	}
   4852
   4853	if (handle)
   4854		err = ext4_journal_stop(handle);
   4855
   4856	return ret < 0 ? ret : err;
   4857}
   4858
   4859static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap)
   4860{
   4861	__u64 physical = 0;
   4862	__u64 length = 0;
   4863	int blockbits = inode->i_sb->s_blocksize_bits;
   4864	int error = 0;
   4865	u16 iomap_type;
   4866
   4867	/* in-inode? */
   4868	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
   4869		struct ext4_iloc iloc;
   4870		int offset;	/* offset of xattr in inode */
   4871
   4872		error = ext4_get_inode_loc(inode, &iloc);
   4873		if (error)
   4874			return error;
   4875		physical = (__u64)iloc.bh->b_blocknr << blockbits;
   4876		offset = EXT4_GOOD_OLD_INODE_SIZE +
   4877				EXT4_I(inode)->i_extra_isize;
   4878		physical += offset;
   4879		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
   4880		brelse(iloc.bh);
   4881		iomap_type = IOMAP_INLINE;
   4882	} else if (EXT4_I(inode)->i_file_acl) { /* external block */
   4883		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
   4884		length = inode->i_sb->s_blocksize;
   4885		iomap_type = IOMAP_MAPPED;
   4886	} else {
   4887		/* no in-inode or external block for xattr, so return -ENOENT */
   4888		error = -ENOENT;
   4889		goto out;
   4890	}
   4891
   4892	iomap->addr = physical;
   4893	iomap->offset = 0;
   4894	iomap->length = length;
   4895	iomap->type = iomap_type;
   4896	iomap->flags = 0;
   4897out:
   4898	return error;
   4899}
   4900
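        /*
         * iomap_begin callback backing FIEMAP_FLAG_XATTR requests: offsets at
         * or beyond the end of the xattr area are reported as -ENOENT.
         */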
   4901static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset,
   4902				  loff_t length, unsigned flags,
   4903				  struct iomap *iomap, struct iomap *srcmap)
   4904{
   4905	int error;
   4906
   4907	error = ext4_iomap_xattr_fiemap(inode, iomap);
   4908	if (error == 0 && (offset >= iomap->length))
   4909		error = -ENOENT;
   4910	return error;
   4911}
   4912
   4913static const struct iomap_ops ext4_iomap_xattr_ops = {
   4914	.iomap_begin		= ext4_iomap_xattr_begin,
   4915};
   4916
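        /*
         * Validate a fiemap request and clamp its length to the largest file
         * offset the inode can address (s_maxbytes for extent-mapped files,
         * s_bitmap_maxbytes for block-mapped ones).
         */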
   4917static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
   4918{
   4919	u64 maxbytes;
   4920
   4921	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
   4922		maxbytes = inode->i_sb->s_maxbytes;
   4923	else
   4924		maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
   4925
   4926	if (*len == 0)
   4927		return -EINVAL;
   4928	if (start > maxbytes)
   4929		return -EFBIG;
   4930
   4931	/*
   4932	 * Shrink request scope to what the fs can actually handle.
   4933	 */
   4934	if (*len > maxbytes || (maxbytes - *len) < start)
   4935		*len = maxbytes - start;
   4936	return 0;
   4937}
   4938
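        /*
         * ->fiemap handler: precache the extent tree when FIEMAP_FLAG_CACHE is
         * set, report the xattr location when FIEMAP_FLAG_XATTR is set, and
         * otherwise report data mappings via ext4_iomap_report_ops.
         */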
   4939int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
   4940		u64 start, u64 len)
   4941{
   4942	int error = 0;
   4943
   4944	if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
   4945		error = ext4_ext_precache(inode);
   4946		if (error)
   4947			return error;
   4948		fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
   4949	}
   4950
   4951	/*
    4952	 * For block-mapped (non-extent) files the maximum size limit could be
    4953	 * smaller than s_maxbytes, so check len here manually instead of just
    4954	 * relying on the generic check.
   4955	 */
   4956	error = ext4_fiemap_check_ranges(inode, start, &len);
   4957	if (error)
   4958		return error;
   4959
   4960	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
   4961		fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
   4962		return iomap_fiemap(inode, fieinfo, start, len,
   4963				    &ext4_iomap_xattr_ops);
   4964	}
   4965
   4966	return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops);
   4967}
   4968
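        /*
         * Like ext4_fiemap(), but reports entries from the extent status cache
         * for the requested range.  Inodes that still carry inline data are
         * reported as having no extents.
         */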
   4969int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
   4970		      __u64 start, __u64 len)
   4971{
   4972	ext4_lblk_t start_blk, len_blks;
   4973	__u64 last_blk;
   4974	int error = 0;
   4975
   4976	if (ext4_has_inline_data(inode)) {
   4977		int has_inline;
   4978
   4979		down_read(&EXT4_I(inode)->xattr_sem);
   4980		has_inline = ext4_has_inline_data(inode);
   4981		up_read(&EXT4_I(inode)->xattr_sem);
   4982		if (has_inline)
   4983			return 0;
   4984	}
   4985
   4986	if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
   4987		error = ext4_ext_precache(inode);
   4988		if (error)
   4989			return error;
   4990		fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
   4991	}
   4992
   4993	error = fiemap_prep(inode, fieinfo, start, &len, 0);
   4994	if (error)
   4995		return error;
   4996
   4997	error = ext4_fiemap_check_ranges(inode, start, &len);
   4998	if (error)
   4999		return error;
   5000
   5001	start_blk = start >> inode->i_sb->s_blocksize_bits;
   5002	last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
   5003	if (last_blk >= EXT_MAX_BLOCKS)
   5004		last_blk = EXT_MAX_BLOCKS-1;
   5005	len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
   5006
   5007	/*
   5008	 * Walk the extent tree gathering extent information
   5009	 * and pushing extents back to the user.
   5010	 */
   5011	return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo);
   5012}
   5013
   5014/*
   5015 * ext4_ext_shift_path_extents:
   5016 * Shift the extents of a path structure lying between path[depth].p_ext
   5017 * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
    5018 * whether it is a right-shift or a left-shift operation.
   5019 */
   5020static int
   5021ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
   5022			    struct inode *inode, handle_t *handle,
   5023			    enum SHIFT_DIRECTION SHIFT)
   5024{
   5025	int depth, err = 0;
   5026	struct ext4_extent *ex_start, *ex_last;
   5027	bool update = false;
   5028	int credits, restart_credits;
   5029	depth = path->p_depth;
   5030
   5031	while (depth >= 0) {
   5032		if (depth == path->p_depth) {
   5033			ex_start = path[depth].p_ext;
   5034			if (!ex_start)
   5035				return -EFSCORRUPTED;
   5036
   5037			ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
   5038			/* leaf + sb + inode */
   5039			credits = 3;
   5040			if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) {
   5041				update = true;
   5042				/* extent tree + sb + inode */
   5043				credits = depth + 2;
   5044			}
   5045
   5046			restart_credits = ext4_writepage_trans_blocks(inode);
   5047			err = ext4_datasem_ensure_credits(handle, inode, credits,
   5048					restart_credits, 0);
   5049			if (err) {
   5050				if (err > 0)
   5051					err = -EAGAIN;
   5052				goto out;
   5053			}
   5054
   5055			err = ext4_ext_get_access(handle, inode, path + depth);
   5056			if (err)
   5057				goto out;
   5058
   5059			while (ex_start <= ex_last) {
   5060				if (SHIFT == SHIFT_LEFT) {
   5061					le32_add_cpu(&ex_start->ee_block,
   5062						-shift);
   5063					/* Try to merge to the left. */
   5064					if ((ex_start >
   5065					    EXT_FIRST_EXTENT(path[depth].p_hdr))
   5066					    &&
   5067					    ext4_ext_try_to_merge_right(inode,
   5068					    path, ex_start - 1))
   5069						ex_last--;
   5070					else
   5071						ex_start++;
   5072				} else {
   5073					le32_add_cpu(&ex_last->ee_block, shift);
   5074					ext4_ext_try_to_merge_right(inode, path,
   5075						ex_last);
   5076					ex_last--;
   5077				}
   5078			}
   5079			err = ext4_ext_dirty(handle, inode, path + depth);
   5080			if (err)
   5081				goto out;
   5082
   5083			if (--depth < 0 || !update)
   5084				break;
   5085		}
   5086
   5087		/* Update index too */
   5088		err = ext4_ext_get_access(handle, inode, path + depth);
   5089		if (err)
   5090			goto out;
   5091
   5092		if (SHIFT == SHIFT_LEFT)
   5093			le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
   5094		else
   5095			le32_add_cpu(&path[depth].p_idx->ei_block, shift);
   5096		err = ext4_ext_dirty(handle, inode, path + depth);
   5097		if (err)
   5098			goto out;
   5099
    5100		/* we are done if the current index is not the first index in the node */
   5101		if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
   5102			break;
   5103
   5104		depth--;
   5105	}
   5106
   5107out:
   5108	return err;
   5109}
   5110
   5111/*
   5112 * ext4_ext_shift_extents:
    5113 * All the extents that lie in the range from @start to the last allocated
    5114 * block for the @inode are shifted either left or right (depending
    5115 * upon @SHIFT) by @shift blocks.
   5116 * On success, 0 is returned, error otherwise.
   5117 */
   5118static int
   5119ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
   5120		       ext4_lblk_t start, ext4_lblk_t shift,
   5121		       enum SHIFT_DIRECTION SHIFT)
   5122{
   5123	struct ext4_ext_path *path;
   5124	int ret = 0, depth;
   5125	struct ext4_extent *extent;
   5126	ext4_lblk_t stop, *iterator, ex_start, ex_end;
   5127	ext4_lblk_t tmp = EXT_MAX_BLOCKS;
   5128
   5129	/* Let path point to the last extent */
   5130	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
   5131				EXT4_EX_NOCACHE);
   5132	if (IS_ERR(path))
   5133		return PTR_ERR(path);
   5134
   5135	depth = path->p_depth;
   5136	extent = path[depth].p_ext;
   5137	if (!extent)
   5138		goto out;
   5139
   5140	stop = le32_to_cpu(extent->ee_block);
   5141
    5142	/*
    5143	 * For left shifts, make sure the hole on the left is big enough to
    5144	 * accommodate the shift.  For right shifts, make sure the last extent
    5145	 * won't be shifted beyond EXT_MAX_BLOCKS.
    5146	 */
   5147	if (SHIFT == SHIFT_LEFT) {
   5148		path = ext4_find_extent(inode, start - 1, &path,
   5149					EXT4_EX_NOCACHE);
   5150		if (IS_ERR(path))
   5151			return PTR_ERR(path);
   5152		depth = path->p_depth;
   5153		extent =  path[depth].p_ext;
   5154		if (extent) {
   5155			ex_start = le32_to_cpu(extent->ee_block);
   5156			ex_end = le32_to_cpu(extent->ee_block) +
   5157				ext4_ext_get_actual_len(extent);
   5158		} else {
   5159			ex_start = 0;
   5160			ex_end = 0;
   5161		}
   5162
   5163		if ((start == ex_start && shift > ex_start) ||
   5164		    (shift > start - ex_end)) {
   5165			ret = -EINVAL;
   5166			goto out;
   5167		}
   5168	} else {
   5169		if (shift > EXT_MAX_BLOCKS -
   5170		    (stop + ext4_ext_get_actual_len(extent))) {
   5171			ret = -EINVAL;
   5172			goto out;
   5173		}
   5174	}
   5175
   5176	/*
   5177	 * In case of left shift, iterator points to start and it is increased
   5178	 * till we reach stop. In case of right shift, iterator points to stop
   5179	 * and it is decreased till we reach start.
   5180	 */
   5181again:
   5182	if (SHIFT == SHIFT_LEFT)
   5183		iterator = &start;
   5184	else
   5185		iterator = &stop;
   5186
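        	/*
        	 * If we are retrying after -EAGAIN (a transaction restart in
        	 * ext4_ext_shift_path_extents()), resume from the iterator
        	 * position saved in tmp instead of starting over.
        	 */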
   5187	if (tmp != EXT_MAX_BLOCKS)
   5188		*iterator = tmp;
   5189
   5190	/*
    5191	 * It's safe to start updating extents.  Start and stop are unsigned, so
   5192	 * in case of right shift if extent with 0 block is reached, iterator
   5193	 * becomes NULL to indicate the end of the loop.
   5194	 */
   5195	while (iterator && start <= stop) {
   5196		path = ext4_find_extent(inode, *iterator, &path,
   5197					EXT4_EX_NOCACHE);
   5198		if (IS_ERR(path))
   5199			return PTR_ERR(path);
   5200		depth = path->p_depth;
   5201		extent = path[depth].p_ext;
   5202		if (!extent) {
   5203			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
   5204					 (unsigned long) *iterator);
   5205			return -EFSCORRUPTED;
   5206		}
   5207		if (SHIFT == SHIFT_LEFT && *iterator >
   5208		    le32_to_cpu(extent->ee_block)) {
   5209			/* Hole, move to the next extent */
   5210			if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
   5211				path[depth].p_ext++;
   5212			} else {
   5213				*iterator = ext4_ext_next_allocated_block(path);
   5214				continue;
   5215			}
   5216		}
   5217
   5218		tmp = *iterator;
   5219		if (SHIFT == SHIFT_LEFT) {
   5220			extent = EXT_LAST_EXTENT(path[depth].p_hdr);
   5221			*iterator = le32_to_cpu(extent->ee_block) +
   5222					ext4_ext_get_actual_len(extent);
   5223		} else {
   5224			extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
   5225			if (le32_to_cpu(extent->ee_block) > 0)
   5226				*iterator = le32_to_cpu(extent->ee_block) - 1;
   5227			else
   5228				/* Beginning is reached, end of the loop */
   5229				iterator = NULL;
   5230			/* Update path extent in case we need to stop */
   5231			while (le32_to_cpu(extent->ee_block) < start)
   5232				extent++;
   5233			path[depth].p_ext = extent;
   5234		}
   5235		ret = ext4_ext_shift_path_extents(path, shift, inode,
   5236				handle, SHIFT);
   5237		/* iterator can be NULL which means we should break */
   5238		if (ret == -EAGAIN)
   5239			goto again;
   5240		if (ret)
   5241			break;
   5242	}
   5243out:
   5244	ext4_ext_drop_refs(path);
   5245	kfree(path);
   5246	return ret;
   5247}
   5248
   5249/*
   5250 * ext4_collapse_range:
    5251 * This implements fallocate's collapse range functionality for ext4.
    5252 * Returns 0 on success, non-zero on error.
   5253 */
   5254static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
   5255{
   5256	struct inode *inode = file_inode(file);
   5257	struct super_block *sb = inode->i_sb;
   5258	struct address_space *mapping = inode->i_mapping;
   5259	ext4_lblk_t punch_start, punch_stop;
   5260	handle_t *handle;
   5261	unsigned int credits;
   5262	loff_t new_size, ioffset;
   5263	int ret;
   5264
   5265	/*
   5266	 * We need to test this early because xfstests assumes that a
   5267	 * collapse range of (0, 1) will return EOPNOTSUPP if the file
   5268	 * system does not support collapse range.
   5269	 */
   5270	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
   5271		return -EOPNOTSUPP;
   5272
   5273	/* Collapse range works only on fs cluster size aligned regions. */
   5274	if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
   5275		return -EINVAL;
   5276
   5277	trace_ext4_collapse_range(inode, offset, len);
   5278
   5279	punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
   5280	punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
   5281
   5282	/* Call ext4_force_commit to flush all data in case of data=journal. */
   5283	if (ext4_should_journal_data(inode)) {
   5284		ret = ext4_force_commit(inode->i_sb);
   5285		if (ret)
   5286			return ret;
   5287	}
   5288
   5289	inode_lock(inode);
   5290	/*
    5291	 * The collapse range must not overlap EOF; in that case it would
    5292	 * effectively be a truncate operation.
   5293	 */
   5294	if (offset + len >= inode->i_size) {
   5295		ret = -EINVAL;
   5296		goto out_mutex;
   5297	}
   5298
   5299	/* Currently just for extent based files */
   5300	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
   5301		ret = -EOPNOTSUPP;
   5302		goto out_mutex;
   5303	}
   5304
   5305	/* Wait for existing dio to complete */
   5306	inode_dio_wait(inode);
   5307
   5308	ret = file_modified(file);
   5309	if (ret)
   5310		goto out_mutex;
   5311
   5312	/*
   5313	 * Prevent page faults from reinstantiating pages we have released from
   5314	 * page cache.
   5315	 */
   5316	filemap_invalidate_lock(mapping);
   5317
   5318	ret = ext4_break_layouts(inode);
   5319	if (ret)
   5320		goto out_mmap;
   5321
   5322	/*
   5323	 * Need to round down offset to be aligned with page size boundary
   5324	 * for page size > block size.
   5325	 */
   5326	ioffset = round_down(offset, PAGE_SIZE);
   5327	/*
   5328	 * Write tail of the last page before removed range since it will get
   5329	 * removed from the page cache below.
   5330	 */
   5331	ret = filemap_write_and_wait_range(mapping, ioffset, offset);
   5332	if (ret)
   5333		goto out_mmap;
   5334	/*
    5335	 * Write out data that will be shifted to preserve it when discarding
   5336	 * page cache below. We are also protected from pages becoming dirty
   5337	 * by i_rwsem and invalidate_lock.
   5338	 */
   5339	ret = filemap_write_and_wait_range(mapping, offset + len,
   5340					   LLONG_MAX);
   5341	if (ret)
   5342		goto out_mmap;
   5343	truncate_pagecache(inode, ioffset);
   5344
   5345	credits = ext4_writepage_trans_blocks(inode);
   5346	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
   5347	if (IS_ERR(handle)) {
   5348		ret = PTR_ERR(handle);
   5349		goto out_mmap;
   5350	}
   5351	ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
   5352
   5353	down_write(&EXT4_I(inode)->i_data_sem);
   5354	ext4_discard_preallocations(inode, 0);
   5355
   5356	ret = ext4_es_remove_extent(inode, punch_start,
   5357				    EXT_MAX_BLOCKS - punch_start);
   5358	if (ret) {
   5359		up_write(&EXT4_I(inode)->i_data_sem);
   5360		goto out_stop;
   5361	}
   5362
   5363	ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
   5364	if (ret) {
   5365		up_write(&EXT4_I(inode)->i_data_sem);
   5366		goto out_stop;
   5367	}
   5368	ext4_discard_preallocations(inode, 0);
   5369
   5370	ret = ext4_ext_shift_extents(inode, handle, punch_stop,
   5371				     punch_stop - punch_start, SHIFT_LEFT);
   5372	if (ret) {
   5373		up_write(&EXT4_I(inode)->i_data_sem);
   5374		goto out_stop;
   5375	}
   5376
   5377	new_size = inode->i_size - len;
   5378	i_size_write(inode, new_size);
   5379	EXT4_I(inode)->i_disksize = new_size;
   5380
   5381	up_write(&EXT4_I(inode)->i_data_sem);
   5382	if (IS_SYNC(inode))
   5383		ext4_handle_sync(handle);
   5384	inode->i_mtime = inode->i_ctime = current_time(inode);
   5385	ret = ext4_mark_inode_dirty(handle, inode);
   5386	ext4_update_inode_fsync_trans(handle, inode, 1);
   5387
   5388out_stop:
   5389	ext4_journal_stop(handle);
   5390out_mmap:
   5391	filemap_invalidate_unlock(mapping);
   5392out_mutex:
   5393	inode_unlock(inode);
   5394	return ret;
   5395}
   5396
   5397/*
   5398 * ext4_insert_range:
   5399 * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
   5400 * The data blocks starting from @offset to the EOF are shifted by @len
    5401 * to the right to create a hole in the @inode. The inode size is increased
    5402 * by @len bytes.
   5403 * Returns 0 on success, error otherwise.
   5404 */
   5405static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
   5406{
   5407	struct inode *inode = file_inode(file);
   5408	struct super_block *sb = inode->i_sb;
   5409	struct address_space *mapping = inode->i_mapping;
   5410	handle_t *handle;
   5411	struct ext4_ext_path *path;
   5412	struct ext4_extent *extent;
   5413	ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
   5414	unsigned int credits, ee_len;
   5415	int ret = 0, depth, split_flag = 0;
   5416	loff_t ioffset;
   5417
   5418	/*
   5419	 * We need to test this early because xfstests assumes that an
   5420	 * insert range of (0, 1) will return EOPNOTSUPP if the file
   5421	 * system does not support insert range.
   5422	 */
   5423	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
   5424		return -EOPNOTSUPP;
   5425
   5426	/* Insert range works only on fs cluster size aligned regions. */
   5427	if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
   5428		return -EINVAL;
   5429
   5430	trace_ext4_insert_range(inode, offset, len);
   5431
   5432	offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
   5433	len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
   5434
   5435	/* Call ext4_force_commit to flush all data in case of data=journal */
   5436	if (ext4_should_journal_data(inode)) {
   5437		ret = ext4_force_commit(inode->i_sb);
   5438		if (ret)
   5439			return ret;
   5440	}
   5441
   5442	inode_lock(inode);
   5443	/* Currently just for extent based files */
   5444	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
   5445		ret = -EOPNOTSUPP;
   5446		goto out_mutex;
   5447	}
   5448
   5449	/* Check whether the maximum file size would be exceeded */
   5450	if (len > inode->i_sb->s_maxbytes - inode->i_size) {
   5451		ret = -EFBIG;
   5452		goto out_mutex;
   5453	}
   5454
   5455	/* Offset must be less than i_size */
   5456	if (offset >= inode->i_size) {
   5457		ret = -EINVAL;
   5458		goto out_mutex;
   5459	}
   5460
   5461	/* Wait for existing dio to complete */
   5462	inode_dio_wait(inode);
   5463
   5464	ret = file_modified(file);
   5465	if (ret)
   5466		goto out_mutex;
   5467
   5468	/*
   5469	 * Prevent page faults from reinstantiating pages we have released from
   5470	 * page cache.
   5471	 */
   5472	filemap_invalidate_lock(mapping);
   5473
   5474	ret = ext4_break_layouts(inode);
   5475	if (ret)
   5476		goto out_mmap;
   5477
   5478	/*
   5479	 * Need to round down to align start offset to page size boundary
   5480	 * for page size > block size.
   5481	 */
   5482	ioffset = round_down(offset, PAGE_SIZE);
   5483	/* Write out all dirty pages */
   5484	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
   5485			LLONG_MAX);
   5486	if (ret)
   5487		goto out_mmap;
   5488	truncate_pagecache(inode, ioffset);
   5489
   5490	credits = ext4_writepage_trans_blocks(inode);
   5491	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
   5492	if (IS_ERR(handle)) {
   5493		ret = PTR_ERR(handle);
   5494		goto out_mmap;
   5495	}
   5496	ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
   5497
    5498	/* Expand the file to avoid data loss if there is an error while shifting */
   5499	inode->i_size += len;
   5500	EXT4_I(inode)->i_disksize += len;
   5501	inode->i_mtime = inode->i_ctime = current_time(inode);
   5502	ret = ext4_mark_inode_dirty(handle, inode);
   5503	if (ret)
   5504		goto out_stop;
   5505
   5506	down_write(&EXT4_I(inode)->i_data_sem);
   5507	ext4_discard_preallocations(inode, 0);
   5508
   5509	path = ext4_find_extent(inode, offset_lblk, NULL, 0);
   5510	if (IS_ERR(path)) {
   5511		up_write(&EXT4_I(inode)->i_data_sem);
   5512		goto out_stop;
   5513	}
   5514
   5515	depth = ext_depth(inode);
   5516	extent = path[depth].p_ext;
   5517	if (extent) {
   5518		ee_start_lblk = le32_to_cpu(extent->ee_block);
   5519		ee_len = ext4_ext_get_actual_len(extent);
   5520
   5521		/*
   5522		 * If offset_lblk is not the starting block of extent, split
    5523		 * the extent at @offset_lblk
   5524		 */
   5525		if ((offset_lblk > ee_start_lblk) &&
   5526				(offset_lblk < (ee_start_lblk + ee_len))) {
   5527			if (ext4_ext_is_unwritten(extent))
   5528				split_flag = EXT4_EXT_MARK_UNWRIT1 |
   5529					EXT4_EXT_MARK_UNWRIT2;
   5530			ret = ext4_split_extent_at(handle, inode, &path,
   5531					offset_lblk, split_flag,
   5532					EXT4_EX_NOCACHE |
   5533					EXT4_GET_BLOCKS_PRE_IO |
   5534					EXT4_GET_BLOCKS_METADATA_NOFAIL);
   5535		}
   5536
   5537		ext4_ext_drop_refs(path);
   5538		kfree(path);
   5539		if (ret < 0) {
   5540			up_write(&EXT4_I(inode)->i_data_sem);
   5541			goto out_stop;
   5542		}
   5543	} else {
   5544		ext4_ext_drop_refs(path);
   5545		kfree(path);
   5546	}
   5547
   5548	ret = ext4_es_remove_extent(inode, offset_lblk,
   5549			EXT_MAX_BLOCKS - offset_lblk);
   5550	if (ret) {
   5551		up_write(&EXT4_I(inode)->i_data_sem);
   5552		goto out_stop;
   5553	}
   5554
   5555	/*
    5556	 * If offset_lblk lies in a hole at the start of the file, use
    5557	 * ee_start_lblk to shift extents.
   5558	 */
   5559	ret = ext4_ext_shift_extents(inode, handle,
   5560		ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
   5561		len_lblk, SHIFT_RIGHT);
   5562
   5563	up_write(&EXT4_I(inode)->i_data_sem);
   5564	if (IS_SYNC(inode))
   5565		ext4_handle_sync(handle);
   5566	if (ret >= 0)
   5567		ext4_update_inode_fsync_trans(handle, inode, 1);
   5568
   5569out_stop:
   5570	ext4_journal_stop(handle);
   5571out_mmap:
   5572	filemap_invalidate_unlock(mapping);
   5573out_mutex:
   5574	inode_unlock(inode);
   5575	return ret;
   5576}
   5577
   5578/**
   5579 * ext4_swap_extents() - Swap extents between two inodes
   5580 * @handle: handle for this transaction
   5581 * @inode1:	First inode
   5582 * @inode2:	Second inode
   5583 * @lblk1:	Start block for first inode
   5584 * @lblk2:	Start block for second inode
   5585 * @count:	Number of blocks to swap
   5586 * @unwritten: Mark second inode's extents as unwritten after swap
   5587 * @erp:	Pointer to save error value
   5588 *
    5589 * This helper routine does exactly what it promises: swap extents. All other
   5590 * stuff such as page-cache locking consistency, bh mapping consistency or
   5591 * extent's data copying must be performed by caller.
   5592 * Locking:
   5593 *		i_rwsem is held for both inodes
   5594 * 		i_data_sem is locked for write for both inodes
   5595 * Assumptions:
   5596 *		All pages from requested range are locked for both inodes
   5597 */
   5598int
   5599ext4_swap_extents(handle_t *handle, struct inode *inode1,
   5600		  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
   5601		  ext4_lblk_t count, int unwritten, int *erp)
   5602{
   5603	struct ext4_ext_path *path1 = NULL;
   5604	struct ext4_ext_path *path2 = NULL;
   5605	int replaced_count = 0;
   5606
   5607	BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
   5608	BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
   5609	BUG_ON(!inode_is_locked(inode1));
   5610	BUG_ON(!inode_is_locked(inode2));
   5611
   5612	*erp = ext4_es_remove_extent(inode1, lblk1, count);
   5613	if (unlikely(*erp))
   5614		return 0;
   5615	*erp = ext4_es_remove_extent(inode2, lblk2, count);
   5616	if (unlikely(*erp))
   5617		return 0;
   5618
   5619	while (count) {
   5620		struct ext4_extent *ex1, *ex2, tmp_ex;
   5621		ext4_lblk_t e1_blk, e2_blk;
   5622		int e1_len, e2_len, len;
   5623		int split = 0;
   5624
   5625		path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
   5626		if (IS_ERR(path1)) {
   5627			*erp = PTR_ERR(path1);
   5628			path1 = NULL;
   5629		finish:
   5630			count = 0;
   5631			goto repeat;
   5632		}
   5633		path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
   5634		if (IS_ERR(path2)) {
   5635			*erp = PTR_ERR(path2);
   5636			path2 = NULL;
   5637			goto finish;
   5638		}
   5639		ex1 = path1[path1->p_depth].p_ext;
   5640		ex2 = path2[path2->p_depth].p_ext;
   5641		/* Do we have something to swap ? */
   5642		if (unlikely(!ex2 || !ex1))
   5643			goto finish;
   5644
   5645		e1_blk = le32_to_cpu(ex1->ee_block);
   5646		e2_blk = le32_to_cpu(ex2->ee_block);
   5647		e1_len = ext4_ext_get_actual_len(ex1);
   5648		e2_len = ext4_ext_get_actual_len(ex2);
   5649
   5650		/* Hole handling */
   5651		if (!in_range(lblk1, e1_blk, e1_len) ||
   5652		    !in_range(lblk2, e2_blk, e2_len)) {
   5653			ext4_lblk_t next1, next2;
   5654
   5655			/* if hole after extent, then go to next extent */
   5656			next1 = ext4_ext_next_allocated_block(path1);
   5657			next2 = ext4_ext_next_allocated_block(path2);
   5658			/* If hole before extent, then shift to that extent */
   5659			if (e1_blk > lblk1)
   5660				next1 = e1_blk;
   5661			if (e2_blk > lblk2)
   5662				next2 = e2_blk;
   5663			/* Do we have something to swap */
   5664			if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
   5665				goto finish;
    5666			/* Move to the rightmost boundary */
   5667			len = next1 - lblk1;
   5668			if (len < next2 - lblk2)
   5669				len = next2 - lblk2;
   5670			if (len > count)
   5671				len = count;
   5672			lblk1 += len;
   5673			lblk2 += len;
   5674			count -= len;
   5675			goto repeat;
   5676		}
   5677
   5678		/* Prepare left boundary */
   5679		if (e1_blk < lblk1) {
   5680			split = 1;
   5681			*erp = ext4_force_split_extent_at(handle, inode1,
   5682						&path1, lblk1, 0);
   5683			if (unlikely(*erp))
   5684				goto finish;
   5685		}
   5686		if (e2_blk < lblk2) {
   5687			split = 1;
   5688			*erp = ext4_force_split_extent_at(handle, inode2,
   5689						&path2,  lblk2, 0);
   5690			if (unlikely(*erp))
   5691				goto finish;
   5692		}
    5693		/* ext4_split_extent_at() may result in a leaf extent split,
    5694		 * so the path must be revalidated. */
   5695		if (split)
   5696			goto repeat;
   5697
   5698		/* Prepare right boundary */
   5699		len = count;
   5700		if (len > e1_blk + e1_len - lblk1)
   5701			len = e1_blk + e1_len - lblk1;
   5702		if (len > e2_blk + e2_len - lblk2)
   5703			len = e2_blk + e2_len - lblk2;
   5704
   5705		if (len != e1_len) {
   5706			split = 1;
   5707			*erp = ext4_force_split_extent_at(handle, inode1,
   5708						&path1, lblk1 + len, 0);
   5709			if (unlikely(*erp))
   5710				goto finish;
   5711		}
   5712		if (len != e2_len) {
   5713			split = 1;
   5714			*erp = ext4_force_split_extent_at(handle, inode2,
   5715						&path2, lblk2 + len, 0);
   5716			if (*erp)
   5717				goto finish;
   5718		}
    5719		/* ext4_split_extent_at() may result in a leaf extent split,
    5720		 * so the path must be revalidated. */
   5721		if (split)
   5722			goto repeat;
   5723
   5724		BUG_ON(e2_len != e1_len);
   5725		*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
   5726		if (unlikely(*erp))
   5727			goto finish;
   5728		*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
   5729		if (unlikely(*erp))
   5730			goto finish;
   5731
   5732		/* Both extents are fully inside boundaries. Swap it now */
   5733		tmp_ex = *ex1;
   5734		ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
   5735		ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
   5736		ex1->ee_len = cpu_to_le16(e2_len);
   5737		ex2->ee_len = cpu_to_le16(e1_len);
   5738		if (unwritten)
   5739			ext4_ext_mark_unwritten(ex2);
   5740		if (ext4_ext_is_unwritten(&tmp_ex))
   5741			ext4_ext_mark_unwritten(ex1);
   5742
   5743		ext4_ext_try_to_merge(handle, inode2, path2, ex2);
   5744		ext4_ext_try_to_merge(handle, inode1, path1, ex1);
   5745		*erp = ext4_ext_dirty(handle, inode2, path2 +
   5746				      path2->p_depth);
   5747		if (unlikely(*erp))
   5748			goto finish;
   5749		*erp = ext4_ext_dirty(handle, inode1, path1 +
   5750				      path1->p_depth);
   5751		/*
    5752		 * Looks scary, huh? The second inode already points to the new
    5753		 * blocks, and it was successfully dirtied. But luckily an error
    5754		 * can only happen due to a journal error, so the full transaction
    5755		 * will be aborted anyway.
   5756		 */
   5757		if (unlikely(*erp))
   5758			goto finish;
   5759		lblk1 += len;
   5760		lblk2 += len;
   5761		replaced_count += len;
   5762		count -= len;
   5763
   5764	repeat:
   5765		ext4_ext_drop_refs(path1);
   5766		kfree(path1);
   5767		ext4_ext_drop_refs(path2);
   5768		kfree(path2);
   5769		path1 = path2 = NULL;
   5770	}
   5771	return replaced_count;
   5772}
   5773
   5774/*
   5775 * ext4_clu_mapped - determine whether any block in a logical cluster has
   5776 *                   been mapped to a physical cluster
   5777 *
   5778 * @inode - file containing the logical cluster
   5779 * @lclu - logical cluster of interest
   5780 *
   5781 * Returns 1 if any block in the logical cluster is mapped, signifying
   5782 * that a physical cluster has been allocated for it.  Otherwise,
   5783 * returns 0.  Can also return negative error codes.  Derived from
   5784 * ext4_ext_map_blocks().
   5785 */
   5786int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
   5787{
   5788	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   5789	struct ext4_ext_path *path;
   5790	int depth, mapped = 0, err = 0;
   5791	struct ext4_extent *extent;
   5792	ext4_lblk_t first_lblk, first_lclu, last_lclu;
   5793
   5794	/* search for the extent closest to the first block in the cluster */
   5795	path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
   5796	if (IS_ERR(path)) {
   5797		err = PTR_ERR(path);
   5798		path = NULL;
   5799		goto out;
   5800	}
   5801
   5802	depth = ext_depth(inode);
   5803
   5804	/*
   5805	 * A consistent leaf must not be empty.  This situation is possible,
   5806	 * though, _during_ tree modification, and it's why an assert can't
   5807	 * be put in ext4_find_extent().
   5808	 */
   5809	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
   5810		EXT4_ERROR_INODE(inode,
   5811		    "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
   5812				 (unsigned long) EXT4_C2B(sbi, lclu),
   5813				 depth, path[depth].p_block);
   5814		err = -EFSCORRUPTED;
   5815		goto out;
   5816	}
   5817
   5818	extent = path[depth].p_ext;
   5819
   5820	/* can't be mapped if the extent tree is empty */
   5821	if (extent == NULL)
   5822		goto out;
   5823
   5824	first_lblk = le32_to_cpu(extent->ee_block);
   5825	first_lclu = EXT4_B2C(sbi, first_lblk);
   5826
   5827	/*
    5828	 * Three possible outcomes at this point - the found extent spans
    5829	 * the target cluster, lies to the left of it, or lies to the
    5830	 * right of it.  The first two cases are handled here.
   5831	 * The last case indicates the target cluster is not mapped.
   5832	 */
   5833	if (lclu >= first_lclu) {
   5834		last_lclu = EXT4_B2C(sbi, first_lblk +
   5835				     ext4_ext_get_actual_len(extent) - 1);
   5836		if (lclu <= last_lclu) {
   5837			mapped = 1;
   5838		} else {
   5839			first_lblk = ext4_ext_next_allocated_block(path);
   5840			first_lclu = EXT4_B2C(sbi, first_lblk);
   5841			if (lclu == first_lclu)
   5842				mapped = 1;
   5843		}
   5844	}
   5845
   5846out:
   5847	ext4_ext_drop_refs(path);
   5848	kfree(path);
   5849
   5850	return err ? err : mapped;
   5851}
   5852
   5853/*
    5854 * Updates the physical block address and unwritten status of the extent
    5855 * starting at logical block @start and of length @len. If no such extent exists,
   5856 * this function splits the extent tree appropriately to create an
   5857 * extent like this.  This function is called in the fast commit
   5858 * replay path.  Returns 0 on success and error on failure.
   5859 */
   5860int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
   5861			      int len, int unwritten, ext4_fsblk_t pblk)
   5862{
   5863	struct ext4_ext_path *path = NULL, *ppath;
   5864	struct ext4_extent *ex;
   5865	int ret;
   5866
   5867	path = ext4_find_extent(inode, start, NULL, 0);
   5868	if (IS_ERR(path))
   5869		return PTR_ERR(path);
   5870	ex = path[path->p_depth].p_ext;
   5871	if (!ex) {
   5872		ret = -EFSCORRUPTED;
   5873		goto out;
   5874	}
   5875
   5876	if (le32_to_cpu(ex->ee_block) != start ||
   5877		ext4_ext_get_actual_len(ex) != len) {
   5878		/* We need to split this extent to match our extent first */
   5879		ppath = path;
   5880		down_write(&EXT4_I(inode)->i_data_sem);
   5881		ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1);
   5882		up_write(&EXT4_I(inode)->i_data_sem);
   5883		if (ret)
   5884			goto out;
   5885		kfree(path);
   5886		path = ext4_find_extent(inode, start, NULL, 0);
   5887		if (IS_ERR(path))
   5888			return -1;
   5889		ppath = path;
   5890		ex = path[path->p_depth].p_ext;
   5891		WARN_ON(le32_to_cpu(ex->ee_block) != start);
   5892		if (ext4_ext_get_actual_len(ex) != len) {
   5893			down_write(&EXT4_I(inode)->i_data_sem);
   5894			ret = ext4_force_split_extent_at(NULL, inode, &ppath,
   5895							 start + len, 1);
   5896			up_write(&EXT4_I(inode)->i_data_sem);
   5897			if (ret)
   5898				goto out;
   5899			kfree(path);
   5900			path = ext4_find_extent(inode, start, NULL, 0);
   5901			if (IS_ERR(path))
   5902				return -EINVAL;
   5903			ex = path[path->p_depth].p_ext;
   5904		}
   5905	}
   5906	if (unwritten)
   5907		ext4_ext_mark_unwritten(ex);
   5908	else
   5909		ext4_ext_mark_initialized(ex);
   5910	ext4_ext_store_pblock(ex, pblk);
   5911	down_write(&EXT4_I(inode)->i_data_sem);
   5912	ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
   5913	up_write(&EXT4_I(inode)->i_data_sem);
   5914out:
   5915	ext4_ext_drop_refs(path);
   5916	kfree(path);
   5917	ext4_mark_inode_dirty(NULL, inode);
   5918	return ret;
   5919}
   5920
   5921/* Try to shrink the extent tree */
   5922void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
   5923{
   5924	struct ext4_ext_path *path = NULL;
   5925	struct ext4_extent *ex;
   5926	ext4_lblk_t old_cur, cur = 0;
   5927
   5928	while (cur < end) {
   5929		path = ext4_find_extent(inode, cur, NULL, 0);
   5930		if (IS_ERR(path))
   5931			return;
   5932		ex = path[path->p_depth].p_ext;
   5933		if (!ex) {
   5934			ext4_ext_drop_refs(path);
   5935			kfree(path);
   5936			ext4_mark_inode_dirty(NULL, inode);
   5937			return;
   5938		}
   5939		old_cur = cur;
   5940		cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
   5941		if (cur <= old_cur)
   5942			cur = old_cur + 1;
   5943		ext4_ext_try_to_merge(NULL, inode, path, ex);
   5944		down_write(&EXT4_I(inode)->i_data_sem);
   5945		ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
   5946		up_write(&EXT4_I(inode)->i_data_sem);
   5947		ext4_mark_inode_dirty(NULL, inode);
   5948		ext4_ext_drop_refs(path);
   5949		kfree(path);
   5950	}
   5951}
   5952
   5953/* Check if *cur is a hole and if it is, skip it */
   5954static int skip_hole(struct inode *inode, ext4_lblk_t *cur)
   5955{
   5956	int ret;
   5957	struct ext4_map_blocks map;
   5958
   5959	map.m_lblk = *cur;
   5960	map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;
   5961
   5962	ret = ext4_map_blocks(NULL, inode, &map, 0);
   5963	if (ret < 0)
   5964		return ret;
   5965	if (ret != 0)
   5966		return 0;
   5967	*cur = *cur + map.m_len;
   5968	return 0;
   5969}
   5970
   5971/* Count number of blocks used by this inode and update i_blocks */
   5972int ext4_ext_replay_set_iblocks(struct inode *inode)
   5973{
   5974	struct ext4_ext_path *path = NULL, *path2 = NULL;
   5975	struct ext4_extent *ex;
   5976	ext4_lblk_t cur = 0, end;
   5977	int numblks = 0, i, ret = 0;
   5978	ext4_fsblk_t cmp1, cmp2;
   5979	struct ext4_map_blocks map;
   5980
    5981	/* Determine the size of the file first */
   5982	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
   5983					EXT4_EX_NOCACHE);
   5984	if (IS_ERR(path))
   5985		return PTR_ERR(path);
   5986	ex = path[path->p_depth].p_ext;
   5987	if (!ex) {
   5988		ext4_ext_drop_refs(path);
   5989		kfree(path);
   5990		goto out;
   5991	}
   5992	end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
   5993	ext4_ext_drop_refs(path);
   5994	kfree(path);
   5995
   5996	/* Count the number of data blocks */
   5997	cur = 0;
   5998	while (cur < end) {
   5999		map.m_lblk = cur;
   6000		map.m_len = end - cur;
   6001		ret = ext4_map_blocks(NULL, inode, &map, 0);
   6002		if (ret < 0)
   6003			break;
   6004		if (ret > 0)
   6005			numblks += ret;
   6006		cur = cur + map.m_len;
   6007	}
   6008
   6009	/*
   6010	 * Count the number of extent tree blocks. We do it by looking up
   6011	 * two successive extents and determining the difference between
    6012	 * their paths. When the paths of two successive extents differ,
    6013	 * we compare the blocks in the path at each level and increment
    6014	 * iblocks by the total number of differences found.
   6015	 */
   6016	cur = 0;
   6017	ret = skip_hole(inode, &cur);
   6018	if (ret < 0)
   6019		goto out;
   6020	path = ext4_find_extent(inode, cur, NULL, 0);
   6021	if (IS_ERR(path))
   6022		goto out;
   6023	numblks += path->p_depth;
   6024	ext4_ext_drop_refs(path);
   6025	kfree(path);
   6026	while (cur < end) {
   6027		path = ext4_find_extent(inode, cur, NULL, 0);
   6028		if (IS_ERR(path))
   6029			break;
   6030		ex = path[path->p_depth].p_ext;
   6031		if (!ex) {
   6032			ext4_ext_drop_refs(path);
   6033			kfree(path);
   6034			return 0;
   6035		}
   6036		cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
   6037					ext4_ext_get_actual_len(ex));
   6038		ret = skip_hole(inode, &cur);
   6039		if (ret < 0) {
   6040			ext4_ext_drop_refs(path);
   6041			kfree(path);
   6042			break;
   6043		}
   6044		path2 = ext4_find_extent(inode, cur, NULL, 0);
   6045		if (IS_ERR(path2)) {
   6046			ext4_ext_drop_refs(path);
   6047			kfree(path);
   6048			break;
   6049		}
   6050		for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
   6051			cmp1 = cmp2 = 0;
   6052			if (i <= path->p_depth)
   6053				cmp1 = path[i].p_bh ?
   6054					path[i].p_bh->b_blocknr : 0;
   6055			if (i <= path2->p_depth)
   6056				cmp2 = path2[i].p_bh ?
   6057					path2[i].p_bh->b_blocknr : 0;
   6058			if (cmp1 != cmp2 && cmp2 != 0)
   6059				numblks++;
   6060		}
   6061		ext4_ext_drop_refs(path);
   6062		ext4_ext_drop_refs(path2);
   6063		kfree(path);
   6064		kfree(path2);
   6065	}
   6066
   6067out:
   6068	inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
   6069	ext4_mark_inode_dirty(NULL, inode);
   6070	return 0;
   6071}
   6072
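        /*
         * Fast-commit replay helper: walk every mapped range of the inode and
         * clear the in-use state of its data blocks and extent tree blocks in
         * the block bitmaps, recording each region for fast-commit replay.
         */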
   6073int ext4_ext_clear_bb(struct inode *inode)
   6074{
   6075	struct ext4_ext_path *path = NULL;
   6076	struct ext4_extent *ex;
   6077	ext4_lblk_t cur = 0, end;
   6078	int j, ret = 0;
   6079	struct ext4_map_blocks map;
   6080
   6081	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
   6082		return 0;
   6083
    6084	/* Determine the size of the file first */
   6085	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
   6086					EXT4_EX_NOCACHE);
   6087	if (IS_ERR(path))
   6088		return PTR_ERR(path);
   6089	ex = path[path->p_depth].p_ext;
   6090	if (!ex) {
   6091		ext4_ext_drop_refs(path);
   6092		kfree(path);
   6093		return 0;
   6094	}
   6095	end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
   6096	ext4_ext_drop_refs(path);
   6097	kfree(path);
   6098
   6099	cur = 0;
   6100	while (cur < end) {
   6101		map.m_lblk = cur;
   6102		map.m_len = end - cur;
   6103		ret = ext4_map_blocks(NULL, inode, &map, 0);
   6104		if (ret < 0)
   6105			break;
   6106		if (ret > 0) {
   6107			path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
   6108			if (!IS_ERR_OR_NULL(path)) {
   6109				for (j = 0; j < path->p_depth; j++) {
   6110
   6111					ext4_mb_mark_bb(inode->i_sb,
   6112							path[j].p_block, 1, 0);
   6113					ext4_fc_record_regions(inode->i_sb, inode->i_ino,
   6114							0, path[j].p_block, 1, 1);
   6115				}
   6116				ext4_ext_drop_refs(path);
   6117				kfree(path);
   6118			}
   6119			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
   6120			ext4_fc_record_regions(inode->i_sb, inode->i_ino,
   6121					map.m_lblk, map.m_pblk, map.m_len, 1);
   6122		}
   6123		cur = cur + map.m_len;
   6124	}
   6125
   6126	return 0;
   6127}