cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

alloc.c (200579B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * alloc.c
      4 *
      5 * Extent allocs and frees
      6 *
      7 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
      8 */
      9
     10#include <linux/fs.h>
     11#include <linux/types.h>
     12#include <linux/slab.h>
     13#include <linux/highmem.h>
     14#include <linux/swap.h>
     15#include <linux/quotaops.h>
     16#include <linux/blkdev.h>
     17#include <linux/sched/signal.h>
     18
     19#include <cluster/masklog.h>
     20
     21#include "ocfs2.h"
     22
     23#include "alloc.h"
     24#include "aops.h"
     25#include "blockcheck.h"
     26#include "dlmglue.h"
     27#include "extent_map.h"
     28#include "inode.h"
     29#include "journal.h"
     30#include "localalloc.h"
     31#include "suballoc.h"
     32#include "sysfile.h"
     33#include "file.h"
     34#include "super.h"
     35#include "uptodate.h"
     36#include "xattr.h"
     37#include "refcounttree.h"
     38#include "ocfs2_trace.h"
     39
     40#include "buffer_head_io.h"
     41
     42enum ocfs2_contig_type {
     43	CONTIG_NONE = 0,
     44	CONTIG_LEFT,
     45	CONTIG_RIGHT,
     46	CONTIG_LEFTRIGHT,
     47};
     48
     49static enum ocfs2_contig_type
     50	ocfs2_extent_rec_contig(struct super_block *sb,
     51				struct ocfs2_extent_rec *ext,
     52				struct ocfs2_extent_rec *insert_rec);
     53/*
     54 * Operations for a specific extent tree type.
     55 *
     56 * To implement an on-disk btree (extent tree) type in ocfs2, add
     57 * an ocfs2_extent_tree_operations structure and the matching
     58 * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
     59 * for the allocation portion of the extent tree.
     60 */
     61struct ocfs2_extent_tree_operations {
     62	/*
     63	 * last_eb_blk is the block number of the right most leaf extent
     64	 * block.  Most on-disk structures containing an extent tree store
     65	 * this value for fast access.  The ->eo_set_last_eb_blk() and
     66	 * ->eo_get_last_eb_blk() operations access this value.  They are
     67	 *  both required.
     68	 */
     69	void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
     70				   u64 blkno);
     71	u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
     72
     73	/*
     74	 * The on-disk structure usually keeps track of how many total
     75	 * clusters are stored in this extent tree.  This function updates
     76	 * that value.  new_clusters is the delta, and must be
     77	 * added to the total.  Required.
     78	 */
     79	void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
     80				   u32 new_clusters);
     81
     82	/*
     83	 * If this extent tree is supported by an extent map, insert
     84	 * a record into the map.
     85	 */
     86	void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
     87				     struct ocfs2_extent_rec *rec);
     88
     89	/*
     90	 * If this extent tree is supported by an extent map, truncate the
     91	 * map to clusters,
     92	 */
     93	void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
     94				       u32 clusters);
     95
     96	/*
     97	 * If ->eo_insert_check() exists, it is called before rec is
     98	 * inserted into the extent tree.  It is optional.
     99	 */
    100	int (*eo_insert_check)(struct ocfs2_extent_tree *et,
    101			       struct ocfs2_extent_rec *rec);
    102	int (*eo_sanity_check)(struct ocfs2_extent_tree *et);
    103
    104	/*
    105	 * --------------------------------------------------------------
    106	 * The remaining are internal to ocfs2_extent_tree and don't have
    107	 * accessor functions
    108	 */
    109
    110	/*
    111	 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
    112	 * It is required.
    113	 */
    114	void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
    115
    116	/*
    117	 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
    118	 * it exists.  If it does not, et->et_max_leaf_clusters is set
    119	 * to 0 (unlimited).  Optional.
    120	 */
    121	void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
    122
    123	/*
    124	 * ->eo_extent_contig test whether the 2 ocfs2_extent_rec
    125	 * are contiguous or not. Optional. Don't need to set it if use
    126	 * ocfs2_extent_rec as the tree leaf.
    127	 */
    128	enum ocfs2_contig_type
    129		(*eo_extent_contig)(struct ocfs2_extent_tree *et,
    130				    struct ocfs2_extent_rec *ext,
    131				    struct ocfs2_extent_rec *insert_rec);
    132};
    133
    134
    135/*
    136 * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
    137 * in the methods.
    138 */
    139static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
    140static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
    141					 u64 blkno);
    142static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
    143					 u32 clusters);
    144static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
    145					   struct ocfs2_extent_rec *rec);
    146static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
    147					     u32 clusters);
    148static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
    149				     struct ocfs2_extent_rec *rec);
    150static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
    151static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
    152
    153static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
    154					struct ocfs2_extent_tree *et,
    155					struct buffer_head **new_eb_bh,
    156					int blk_wanted, int *blk_given);
    157static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
    158
    159static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
    160	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
    161	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
    162	.eo_update_clusters	= ocfs2_dinode_update_clusters,
    163	.eo_extent_map_insert	= ocfs2_dinode_extent_map_insert,
    164	.eo_extent_map_truncate	= ocfs2_dinode_extent_map_truncate,
    165	.eo_insert_check	= ocfs2_dinode_insert_check,
    166	.eo_sanity_check	= ocfs2_dinode_sanity_check,
    167	.eo_fill_root_el	= ocfs2_dinode_fill_root_el,
    168};
    169
    170static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
    171					 u64 blkno)
    172{
    173	struct ocfs2_dinode *di = et->et_object;
    174
    175	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
    176	di->i_last_eb_blk = cpu_to_le64(blkno);
    177}
    178
    179static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
    180{
    181	struct ocfs2_dinode *di = et->et_object;
    182
    183	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
    184	return le64_to_cpu(di->i_last_eb_blk);
    185}
    186
    187static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
    188					 u32 clusters)
    189{
    190	struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
    191	struct ocfs2_dinode *di = et->et_object;
    192
    193	le32_add_cpu(&di->i_clusters, clusters);
    194	spin_lock(&oi->ip_lock);
    195	oi->ip_clusters = le32_to_cpu(di->i_clusters);
    196	spin_unlock(&oi->ip_lock);
    197}
    198
    199static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
    200					   struct ocfs2_extent_rec *rec)
    201{
    202	struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
    203
    204	ocfs2_extent_map_insert_rec(inode, rec);
    205}
    206
    207static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
    208					     u32 clusters)
    209{
    210	struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
    211
    212	ocfs2_extent_map_trunc(inode, clusters);
    213}
    214
    215static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
    216				     struct ocfs2_extent_rec *rec)
    217{
    218	struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
    219	struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
    220
    221	BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
    222	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
    223			(oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
    224			"Device %s, asking for sparse allocation: inode %llu, "
    225			"cpos %u, clusters %u\n",
    226			osb->dev_str,
    227			(unsigned long long)oi->ip_blkno,
    228			rec->e_cpos, oi->ip_clusters);
    229
    230	return 0;
    231}
    232
    233static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
    234{
    235	struct ocfs2_dinode *di = et->et_object;
    236
    237	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
    238	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
    239
    240	return 0;
    241}
    242
    243static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
    244{
    245	struct ocfs2_dinode *di = et->et_object;
    246
    247	et->et_root_el = &di->id2.i_list;
    248}
    249
    250
    251static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
    252{
    253	struct ocfs2_xattr_value_buf *vb = et->et_object;
    254
    255	et->et_root_el = &vb->vb_xv->xr_list;
    256}
    257
    258static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
    259					      u64 blkno)
    260{
    261	struct ocfs2_xattr_value_buf *vb = et->et_object;
    262
    263	vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
    264}
    265
    266static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
    267{
    268	struct ocfs2_xattr_value_buf *vb = et->et_object;
    269
    270	return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
    271}
    272
    273static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
    274					      u32 clusters)
    275{
    276	struct ocfs2_xattr_value_buf *vb = et->et_object;
    277
    278	le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
    279}
    280
    281static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
    282	.eo_set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
    283	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
    284	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
    285	.eo_fill_root_el	= ocfs2_xattr_value_fill_root_el,
    286};
    287
    288static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
    289{
    290	struct ocfs2_xattr_block *xb = et->et_object;
    291
    292	et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
    293}
    294
    295static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
    296{
    297	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
    298	et->et_max_leaf_clusters =
    299		ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
    300}
    301
    302static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
    303					     u64 blkno)
    304{
    305	struct ocfs2_xattr_block *xb = et->et_object;
    306	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
    307
    308	xt->xt_last_eb_blk = cpu_to_le64(blkno);
    309}
    310
    311static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
    312{
    313	struct ocfs2_xattr_block *xb = et->et_object;
    314	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
    315
    316	return le64_to_cpu(xt->xt_last_eb_blk);
    317}
    318
    319static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
    320					     u32 clusters)
    321{
    322	struct ocfs2_xattr_block *xb = et->et_object;
    323
    324	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
    325}
    326
    327static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
    328	.eo_set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
    329	.eo_get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
    330	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
    331	.eo_fill_root_el	= ocfs2_xattr_tree_fill_root_el,
    332	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
    333};
    334
    335static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
    336					  u64 blkno)
    337{
    338	struct ocfs2_dx_root_block *dx_root = et->et_object;
    339
    340	dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
    341}
    342
    343static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
    344{
    345	struct ocfs2_dx_root_block *dx_root = et->et_object;
    346
    347	return le64_to_cpu(dx_root->dr_last_eb_blk);
    348}
    349
    350static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
    351					  u32 clusters)
    352{
    353	struct ocfs2_dx_root_block *dx_root = et->et_object;
    354
    355	le32_add_cpu(&dx_root->dr_clusters, clusters);
    356}
    357
    358static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
    359{
    360	struct ocfs2_dx_root_block *dx_root = et->et_object;
    361
    362	BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
    363
    364	return 0;
    365}
    366
    367static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
    368{
    369	struct ocfs2_dx_root_block *dx_root = et->et_object;
    370
    371	et->et_root_el = &dx_root->dr_list;
    372}
    373
    374static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
    375	.eo_set_last_eb_blk	= ocfs2_dx_root_set_last_eb_blk,
    376	.eo_get_last_eb_blk	= ocfs2_dx_root_get_last_eb_blk,
    377	.eo_update_clusters	= ocfs2_dx_root_update_clusters,
    378	.eo_sanity_check	= ocfs2_dx_root_sanity_check,
    379	.eo_fill_root_el	= ocfs2_dx_root_fill_root_el,
    380};
    381
    382static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
    383{
    384	struct ocfs2_refcount_block *rb = et->et_object;
    385
    386	et->et_root_el = &rb->rf_list;
    387}
    388
    389static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
    390						u64 blkno)
    391{
    392	struct ocfs2_refcount_block *rb = et->et_object;
    393
    394	rb->rf_last_eb_blk = cpu_to_le64(blkno);
    395}
    396
    397static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
    398{
    399	struct ocfs2_refcount_block *rb = et->et_object;
    400
    401	return le64_to_cpu(rb->rf_last_eb_blk);
    402}
    403
    404static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
    405						u32 clusters)
    406{
    407	struct ocfs2_refcount_block *rb = et->et_object;
    408
    409	le32_add_cpu(&rb->rf_clusters, clusters);
    410}
    411
    412static enum ocfs2_contig_type
    413ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
    414				  struct ocfs2_extent_rec *ext,
    415				  struct ocfs2_extent_rec *insert_rec)
    416{
    417	return CONTIG_NONE;
    418}
    419
    420static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
    421	.eo_set_last_eb_blk	= ocfs2_refcount_tree_set_last_eb_blk,
    422	.eo_get_last_eb_blk	= ocfs2_refcount_tree_get_last_eb_blk,
    423	.eo_update_clusters	= ocfs2_refcount_tree_update_clusters,
    424	.eo_fill_root_el	= ocfs2_refcount_tree_fill_root_el,
    425	.eo_extent_contig	= ocfs2_refcount_tree_extent_contig,
    426};
    427
    428static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
    429				     struct ocfs2_caching_info *ci,
    430				     struct buffer_head *bh,
    431				     ocfs2_journal_access_func access,
    432				     void *obj,
    433				     const struct ocfs2_extent_tree_operations *ops)
    434{
    435	et->et_ops = ops;
    436	et->et_root_bh = bh;
    437	et->et_ci = ci;
    438	et->et_root_journal_access = access;
    439	if (!obj)
    440		obj = (void *)bh->b_data;
    441	et->et_object = obj;
    442	et->et_dealloc = NULL;
    443
    444	et->et_ops->eo_fill_root_el(et);
    445	if (!et->et_ops->eo_fill_max_leaf_clusters)
    446		et->et_max_leaf_clusters = 0;
    447	else
    448		et->et_ops->eo_fill_max_leaf_clusters(et);
    449}
    450
    451void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
    452				   struct ocfs2_caching_info *ci,
    453				   struct buffer_head *bh)
    454{
    455	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
    456				 NULL, &ocfs2_dinode_et_ops);
    457}
    458
    459void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
    460				       struct ocfs2_caching_info *ci,
    461				       struct buffer_head *bh)
    462{
    463	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
    464				 NULL, &ocfs2_xattr_tree_et_ops);
    465}
    466
    467void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
    468					struct ocfs2_caching_info *ci,
    469					struct ocfs2_xattr_value_buf *vb)
    470{
    471	__ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
    472				 &ocfs2_xattr_value_et_ops);
    473}
    474
    475void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
    476				    struct ocfs2_caching_info *ci,
    477				    struct buffer_head *bh)
    478{
    479	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
    480				 NULL, &ocfs2_dx_root_et_ops);
    481}
    482
    483void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
    484				     struct ocfs2_caching_info *ci,
    485				     struct buffer_head *bh)
    486{
    487	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
    488				 NULL, &ocfs2_refcount_tree_et_ops);
    489}
    490
    491static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
    492					    u64 new_last_eb_blk)
    493{
    494	et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
    495}
    496
    497static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
    498{
    499	return et->et_ops->eo_get_last_eb_blk(et);
    500}
    501
    502static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
    503					    u32 clusters)
    504{
    505	et->et_ops->eo_update_clusters(et, clusters);
    506}
    507
    508static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
    509					      struct ocfs2_extent_rec *rec)
    510{
    511	if (et->et_ops->eo_extent_map_insert)
    512		et->et_ops->eo_extent_map_insert(et, rec);
    513}
    514
    515static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
    516						u32 clusters)
    517{
    518	if (et->et_ops->eo_extent_map_truncate)
    519		et->et_ops->eo_extent_map_truncate(et, clusters);
    520}
    521
    522static inline int ocfs2_et_root_journal_access(handle_t *handle,
    523					       struct ocfs2_extent_tree *et,
    524					       int type)
    525{
    526	return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
    527					  type);
    528}
    529
    530static inline enum ocfs2_contig_type
    531	ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
    532			       struct ocfs2_extent_rec *rec,
    533			       struct ocfs2_extent_rec *insert_rec)
    534{
    535	if (et->et_ops->eo_extent_contig)
    536		return et->et_ops->eo_extent_contig(et, rec, insert_rec);
    537
    538	return ocfs2_extent_rec_contig(
    539				ocfs2_metadata_cache_get_super(et->et_ci),
    540				rec, insert_rec);
    541}
    542
    543static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
    544					struct ocfs2_extent_rec *rec)
    545{
    546	int ret = 0;
    547
    548	if (et->et_ops->eo_insert_check)
    549		ret = et->et_ops->eo_insert_check(et, rec);
    550	return ret;
    551}
    552
    553static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
    554{
    555	int ret = 0;
    556
    557	if (et->et_ops->eo_sanity_check)
    558		ret = et->et_ops->eo_sanity_check(et);
    559	return ret;
    560}
    561
    562static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
    563					 struct ocfs2_extent_block *eb);
    564static void ocfs2_adjust_rightmost_records(handle_t *handle,
    565					   struct ocfs2_extent_tree *et,
    566					   struct ocfs2_path *path,
    567					   struct ocfs2_extent_rec *insert_rec);
    568/*
    569 * Reset the actual path elements so that we can re-use the structure
    570 * to build another path. Generally, this involves freeing the buffer
    571 * heads.
    572 */
    573void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
    574{
    575	int i, start = 0, depth = 0;
    576	struct ocfs2_path_item *node;
    577
    578	if (keep_root)
    579		start = 1;
    580
    581	for(i = start; i < path_num_items(path); i++) {
    582		node = &path->p_node[i];
    583
    584		brelse(node->bh);
    585		node->bh = NULL;
    586		node->el = NULL;
    587	}
    588
    589	/*
    590	 * Tree depth may change during truncate, or insert. If we're
    591	 * keeping the root extent list, then make sure that our path
    592	 * structure reflects the proper depth.
    593	 */
    594	if (keep_root)
    595		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
    596	else
    597		path_root_access(path) = NULL;
    598
    599	path->p_tree_depth = depth;
    600}
    601
    602void ocfs2_free_path(struct ocfs2_path *path)
    603{
    604	if (path) {
    605		ocfs2_reinit_path(path, 0);
    606		kfree(path);
    607	}
    608}
    609
    610/*
    611 * All the elements of src into dest. After this call, src could be freed
    612 * without affecting dest.
    613 *
    614 * Both paths should have the same root. Any non-root elements of dest
    615 * will be freed.
    616 */
    617static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
    618{
    619	int i;
    620
    621	BUG_ON(path_root_bh(dest) != path_root_bh(src));
    622	BUG_ON(path_root_el(dest) != path_root_el(src));
    623	BUG_ON(path_root_access(dest) != path_root_access(src));
    624
    625	ocfs2_reinit_path(dest, 1);
    626
    627	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
    628		dest->p_node[i].bh = src->p_node[i].bh;
    629		dest->p_node[i].el = src->p_node[i].el;
    630
    631		if (dest->p_node[i].bh)
    632			get_bh(dest->p_node[i].bh);
    633	}
    634}
    635
    636/*
    637 * Make the *dest path the same as src and re-initialize src path to
    638 * have a root only.
    639 */
    640static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
    641{
    642	int i;
    643
    644	BUG_ON(path_root_bh(dest) != path_root_bh(src));
    645	BUG_ON(path_root_access(dest) != path_root_access(src));
    646
    647	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
    648		brelse(dest->p_node[i].bh);
    649
    650		dest->p_node[i].bh = src->p_node[i].bh;
    651		dest->p_node[i].el = src->p_node[i].el;
    652
    653		src->p_node[i].bh = NULL;
    654		src->p_node[i].el = NULL;
    655	}
    656}
    657
    658/*
    659 * Insert an extent block at given index.
    660 *
    661 * This will not take an additional reference on eb_bh.
    662 */
    663static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
    664					struct buffer_head *eb_bh)
    665{
    666	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
    667
    668	/*
    669	 * Right now, no root bh is an extent block, so this helps
    670	 * catch code errors with dinode trees. The assertion can be
    671	 * safely removed if we ever need to insert extent block
    672	 * structures at the root.
    673	 */
    674	BUG_ON(index == 0);
    675
    676	path->p_node[index].bh = eb_bh;
    677	path->p_node[index].el = &eb->h_list;
    678}
    679
    680static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
    681					 struct ocfs2_extent_list *root_el,
    682					 ocfs2_journal_access_func access)
    683{
    684	struct ocfs2_path *path;
    685
    686	BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
    687
    688	path = kzalloc(sizeof(*path), GFP_NOFS);
    689	if (path) {
    690		path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
    691		get_bh(root_bh);
    692		path_root_bh(path) = root_bh;
    693		path_root_el(path) = root_el;
    694		path_root_access(path) = access;
    695	}
    696
    697	return path;
    698}
    699
    700struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
    701{
    702	return ocfs2_new_path(path_root_bh(path), path_root_el(path),
    703			      path_root_access(path));
    704}
    705
    706struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
    707{
    708	return ocfs2_new_path(et->et_root_bh, et->et_root_el,
    709			      et->et_root_journal_access);
    710}
    711
    712/*
    713 * Journal the buffer at depth idx.  All idx>0 are extent_blocks,
    714 * otherwise it's the root_access function.
    715 *
    716 * I don't like the way this function's name looks next to
    717 * ocfs2_journal_access_path(), but I don't have a better one.
    718 */
    719int ocfs2_path_bh_journal_access(handle_t *handle,
    720				 struct ocfs2_caching_info *ci,
    721				 struct ocfs2_path *path,
    722				 int idx)
    723{
    724	ocfs2_journal_access_func access = path_root_access(path);
    725
    726	if (!access)
    727		access = ocfs2_journal_access;
    728
    729	if (idx)
    730		access = ocfs2_journal_access_eb;
    731
    732	return access(handle, ci, path->p_node[idx].bh,
    733		      OCFS2_JOURNAL_ACCESS_WRITE);
    734}
    735
    736/*
    737 * Convenience function to journal all components in a path.
    738 */
    739int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
    740			      handle_t *handle,
    741			      struct ocfs2_path *path)
    742{
    743	int i, ret = 0;
    744
    745	if (!path)
    746		goto out;
    747
    748	for(i = 0; i < path_num_items(path); i++) {
    749		ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
    750		if (ret < 0) {
    751			mlog_errno(ret);
    752			goto out;
    753		}
    754	}
    755
    756out:
    757	return ret;
    758}
    759
    760/*
    761 * Return the index of the extent record which contains cluster #v_cluster.
    762 * -1 is returned if it was not found.
    763 *
    764 * Should work fine on interior and exterior nodes.
    765 */
    766int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
    767{
    768	int ret = -1;
    769	int i;
    770	struct ocfs2_extent_rec *rec;
    771	u32 rec_end, rec_start, clusters;
    772
    773	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
    774		rec = &el->l_recs[i];
    775
    776		rec_start = le32_to_cpu(rec->e_cpos);
    777		clusters = ocfs2_rec_clusters(el, rec);
    778
    779		rec_end = rec_start + clusters;
    780
    781		if (v_cluster >= rec_start && v_cluster < rec_end) {
    782			ret = i;
    783			break;
    784		}
    785	}
    786
    787	return ret;
    788}
    789
    790/*
    791 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
    792 * ocfs2_extent_rec_contig only work properly against leaf nodes!
    793 */
    794static int ocfs2_block_extent_contig(struct super_block *sb,
    795				     struct ocfs2_extent_rec *ext,
    796				     u64 blkno)
    797{
    798	u64 blk_end = le64_to_cpu(ext->e_blkno);
    799
    800	blk_end += ocfs2_clusters_to_blocks(sb,
    801				    le16_to_cpu(ext->e_leaf_clusters));
    802
    803	return blkno == blk_end;
    804}
    805
    806static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
    807				  struct ocfs2_extent_rec *right)
    808{
    809	u32 left_range;
    810
    811	left_range = le32_to_cpu(left->e_cpos) +
    812		le16_to_cpu(left->e_leaf_clusters);
    813
    814	return (left_range == le32_to_cpu(right->e_cpos));
    815}
    816
    817static enum ocfs2_contig_type
    818	ocfs2_extent_rec_contig(struct super_block *sb,
    819				struct ocfs2_extent_rec *ext,
    820				struct ocfs2_extent_rec *insert_rec)
    821{
    822	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
    823
    824	/*
    825	 * Refuse to coalesce extent records with different flag
    826	 * fields - we don't want to mix unwritten extents with user
    827	 * data.
    828	 */
    829	if (ext->e_flags != insert_rec->e_flags)
    830		return CONTIG_NONE;
    831
    832	if (ocfs2_extents_adjacent(ext, insert_rec) &&
    833	    ocfs2_block_extent_contig(sb, ext, blkno))
    834			return CONTIG_RIGHT;
    835
    836	blkno = le64_to_cpu(ext->e_blkno);
    837	if (ocfs2_extents_adjacent(insert_rec, ext) &&
    838	    ocfs2_block_extent_contig(sb, insert_rec, blkno))
    839		return CONTIG_LEFT;
    840
    841	return CONTIG_NONE;
    842}
    843
    844/*
    845 * NOTE: We can have pretty much any combination of contiguousness and
    846 * appending.
    847 *
    848 * The usefulness of APPEND_TAIL is more in that it lets us know that
    849 * we'll have to update the path to that leaf.
    850 */
    851enum ocfs2_append_type {
    852	APPEND_NONE = 0,
    853	APPEND_TAIL,
    854};
    855
    856enum ocfs2_split_type {
    857	SPLIT_NONE = 0,
    858	SPLIT_LEFT,
    859	SPLIT_RIGHT,
    860};
    861
    862struct ocfs2_insert_type {
    863	enum ocfs2_split_type	ins_split;
    864	enum ocfs2_append_type	ins_appending;
    865	enum ocfs2_contig_type	ins_contig;
    866	int			ins_contig_index;
    867	int			ins_tree_depth;
    868};
    869
    870struct ocfs2_merge_ctxt {
    871	enum ocfs2_contig_type	c_contig_type;
    872	int			c_has_empty_extent;
    873	int			c_split_covers_rec;
    874};
    875
    876static int ocfs2_validate_extent_block(struct super_block *sb,
    877				       struct buffer_head *bh)
    878{
    879	int rc;
    880	struct ocfs2_extent_block *eb =
    881		(struct ocfs2_extent_block *)bh->b_data;
    882
    883	trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr);
    884
    885	BUG_ON(!buffer_uptodate(bh));
    886
    887	/*
    888	 * If the ecc fails, we return the error but otherwise
    889	 * leave the filesystem running.  We know any error is
    890	 * local to this block.
    891	 */
    892	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
    893	if (rc) {
    894		mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
    895		     (unsigned long long)bh->b_blocknr);
    896		return rc;
    897	}
    898
    899	/*
    900	 * Errors after here are fatal.
    901	 */
    902
    903	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
    904		rc = ocfs2_error(sb,
    905				 "Extent block #%llu has bad signature %.*s\n",
    906				 (unsigned long long)bh->b_blocknr, 7,
    907				 eb->h_signature);
    908		goto bail;
    909	}
    910
    911	if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
    912		rc = ocfs2_error(sb,
    913				 "Extent block #%llu has an invalid h_blkno of %llu\n",
    914				 (unsigned long long)bh->b_blocknr,
    915				 (unsigned long long)le64_to_cpu(eb->h_blkno));
    916		goto bail;
    917	}
    918
    919	if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation)
    920		rc = ocfs2_error(sb,
    921				 "Extent block #%llu has an invalid h_fs_generation of #%u\n",
    922				 (unsigned long long)bh->b_blocknr,
    923				 le32_to_cpu(eb->h_fs_generation));
    924bail:
    925	return rc;
    926}
    927
    928int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
    929			    struct buffer_head **bh)
    930{
    931	int rc;
    932	struct buffer_head *tmp = *bh;
    933
    934	rc = ocfs2_read_block(ci, eb_blkno, &tmp,
    935			      ocfs2_validate_extent_block);
    936
    937	/* If ocfs2_read_block() got us a new bh, pass it up. */
    938	if (!rc && !*bh)
    939		*bh = tmp;
    940
    941	return rc;
    942}
    943
    944
    945/*
    946 * How many free extents have we got before we need more meta data?
    947 */
    948int ocfs2_num_free_extents(struct ocfs2_extent_tree *et)
    949{
    950	int retval;
    951	struct ocfs2_extent_list *el = NULL;
    952	struct ocfs2_extent_block *eb;
    953	struct buffer_head *eb_bh = NULL;
    954	u64 last_eb_blk = 0;
    955
    956	el = et->et_root_el;
    957	last_eb_blk = ocfs2_et_get_last_eb_blk(et);
    958
    959	if (last_eb_blk) {
    960		retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
    961						 &eb_bh);
    962		if (retval < 0) {
    963			mlog_errno(retval);
    964			goto bail;
    965		}
    966		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
    967		el = &eb->h_list;
    968	}
    969
    970	BUG_ON(el->l_tree_depth != 0);
    971
    972	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
    973bail:
    974	brelse(eb_bh);
    975
    976	trace_ocfs2_num_free_extents(retval);
    977	return retval;
    978}
    979
    980/* expects array to already be allocated
    981 *
    982 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
    983 * l_count for you
    984 */
    985static int ocfs2_create_new_meta_bhs(handle_t *handle,
    986				     struct ocfs2_extent_tree *et,
    987				     int wanted,
    988				     struct ocfs2_alloc_context *meta_ac,
    989				     struct buffer_head *bhs[])
    990{
    991	int count, status, i;
    992	u16 suballoc_bit_start;
    993	u32 num_got;
    994	u64 suballoc_loc, first_blkno;
    995	struct ocfs2_super *osb =
    996		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
    997	struct ocfs2_extent_block *eb;
    998
    999	count = 0;
   1000	while (count < wanted) {
   1001		status = ocfs2_claim_metadata(handle,
   1002					      meta_ac,
   1003					      wanted - count,
   1004					      &suballoc_loc,
   1005					      &suballoc_bit_start,
   1006					      &num_got,
   1007					      &first_blkno);
   1008		if (status < 0) {
   1009			mlog_errno(status);
   1010			goto bail;
   1011		}
   1012
   1013		for(i = count;  i < (num_got + count); i++) {
   1014			bhs[i] = sb_getblk(osb->sb, first_blkno);
   1015			if (bhs[i] == NULL) {
   1016				status = -ENOMEM;
   1017				mlog_errno(status);
   1018				goto bail;
   1019			}
   1020			ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
   1021
   1022			status = ocfs2_journal_access_eb(handle, et->et_ci,
   1023							 bhs[i],
   1024							 OCFS2_JOURNAL_ACCESS_CREATE);
   1025			if (status < 0) {
   1026				mlog_errno(status);
   1027				goto bail;
   1028			}
   1029
   1030			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
   1031			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
   1032			/* Ok, setup the minimal stuff here. */
   1033			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
   1034			eb->h_blkno = cpu_to_le64(first_blkno);
   1035			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
   1036			eb->h_suballoc_slot =
   1037				cpu_to_le16(meta_ac->ac_alloc_slot);
   1038			eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
   1039			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
   1040			eb->h_list.l_count =
   1041				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
   1042
   1043			suballoc_bit_start++;
   1044			first_blkno++;
   1045
   1046			/* We'll also be dirtied by the caller, so
   1047			 * this isn't absolutely necessary. */
   1048			ocfs2_journal_dirty(handle, bhs[i]);
   1049		}
   1050
   1051		count += num_got;
   1052	}
   1053
   1054	status = 0;
   1055bail:
   1056	if (status < 0) {
   1057		for(i = 0; i < wanted; i++) {
   1058			brelse(bhs[i]);
   1059			bhs[i] = NULL;
   1060		}
   1061	}
   1062	return status;
   1063}
   1064
   1065/*
   1066 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
   1067 *
   1068 * Returns the sum of the rightmost extent rec logical offset and
   1069 * cluster count.
   1070 *
   1071 * ocfs2_add_branch() uses this to determine what logical cluster
   1072 * value should be populated into the leftmost new branch records.
   1073 *
   1074 * ocfs2_shift_tree_depth() uses this to determine the # clusters
   1075 * value for the new topmost tree record.
   1076 */
   1077static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
   1078{
   1079	int i;
   1080
   1081	i = le16_to_cpu(el->l_next_free_rec) - 1;
   1082
   1083	return le32_to_cpu(el->l_recs[i].e_cpos) +
   1084		ocfs2_rec_clusters(el, &el->l_recs[i]);
   1085}
   1086
   1087/*
   1088 * Change range of the branches in the right most path according to the leaf
   1089 * extent block's rightmost record.
   1090 */
   1091static int ocfs2_adjust_rightmost_branch(handle_t *handle,
   1092					 struct ocfs2_extent_tree *et)
   1093{
   1094	int status;
   1095	struct ocfs2_path *path = NULL;
   1096	struct ocfs2_extent_list *el;
   1097	struct ocfs2_extent_rec *rec;
   1098
   1099	path = ocfs2_new_path_from_et(et);
   1100	if (!path) {
   1101		status = -ENOMEM;
   1102		return status;
   1103	}
   1104
   1105	status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
   1106	if (status < 0) {
   1107		mlog_errno(status);
   1108		goto out;
   1109	}
   1110
   1111	status = ocfs2_extend_trans(handle, path_num_items(path));
   1112	if (status < 0) {
   1113		mlog_errno(status);
   1114		goto out;
   1115	}
   1116
   1117	status = ocfs2_journal_access_path(et->et_ci, handle, path);
   1118	if (status < 0) {
   1119		mlog_errno(status);
   1120		goto out;
   1121	}
   1122
   1123	el = path_leaf_el(path);
   1124	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1];
   1125
   1126	ocfs2_adjust_rightmost_records(handle, et, path, rec);
   1127
   1128out:
   1129	ocfs2_free_path(path);
   1130	return status;
   1131}
   1132
   1133/*
   1134 * Add an entire tree branch to our inode. eb_bh is the extent block
   1135 * to start at, if we don't want to start the branch at the root
   1136 * structure.
   1137 *
   1138 * last_eb_bh is required as we have to update it's next_leaf pointer
   1139 * for the new last extent block.
   1140 *
   1141 * the new branch will be 'empty' in the sense that every block will
   1142 * contain a single record with cluster count == 0.
   1143 */
   1144static int ocfs2_add_branch(handle_t *handle,
   1145			    struct ocfs2_extent_tree *et,
   1146			    struct buffer_head *eb_bh,
   1147			    struct buffer_head **last_eb_bh,
   1148			    struct ocfs2_alloc_context *meta_ac)
   1149{
   1150	int status, new_blocks, i, block_given = 0;
   1151	u64 next_blkno, new_last_eb_blk;
   1152	struct buffer_head *bh;
   1153	struct buffer_head **new_eb_bhs = NULL;
   1154	struct ocfs2_extent_block *eb;
   1155	struct ocfs2_extent_list  *eb_el;
   1156	struct ocfs2_extent_list  *el;
   1157	u32 new_cpos, root_end;
   1158
   1159	BUG_ON(!last_eb_bh || !*last_eb_bh);
   1160
   1161	if (eb_bh) {
   1162		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
   1163		el = &eb->h_list;
   1164	} else
   1165		el = et->et_root_el;
   1166
   1167	/* we never add a branch to a leaf. */
   1168	BUG_ON(!el->l_tree_depth);
   1169
   1170	new_blocks = le16_to_cpu(el->l_tree_depth);
   1171
   1172	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
   1173	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
   1174	root_end = ocfs2_sum_rightmost_rec(et->et_root_el);
   1175
   1176	/*
   1177	 * If there is a gap before the root end and the real end
   1178	 * of the righmost leaf block, we need to remove the gap
   1179	 * between new_cpos and root_end first so that the tree
   1180	 * is consistent after we add a new branch(it will start
   1181	 * from new_cpos).
   1182	 */
   1183	if (root_end > new_cpos) {
   1184		trace_ocfs2_adjust_rightmost_branch(
   1185			(unsigned long long)
   1186			ocfs2_metadata_cache_owner(et->et_ci),
   1187			root_end, new_cpos);
   1188
   1189		status = ocfs2_adjust_rightmost_branch(handle, et);
   1190		if (status) {
   1191			mlog_errno(status);
   1192			goto bail;
   1193		}
   1194	}
   1195
   1196	/* allocate the number of new eb blocks we need */
   1197	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
   1198			     GFP_KERNEL);
   1199	if (!new_eb_bhs) {
   1200		status = -ENOMEM;
   1201		mlog_errno(status);
   1202		goto bail;
   1203	}
   1204
   1205	/* Firstyly, try to reuse dealloc since we have already estimated how
   1206	 * many extent blocks we may use.
   1207	 */
   1208	if (!ocfs2_is_dealloc_empty(et)) {
   1209		status = ocfs2_reuse_blk_from_dealloc(handle, et,
   1210						      new_eb_bhs, new_blocks,
   1211						      &block_given);
   1212		if (status < 0) {
   1213			mlog_errno(status);
   1214			goto bail;
   1215		}
   1216	}
   1217
   1218	BUG_ON(block_given > new_blocks);
   1219
   1220	if (block_given < new_blocks) {
   1221		BUG_ON(!meta_ac);
   1222		status = ocfs2_create_new_meta_bhs(handle, et,
   1223						   new_blocks - block_given,
   1224						   meta_ac,
   1225						   &new_eb_bhs[block_given]);
   1226		if (status < 0) {
   1227			mlog_errno(status);
   1228			goto bail;
   1229		}
   1230	}
   1231
   1232	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
   1233	 * linked with the rest of the tree.
   1234	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
   1235	 *
   1236	 * when we leave the loop, new_last_eb_blk will point to the
   1237	 * newest leaf, and next_blkno will point to the topmost extent
   1238	 * block. */
   1239	next_blkno = new_last_eb_blk = 0;
   1240	for(i = 0; i < new_blocks; i++) {
   1241		bh = new_eb_bhs[i];
   1242		eb = (struct ocfs2_extent_block *) bh->b_data;
   1243		/* ocfs2_create_new_meta_bhs() should create it right! */
   1244		BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
   1245		eb_el = &eb->h_list;
   1246
   1247		status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
   1248						 OCFS2_JOURNAL_ACCESS_CREATE);
   1249		if (status < 0) {
   1250			mlog_errno(status);
   1251			goto bail;
   1252		}
   1253
   1254		eb->h_next_leaf_blk = 0;
   1255		eb_el->l_tree_depth = cpu_to_le16(i);
   1256		eb_el->l_next_free_rec = cpu_to_le16(1);
   1257		/*
   1258		 * This actually counts as an empty extent as
   1259		 * c_clusters == 0
   1260		 */
   1261		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
   1262		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
   1263		/*
   1264		 * eb_el isn't always an interior node, but even leaf
   1265		 * nodes want a zero'd flags and reserved field so
   1266		 * this gets the whole 32 bits regardless of use.
   1267		 */
   1268		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
   1269		if (!eb_el->l_tree_depth)
   1270			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
   1271
   1272		ocfs2_journal_dirty(handle, bh);
   1273		next_blkno = le64_to_cpu(eb->h_blkno);
   1274	}
   1275
   1276	/* This is a bit hairy. We want to update up to three blocks
   1277	 * here without leaving any of them in an inconsistent state
   1278	 * in case of error. We don't have to worry about
   1279	 * journal_dirty erroring as it won't unless we've aborted the
   1280	 * handle (in which case we would never be here) so reserving
   1281	 * the write with journal_access is all we need to do. */
   1282	status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
   1283					 OCFS2_JOURNAL_ACCESS_WRITE);
   1284	if (status < 0) {
   1285		mlog_errno(status);
   1286		goto bail;
   1287	}
   1288	status = ocfs2_et_root_journal_access(handle, et,
   1289					      OCFS2_JOURNAL_ACCESS_WRITE);
   1290	if (status < 0) {
   1291		mlog_errno(status);
   1292		goto bail;
   1293	}
   1294	if (eb_bh) {
   1295		status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
   1296						 OCFS2_JOURNAL_ACCESS_WRITE);
   1297		if (status < 0) {
   1298			mlog_errno(status);
   1299			goto bail;
   1300		}
   1301	}
   1302
   1303	/* Link the new branch into the rest of the tree (el will
   1304	 * either be on the root_bh, or the extent block passed in. */
   1305	i = le16_to_cpu(el->l_next_free_rec);
   1306	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
   1307	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
   1308	el->l_recs[i].e_int_clusters = 0;
   1309	le16_add_cpu(&el->l_next_free_rec, 1);
   1310
   1311	/* fe needs a new last extent block pointer, as does the
   1312	 * next_leaf on the previously last-extent-block. */
   1313	ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
   1314
   1315	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
   1316	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
   1317
   1318	ocfs2_journal_dirty(handle, *last_eb_bh);
   1319	ocfs2_journal_dirty(handle, et->et_root_bh);
   1320	if (eb_bh)
   1321		ocfs2_journal_dirty(handle, eb_bh);
   1322
   1323	/*
   1324	 * Some callers want to track the rightmost leaf so pass it
   1325	 * back here.
   1326	 */
   1327	brelse(*last_eb_bh);
   1328	get_bh(new_eb_bhs[0]);
   1329	*last_eb_bh = new_eb_bhs[0];
   1330
   1331	status = 0;
   1332bail:
   1333	if (new_eb_bhs) {
   1334		for (i = 0; i < new_blocks; i++)
   1335			brelse(new_eb_bhs[i]);
   1336		kfree(new_eb_bhs);
   1337	}
   1338
   1339	return status;
   1340}
   1341
   1342/*
   1343 * adds another level to the allocation tree.
   1344 * returns back the new extent block so you can add a branch to it
   1345 * after this call.
   1346 */
   1347static int ocfs2_shift_tree_depth(handle_t *handle,
   1348				  struct ocfs2_extent_tree *et,
   1349				  struct ocfs2_alloc_context *meta_ac,
   1350				  struct buffer_head **ret_new_eb_bh)
   1351{
   1352	int status, i, block_given = 0;
   1353	u32 new_clusters;
   1354	struct buffer_head *new_eb_bh = NULL;
   1355	struct ocfs2_extent_block *eb;
   1356	struct ocfs2_extent_list  *root_el;
   1357	struct ocfs2_extent_list  *eb_el;
   1358
   1359	if (!ocfs2_is_dealloc_empty(et)) {
   1360		status = ocfs2_reuse_blk_from_dealloc(handle, et,
   1361						      &new_eb_bh, 1,
   1362						      &block_given);
   1363	} else if (meta_ac) {
   1364		status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
   1365						   &new_eb_bh);
   1366
   1367	} else {
   1368		BUG();
   1369	}
   1370
   1371	if (status < 0) {
   1372		mlog_errno(status);
   1373		goto bail;
   1374	}
   1375
   1376	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
   1377	/* ocfs2_create_new_meta_bhs() should create it right! */
   1378	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
   1379
   1380	eb_el = &eb->h_list;
   1381	root_el = et->et_root_el;
   1382
   1383	status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
   1384					 OCFS2_JOURNAL_ACCESS_CREATE);
   1385	if (status < 0) {
   1386		mlog_errno(status);
   1387		goto bail;
   1388	}
   1389
   1390	/* copy the root extent list data into the new extent block */
   1391	eb_el->l_tree_depth = root_el->l_tree_depth;
   1392	eb_el->l_next_free_rec = root_el->l_next_free_rec;
   1393	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
   1394		eb_el->l_recs[i] = root_el->l_recs[i];
   1395
   1396	ocfs2_journal_dirty(handle, new_eb_bh);
   1397
   1398	status = ocfs2_et_root_journal_access(handle, et,
   1399					      OCFS2_JOURNAL_ACCESS_WRITE);
   1400	if (status < 0) {
   1401		mlog_errno(status);
   1402		goto bail;
   1403	}
   1404
   1405	new_clusters = ocfs2_sum_rightmost_rec(eb_el);
   1406
   1407	/* update root_bh now */
   1408	le16_add_cpu(&root_el->l_tree_depth, 1);
   1409	root_el->l_recs[0].e_cpos = 0;
   1410	root_el->l_recs[0].e_blkno = eb->h_blkno;
   1411	root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
   1412	for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
   1413		memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
   1414	root_el->l_next_free_rec = cpu_to_le16(1);
   1415
   1416	/* If this is our 1st tree depth shift, then last_eb_blk
   1417	 * becomes the allocated extent block */
   1418	if (root_el->l_tree_depth == cpu_to_le16(1))
   1419		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
   1420
   1421	ocfs2_journal_dirty(handle, et->et_root_bh);
   1422
   1423	*ret_new_eb_bh = new_eb_bh;
   1424	new_eb_bh = NULL;
   1425	status = 0;
   1426bail:
   1427	brelse(new_eb_bh);
   1428
   1429	return status;
   1430}
   1431
   1432/*
   1433 * Should only be called when there is no space left in any of the
   1434 * leaf nodes. What we want to do is find the lowest tree depth
   1435 * non-leaf extent block with room for new records. There are three
   1436 * valid results of this search:
   1437 *
   1438 * 1) a lowest extent block is found, then we pass it back in
   1439 *    *lowest_eb_bh and return '0'
   1440 *
   1441 * 2) the search fails to find anything, but the root_el has room. We
   1442 *    pass NULL back in *lowest_eb_bh, but still return '0'
   1443 *
   1444 * 3) the search fails to find anything AND the root_el is full, in
   1445 *    which case we return > 0
   1446 *
   1447 * return status < 0 indicates an error.
   1448 */
   1449static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
   1450				    struct buffer_head **target_bh)
   1451{
   1452	int status = 0, i;
   1453	u64 blkno;
   1454	struct ocfs2_extent_block *eb;
   1455	struct ocfs2_extent_list  *el;
   1456	struct buffer_head *bh = NULL;
   1457	struct buffer_head *lowest_bh = NULL;
   1458
   1459	*target_bh = NULL;
   1460
   1461	el = et->et_root_el;
   1462
   1463	while(le16_to_cpu(el->l_tree_depth) > 1) {
   1464		if (le16_to_cpu(el->l_next_free_rec) == 0) {
   1465			status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
   1466					"Owner %llu has empty extent list (next_free_rec == 0)\n",
   1467					(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
   1468			goto bail;
   1469		}
   1470		i = le16_to_cpu(el->l_next_free_rec) - 1;
   1471		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
   1472		if (!blkno) {
   1473			status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
   1474					"Owner %llu has extent list where extent # %d has no physical block start\n",
   1475					(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
   1476			goto bail;
   1477		}
   1478
   1479		brelse(bh);
   1480		bh = NULL;
   1481
   1482		status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
   1483		if (status < 0) {
   1484			mlog_errno(status);
   1485			goto bail;
   1486		}
   1487
   1488		eb = (struct ocfs2_extent_block *) bh->b_data;
   1489		el = &eb->h_list;
   1490
   1491		if (le16_to_cpu(el->l_next_free_rec) <
   1492		    le16_to_cpu(el->l_count)) {
   1493			brelse(lowest_bh);
   1494			lowest_bh = bh;
   1495			get_bh(lowest_bh);
   1496		}
   1497	}
   1498
   1499	/* If we didn't find one and the fe doesn't have any room,
   1500	 * then return '1' */
   1501	el = et->et_root_el;
   1502	if (!lowest_bh && (el->l_next_free_rec == el->l_count))
   1503		status = 1;
   1504
   1505	*target_bh = lowest_bh;
   1506bail:
   1507	brelse(bh);
   1508
   1509	return status;
   1510}
   1511
   1512/*
   1513 * Grow a b-tree so that it has more records.
   1514 *
   1515 * We might shift the tree depth in which case existing paths should
   1516 * be considered invalid.
   1517 *
   1518 * Tree depth after the grow is returned via *final_depth.
   1519 *
   1520 * *last_eb_bh will be updated by ocfs2_add_branch().
   1521 */
   1522static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
   1523			   int *final_depth, struct buffer_head **last_eb_bh,
   1524			   struct ocfs2_alloc_context *meta_ac)
   1525{
   1526	int ret, shift;
   1527	struct ocfs2_extent_list *el = et->et_root_el;
   1528	int depth = le16_to_cpu(el->l_tree_depth);
   1529	struct buffer_head *bh = NULL;
   1530
   1531	BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
   1532
   1533	shift = ocfs2_find_branch_target(et, &bh);
   1534	if (shift < 0) {
   1535		ret = shift;
   1536		mlog_errno(ret);
   1537		goto out;
   1538	}
   1539
   1540	/* We traveled all the way to the bottom of the allocation tree
   1541	 * and didn't find room for any more extents - we need to add
   1542	 * another tree level */
   1543	if (shift) {
   1544		BUG_ON(bh);
   1545		trace_ocfs2_grow_tree(
   1546			(unsigned long long)
   1547			ocfs2_metadata_cache_owner(et->et_ci),
   1548			depth);
   1549
   1550		/* ocfs2_shift_tree_depth will return us a buffer with
   1551		 * the new extent block (so we can pass that to
   1552		 * ocfs2_add_branch). */
   1553		ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
   1554		if (ret < 0) {
   1555			mlog_errno(ret);
   1556			goto out;
   1557		}
   1558		depth++;
   1559		if (depth == 1) {
   1560			/*
   1561			 * Special case: we have room now if we shifted from
   1562			 * tree_depth 0, so no more work needs to be done.
   1563			 *
   1564			 * We won't be calling add_branch, so pass
   1565			 * back *last_eb_bh as the new leaf. At depth
   1566			 * zero, it should always be null so there's
   1567			 * no reason to brelse.
   1568			 */
   1569			BUG_ON(*last_eb_bh);
   1570			get_bh(bh);
   1571			*last_eb_bh = bh;
   1572			goto out;
   1573		}
   1574	}
   1575
   1576	/* call ocfs2_add_branch to add the final part of the tree with
   1577	 * the new data. */
   1578	ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
   1579			       meta_ac);
   1580	if (ret < 0)
   1581		mlog_errno(ret);
   1582
   1583out:
   1584	if (final_depth)
   1585		*final_depth = depth;
   1586	brelse(bh);
   1587	return ret;
   1588}
   1589
   1590/*
   1591 * This function will discard the rightmost extent record.
   1592 */
   1593static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
   1594{
   1595	int next_free = le16_to_cpu(el->l_next_free_rec);
   1596	int count = le16_to_cpu(el->l_count);
   1597	unsigned int num_bytes;
   1598
   1599	BUG_ON(!next_free);
   1600	/* This will cause us to go off the end of our extent list. */
   1601	BUG_ON(next_free >= count);
   1602
   1603	num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
   1604
   1605	memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
   1606}
   1607
   1608static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
   1609			      struct ocfs2_extent_rec *insert_rec)
   1610{
   1611	int i, insert_index, next_free, has_empty, num_bytes;
   1612	u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
   1613	struct ocfs2_extent_rec *rec;
   1614
   1615	next_free = le16_to_cpu(el->l_next_free_rec);
   1616	has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
   1617
   1618	BUG_ON(!next_free);
   1619
   1620	/* The tree code before us didn't allow enough room in the leaf. */
   1621	BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
   1622
   1623	/*
   1624	 * The easiest way to approach this is to just remove the
   1625	 * empty extent and temporarily decrement next_free.
   1626	 */
   1627	if (has_empty) {
   1628		/*
   1629		 * If next_free was 1 (only an empty extent), this
   1630		 * loop won't execute, which is fine. We still want
   1631		 * the decrement above to happen.
   1632		 */
   1633		for(i = 0; i < (next_free - 1); i++)
   1634			el->l_recs[i] = el->l_recs[i+1];
   1635
   1636		next_free--;
   1637	}
   1638
   1639	/*
   1640	 * Figure out what the new record index should be.
   1641	 */
   1642	for(i = 0; i < next_free; i++) {
   1643		rec = &el->l_recs[i];
   1644
   1645		if (insert_cpos < le32_to_cpu(rec->e_cpos))
   1646			break;
   1647	}
   1648	insert_index = i;
   1649
   1650	trace_ocfs2_rotate_leaf(insert_cpos, insert_index,
   1651				has_empty, next_free,
   1652				le16_to_cpu(el->l_count));
   1653
   1654	BUG_ON(insert_index < 0);
   1655	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
   1656	BUG_ON(insert_index > next_free);
   1657
   1658	/*
   1659	 * No need to memmove if we're just adding to the tail.
   1660	 */
   1661	if (insert_index != next_free) {
   1662		BUG_ON(next_free >= le16_to_cpu(el->l_count));
   1663
   1664		num_bytes = next_free - insert_index;
   1665		num_bytes *= sizeof(struct ocfs2_extent_rec);
   1666		memmove(&el->l_recs[insert_index + 1],
   1667			&el->l_recs[insert_index],
   1668			num_bytes);
   1669	}
   1670
   1671	/*
   1672	 * Either we had an empty extent, and need to re-increment or
   1673	 * there was no empty extent on a non full rightmost leaf node,
   1674	 * in which case we still need to increment.
   1675	 */
   1676	next_free++;
   1677	el->l_next_free_rec = cpu_to_le16(next_free);
   1678	/*
   1679	 * Make sure none of the math above just messed up our tree.
   1680	 */
   1681	BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
   1682
   1683	el->l_recs[insert_index] = *insert_rec;
   1684
   1685}
   1686
   1687static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
   1688{
   1689	int size, num_recs = le16_to_cpu(el->l_next_free_rec);
   1690
   1691	BUG_ON(num_recs == 0);
   1692
   1693	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
   1694		num_recs--;
   1695		size = num_recs * sizeof(struct ocfs2_extent_rec);
   1696		memmove(&el->l_recs[0], &el->l_recs[1], size);
   1697		memset(&el->l_recs[num_recs], 0,
   1698		       sizeof(struct ocfs2_extent_rec));
   1699		el->l_next_free_rec = cpu_to_le16(num_recs);
   1700	}
   1701}
   1702
   1703/*
   1704 * Create an empty extent record .
   1705 *
   1706 * l_next_free_rec may be updated.
   1707 *
   1708 * If an empty extent already exists do nothing.
   1709 */
   1710static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
   1711{
   1712	int next_free = le16_to_cpu(el->l_next_free_rec);
   1713
   1714	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
   1715
   1716	if (next_free == 0)
   1717		goto set_and_inc;
   1718
   1719	if (ocfs2_is_empty_extent(&el->l_recs[0]))
   1720		return;
   1721
   1722	mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
   1723			"Asked to create an empty extent in a full list:\n"
   1724			"count = %u, tree depth = %u",
   1725			le16_to_cpu(el->l_count),
   1726			le16_to_cpu(el->l_tree_depth));
   1727
   1728	ocfs2_shift_records_right(el);
   1729
   1730set_and_inc:
   1731	le16_add_cpu(&el->l_next_free_rec, 1);
   1732	memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
   1733}
   1734
   1735/*
   1736 * For a rotation which involves two leaf nodes, the "root node" is
   1737 * the lowest level tree node which contains a path to both leafs. This
   1738 * resulting set of information can be used to form a complete "subtree"
   1739 *
   1740 * This function is passed two full paths from the dinode down to a
   1741 * pair of adjacent leaves. It's task is to figure out which path
   1742 * index contains the subtree root - this can be the root index itself
   1743 * in a worst-case rotation.
   1744 *
   1745 * The array index of the subtree root is passed back.
   1746 */
   1747int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
   1748			    struct ocfs2_path *left,
   1749			    struct ocfs2_path *right)
   1750{
   1751	int i = 0;
   1752
   1753	/*
   1754	 * Check that the caller passed in two paths from the same tree.
   1755	 */
   1756	BUG_ON(path_root_bh(left) != path_root_bh(right));
   1757
   1758	do {
   1759		i++;
   1760
   1761		/*
   1762		 * The caller didn't pass two adjacent paths.
   1763		 */
   1764		mlog_bug_on_msg(i > left->p_tree_depth,
   1765				"Owner %llu, left depth %u, right depth %u\n"
   1766				"left leaf blk %llu, right leaf blk %llu\n",
   1767				(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   1768				left->p_tree_depth, right->p_tree_depth,
   1769				(unsigned long long)path_leaf_bh(left)->b_blocknr,
   1770				(unsigned long long)path_leaf_bh(right)->b_blocknr);
   1771	} while (left->p_node[i].bh->b_blocknr ==
   1772		 right->p_node[i].bh->b_blocknr);
   1773
   1774	return i - 1;
   1775}
   1776
   1777typedef void (path_insert_t)(void *, struct buffer_head *);
   1778
   1779/*
   1780 * Traverse a btree path in search of cpos, starting at root_el.
   1781 *
   1782 * This code can be called with a cpos larger than the tree, in which
   1783 * case it will return the rightmost path.
   1784 */
   1785static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
   1786			     struct ocfs2_extent_list *root_el, u32 cpos,
   1787			     path_insert_t *func, void *data)
   1788{
   1789	int i, ret = 0;
   1790	u32 range;
   1791	u64 blkno;
   1792	struct buffer_head *bh = NULL;
   1793	struct ocfs2_extent_block *eb;
   1794	struct ocfs2_extent_list *el;
   1795	struct ocfs2_extent_rec *rec;
   1796
   1797	el = root_el;
   1798	while (el->l_tree_depth) {
   1799		if (le16_to_cpu(el->l_next_free_rec) == 0) {
   1800			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
   1801				    "Owner %llu has empty extent list at depth %u\n",
   1802				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
   1803				    le16_to_cpu(el->l_tree_depth));
   1804			ret = -EROFS;
   1805			goto out;
   1806
   1807		}
   1808
   1809		for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
   1810			rec = &el->l_recs[i];
   1811
   1812			/*
   1813			 * In the case that cpos is off the allocation
   1814			 * tree, this should just wind up returning the
   1815			 * rightmost record.
   1816			 */
   1817			range = le32_to_cpu(rec->e_cpos) +
   1818				ocfs2_rec_clusters(el, rec);
   1819			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
   1820			    break;
   1821		}
   1822
   1823		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
   1824		if (blkno == 0) {
   1825			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
   1826				    "Owner %llu has bad blkno in extent list at depth %u (index %d)\n",
   1827				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
   1828				    le16_to_cpu(el->l_tree_depth), i);
   1829			ret = -EROFS;
   1830			goto out;
   1831		}
   1832
   1833		brelse(bh);
   1834		bh = NULL;
   1835		ret = ocfs2_read_extent_block(ci, blkno, &bh);
   1836		if (ret) {
   1837			mlog_errno(ret);
   1838			goto out;
   1839		}
   1840
   1841		eb = (struct ocfs2_extent_block *) bh->b_data;
   1842		el = &eb->h_list;
   1843
   1844		if (le16_to_cpu(el->l_next_free_rec) >
   1845		    le16_to_cpu(el->l_count)) {
   1846			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
   1847				    "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n",
   1848				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
   1849				    (unsigned long long)bh->b_blocknr,
   1850				    le16_to_cpu(el->l_next_free_rec),
   1851				    le16_to_cpu(el->l_count));
   1852			ret = -EROFS;
   1853			goto out;
   1854		}
   1855
   1856		if (func)
   1857			func(data, bh);
   1858	}
   1859
   1860out:
   1861	/*
   1862	 * Catch any trailing bh that the loop didn't handle.
   1863	 */
   1864	brelse(bh);
   1865
   1866	return ret;
   1867}
   1868
   1869/*
   1870 * Given an initialized path (that is, it has a valid root extent
   1871 * list), this function will traverse the btree in search of the path
   1872 * which would contain cpos.
   1873 *
   1874 * The path traveled is recorded in the path structure.
   1875 *
   1876 * Note that this will not do any comparisons on leaf node extent
   1877 * records, so it will work fine in the case that we just added a tree
   1878 * branch.
   1879 */
   1880struct find_path_data {
   1881	int index;
   1882	struct ocfs2_path *path;
   1883};
   1884static void find_path_ins(void *data, struct buffer_head *bh)
   1885{
   1886	struct find_path_data *fp = data;
   1887
   1888	get_bh(bh);
   1889	ocfs2_path_insert_eb(fp->path, fp->index, bh);
   1890	fp->index++;
   1891}
   1892int ocfs2_find_path(struct ocfs2_caching_info *ci,
   1893		    struct ocfs2_path *path, u32 cpos)
   1894{
   1895	struct find_path_data data;
   1896
   1897	data.index = 1;
   1898	data.path = path;
   1899	return __ocfs2_find_path(ci, path_root_el(path), cpos,
   1900				 find_path_ins, &data);
   1901}
   1902
   1903static void find_leaf_ins(void *data, struct buffer_head *bh)
   1904{
   1905	struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
   1906	struct ocfs2_extent_list *el = &eb->h_list;
   1907	struct buffer_head **ret = data;
   1908
   1909	/* We want to retain only the leaf block. */
   1910	if (le16_to_cpu(el->l_tree_depth) == 0) {
   1911		get_bh(bh);
   1912		*ret = bh;
   1913	}
   1914}
   1915/*
   1916 * Find the leaf block in the tree which would contain cpos. No
   1917 * checking of the actual leaf is done.
   1918 *
   1919 * Some paths want to call this instead of allocating a path structure
   1920 * and calling ocfs2_find_path().
   1921 *
   1922 * This function doesn't handle non btree extent lists.
   1923 */
   1924int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
   1925		    struct ocfs2_extent_list *root_el, u32 cpos,
   1926		    struct buffer_head **leaf_bh)
   1927{
   1928	int ret;
   1929	struct buffer_head *bh = NULL;
   1930
   1931	ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
   1932	if (ret) {
   1933		mlog_errno(ret);
   1934		goto out;
   1935	}
   1936
   1937	*leaf_bh = bh;
   1938out:
   1939	return ret;
   1940}
   1941
   1942/*
   1943 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
   1944 *
   1945 * Basically, we've moved stuff around at the bottom of the tree and
   1946 * we need to fix up the extent records above the changes to reflect
   1947 * the new changes.
   1948 *
   1949 * left_rec: the record on the left.
   1950 * right_rec: the record to the right of left_rec
   1951 * right_child_el: is the child list pointed to by right_rec
   1952 *
   1953 * By definition, this only works on interior nodes.
   1954 */
   1955static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
   1956				  struct ocfs2_extent_rec *right_rec,
   1957				  struct ocfs2_extent_list *right_child_el)
   1958{
   1959	u32 left_clusters, right_end;
   1960
   1961	/*
   1962	 * Interior nodes never have holes. Their cpos is the cpos of
   1963	 * the leftmost record in their child list. Their cluster
   1964	 * count covers the full theoretical range of their child list
   1965	 * - the range between their cpos and the cpos of the record
   1966	 * immediately to their right.
   1967	 */
   1968	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
   1969	if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
   1970		BUG_ON(right_child_el->l_tree_depth);
   1971		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
   1972		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
   1973	}
   1974	left_clusters -= le32_to_cpu(left_rec->e_cpos);
   1975	left_rec->e_int_clusters = cpu_to_le32(left_clusters);
   1976
   1977	/*
   1978	 * Calculate the rightmost cluster count boundary before
   1979	 * moving cpos - we will need to adjust clusters after
   1980	 * updating e_cpos to keep the same highest cluster count.
   1981	 */
   1982	right_end = le32_to_cpu(right_rec->e_cpos);
   1983	right_end += le32_to_cpu(right_rec->e_int_clusters);
   1984
   1985	right_rec->e_cpos = left_rec->e_cpos;
   1986	le32_add_cpu(&right_rec->e_cpos, left_clusters);
   1987
   1988	right_end -= le32_to_cpu(right_rec->e_cpos);
   1989	right_rec->e_int_clusters = cpu_to_le32(right_end);
   1990}
   1991
   1992/*
   1993 * Adjust the adjacent root node records involved in a
   1994 * rotation. left_el_blkno is passed in as a key so that we can easily
   1995 * find it's index in the root list.
   1996 */
   1997static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
   1998				      struct ocfs2_extent_list *left_el,
   1999				      struct ocfs2_extent_list *right_el,
   2000				      u64 left_el_blkno)
   2001{
   2002	int i;
   2003
   2004	BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
   2005	       le16_to_cpu(left_el->l_tree_depth));
   2006
   2007	for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
   2008		if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
   2009			break;
   2010	}
   2011
   2012	/*
   2013	 * The path walking code should have never returned a root and
   2014	 * two paths which are not adjacent.
   2015	 */
   2016	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
   2017
   2018	ocfs2_adjust_adjacent_records(&root_el->l_recs[i],
   2019				      &root_el->l_recs[i + 1], right_el);
   2020}
   2021
   2022/*
   2023 * We've changed a leaf block (in right_path) and need to reflect that
   2024 * change back up the subtree.
   2025 *
   2026 * This happens in multiple places:
   2027 *   - When we've moved an extent record from the left path leaf to the right
   2028 *     path leaf to make room for an empty extent in the left path leaf.
   2029 *   - When our insert into the right path leaf is at the leftmost edge
   2030 *     and requires an update of the path immediately to it's left. This
   2031 *     can occur at the end of some types of rotation and appending inserts.
   2032 *   - When we've adjusted the last extent record in the left path leaf and the
   2033 *     1st extent record in the right path leaf during cross extent block merge.
   2034 */
   2035static void ocfs2_complete_edge_insert(handle_t *handle,
   2036				       struct ocfs2_path *left_path,
   2037				       struct ocfs2_path *right_path,
   2038				       int subtree_index)
   2039{
   2040	int i, idx;
   2041	struct ocfs2_extent_list *el, *left_el, *right_el;
   2042	struct ocfs2_extent_rec *left_rec, *right_rec;
   2043	struct buffer_head *root_bh;
   2044
   2045	/*
   2046	 * Update the counts and position values within all the
   2047	 * interior nodes to reflect the leaf rotation we just did.
   2048	 *
   2049	 * The root node is handled below the loop.
   2050	 *
   2051	 * We begin the loop with right_el and left_el pointing to the
   2052	 * leaf lists and work our way up.
   2053	 *
   2054	 * NOTE: within this loop, left_el and right_el always refer
   2055	 * to the *child* lists.
   2056	 */
   2057	left_el = path_leaf_el(left_path);
   2058	right_el = path_leaf_el(right_path);
   2059	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
   2060		trace_ocfs2_complete_edge_insert(i);
   2061
   2062		/*
   2063		 * One nice property of knowing that all of these
   2064		 * nodes are below the root is that we only deal with
   2065		 * the leftmost right node record and the rightmost
   2066		 * left node record.
   2067		 */
   2068		el = left_path->p_node[i].el;
   2069		idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
   2070		left_rec = &el->l_recs[idx];
   2071
   2072		el = right_path->p_node[i].el;
   2073		right_rec = &el->l_recs[0];
   2074
   2075		ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el);
   2076
   2077		ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
   2078		ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
   2079
   2080		/*
   2081		 * Setup our list pointers now so that the current
   2082		 * parents become children in the next iteration.
   2083		 */
   2084		left_el = left_path->p_node[i].el;
   2085		right_el = right_path->p_node[i].el;
   2086	}
   2087
   2088	/*
   2089	 * At the root node, adjust the two adjacent records which
   2090	 * begin our path to the leaves.
   2091	 */
   2092
   2093	el = left_path->p_node[subtree_index].el;
   2094	left_el = left_path->p_node[subtree_index + 1].el;
   2095	right_el = right_path->p_node[subtree_index + 1].el;
   2096
   2097	ocfs2_adjust_root_records(el, left_el, right_el,
   2098				  left_path->p_node[subtree_index + 1].bh->b_blocknr);
   2099
   2100	root_bh = left_path->p_node[subtree_index].bh;
   2101
   2102	ocfs2_journal_dirty(handle, root_bh);
   2103}
   2104
   2105static int ocfs2_rotate_subtree_right(handle_t *handle,
   2106				      struct ocfs2_extent_tree *et,
   2107				      struct ocfs2_path *left_path,
   2108				      struct ocfs2_path *right_path,
   2109				      int subtree_index)
   2110{
   2111	int ret, i;
   2112	struct buffer_head *right_leaf_bh;
   2113	struct buffer_head *left_leaf_bh = NULL;
   2114	struct buffer_head *root_bh;
   2115	struct ocfs2_extent_list *right_el, *left_el;
   2116	struct ocfs2_extent_rec move_rec;
   2117
   2118	left_leaf_bh = path_leaf_bh(left_path);
   2119	left_el = path_leaf_el(left_path);
   2120
   2121	if (left_el->l_next_free_rec != left_el->l_count) {
   2122		ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
   2123			    "Inode %llu has non-full interior leaf node %llu (next free = %u)\n",
   2124			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   2125			    (unsigned long long)left_leaf_bh->b_blocknr,
   2126			    le16_to_cpu(left_el->l_next_free_rec));
   2127		return -EROFS;
   2128	}
   2129
   2130	/*
   2131	 * This extent block may already have an empty record, so we
   2132	 * return early if so.
   2133	 */
   2134	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
   2135		return 0;
   2136
   2137	root_bh = left_path->p_node[subtree_index].bh;
   2138	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
   2139
   2140	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
   2141					   subtree_index);
   2142	if (ret) {
   2143		mlog_errno(ret);
   2144		goto out;
   2145	}
   2146
   2147	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
   2148		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
   2149						   right_path, i);
   2150		if (ret) {
   2151			mlog_errno(ret);
   2152			goto out;
   2153		}
   2154
   2155		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
   2156						   left_path, i);
   2157		if (ret) {
   2158			mlog_errno(ret);
   2159			goto out;
   2160		}
   2161	}
   2162
   2163	right_leaf_bh = path_leaf_bh(right_path);
   2164	right_el = path_leaf_el(right_path);
   2165
   2166	/* This is a code error, not a disk corruption. */
   2167	mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
   2168			"because rightmost leaf block %llu is empty\n",
   2169			(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   2170			(unsigned long long)right_leaf_bh->b_blocknr);
   2171
   2172	ocfs2_create_empty_extent(right_el);
   2173
   2174	ocfs2_journal_dirty(handle, right_leaf_bh);
   2175
   2176	/* Do the copy now. */
   2177	i = le16_to_cpu(left_el->l_next_free_rec) - 1;
   2178	move_rec = left_el->l_recs[i];
   2179	right_el->l_recs[0] = move_rec;
   2180
   2181	/*
   2182	 * Clear out the record we just copied and shift everything
   2183	 * over, leaving an empty extent in the left leaf.
   2184	 *
   2185	 * We temporarily subtract from next_free_rec so that the
   2186	 * shift will lose the tail record (which is now defunct).
   2187	 */
   2188	le16_add_cpu(&left_el->l_next_free_rec, -1);
   2189	ocfs2_shift_records_right(left_el);
   2190	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
   2191	le16_add_cpu(&left_el->l_next_free_rec, 1);
   2192
   2193	ocfs2_journal_dirty(handle, left_leaf_bh);
   2194
   2195	ocfs2_complete_edge_insert(handle, left_path, right_path,
   2196				   subtree_index);
   2197
   2198out:
   2199	return ret;
   2200}
   2201
   2202/*
   2203 * Given a full path, determine what cpos value would return us a path
   2204 * containing the leaf immediately to the left of the current one.
   2205 *
   2206 * Will return zero if the path passed in is already the leftmost path.
   2207 */
   2208int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
   2209				  struct ocfs2_path *path, u32 *cpos)
   2210{
   2211	int i, j, ret = 0;
   2212	u64 blkno;
   2213	struct ocfs2_extent_list *el;
   2214
   2215	BUG_ON(path->p_tree_depth == 0);
   2216
   2217	*cpos = 0;
   2218
   2219	blkno = path_leaf_bh(path)->b_blocknr;
   2220
   2221	/* Start at the tree node just above the leaf and work our way up. */
   2222	i = path->p_tree_depth - 1;
   2223	while (i >= 0) {
   2224		el = path->p_node[i].el;
   2225
   2226		/*
   2227		 * Find the extent record just before the one in our
   2228		 * path.
   2229		 */
   2230		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
   2231			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
   2232				if (j == 0) {
   2233					if (i == 0) {
   2234						/*
   2235						 * We've determined that the
   2236						 * path specified is already
   2237						 * the leftmost one - return a
   2238						 * cpos of zero.
   2239						 */
   2240						goto out;
   2241					}
   2242					/*
   2243					 * The leftmost record points to our
   2244					 * leaf - we need to travel up the
   2245					 * tree one level.
   2246					 */
   2247					goto next_node;
   2248				}
   2249
   2250				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
   2251				*cpos = *cpos + ocfs2_rec_clusters(el,
   2252							   &el->l_recs[j - 1]);
   2253				*cpos = *cpos - 1;
   2254				goto out;
   2255			}
   2256		}
   2257
   2258		/*
   2259		 * If we got here, we never found a valid node where
   2260		 * the tree indicated one should be.
   2261		 */
   2262		ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
   2263			    (unsigned long long)blkno);
   2264		ret = -EROFS;
   2265		goto out;
   2266
   2267next_node:
   2268		blkno = path->p_node[i].bh->b_blocknr;
   2269		i--;
   2270	}
   2271
   2272out:
   2273	return ret;
   2274}
   2275
   2276/*
   2277 * Extend the transaction by enough credits to complete the rotation,
   2278 * and still leave at least the original number of credits allocated
   2279 * to this transaction.
   2280 */
   2281static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
   2282					   int op_credits,
   2283					   struct ocfs2_path *path)
   2284{
   2285	int ret = 0;
   2286	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
   2287
   2288	if (jbd2_handle_buffer_credits(handle) < credits)
   2289		ret = ocfs2_extend_trans(handle,
   2290				credits - jbd2_handle_buffer_credits(handle));
   2291
   2292	return ret;
   2293}
   2294
   2295/*
   2296 * Trap the case where we're inserting into the theoretical range past
   2297 * the _actual_ left leaf range. Otherwise, we'll rotate a record
   2298 * whose cpos is less than ours into the right leaf.
   2299 *
   2300 * It's only necessary to look at the rightmost record of the left
   2301 * leaf because the logic that calls us should ensure that the
   2302 * theoretical ranges in the path components above the leaves are
   2303 * correct.
   2304 */
   2305static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
   2306						 u32 insert_cpos)
   2307{
   2308	struct ocfs2_extent_list *left_el;
   2309	struct ocfs2_extent_rec *rec;
   2310	int next_free;
   2311
   2312	left_el = path_leaf_el(left_path);
   2313	next_free = le16_to_cpu(left_el->l_next_free_rec);
   2314	rec = &left_el->l_recs[next_free - 1];
   2315
   2316	if (insert_cpos > le32_to_cpu(rec->e_cpos))
   2317		return 1;
   2318	return 0;
   2319}
   2320
   2321static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
   2322{
   2323	int next_free = le16_to_cpu(el->l_next_free_rec);
   2324	unsigned int range;
   2325	struct ocfs2_extent_rec *rec;
   2326
   2327	if (next_free == 0)
   2328		return 0;
   2329
   2330	rec = &el->l_recs[0];
   2331	if (ocfs2_is_empty_extent(rec)) {
   2332		/* Empty list. */
   2333		if (next_free == 1)
   2334			return 0;
   2335		rec = &el->l_recs[1];
   2336	}
   2337
   2338	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
   2339	if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
   2340		return 1;
   2341	return 0;
   2342}
   2343
   2344/*
   2345 * Rotate all the records in a btree right one record, starting at insert_cpos.
   2346 *
   2347 * The path to the rightmost leaf should be passed in.
   2348 *
   2349 * The array is assumed to be large enough to hold an entire path (tree depth).
   2350 *
   2351 * Upon successful return from this function:
   2352 *
   2353 * - The 'right_path' array will contain a path to the leaf block
   2354 *   whose range contains e_cpos.
   2355 * - That leaf block will have a single empty extent in list index 0.
   2356 * - In the case that the rotation requires a post-insert update,
   2357 *   *ret_left_path will contain a valid path which can be passed to
   2358 *   ocfs2_insert_path().
   2359 */
   2360static int ocfs2_rotate_tree_right(handle_t *handle,
   2361				   struct ocfs2_extent_tree *et,
   2362				   enum ocfs2_split_type split,
   2363				   u32 insert_cpos,
   2364				   struct ocfs2_path *right_path,
   2365				   struct ocfs2_path **ret_left_path)
   2366{
   2367	int ret, start, orig_credits = jbd2_handle_buffer_credits(handle);
   2368	u32 cpos;
   2369	struct ocfs2_path *left_path = NULL;
   2370	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
   2371
   2372	*ret_left_path = NULL;
   2373
   2374	left_path = ocfs2_new_path_from_path(right_path);
   2375	if (!left_path) {
   2376		ret = -ENOMEM;
   2377		mlog_errno(ret);
   2378		goto out;
   2379	}
   2380
   2381	ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
   2382	if (ret) {
   2383		mlog_errno(ret);
   2384		goto out;
   2385	}
   2386
   2387	trace_ocfs2_rotate_tree_right(
   2388		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   2389		insert_cpos, cpos);
   2390
   2391	/*
   2392	 * What we want to do here is:
   2393	 *
   2394	 * 1) Start with the rightmost path.
   2395	 *
   2396	 * 2) Determine a path to the leaf block directly to the left
   2397	 *    of that leaf.
   2398	 *
   2399	 * 3) Determine the 'subtree root' - the lowest level tree node
   2400	 *    which contains a path to both leaves.
   2401	 *
   2402	 * 4) Rotate the subtree.
   2403	 *
   2404	 * 5) Find the next subtree by considering the left path to be
   2405	 *    the new right path.
   2406	 *
   2407	 * The check at the top of this while loop also accepts
   2408	 * insert_cpos == cpos because cpos is only a _theoretical_
   2409	 * value to get us the left path - insert_cpos might very well
   2410	 * be filling that hole.
   2411	 *
   2412	 * Stop at a cpos of '0' because we either started at the
   2413	 * leftmost branch (i.e., a tree with one branch and a
   2414	 * rotation inside of it), or we've gone as far as we can in
   2415	 * rotating subtrees.
   2416	 */
   2417	while (cpos && insert_cpos <= cpos) {
   2418		trace_ocfs2_rotate_tree_right(
   2419			(unsigned long long)
   2420			ocfs2_metadata_cache_owner(et->et_ci),
   2421			insert_cpos, cpos);
   2422
   2423		ret = ocfs2_find_path(et->et_ci, left_path, cpos);
   2424		if (ret) {
   2425			mlog_errno(ret);
   2426			goto out;
   2427		}
   2428
   2429		mlog_bug_on_msg(path_leaf_bh(left_path) ==
   2430				path_leaf_bh(right_path),
   2431				"Owner %llu: error during insert of %u "
   2432				"(left path cpos %u) results in two identical "
   2433				"paths ending at %llu\n",
   2434				(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   2435				insert_cpos, cpos,
   2436				(unsigned long long)
   2437				path_leaf_bh(left_path)->b_blocknr);
   2438
   2439		if (split == SPLIT_NONE &&
   2440		    ocfs2_rotate_requires_path_adjustment(left_path,
   2441							  insert_cpos)) {
   2442
   2443			/*
   2444			 * We've rotated the tree as much as we
   2445			 * should. The rest is up to
   2446			 * ocfs2_insert_path() to complete, after the
   2447			 * record insertion. We indicate this
   2448			 * situation by returning the left path.
   2449			 *
   2450			 * The reason we don't adjust the records here
   2451			 * before the record insert is that an error
   2452			 * later might break the rule where a parent
   2453			 * record e_cpos will reflect the actual
   2454			 * e_cpos of the 1st nonempty record of the
   2455			 * child list.
   2456			 */
   2457			*ret_left_path = left_path;
   2458			goto out_ret_path;
   2459		}
   2460
   2461		start = ocfs2_find_subtree_root(et, left_path, right_path);
   2462
   2463		trace_ocfs2_rotate_subtree(start,
   2464			(unsigned long long)
   2465			right_path->p_node[start].bh->b_blocknr,
   2466			right_path->p_tree_depth);
   2467
   2468		ret = ocfs2_extend_rotate_transaction(handle, start,
   2469						      orig_credits, right_path);
   2470		if (ret) {
   2471			mlog_errno(ret);
   2472			goto out;
   2473		}
   2474
   2475		ret = ocfs2_rotate_subtree_right(handle, et, left_path,
   2476						 right_path, start);
   2477		if (ret) {
   2478			mlog_errno(ret);
   2479			goto out;
   2480		}
   2481
   2482		if (split != SPLIT_NONE &&
   2483		    ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
   2484						insert_cpos)) {
   2485			/*
   2486			 * A rotate moves the rightmost left leaf
   2487			 * record over to the leftmost right leaf
   2488			 * slot. If we're doing an extent split
   2489			 * instead of a real insert, then we have to
   2490			 * check that the extent to be split wasn't
   2491			 * just moved over. If it was, then we can
   2492			 * exit here, passing left_path back -
   2493			 * ocfs2_split_extent() is smart enough to
   2494			 * search both leaves.
   2495			 */
   2496			*ret_left_path = left_path;
   2497			goto out_ret_path;
   2498		}
   2499
   2500		/*
   2501		 * There is no need to re-read the next right path
   2502		 * as we know that it'll be our current left
   2503		 * path. Optimize by copying values instead.
   2504		 */
   2505		ocfs2_mv_path(right_path, left_path);
   2506
   2507		ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
   2508		if (ret) {
   2509			mlog_errno(ret);
   2510			goto out;
   2511		}
   2512	}
   2513
   2514out:
   2515	ocfs2_free_path(left_path);
   2516
   2517out_ret_path:
   2518	return ret;
   2519}
   2520
   2521static int ocfs2_update_edge_lengths(handle_t *handle,
   2522				     struct ocfs2_extent_tree *et,
   2523				     struct ocfs2_path *path)
   2524{
   2525	int i, idx, ret;
   2526	struct ocfs2_extent_rec *rec;
   2527	struct ocfs2_extent_list *el;
   2528	struct ocfs2_extent_block *eb;
   2529	u32 range;
   2530
   2531	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
   2532	if (ret) {
   2533		mlog_errno(ret);
   2534		goto out;
   2535	}
   2536
   2537	/* Path should always be rightmost. */
   2538	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
   2539	BUG_ON(eb->h_next_leaf_blk != 0ULL);
   2540
   2541	el = &eb->h_list;
   2542	BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
   2543	idx = le16_to_cpu(el->l_next_free_rec) - 1;
   2544	rec = &el->l_recs[idx];
   2545	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
   2546
   2547	for (i = 0; i < path->p_tree_depth; i++) {
   2548		el = path->p_node[i].el;
   2549		idx = le16_to_cpu(el->l_next_free_rec) - 1;
   2550		rec = &el->l_recs[idx];
   2551
   2552		rec->e_int_clusters = cpu_to_le32(range);
   2553		le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
   2554
   2555		ocfs2_journal_dirty(handle, path->p_node[i].bh);
   2556	}
   2557out:
   2558	return ret;
   2559}
   2560
   2561static void ocfs2_unlink_path(handle_t *handle,
   2562			      struct ocfs2_extent_tree *et,
   2563			      struct ocfs2_cached_dealloc_ctxt *dealloc,
   2564			      struct ocfs2_path *path, int unlink_start)
   2565{
   2566	int ret, i;
   2567	struct ocfs2_extent_block *eb;
   2568	struct ocfs2_extent_list *el;
   2569	struct buffer_head *bh;
   2570
   2571	for(i = unlink_start; i < path_num_items(path); i++) {
   2572		bh = path->p_node[i].bh;
   2573
   2574		eb = (struct ocfs2_extent_block *)bh->b_data;
   2575		/*
   2576		 * Not all nodes might have had their final count
   2577		 * decremented by the caller - handle this here.
   2578		 */
   2579		el = &eb->h_list;
   2580		if (le16_to_cpu(el->l_next_free_rec) > 1) {
   2581			mlog(ML_ERROR,
   2582			     "Inode %llu, attempted to remove extent block "
   2583			     "%llu with %u records\n",
   2584			     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   2585			     (unsigned long long)le64_to_cpu(eb->h_blkno),
   2586			     le16_to_cpu(el->l_next_free_rec));
   2587
   2588			ocfs2_journal_dirty(handle, bh);
   2589			ocfs2_remove_from_cache(et->et_ci, bh);
   2590			continue;
   2591		}
   2592
   2593		el->l_next_free_rec = 0;
   2594		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
   2595
   2596		ocfs2_journal_dirty(handle, bh);
   2597
   2598		ret = ocfs2_cache_extent_block_free(dealloc, eb);
   2599		if (ret)
   2600			mlog_errno(ret);
   2601
   2602		ocfs2_remove_from_cache(et->et_ci, bh);
   2603	}
   2604}
   2605
   2606static void ocfs2_unlink_subtree(handle_t *handle,
   2607				 struct ocfs2_extent_tree *et,
   2608				 struct ocfs2_path *left_path,
   2609				 struct ocfs2_path *right_path,
   2610				 int subtree_index,
   2611				 struct ocfs2_cached_dealloc_ctxt *dealloc)
   2612{
   2613	int i;
   2614	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
   2615	struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
   2616	struct ocfs2_extent_block *eb;
   2617
   2618	eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
   2619
   2620	for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
   2621		if (root_el->l_recs[i].e_blkno == eb->h_blkno)
   2622			break;
   2623
   2624	BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
   2625
   2626	memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
   2627	le16_add_cpu(&root_el->l_next_free_rec, -1);
   2628
   2629	eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
   2630	eb->h_next_leaf_blk = 0;
   2631
   2632	ocfs2_journal_dirty(handle, root_bh);
   2633	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
   2634
   2635	ocfs2_unlink_path(handle, et, dealloc, right_path,
   2636			  subtree_index + 1);
   2637}
   2638
   2639static int ocfs2_rotate_subtree_left(handle_t *handle,
   2640				     struct ocfs2_extent_tree *et,
   2641				     struct ocfs2_path *left_path,
   2642				     struct ocfs2_path *right_path,
   2643				     int subtree_index,
   2644				     struct ocfs2_cached_dealloc_ctxt *dealloc,
   2645				     int *deleted)
   2646{
   2647	int ret, i, del_right_subtree = 0, right_has_empty = 0;
   2648	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
   2649	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
   2650	struct ocfs2_extent_block *eb;
   2651
   2652	*deleted = 0;
   2653
   2654	right_leaf_el = path_leaf_el(right_path);
   2655	left_leaf_el = path_leaf_el(left_path);
   2656	root_bh = left_path->p_node[subtree_index].bh;
   2657	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
   2658
   2659	if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
   2660		return 0;
   2661
   2662	eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
   2663	if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
   2664		/*
   2665		 * It's legal for us to proceed if the right leaf is
   2666		 * the rightmost one and it has an empty extent. There
   2667		 * are two cases to handle - whether the leaf will be
   2668		 * empty after removal or not. If the leaf isn't empty
   2669		 * then just remove the empty extent up front. The
   2670		 * next block will handle empty leaves by flagging
   2671		 * them for unlink.
   2672		 *
   2673		 * Non rightmost leaves will throw -EAGAIN and the
   2674		 * caller can manually move the subtree and retry.
   2675		 */
   2676
   2677		if (eb->h_next_leaf_blk != 0ULL)
   2678			return -EAGAIN;
   2679
   2680		if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
   2681			ret = ocfs2_journal_access_eb(handle, et->et_ci,
   2682						      path_leaf_bh(right_path),
   2683						      OCFS2_JOURNAL_ACCESS_WRITE);
   2684			if (ret) {
   2685				mlog_errno(ret);
   2686				goto out;
   2687			}
   2688
   2689			ocfs2_remove_empty_extent(right_leaf_el);
   2690		} else
   2691			right_has_empty = 1;
   2692	}
   2693
   2694	if (eb->h_next_leaf_blk == 0ULL &&
   2695	    le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
   2696		/*
   2697		 * We have to update i_last_eb_blk during the meta
   2698		 * data delete.
   2699		 */
   2700		ret = ocfs2_et_root_journal_access(handle, et,
   2701						   OCFS2_JOURNAL_ACCESS_WRITE);
   2702		if (ret) {
   2703			mlog_errno(ret);
   2704			goto out;
   2705		}
   2706
   2707		del_right_subtree = 1;
   2708	}
   2709
   2710	/*
   2711	 * Getting here with an empty extent in the right path implies
   2712	 * that it's the rightmost path and will be deleted.
   2713	 */
   2714	BUG_ON(right_has_empty && !del_right_subtree);
   2715
   2716	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
   2717					   subtree_index);
   2718	if (ret) {
   2719		mlog_errno(ret);
   2720		goto out;
   2721	}
   2722
   2723	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
   2724		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
   2725						   right_path, i);
   2726		if (ret) {
   2727			mlog_errno(ret);
   2728			goto out;
   2729		}
   2730
   2731		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
   2732						   left_path, i);
   2733		if (ret) {
   2734			mlog_errno(ret);
   2735			goto out;
   2736		}
   2737	}
   2738
   2739	if (!right_has_empty) {
   2740		/*
   2741		 * Only do this if we're moving a real
   2742		 * record. Otherwise, the action is delayed until
   2743		 * after removal of the right path in which case we
   2744		 * can do a simple shift to remove the empty extent.
   2745		 */
   2746		ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
   2747		memset(&right_leaf_el->l_recs[0], 0,
   2748		       sizeof(struct ocfs2_extent_rec));
   2749	}
   2750	if (eb->h_next_leaf_blk == 0ULL) {
   2751		/*
   2752		 * Move recs over to get rid of empty extent, decrease
   2753		 * next_free. This is allowed to remove the last
   2754		 * extent in our leaf (setting l_next_free_rec to
   2755		 * zero) - the delete code below won't care.
   2756		 */
   2757		ocfs2_remove_empty_extent(right_leaf_el);
   2758	}
   2759
   2760	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
   2761	ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
   2762
   2763	if (del_right_subtree) {
   2764		ocfs2_unlink_subtree(handle, et, left_path, right_path,
   2765				     subtree_index, dealloc);
   2766		ret = ocfs2_update_edge_lengths(handle, et, left_path);
   2767		if (ret) {
   2768			mlog_errno(ret);
   2769			goto out;
   2770		}
   2771
   2772		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
   2773		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
   2774
   2775		/*
   2776		 * Removal of the extent in the left leaf was skipped
   2777		 * above so we could delete the right path
   2778		 * 1st.
   2779		 */
   2780		if (right_has_empty)
   2781			ocfs2_remove_empty_extent(left_leaf_el);
   2782
   2783		ocfs2_journal_dirty(handle, et_root_bh);
   2784
   2785		*deleted = 1;
   2786	} else
   2787		ocfs2_complete_edge_insert(handle, left_path, right_path,
   2788					   subtree_index);
   2789
   2790out:
   2791	return ret;
   2792}
   2793
   2794/*
   2795 * Given a full path, determine what cpos value would return us a path
   2796 * containing the leaf immediately to the right of the current one.
   2797 *
   2798 * Will return zero if the path passed in is already the rightmost path.
   2799 *
   2800 * This looks similar, but is subtly different to
   2801 * ocfs2_find_cpos_for_left_leaf().
   2802 */
   2803int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
   2804				   struct ocfs2_path *path, u32 *cpos)
   2805{
   2806	int i, j, ret = 0;
   2807	u64 blkno;
   2808	struct ocfs2_extent_list *el;
   2809
   2810	*cpos = 0;
   2811
   2812	if (path->p_tree_depth == 0)
   2813		return 0;
   2814
   2815	blkno = path_leaf_bh(path)->b_blocknr;
   2816
   2817	/* Start at the tree node just above the leaf and work our way up. */
   2818	i = path->p_tree_depth - 1;
   2819	while (i >= 0) {
   2820		int next_free;
   2821
   2822		el = path->p_node[i].el;
   2823
   2824		/*
   2825		 * Find the extent record just after the one in our
   2826		 * path.
   2827		 */
   2828		next_free = le16_to_cpu(el->l_next_free_rec);
   2829		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
   2830			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
   2831				if (j == (next_free - 1)) {
   2832					if (i == 0) {
   2833						/*
   2834						 * We've determined that the
   2835						 * path specified is already
   2836						 * the rightmost one - return a
   2837						 * cpos of zero.
   2838						 */
   2839						goto out;
   2840					}
   2841					/*
   2842					 * The rightmost record points to our
   2843					 * leaf - we need to travel up the
   2844					 * tree one level.
   2845					 */
   2846					goto next_node;
   2847				}
   2848
   2849				*cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
   2850				goto out;
   2851			}
   2852		}
   2853
   2854		/*
   2855		 * If we got here, we never found a valid node where
   2856		 * the tree indicated one should be.
   2857		 */
   2858		ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
   2859			    (unsigned long long)blkno);
   2860		ret = -EROFS;
   2861		goto out;
   2862
   2863next_node:
   2864		blkno = path->p_node[i].bh->b_blocknr;
   2865		i--;
   2866	}
   2867
   2868out:
   2869	return ret;
   2870}
   2871
   2872static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
   2873					    struct ocfs2_extent_tree *et,
   2874					    struct ocfs2_path *path)
   2875{
   2876	int ret;
   2877	struct buffer_head *bh = path_leaf_bh(path);
   2878	struct ocfs2_extent_list *el = path_leaf_el(path);
   2879
   2880	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
   2881		return 0;
   2882
   2883	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
   2884					   path_num_items(path) - 1);
   2885	if (ret) {
   2886		mlog_errno(ret);
   2887		goto out;
   2888	}
   2889
   2890	ocfs2_remove_empty_extent(el);
   2891	ocfs2_journal_dirty(handle, bh);
   2892
   2893out:
   2894	return ret;
   2895}
   2896
   2897static int __ocfs2_rotate_tree_left(handle_t *handle,
   2898				    struct ocfs2_extent_tree *et,
   2899				    int orig_credits,
   2900				    struct ocfs2_path *path,
   2901				    struct ocfs2_cached_dealloc_ctxt *dealloc,
   2902				    struct ocfs2_path **empty_extent_path)
   2903{
   2904	int ret, subtree_root, deleted;
   2905	u32 right_cpos;
   2906	struct ocfs2_path *left_path = NULL;
   2907	struct ocfs2_path *right_path = NULL;
   2908	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
   2909
   2910	if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])))
   2911		return 0;
   2912
   2913	*empty_extent_path = NULL;
   2914
   2915	ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
   2916	if (ret) {
   2917		mlog_errno(ret);
   2918		goto out;
   2919	}
   2920
   2921	left_path = ocfs2_new_path_from_path(path);
   2922	if (!left_path) {
   2923		ret = -ENOMEM;
   2924		mlog_errno(ret);
   2925		goto out;
   2926	}
   2927
   2928	ocfs2_cp_path(left_path, path);
   2929
   2930	right_path = ocfs2_new_path_from_path(path);
   2931	if (!right_path) {
   2932		ret = -ENOMEM;
   2933		mlog_errno(ret);
   2934		goto out;
   2935	}
   2936
   2937	while (right_cpos) {
   2938		ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
   2939		if (ret) {
   2940			mlog_errno(ret);
   2941			goto out;
   2942		}
   2943
   2944		subtree_root = ocfs2_find_subtree_root(et, left_path,
   2945						       right_path);
   2946
   2947		trace_ocfs2_rotate_subtree(subtree_root,
   2948		     (unsigned long long)
   2949		     right_path->p_node[subtree_root].bh->b_blocknr,
   2950		     right_path->p_tree_depth);
   2951
   2952		ret = ocfs2_extend_rotate_transaction(handle, 0,
   2953						      orig_credits, left_path);
   2954		if (ret) {
   2955			mlog_errno(ret);
   2956			goto out;
   2957		}
   2958
   2959		/*
   2960		 * Caller might still want to make changes to the
   2961		 * tree root, so re-add it to the journal here.
   2962		 */
   2963		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
   2964						   left_path, 0);
   2965		if (ret) {
   2966			mlog_errno(ret);
   2967			goto out;
   2968		}
   2969
   2970		ret = ocfs2_rotate_subtree_left(handle, et, left_path,
   2971						right_path, subtree_root,
   2972						dealloc, &deleted);
   2973		if (ret == -EAGAIN) {
   2974			/*
   2975			 * The rotation has to temporarily stop due to
   2976			 * the right subtree having an empty
   2977			 * extent. Pass it back to the caller for a
   2978			 * fixup.
   2979			 */
   2980			*empty_extent_path = right_path;
   2981			right_path = NULL;
   2982			goto out;
   2983		}
   2984		if (ret) {
   2985			mlog_errno(ret);
   2986			goto out;
   2987		}
   2988
   2989		/*
   2990		 * The subtree rotate might have removed records on
   2991		 * the rightmost edge. If so, then rotation is
   2992		 * complete.
   2993		 */
   2994		if (deleted)
   2995			break;
   2996
   2997		ocfs2_mv_path(left_path, right_path);
   2998
   2999		ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
   3000						     &right_cpos);
   3001		if (ret) {
   3002			mlog_errno(ret);
   3003			goto out;
   3004		}
   3005	}
   3006
   3007out:
   3008	ocfs2_free_path(right_path);
   3009	ocfs2_free_path(left_path);
   3010
   3011	return ret;
   3012}
   3013
   3014static int ocfs2_remove_rightmost_path(handle_t *handle,
   3015				struct ocfs2_extent_tree *et,
   3016				struct ocfs2_path *path,
   3017				struct ocfs2_cached_dealloc_ctxt *dealloc)
   3018{
   3019	int ret, subtree_index;
   3020	u32 cpos;
   3021	struct ocfs2_path *left_path = NULL;
   3022	struct ocfs2_extent_block *eb;
   3023	struct ocfs2_extent_list *el;
   3024
   3025	ret = ocfs2_et_sanity_check(et);
   3026	if (ret)
   3027		goto out;
   3028
   3029	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
   3030	if (ret) {
   3031		mlog_errno(ret);
   3032		goto out;
   3033	}
   3034
   3035	ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
   3036					    path, &cpos);
   3037	if (ret) {
   3038		mlog_errno(ret);
   3039		goto out;
   3040	}
   3041
   3042	if (cpos) {
   3043		/*
   3044		 * We have a path to the left of this one - it needs
   3045		 * an update too.
   3046		 */
   3047		left_path = ocfs2_new_path_from_path(path);
   3048		if (!left_path) {
   3049			ret = -ENOMEM;
   3050			mlog_errno(ret);
   3051			goto out;
   3052		}
   3053
   3054		ret = ocfs2_find_path(et->et_ci, left_path, cpos);
   3055		if (ret) {
   3056			mlog_errno(ret);
   3057			goto out;
   3058		}
   3059
   3060		ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
   3061		if (ret) {
   3062			mlog_errno(ret);
   3063			goto out;
   3064		}
   3065
   3066		subtree_index = ocfs2_find_subtree_root(et, left_path, path);
   3067
   3068		ocfs2_unlink_subtree(handle, et, left_path, path,
   3069				     subtree_index, dealloc);
   3070		ret = ocfs2_update_edge_lengths(handle, et, left_path);
   3071		if (ret) {
   3072			mlog_errno(ret);
   3073			goto out;
   3074		}
   3075
   3076		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
   3077		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
   3078	} else {
   3079		/*
   3080		 * 'path' is also the leftmost path which
   3081		 * means it must be the only one. This gets
   3082		 * handled differently because we want to
   3083		 * revert the root back to having extents
   3084		 * in-line.
   3085		 */
   3086		ocfs2_unlink_path(handle, et, dealloc, path, 1);
   3087
   3088		el = et->et_root_el;
   3089		el->l_tree_depth = 0;
   3090		el->l_next_free_rec = 0;
   3091		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
   3092
   3093		ocfs2_et_set_last_eb_blk(et, 0);
   3094	}
   3095
   3096	ocfs2_journal_dirty(handle, path_root_bh(path));
   3097
   3098out:
   3099	ocfs2_free_path(left_path);
   3100	return ret;
   3101}
   3102
   3103static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb,
   3104				struct ocfs2_extent_tree *et,
   3105				struct ocfs2_path *path,
   3106				struct ocfs2_cached_dealloc_ctxt *dealloc)
   3107{
   3108	handle_t *handle;
   3109	int ret;
   3110	int credits = path->p_tree_depth * 2 + 1;
   3111
   3112	handle = ocfs2_start_trans(osb, credits);
   3113	if (IS_ERR(handle)) {
   3114		ret = PTR_ERR(handle);
   3115		mlog_errno(ret);
   3116		return ret;
   3117	}
   3118
   3119	ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc);
   3120	if (ret)
   3121		mlog_errno(ret);
   3122
   3123	ocfs2_commit_trans(osb, handle);
   3124	return ret;
   3125}
   3126
   3127/*
   3128 * Left rotation of btree records.
   3129 *
   3130 * In many ways, this is (unsurprisingly) the opposite of right
   3131 * rotation. We start at some non-rightmost path containing an empty
   3132 * extent in the leaf block. The code works its way to the rightmost
   3133 * path by rotating records to the left in every subtree.
   3134 *
   3135 * This is used by any code which reduces the number of extent records
   3136 * in a leaf. After removal, an empty record should be placed in the
   3137 * leftmost list position.
   3138 *
   3139 * This won't handle a length update of the rightmost path records if
   3140 * the rightmost tree leaf record is removed so the caller is
   3141 * responsible for detecting and correcting that.
   3142 */
   3143static int ocfs2_rotate_tree_left(handle_t *handle,
   3144				  struct ocfs2_extent_tree *et,
   3145				  struct ocfs2_path *path,
   3146				  struct ocfs2_cached_dealloc_ctxt *dealloc)
   3147{
   3148	int ret, orig_credits = jbd2_handle_buffer_credits(handle);
   3149	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
   3150	struct ocfs2_extent_block *eb;
   3151	struct ocfs2_extent_list *el;
   3152
   3153	el = path_leaf_el(path);
   3154	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
   3155		return 0;
   3156
   3157	if (path->p_tree_depth == 0) {
   3158rightmost_no_delete:
   3159		/*
   3160		 * Inline extents. This is trivially handled, so do
   3161		 * it up front.
   3162		 */
   3163		ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
   3164		if (ret)
   3165			mlog_errno(ret);
   3166		goto out;
   3167	}
   3168
   3169	/*
   3170	 * Handle rightmost branch now. There's several cases:
   3171	 *  1) simple rotation leaving records in there. That's trivial.
   3172	 *  2) rotation requiring a branch delete - there's no more
   3173	 *     records left. Two cases of this:
   3174	 *     a) There are branches to the left.
   3175	 *     b) This is also the leftmost (the only) branch.
   3176	 *
   3177	 *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
   3178	 *  2a) we need the left branch so that we can update it with the unlink
   3179	 *  2b) we need to bring the root back to inline extents.
   3180	 */
   3181
   3182	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
   3183	el = &eb->h_list;
   3184	if (eb->h_next_leaf_blk == 0) {
   3185		/*
   3186		 * This gets a bit tricky if we're going to delete the
   3187		 * rightmost path. Get the other cases out of the way
   3188		 * 1st.
   3189		 */
   3190		if (le16_to_cpu(el->l_next_free_rec) > 1)
   3191			goto rightmost_no_delete;
   3192
   3193		if (le16_to_cpu(el->l_next_free_rec) == 0) {
   3194			ret = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
   3195					"Owner %llu has empty extent block at %llu\n",
   3196					(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   3197					(unsigned long long)le64_to_cpu(eb->h_blkno));
   3198			goto out;
   3199		}
   3200
   3201		/*
   3202		 * XXX: The caller can not trust "path" any more after
   3203		 * this as it will have been deleted. What do we do?
   3204		 *
   3205		 * In theory the rotate-for-merge code will never get
   3206		 * here because it'll always ask for a rotate in a
   3207		 * nonempty list.
   3208		 */
   3209
   3210		ret = ocfs2_remove_rightmost_path(handle, et, path,
   3211						  dealloc);
   3212		if (ret)
   3213			mlog_errno(ret);
   3214		goto out;
   3215	}
   3216
   3217	/*
   3218	 * Now we can loop, remembering the path we get from -EAGAIN
   3219	 * and restarting from there.
   3220	 */
   3221try_rotate:
   3222	ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
   3223				       dealloc, &restart_path);
   3224	if (ret && ret != -EAGAIN) {
   3225		mlog_errno(ret);
   3226		goto out;
   3227	}
   3228
   3229	while (ret == -EAGAIN) {
   3230		tmp_path = restart_path;
   3231		restart_path = NULL;
   3232
   3233		ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
   3234					       tmp_path, dealloc,
   3235					       &restart_path);
   3236		if (ret && ret != -EAGAIN) {
   3237			mlog_errno(ret);
   3238			goto out;
   3239		}
   3240
   3241		ocfs2_free_path(tmp_path);
   3242		tmp_path = NULL;
   3243
   3244		if (ret == 0)
   3245			goto try_rotate;
   3246	}
   3247
   3248out:
   3249	ocfs2_free_path(tmp_path);
   3250	ocfs2_free_path(restart_path);
   3251	return ret;
   3252}
   3253
   3254static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
   3255				int index)
   3256{
   3257	struct ocfs2_extent_rec *rec = &el->l_recs[index];
   3258	unsigned int size;
   3259
   3260	if (rec->e_leaf_clusters == 0) {
   3261		/*
   3262		 * We consumed all of the merged-from record. An empty
   3263		 * extent cannot exist anywhere but the 1st array
   3264		 * position, so move things over if the merged-from
   3265		 * record doesn't occupy that position.
   3266		 *
   3267		 * This creates a new empty extent so the caller
   3268		 * should be smart enough to have removed any existing
   3269		 * ones.
   3270		 */
   3271		if (index > 0) {
   3272			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
   3273			size = index * sizeof(struct ocfs2_extent_rec);
   3274			memmove(&el->l_recs[1], &el->l_recs[0], size);
   3275		}
   3276
   3277		/*
   3278		 * Always memset - the caller doesn't check whether it
   3279		 * created an empty extent, so there could be junk in
   3280		 * the other fields.
   3281		 */
   3282		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
   3283	}
   3284}
   3285
   3286static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
   3287				struct ocfs2_path *left_path,
   3288				struct ocfs2_path **ret_right_path)
   3289{
   3290	int ret;
   3291	u32 right_cpos;
   3292	struct ocfs2_path *right_path = NULL;
   3293	struct ocfs2_extent_list *left_el;
   3294
   3295	*ret_right_path = NULL;
   3296
   3297	/* This function shouldn't be called for non-trees. */
   3298	BUG_ON(left_path->p_tree_depth == 0);
   3299
   3300	left_el = path_leaf_el(left_path);
   3301	BUG_ON(left_el->l_next_free_rec != left_el->l_count);
   3302
   3303	ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
   3304					     left_path, &right_cpos);
   3305	if (ret) {
   3306		mlog_errno(ret);
   3307		goto out;
   3308	}
   3309
   3310	/* This function shouldn't be called for the rightmost leaf. */
   3311	BUG_ON(right_cpos == 0);
   3312
   3313	right_path = ocfs2_new_path_from_path(left_path);
   3314	if (!right_path) {
   3315		ret = -ENOMEM;
   3316		mlog_errno(ret);
   3317		goto out;
   3318	}
   3319
   3320	ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
   3321	if (ret) {
   3322		mlog_errno(ret);
   3323		goto out;
   3324	}
   3325
   3326	*ret_right_path = right_path;
   3327out:
   3328	if (ret)
   3329		ocfs2_free_path(right_path);
   3330	return ret;
   3331}
   3332
   3333/*
   3334 * Remove split_rec clusters from the record at index and merge them
   3335 * onto the beginning of the record "next" to it.
   3336 * For index < l_count - 1, the next means the extent rec at index + 1.
   3337 * For index == l_count - 1, the "next" means the 1st extent rec of the
   3338 * next extent block.
   3339 */
   3340static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
   3341				 handle_t *handle,
   3342				 struct ocfs2_extent_tree *et,
   3343				 struct ocfs2_extent_rec *split_rec,
   3344				 int index)
   3345{
   3346	int ret, next_free, i;
   3347	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
   3348	struct ocfs2_extent_rec *left_rec;
   3349	struct ocfs2_extent_rec *right_rec;
   3350	struct ocfs2_extent_list *right_el;
   3351	struct ocfs2_path *right_path = NULL;
   3352	int subtree_index = 0;
   3353	struct ocfs2_extent_list *el = path_leaf_el(left_path);
   3354	struct buffer_head *bh = path_leaf_bh(left_path);
   3355	struct buffer_head *root_bh = NULL;
   3356
   3357	BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
   3358	left_rec = &el->l_recs[index];
   3359
   3360	if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
   3361	    le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
   3362		/* we meet with a cross extent block merge. */
   3363		ret = ocfs2_get_right_path(et, left_path, &right_path);
   3364		if (ret) {
   3365			mlog_errno(ret);
   3366			return ret;
   3367		}
   3368
   3369		right_el = path_leaf_el(right_path);
   3370		next_free = le16_to_cpu(right_el->l_next_free_rec);
   3371		BUG_ON(next_free <= 0);
   3372		right_rec = &right_el->l_recs[0];
   3373		if (ocfs2_is_empty_extent(right_rec)) {
   3374			BUG_ON(next_free <= 1);
   3375			right_rec = &right_el->l_recs[1];
   3376		}
   3377
   3378		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
   3379		       le16_to_cpu(left_rec->e_leaf_clusters) !=
   3380		       le32_to_cpu(right_rec->e_cpos));
   3381
   3382		subtree_index = ocfs2_find_subtree_root(et, left_path,
   3383							right_path);
   3384
   3385		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
   3386					jbd2_handle_buffer_credits(handle),
   3387					right_path);
   3388		if (ret) {
   3389			mlog_errno(ret);
   3390			goto out;
   3391		}
   3392
   3393		root_bh = left_path->p_node[subtree_index].bh;
   3394		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
   3395
   3396		ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
   3397						   subtree_index);
   3398		if (ret) {
   3399			mlog_errno(ret);
   3400			goto out;
   3401		}
   3402
   3403		for (i = subtree_index + 1;
   3404		     i < path_num_items(right_path); i++) {
   3405			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
   3406							   right_path, i);
   3407			if (ret) {
   3408				mlog_errno(ret);
   3409				goto out;
   3410			}
   3411
   3412			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
   3413							   left_path, i);
   3414			if (ret) {
   3415				mlog_errno(ret);
   3416				goto out;
   3417			}
   3418		}
   3419
   3420	} else {
   3421		BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
   3422		right_rec = &el->l_recs[index + 1];
   3423	}
   3424
   3425	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
   3426					   path_num_items(left_path) - 1);
   3427	if (ret) {
   3428		mlog_errno(ret);
   3429		goto out;
   3430	}
   3431
   3432	le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
   3433
   3434	le32_add_cpu(&right_rec->e_cpos, -split_clusters);
   3435	le64_add_cpu(&right_rec->e_blkno,
   3436		     -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
   3437					       split_clusters));
   3438	le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
   3439
   3440	ocfs2_cleanup_merge(el, index);
   3441
   3442	ocfs2_journal_dirty(handle, bh);
   3443	if (right_path) {
   3444		ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
   3445		ocfs2_complete_edge_insert(handle, left_path, right_path,
   3446					   subtree_index);
   3447	}
   3448out:
   3449	ocfs2_free_path(right_path);
   3450	return ret;
   3451}
   3452
   3453static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
   3454			       struct ocfs2_path *right_path,
   3455			       struct ocfs2_path **ret_left_path)
   3456{
   3457	int ret;
   3458	u32 left_cpos;
   3459	struct ocfs2_path *left_path = NULL;
   3460
   3461	*ret_left_path = NULL;
   3462
   3463	/* This function shouldn't be called for non-trees. */
   3464	BUG_ON(right_path->p_tree_depth == 0);
   3465
   3466	ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
   3467					    right_path, &left_cpos);
   3468	if (ret) {
   3469		mlog_errno(ret);
   3470		goto out;
   3471	}
   3472
   3473	/* This function shouldn't be called for the leftmost leaf. */
   3474	BUG_ON(left_cpos == 0);
   3475
   3476	left_path = ocfs2_new_path_from_path(right_path);
   3477	if (!left_path) {
   3478		ret = -ENOMEM;
   3479		mlog_errno(ret);
   3480		goto out;
   3481	}
   3482
   3483	ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
   3484	if (ret) {
   3485		mlog_errno(ret);
   3486		goto out;
   3487	}
   3488
   3489	*ret_left_path = left_path;
   3490out:
   3491	if (ret)
   3492		ocfs2_free_path(left_path);
   3493	return ret;
   3494}
   3495
   3496/*
   3497 * Remove split_rec clusters from the record at index and merge them
   3498 * onto the tail of the record "before" it.
   3499 * For index > 0, the "before" means the extent rec at index - 1.
   3500 *
   3501 * For index == 0, the "before" means the last record of the previous
   3502 * extent block. And there is also a situation that we may need to
   3503 * remove the rightmost leaf extent block in the right_path and change
   3504 * the right path to indicate the new rightmost path.
   3505 */
   3506static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
   3507				handle_t *handle,
   3508				struct ocfs2_extent_tree *et,
   3509				struct ocfs2_extent_rec *split_rec,
   3510				struct ocfs2_cached_dealloc_ctxt *dealloc,
   3511				int index)
   3512{
   3513	int ret, i, subtree_index = 0, has_empty_extent = 0;
   3514	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
   3515	struct ocfs2_extent_rec *left_rec;
   3516	struct ocfs2_extent_rec *right_rec;
   3517	struct ocfs2_extent_list *el = path_leaf_el(right_path);
   3518	struct buffer_head *bh = path_leaf_bh(right_path);
   3519	struct buffer_head *root_bh = NULL;
   3520	struct ocfs2_path *left_path = NULL;
   3521	struct ocfs2_extent_list *left_el;
   3522
   3523	BUG_ON(index < 0);
   3524
   3525	right_rec = &el->l_recs[index];
   3526	if (index == 0) {
   3527		/* we meet with a cross extent block merge. */
   3528		ret = ocfs2_get_left_path(et, right_path, &left_path);
   3529		if (ret) {
   3530			mlog_errno(ret);
   3531			return ret;
   3532		}
   3533
   3534		left_el = path_leaf_el(left_path);
   3535		BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
   3536		       le16_to_cpu(left_el->l_count));
   3537
   3538		left_rec = &left_el->l_recs[
   3539				le16_to_cpu(left_el->l_next_free_rec) - 1];
   3540		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
   3541		       le16_to_cpu(left_rec->e_leaf_clusters) !=
   3542		       le32_to_cpu(split_rec->e_cpos));
   3543
   3544		subtree_index = ocfs2_find_subtree_root(et, left_path,
   3545							right_path);
   3546
   3547		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
   3548					jbd2_handle_buffer_credits(handle),
   3549					left_path);
   3550		if (ret) {
   3551			mlog_errno(ret);
   3552			goto out;
   3553		}
   3554
   3555		root_bh = left_path->p_node[subtree_index].bh;
   3556		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
   3557
   3558		ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
   3559						   subtree_index);
   3560		if (ret) {
   3561			mlog_errno(ret);
   3562			goto out;
   3563		}
   3564
   3565		for (i = subtree_index + 1;
   3566		     i < path_num_items(right_path); i++) {
   3567			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
   3568							   right_path, i);
   3569			if (ret) {
   3570				mlog_errno(ret);
   3571				goto out;
   3572			}
   3573
   3574			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
   3575							   left_path, i);
   3576			if (ret) {
   3577				mlog_errno(ret);
   3578				goto out;
   3579			}
   3580		}
   3581	} else {
   3582		left_rec = &el->l_recs[index - 1];
   3583		if (ocfs2_is_empty_extent(&el->l_recs[0]))
   3584			has_empty_extent = 1;
   3585	}
   3586
   3587	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
   3588					   path_num_items(right_path) - 1);
   3589	if (ret) {
   3590		mlog_errno(ret);
   3591		goto out;
   3592	}
   3593
   3594	if (has_empty_extent && index == 1) {
   3595		/*
   3596		 * The easy case - we can just plop the record right in.
   3597		 */
   3598		*left_rec = *split_rec;
   3599	} else
   3600		le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
   3601
   3602	le32_add_cpu(&right_rec->e_cpos, split_clusters);
   3603	le64_add_cpu(&right_rec->e_blkno,
   3604		     ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
   3605					      split_clusters));
   3606	le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
   3607
   3608	ocfs2_cleanup_merge(el, index);
   3609
   3610	ocfs2_journal_dirty(handle, bh);
   3611	if (left_path) {
   3612		ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
   3613
   3614		/*
   3615		 * In the situation that the right_rec is empty and the extent
   3616		 * block is empty also,  ocfs2_complete_edge_insert can't handle
   3617		 * it and we need to delete the right extent block.
   3618		 */
   3619		if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
   3620		    le16_to_cpu(el->l_next_free_rec) == 1) {
   3621			/* extend credit for ocfs2_remove_rightmost_path */
   3622			ret = ocfs2_extend_rotate_transaction(handle, 0,
   3623					jbd2_handle_buffer_credits(handle),
   3624					right_path);
   3625			if (ret) {
   3626				mlog_errno(ret);
   3627				goto out;
   3628			}
   3629
   3630			ret = ocfs2_remove_rightmost_path(handle, et,
   3631							  right_path,
   3632							  dealloc);
   3633			if (ret) {
   3634				mlog_errno(ret);
   3635				goto out;
   3636			}
   3637
   3638			/* Now the rightmost extent block has been deleted.
   3639			 * So we use the new rightmost path.
   3640			 */
   3641			ocfs2_mv_path(right_path, left_path);
   3642			left_path = NULL;
   3643		} else
   3644			ocfs2_complete_edge_insert(handle, left_path,
   3645						   right_path, subtree_index);
   3646	}
   3647out:
   3648	ocfs2_free_path(left_path);
   3649	return ret;
   3650}
   3651
   3652static int ocfs2_try_to_merge_extent(handle_t *handle,
   3653				     struct ocfs2_extent_tree *et,
   3654				     struct ocfs2_path *path,
   3655				     int split_index,
   3656				     struct ocfs2_extent_rec *split_rec,
   3657				     struct ocfs2_cached_dealloc_ctxt *dealloc,
   3658				     struct ocfs2_merge_ctxt *ctxt)
   3659{
   3660	int ret = 0;
   3661	struct ocfs2_extent_list *el = path_leaf_el(path);
   3662	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
   3663
   3664	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
   3665
   3666	if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
   3667		/* extend credit for ocfs2_remove_rightmost_path */
   3668		ret = ocfs2_extend_rotate_transaction(handle, 0,
   3669				jbd2_handle_buffer_credits(handle),
   3670				path);
   3671		if (ret) {
   3672			mlog_errno(ret);
   3673			goto out;
   3674		}
   3675		/*
   3676		 * The merge code will need to create an empty
   3677		 * extent to take the place of the newly
   3678		 * emptied slot. Remove any pre-existing empty
   3679		 * extents - having more than one in a leaf is
   3680		 * illegal.
   3681		 */
   3682		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
   3683		if (ret) {
   3684			mlog_errno(ret);
   3685			goto out;
   3686		}
   3687		split_index--;
   3688		rec = &el->l_recs[split_index];
   3689	}
   3690
   3691	if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
   3692		/*
   3693		 * Left-right contig implies this.
   3694		 */
   3695		BUG_ON(!ctxt->c_split_covers_rec);
   3696
   3697		/*
   3698		 * Since the leftright insert always covers the entire
   3699		 * extent, this call will delete the insert record
   3700		 * entirely, resulting in an empty extent record added to
   3701		 * the extent block.
   3702		 *
   3703		 * Since the adding of an empty extent shifts
   3704		 * everything back to the right, there's no need to
   3705		 * update split_index here.
   3706		 *
   3707		 * When the split_index is zero, we need to merge it to the
   3708		 * prevoius extent block. It is more efficient and easier
   3709		 * if we do merge_right first and merge_left later.
   3710		 */
   3711		ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
   3712					    split_index);
   3713		if (ret) {
   3714			mlog_errno(ret);
   3715			goto out;
   3716		}
   3717
   3718		/*
   3719		 * We can only get this from logic error above.
   3720		 */
   3721		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
   3722
   3723		/* extend credit for ocfs2_remove_rightmost_path */
   3724		ret = ocfs2_extend_rotate_transaction(handle, 0,
   3725					jbd2_handle_buffer_credits(handle),
   3726					path);
   3727		if (ret) {
   3728			mlog_errno(ret);
   3729			goto out;
   3730		}
   3731
   3732		/* The merge left us with an empty extent, remove it. */
   3733		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
   3734		if (ret) {
   3735			mlog_errno(ret);
   3736			goto out;
   3737		}
   3738
   3739		rec = &el->l_recs[split_index];
   3740
   3741		/*
   3742		 * Note that we don't pass split_rec here on purpose -
   3743		 * we've merged it into the rec already.
   3744		 */
   3745		ret = ocfs2_merge_rec_left(path, handle, et, rec,
   3746					   dealloc, split_index);
   3747
   3748		if (ret) {
   3749			mlog_errno(ret);
   3750			goto out;
   3751		}
   3752
   3753		/* extend credit for ocfs2_remove_rightmost_path */
   3754		ret = ocfs2_extend_rotate_transaction(handle, 0,
   3755				jbd2_handle_buffer_credits(handle),
   3756				path);
   3757		if (ret) {
   3758			mlog_errno(ret);
   3759			goto out;
   3760		}
   3761
   3762		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
   3763		/*
   3764		 * Error from this last rotate is not critical, so
   3765		 * print but don't bubble it up.
   3766		 */
   3767		if (ret)
   3768			mlog_errno(ret);
   3769		ret = 0;
   3770	} else {
   3771		/*
   3772		 * Merge a record to the left or right.
   3773		 *
   3774		 * 'contig_type' is relative to the existing record,
   3775		 * so for example, if we're "right contig", it's to
   3776		 * the record on the left (hence the left merge).
   3777		 */
   3778		if (ctxt->c_contig_type == CONTIG_RIGHT) {
   3779			ret = ocfs2_merge_rec_left(path, handle, et,
   3780						   split_rec, dealloc,
   3781						   split_index);
   3782			if (ret) {
   3783				mlog_errno(ret);
   3784				goto out;
   3785			}
   3786		} else {
   3787			ret = ocfs2_merge_rec_right(path, handle,
   3788						    et, split_rec,
   3789						    split_index);
   3790			if (ret) {
   3791				mlog_errno(ret);
   3792				goto out;
   3793			}
   3794		}
   3795
   3796		if (ctxt->c_split_covers_rec) {
   3797			/* extend credit for ocfs2_remove_rightmost_path */
   3798			ret = ocfs2_extend_rotate_transaction(handle, 0,
   3799					jbd2_handle_buffer_credits(handle),
   3800					path);
   3801			if (ret) {
   3802				mlog_errno(ret);
   3803				ret = 0;
   3804				goto out;
   3805			}
   3806
   3807			/*
   3808			 * The merge may have left an empty extent in
   3809			 * our leaf. Try to rotate it away.
   3810			 */
   3811			ret = ocfs2_rotate_tree_left(handle, et, path,
   3812						     dealloc);
   3813			if (ret)
   3814				mlog_errno(ret);
   3815			ret = 0;
   3816		}
   3817	}
   3818
   3819out:
   3820	return ret;
   3821}
   3822
   3823static void ocfs2_subtract_from_rec(struct super_block *sb,
   3824				    enum ocfs2_split_type split,
   3825				    struct ocfs2_extent_rec *rec,
   3826				    struct ocfs2_extent_rec *split_rec)
   3827{
   3828	u64 len_blocks;
   3829
   3830	len_blocks = ocfs2_clusters_to_blocks(sb,
   3831				le16_to_cpu(split_rec->e_leaf_clusters));
   3832
   3833	if (split == SPLIT_LEFT) {
   3834		/*
   3835		 * Region is on the left edge of the existing
   3836		 * record.
   3837		 */
   3838		le32_add_cpu(&rec->e_cpos,
   3839			     le16_to_cpu(split_rec->e_leaf_clusters));
   3840		le64_add_cpu(&rec->e_blkno, len_blocks);
   3841		le16_add_cpu(&rec->e_leaf_clusters,
   3842			     -le16_to_cpu(split_rec->e_leaf_clusters));
   3843	} else {
   3844		/*
   3845		 * Region is on the right edge of the existing
   3846		 * record.
   3847		 */
   3848		le16_add_cpu(&rec->e_leaf_clusters,
   3849			     -le16_to_cpu(split_rec->e_leaf_clusters));
   3850	}
   3851}
   3852
   3853/*
   3854 * Do the final bits of extent record insertion at the target leaf
   3855 * list. If this leaf is part of an allocation tree, it is assumed
   3856 * that the tree above has been prepared.
   3857 */
   3858static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
   3859				 struct ocfs2_extent_rec *insert_rec,
   3860				 struct ocfs2_extent_list *el,
   3861				 struct ocfs2_insert_type *insert)
   3862{
   3863	int i = insert->ins_contig_index;
   3864	unsigned int range;
   3865	struct ocfs2_extent_rec *rec;
   3866
   3867	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
   3868
   3869	if (insert->ins_split != SPLIT_NONE) {
   3870		i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
   3871		BUG_ON(i == -1);
   3872		rec = &el->l_recs[i];
   3873		ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
   3874					insert->ins_split, rec,
   3875					insert_rec);
   3876		goto rotate;
   3877	}
   3878
   3879	/*
   3880	 * Contiguous insert - either left or right.
   3881	 */
   3882	if (insert->ins_contig != CONTIG_NONE) {
   3883		rec = &el->l_recs[i];
   3884		if (insert->ins_contig == CONTIG_LEFT) {
   3885			rec->e_blkno = insert_rec->e_blkno;
   3886			rec->e_cpos = insert_rec->e_cpos;
   3887		}
   3888		le16_add_cpu(&rec->e_leaf_clusters,
   3889			     le16_to_cpu(insert_rec->e_leaf_clusters));
   3890		return;
   3891	}
   3892
   3893	/*
   3894	 * Handle insert into an empty leaf.
   3895	 */
   3896	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
   3897	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
   3898	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
   3899		el->l_recs[0] = *insert_rec;
   3900		el->l_next_free_rec = cpu_to_le16(1);
   3901		return;
   3902	}
   3903
   3904	/*
   3905	 * Appending insert.
   3906	 */
   3907	if (insert->ins_appending == APPEND_TAIL) {
   3908		i = le16_to_cpu(el->l_next_free_rec) - 1;
   3909		rec = &el->l_recs[i];
   3910		range = le32_to_cpu(rec->e_cpos)
   3911			+ le16_to_cpu(rec->e_leaf_clusters);
   3912		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
   3913
   3914		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
   3915				le16_to_cpu(el->l_count),
   3916				"owner %llu, depth %u, count %u, next free %u, "
   3917				"rec.cpos %u, rec.clusters %u, "
   3918				"insert.cpos %u, insert.clusters %u\n",
   3919				ocfs2_metadata_cache_owner(et->et_ci),
   3920				le16_to_cpu(el->l_tree_depth),
   3921				le16_to_cpu(el->l_count),
   3922				le16_to_cpu(el->l_next_free_rec),
   3923				le32_to_cpu(el->l_recs[i].e_cpos),
   3924				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
   3925				le32_to_cpu(insert_rec->e_cpos),
   3926				le16_to_cpu(insert_rec->e_leaf_clusters));
   3927		i++;
   3928		el->l_recs[i] = *insert_rec;
   3929		le16_add_cpu(&el->l_next_free_rec, 1);
   3930		return;
   3931	}
   3932
   3933rotate:
   3934	/*
   3935	 * Ok, we have to rotate.
   3936	 *
   3937	 * At this point, it is safe to assume that inserting into an
   3938	 * empty leaf and appending to a leaf have both been handled
   3939	 * above.
   3940	 *
   3941	 * This leaf needs to have space, either by the empty 1st
   3942	 * extent record, or by virtue of an l_next_free_rec < l_count.
   3943	 */
   3944	ocfs2_rotate_leaf(el, insert_rec);
   3945}
   3946
   3947static void ocfs2_adjust_rightmost_records(handle_t *handle,
   3948					   struct ocfs2_extent_tree *et,
   3949					   struct ocfs2_path *path,
   3950					   struct ocfs2_extent_rec *insert_rec)
   3951{
   3952	int i, next_free;
   3953	struct buffer_head *bh;
   3954	struct ocfs2_extent_list *el;
   3955	struct ocfs2_extent_rec *rec;
   3956
   3957	/*
   3958	 * Update everything except the leaf block.
   3959	 */
   3960	for (i = 0; i < path->p_tree_depth; i++) {
   3961		bh = path->p_node[i].bh;
   3962		el = path->p_node[i].el;
   3963
   3964		next_free = le16_to_cpu(el->l_next_free_rec);
   3965		if (next_free == 0) {
   3966			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
   3967				    "Owner %llu has a bad extent list\n",
   3968				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
   3969			return;
   3970		}
   3971
   3972		rec = &el->l_recs[next_free - 1];
   3973
   3974		rec->e_int_clusters = insert_rec->e_cpos;
   3975		le32_add_cpu(&rec->e_int_clusters,
   3976			     le16_to_cpu(insert_rec->e_leaf_clusters));
   3977		le32_add_cpu(&rec->e_int_clusters,
   3978			     -le32_to_cpu(rec->e_cpos));
   3979
   3980		ocfs2_journal_dirty(handle, bh);
   3981	}
   3982}
   3983
   3984static int ocfs2_append_rec_to_path(handle_t *handle,
   3985				    struct ocfs2_extent_tree *et,
   3986				    struct ocfs2_extent_rec *insert_rec,
   3987				    struct ocfs2_path *right_path,
   3988				    struct ocfs2_path **ret_left_path)
   3989{
   3990	int ret, next_free;
   3991	struct ocfs2_extent_list *el;
   3992	struct ocfs2_path *left_path = NULL;
   3993
   3994	*ret_left_path = NULL;
   3995
   3996	/*
   3997	 * This shouldn't happen for non-trees. The extent rec cluster
   3998	 * count manipulation below only works for interior nodes.
   3999	 */
   4000	BUG_ON(right_path->p_tree_depth == 0);
   4001
   4002	/*
   4003	 * If our appending insert is at the leftmost edge of a leaf,
   4004	 * then we might need to update the rightmost records of the
   4005	 * neighboring path.
   4006	 */
   4007	el = path_leaf_el(right_path);
   4008	next_free = le16_to_cpu(el->l_next_free_rec);
   4009	if (next_free == 0 ||
   4010	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
   4011		u32 left_cpos;
   4012
   4013		ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
   4014						    right_path, &left_cpos);
   4015		if (ret) {
   4016			mlog_errno(ret);
   4017			goto out;
   4018		}
   4019
   4020		trace_ocfs2_append_rec_to_path(
   4021			(unsigned long long)
   4022			ocfs2_metadata_cache_owner(et->et_ci),
   4023			le32_to_cpu(insert_rec->e_cpos),
   4024			left_cpos);
   4025
   4026		/*
   4027		 * No need to worry if the append is already in the
   4028		 * leftmost leaf.
   4029		 */
   4030		if (left_cpos) {
   4031			left_path = ocfs2_new_path_from_path(right_path);
   4032			if (!left_path) {
   4033				ret = -ENOMEM;
   4034				mlog_errno(ret);
   4035				goto out;
   4036			}
   4037
   4038			ret = ocfs2_find_path(et->et_ci, left_path,
   4039					      left_cpos);
   4040			if (ret) {
   4041				mlog_errno(ret);
   4042				goto out;
   4043			}
   4044
   4045			/*
   4046			 * ocfs2_insert_path() will pass the left_path to the
   4047			 * journal for us.
   4048			 */
   4049		}
   4050	}
   4051
   4052	ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
   4053	if (ret) {
   4054		mlog_errno(ret);
   4055		goto out;
   4056	}
   4057
   4058	ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
   4059
   4060	*ret_left_path = left_path;
   4061	ret = 0;
   4062out:
   4063	if (ret != 0)
   4064		ocfs2_free_path(left_path);
   4065
   4066	return ret;
   4067}
   4068
   4069static void ocfs2_split_record(struct ocfs2_extent_tree *et,
   4070			       struct ocfs2_path *left_path,
   4071			       struct ocfs2_path *right_path,
   4072			       struct ocfs2_extent_rec *split_rec,
   4073			       enum ocfs2_split_type split)
   4074{
   4075	int index;
   4076	u32 cpos = le32_to_cpu(split_rec->e_cpos);
   4077	struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
   4078	struct ocfs2_extent_rec *rec, *tmprec;
   4079
   4080	right_el = path_leaf_el(right_path);
   4081	if (left_path)
   4082		left_el = path_leaf_el(left_path);
   4083
   4084	el = right_el;
   4085	insert_el = right_el;
   4086	index = ocfs2_search_extent_list(el, cpos);
   4087	if (index != -1) {
   4088		if (index == 0 && left_path) {
   4089			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
   4090
   4091			/*
   4092			 * This typically means that the record
   4093			 * started in the left path but moved to the
   4094			 * right as a result of rotation. We either
   4095			 * move the existing record to the left, or we
   4096			 * do the later insert there.
   4097			 *
   4098			 * In this case, the left path should always
   4099			 * exist as the rotate code will have passed
   4100			 * it back for a post-insert update.
   4101			 */
   4102
   4103			if (split == SPLIT_LEFT) {
   4104				/*
   4105				 * It's a left split. Since we know
   4106				 * that the rotate code gave us an
   4107				 * empty extent in the left path, we
   4108				 * can just do the insert there.
   4109				 */
   4110				insert_el = left_el;
   4111			} else {
   4112				/*
   4113				 * Right split - we have to move the
   4114				 * existing record over to the left
   4115				 * leaf. The insert will be into the
   4116				 * newly created empty extent in the
   4117				 * right leaf.
   4118				 */
   4119				tmprec = &right_el->l_recs[index];
   4120				ocfs2_rotate_leaf(left_el, tmprec);
   4121				el = left_el;
   4122
   4123				memset(tmprec, 0, sizeof(*tmprec));
   4124				index = ocfs2_search_extent_list(left_el, cpos);
   4125				BUG_ON(index == -1);
   4126			}
   4127		}
   4128	} else {
   4129		BUG_ON(!left_path);
   4130		BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
   4131		/*
   4132		 * Left path is easy - we can just allow the insert to
   4133		 * happen.
   4134		 */
   4135		el = left_el;
   4136		insert_el = left_el;
   4137		index = ocfs2_search_extent_list(el, cpos);
   4138		BUG_ON(index == -1);
   4139	}
   4140
   4141	rec = &el->l_recs[index];
   4142	ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
   4143				split, rec, split_rec);
   4144	ocfs2_rotate_leaf(insert_el, split_rec);
   4145}
   4146
   4147/*
   4148 * This function only does inserts on an allocation b-tree. For tree
   4149 * depth = 0, ocfs2_insert_at_leaf() is called directly.
   4150 *
   4151 * right_path is the path we want to do the actual insert
   4152 * in. left_path should only be passed in if we need to update that
   4153 * portion of the tree after an edge insert.
   4154 */
   4155static int ocfs2_insert_path(handle_t *handle,
   4156			     struct ocfs2_extent_tree *et,
   4157			     struct ocfs2_path *left_path,
   4158			     struct ocfs2_path *right_path,
   4159			     struct ocfs2_extent_rec *insert_rec,
   4160			     struct ocfs2_insert_type *insert)
   4161{
   4162	int ret, subtree_index;
   4163	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
   4164
   4165	if (left_path) {
   4166		/*
   4167		 * There's a chance that left_path got passed back to
   4168		 * us without being accounted for in the
   4169		 * journal. Extend our transaction here to be sure we
   4170		 * can change those blocks.
   4171		 */
   4172		ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
   4173		if (ret < 0) {
   4174			mlog_errno(ret);
   4175			goto out;
   4176		}
   4177
   4178		ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
   4179		if (ret < 0) {
   4180			mlog_errno(ret);
   4181			goto out;
   4182		}
   4183	}
   4184
   4185	/*
   4186	 * Pass both paths to the journal. The majority of inserts
   4187	 * will be touching all components anyway.
   4188	 */
   4189	ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
   4190	if (ret < 0) {
   4191		mlog_errno(ret);
   4192		goto out;
   4193	}
   4194
   4195	if (insert->ins_split != SPLIT_NONE) {
   4196		/*
   4197		 * We could call ocfs2_insert_at_leaf() for some types
   4198		 * of splits, but it's easier to just let one separate
   4199		 * function sort it all out.
   4200		 */
   4201		ocfs2_split_record(et, left_path, right_path,
   4202				   insert_rec, insert->ins_split);
   4203
   4204		/*
   4205		 * Split might have modified either leaf and we don't
   4206		 * have a guarantee that the later edge insert will
   4207		 * dirty this for us.
   4208		 */
   4209		if (left_path)
   4210			ocfs2_journal_dirty(handle,
   4211					    path_leaf_bh(left_path));
   4212	} else
   4213		ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
   4214				     insert);
   4215
   4216	ocfs2_journal_dirty(handle, leaf_bh);
   4217
   4218	if (left_path) {
   4219		/*
   4220		 * The rotate code has indicated that we need to fix
   4221		 * up portions of the tree after the insert.
   4222		 *
   4223		 * XXX: Should we extend the transaction here?
   4224		 */
   4225		subtree_index = ocfs2_find_subtree_root(et, left_path,
   4226							right_path);
   4227		ocfs2_complete_edge_insert(handle, left_path, right_path,
   4228					   subtree_index);
   4229	}
   4230
   4231	ret = 0;
   4232out:
   4233	return ret;
   4234}
   4235
   4236static int ocfs2_do_insert_extent(handle_t *handle,
   4237				  struct ocfs2_extent_tree *et,
   4238				  struct ocfs2_extent_rec *insert_rec,
   4239				  struct ocfs2_insert_type *type)
   4240{
   4241	int ret, rotate = 0;
   4242	u32 cpos;
   4243	struct ocfs2_path *right_path = NULL;
   4244	struct ocfs2_path *left_path = NULL;
   4245	struct ocfs2_extent_list *el;
   4246
   4247	el = et->et_root_el;
   4248
   4249	ret = ocfs2_et_root_journal_access(handle, et,
   4250					   OCFS2_JOURNAL_ACCESS_WRITE);
   4251	if (ret) {
   4252		mlog_errno(ret);
   4253		goto out;
   4254	}
   4255
   4256	if (le16_to_cpu(el->l_tree_depth) == 0) {
   4257		ocfs2_insert_at_leaf(et, insert_rec, el, type);
   4258		goto out_update_clusters;
   4259	}
   4260
   4261	right_path = ocfs2_new_path_from_et(et);
   4262	if (!right_path) {
   4263		ret = -ENOMEM;
   4264		mlog_errno(ret);
   4265		goto out;
   4266	}
   4267
   4268	/*
   4269	 * Determine the path to start with. Rotations need the
   4270	 * rightmost path, everything else can go directly to the
   4271	 * target leaf.
   4272	 */
   4273	cpos = le32_to_cpu(insert_rec->e_cpos);
   4274	if (type->ins_appending == APPEND_NONE &&
   4275	    type->ins_contig == CONTIG_NONE) {
   4276		rotate = 1;
   4277		cpos = UINT_MAX;
   4278	}
   4279
   4280	ret = ocfs2_find_path(et->et_ci, right_path, cpos);
   4281	if (ret) {
   4282		mlog_errno(ret);
   4283		goto out;
   4284	}
   4285
   4286	/*
   4287	 * Rotations and appends need special treatment - they modify
   4288	 * parts of the tree's above them.
   4289	 *
   4290	 * Both might pass back a path immediate to the left of the
   4291	 * one being inserted to. This will be cause
   4292	 * ocfs2_insert_path() to modify the rightmost records of
   4293	 * left_path to account for an edge insert.
   4294	 *
   4295	 * XXX: When modifying this code, keep in mind that an insert
   4296	 * can wind up skipping both of these two special cases...
   4297	 */
   4298	if (rotate) {
   4299		ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
   4300					      le32_to_cpu(insert_rec->e_cpos),
   4301					      right_path, &left_path);
   4302		if (ret) {
   4303			mlog_errno(ret);
   4304			goto out;
   4305		}
   4306
   4307		/*
   4308		 * ocfs2_rotate_tree_right() might have extended the
   4309		 * transaction without re-journaling our tree root.
   4310		 */
   4311		ret = ocfs2_et_root_journal_access(handle, et,
   4312						   OCFS2_JOURNAL_ACCESS_WRITE);
   4313		if (ret) {
   4314			mlog_errno(ret);
   4315			goto out;
   4316		}
   4317	} else if (type->ins_appending == APPEND_TAIL
   4318		   && type->ins_contig != CONTIG_LEFT) {
   4319		ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
   4320					       right_path, &left_path);
   4321		if (ret) {
   4322			mlog_errno(ret);
   4323			goto out;
   4324		}
   4325	}
   4326
   4327	ret = ocfs2_insert_path(handle, et, left_path, right_path,
   4328				insert_rec, type);
   4329	if (ret) {
   4330		mlog_errno(ret);
   4331		goto out;
   4332	}
   4333
   4334out_update_clusters:
   4335	if (type->ins_split == SPLIT_NONE)
   4336		ocfs2_et_update_clusters(et,
   4337					 le16_to_cpu(insert_rec->e_leaf_clusters));
   4338
   4339	ocfs2_journal_dirty(handle, et->et_root_bh);
   4340
   4341out:
   4342	ocfs2_free_path(left_path);
   4343	ocfs2_free_path(right_path);
   4344
   4345	return ret;
   4346}
   4347
   4348static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
   4349			       struct ocfs2_path *path,
   4350			       struct ocfs2_extent_list *el, int index,
   4351			       struct ocfs2_extent_rec *split_rec,
   4352			       struct ocfs2_merge_ctxt *ctxt)
   4353{
   4354	int status = 0;
   4355	enum ocfs2_contig_type ret = CONTIG_NONE;
   4356	u32 left_cpos, right_cpos;
   4357	struct ocfs2_extent_rec *rec = NULL;
   4358	struct ocfs2_extent_list *new_el;
   4359	struct ocfs2_path *left_path = NULL, *right_path = NULL;
   4360	struct buffer_head *bh;
   4361	struct ocfs2_extent_block *eb;
   4362	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
   4363
   4364	if (index > 0) {
   4365		rec = &el->l_recs[index - 1];
   4366	} else if (path->p_tree_depth > 0) {
   4367		status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
   4368		if (status)
   4369			goto exit;
   4370
   4371		if (left_cpos != 0) {
   4372			left_path = ocfs2_new_path_from_path(path);
   4373			if (!left_path) {
   4374				status = -ENOMEM;
   4375				mlog_errno(status);
   4376				goto exit;
   4377			}
   4378
   4379			status = ocfs2_find_path(et->et_ci, left_path,
   4380						 left_cpos);
   4381			if (status)
   4382				goto free_left_path;
   4383
   4384			new_el = path_leaf_el(left_path);
   4385
   4386			if (le16_to_cpu(new_el->l_next_free_rec) !=
   4387			    le16_to_cpu(new_el->l_count)) {
   4388				bh = path_leaf_bh(left_path);
   4389				eb = (struct ocfs2_extent_block *)bh->b_data;
   4390				status = ocfs2_error(sb,
   4391						"Extent block #%llu has an invalid l_next_free_rec of %d.  It should have matched the l_count of %d\n",
   4392						(unsigned long long)le64_to_cpu(eb->h_blkno),
   4393						le16_to_cpu(new_el->l_next_free_rec),
   4394						le16_to_cpu(new_el->l_count));
   4395				goto free_left_path;
   4396			}
   4397			rec = &new_el->l_recs[
   4398				le16_to_cpu(new_el->l_next_free_rec) - 1];
   4399		}
   4400	}
   4401
   4402	/*
   4403	 * We're careful to check for an empty extent record here -
   4404	 * the merge code will know what to do if it sees one.
   4405	 */
   4406	if (rec) {
   4407		if (index == 1 && ocfs2_is_empty_extent(rec)) {
   4408			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
   4409				ret = CONTIG_RIGHT;
   4410		} else {
   4411			ret = ocfs2_et_extent_contig(et, rec, split_rec);
   4412		}
   4413	}
   4414
   4415	rec = NULL;
   4416	if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
   4417		rec = &el->l_recs[index + 1];
   4418	else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
   4419		 path->p_tree_depth > 0) {
   4420		status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
   4421		if (status)
   4422			goto free_left_path;
   4423
   4424		if (right_cpos == 0)
   4425			goto free_left_path;
   4426
   4427		right_path = ocfs2_new_path_from_path(path);
   4428		if (!right_path) {
   4429			status = -ENOMEM;
   4430			mlog_errno(status);
   4431			goto free_left_path;
   4432		}
   4433
   4434		status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
   4435		if (status)
   4436			goto free_right_path;
   4437
   4438		new_el = path_leaf_el(right_path);
   4439		rec = &new_el->l_recs[0];
   4440		if (ocfs2_is_empty_extent(rec)) {
   4441			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
   4442				bh = path_leaf_bh(right_path);
   4443				eb = (struct ocfs2_extent_block *)bh->b_data;
   4444				status = ocfs2_error(sb,
   4445						"Extent block #%llu has an invalid l_next_free_rec of %d\n",
   4446						(unsigned long long)le64_to_cpu(eb->h_blkno),
   4447						le16_to_cpu(new_el->l_next_free_rec));
   4448				goto free_right_path;
   4449			}
   4450			rec = &new_el->l_recs[1];
   4451		}
   4452	}
   4453
   4454	if (rec) {
   4455		enum ocfs2_contig_type contig_type;
   4456
   4457		contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
   4458
   4459		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
   4460			ret = CONTIG_LEFTRIGHT;
   4461		else if (ret == CONTIG_NONE)
   4462			ret = contig_type;
   4463	}
   4464
   4465free_right_path:
   4466	ocfs2_free_path(right_path);
   4467free_left_path:
   4468	ocfs2_free_path(left_path);
   4469exit:
   4470	if (status == 0)
   4471		ctxt->c_contig_type = ret;
   4472
   4473	return status;
   4474}
   4475
   4476static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
   4477				     struct ocfs2_insert_type *insert,
   4478				     struct ocfs2_extent_list *el,
   4479				     struct ocfs2_extent_rec *insert_rec)
   4480{
   4481	int i;
   4482	enum ocfs2_contig_type contig_type = CONTIG_NONE;
   4483
   4484	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
   4485
   4486	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
   4487		contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
   4488						     insert_rec);
   4489		if (contig_type != CONTIG_NONE) {
   4490			insert->ins_contig_index = i;
   4491			break;
   4492		}
   4493	}
   4494	insert->ins_contig = contig_type;
   4495
   4496	if (insert->ins_contig != CONTIG_NONE) {
   4497		struct ocfs2_extent_rec *rec =
   4498				&el->l_recs[insert->ins_contig_index];
   4499		unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
   4500				   le16_to_cpu(insert_rec->e_leaf_clusters);
   4501
   4502		/*
   4503		 * Caller might want us to limit the size of extents, don't
   4504		 * calculate contiguousness if we might exceed that limit.
   4505		 */
   4506		if (et->et_max_leaf_clusters &&
   4507		    (len > et->et_max_leaf_clusters))
   4508			insert->ins_contig = CONTIG_NONE;
   4509	}
   4510}
   4511
   4512/*
   4513 * This should only be called against the righmost leaf extent list.
   4514 *
   4515 * ocfs2_figure_appending_type() will figure out whether we'll have to
   4516 * insert at the tail of the rightmost leaf.
   4517 *
   4518 * This should also work against the root extent list for tree's with 0
   4519 * depth. If we consider the root extent list to be the rightmost leaf node
   4520 * then the logic here makes sense.
   4521 */
   4522static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
   4523					struct ocfs2_extent_list *el,
   4524					struct ocfs2_extent_rec *insert_rec)
   4525{
   4526	int i;
   4527	u32 cpos = le32_to_cpu(insert_rec->e_cpos);
   4528	struct ocfs2_extent_rec *rec;
   4529
   4530	insert->ins_appending = APPEND_NONE;
   4531
   4532	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
   4533
   4534	if (!el->l_next_free_rec)
   4535		goto set_tail_append;
   4536
   4537	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
   4538		/* Were all records empty? */
   4539		if (le16_to_cpu(el->l_next_free_rec) == 1)
   4540			goto set_tail_append;
   4541	}
   4542
   4543	i = le16_to_cpu(el->l_next_free_rec) - 1;
   4544	rec = &el->l_recs[i];
   4545
   4546	if (cpos >=
   4547	    (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
   4548		goto set_tail_append;
   4549
   4550	return;
   4551
   4552set_tail_append:
   4553	insert->ins_appending = APPEND_TAIL;
   4554}
   4555
   4556/*
   4557 * Helper function called at the beginning of an insert.
   4558 *
   4559 * This computes a few things that are commonly used in the process of
   4560 * inserting into the btree:
   4561 *   - Whether the new extent is contiguous with an existing one.
   4562 *   - The current tree depth.
   4563 *   - Whether the insert is an appending one.
   4564 *   - The total # of free records in the tree.
   4565 *
   4566 * All of the information is stored on the ocfs2_insert_type
   4567 * structure.
   4568 */
   4569static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
   4570				    struct buffer_head **last_eb_bh,
   4571				    struct ocfs2_extent_rec *insert_rec,
   4572				    int *free_records,
   4573				    struct ocfs2_insert_type *insert)
   4574{
   4575	int ret;
   4576	struct ocfs2_extent_block *eb;
   4577	struct ocfs2_extent_list *el;
   4578	struct ocfs2_path *path = NULL;
   4579	struct buffer_head *bh = NULL;
   4580
   4581	insert->ins_split = SPLIT_NONE;
   4582
   4583	el = et->et_root_el;
   4584	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
   4585
   4586	if (el->l_tree_depth) {
   4587		/*
   4588		 * If we have tree depth, we read in the
   4589		 * rightmost extent block ahead of time as
   4590		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
   4591		 * may want it later.
   4592		 */
   4593		ret = ocfs2_read_extent_block(et->et_ci,
   4594					      ocfs2_et_get_last_eb_blk(et),
   4595					      &bh);
   4596		if (ret) {
   4597			mlog_errno(ret);
   4598			goto out;
   4599		}
   4600		eb = (struct ocfs2_extent_block *) bh->b_data;
   4601		el = &eb->h_list;
   4602	}
   4603
   4604	/*
   4605	 * Unless we have a contiguous insert, we'll need to know if
   4606	 * there is room left in our allocation tree for another
   4607	 * extent record.
   4608	 *
   4609	 * XXX: This test is simplistic, we can search for empty
   4610	 * extent records too.
   4611	 */
   4612	*free_records = le16_to_cpu(el->l_count) -
   4613		le16_to_cpu(el->l_next_free_rec);
   4614
   4615	if (!insert->ins_tree_depth) {
   4616		ocfs2_figure_contig_type(et, insert, el, insert_rec);
   4617		ocfs2_figure_appending_type(insert, el, insert_rec);
   4618		return 0;
   4619	}
   4620
   4621	path = ocfs2_new_path_from_et(et);
   4622	if (!path) {
   4623		ret = -ENOMEM;
   4624		mlog_errno(ret);
   4625		goto out;
   4626	}
   4627
   4628	/*
   4629	 * In the case that we're inserting past what the tree
   4630	 * currently accounts for, ocfs2_find_path() will return for
   4631	 * us the rightmost tree path. This is accounted for below in
   4632	 * the appending code.
   4633	 */
   4634	ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
   4635	if (ret) {
   4636		mlog_errno(ret);
   4637		goto out;
   4638	}
   4639
   4640	el = path_leaf_el(path);
   4641
   4642	/*
   4643	 * Now that we have the path, there's two things we want to determine:
   4644	 * 1) Contiguousness (also set contig_index if this is so)
   4645	 *
   4646	 * 2) Are we doing an append? We can trivially break this up
   4647         *     into two types of appends: simple record append, or a
   4648         *     rotate inside the tail leaf.
   4649	 */
   4650	ocfs2_figure_contig_type(et, insert, el, insert_rec);
   4651
   4652	/*
   4653	 * The insert code isn't quite ready to deal with all cases of
   4654	 * left contiguousness. Specifically, if it's an insert into
   4655	 * the 1st record in a leaf, it will require the adjustment of
   4656	 * cluster count on the last record of the path directly to it's
   4657	 * left. For now, just catch that case and fool the layers
   4658	 * above us. This works just fine for tree_depth == 0, which
   4659	 * is why we allow that above.
   4660	 */
   4661	if (insert->ins_contig == CONTIG_LEFT &&
   4662	    insert->ins_contig_index == 0)
   4663		insert->ins_contig = CONTIG_NONE;
   4664
   4665	/*
   4666	 * Ok, so we can simply compare against last_eb to figure out
   4667	 * whether the path doesn't exist. This will only happen in
   4668	 * the case that we're doing a tail append, so maybe we can
   4669	 * take advantage of that information somehow.
   4670	 */
   4671	if (ocfs2_et_get_last_eb_blk(et) ==
   4672	    path_leaf_bh(path)->b_blocknr) {
   4673		/*
   4674		 * Ok, ocfs2_find_path() returned us the rightmost
   4675		 * tree path. This might be an appending insert. There are
   4676		 * two cases:
   4677		 *    1) We're doing a true append at the tail:
   4678		 *	-This might even be off the end of the leaf
   4679		 *    2) We're "appending" by rotating in the tail
   4680		 */
   4681		ocfs2_figure_appending_type(insert, el, insert_rec);
   4682	}
   4683
   4684out:
   4685	ocfs2_free_path(path);
   4686
   4687	if (ret == 0)
   4688		*last_eb_bh = bh;
   4689	else
   4690		brelse(bh);
   4691	return ret;
   4692}
   4693
   4694/*
   4695 * Insert an extent into a btree.
   4696 *
   4697 * The caller needs to update the owning btree's cluster count.
   4698 */
   4699int ocfs2_insert_extent(handle_t *handle,
   4700			struct ocfs2_extent_tree *et,
   4701			u32 cpos,
   4702			u64 start_blk,
   4703			u32 new_clusters,
   4704			u8 flags,
   4705			struct ocfs2_alloc_context *meta_ac)
   4706{
   4707	int status;
   4708	int free_records;
   4709	struct buffer_head *last_eb_bh = NULL;
   4710	struct ocfs2_insert_type insert = {0, };
   4711	struct ocfs2_extent_rec rec;
   4712
   4713	trace_ocfs2_insert_extent_start(
   4714		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   4715		cpos, new_clusters);
   4716
   4717	memset(&rec, 0, sizeof(rec));
   4718	rec.e_cpos = cpu_to_le32(cpos);
   4719	rec.e_blkno = cpu_to_le64(start_blk);
   4720	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
   4721	rec.e_flags = flags;
   4722	status = ocfs2_et_insert_check(et, &rec);
   4723	if (status) {
   4724		mlog_errno(status);
   4725		goto bail;
   4726	}
   4727
   4728	status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
   4729					  &free_records, &insert);
   4730	if (status < 0) {
   4731		mlog_errno(status);
   4732		goto bail;
   4733	}
   4734
   4735	trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
   4736				  insert.ins_contig_index, free_records,
   4737				  insert.ins_tree_depth);
   4738
   4739	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
   4740		status = ocfs2_grow_tree(handle, et,
   4741					 &insert.ins_tree_depth, &last_eb_bh,
   4742					 meta_ac);
   4743		if (status) {
   4744			mlog_errno(status);
   4745			goto bail;
   4746		}
   4747	}
   4748
   4749	/* Finally, we can add clusters. This might rotate the tree for us. */
   4750	status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
   4751	if (status < 0)
   4752		mlog_errno(status);
   4753	else
   4754		ocfs2_et_extent_map_insert(et, &rec);
   4755
   4756bail:
   4757	brelse(last_eb_bh);
   4758
   4759	return status;
   4760}
   4761
   4762/*
   4763 * Allocate and add clusters into the extent b-tree.
   4764 * The new clusters(clusters_to_add) will be inserted at logical_offset.
   4765 * The extent b-tree's root is specified by et, and
   4766 * it is not limited to the file storage. Any extent tree can use this
   4767 * function if it implements the proper ocfs2_extent_tree.
   4768 */
   4769int ocfs2_add_clusters_in_btree(handle_t *handle,
   4770				struct ocfs2_extent_tree *et,
   4771				u32 *logical_offset,
   4772				u32 clusters_to_add,
   4773				int mark_unwritten,
   4774				struct ocfs2_alloc_context *data_ac,
   4775				struct ocfs2_alloc_context *meta_ac,
   4776				enum ocfs2_alloc_restarted *reason_ret)
   4777{
   4778	int status = 0, err = 0;
   4779	int need_free = 0;
   4780	int free_extents;
   4781	enum ocfs2_alloc_restarted reason = RESTART_NONE;
   4782	u32 bit_off, num_bits;
   4783	u64 block;
   4784	u8 flags = 0;
   4785	struct ocfs2_super *osb =
   4786		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
   4787
   4788	BUG_ON(!clusters_to_add);
   4789
   4790	if (mark_unwritten)
   4791		flags = OCFS2_EXT_UNWRITTEN;
   4792
   4793	free_extents = ocfs2_num_free_extents(et);
   4794	if (free_extents < 0) {
   4795		status = free_extents;
   4796		mlog_errno(status);
   4797		goto leave;
   4798	}
   4799
   4800	/* there are two cases which could cause us to EAGAIN in the
   4801	 * we-need-more-metadata case:
   4802	 * 1) we haven't reserved *any*
   4803	 * 2) we are so fragmented, we've needed to add metadata too
   4804	 *    many times. */
   4805	if (!free_extents && !meta_ac) {
   4806		err = -1;
   4807		status = -EAGAIN;
   4808		reason = RESTART_META;
   4809		goto leave;
   4810	} else if ((!free_extents)
   4811		   && (ocfs2_alloc_context_bits_left(meta_ac)
   4812		       < ocfs2_extend_meta_needed(et->et_root_el))) {
   4813		err = -2;
   4814		status = -EAGAIN;
   4815		reason = RESTART_META;
   4816		goto leave;
   4817	}
   4818
   4819	status = __ocfs2_claim_clusters(handle, data_ac, 1,
   4820					clusters_to_add, &bit_off, &num_bits);
   4821	if (status < 0) {
   4822		if (status != -ENOSPC)
   4823			mlog_errno(status);
   4824		goto leave;
   4825	}
   4826
   4827	BUG_ON(num_bits > clusters_to_add);
   4828
   4829	/* reserve our write early -- insert_extent may update the tree root */
   4830	status = ocfs2_et_root_journal_access(handle, et,
   4831					      OCFS2_JOURNAL_ACCESS_WRITE);
   4832	if (status < 0) {
   4833		mlog_errno(status);
   4834		need_free = 1;
   4835		goto bail;
   4836	}
   4837
   4838	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
   4839	trace_ocfs2_add_clusters_in_btree(
   4840	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   4841	     bit_off, num_bits);
   4842	status = ocfs2_insert_extent(handle, et, *logical_offset, block,
   4843				     num_bits, flags, meta_ac);
   4844	if (status < 0) {
   4845		mlog_errno(status);
   4846		need_free = 1;
   4847		goto bail;
   4848	}
   4849
   4850	ocfs2_journal_dirty(handle, et->et_root_bh);
   4851
   4852	clusters_to_add -= num_bits;
   4853	*logical_offset += num_bits;
   4854
   4855	if (clusters_to_add) {
   4856		err = clusters_to_add;
   4857		status = -EAGAIN;
   4858		reason = RESTART_TRANS;
   4859	}
   4860
   4861bail:
   4862	if (need_free) {
   4863		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
   4864			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
   4865					bit_off, num_bits);
   4866		else
   4867			ocfs2_free_clusters(handle,
   4868					data_ac->ac_inode,
   4869					data_ac->ac_bh,
   4870					ocfs2_clusters_to_blocks(osb->sb, bit_off),
   4871					num_bits);
   4872	}
   4873
   4874leave:
   4875	if (reason_ret)
   4876		*reason_ret = reason;
   4877	trace_ocfs2_add_clusters_in_btree_ret(status, reason, err);
   4878	return status;
   4879}
   4880
   4881static void ocfs2_make_right_split_rec(struct super_block *sb,
   4882				       struct ocfs2_extent_rec *split_rec,
   4883				       u32 cpos,
   4884				       struct ocfs2_extent_rec *rec)
   4885{
   4886	u32 rec_cpos = le32_to_cpu(rec->e_cpos);
   4887	u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
   4888
   4889	memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
   4890
   4891	split_rec->e_cpos = cpu_to_le32(cpos);
   4892	split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
   4893
   4894	split_rec->e_blkno = rec->e_blkno;
   4895	le64_add_cpu(&split_rec->e_blkno,
   4896		     ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
   4897
   4898	split_rec->e_flags = rec->e_flags;
   4899}
   4900
   4901static int ocfs2_split_and_insert(handle_t *handle,
   4902				  struct ocfs2_extent_tree *et,
   4903				  struct ocfs2_path *path,
   4904				  struct buffer_head **last_eb_bh,
   4905				  int split_index,
   4906				  struct ocfs2_extent_rec *orig_split_rec,
   4907				  struct ocfs2_alloc_context *meta_ac)
   4908{
   4909	int ret = 0, depth;
   4910	unsigned int insert_range, rec_range, do_leftright = 0;
   4911	struct ocfs2_extent_rec tmprec;
   4912	struct ocfs2_extent_list *rightmost_el;
   4913	struct ocfs2_extent_rec rec;
   4914	struct ocfs2_extent_rec split_rec = *orig_split_rec;
   4915	struct ocfs2_insert_type insert;
   4916	struct ocfs2_extent_block *eb;
   4917
   4918leftright:
   4919	/*
   4920	 * Store a copy of the record on the stack - it might move
   4921	 * around as the tree is manipulated below.
   4922	 */
   4923	rec = path_leaf_el(path)->l_recs[split_index];
   4924
   4925	rightmost_el = et->et_root_el;
   4926
   4927	depth = le16_to_cpu(rightmost_el->l_tree_depth);
   4928	if (depth) {
   4929		BUG_ON(!(*last_eb_bh));
   4930		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
   4931		rightmost_el = &eb->h_list;
   4932	}
   4933
   4934	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
   4935	    le16_to_cpu(rightmost_el->l_count)) {
   4936		ret = ocfs2_grow_tree(handle, et,
   4937				      &depth, last_eb_bh, meta_ac);
   4938		if (ret) {
   4939			mlog_errno(ret);
   4940			goto out;
   4941		}
   4942	}
   4943
   4944	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
   4945	insert.ins_appending = APPEND_NONE;
   4946	insert.ins_contig = CONTIG_NONE;
   4947	insert.ins_tree_depth = depth;
   4948
   4949	insert_range = le32_to_cpu(split_rec.e_cpos) +
   4950		le16_to_cpu(split_rec.e_leaf_clusters);
   4951	rec_range = le32_to_cpu(rec.e_cpos) +
   4952		le16_to_cpu(rec.e_leaf_clusters);
   4953
   4954	if (split_rec.e_cpos == rec.e_cpos) {
   4955		insert.ins_split = SPLIT_LEFT;
   4956	} else if (insert_range == rec_range) {
   4957		insert.ins_split = SPLIT_RIGHT;
   4958	} else {
   4959		/*
   4960		 * Left/right split. We fake this as a right split
   4961		 * first and then make a second pass as a left split.
   4962		 */
   4963		insert.ins_split = SPLIT_RIGHT;
   4964
   4965		ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
   4966					   &tmprec, insert_range, &rec);
   4967
   4968		split_rec = tmprec;
   4969
   4970		BUG_ON(do_leftright);
   4971		do_leftright = 1;
   4972	}
   4973
   4974	ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
   4975	if (ret) {
   4976		mlog_errno(ret);
   4977		goto out;
   4978	}
   4979
   4980	if (do_leftright == 1) {
   4981		u32 cpos;
   4982		struct ocfs2_extent_list *el;
   4983
   4984		do_leftright++;
   4985		split_rec = *orig_split_rec;
   4986
   4987		ocfs2_reinit_path(path, 1);
   4988
   4989		cpos = le32_to_cpu(split_rec.e_cpos);
   4990		ret = ocfs2_find_path(et->et_ci, path, cpos);
   4991		if (ret) {
   4992			mlog_errno(ret);
   4993			goto out;
   4994		}
   4995
   4996		el = path_leaf_el(path);
   4997		split_index = ocfs2_search_extent_list(el, cpos);
   4998		if (split_index == -1) {
   4999			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
   5000				    "Owner %llu has an extent at cpos %u which can no longer be found\n",
   5001				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   5002				    cpos);
   5003			ret = -EROFS;
   5004			goto out;
   5005		}
   5006		goto leftright;
   5007	}
   5008out:
   5009
   5010	return ret;
   5011}
   5012
   5013static int ocfs2_replace_extent_rec(handle_t *handle,
   5014				    struct ocfs2_extent_tree *et,
   5015				    struct ocfs2_path *path,
   5016				    struct ocfs2_extent_list *el,
   5017				    int split_index,
   5018				    struct ocfs2_extent_rec *split_rec)
   5019{
   5020	int ret;
   5021
   5022	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
   5023					   path_num_items(path) - 1);
   5024	if (ret) {
   5025		mlog_errno(ret);
   5026		goto out;
   5027	}
   5028
   5029	el->l_recs[split_index] = *split_rec;
   5030
   5031	ocfs2_journal_dirty(handle, path_leaf_bh(path));
   5032out:
   5033	return ret;
   5034}
   5035
   5036/*
   5037 * Split part or all of the extent record at split_index in the leaf
   5038 * pointed to by path. Merge with the contiguous extent record if needed.
   5039 *
   5040 * Care is taken to handle contiguousness so as to not grow the tree.
   5041 *
   5042 * meta_ac is not strictly necessary - we only truly need it if growth
   5043 * of the tree is required. All other cases will degrade into a less
   5044 * optimal tree layout.
   5045 *
   5046 * last_eb_bh should be the rightmost leaf block for any extent
   5047 * btree. Since a split may grow the tree or a merge might shrink it,
   5048 * the caller cannot trust the contents of that buffer after this call.
   5049 *
   5050 * This code is optimized for readability - several passes might be
   5051 * made over certain portions of the tree. All of those blocks will
   5052 * have been brought into cache (and pinned via the journal), so the
   5053 * extra overhead is not expressed in terms of disk reads.
   5054 */
   5055int ocfs2_split_extent(handle_t *handle,
   5056		       struct ocfs2_extent_tree *et,
   5057		       struct ocfs2_path *path,
   5058		       int split_index,
   5059		       struct ocfs2_extent_rec *split_rec,
   5060		       struct ocfs2_alloc_context *meta_ac,
   5061		       struct ocfs2_cached_dealloc_ctxt *dealloc)
   5062{
   5063	int ret = 0;
   5064	struct ocfs2_extent_list *el = path_leaf_el(path);
   5065	struct buffer_head *last_eb_bh = NULL;
   5066	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
   5067	struct ocfs2_merge_ctxt ctxt;
   5068
   5069	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
   5070	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
   5071	     (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
   5072		ret = -EIO;
   5073		mlog_errno(ret);
   5074		goto out;
   5075	}
   5076
   5077	ret = ocfs2_figure_merge_contig_type(et, path, el,
   5078					     split_index,
   5079					     split_rec,
   5080					     &ctxt);
   5081	if (ret) {
   5082		mlog_errno(ret);
   5083		goto out;
   5084	}
   5085
   5086	/*
   5087	 * The core merge / split code wants to know how much room is
   5088	 * left in this allocation tree, so we pass the
   5089	 * rightmost extent list.
   5090	 */
   5091	if (path->p_tree_depth) {
   5092		ret = ocfs2_read_extent_block(et->et_ci,
   5093					      ocfs2_et_get_last_eb_blk(et),
   5094					      &last_eb_bh);
   5095		if (ret) {
   5096			mlog_errno(ret);
   5097			goto out;
   5098		}
   5099	}
   5100
   5101	if (rec->e_cpos == split_rec->e_cpos &&
   5102	    rec->e_leaf_clusters == split_rec->e_leaf_clusters)
   5103		ctxt.c_split_covers_rec = 1;
   5104	else
   5105		ctxt.c_split_covers_rec = 0;
   5106
   5107	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
   5108
   5109	trace_ocfs2_split_extent(split_index, ctxt.c_contig_type,
   5110				 ctxt.c_has_empty_extent,
   5111				 ctxt.c_split_covers_rec);
   5112
   5113	if (ctxt.c_contig_type == CONTIG_NONE) {
   5114		if (ctxt.c_split_covers_rec)
   5115			ret = ocfs2_replace_extent_rec(handle, et, path, el,
   5116						       split_index, split_rec);
   5117		else
   5118			ret = ocfs2_split_and_insert(handle, et, path,
   5119						     &last_eb_bh, split_index,
   5120						     split_rec, meta_ac);
   5121		if (ret)
   5122			mlog_errno(ret);
   5123	} else {
   5124		ret = ocfs2_try_to_merge_extent(handle, et, path,
   5125						split_index, split_rec,
   5126						dealloc, &ctxt);
   5127		if (ret)
   5128			mlog_errno(ret);
   5129	}
   5130
   5131out:
   5132	brelse(last_eb_bh);
   5133	return ret;
   5134}
   5135
   5136/*
   5137 * Change the flags of the already-existing extent at cpos for len clusters.
   5138 *
   5139 * new_flags: the flags we want to set.
   5140 * clear_flags: the flags we want to clear.
   5141 * phys: the new physical offset we want this new extent starts from.
   5142 *
   5143 * If the existing extent is larger than the request, initiate a
   5144 * split. An attempt will be made at merging with adjacent extents.
   5145 *
   5146 * The caller is responsible for passing down meta_ac if we'll need it.
   5147 */
   5148int ocfs2_change_extent_flag(handle_t *handle,
   5149			     struct ocfs2_extent_tree *et,
   5150			     u32 cpos, u32 len, u32 phys,
   5151			     struct ocfs2_alloc_context *meta_ac,
   5152			     struct ocfs2_cached_dealloc_ctxt *dealloc,
   5153			     int new_flags, int clear_flags)
   5154{
   5155	int ret, index;
   5156	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
   5157	u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
   5158	struct ocfs2_extent_rec split_rec;
   5159	struct ocfs2_path *left_path = NULL;
   5160	struct ocfs2_extent_list *el;
   5161	struct ocfs2_extent_rec *rec;
   5162
   5163	left_path = ocfs2_new_path_from_et(et);
   5164	if (!left_path) {
   5165		ret = -ENOMEM;
   5166		mlog_errno(ret);
   5167		goto out;
   5168	}
   5169
   5170	ret = ocfs2_find_path(et->et_ci, left_path, cpos);
   5171	if (ret) {
   5172		mlog_errno(ret);
   5173		goto out;
   5174	}
   5175	el = path_leaf_el(left_path);
   5176
   5177	index = ocfs2_search_extent_list(el, cpos);
   5178	if (index == -1) {
   5179		ocfs2_error(sb,
   5180			    "Owner %llu has an extent at cpos %u which can no longer be found\n",
   5181			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   5182			    cpos);
   5183		ret = -EROFS;
   5184		goto out;
   5185	}
   5186
   5187	ret = -EIO;
   5188	rec = &el->l_recs[index];
   5189	if (new_flags && (rec->e_flags & new_flags)) {
   5190		mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
   5191		     "extent that already had them\n",
   5192		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   5193		     new_flags);
   5194		goto out;
   5195	}
   5196
   5197	if (clear_flags && !(rec->e_flags & clear_flags)) {
   5198		mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
   5199		     "extent that didn't have them\n",
   5200		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   5201		     clear_flags);
   5202		goto out;
   5203	}
   5204
   5205	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
   5206	split_rec.e_cpos = cpu_to_le32(cpos);
   5207	split_rec.e_leaf_clusters = cpu_to_le16(len);
   5208	split_rec.e_blkno = cpu_to_le64(start_blkno);
   5209	split_rec.e_flags = rec->e_flags;
   5210	if (new_flags)
   5211		split_rec.e_flags |= new_flags;
   5212	if (clear_flags)
   5213		split_rec.e_flags &= ~clear_flags;
   5214
   5215	ret = ocfs2_split_extent(handle, et, left_path,
   5216				 index, &split_rec, meta_ac,
   5217				 dealloc);
   5218	if (ret)
   5219		mlog_errno(ret);
   5220
   5221out:
   5222	ocfs2_free_path(left_path);
   5223	return ret;
   5224
   5225}
   5226
   5227/*
   5228 * Mark the already-existing extent at cpos as written for len clusters.
   5229 * This removes the unwritten extent flag.
   5230 *
   5231 * If the existing extent is larger than the request, initiate a
   5232 * split. An attempt will be made at merging with adjacent extents.
   5233 *
   5234 * The caller is responsible for passing down meta_ac if we'll need it.
   5235 */
   5236int ocfs2_mark_extent_written(struct inode *inode,
   5237			      struct ocfs2_extent_tree *et,
   5238			      handle_t *handle, u32 cpos, u32 len, u32 phys,
   5239			      struct ocfs2_alloc_context *meta_ac,
   5240			      struct ocfs2_cached_dealloc_ctxt *dealloc)
   5241{
   5242	int ret;
   5243
   5244	trace_ocfs2_mark_extent_written(
   5245		(unsigned long long)OCFS2_I(inode)->ip_blkno,
   5246		cpos, len, phys);
   5247
   5248	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
   5249		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n",
   5250			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
   5251		ret = -EROFS;
   5252		goto out;
   5253	}
   5254
   5255	/*
   5256	 * XXX: This should be fixed up so that we just re-insert the
   5257	 * next extent records.
   5258	 */
   5259	ocfs2_et_extent_map_truncate(et, 0);
   5260
   5261	ret = ocfs2_change_extent_flag(handle, et, cpos,
   5262				       len, phys, meta_ac, dealloc,
   5263				       0, OCFS2_EXT_UNWRITTEN);
   5264	if (ret)
   5265		mlog_errno(ret);
   5266
   5267out:
   5268	return ret;
   5269}
   5270
   5271static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
   5272			    struct ocfs2_path *path,
   5273			    int index, u32 new_range,
   5274			    struct ocfs2_alloc_context *meta_ac)
   5275{
   5276	int ret, depth, credits;
   5277	struct buffer_head *last_eb_bh = NULL;
   5278	struct ocfs2_extent_block *eb;
   5279	struct ocfs2_extent_list *rightmost_el, *el;
   5280	struct ocfs2_extent_rec split_rec;
   5281	struct ocfs2_extent_rec *rec;
   5282	struct ocfs2_insert_type insert;
   5283
   5284	/*
   5285	 * Setup the record to split before we grow the tree.
   5286	 */
   5287	el = path_leaf_el(path);
   5288	rec = &el->l_recs[index];
   5289	ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
   5290				   &split_rec, new_range, rec);
   5291
   5292	depth = path->p_tree_depth;
   5293	if (depth > 0) {
   5294		ret = ocfs2_read_extent_block(et->et_ci,
   5295					      ocfs2_et_get_last_eb_blk(et),
   5296					      &last_eb_bh);
   5297		if (ret < 0) {
   5298			mlog_errno(ret);
   5299			goto out;
   5300		}
   5301
   5302		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
   5303		rightmost_el = &eb->h_list;
   5304	} else
   5305		rightmost_el = path_leaf_el(path);
   5306
   5307	credits = path->p_tree_depth +
   5308		  ocfs2_extend_meta_needed(et->et_root_el);
   5309	ret = ocfs2_extend_trans(handle, credits);
   5310	if (ret) {
   5311		mlog_errno(ret);
   5312		goto out;
   5313	}
   5314
   5315	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
   5316	    le16_to_cpu(rightmost_el->l_count)) {
   5317		ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
   5318				      meta_ac);
   5319		if (ret) {
   5320			mlog_errno(ret);
   5321			goto out;
   5322		}
   5323	}
   5324
   5325	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
   5326	insert.ins_appending = APPEND_NONE;
   5327	insert.ins_contig = CONTIG_NONE;
   5328	insert.ins_split = SPLIT_RIGHT;
   5329	insert.ins_tree_depth = depth;
   5330
   5331	ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
   5332	if (ret)
   5333		mlog_errno(ret);
   5334
   5335out:
   5336	brelse(last_eb_bh);
   5337	return ret;
   5338}
   5339
   5340static int ocfs2_truncate_rec(handle_t *handle,
   5341			      struct ocfs2_extent_tree *et,
   5342			      struct ocfs2_path *path, int index,
   5343			      struct ocfs2_cached_dealloc_ctxt *dealloc,
   5344			      u32 cpos, u32 len)
   5345{
   5346	int ret;
   5347	u32 left_cpos, rec_range, trunc_range;
   5348	int is_rightmost_tree_rec = 0;
   5349	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
   5350	struct ocfs2_path *left_path = NULL;
   5351	struct ocfs2_extent_list *el = path_leaf_el(path);
   5352	struct ocfs2_extent_rec *rec;
   5353	struct ocfs2_extent_block *eb;
   5354
   5355	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
   5356		/* extend credit for ocfs2_remove_rightmost_path */
   5357		ret = ocfs2_extend_rotate_transaction(handle, 0,
   5358				jbd2_handle_buffer_credits(handle),
   5359				path);
   5360		if (ret) {
   5361			mlog_errno(ret);
   5362			goto out;
   5363		}
   5364
   5365		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
   5366		if (ret) {
   5367			mlog_errno(ret);
   5368			goto out;
   5369		}
   5370
   5371		index--;
   5372	}
   5373
   5374	if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
   5375	    path->p_tree_depth) {
   5376		/*
   5377		 * Check whether this is the rightmost tree record. If
   5378		 * we remove all of this record or part of its right
   5379		 * edge then an update of the record lengths above it
   5380		 * will be required.
   5381		 */
   5382		eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
   5383		if (eb->h_next_leaf_blk == 0)
   5384			is_rightmost_tree_rec = 1;
   5385	}
   5386
   5387	rec = &el->l_recs[index];
   5388	if (index == 0 && path->p_tree_depth &&
   5389	    le32_to_cpu(rec->e_cpos) == cpos) {
   5390		/*
   5391		 * Changing the leftmost offset (via partial or whole
   5392		 * record truncate) of an interior (or rightmost) path
   5393		 * means we have to update the subtree that is formed
   5394		 * by this leaf and the one to it's left.
   5395		 *
   5396		 * There are two cases we can skip:
   5397		 *   1) Path is the leftmost one in our btree.
   5398		 *   2) The leaf is rightmost and will be empty after
   5399		 *      we remove the extent record - the rotate code
   5400		 *      knows how to update the newly formed edge.
   5401		 */
   5402
   5403		ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
   5404		if (ret) {
   5405			mlog_errno(ret);
   5406			goto out;
   5407		}
   5408
   5409		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
   5410			left_path = ocfs2_new_path_from_path(path);
   5411			if (!left_path) {
   5412				ret = -ENOMEM;
   5413				mlog_errno(ret);
   5414				goto out;
   5415			}
   5416
   5417			ret = ocfs2_find_path(et->et_ci, left_path,
   5418					      left_cpos);
   5419			if (ret) {
   5420				mlog_errno(ret);
   5421				goto out;
   5422			}
   5423		}
   5424	}
   5425
   5426	ret = ocfs2_extend_rotate_transaction(handle, 0,
   5427					jbd2_handle_buffer_credits(handle),
   5428					path);
   5429	if (ret) {
   5430		mlog_errno(ret);
   5431		goto out;
   5432	}
   5433
   5434	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
   5435	if (ret) {
   5436		mlog_errno(ret);
   5437		goto out;
   5438	}
   5439
   5440	ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
   5441	if (ret) {
   5442		mlog_errno(ret);
   5443		goto out;
   5444	}
   5445
   5446	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
   5447	trunc_range = cpos + len;
   5448
   5449	if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
   5450		int next_free;
   5451
   5452		memset(rec, 0, sizeof(*rec));
   5453		ocfs2_cleanup_merge(el, index);
   5454
   5455		next_free = le16_to_cpu(el->l_next_free_rec);
   5456		if (is_rightmost_tree_rec && next_free > 1) {
   5457			/*
   5458			 * We skip the edge update if this path will
   5459			 * be deleted by the rotate code.
   5460			 */
   5461			rec = &el->l_recs[next_free - 1];
   5462			ocfs2_adjust_rightmost_records(handle, et, path,
   5463						       rec);
   5464		}
   5465	} else if (le32_to_cpu(rec->e_cpos) == cpos) {
   5466		/* Remove leftmost portion of the record. */
   5467		le32_add_cpu(&rec->e_cpos, len);
   5468		le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
   5469		le16_add_cpu(&rec->e_leaf_clusters, -len);
   5470	} else if (rec_range == trunc_range) {
   5471		/* Remove rightmost portion of the record */
   5472		le16_add_cpu(&rec->e_leaf_clusters, -len);
   5473		if (is_rightmost_tree_rec)
   5474			ocfs2_adjust_rightmost_records(handle, et, path, rec);
   5475	} else {
   5476		/* Caller should have trapped this. */
   5477		mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
   5478		     "(%u, %u)\n",
   5479		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   5480		     le32_to_cpu(rec->e_cpos),
   5481		     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
   5482		BUG();
   5483	}
   5484
   5485	if (left_path) {
   5486		int subtree_index;
   5487
   5488		subtree_index = ocfs2_find_subtree_root(et, left_path, path);
   5489		ocfs2_complete_edge_insert(handle, left_path, path,
   5490					   subtree_index);
   5491	}
   5492
   5493	ocfs2_journal_dirty(handle, path_leaf_bh(path));
   5494
   5495	ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
   5496	if (ret)
   5497		mlog_errno(ret);
   5498
   5499out:
   5500	ocfs2_free_path(left_path);
   5501	return ret;
   5502}
   5503
   5504int ocfs2_remove_extent(handle_t *handle,
   5505			struct ocfs2_extent_tree *et,
   5506			u32 cpos, u32 len,
   5507			struct ocfs2_alloc_context *meta_ac,
   5508			struct ocfs2_cached_dealloc_ctxt *dealloc)
   5509{
   5510	int ret, index;
   5511	u32 rec_range, trunc_range;
   5512	struct ocfs2_extent_rec *rec;
   5513	struct ocfs2_extent_list *el;
   5514	struct ocfs2_path *path = NULL;
   5515
   5516	/*
   5517	 * XXX: Why are we truncating to 0 instead of wherever this
   5518	 * affects us?
   5519	 */
   5520	ocfs2_et_extent_map_truncate(et, 0);
   5521
   5522	path = ocfs2_new_path_from_et(et);
   5523	if (!path) {
   5524		ret = -ENOMEM;
   5525		mlog_errno(ret);
   5526		goto out;
   5527	}
   5528
   5529	ret = ocfs2_find_path(et->et_ci, path, cpos);
   5530	if (ret) {
   5531		mlog_errno(ret);
   5532		goto out;
   5533	}
   5534
   5535	el = path_leaf_el(path);
   5536	index = ocfs2_search_extent_list(el, cpos);
   5537	if (index == -1) {
   5538		ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
   5539			    "Owner %llu has an extent at cpos %u which can no longer be found\n",
   5540			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   5541			    cpos);
   5542		ret = -EROFS;
   5543		goto out;
   5544	}
   5545
   5546	/*
   5547	 * We have 3 cases of extent removal:
   5548	 *   1) Range covers the entire extent rec
   5549	 *   2) Range begins or ends on one edge of the extent rec
   5550	 *   3) Range is in the middle of the extent rec (no shared edges)
   5551	 *
   5552	 * For case 1 we remove the extent rec and left rotate to
   5553	 * fill the hole.
   5554	 *
   5555	 * For case 2 we just shrink the existing extent rec, with a
   5556	 * tree update if the shrinking edge is also the edge of an
   5557	 * extent block.
   5558	 *
   5559	 * For case 3 we do a right split to turn the extent rec into
   5560	 * something case 2 can handle.
   5561	 */
   5562	rec = &el->l_recs[index];
   5563	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
   5564	trunc_range = cpos + len;
   5565
   5566	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
   5567
   5568	trace_ocfs2_remove_extent(
   5569		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   5570		cpos, len, index, le32_to_cpu(rec->e_cpos),
   5571		ocfs2_rec_clusters(el, rec));
   5572
   5573	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
   5574		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
   5575					 cpos, len);
   5576		if (ret) {
   5577			mlog_errno(ret);
   5578			goto out;
   5579		}
   5580	} else {
   5581		ret = ocfs2_split_tree(handle, et, path, index,
   5582				       trunc_range, meta_ac);
   5583		if (ret) {
   5584			mlog_errno(ret);
   5585			goto out;
   5586		}
   5587
   5588		/*
   5589		 * The split could have manipulated the tree enough to
   5590		 * move the record location, so we have to look for it again.
   5591		 */
   5592		ocfs2_reinit_path(path, 1);
   5593
   5594		ret = ocfs2_find_path(et->et_ci, path, cpos);
   5595		if (ret) {
   5596			mlog_errno(ret);
   5597			goto out;
   5598		}
   5599
   5600		el = path_leaf_el(path);
   5601		index = ocfs2_search_extent_list(el, cpos);
   5602		if (index == -1) {
   5603			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
   5604				    "Owner %llu: split at cpos %u lost record\n",
   5605				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   5606				    cpos);
   5607			ret = -EROFS;
   5608			goto out;
   5609		}
   5610
   5611		/*
   5612		 * Double check our values here. If anything is fishy,
   5613		 * it's easier to catch it at the top level.
   5614		 */
   5615		rec = &el->l_recs[index];
   5616		rec_range = le32_to_cpu(rec->e_cpos) +
   5617			ocfs2_rec_clusters(el, rec);
   5618		if (rec_range != trunc_range) {
   5619			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
   5620				    "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n",
   5621				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
   5622				    cpos, len, le32_to_cpu(rec->e_cpos),
   5623				    ocfs2_rec_clusters(el, rec));
   5624			ret = -EROFS;
   5625			goto out;
   5626		}
   5627
   5628		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
   5629					 cpos, len);
   5630		if (ret)
   5631			mlog_errno(ret);
   5632	}
   5633
   5634out:
   5635	ocfs2_free_path(path);
   5636	return ret;
   5637}
   5638
   5639/*
   5640 * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
   5641 * same as ocfs2_lock_alloctors(), except for it accepts a blocks
   5642 * number to reserve some extra blocks, and it only handles meta
   5643 * data allocations.
   5644 *
   5645 * Currently, only ocfs2_remove_btree_range() uses it for truncating
   5646 * and punching holes.
   5647 */
   5648static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
   5649					      struct ocfs2_extent_tree *et,
   5650					      u32 extents_to_split,
   5651					      struct ocfs2_alloc_context **ac,
   5652					      int extra_blocks)
   5653{
   5654	int ret = 0, num_free_extents;
   5655	unsigned int max_recs_needed = 2 * extents_to_split;
   5656	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
   5657
   5658	*ac = NULL;
   5659
   5660	num_free_extents = ocfs2_num_free_extents(et);
   5661	if (num_free_extents < 0) {
   5662		ret = num_free_extents;
   5663		mlog_errno(ret);
   5664		goto out;
   5665	}
   5666
   5667	if (!num_free_extents ||
   5668	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
   5669		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
   5670
   5671	if (extra_blocks) {
   5672		ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
   5673		if (ret < 0) {
   5674			if (ret != -ENOSPC)
   5675				mlog_errno(ret);
   5676		}
   5677	}
   5678
   5679out:
   5680	if (ret) {
   5681		if (*ac) {
   5682			ocfs2_free_alloc_context(*ac);
   5683			*ac = NULL;
   5684		}
   5685	}
   5686
   5687	return ret;
   5688}
   5689
/*
 * Remove @len clusters starting at logical cluster @cpos from the extent
 * tree @et, then give up the physical clusters that start at @phys_cpos.
 *
 * @flags: if OCFS2_EXT_REFCOUNTED is set and @len is nonzero, refcount
 *	credits/blocks are prepared up front; when the physical block is
 *	nonzero the clusters are then released through
 *	ocfs2_decrease_refcount().  Without the flag they are appended to
 *	the truncate log instead.
 * @dealloc: handed to ocfs2_remove_extent() and ocfs2_decrease_refcount().
 * @refcount_loc: passed to ocfs2_lock_refcount_tree() and
 *	ocfs2_prepare_refcount_change_for_del().
 * @refcount_tree_locked: when true the refcount tree lock is neither
 *	taken nor dropped here.
 *
 * The truncate log inode lock is held from before the transaction is
 * started until after it is committed.
 *
 * Returns the result of the last step executed: 0 if every step
 * succeeded, otherwise the error from the step that failed.
 */
int ocfs2_remove_btree_range(struct inode *inode,
			     struct ocfs2_extent_tree *et,
			     u32 cpos, u32 phys_cpos, u32 len, int flags,
			     struct ocfs2_cached_dealloc_ctxt *dealloc,
			     u64 refcount_loc, bool refcount_tree_locked)
{
	int ret, credits = 0, extra_blocks = 0;
	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct inode *tl_inode = osb->osb_tl_inode;
	handle_t *handle;
	struct ocfs2_alloc_context *meta_ac = NULL;
	struct ocfs2_refcount_tree *ref_tree = NULL;

	if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
		BUG_ON(!ocfs2_is_refcount_inode(inode));

		/* ref_tree stays NULL (and is not unlocked at bail) when the
		 * caller already holds the lock. */
		if (!refcount_tree_locked) {
			ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
						       &ref_tree, NULL);
			if (ret) {
				mlog_errno(ret);
				goto bail;
			}
		}

		/* Fills in the extra journal credits and metadata blocks
		 * used below. */
		ret = ocfs2_prepare_refcount_change_for_del(inode,
							    refcount_loc,
							    phys_blkno,
							    len,
							    &credits,
							    &extra_blocks);
		if (ret < 0) {
			mlog_errno(ret);
			goto bail;
		}
	}

	ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
						 extra_blocks);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

	inode_lock(tl_inode);

	/* Make room in the truncate log before starting our transaction. */
	if (ocfs2_truncate_log_needs_flush(osb)) {
		ret = __ocfs2_flush_truncate_log(osb);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
	}

	handle = ocfs2_start_trans(osb,
			ocfs2_remove_extent_credits(osb->sb) + credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_et_root_journal_access(handle, et,
					   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/* NOTE(review): quota is released before ocfs2_remove_extent() runs,
	 * so it is not restored if the removal below fails - confirm this is
	 * intended. */
	dquot_free_space_nodirty(inode,
				  ocfs2_clusters_to_bytes(inode->i_sb, len));

	ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	ocfs2_et_update_clusters(et, -len);
	ocfs2_update_inode_fsync_trans(handle, inode, 1);

	ocfs2_journal_dirty(handle, et->et_root_bh);

	/* A zero physical block means there is nothing to release. */
	if (phys_blkno) {
		if (flags & OCFS2_EXT_REFCOUNTED)
			ret = ocfs2_decrease_refcount(inode, handle,
					ocfs2_blocks_to_clusters(osb->sb,
								 phys_blkno),
					len, meta_ac,
					dealloc, 1);
		else
			ret = ocfs2_truncate_log_append(osb, handle,
							phys_blkno, len);
		if (ret)
			mlog_errno(ret);

	}

	/* Unwind in reverse order of acquisition: transaction, truncate log
	 * inode lock, then the metadata reservation and refcount tree lock. */
out_commit:
	ocfs2_commit_trans(osb, handle);
out:
	inode_unlock(tl_inode);
bail:
	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);

	if (ref_tree)
		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);

	return ret;
}
   5802
   5803int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
   5804{
   5805	struct buffer_head *tl_bh = osb->osb_tl_bh;
   5806	struct ocfs2_dinode *di;
   5807	struct ocfs2_truncate_log *tl;
   5808
   5809	di = (struct ocfs2_dinode *) tl_bh->b_data;
   5810	tl = &di->id2.i_dealloc;
   5811
   5812	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
   5813			"slot %d, invalid truncate log parameters: used = "
   5814			"%u, count = %u\n", osb->slot_num,
   5815			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
   5816	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
   5817}
   5818
   5819static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
   5820					   unsigned int new_start)
   5821{
   5822	unsigned int tail_index;
   5823	unsigned int current_tail;
   5824
   5825	/* No records, nothing to coalesce */
   5826	if (!le16_to_cpu(tl->tl_used))
   5827		return 0;
   5828
   5829	tail_index = le16_to_cpu(tl->tl_used) - 1;
   5830	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
   5831	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
   5832
   5833	return current_tail == new_start;
   5834}
   5835
   5836int ocfs2_truncate_log_append(struct ocfs2_super *osb,
   5837			      handle_t *handle,
   5838			      u64 start_blk,
   5839			      unsigned int num_clusters)
   5840{
   5841	int status, index;
   5842	unsigned int start_cluster, tl_count;
   5843	struct inode *tl_inode = osb->osb_tl_inode;
   5844	struct buffer_head *tl_bh = osb->osb_tl_bh;
   5845	struct ocfs2_dinode *di;
   5846	struct ocfs2_truncate_log *tl;
   5847
   5848	BUG_ON(inode_trylock(tl_inode));
   5849
   5850	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
   5851
   5852	di = (struct ocfs2_dinode *) tl_bh->b_data;
   5853
   5854	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
   5855	 * by the underlying call to ocfs2_read_inode_block(), so any
   5856	 * corruption is a code bug */
   5857	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
   5858
   5859	tl = &di->id2.i_dealloc;
   5860	tl_count = le16_to_cpu(tl->tl_count);
   5861	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
   5862			tl_count == 0,
   5863			"Truncate record count on #%llu invalid "
   5864			"wanted %u, actual %u\n",
   5865			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
   5866			ocfs2_truncate_recs_per_inode(osb->sb),
   5867			le16_to_cpu(tl->tl_count));
   5868
   5869	/* Caller should have known to flush before calling us. */
   5870	index = le16_to_cpu(tl->tl_used);
   5871	if (index >= tl_count) {
   5872		status = -ENOSPC;
   5873		mlog_errno(status);
   5874		goto bail;
   5875	}
   5876
   5877	status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
   5878					 OCFS2_JOURNAL_ACCESS_WRITE);
   5879	if (status < 0) {
   5880		mlog_errno(status);
   5881		goto bail;
   5882	}
   5883
   5884	trace_ocfs2_truncate_log_append(
   5885		(unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index,
   5886		start_cluster, num_clusters);
   5887	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
   5888		/*
   5889		 * Move index back to the record we are coalescing with.
   5890		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
   5891		 */
   5892		index--;
   5893
   5894		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
   5895		trace_ocfs2_truncate_log_append(
   5896			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
   5897			index, le32_to_cpu(tl->tl_recs[index].t_start),
   5898			num_clusters);
   5899	} else {
   5900		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
   5901		tl->tl_used = cpu_to_le16(index + 1);
   5902	}
   5903	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
   5904
   5905	ocfs2_journal_dirty(handle, tl_bh);
   5906
   5907	osb->truncated_clusters += num_clusters;
   5908bail:
   5909	return status;
   5910}
   5911
   5912static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
   5913					 struct inode *data_alloc_inode,
   5914					 struct buffer_head *data_alloc_bh)
   5915{
   5916	int status = 0;
   5917	int i;
   5918	unsigned int num_clusters;
   5919	u64 start_blk;
   5920	struct ocfs2_truncate_rec rec;
   5921	struct ocfs2_dinode *di;
   5922	struct ocfs2_truncate_log *tl;
   5923	struct inode *tl_inode = osb->osb_tl_inode;
   5924	struct buffer_head *tl_bh = osb->osb_tl_bh;
   5925	handle_t *handle;
   5926
   5927	di = (struct ocfs2_dinode *) tl_bh->b_data;
   5928	tl = &di->id2.i_dealloc;
   5929	i = le16_to_cpu(tl->tl_used) - 1;
   5930	while (i >= 0) {
   5931		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
   5932		if (IS_ERR(handle)) {
   5933			status = PTR_ERR(handle);
   5934			mlog_errno(status);
   5935			goto bail;
   5936		}
   5937
   5938		/* Caller has given us at least enough credits to
   5939		 * update the truncate log dinode */
   5940		status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
   5941						 OCFS2_JOURNAL_ACCESS_WRITE);
   5942		if (status < 0) {
   5943			ocfs2_commit_trans(osb, handle);
   5944			mlog_errno(status);
   5945			goto bail;
   5946		}
   5947
   5948		tl->tl_used = cpu_to_le16(i);
   5949
   5950		ocfs2_journal_dirty(handle, tl_bh);
   5951
   5952		rec = tl->tl_recs[i];
   5953		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
   5954						    le32_to_cpu(rec.t_start));
   5955		num_clusters = le32_to_cpu(rec.t_clusters);
   5956
   5957		/* if start_blk is not set, we ignore the record as
   5958		 * invalid. */
   5959		if (start_blk) {
   5960			trace_ocfs2_replay_truncate_records(
   5961				(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
   5962				i, le32_to_cpu(rec.t_start), num_clusters);
   5963
   5964			status = ocfs2_free_clusters(handle, data_alloc_inode,
   5965						     data_alloc_bh, start_blk,
   5966						     num_clusters);
   5967			if (status < 0) {
   5968				ocfs2_commit_trans(osb, handle);
   5969				mlog_errno(status);
   5970				goto bail;
   5971			}
   5972		}
   5973
   5974		ocfs2_commit_trans(osb, handle);
   5975		i--;
   5976	}
   5977
   5978	osb->truncated_clusters = 0;
   5979
   5980bail:
   5981	return status;
   5982}
   5983
   5984/* Expects you to already be holding tl_inode->i_rwsem */
   5985int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
   5986{
   5987	int status;
   5988	unsigned int num_to_flush;
   5989	struct inode *tl_inode = osb->osb_tl_inode;
   5990	struct inode *data_alloc_inode = NULL;
   5991	struct buffer_head *tl_bh = osb->osb_tl_bh;
   5992	struct buffer_head *data_alloc_bh = NULL;
   5993	struct ocfs2_dinode *di;
   5994	struct ocfs2_truncate_log *tl;
   5995	struct ocfs2_journal *journal = osb->journal;
   5996
   5997	BUG_ON(inode_trylock(tl_inode));
   5998
   5999	di = (struct ocfs2_dinode *) tl_bh->b_data;
   6000
   6001	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
   6002	 * by the underlying call to ocfs2_read_inode_block(), so any
   6003	 * corruption is a code bug */
   6004	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
   6005
   6006	tl = &di->id2.i_dealloc;
   6007	num_to_flush = le16_to_cpu(tl->tl_used);
   6008	trace_ocfs2_flush_truncate_log(
   6009		(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
   6010		num_to_flush);
   6011	if (!num_to_flush) {
   6012		status = 0;
   6013		goto out;
   6014	}
   6015
   6016	/* Appending truncate log(TA) and flushing truncate log(TF) are
   6017	 * two separated transactions. They can be both committed but not
   6018	 * checkpointed. If crash occurs then, both two transaction will be
   6019	 * replayed with several already released to global bitmap clusters.
   6020	 * Then truncate log will be replayed resulting in cluster double free.
   6021	 */
   6022	jbd2_journal_lock_updates(journal->j_journal);
   6023	status = jbd2_journal_flush(journal->j_journal, 0);
   6024	jbd2_journal_unlock_updates(journal->j_journal);
   6025	if (status < 0) {
   6026		mlog_errno(status);
   6027		goto out;
   6028	}
   6029
   6030	data_alloc_inode = ocfs2_get_system_file_inode(osb,
   6031						       GLOBAL_BITMAP_SYSTEM_INODE,
   6032						       OCFS2_INVALID_SLOT);
   6033	if (!data_alloc_inode) {
   6034		status = -EINVAL;
   6035		mlog(ML_ERROR, "Could not get bitmap inode!\n");
   6036		goto out;
   6037	}
   6038
   6039	inode_lock(data_alloc_inode);
   6040
   6041	status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
   6042	if (status < 0) {
   6043		mlog_errno(status);
   6044		goto out_mutex;
   6045	}
   6046
   6047	status = ocfs2_replay_truncate_records(osb, data_alloc_inode,
   6048					       data_alloc_bh);
   6049	if (status < 0)
   6050		mlog_errno(status);
   6051
   6052	brelse(data_alloc_bh);
   6053	ocfs2_inode_unlock(data_alloc_inode, 1);
   6054
   6055out_mutex:
   6056	inode_unlock(data_alloc_inode);
   6057	iput(data_alloc_inode);
   6058
   6059out:
   6060	return status;
   6061}
   6062
   6063int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
   6064{
   6065	int status;
   6066	struct inode *tl_inode = osb->osb_tl_inode;
   6067
   6068	inode_lock(tl_inode);
   6069	status = __ocfs2_flush_truncate_log(osb);
   6070	inode_unlock(tl_inode);
   6071
   6072	return status;
   6073}
   6074
   6075static void ocfs2_truncate_log_worker(struct work_struct *work)
   6076{
   6077	int status;
   6078	struct ocfs2_super *osb =
   6079		container_of(work, struct ocfs2_super,
   6080			     osb_truncate_log_wq.work);
   6081
   6082	status = ocfs2_flush_truncate_log(osb);
   6083	if (status < 0)
   6084		mlog_errno(status);
   6085	else
   6086		ocfs2_init_steal_slots(osb);
   6087}
   6088
   6089#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
   6090void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
   6091				       int cancel)
   6092{
   6093	if (osb->osb_tl_inode &&
   6094			atomic_read(&osb->osb_tl_disable) == 0) {
   6095		/* We want to push off log flushes while truncates are
   6096		 * still running. */
   6097		if (cancel)
   6098			cancel_delayed_work(&osb->osb_truncate_log_wq);
   6099
   6100		queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
   6101				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
   6102	}
   6103}
   6104
   6105/*
   6106 * Try to flush truncate logs if we can free enough clusters from it.
   6107 * As for return value, "< 0" means error, "0" no space and "1" means
   6108 * we have freed enough spaces and let the caller try to allocate again.
   6109 */
   6110int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
   6111					unsigned int needed)
   6112{
   6113	tid_t target;
   6114	int ret = 0;
   6115	unsigned int truncated_clusters;
   6116
   6117	inode_lock(osb->osb_tl_inode);
   6118	truncated_clusters = osb->truncated_clusters;
   6119	inode_unlock(osb->osb_tl_inode);
   6120
   6121	/*
   6122	 * Check whether we can succeed in allocating if we free
   6123	 * the truncate log.
   6124	 */
   6125	if (truncated_clusters < needed)
   6126		goto out;
   6127
   6128	ret = ocfs2_flush_truncate_log(osb);
   6129	if (ret) {
   6130		mlog_errno(ret);
   6131		goto out;
   6132	}
   6133
   6134	if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
   6135		jbd2_log_wait_commit(osb->journal->j_journal, target);
   6136		ret = 1;
   6137	}
   6138out:
   6139	return ret;
   6140}
   6141
   6142static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
   6143				       int slot_num,
   6144				       struct inode **tl_inode,
   6145				       struct buffer_head **tl_bh)
   6146{
   6147	int status;
   6148	struct inode *inode = NULL;
   6149	struct buffer_head *bh = NULL;
   6150
   6151	inode = ocfs2_get_system_file_inode(osb,
   6152					   TRUNCATE_LOG_SYSTEM_INODE,
   6153					   slot_num);
   6154	if (!inode) {
   6155		status = -EINVAL;
   6156		mlog(ML_ERROR, "Could not get load truncate log inode!\n");
   6157		goto bail;
   6158	}
   6159
   6160	status = ocfs2_read_inode_block(inode, &bh);
   6161	if (status < 0) {
   6162		iput(inode);
   6163		mlog_errno(status);
   6164		goto bail;
   6165	}
   6166
   6167	*tl_inode = inode;
   6168	*tl_bh    = bh;
   6169bail:
   6170	return status;
   6171}
   6172
   6173/* called during the 1st stage of node recovery. we stamp a clean
   6174 * truncate log and pass back a copy for processing later. if the
   6175 * truncate log does not require processing, a *tl_copy is set to
   6176 * NULL. */
   6177int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
   6178				      int slot_num,
   6179				      struct ocfs2_dinode **tl_copy)
   6180{
   6181	int status;
   6182	struct inode *tl_inode = NULL;
   6183	struct buffer_head *tl_bh = NULL;
   6184	struct ocfs2_dinode *di;
   6185	struct ocfs2_truncate_log *tl;
   6186
   6187	*tl_copy = NULL;
   6188
   6189	trace_ocfs2_begin_truncate_log_recovery(slot_num);
   6190
   6191	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
   6192	if (status < 0) {
   6193		mlog_errno(status);
   6194		goto bail;
   6195	}
   6196
   6197	di = (struct ocfs2_dinode *) tl_bh->b_data;
   6198
   6199	/* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
   6200	 * validated by the underlying call to ocfs2_read_inode_block(),
   6201	 * so any corruption is a code bug */
   6202	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
   6203
   6204	tl = &di->id2.i_dealloc;
   6205	if (le16_to_cpu(tl->tl_used)) {
   6206		trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));
   6207
   6208		/*
   6209		 * Assuming the write-out below goes well, this copy will be
   6210		 * passed back to recovery for processing.
   6211		 */
   6212		*tl_copy = kmemdup(tl_bh->b_data, tl_bh->b_size, GFP_KERNEL);
   6213		if (!(*tl_copy)) {
   6214			status = -ENOMEM;
   6215			mlog_errno(status);
   6216			goto bail;
   6217		}
   6218
   6219		/* All we need to do to clear the truncate log is set
   6220		 * tl_used. */
   6221		tl->tl_used = 0;
   6222
   6223		ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
   6224		status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
   6225		if (status < 0) {
   6226			mlog_errno(status);
   6227			goto bail;
   6228		}
   6229	}
   6230
   6231bail:
   6232	iput(tl_inode);
   6233	brelse(tl_bh);
   6234
   6235	if (status < 0) {
   6236		kfree(*tl_copy);
   6237		*tl_copy = NULL;
   6238		mlog_errno(status);
   6239	}
   6240
   6241	return status;
   6242}
   6243
   6244int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
   6245					 struct ocfs2_dinode *tl_copy)
   6246{
   6247	int status = 0;
   6248	int i;
   6249	unsigned int clusters, num_recs, start_cluster;
   6250	u64 start_blk;
   6251	handle_t *handle;
   6252	struct inode *tl_inode = osb->osb_tl_inode;
   6253	struct ocfs2_truncate_log *tl;
   6254
   6255	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
   6256		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
   6257		return -EINVAL;
   6258	}
   6259
   6260	tl = &tl_copy->id2.i_dealloc;
   6261	num_recs = le16_to_cpu(tl->tl_used);
   6262	trace_ocfs2_complete_truncate_log_recovery(
   6263		(unsigned long long)le64_to_cpu(tl_copy->i_blkno),
   6264		num_recs);
   6265
   6266	inode_lock(tl_inode);
   6267	for(i = 0; i < num_recs; i++) {
   6268		if (ocfs2_truncate_log_needs_flush(osb)) {
   6269			status = __ocfs2_flush_truncate_log(osb);
   6270			if (status < 0) {
   6271				mlog_errno(status);
   6272				goto bail_up;
   6273			}
   6274		}
   6275
   6276		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
   6277		if (IS_ERR(handle)) {
   6278			status = PTR_ERR(handle);
   6279			mlog_errno(status);
   6280			goto bail_up;
   6281		}
   6282
   6283		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
   6284		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
   6285		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
   6286
   6287		status = ocfs2_truncate_log_append(osb, handle,
   6288						   start_blk, clusters);
   6289		ocfs2_commit_trans(osb, handle);
   6290		if (status < 0) {
   6291			mlog_errno(status);
   6292			goto bail_up;
   6293		}
   6294	}
   6295
   6296bail_up:
   6297	inode_unlock(tl_inode);
   6298
   6299	return status;
   6300}
   6301
   6302void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
   6303{
   6304	int status;
   6305	struct inode *tl_inode = osb->osb_tl_inode;
   6306
   6307	atomic_set(&osb->osb_tl_disable, 1);
   6308
   6309	if (tl_inode) {
   6310		cancel_delayed_work(&osb->osb_truncate_log_wq);
   6311		flush_workqueue(osb->ocfs2_wq);
   6312
   6313		status = ocfs2_flush_truncate_log(osb);
   6314		if (status < 0)
   6315			mlog_errno(status);
   6316
   6317		brelse(osb->osb_tl_bh);
   6318		iput(osb->osb_tl_inode);
   6319	}
   6320}
   6321
   6322int ocfs2_truncate_log_init(struct ocfs2_super *osb)
   6323{
   6324	int status;
   6325	struct inode *tl_inode = NULL;
   6326	struct buffer_head *tl_bh = NULL;
   6327
   6328	status = ocfs2_get_truncate_log_info(osb,
   6329					     osb->slot_num,
   6330					     &tl_inode,
   6331					     &tl_bh);
   6332	if (status < 0)
   6333		mlog_errno(status);
   6334
   6335	/* ocfs2_truncate_log_shutdown keys on the existence of
   6336	 * osb->osb_tl_inode so we don't set any of the osb variables
   6337	 * until we're sure all is well. */
   6338	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
   6339			  ocfs2_truncate_log_worker);
   6340	atomic_set(&osb->osb_tl_disable, 0);
   6341	osb->osb_tl_bh    = tl_bh;
   6342	osb->osb_tl_inode = tl_inode;
   6343
   6344	return status;
   6345}
   6346
   6347/*
   6348 * Delayed de-allocation of suballocator blocks.
   6349 *
   6350 * Some sets of block de-allocations might involve multiple suballocator inodes.
   6351 *
   6352 * The locking for this can get extremely complicated, especially when
   6353 * the suballocator inodes to delete from aren't known until deep
   6354 * within an unrelated codepath.
   6355 *
   6356 * ocfs2_extent_block structures are a good example of this - an inode
   6357 * btree could have been grown by any number of nodes each allocating
   6358 * out of their own suballoc inode.
   6359 *
   6360 * These structures allow the delay of block de-allocation until a
   6361 * later time, when locking of multiple cluster inodes won't cause
   6362 * deadlock.
   6363 */
   6364
/*
 * Describe a single bit freed from a suballocator.  For the block
 * suballocators, it represents one block.  For the global cluster
 * allocator, it represents a run of clusters and free_bit holds the
 * number of clusters.
 */
struct ocfs2_cached_block_free {
	/* Next entry on the same singly linked list; NULL ends the list. */
	struct ocfs2_cached_block_free		*free_next;
	/* Suballocator group block.  When zero, the group is derived from
	 * free_blk and free_bit at free time. */
	u64					free_bg;
	/* Block number of the freed block, or the first block of the freed
	 * clusters for the global allocator. */
	u64					free_blk;
	/* Bit to clear in the suballocator, or the cluster count for the
	 * global allocator. */
	unsigned int				free_bit;
};
   6377
/*
 * All cached block frees that belong to one suballocator, identified by
 * system inode type and slot.  These lists are chained together from the
 * dealloc context's c_first_suballocator.
 */
struct ocfs2_per_slot_free_list {
	/* Next per-suballocator list in the dealloc context. */
	struct ocfs2_per_slot_free_list		*f_next_suballocator;
	/* System inode type of the suballocator the blocks came from. */
	int					f_inode_type;
	/* Slot number of that suballocator. */
	int					f_slot;
	/* Head of the list of cached frees; NULL when there are none. */
	struct ocfs2_cached_block_free		*f_first;
};
   6384
   6385static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
   6386				    int sysfile_type,
   6387				    int slot,
   6388				    struct ocfs2_cached_block_free *head)
   6389{
   6390	int ret;
   6391	u64 bg_blkno;
   6392	handle_t *handle;
   6393	struct inode *inode;
   6394	struct buffer_head *di_bh = NULL;
   6395	struct ocfs2_cached_block_free *tmp;
   6396
   6397	inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
   6398	if (!inode) {
   6399		ret = -EINVAL;
   6400		mlog_errno(ret);
   6401		goto out;
   6402	}
   6403
   6404	inode_lock(inode);
   6405
   6406	ret = ocfs2_inode_lock(inode, &di_bh, 1);
   6407	if (ret) {
   6408		mlog_errno(ret);
   6409		goto out_mutex;
   6410	}
   6411
   6412	while (head) {
   6413		if (head->free_bg)
   6414			bg_blkno = head->free_bg;
   6415		else
   6416			bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
   6417							      head->free_bit);
   6418		handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
   6419		if (IS_ERR(handle)) {
   6420			ret = PTR_ERR(handle);
   6421			mlog_errno(ret);
   6422			goto out_unlock;
   6423		}
   6424
   6425		trace_ocfs2_free_cached_blocks(
   6426		     (unsigned long long)head->free_blk, head->free_bit);
   6427
   6428		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
   6429					       head->free_bit, bg_blkno, 1);
   6430		if (ret)
   6431			mlog_errno(ret);
   6432
   6433		ocfs2_commit_trans(osb, handle);
   6434
   6435		tmp = head;
   6436		head = head->free_next;
   6437		kfree(tmp);
   6438	}
   6439
   6440out_unlock:
   6441	ocfs2_inode_unlock(inode, 1);
   6442	brelse(di_bh);
   6443out_mutex:
   6444	inode_unlock(inode);
   6445	iput(inode);
   6446out:
   6447	while(head) {
   6448		/* Premature exit may have left some dangling items. */
   6449		tmp = head;
   6450		head = head->free_next;
   6451		kfree(tmp);
   6452	}
   6453
   6454	return ret;
   6455}
   6456
   6457int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
   6458				u64 blkno, unsigned int bit)
   6459{
   6460	int ret = 0;
   6461	struct ocfs2_cached_block_free *item;
   6462
   6463	item = kzalloc(sizeof(*item), GFP_NOFS);
   6464	if (item == NULL) {
   6465		ret = -ENOMEM;
   6466		mlog_errno(ret);
   6467		return ret;
   6468	}
   6469
   6470	trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit);
   6471
   6472	item->free_blk = blkno;
   6473	item->free_bit = bit;
   6474	item->free_next = ctxt->c_global_allocator;
   6475
   6476	ctxt->c_global_allocator = item;
   6477	return ret;
   6478}
   6479
   6480static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
   6481				      struct ocfs2_cached_block_free *head)
   6482{
   6483	struct ocfs2_cached_block_free *tmp;
   6484	struct inode *tl_inode = osb->osb_tl_inode;
   6485	handle_t *handle;
   6486	int ret = 0;
   6487
   6488	inode_lock(tl_inode);
   6489
   6490	while (head) {
   6491		if (ocfs2_truncate_log_needs_flush(osb)) {
   6492			ret = __ocfs2_flush_truncate_log(osb);
   6493			if (ret < 0) {
   6494				mlog_errno(ret);
   6495				break;
   6496			}
   6497		}
   6498
   6499		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
   6500		if (IS_ERR(handle)) {
   6501			ret = PTR_ERR(handle);
   6502			mlog_errno(ret);
   6503			break;
   6504		}
   6505
   6506		ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
   6507						head->free_bit);
   6508
   6509		ocfs2_commit_trans(osb, handle);
   6510		tmp = head;
   6511		head = head->free_next;
   6512		kfree(tmp);
   6513
   6514		if (ret < 0) {
   6515			mlog_errno(ret);
   6516			break;
   6517		}
   6518	}
   6519
   6520	inode_unlock(tl_inode);
   6521
   6522	while (head) {
   6523		/* Premature exit may have left some dangling items. */
   6524		tmp = head;
   6525		head = head->free_next;
   6526		kfree(tmp);
   6527	}
   6528
   6529	return ret;
   6530}
   6531
   6532int ocfs2_run_deallocs(struct ocfs2_super *osb,
   6533		       struct ocfs2_cached_dealloc_ctxt *ctxt)
   6534{
   6535	int ret = 0, ret2;
   6536	struct ocfs2_per_slot_free_list *fl;
   6537
   6538	if (!ctxt)
   6539		return 0;
   6540
   6541	while (ctxt->c_first_suballocator) {
   6542		fl = ctxt->c_first_suballocator;
   6543
   6544		if (fl->f_first) {
   6545			trace_ocfs2_run_deallocs(fl->f_inode_type,
   6546						 fl->f_slot);
   6547			ret2 = ocfs2_free_cached_blocks(osb,
   6548							fl->f_inode_type,
   6549							fl->f_slot,
   6550							fl->f_first);
   6551			if (ret2)
   6552				mlog_errno(ret2);
   6553			if (!ret)
   6554				ret = ret2;
   6555		}
   6556
   6557		ctxt->c_first_suballocator = fl->f_next_suballocator;
   6558		kfree(fl);
   6559	}
   6560
   6561	if (ctxt->c_global_allocator) {
   6562		ret2 = ocfs2_free_cached_clusters(osb,
   6563						  ctxt->c_global_allocator);
   6564		if (ret2)
   6565			mlog_errno(ret2);
   6566		if (!ret)
   6567			ret = ret2;
   6568
   6569		ctxt->c_global_allocator = NULL;
   6570	}
   6571
   6572	return ret;
   6573}
   6574
   6575static struct ocfs2_per_slot_free_list *
   6576ocfs2_find_per_slot_free_list(int type,
   6577			      int slot,
   6578			      struct ocfs2_cached_dealloc_ctxt *ctxt)
   6579{
   6580	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
   6581
   6582	while (fl) {
   6583		if (fl->f_inode_type == type && fl->f_slot == slot)
   6584			return fl;
   6585
   6586		fl = fl->f_next_suballocator;
   6587	}
   6588
   6589	fl = kmalloc(sizeof(*fl), GFP_NOFS);
   6590	if (fl) {
   6591		fl->f_inode_type = type;
   6592		fl->f_slot = slot;
   6593		fl->f_first = NULL;
   6594		fl->f_next_suballocator = ctxt->c_first_suballocator;
   6595
   6596		ctxt->c_first_suballocator = fl;
   6597	}
   6598	return fl;
   6599}
   6600
   6601static struct ocfs2_per_slot_free_list *
   6602ocfs2_find_preferred_free_list(int type,
   6603			       int preferred_slot,
   6604			       int *real_slot,
   6605			       struct ocfs2_cached_dealloc_ctxt *ctxt)
   6606{
   6607	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
   6608
   6609	while (fl) {
   6610		if (fl->f_inode_type == type && fl->f_slot == preferred_slot) {
   6611			*real_slot = fl->f_slot;
   6612			return fl;
   6613		}
   6614
   6615		fl = fl->f_next_suballocator;
   6616	}
   6617
   6618	/* If we can't find any free list matching preferred slot, just use
   6619	 * the first one.
   6620	 */
   6621	fl = ctxt->c_first_suballocator;
   6622	*real_slot = fl->f_slot;
   6623
   6624	return fl;
   6625}
   6626
   6627/* Return Value 1 indicates empty */
   6628static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
   6629{
   6630	struct ocfs2_per_slot_free_list *fl = NULL;
   6631
   6632	if (!et->et_dealloc)
   6633		return 1;
   6634
   6635	fl = et->et_dealloc->c_first_suballocator;
   6636	if (!fl)
   6637		return 1;
   6638
   6639	if (!fl->f_first)
   6640		return 1;
   6641
   6642	return 0;
   6643}
   6644
   6645/* If extent was deleted from tree due to extent rotation and merging, and
   6646 * no metadata is reserved ahead of time. Try to reuse some extents
   6647 * just deleted. This is only used to reuse extent blocks.
   6648 * It is supposed to find enough extent blocks in dealloc if our estimation
   6649 * on metadata is accurate.
   6650 */
   6651static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
   6652					struct ocfs2_extent_tree *et,
   6653					struct buffer_head **new_eb_bh,
   6654					int blk_wanted, int *blk_given)
   6655{
   6656	int i, status = 0, real_slot;
   6657	struct ocfs2_cached_dealloc_ctxt *dealloc;
   6658	struct ocfs2_per_slot_free_list *fl;
   6659	struct ocfs2_cached_block_free *bf;
   6660	struct ocfs2_extent_block *eb;
   6661	struct ocfs2_super *osb =
   6662		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
   6663
   6664	*blk_given = 0;
   6665
   6666	/* If extent tree doesn't have a dealloc, this is not faulty. Just
   6667	 * tell upper caller dealloc can't provide any block and it should
   6668	 * ask for alloc to claim more space.
   6669	 */
   6670	dealloc = et->et_dealloc;
   6671	if (!dealloc)
   6672		goto bail;
   6673
   6674	for (i = 0; i < blk_wanted; i++) {
   6675		/* Prefer to use local slot */
   6676		fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE,
   6677						    osb->slot_num, &real_slot,
   6678						    dealloc);
   6679		/* If no more block can be reused, we should claim more
   6680		 * from alloc. Just return here normally.
   6681		 */
   6682		if (!fl) {
   6683			status = 0;
   6684			break;
   6685		}
   6686
   6687		bf = fl->f_first;
   6688		fl->f_first = bf->free_next;
   6689
   6690		new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
   6691		if (new_eb_bh[i] == NULL) {
   6692			status = -ENOMEM;
   6693			mlog_errno(status);
   6694			goto bail;
   6695		}
   6696
   6697		mlog(0, "Reusing block(%llu) from "
   6698		     "dealloc(local slot:%d, real slot:%d)\n",
   6699		     bf->free_blk, osb->slot_num, real_slot);
   6700
   6701		ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
   6702
   6703		status = ocfs2_journal_access_eb(handle, et->et_ci,
   6704						 new_eb_bh[i],
   6705						 OCFS2_JOURNAL_ACCESS_CREATE);
   6706		if (status < 0) {
   6707			mlog_errno(status);
   6708			goto bail;
   6709		}
   6710
   6711		memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
   6712		eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
   6713
   6714		/* We can't guarantee that buffer head is still cached, so
   6715		 * polutlate the extent block again.
   6716		 */
   6717		strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
   6718		eb->h_blkno = cpu_to_le64(bf->free_blk);
   6719		eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
   6720		eb->h_suballoc_slot = cpu_to_le16(real_slot);
   6721		eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
   6722		eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
   6723		eb->h_list.l_count =
   6724			cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
   6725
   6726		/* We'll also be dirtied by the caller, so
   6727		 * this isn't absolutely necessary.
   6728		 */
   6729		ocfs2_journal_dirty(handle, new_eb_bh[i]);
   6730
   6731		if (!fl->f_first) {
   6732			dealloc->c_first_suballocator = fl->f_next_suballocator;
   6733			kfree(fl);
   6734		}
   6735		kfree(bf);
   6736	}
   6737
   6738	*blk_given = i;
   6739
   6740bail:
   6741	if (unlikely(status < 0)) {
   6742		for (i = 0; i < blk_wanted; i++)
   6743			brelse(new_eb_bh[i]);
   6744	}
   6745
   6746	return status;
   6747}
   6748
   6749int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
   6750			      int type, int slot, u64 suballoc,
   6751			      u64 blkno, unsigned int bit)
   6752{
   6753	int ret;
   6754	struct ocfs2_per_slot_free_list *fl;
   6755	struct ocfs2_cached_block_free *item;
   6756
   6757	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
   6758	if (fl == NULL) {
   6759		ret = -ENOMEM;
   6760		mlog_errno(ret);
   6761		goto out;
   6762	}
   6763
   6764	item = kzalloc(sizeof(*item), GFP_NOFS);
   6765	if (item == NULL) {
   6766		ret = -ENOMEM;
   6767		mlog_errno(ret);
   6768		goto out;
   6769	}
   6770
   6771	trace_ocfs2_cache_block_dealloc(type, slot,
   6772					(unsigned long long)suballoc,
   6773					(unsigned long long)blkno, bit);
   6774
   6775	item->free_bg = suballoc;
   6776	item->free_blk = blkno;
   6777	item->free_bit = bit;
   6778	item->free_next = fl->f_first;
   6779
   6780	fl->f_first = item;
   6781
   6782	ret = 0;
   6783out:
   6784	return ret;
   6785}
   6786
   6787static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
   6788					 struct ocfs2_extent_block *eb)
   6789{
   6790	return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
   6791					 le16_to_cpu(eb->h_suballoc_slot),
   6792					 le64_to_cpu(eb->h_suballoc_loc),
   6793					 le64_to_cpu(eb->h_blkno),
   6794					 le16_to_cpu(eb->h_suballoc_bit));
   6795}
   6796
/*
 * Per-buffer callback handed to walk_page_buffers() by
 * ocfs2_map_and_dirty_page(): flag the buffer uptodate and dirty.
 *
 * @handle is unused here. Always returns 0.
 */
static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
{
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);
	return 0;
}
   6803
   6804void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
   6805			      unsigned int from, unsigned int to,
   6806			      struct page *page, int zero, u64 *phys)
   6807{
   6808	int ret, partial = 0;
   6809	loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from;
   6810	loff_t length = to - from;
   6811
   6812	ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
   6813	if (ret)
   6814		mlog_errno(ret);
   6815
   6816	if (zero)
   6817		zero_user_segment(page, from, to);
   6818
   6819	/*
   6820	 * Need to set the buffers we zero'd into uptodate
   6821	 * here if they aren't - ocfs2_map_page_blocks()
   6822	 * might've skipped some
   6823	 */
   6824	ret = walk_page_buffers(handle, page_buffers(page),
   6825				from, to, &partial,
   6826				ocfs2_zero_func);
   6827	if (ret < 0)
   6828		mlog_errno(ret);
   6829	else if (ocfs2_should_order_data(inode)) {
   6830		ret = ocfs2_jbd2_inode_add_write(handle, inode,
   6831						 start_byte, length);
   6832		if (ret < 0)
   6833			mlog_errno(ret);
   6834	}
   6835
   6836	if (!partial)
   6837		SetPageUptodate(page);
   6838
   6839	flush_dcache_page(page);
   6840}
   6841
   6842static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
   6843				     loff_t end, struct page **pages,
   6844				     int numpages, u64 phys, handle_t *handle)
   6845{
   6846	int i;
   6847	struct page *page;
   6848	unsigned int from, to = PAGE_SIZE;
   6849	struct super_block *sb = inode->i_sb;
   6850
   6851	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
   6852
   6853	if (numpages == 0)
   6854		goto out;
   6855
   6856	to = PAGE_SIZE;
   6857	for(i = 0; i < numpages; i++) {
   6858		page = pages[i];
   6859
   6860		from = start & (PAGE_SIZE - 1);
   6861		if ((end >> PAGE_SHIFT) == page->index)
   6862			to = end & (PAGE_SIZE - 1);
   6863
   6864		BUG_ON(from > PAGE_SIZE);
   6865		BUG_ON(to > PAGE_SIZE);
   6866
   6867		ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
   6868					 &phys);
   6869
   6870		start = (page->index + 1) << PAGE_SHIFT;
   6871	}
   6872out:
   6873	if (pages)
   6874		ocfs2_unlock_and_free_pages(pages, numpages);
   6875}
   6876
   6877int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
   6878		     struct page **pages, int *num)
   6879{
   6880	int numpages, ret = 0;
   6881	struct address_space *mapping = inode->i_mapping;
   6882	unsigned long index;
   6883	loff_t last_page_bytes;
   6884
   6885	BUG_ON(start > end);
   6886
   6887	numpages = 0;
   6888	last_page_bytes = PAGE_ALIGN(end);
   6889	index = start >> PAGE_SHIFT;
   6890	do {
   6891		pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
   6892		if (!pages[numpages]) {
   6893			ret = -ENOMEM;
   6894			mlog_errno(ret);
   6895			goto out;
   6896		}
   6897
   6898		numpages++;
   6899		index++;
   6900	} while (index < (last_page_bytes >> PAGE_SHIFT));
   6901
   6902out:
   6903	if (ret != 0) {
   6904		if (pages)
   6905			ocfs2_unlock_and_free_pages(pages, numpages);
   6906		numpages = 0;
   6907	}
   6908
   6909	*num = numpages;
   6910
   6911	return ret;
   6912}
   6913
   6914static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
   6915				struct page **pages, int *num)
   6916{
   6917	struct super_block *sb = inode->i_sb;
   6918
   6919	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
   6920	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
   6921
   6922	return ocfs2_grab_pages(inode, start, end, pages, num);
   6923}
   6924
   6925/*
   6926 * Zero partial cluster for a hole punch or truncate. This avoids exposing
   6927 * nonzero data on subsequent file extends.
   6928 *
   6929 * We need to call this before i_size is updated on the inode because
   6930 * otherwise block_write_full_page() will skip writeout of pages past
   6931 * i_size.
   6932 */
   6933int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
   6934				  u64 range_start, u64 range_end)
   6935{
   6936	int ret = 0, numpages;
   6937	struct page **pages = NULL;
   6938	u64 phys;
   6939	unsigned int ext_flags;
   6940	struct super_block *sb = inode->i_sb;
   6941
   6942	/*
   6943	 * File systems which don't support sparse files zero on every
   6944	 * extend.
   6945	 */
   6946	if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
   6947		return 0;
   6948
   6949	/*
   6950	 * Avoid zeroing pages fully beyond current i_size. It is pointless as
   6951	 * underlying blocks of those pages should be already zeroed out and
   6952	 * page writeback will skip them anyway.
   6953	 */
   6954	range_end = min_t(u64, range_end, i_size_read(inode));
   6955	if (range_start >= range_end)
   6956		return 0;
   6957
   6958	pages = kcalloc(ocfs2_pages_per_cluster(sb),
   6959			sizeof(struct page *), GFP_NOFS);
   6960	if (pages == NULL) {
   6961		ret = -ENOMEM;
   6962		mlog_errno(ret);
   6963		goto out;
   6964	}
   6965
   6966	ret = ocfs2_extent_map_get_blocks(inode,
   6967					  range_start >> sb->s_blocksize_bits,
   6968					  &phys, NULL, &ext_flags);
   6969	if (ret) {
   6970		mlog_errno(ret);
   6971		goto out;
   6972	}
   6973
   6974	/*
   6975	 * Tail is a hole, or is marked unwritten. In either case, we
   6976	 * can count on read and write to return/push zero's.
   6977	 */
   6978	if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
   6979		goto out;
   6980
   6981	ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
   6982				   &numpages);
   6983	if (ret) {
   6984		mlog_errno(ret);
   6985		goto out;
   6986	}
   6987
   6988	ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
   6989				 numpages, phys, handle);
   6990
   6991	/*
   6992	 * Initiate writeout of the pages we zero'd here. We don't
   6993	 * wait on them - the truncate_inode_pages() call later will
   6994	 * do that for us.
   6995	 */
   6996	ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
   6997				       range_end - 1);
   6998	if (ret)
   6999		mlog_errno(ret);
   7000
   7001out:
   7002	kfree(pages);
   7003
   7004	return ret;
   7005}
   7006
   7007static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
   7008					     struct ocfs2_dinode *di)
   7009{
   7010	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
   7011	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
   7012
   7013	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
   7014		memset(&di->id2, 0, blocksize -
   7015				    offsetof(struct ocfs2_dinode, id2) -
   7016				    xattrsize);
   7017	else
   7018		memset(&di->id2, 0, blocksize -
   7019				    offsetof(struct ocfs2_dinode, id2));
   7020}
   7021
   7022void ocfs2_dinode_new_extent_list(struct inode *inode,
   7023				  struct ocfs2_dinode *di)
   7024{
   7025	ocfs2_zero_dinode_id2_with_xattr(inode, di);
   7026	di->id2.i_list.l_tree_depth = 0;
   7027	di->id2.i_list.l_next_free_rec = 0;
   7028	di->id2.i_list.l_count = cpu_to_le16(
   7029		ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
   7030}
   7031
   7032void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
   7033{
   7034	struct ocfs2_inode_info *oi = OCFS2_I(inode);
   7035	struct ocfs2_inline_data *idata = &di->id2.i_data;
   7036
   7037	spin_lock(&oi->ip_lock);
   7038	oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
   7039	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
   7040	spin_unlock(&oi->ip_lock);
   7041
   7042	/*
   7043	 * We clear the entire i_data structure here so that all
   7044	 * fields can be properly initialized.
   7045	 */
   7046	ocfs2_zero_dinode_id2_with_xattr(inode, di);
   7047
   7048	idata->id_count = cpu_to_le16(
   7049			ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
   7050}
   7051
/*
 * Convert an inode whose data lives inline in the dinode into one that
 * uses an extent list.
 *
 * When i_size is non-zero, one cluster is reserved and claimed, quota is
 * charged for it, the inline data is read into the first page and that
 * page is mapped and dirtied against the new cluster. The inline-data
 * flag is then cleared, the dinode's id2 area is reset to an empty extent
 * list, and (when there was data) the cluster is inserted at cpos 0.
 *
 * On a failure after the cluster has been claimed, the cluster is handed
 * back to whichever allocator it came from and the quota charge is undone.
 *
 * @di_bh is the buffer holding the inode's on-disk dinode.
 *
 * Returns 0 on success or a negative error code.
 */
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
					 struct buffer_head *di_bh)
{
	int ret, has_data, num_pages = 0;
	int need_free = 0;
	u32 bit_off, num;
	handle_t *handle;
	u64 block;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct page *page = NULL;
	struct ocfs2_extent_tree et;
	int did_quota = 0;

	/* An empty file needs no cluster; only the format is switched. */
	has_data = i_size_read(inode) ? 1 : 0;

	if (has_data) {
		ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	handle = ocfs2_start_trans(osb,
				   ocfs2_inline_to_extents_credits(osb->sb));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	if (has_data) {
		/* Only the first page, or the first cluster if smaller. */
		unsigned int page_end = min_t(unsigned, PAGE_SIZE,
							osb->s_clustersize);
		u64 phys;

		ret = dquot_alloc_space_nodirty(inode,
				       ocfs2_clusters_to_bytes(osb->sb, 1));
		if (ret)
			goto out_commit;
		/* Remember the charge so the error path can undo it. */
		did_quota = 1;

		data_ac->ac_resv = &oi->ip_la_data_resv;

		ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
					   &num);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/*
		 * Save two copies, one for insert, and one that can
		 * be changed by ocfs2_map_and_dirty_page() below.
		 */
		block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);

		/*
		 * From here on a cluster has been claimed; every failure
		 * path sets need_free so it is given back below.
		 */
		ret = ocfs2_grab_eof_pages(inode, 0, page_end, &page,
					   &num_pages);
		if (ret) {
			mlog_errno(ret);
			need_free = 1;
			goto out_commit;
		}

		/*
		 * This should populate the 1st page for us and mark
		 * it up to date.
		 */
		ret = ocfs2_read_inline_data(inode, page, di_bh);
		if (ret) {
			mlog_errno(ret);
			need_free = 1;
			goto out_unlock;
		}

		ocfs2_map_and_dirty_page(inode, handle, 0, page_end, page, 0,
					 &phys);
	}

	/* Drop the inline-data flag in memory and on disk together. */
	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	ocfs2_update_inode_fsync_trans(handle, inode, 1);
	/* Overwrites the inline data area with an empty extent list. */
	ocfs2_dinode_new_extent_list(inode, di);

	ocfs2_journal_dirty(handle, di_bh);

	if (has_data) {
		/*
		 * An error at this point should be extremely rare. If
		 * this proves to be false, we could always re-build
		 * the in-inode data from our pages.
		 */
		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
		ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
		if (ret) {
			mlog_errno(ret);
			need_free = 1;
			goto out_unlock;
		}

		inode->i_blocks = ocfs2_inode_sector_count(inode);
	}

out_unlock:
	if (page)
		ocfs2_unlock_and_free_pages(&page, num_pages);

out_commit:
	if (ret < 0 && did_quota)
		dquot_free_space_nodirty(inode,
					  ocfs2_clusters_to_bytes(osb->sb, 1));

	if (need_free) {
		/* Return the cluster to the allocator it was claimed from. */
		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
					bit_off, num);
		else
			ocfs2_free_clusters(handle,
					data_ac->ac_inode,
					data_ac->ac_bh,
					ocfs2_clusters_to_blocks(osb->sb, bit_off),
					num);
	}

	ocfs2_commit_trans(osb, handle);

out:
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);
	return ret;
}
   7197
/*
 * Remove all allocation past the inode's current i_size from its extent
 * tree, working on the rightmost leaf record one pass at a time.
 *
 * It is expected, that by the time you call this function,
 * inode->i_size and fe->i_size have been adjusted.
 *
 * WARNING: This will kfree the truncate context
 * NOTE(review): no truncate context is passed to or freed by this
 * function as written; the warning above looks stale - confirm.
 *
 * Returns 0 on success or a negative error code. Regardless of outcome,
 * a truncate log flush is scheduled and the cached deallocs collected
 * along the way are run before returning.
 */
int ocfs2_commit_truncate(struct ocfs2_super *osb,
			  struct inode *inode,
			  struct buffer_head *di_bh)
{
	int status = 0, i, flags = 0;
	u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
	u64 blkno = 0;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_path *path = NULL;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_extent_list *root_el = &(di->id2.i_list);
	u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
	struct ocfs2_extent_tree et;
	struct ocfs2_cached_dealloc_ctxt dealloc;
	struct ocfs2_refcount_tree *ref_tree = NULL;

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
	ocfs2_init_dealloc_ctxt(&dealloc);

	/* First cluster offset that lies beyond the new i_size. */
	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
						     i_size_read(inode));

	path = ocfs2_new_path(di_bh, &di->id2.i_list,
			      ocfs2_journal_access_di);
	if (!path) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	ocfs2_extent_map_trunc(inode, new_highest_cpos);

	/* Each pass re-reads the rightmost path and trims its tail record. */
start:
	/*
	 * Check that we still have allocation to delete.
	 */
	if (OCFS2_I(inode)->ip_clusters == 0) {
		status = 0;
		goto bail;
	}

	/*
	 * Truncate always works against the rightmost tree branch.
	 */
	status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_commit_truncate(
		(unsigned long long)OCFS2_I(inode)->ip_blkno,
		new_highest_cpos,
		OCFS2_I(inode)->ip_clusters,
		path->p_tree_depth);

	/*
	 * By now, el will point to the extent list on the bottom most
	 * portion of this tree. Only the tail record is considered in
	 * each pass.
	 *
	 * We handle the following cases, in order:
	 * - empty extent: delete the remaining branch
	 * - remove the entire record
	 * - remove a partial record
	 * - no record needs to be removed (truncate has completed)
	 */
	el = path_leaf_el(path);
	if (le16_to_cpu(el->l_next_free_rec) == 0) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has empty extent block at %llu\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    (unsigned long long)path_leaf_bh(path)->b_blocknr);
		status = -EROFS;
		goto bail;
	}

	/* Tail record of the rightmost leaf. */
	i = le16_to_cpu(el->l_next_free_rec) - 1;
	rec = &el->l_recs[i];
	flags = rec->e_flags;
	/* One past the last cluster offset covered by the record. */
	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);

	if (i == 0 && ocfs2_is_empty_extent(rec)) {
		/*
		 * Lower levels depend on this never happening, but it's best
		 * to check it up here before changing the tree.
		*/
		if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
			mlog(ML_ERROR, "Inode %lu has an empty "
				    "extent record, depth %u\n", inode->i_ino,
				    le16_to_cpu(root_el->l_tree_depth));
			status = ocfs2_remove_rightmost_empty_extent(osb,
					&et, path, &dealloc);
			if (status) {
				mlog_errno(status);
				goto bail;
			}

			ocfs2_reinit_path(path, 1);
			goto start;
		} else {
			/* Zero-length removal at the record's own cpos. */
			trunc_cpos = le32_to_cpu(rec->e_cpos);
			trunc_len = 0;
			blkno = 0;
		}
	} else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
		/*
		 * Truncate entire record.
		 */
		trunc_cpos = le32_to_cpu(rec->e_cpos);
		trunc_len = ocfs2_rec_clusters(el, rec);
		blkno = le64_to_cpu(rec->e_blkno);
	} else if (range > new_highest_cpos) {
		/*
		 * Partial truncate. it also should be
		 * the last truncate we're doing.
		 */
		trunc_cpos = new_highest_cpos;
		trunc_len = range - new_highest_cpos;
		/* Clusters of this record that are kept. */
		coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
		blkno = le64_to_cpu(rec->e_blkno) +
				ocfs2_clusters_to_blocks(inode->i_sb, coff);
	} else {
		/*
		 * Truncate completed, leave happily.
		 */
		status = 0;
		goto bail;
	}

	phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);

	/*
	 * Take the refcount tree lock once, the first time a refcounted
	 * record with something to remove is seen; it is held until bail.
	 */
	if ((flags & OCFS2_EXT_REFCOUNTED) && trunc_len && !ref_tree) {
		status = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
				&ref_tree, NULL);
		if (status) {
			mlog_errno(status);
			goto bail;
		}
	}

	status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
					  phys_cpos, trunc_len, flags, &dealloc,
					  refcount_loc, true);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_reinit_path(path, 1);

	/*
	 * The check above will catch the case where we've truncated
	 * away all allocation.
	 */
	goto start;

bail:
	if (ref_tree)
		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);

	ocfs2_schedule_truncate_log_flush(osb, 1);

	ocfs2_run_deallocs(osb, &dealloc);

	ocfs2_free_path(path);

	return status;
}
   7374
   7375/*
   7376 * 'start' is inclusive, 'end' is not.
   7377 */
   7378int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
   7379			  unsigned int start, unsigned int end, int trunc)
   7380{
   7381	int ret;
   7382	unsigned int numbytes;
   7383	handle_t *handle;
   7384	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
   7385	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
   7386	struct ocfs2_inline_data *idata = &di->id2.i_data;
   7387
   7388	/* No need to punch hole beyond i_size. */
   7389	if (start >= i_size_read(inode))
   7390		return 0;
   7391
   7392	if (end > i_size_read(inode))
   7393		end = i_size_read(inode);
   7394
   7395	BUG_ON(start > end);
   7396
   7397	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
   7398	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
   7399	    !ocfs2_supports_inline_data(osb)) {
   7400		ocfs2_error(inode->i_sb,
   7401			    "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
   7402			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
   7403			    le16_to_cpu(di->i_dyn_features),
   7404			    OCFS2_I(inode)->ip_dyn_features,
   7405			    osb->s_feature_incompat);
   7406		ret = -EROFS;
   7407		goto out;
   7408	}
   7409
   7410	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
   7411	if (IS_ERR(handle)) {
   7412		ret = PTR_ERR(handle);
   7413		mlog_errno(ret);
   7414		goto out;
   7415	}
   7416
   7417	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
   7418				      OCFS2_JOURNAL_ACCESS_WRITE);
   7419	if (ret) {
   7420		mlog_errno(ret);
   7421		goto out_commit;
   7422	}
   7423
   7424	numbytes = end - start;
   7425	memset(idata->id_data + start, 0, numbytes);
   7426
   7427	/*
   7428	 * No need to worry about the data page here - it's been
   7429	 * truncated already and inline data doesn't need it for
   7430	 * pushing zero's to disk, so we'll let read_folio pick it up
   7431	 * later.
   7432	 */
   7433	if (trunc) {
   7434		i_size_write(inode, start);
   7435		di->i_size = cpu_to_le64(start);
   7436	}
   7437
   7438	inode->i_blocks = ocfs2_inode_sector_count(inode);
   7439	inode->i_ctime = inode->i_mtime = current_time(inode);
   7440
   7441	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
   7442	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
   7443
   7444	ocfs2_update_inode_fsync_trans(handle, inode, 1);
   7445	ocfs2_journal_dirty(handle, di_bh);
   7446
   7447out_commit:
   7448	ocfs2_commit_trans(osb, handle);
   7449
   7450out:
   7451	return ret;
   7452}
   7453
   7454static int ocfs2_trim_extent(struct super_block *sb,
   7455			     struct ocfs2_group_desc *gd,
   7456			     u64 group, u32 start, u32 count)
   7457{
   7458	u64 discard, bcount;
   7459	struct ocfs2_super *osb = OCFS2_SB(sb);
   7460
   7461	bcount = ocfs2_clusters_to_blocks(sb, count);
   7462	discard = ocfs2_clusters_to_blocks(sb, start);
   7463
   7464	/*
   7465	 * For the first cluster group, the gd->bg_blkno is not at the start
   7466	 * of the group, but at an offset from the start. If we add it while
   7467	 * calculating discard for first group, we will wrongly start fstrim a
   7468	 * few blocks after the desried start block and the range can cross
   7469	 * over into the next cluster group. So, add it only if this is not
   7470	 * the first cluster group.
   7471	 */
   7472	if (group != osb->first_cluster_group_blkno)
   7473		discard += le64_to_cpu(gd->bg_blkno);
   7474
   7475	trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
   7476
   7477	return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
   7478}
   7479
   7480static int ocfs2_trim_group(struct super_block *sb,
   7481			    struct ocfs2_group_desc *gd, u64 group,
   7482			    u32 start, u32 max, u32 minbits)
   7483{
   7484	int ret = 0, count = 0, next;
   7485	void *bitmap = gd->bg_bitmap;
   7486
   7487	if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
   7488		return 0;
   7489
   7490	trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
   7491			       start, max, minbits);
   7492
   7493	while (start < max) {
   7494		start = ocfs2_find_next_zero_bit(bitmap, max, start);
   7495		if (start >= max)
   7496			break;
   7497		next = ocfs2_find_next_bit(bitmap, max, start);
   7498
   7499		if ((next - start) >= minbits) {
   7500			ret = ocfs2_trim_extent(sb, gd, group,
   7501						start, next - start);
   7502			if (ret < 0) {
   7503				mlog_errno(ret);
   7504				break;
   7505			}
   7506			count += next - start;
   7507		}
   7508		start = next + 1;
   7509
   7510		if (fatal_signal_pending(current)) {
   7511			count = -ERESTARTSYS;
   7512			break;
   7513		}
   7514
   7515		if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
   7516			break;
   7517	}
   7518
   7519	if (ret < 0)
   7520		count = ret;
   7521
   7522	return count;
   7523}
   7524
   7525static
   7526int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
   7527{
   7528	struct ocfs2_super *osb = OCFS2_SB(sb);
   7529	u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
   7530	int ret, cnt;
   7531	u32 first_bit, last_bit, minlen;
   7532	struct buffer_head *main_bm_bh = NULL;
   7533	struct inode *main_bm_inode = NULL;
   7534	struct buffer_head *gd_bh = NULL;
   7535	struct ocfs2_dinode *main_bm;
   7536	struct ocfs2_group_desc *gd = NULL;
   7537
   7538	start = range->start >> osb->s_clustersize_bits;
   7539	len = range->len >> osb->s_clustersize_bits;
   7540	minlen = range->minlen >> osb->s_clustersize_bits;
   7541
   7542	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
   7543		return -EINVAL;
   7544
   7545	trace_ocfs2_trim_mainbm(start, len, minlen);
   7546
   7547next_group:
   7548	main_bm_inode = ocfs2_get_system_file_inode(osb,
   7549						    GLOBAL_BITMAP_SYSTEM_INODE,
   7550						    OCFS2_INVALID_SLOT);
   7551	if (!main_bm_inode) {
   7552		ret = -EIO;
   7553		mlog_errno(ret);
   7554		goto out;
   7555	}
   7556
   7557	inode_lock(main_bm_inode);
   7558
   7559	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
   7560	if (ret < 0) {
   7561		mlog_errno(ret);
   7562		goto out_mutex;
   7563	}
   7564	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
   7565
   7566	/*
   7567	 * Do some check before trim the first group.
   7568	 */
   7569	if (!group) {
   7570		if (start >= le32_to_cpu(main_bm->i_clusters)) {
   7571			ret = -EINVAL;
   7572			goto out_unlock;
   7573		}
   7574
   7575		if (start + len > le32_to_cpu(main_bm->i_clusters))
   7576			len = le32_to_cpu(main_bm->i_clusters) - start;
   7577
   7578		/*
   7579		 * Determine first and last group to examine based on
   7580		 * start and len
   7581		 */
   7582		first_group = ocfs2_which_cluster_group(main_bm_inode, start);
   7583		if (first_group == osb->first_cluster_group_blkno)
   7584			first_bit = start;
   7585		else
   7586			first_bit = start - ocfs2_blocks_to_clusters(sb,
   7587								first_group);
   7588		last_group = ocfs2_which_cluster_group(main_bm_inode,
   7589						       start + len - 1);
   7590		group = first_group;
   7591	}
   7592
   7593	do {
   7594		if (first_bit + len >= osb->bitmap_cpg)
   7595			last_bit = osb->bitmap_cpg;
   7596		else
   7597			last_bit = first_bit + len;
   7598
   7599		ret = ocfs2_read_group_descriptor(main_bm_inode,
   7600						  main_bm, group,
   7601						  &gd_bh);
   7602		if (ret < 0) {
   7603			mlog_errno(ret);
   7604			break;
   7605		}
   7606
   7607		gd = (struct ocfs2_group_desc *)gd_bh->b_data;
   7608		cnt = ocfs2_trim_group(sb, gd, group,
   7609				       first_bit, last_bit, minlen);
   7610		brelse(gd_bh);
   7611		gd_bh = NULL;
   7612		if (cnt < 0) {
   7613			ret = cnt;
   7614			mlog_errno(ret);
   7615			break;
   7616		}
   7617
   7618		trimmed += cnt;
   7619		len -= osb->bitmap_cpg - first_bit;
   7620		first_bit = 0;
   7621		if (group == osb->first_cluster_group_blkno)
   7622			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
   7623		else
   7624			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
   7625	} while (0);
   7626
   7627out_unlock:
   7628	ocfs2_inode_unlock(main_bm_inode, 0);
   7629	brelse(main_bm_bh);
   7630	main_bm_bh = NULL;
   7631out_mutex:
   7632	inode_unlock(main_bm_inode);
   7633	iput(main_bm_inode);
   7634
   7635	/*
   7636	 * If all the groups trim are not done or failed, but we should release
   7637	 * main_bm related locks for avoiding the current IO starve, then go to
   7638	 * trim the next group
   7639	 */
   7640	if (ret >= 0 && group <= last_group) {
   7641		cond_resched();
   7642		goto next_group;
   7643	}
   7644out:
   7645	range->len = trimmed * sb->s_blocksize;
   7646	return ret;
   7647}
   7648
   7649int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
   7650{
   7651	int ret;
   7652	struct ocfs2_super *osb = OCFS2_SB(sb);
   7653	struct ocfs2_trim_fs_info info, *pinfo = NULL;
   7654
   7655	ocfs2_trim_fs_lock_res_init(osb);
   7656
   7657	trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
   7658
   7659	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
   7660	if (ret < 0) {
   7661		if (ret != -EAGAIN) {
   7662			mlog_errno(ret);
   7663			ocfs2_trim_fs_lock_res_uninit(osb);
   7664			return ret;
   7665		}
   7666
   7667		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
   7668		     "finish, which is running from another node.\n",
   7669		     osb->dev_str);
   7670		ret = ocfs2_trim_fs_lock(osb, &info, 0);
   7671		if (ret < 0) {
   7672			mlog_errno(ret);
   7673			ocfs2_trim_fs_lock_res_uninit(osb);
   7674			return ret;
   7675		}
   7676
   7677		if (info.tf_valid && info.tf_success &&
   7678		    info.tf_start == range->start &&
   7679		    info.tf_len == range->len &&
   7680		    info.tf_minlen == range->minlen) {
   7681			/* Avoid sending duplicated trim to a shared device */
   7682			mlog(ML_NOTICE, "The same trim on device (%s) was "
   7683			     "just done from node (%u), return.\n",
   7684			     osb->dev_str, info.tf_nodenum);
   7685			range->len = info.tf_trimlen;
   7686			goto out;
   7687		}
   7688	}
   7689
   7690	info.tf_nodenum = osb->node_num;
   7691	info.tf_start = range->start;
   7692	info.tf_len = range->len;
   7693	info.tf_minlen = range->minlen;
   7694
   7695	ret = ocfs2_trim_mainbm(sb, range);
   7696
   7697	info.tf_trimlen = range->len;
   7698	info.tf_success = (ret < 0 ? 0 : 1);
   7699	pinfo = &info;
   7700out:
   7701	ocfs2_trim_fs_unlock(osb, pinfo);
   7702	ocfs2_trim_fs_lock_res_uninit(osb);
   7703	return ret;
   7704}