xfs_bmap_util.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
xfs_bmap_util.c (49423B)
      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
      4 * Copyright (c) 2012 Red Hat, Inc.
      5 * All Rights Reserved.
      6 */
      7#include "xfs.h"
      8#include "xfs_fs.h"
      9#include "xfs_shared.h"
     10#include "xfs_format.h"
     11#include "xfs_log_format.h"
     12#include "xfs_trans_resv.h"
     13#include "xfs_bit.h"
     14#include "xfs_mount.h"
     15#include "xfs_defer.h"
     16#include "xfs_inode.h"
     17#include "xfs_btree.h"
     18#include "xfs_trans.h"
     19#include "xfs_alloc.h"
     20#include "xfs_bmap.h"
     21#include "xfs_bmap_util.h"
     22#include "xfs_bmap_btree.h"
     23#include "xfs_rtalloc.h"
     24#include "xfs_error.h"
     25#include "xfs_quota.h"
     26#include "xfs_trans_space.h"
     27#include "xfs_trace.h"
     28#include "xfs_icache.h"
     29#include "xfs_iomap.h"
     30#include "xfs_reflink.h"
     31
     32/* Kernel only BMAP related definitions and functions */
     33
     34/*
     35 * Convert the given file system block to a disk block.  We have to treat it
     36 * differently based on whether the file is a real time file or not, because the
     37 * bmap code does.
     38 */
     39xfs_daddr_t
     40xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
     41{
     42	if (XFS_IS_REALTIME_INODE(ip))
     43		return XFS_FSB_TO_BB(ip->i_mount, fsb);
     44	return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
     45}
     46
     47/*
     48 * Routine to zero an extent on disk allocated to the specific inode.
     49 *
     50 * The VFS functions take a linearised filesystem block offset, so we have to
     51 * convert the sparse xfs fsb to the right format first.
     52 * VFS types are real funky, too.
     53 */
     54int
     55xfs_zero_extent(
     56	struct xfs_inode	*ip,
     57	xfs_fsblock_t		start_fsb,
     58	xfs_off_t		count_fsb)
     59{
     60	struct xfs_mount	*mp = ip->i_mount;
     61	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
     62	xfs_daddr_t		sector = xfs_fsb_to_db(ip, start_fsb);
     63	sector_t		block = XFS_BB_TO_FSBT(mp, sector);
     64
     65	return blkdev_issue_zeroout(target->bt_bdev,
     66		block << (mp->m_super->s_blocksize_bits - 9),
     67		count_fsb << (mp->m_super->s_blocksize_bits - 9),
     68		GFP_NOFS, 0);
     69}
     70
     71#ifdef CONFIG_XFS_RT
     72int
     73xfs_bmap_rtalloc(
     74	struct xfs_bmalloca	*ap)
     75{
     76	struct xfs_mount	*mp = ap->ip->i_mount;
     77	xfs_fileoff_t		orig_offset = ap->offset;
     78	xfs_rtblock_t		rtb;
     79	xfs_extlen_t		prod = 0;  /* product factor for allocators */
     80	xfs_extlen_t		mod = 0;   /* product factor for allocators */
     81	xfs_extlen_t		ralen = 0; /* realtime allocation length */
     82	xfs_extlen_t		align;     /* minimum allocation alignment */
     83	xfs_extlen_t		orig_length = ap->length;
     84	xfs_extlen_t		minlen = mp->m_sb.sb_rextsize;
     85	xfs_extlen_t		raminlen;
     86	bool			rtlocked = false;
     87	bool			ignore_locality = false;
     88	int			error;
     89
     90	align = xfs_get_extsz_hint(ap->ip);
     91retry:
     92	prod = align / mp->m_sb.sb_rextsize;
     93	error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
     94					align, 1, ap->eof, 0,
     95					ap->conv, &ap->offset, &ap->length);
     96	if (error)
     97		return error;
     98	ASSERT(ap->length);
     99	ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
    100
    101	/*
    102	 * If we shifted the file offset downward to satisfy an extent size
    103	 * hint, increase minlen by that amount so that the allocator won't
    104	 * give us an allocation that's too short to cover at least one of the
    105	 * blocks that the caller asked for.
    106	 */
    107	if (ap->offset != orig_offset)
    108		minlen += orig_offset - ap->offset;
    109
    110	/*
    111	 * If the offset & length are not perfectly aligned
    112	 * then kill prod, it will just get us in trouble.
    113	 */
    114	div_u64_rem(ap->offset, align, &mod);
    115	if (mod || ap->length % align)
    116		prod = 1;
    117	/*
    118	 * Set ralen to be the actual requested length in rtextents.
    119	 */
    120	ralen = ap->length / mp->m_sb.sb_rextsize;
    121	/*
    122	 * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that
    123	 * we rounded up to it, cut it back so it's valid again.
    124	 * Note that if it's a really large request (bigger than
    125	 * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't
    126	 * adjust the starting point to match it.
    127	 */
    128	if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN)
    129		ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize;
    130
    131	/*
    132	 * Lock out modifications to both the RT bitmap and summary inodes
    133	 */
    134	if (!rtlocked) {
    135		xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
    136		xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
    137		xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
    138		xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
    139		rtlocked = true;
    140	}
    141
    142	/*
    143	 * If it's an allocation to an empty file at offset 0,
    144	 * pick an extent that will space things out in the rt area.
    145	 */
    146	if (ap->eof && ap->offset == 0) {
    147		xfs_rtblock_t rtx; /* realtime extent no */
    148
    149		error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
    150		if (error)
    151			return error;
    152		ap->blkno = rtx * mp->m_sb.sb_rextsize;
    153	} else {
    154		ap->blkno = 0;
    155	}
    156
    157	xfs_bmap_adjacent(ap);
    158
    159	/*
    160	 * Realtime allocation, done through xfs_rtallocate_extent.
    161	 */
    162	if (ignore_locality)
    163		ap->blkno = 0;
    164	else
    165		do_div(ap->blkno, mp->m_sb.sb_rextsize);
    166	rtb = ap->blkno;
    167	ap->length = ralen;
    168	raminlen = max_t(xfs_extlen_t, 1, minlen / mp->m_sb.sb_rextsize);
    169	error = xfs_rtallocate_extent(ap->tp, ap->blkno, raminlen, ap->length,
    170			&ralen, ap->wasdel, prod, &rtb);
    171	if (error)
    172		return error;
    173
    174	if (rtb != NULLRTBLOCK) {
    175		ap->blkno = rtb * mp->m_sb.sb_rextsize;
    176		ap->length = ralen * mp->m_sb.sb_rextsize;
    177		ap->ip->i_nblocks += ap->length;
    178		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
    179		if (ap->wasdel)
    180			ap->ip->i_delayed_blks -= ap->length;
    181		/*
    182		 * Adjust the disk quota also. This was reserved
    183		 * earlier.
    184		 */
    185		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
    186			ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
    187					XFS_TRANS_DQ_RTBCOUNT, ap->length);
    188		return 0;
    189	}
    190
    191	if (align > mp->m_sb.sb_rextsize) {
    192		/*
    193		 * We previously enlarged the request length to try to satisfy
    194		 * an extent size hint.  The allocator didn't return anything,
    195		 * so reset the parameters to the original values and try again
    196		 * without alignment criteria.
    197		 */
    198		ap->offset = orig_offset;
    199		ap->length = orig_length;
    200		minlen = align = mp->m_sb.sb_rextsize;
    201		goto retry;
    202	}
    203
    204	if (!ignore_locality && ap->blkno != 0) {
    205		/*
    206		 * If we can't allocate near a specific rt extent, try again
    207		 * without locality criteria.
    208		 */
    209		ignore_locality = true;
    210		goto retry;
    211	}
    212
    213	ap->blkno = NULLFSBLOCK;
    214	ap->length = 0;
    215	return 0;
    216}
    217#endif /* CONFIG_XFS_RT */
    218
    219/*
    220 * Extent tree block counting routines.
    221 */
    222
    223/*
    224 * Count leaf blocks given a range of extent records.  Delayed allocation
    225 * extents are not counted towards the totals.
    226 */
    227xfs_extnum_t
    228xfs_bmap_count_leaves(
    229	struct xfs_ifork	*ifp,
    230	xfs_filblks_t		*count)
    231{
    232	struct xfs_iext_cursor	icur;
    233	struct xfs_bmbt_irec	got;
    234	xfs_extnum_t		numrecs = 0;
    235
    236	for_each_xfs_iext(ifp, &icur, &got) {
    237		if (!isnullstartblock(got.br_startblock)) {
    238			*count += got.br_blockcount;
    239			numrecs++;
    240		}
    241	}
    242
    243	return numrecs;
    244}
    245
    246/*
    247 * Count fsblocks of the given fork.  Delayed allocation extents are
    248 * not counted towards the totals.
    249 */
    250int
    251xfs_bmap_count_blocks(
    252	struct xfs_trans	*tp,
    253	struct xfs_inode	*ip,
    254	int			whichfork,
    255	xfs_extnum_t		*nextents,
    256	xfs_filblks_t		*count)
    257{
    258	struct xfs_mount	*mp = ip->i_mount;
    259	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
    260	struct xfs_btree_cur	*cur;
    261	xfs_extlen_t		btblocks = 0;
    262	int			error;
    263
    264	*nextents = 0;
    265	*count = 0;
    266
    267	if (!ifp)
    268		return 0;
    269
    270	switch (ifp->if_format) {
    271	case XFS_DINODE_FMT_BTREE:
    272		error = xfs_iread_extents(tp, ip, whichfork);
    273		if (error)
    274			return error;
    275
    276		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
    277		error = xfs_btree_count_blocks(cur, &btblocks);
    278		xfs_btree_del_cursor(cur, error);
    279		if (error)
    280			return error;
    281
    282		/*
    283		 * xfs_btree_count_blocks includes the root block contained in
    284		 * the inode fork in @btblocks, so subtract one because we're
    285		 * only interested in allocated disk blocks.
    286		 */
    287		*count += btblocks - 1;
    288
    289		fallthrough;
    290	case XFS_DINODE_FMT_EXTENTS:
    291		*nextents = xfs_bmap_count_leaves(ifp, count);
    292		break;
    293	}
    294
    295	return 0;
    296}
    297
    298static int
    299xfs_getbmap_report_one(
    300	struct xfs_inode	*ip,
    301	struct getbmapx		*bmv,
    302	struct kgetbmap		*out,
    303	int64_t			bmv_end,
    304	struct xfs_bmbt_irec	*got)
    305{
    306	struct kgetbmap		*p = out + bmv->bmv_entries;
    307	bool			shared = false;
    308	int			error;
    309
    310	error = xfs_reflink_trim_around_shared(ip, got, &shared);
    311	if (error)
    312		return error;
    313
    314	if (isnullstartblock(got->br_startblock) ||
    315	    got->br_startblock == DELAYSTARTBLOCK) {
    316		/*
    317		 * Delalloc extents that start beyond EOF can occur due to
    318		 * speculative EOF allocation when the delalloc extent is larger
    319		 * than the largest freespace extent at conversion time.  These
    320		 * extents cannot be converted by data writeback, so can exist
    321		 * here even if we are not supposed to be finding delalloc
    322		 * extents.
    323		 */
    324		if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip)))
    325			ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0);
    326
    327		p->bmv_oflags |= BMV_OF_DELALLOC;
    328		p->bmv_block = -2;
    329	} else {
    330		p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock);
    331	}
    332
    333	if (got->br_state == XFS_EXT_UNWRITTEN &&
    334	    (bmv->bmv_iflags & BMV_IF_PREALLOC))
    335		p->bmv_oflags |= BMV_OF_PREALLOC;
    336
    337	if (shared)
    338		p->bmv_oflags |= BMV_OF_SHARED;
    339
    340	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff);
    341	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount);
    342
    343	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
    344	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
    345	bmv->bmv_entries++;
    346	return 0;
    347}
    348
    349static void
    350xfs_getbmap_report_hole(
    351	struct xfs_inode	*ip,
    352	struct getbmapx		*bmv,
    353	struct kgetbmap		*out,
    354	int64_t			bmv_end,
    355	xfs_fileoff_t		bno,
    356	xfs_fileoff_t		end)
    357{
    358	struct kgetbmap		*p = out + bmv->bmv_entries;
    359
    360	if (bmv->bmv_iflags & BMV_IF_NO_HOLES)
    361		return;
    362
    363	p->bmv_block = -1;
    364	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno);
    365	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno);
    366
    367	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
    368	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
    369	bmv->bmv_entries++;
    370}
    371
    372static inline bool
    373xfs_getbmap_full(
    374	struct getbmapx		*bmv)
    375{
    376	return bmv->bmv_length == 0 || bmv->bmv_entries >= bmv->bmv_count - 1;
    377}
    378
    379static bool
    380xfs_getbmap_next_rec(
    381	struct xfs_bmbt_irec	*rec,
    382	xfs_fileoff_t		total_end)
    383{
    384	xfs_fileoff_t		end = rec->br_startoff + rec->br_blockcount;
    385
    386	if (end == total_end)
    387		return false;
    388
    389	rec->br_startoff += rec->br_blockcount;
    390	if (!isnullstartblock(rec->br_startblock) &&
    391	    rec->br_startblock != DELAYSTARTBLOCK)
    392		rec->br_startblock += rec->br_blockcount;
    393	rec->br_blockcount = total_end - end;
    394	return true;
    395}
    396
    397/*
    398 * Get inode's extents as described in bmv, and format for output.
    399 * Calls formatter to fill the user's buffer until all extents
    400 * are mapped, until the passed-in bmv->bmv_count slots have
    401 * been filled, or until the formatter short-circuits the loop,
    402 * if it is tracking filled-in extents on its own.
    403 */
    404int						/* error code */
    405xfs_getbmap(
    406	struct xfs_inode	*ip,
    407	struct getbmapx		*bmv,		/* user bmap structure */
    408	struct kgetbmap		*out)
    409{
    410	struct xfs_mount	*mp = ip->i_mount;
    411	int			iflags = bmv->bmv_iflags;
    412	int			whichfork, lock, error = 0;
    413	int64_t			bmv_end, max_len;
    414	xfs_fileoff_t		bno, first_bno;
    415	struct xfs_ifork	*ifp;
    416	struct xfs_bmbt_irec	got, rec;
    417	xfs_filblks_t		len;
    418	struct xfs_iext_cursor	icur;
    419
    420	if (bmv->bmv_iflags & ~BMV_IF_VALID)
    421		return -EINVAL;
    422#ifndef DEBUG
    423	/* Only allow CoW fork queries if we're debugging. */
    424	if (iflags & BMV_IF_COWFORK)
    425		return -EINVAL;
    426#endif
    427	if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
    428		return -EINVAL;
    429
    430	if (bmv->bmv_length < -1)
    431		return -EINVAL;
    432	bmv->bmv_entries = 0;
    433	if (bmv->bmv_length == 0)
    434		return 0;
    435
    436	if (iflags & BMV_IF_ATTRFORK)
    437		whichfork = XFS_ATTR_FORK;
    438	else if (iflags & BMV_IF_COWFORK)
    439		whichfork = XFS_COW_FORK;
    440	else
    441		whichfork = XFS_DATA_FORK;
    442	ifp = XFS_IFORK_PTR(ip, whichfork);
    443
    444	xfs_ilock(ip, XFS_IOLOCK_SHARED);
    445	switch (whichfork) {
    446	case XFS_ATTR_FORK:
    447		if (!XFS_IFORK_Q(ip))
    448			goto out_unlock_iolock;
    449
    450		max_len = 1LL << 32;
    451		lock = xfs_ilock_attr_map_shared(ip);
    452		break;
    453	case XFS_COW_FORK:
    454		/* No CoW fork? Just return */
    455		if (!ifp)
    456			goto out_unlock_iolock;
    457
    458		if (xfs_get_cowextsz_hint(ip))
    459			max_len = mp->m_super->s_maxbytes;
    460		else
    461			max_len = XFS_ISIZE(ip);
    462
    463		lock = XFS_ILOCK_SHARED;
    464		xfs_ilock(ip, lock);
    465		break;
    466	case XFS_DATA_FORK:
    467		if (!(iflags & BMV_IF_DELALLOC) &&
    468		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_disk_size)) {
    469			error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
    470			if (error)
    471				goto out_unlock_iolock;
    472
    473			/*
    474			 * Even after flushing the inode, there can still be
    475			 * delalloc blocks on the inode beyond EOF due to
    476			 * speculative preallocation.  These are not removed
    477			 * until the release function is called or the inode
    478			 * is inactivated.  Hence we cannot assert here that
    479			 * ip->i_delayed_blks == 0.
    480			 */
    481		}
    482
    483		if (xfs_get_extsz_hint(ip) ||
    484		    (ip->i_diflags &
    485		     (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))
    486			max_len = mp->m_super->s_maxbytes;
    487		else
    488			max_len = XFS_ISIZE(ip);
    489
    490		lock = xfs_ilock_data_map_shared(ip);
    491		break;
    492	}
    493
    494	switch (ifp->if_format) {
    495	case XFS_DINODE_FMT_EXTENTS:
    496	case XFS_DINODE_FMT_BTREE:
    497		break;
    498	case XFS_DINODE_FMT_LOCAL:
    499		/* Local format inode forks report no extents. */
    500		goto out_unlock_ilock;
    501	default:
    502		error = -EINVAL;
    503		goto out_unlock_ilock;
    504	}
    505
    506	if (bmv->bmv_length == -1) {
    507		max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len));
    508		bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset);
    509	}
    510
    511	bmv_end = bmv->bmv_offset + bmv->bmv_length;
    512
    513	first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
    514	len = XFS_BB_TO_FSB(mp, bmv->bmv_length);
    515
    516	error = xfs_iread_extents(NULL, ip, whichfork);
    517	if (error)
    518		goto out_unlock_ilock;
    519
    520	if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
    521		/*
    522		 * Report a whole-file hole if the delalloc flag is set to
    523		 * stay compatible with the old implementation.
    524		 */
    525		if (iflags & BMV_IF_DELALLOC)
    526			xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
    527					XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
    528		goto out_unlock_ilock;
    529	}
    530
    531	while (!xfs_getbmap_full(bmv)) {
    532		xfs_trim_extent(&got, first_bno, len);
    533
    534		/*
    535		 * Report an entry for a hole if this extent doesn't directly
    536		 * follow the previous one.
    537		 */
    538		if (got.br_startoff > bno) {
    539			xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
    540					got.br_startoff);
    541			if (xfs_getbmap_full(bmv))
    542				break;
    543		}
    544
    545		/*
    546		 * In order to report shared extents accurately, we report each
    547		 * distinct shared / unshared part of a single bmbt record with
    548		 * an individual getbmapx record.
    549		 */
    550		bno = got.br_startoff + got.br_blockcount;
    551		rec = got;
    552		do {
    553			error = xfs_getbmap_report_one(ip, bmv, out, bmv_end,
    554					&rec);
    555			if (error || xfs_getbmap_full(bmv))
    556				goto out_unlock_ilock;
    557		} while (xfs_getbmap_next_rec(&rec, bno));
    558
    559		if (!xfs_iext_next_extent(ifp, &icur, &got)) {
    560			xfs_fileoff_t	end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
    561
    562			out[bmv->bmv_entries - 1].bmv_oflags |= BMV_OF_LAST;
    563
    564			if (whichfork != XFS_ATTR_FORK && bno < end &&
    565			    !xfs_getbmap_full(bmv)) {
    566				xfs_getbmap_report_hole(ip, bmv, out, bmv_end,
    567						bno, end);
    568			}
    569			break;
    570		}
    571
    572		if (bno >= first_bno + len)
    573			break;
    574	}
    575
    576out_unlock_ilock:
    577	xfs_iunlock(ip, lock);
    578out_unlock_iolock:
    579	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
    580	return error;
    581}
    582
    583/*
    584 * Dead simple method of punching delalyed allocation blocks from a range in
    585 * the inode.  This will always punch out both the start and end blocks, even
    586 * if the ranges only partially overlap them, so it is up to the caller to
    587 * ensure that partial blocks are not passed in.
    588 */
    589int
    590xfs_bmap_punch_delalloc_range(
    591	struct xfs_inode	*ip,
    592	xfs_fileoff_t		start_fsb,
    593	xfs_fileoff_t		length)
    594{
    595	struct xfs_ifork	*ifp = &ip->i_df;
    596	xfs_fileoff_t		end_fsb = start_fsb + length;
    597	struct xfs_bmbt_irec	got, del;
    598	struct xfs_iext_cursor	icur;
    599	int			error = 0;
    600
    601	ASSERT(!xfs_need_iread_extents(ifp));
    602
    603	xfs_ilock(ip, XFS_ILOCK_EXCL);
    604	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
    605		goto out_unlock;
    606
    607	while (got.br_startoff + got.br_blockcount > start_fsb) {
    608		del = got;
    609		xfs_trim_extent(&del, start_fsb, length);
    610
    611		/*
    612		 * A delete can push the cursor forward. Step back to the
    613		 * previous extent on non-delalloc or extents outside the
    614		 * target range.
    615		 */
    616		if (!del.br_blockcount ||
    617		    !isnullstartblock(del.br_startblock)) {
    618			if (!xfs_iext_prev_extent(ifp, &icur, &got))
    619				break;
    620			continue;
    621		}
    622
    623		error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur,
    624						  &got, &del);
    625		if (error || !xfs_iext_get_extent(ifp, &icur, &got))
    626			break;
    627	}
    628
    629out_unlock:
    630	xfs_iunlock(ip, XFS_ILOCK_EXCL);
    631	return error;
    632}
    633
    634/*
    635 * Test whether it is appropriate to check an inode for and free post EOF
    636 * blocks. The 'force' parameter determines whether we should also consider
    637 * regular files that are marked preallocated or append-only.
    638 */
    639bool
    640xfs_can_free_eofblocks(
    641	struct xfs_inode	*ip,
    642	bool			force)
    643{
    644	struct xfs_bmbt_irec	imap;
    645	struct xfs_mount	*mp = ip->i_mount;
    646	xfs_fileoff_t		end_fsb;
    647	xfs_fileoff_t		last_fsb;
    648	int			nimaps = 1;
    649	int			error;
    650
    651	/*
    652	 * Caller must either hold the exclusive io lock; or be inactivating
    653	 * the inode, which guarantees there are no other users of the inode.
    654	 */
    655	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL) ||
    656	       (VFS_I(ip)->i_state & I_FREEING));
    657
    658	/* prealloc/delalloc exists only on regular files */
    659	if (!S_ISREG(VFS_I(ip)->i_mode))
    660		return false;
    661
    662	/*
    663	 * Zero sized files with no cached pages and delalloc blocks will not
    664	 * have speculative prealloc/delalloc blocks to remove.
    665	 */
    666	if (VFS_I(ip)->i_size == 0 &&
    667	    VFS_I(ip)->i_mapping->nrpages == 0 &&
    668	    ip->i_delayed_blks == 0)
    669		return false;
    670
    671	/* If we haven't read in the extent list, then don't do it now. */
    672	if (xfs_need_iread_extents(&ip->i_df))
    673		return false;
    674
    675	/*
    676	 * Do not free real preallocated or append-only files unless the file
    677	 * has delalloc blocks and we are forced to remove them.
    678	 */
    679	if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
    680		if (!force || ip->i_delayed_blks == 0)
    681			return false;
    682
    683	/*
    684	 * Do not try to free post-EOF blocks if EOF is beyond the end of the
    685	 * range supported by the page cache, because the truncation will loop
    686	 * forever.
    687	 */
    688	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
    689	if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1)
    690		end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize);
    691	last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
    692	if (last_fsb <= end_fsb)
    693		return false;
    694
    695	/*
    696	 * Look up the mapping for the first block past EOF.  If we can't find
    697	 * it, there's nothing to free.
    698	 */
    699	xfs_ilock(ip, XFS_ILOCK_SHARED);
    700	error = xfs_bmapi_read(ip, end_fsb, last_fsb - end_fsb, &imap, &nimaps,
    701			0);
    702	xfs_iunlock(ip, XFS_ILOCK_SHARED);
    703	if (error || nimaps == 0)
    704		return false;
    705
    706	/*
    707	 * If there's a real mapping there or there are delayed allocation
    708	 * reservations, then we have post-EOF blocks to try to free.
    709	 */
    710	return imap.br_startblock != HOLESTARTBLOCK || ip->i_delayed_blks;
    711}
    712
    713/*
    714 * This is called to free any blocks beyond eof. The caller must hold
    715 * IOLOCK_EXCL unless we are in the inode reclaim path and have the only
    716 * reference to the inode.
    717 */
    718int
    719xfs_free_eofblocks(
    720	struct xfs_inode	*ip)
    721{
    722	struct xfs_trans	*tp;
    723	struct xfs_mount	*mp = ip->i_mount;
    724	int			error;
    725
    726	/* Attach the dquots to the inode up front. */
    727	error = xfs_qm_dqattach(ip);
    728	if (error)
    729		return error;
    730
    731	/* Wait on dio to ensure i_size has settled. */
    732	inode_dio_wait(VFS_I(ip));
    733
    734	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
    735	if (error) {
    736		ASSERT(xfs_is_shutdown(mp));
    737		return error;
    738	}
    739
    740	xfs_ilock(ip, XFS_ILOCK_EXCL);
    741	xfs_trans_ijoin(tp, ip, 0);
    742
    743	/*
    744	 * Do not update the on-disk file size.  If we update the on-disk file
    745	 * size and then the system crashes before the contents of the file are
    746	 * flushed to disk then the files may be full of holes (ie NULL files
    747	 * bug).
    748	 */
    749	error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK,
    750				XFS_ISIZE(ip), XFS_BMAPI_NODISCARD);
    751	if (error)
    752		goto err_cancel;
    753
    754	error = xfs_trans_commit(tp);
    755	if (error)
    756		goto out_unlock;
    757
    758	xfs_inode_clear_eofblocks_tag(ip);
    759	goto out_unlock;
    760
    761err_cancel:
    762	/*
    763	 * If we get an error at this point we simply don't
    764	 * bother truncating the file.
    765	 */
    766	xfs_trans_cancel(tp);
    767out_unlock:
    768	xfs_iunlock(ip, XFS_ILOCK_EXCL);
    769	return error;
    770}
    771
    772int
    773xfs_alloc_file_space(
    774	struct xfs_inode	*ip,
    775	xfs_off_t		offset,
    776	xfs_off_t		len)
    777{
    778	xfs_mount_t		*mp = ip->i_mount;
    779	xfs_off_t		count;
    780	xfs_filblks_t		allocated_fsb;
    781	xfs_filblks_t		allocatesize_fsb;
    782	xfs_extlen_t		extsz, temp;
    783	xfs_fileoff_t		startoffset_fsb;
    784	xfs_fileoff_t		endoffset_fsb;
    785	int			nimaps;
    786	int			rt;
    787	xfs_trans_t		*tp;
    788	xfs_bmbt_irec_t		imaps[1], *imapp;
    789	int			error;
    790
    791	trace_xfs_alloc_file_space(ip);
    792
    793	if (xfs_is_shutdown(mp))
    794		return -EIO;
    795
    796	error = xfs_qm_dqattach(ip);
    797	if (error)
    798		return error;
    799
    800	if (len <= 0)
    801		return -EINVAL;
    802
    803	rt = XFS_IS_REALTIME_INODE(ip);
    804	extsz = xfs_get_extsz_hint(ip);
    805
    806	count = len;
    807	imapp = &imaps[0];
    808	nimaps = 1;
    809	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
    810	endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
    811	allocatesize_fsb = endoffset_fsb - startoffset_fsb;
    812
    813	/*
    814	 * Allocate file space until done or until there is an error
    815	 */
    816	while (allocatesize_fsb && !error) {
    817		xfs_fileoff_t	s, e;
    818		unsigned int	dblocks, rblocks, resblks;
    819
    820		/*
    821		 * Determine space reservations for data/realtime.
    822		 */
    823		if (unlikely(extsz)) {
    824			s = startoffset_fsb;
    825			do_div(s, extsz);
    826			s *= extsz;
    827			e = startoffset_fsb + allocatesize_fsb;
    828			div_u64_rem(startoffset_fsb, extsz, &temp);
    829			if (temp)
    830				e += temp;
    831			div_u64_rem(e, extsz, &temp);
    832			if (temp)
    833				e += extsz - temp;
    834		} else {
    835			s = 0;
    836			e = allocatesize_fsb;
    837		}
    838
    839		/*
    840		 * The transaction reservation is limited to a 32-bit block
    841		 * count, hence we need to limit the number of blocks we are
    842		 * trying to reserve to avoid an overflow. We can't allocate
    843		 * more than @nimaps extents, and an extent is limited on disk
    844		 * to XFS_BMBT_MAX_EXTLEN (21 bits), so use that to enforce the
    845		 * limit.
    846		 */
    847		resblks = min_t(xfs_fileoff_t, (e - s),
    848				(XFS_MAX_BMBT_EXTLEN * nimaps));
    849		if (unlikely(rt)) {
    850			dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
    851			rblocks = resblks;
    852		} else {
    853			dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
    854			rblocks = 0;
    855		}
    856
    857		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
    858				dblocks, rblocks, false, &tp);
    859		if (error)
    860			break;
    861
    862		error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
    863				XFS_IEXT_ADD_NOSPLIT_CNT);
    864		if (error == -EFBIG)
    865			error = xfs_iext_count_upgrade(tp, ip,
    866					XFS_IEXT_ADD_NOSPLIT_CNT);
    867		if (error)
    868			goto error;
    869
    870		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
    871				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
    872				&nimaps);
    873		if (error)
    874			goto error;
    875
    876		ip->i_diflags |= XFS_DIFLAG_PREALLOC;
    877		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
    878
    879		error = xfs_trans_commit(tp);
    880		xfs_iunlock(ip, XFS_ILOCK_EXCL);
    881		if (error)
    882			break;
    883
    884		allocated_fsb = imapp->br_blockcount;
    885
    886		if (nimaps == 0) {
    887			error = -ENOSPC;
    888			break;
    889		}
    890
    891		startoffset_fsb += allocated_fsb;
    892		allocatesize_fsb -= allocated_fsb;
    893	}
    894
    895	return error;
    896
    897error:
    898	xfs_trans_cancel(tp);
    899	xfs_iunlock(ip, XFS_ILOCK_EXCL);
    900	return error;
    901}
    902
    903static int
    904xfs_unmap_extent(
    905	struct xfs_inode	*ip,
    906	xfs_fileoff_t		startoffset_fsb,
    907	xfs_filblks_t		len_fsb,
    908	int			*done)
    909{
    910	struct xfs_mount	*mp = ip->i_mount;
    911	struct xfs_trans	*tp;
    912	uint			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
    913	int			error;
    914
    915	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
    916			false, &tp);
    917	if (error)
    918		return error;
    919
    920	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
    921			XFS_IEXT_PUNCH_HOLE_CNT);
    922	if (error == -EFBIG)
    923		error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
    924	if (error)
    925		goto out_trans_cancel;
    926
    927	error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, done);
    928	if (error)
    929		goto out_trans_cancel;
    930
    931	error = xfs_trans_commit(tp);
    932out_unlock:
    933	xfs_iunlock(ip, XFS_ILOCK_EXCL);
    934	return error;
    935
    936out_trans_cancel:
    937	xfs_trans_cancel(tp);
    938	goto out_unlock;
    939}
    940
    941/* Caller must first wait for the completion of any pending DIOs if required. */
    942int
    943xfs_flush_unmap_range(
    944	struct xfs_inode	*ip,
    945	xfs_off_t		offset,
    946	xfs_off_t		len)
    947{
    948	struct xfs_mount	*mp = ip->i_mount;
    949	struct inode		*inode = VFS_I(ip);
    950	xfs_off_t		rounding, start, end;
    951	int			error;
    952
    953	rounding = max_t(xfs_off_t, mp->m_sb.sb_blocksize, PAGE_SIZE);
    954	start = round_down(offset, rounding);
    955	end = round_up(offset + len, rounding) - 1;
    956
    957	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
    958	if (error)
    959		return error;
    960	truncate_pagecache_range(inode, start, end);
    961	return 0;
    962}
    963
    964int
    965xfs_free_file_space(
    966	struct xfs_inode	*ip,
    967	xfs_off_t		offset,
    968	xfs_off_t		len)
    969{
    970	struct xfs_mount	*mp = ip->i_mount;
    971	xfs_fileoff_t		startoffset_fsb;
    972	xfs_fileoff_t		endoffset_fsb;
    973	int			done = 0, error;
    974
    975	trace_xfs_free_file_space(ip);
    976
    977	error = xfs_qm_dqattach(ip);
    978	if (error)
    979		return error;
    980
    981	if (len <= 0)	/* if nothing being freed */
    982		return 0;
    983
    984	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
    985	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
    986
    987	/* We can only free complete realtime extents. */
    988	if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) {
    989		startoffset_fsb = roundup_64(startoffset_fsb,
    990					     mp->m_sb.sb_rextsize);
    991		endoffset_fsb = rounddown_64(endoffset_fsb,
    992					     mp->m_sb.sb_rextsize);
    993	}
    994
    995	/*
    996	 * Need to zero the stuff we're not freeing, on disk.
    997	 */
    998	if (endoffset_fsb > startoffset_fsb) {
    999		while (!done) {
   1000			error = xfs_unmap_extent(ip, startoffset_fsb,
   1001					endoffset_fsb - startoffset_fsb, &done);
   1002			if (error)
   1003				return error;
   1004		}
   1005	}
   1006
   1007	/*
   1008	 * Now that we've unmap all full blocks we'll have to zero out any
   1009	 * partial block at the beginning and/or end.  xfs_zero_range is smart
   1010	 * enough to skip any holes, including those we just created, but we
   1011	 * must take care not to zero beyond EOF and enlarge i_size.
   1012	 */
   1013	if (offset >= XFS_ISIZE(ip))
   1014		return 0;
   1015	if (offset + len > XFS_ISIZE(ip))
   1016		len = XFS_ISIZE(ip) - offset;
   1017	error = xfs_zero_range(ip, offset, len, NULL);
   1018	if (error)
   1019		return error;
   1020
   1021	/*
   1022	 * If we zeroed right up to EOF and EOF straddles a page boundary we
   1023	 * must make sure that the post-EOF area is also zeroed because the
   1024	 * page could be mmap'd and xfs_zero_range doesn't do that for us.
   1025	 * Writeback of the eof page will do this, albeit clumsily.
   1026	 */
   1027	if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
   1028		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
   1029				round_down(offset + len, PAGE_SIZE), LLONG_MAX);
   1030	}
   1031
   1032	return error;
   1033}
   1034
   1035static int
   1036xfs_prepare_shift(
   1037	struct xfs_inode	*ip,
   1038	loff_t			offset)
   1039{
   1040	struct xfs_mount	*mp = ip->i_mount;
   1041	int			error;
   1042
   1043	/*
   1044	 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
   1045	 * into the accessible region of the file.
   1046	 */
   1047	if (xfs_can_free_eofblocks(ip, true)) {
   1048		error = xfs_free_eofblocks(ip);
   1049		if (error)
   1050			return error;
   1051	}
   1052
   1053	/*
   1054	 * Shift operations must stabilize the start block offset boundary along
   1055	 * with the full range of the operation. If we don't, a COW writeback
   1056	 * completion could race with an insert, front merge with the start
   1057	 * extent (after split) during the shift and corrupt the file. Start
   1058	 * with the block just prior to the start to stabilize the boundary.
   1059	 */
   1060	offset = round_down(offset, mp->m_sb.sb_blocksize);
   1061	if (offset)
   1062		offset -= mp->m_sb.sb_blocksize;
   1063
   1064	/*
   1065	 * Writeback and invalidate cache for the remainder of the file as we're
   1066	 * about to shift down every extent from offset to EOF.
   1067	 */
   1068	error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
   1069	if (error)
   1070		return error;
   1071
   1072	/*
   1073	 * Clean out anything hanging around in the cow fork now that
   1074	 * we've flushed all the dirty data out to disk to avoid having
   1075	 * CoW extents at the wrong offsets.
   1076	 */
   1077	if (xfs_inode_has_cow_data(ip)) {
   1078		error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF,
   1079				true);
   1080		if (error)
   1081			return error;
   1082	}
   1083
   1084	return 0;
   1085}
   1086
   1087/*
   1088 * xfs_collapse_file_space()
   1089 *	This routine frees disk space and shift extent for the given file.
   1090 *	The first thing we do is to free data blocks in the specified range
   1091 *	by calling xfs_free_file_space(). It would also sync dirty data
   1092 *	and invalidate page cache over the region on which collapse range
   1093 *	is working. And Shift extent records to the left to cover a hole.
   1094 * RETURNS:
   1095 *	0 on success
   1096 *	errno on error
   1097 *
   1098 */
   1099int
   1100xfs_collapse_file_space(
   1101	struct xfs_inode	*ip,
   1102	xfs_off_t		offset,
   1103	xfs_off_t		len)
   1104{
   1105	struct xfs_mount	*mp = ip->i_mount;
   1106	struct xfs_trans	*tp;
   1107	int			error;
   1108	xfs_fileoff_t		next_fsb = XFS_B_TO_FSB(mp, offset + len);
   1109	xfs_fileoff_t		shift_fsb = XFS_B_TO_FSB(mp, len);
   1110	bool			done = false;
   1111
   1112	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
   1113	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
   1114
   1115	trace_xfs_collapse_file_space(ip);
   1116
   1117	error = xfs_free_file_space(ip, offset, len);
   1118	if (error)
   1119		return error;
   1120
   1121	error = xfs_prepare_shift(ip, offset);
   1122	if (error)
   1123		return error;
   1124
   1125	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
   1126	if (error)
   1127		return error;
   1128
   1129	xfs_ilock(ip, XFS_ILOCK_EXCL);
   1130	xfs_trans_ijoin(tp, ip, 0);
   1131
   1132	while (!done) {
   1133		error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb,
   1134				&done);
   1135		if (error)
   1136			goto out_trans_cancel;
   1137		if (done)
   1138			break;
   1139
   1140		/* finish any deferred frees and roll the transaction */
   1141		error = xfs_defer_finish(&tp);
   1142		if (error)
   1143			goto out_trans_cancel;
   1144	}
   1145
   1146	error = xfs_trans_commit(tp);
   1147	xfs_iunlock(ip, XFS_ILOCK_EXCL);
   1148	return error;
   1149
   1150out_trans_cancel:
   1151	xfs_trans_cancel(tp);
   1152	xfs_iunlock(ip, XFS_ILOCK_EXCL);
   1153	return error;
   1154}
   1155
   1156/*
   1157 * xfs_insert_file_space()
   1158 *	This routine create hole space by shifting extents for the given file.
   1159 *	The first thing we do is to sync dirty data and invalidate page cache
   1160 *	over the region on which insert range is working. And split an extent
   1161 *	to two extents at given offset by calling xfs_bmap_split_extent.
   1162 *	And shift all extent records which are laying between [offset,
   1163 *	last allocated extent] to the right to reserve hole range.
   1164 * RETURNS:
   1165 *	0 on success
   1166 *	errno on error
   1167 */
   1168int
   1169xfs_insert_file_space(
   1170	struct xfs_inode	*ip,
   1171	loff_t			offset,
   1172	loff_t			len)
   1173{
   1174	struct xfs_mount	*mp = ip->i_mount;
   1175	struct xfs_trans	*tp;
   1176	int			error;
   1177	xfs_fileoff_t		stop_fsb = XFS_B_TO_FSB(mp, offset);
   1178	xfs_fileoff_t		next_fsb = NULLFSBLOCK;
   1179	xfs_fileoff_t		shift_fsb = XFS_B_TO_FSB(mp, len);
   1180	bool			done = false;
   1181
   1182	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
   1183	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
   1184
   1185	trace_xfs_insert_file_space(ip);
   1186
   1187	error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb);
   1188	if (error)
   1189		return error;
   1190
   1191	error = xfs_prepare_shift(ip, offset);
   1192	if (error)
   1193		return error;
   1194
   1195	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
   1196			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
   1197	if (error)
   1198		return error;
   1199
   1200	xfs_ilock(ip, XFS_ILOCK_EXCL);
   1201	xfs_trans_ijoin(tp, ip, 0);
   1202
   1203	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
   1204			XFS_IEXT_PUNCH_HOLE_CNT);
   1205	if (error == -EFBIG)
   1206		error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
   1207	if (error)
   1208		goto out_trans_cancel;
   1209
   1210	/*
   1211	 * The extent shifting code works on extent granularity. So, if stop_fsb
   1212	 * is not the starting block of extent, we need to split the extent at
   1213	 * stop_fsb.
   1214	 */
   1215	error = xfs_bmap_split_extent(tp, ip, stop_fsb);
   1216	if (error)
   1217		goto out_trans_cancel;
   1218
   1219	do {
   1220		error = xfs_defer_finish(&tp);
   1221		if (error)
   1222			goto out_trans_cancel;
   1223
   1224		error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb,
   1225				&done, stop_fsb);
   1226		if (error)
   1227			goto out_trans_cancel;
   1228	} while (!done);
   1229
   1230	error = xfs_trans_commit(tp);
   1231	xfs_iunlock(ip, XFS_ILOCK_EXCL);
   1232	return error;
   1233
   1234out_trans_cancel:
   1235	xfs_trans_cancel(tp);
   1236	xfs_iunlock(ip, XFS_ILOCK_EXCL);
   1237	return error;
   1238}
   1239
   1240/*
   1241 * We need to check that the format of the data fork in the temporary inode is
   1242 * valid for the target inode before doing the swap. This is not a problem with
   1243 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
   1244 * data fork depending on the space the attribute fork is taking so we can get
   1245 * invalid formats on the target inode.
   1246 *
   1247 * E.g. target has space for 7 extents in extent format, temp inode only has
   1248 * space for 6.  If we defragment down to 7 extents, then the tmp format is a
   1249 * btree, but when swapped it needs to be in extent format. Hence we can't just
   1250 * blindly swap data forks on attr2 filesystems.
   1251 *
   1252 * Note that we check the swap in both directions so that we don't end up with
   1253 * a corrupt temporary inode, either.
   1254 *
   1255 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
   1256 * inode will prevent this situation from occurring, so all we do here is
   1257 * reject and log the attempt. basically we are putting the responsibility on
   1258 * userspace to get this right.
   1259 */
   1260static int
   1261xfs_swap_extents_check_format(
   1262	struct xfs_inode	*ip,	/* target inode */
   1263	struct xfs_inode	*tip)	/* tmp inode */
   1264{
   1265	struct xfs_ifork	*ifp = &ip->i_df;
   1266	struct xfs_ifork	*tifp = &tip->i_df;
   1267
   1268	/* User/group/project quota ids must match if quotas are enforced. */
   1269	if (XFS_IS_QUOTA_ON(ip->i_mount) &&
   1270	    (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) ||
   1271	     !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) ||
   1272	     ip->i_projid != tip->i_projid))
   1273		return -EINVAL;
   1274
   1275	/* Should never get a local format */
   1276	if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
   1277	    tifp->if_format == XFS_DINODE_FMT_LOCAL)
   1278		return -EINVAL;
   1279
   1280	/*
   1281	 * if the target inode has less extents that then temporary inode then
   1282	 * why did userspace call us?
   1283	 */
   1284	if (ifp->if_nextents < tifp->if_nextents)
   1285		return -EINVAL;
   1286
   1287	/*
   1288	 * If we have to use the (expensive) rmap swap method, we can
   1289	 * handle any number of extents and any format.
   1290	 */
   1291	if (xfs_has_rmapbt(ip->i_mount))
   1292		return 0;
   1293
   1294	/*
   1295	 * if the target inode is in extent form and the temp inode is in btree
   1296	 * form then we will end up with the target inode in the wrong format
   1297	 * as we already know there are less extents in the temp inode.
   1298	 */
   1299	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
   1300	    tifp->if_format == XFS_DINODE_FMT_BTREE)
   1301		return -EINVAL;
   1302
   1303	/* Check temp in extent form to max in target */
   1304	if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
   1305	    tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
   1306		return -EINVAL;
   1307
   1308	/* Check target in extent form to max in temp */
   1309	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
   1310	    ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
   1311		return -EINVAL;
   1312
   1313	/*
   1314	 * If we are in a btree format, check that the temp root block will fit
   1315	 * in the target and that it has enough extents to be in btree format
   1316	 * in the target.
   1317	 *
   1318	 * Note that we have to be careful to allow btree->extent conversions
   1319	 * (a common defrag case) which will occur when the temp inode is in
   1320	 * extent format...
   1321	 */
   1322	if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
   1323		if (XFS_IFORK_Q(ip) &&
   1324		    XFS_BMAP_BMDR_SPACE(tifp->if_broot) > XFS_IFORK_BOFF(ip))
   1325			return -EINVAL;
   1326		if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
   1327			return -EINVAL;
   1328	}
   1329
   1330	/* Reciprocal target->temp btree format checks */
   1331	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
   1332		if (XFS_IFORK_Q(tip) &&
   1333		    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
   1334			return -EINVAL;
   1335		if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
   1336			return -EINVAL;
   1337	}
   1338
   1339	return 0;
   1340}
   1341
   1342static int
   1343xfs_swap_extent_flush(
   1344	struct xfs_inode	*ip)
   1345{
   1346	int	error;
   1347
   1348	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
   1349	if (error)
   1350		return error;
   1351	truncate_pagecache_range(VFS_I(ip), 0, -1);
   1352
   1353	/* Verify O_DIRECT for ftmp */
   1354	if (VFS_I(ip)->i_mapping->nrpages)
   1355		return -EINVAL;
   1356	return 0;
   1357}
   1358
   1359/*
   1360 * Move extents from one file to another, when rmap is enabled.
   1361 */
   1362STATIC int
   1363xfs_swap_extent_rmap(
   1364	struct xfs_trans		**tpp,
   1365	struct xfs_inode		*ip,
   1366	struct xfs_inode		*tip)
   1367{
   1368	struct xfs_trans		*tp = *tpp;
   1369	struct xfs_bmbt_irec		irec;
   1370	struct xfs_bmbt_irec		uirec;
   1371	struct xfs_bmbt_irec		tirec;
   1372	xfs_fileoff_t			offset_fsb;
   1373	xfs_fileoff_t			end_fsb;
   1374	xfs_filblks_t			count_fsb;
   1375	int				error;
   1376	xfs_filblks_t			ilen;
   1377	xfs_filblks_t			rlen;
   1378	int				nimaps;
   1379	uint64_t			tip_flags2;
   1380
   1381	/*
   1382	 * If the source file has shared blocks, we must flag the donor
   1383	 * file as having shared blocks so that we get the shared-block
   1384	 * rmap functions when we go to fix up the rmaps.  The flags
   1385	 * will be switch for reals later.
   1386	 */
   1387	tip_flags2 = tip->i_diflags2;
   1388	if (ip->i_diflags2 & XFS_DIFLAG2_REFLINK)
   1389		tip->i_diflags2 |= XFS_DIFLAG2_REFLINK;
   1390
   1391	offset_fsb = 0;
   1392	end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip)));
   1393	count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
   1394
   1395	while (count_fsb) {
   1396		/* Read extent from the donor file */
   1397		nimaps = 1;
   1398		error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec,
   1399				&nimaps, 0);
   1400		if (error)
   1401			goto out;
   1402		ASSERT(nimaps == 1);
   1403		ASSERT(tirec.br_startblock != DELAYSTARTBLOCK);
   1404
   1405		trace_xfs_swap_extent_rmap_remap(tip, &tirec);
   1406		ilen = tirec.br_blockcount;
   1407
   1408		/* Unmap the old blocks in the source file. */
   1409		while (tirec.br_blockcount) {
   1410			ASSERT(tp->t_firstblock == NULLFSBLOCK);
   1411			trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec);
   1412
   1413			/* Read extent from the source file */
   1414			nimaps = 1;
   1415			error = xfs_bmapi_read(ip, tirec.br_startoff,
   1416					tirec.br_blockcount, &irec,
   1417					&nimaps, 0);
   1418			if (error)
   1419				goto out;
   1420			ASSERT(nimaps == 1);
   1421			ASSERT(tirec.br_startoff == irec.br_startoff);
   1422			trace_xfs_swap_extent_rmap_remap_piece(ip, &irec);
   1423
   1424			/* Trim the extent. */
   1425			uirec = tirec;
   1426			uirec.br_blockcount = rlen = min_t(xfs_filblks_t,
   1427					tirec.br_blockcount,
   1428					irec.br_blockcount);
   1429			trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
   1430
   1431			if (xfs_bmap_is_real_extent(&uirec)) {
   1432				error = xfs_iext_count_may_overflow(ip,
   1433						XFS_DATA_FORK,
   1434						XFS_IEXT_SWAP_RMAP_CNT);
   1435				if (error == -EFBIG)
   1436					error = xfs_iext_count_upgrade(tp, ip,
   1437							XFS_IEXT_SWAP_RMAP_CNT);
   1438				if (error)
   1439					goto out;
   1440			}
   1441
   1442			if (xfs_bmap_is_real_extent(&irec)) {
   1443				error = xfs_iext_count_may_overflow(tip,
   1444						XFS_DATA_FORK,
   1445						XFS_IEXT_SWAP_RMAP_CNT);
   1446				if (error == -EFBIG)
   1447					error = xfs_iext_count_upgrade(tp, ip,
   1448							XFS_IEXT_SWAP_RMAP_CNT);
   1449				if (error)
   1450					goto out;
   1451			}
   1452
   1453			/* Remove the mapping from the donor file. */
   1454			xfs_bmap_unmap_extent(tp, tip, &uirec);
   1455
   1456			/* Remove the mapping from the source file. */
   1457			xfs_bmap_unmap_extent(tp, ip, &irec);
   1458
   1459			/* Map the donor file's blocks into the source file. */
   1460			xfs_bmap_map_extent(tp, ip, &uirec);
   1461
   1462			/* Map the source file's blocks into the donor file. */
   1463			xfs_bmap_map_extent(tp, tip, &irec);
   1464
   1465			error = xfs_defer_finish(tpp);
   1466			tp = *tpp;
   1467			if (error)
   1468				goto out;
   1469
   1470			tirec.br_startoff += rlen;
   1471			if (tirec.br_startblock != HOLESTARTBLOCK &&
   1472			    tirec.br_startblock != DELAYSTARTBLOCK)
   1473				tirec.br_startblock += rlen;
   1474			tirec.br_blockcount -= rlen;
   1475		}
   1476
   1477		/* Roll on... */
   1478		count_fsb -= ilen;
   1479		offset_fsb += ilen;
   1480	}
   1481
   1482	tip->i_diflags2 = tip_flags2;
   1483	return 0;
   1484
   1485out:
   1486	trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_);
   1487	tip->i_diflags2 = tip_flags2;
   1488	return error;
   1489}
   1490
   1491/* Swap the extents of two files by swapping data forks. */
   1492STATIC int
   1493xfs_swap_extent_forks(
   1494	struct xfs_trans	*tp,
   1495	struct xfs_inode	*ip,
   1496	struct xfs_inode	*tip,
   1497	int			*src_log_flags,
   1498	int			*target_log_flags)
   1499{
   1500	xfs_filblks_t		aforkblks = 0;
   1501	xfs_filblks_t		taforkblks = 0;
   1502	xfs_extnum_t		junk;
   1503	uint64_t		tmp;
   1504	int			error;
   1505
   1506	/*
   1507	 * Count the number of extended attribute blocks
   1508	 */
   1509	if (XFS_IFORK_Q(ip) && ip->i_afp->if_nextents > 0 &&
   1510	    ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
   1511		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk,
   1512				&aforkblks);
   1513		if (error)
   1514			return error;
   1515	}
   1516	if (XFS_IFORK_Q(tip) && tip->i_afp->if_nextents > 0 &&
   1517	    tip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
   1518		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk,
   1519				&taforkblks);
   1520		if (error)
   1521			return error;
   1522	}
   1523
   1524	/*
   1525	 * Btree format (v3) inodes have the inode number stamped in the bmbt
   1526	 * block headers. We can't start changing the bmbt blocks until the
   1527	 * inode owner change is logged so recovery does the right thing in the
   1528	 * event of a crash. Set the owner change log flags now and leave the
   1529	 * bmbt scan as the last step.
   1530	 */
   1531	if (xfs_has_v3inodes(ip->i_mount)) {
   1532		if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
   1533			(*target_log_flags) |= XFS_ILOG_DOWNER;
   1534		if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
   1535			(*src_log_flags) |= XFS_ILOG_DOWNER;
   1536	}
   1537
   1538	/*
   1539	 * Swap the data forks of the inodes
   1540	 */
   1541	swap(ip->i_df, tip->i_df);
   1542
   1543	/*
   1544	 * Fix the on-disk inode values
   1545	 */
   1546	tmp = (uint64_t)ip->i_nblocks;
   1547	ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks;
   1548	tip->i_nblocks = tmp + taforkblks - aforkblks;
   1549
   1550	/*
   1551	 * The extents in the source inode could still contain speculative
   1552	 * preallocation beyond EOF (e.g. the file is open but not modified
   1553	 * while defrag is in progress). In that case, we need to copy over the
   1554	 * number of delalloc blocks the data fork in the source inode is
   1555	 * tracking beyond EOF so that when the fork is truncated away when the
   1556	 * temporary inode is unlinked we don't underrun the i_delayed_blks
   1557	 * counter on that inode.
   1558	 */
   1559	ASSERT(tip->i_delayed_blks == 0);
   1560	tip->i_delayed_blks = ip->i_delayed_blks;
   1561	ip->i_delayed_blks = 0;
   1562
   1563	switch (ip->i_df.if_format) {
   1564	case XFS_DINODE_FMT_EXTENTS:
   1565		(*src_log_flags) |= XFS_ILOG_DEXT;
   1566		break;
   1567	case XFS_DINODE_FMT_BTREE:
   1568		ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
   1569		       (*src_log_flags & XFS_ILOG_DOWNER));
   1570		(*src_log_flags) |= XFS_ILOG_DBROOT;
   1571		break;
   1572	}
   1573
   1574	switch (tip->i_df.if_format) {
   1575	case XFS_DINODE_FMT_EXTENTS:
   1576		(*target_log_flags) |= XFS_ILOG_DEXT;
   1577		break;
   1578	case XFS_DINODE_FMT_BTREE:
   1579		(*target_log_flags) |= XFS_ILOG_DBROOT;
   1580		ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
   1581		       (*target_log_flags & XFS_ILOG_DOWNER));
   1582		break;
   1583	}
   1584
   1585	return 0;
   1586}
   1587
   1588/*
   1589 * Fix up the owners of the bmbt blocks to refer to the current inode. The
   1590 * change owner scan attempts to order all modified buffers in the current
   1591 * transaction. In the event of ordered buffer failure, the offending buffer is
   1592 * physically logged as a fallback and the scan returns -EAGAIN. We must roll
   1593 * the transaction in this case to replenish the fallback log reservation and
   1594 * restart the scan. This process repeats until the scan completes.
   1595 */
   1596static int
   1597xfs_swap_change_owner(
   1598	struct xfs_trans	**tpp,
   1599	struct xfs_inode	*ip,
   1600	struct xfs_inode	*tmpip)
   1601{
   1602	int			error;
   1603	struct xfs_trans	*tp = *tpp;
   1604
   1605	do {
   1606		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
   1607					      NULL);
   1608		/* success or fatal error */
   1609		if (error != -EAGAIN)
   1610			break;
   1611
   1612		error = xfs_trans_roll(tpp);
   1613		if (error)
   1614			break;
   1615		tp = *tpp;
   1616
   1617		/*
   1618		 * Redirty both inodes so they can relog and keep the log tail
   1619		 * moving forward.
   1620		 */
   1621		xfs_trans_ijoin(tp, ip, 0);
   1622		xfs_trans_ijoin(tp, tmpip, 0);
   1623		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
   1624		xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
   1625	} while (true);
   1626
   1627	return error;
   1628}
   1629
/*
 * Swap the data extents of a target inode with those of a temporary inode.
 *
 * @ip:		target inode
 * @tip:	temporary (donor) inode
 * @sxp:	swap request; sx_offset and sx_length must describe the whole
 *		of both files, and sx_stat carries the ctime/mtime the caller
 *		sampled from @ip
 *
 * Both inodes are locked against IO, page faults and truncate, flushed and
 * have their page cache invalidated before the swap is attempted.  On
 * filesystems with rmapbt the extents are exchanged one by one with
 * xfs_swap_extent_rmap(); otherwise the data forks are swapped wholesale with
 * xfs_swap_extent_forks() and the bmbt block owners are fixed up afterwards.
 *
 * Returns 0 on success or a negative errno, including:
 *	-EINVAL	the file types or realtime status differ, page cache pages
 *		remain after the flush, or the fork formats are incompatible
 *	-EFAULT	sx_offset/sx_length do not cover all data of both files
 *	-EBUSY	ctime or mtime of @ip no longer match the values in sx_stat
 */
int
xfs_swap_extents(
	struct xfs_inode	*ip,	/* target inode */
	struct xfs_inode	*tip,	/* tmp inode */
	struct xfs_swapext	*sxp)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	struct xfs_bstat	*sbp = &sxp->sx_stat;
	int			src_log_flags, target_log_flags;
	int			error = 0;
	uint64_t		f;
	int			resblks = 0;
	unsigned int		flags = 0;

	/*
	 * Lock the inodes against other IO, page faults and truncate to
	 * begin with.  Then we can ensure the inodes are flushed and have no
	 * page cache safely. Once we have done this we can take the ilocks and
	 * do the rest of the checks.
	 */
	lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
	filemap_invalidate_lock_two(VFS_I(ip)->i_mapping,
				    VFS_I(tip)->i_mapping);

	/* Verify that both files have the same file type (S_IFMT bits) */
	if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
		error = -EINVAL;
		goto out_unlock;
	}

	/* Verify both files are either real-time or non-realtime */
	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
		error = -EINVAL;
		goto out_unlock;
	}

	error = xfs_qm_dqattach(ip);
	if (error)
		goto out_unlock;

	error = xfs_qm_dqattach(tip);
	if (error)
		goto out_unlock;

	/* Write back and invalidate the page cache of both inodes. */
	error = xfs_swap_extent_flush(ip);
	if (error)
		goto out_unlock;
	error = xfs_swap_extent_flush(tip);
	if (error)
		goto out_unlock;

	/* Drop any CoW fork extents still attached to the temporary inode. */
	if (xfs_inode_has_cow_data(tip)) {
		error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
		if (error)
			goto out_unlock;
	}

	/*
	 * Extent "swapping" with rmap requires a permanent reservation and
	 * a block reservation because it's really just a remap operation
	 * performed with log redo items!
	 */
	if (xfs_has_rmapbt(mp)) {
		int		w = XFS_DATA_FORK;
		uint32_t	ipnext = ip->i_df.if_nextents;
		uint32_t	tipnext	= tip->i_df.if_nextents;

		/*
		 * Conceptually this shouldn't affect the shape of either bmbt,
		 * but since we atomically move extents one by one, we reserve
		 * enough space to rebuild both trees.
		 */
		resblks = XFS_SWAP_RMAP_SPACE_RES(mp, ipnext, w);
		resblks +=  XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w);

		/*
		 * If either inode straddles a bmapbt block allocation boundary,
		 * the rmapbt algorithm triggers repeated allocs and frees as
		 * extents are remapped. This can exhaust the block reservation
		 * prematurely and cause shutdown. Return freed blocks to the
		 * transaction reservation to counter this behavior.
		 */
		flags |= XFS_TRANS_RES_FDBLKS;
	}
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags,
				&tp);
	if (error)
		goto out_unlock;

	/*
	 * Lock the inodes and join them to the transaction.  They are joined
	 * with lock flags of 0, and from this point onwards every exit path
	 * goes through out_unlock_ilock, which drops both ILOCKs explicitly
	 * after the transaction has been committed or cancelled.
	 */
	xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);
	xfs_trans_ijoin(tp, tip, 0);


	/* Verify all data are being swapped */
	if (sxp->sx_offset != 0 ||
	    sxp->sx_length != ip->i_disk_size ||
	    sxp->sx_length != tip->i_disk_size) {
		error = -EFAULT;
		goto out_trans_cancel;
	}

	trace_xfs_swap_extent_before(ip, 0);
	trace_xfs_swap_extent_before(tip, 1);

	/* check inode formats now that data is flushed */
	error = xfs_swap_extents_check_format(ip, tip);
	if (error) {
		xfs_notice(mp,
		    "%s: inode 0x%llx format is incompatible for exchanging.",
				__func__, ip->i_ino);
		goto out_trans_cancel;
	}

	/*
	 * Compare the current change & modify times with that
	 * passed in.  If they differ, we abort this swap.
	 * This is the mechanism used to ensure the calling
	 * process that the file was not changed out from
	 * under it.
	 */
	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
		error = -EBUSY;
		goto out_trans_cancel;
	}

	/*
	 * Note the trickiness in setting the log flags - we set the owner log
	 * flag on the opposite inode (i.e. the inode we are setting the new
	 * owner to be) because once we swap the forks and log that, log
	 * recovery is going to see the fork as owned by the swapped inode,
	 * not the pre-swapped inodes.
	 */
	src_log_flags = XFS_ILOG_CORE;
	target_log_flags = XFS_ILOG_CORE;

	/* rmapbt filesystems remap extent by extent; others swap the forks. */
	if (xfs_has_rmapbt(mp))
		error = xfs_swap_extent_rmap(&tp, ip, tip);
	else
		error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags,
				&target_log_flags);
	if (error)
		goto out_trans_cancel;

	/* Do we have to swap reflink flags? */
	if ((ip->i_diflags2 & XFS_DIFLAG2_REFLINK) ^
	    (tip->i_diflags2 & XFS_DIFLAG2_REFLINK)) {
		/* Exactly one inode has the flag set: exchange the two bits. */
		f = ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
		ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
		ip->i_diflags2 |= tip->i_diflags2 & XFS_DIFLAG2_REFLINK;
		tip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
		tip->i_diflags2 |= f & XFS_DIFLAG2_REFLINK;
	}

	/* Swap the cow forks. */
	if (xfs_has_reflink(mp)) {
		ASSERT(!ip->i_cowfp ||
		       ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
		ASSERT(!tip->i_cowfp ||
		       tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);

		swap(ip->i_cowfp, tip->i_cowfp);

		/* Re-evaluate the cowblocks tag of each inode after the swap. */
		if (ip->i_cowfp && ip->i_cowfp->if_bytes)
			xfs_inode_set_cowblocks_tag(ip);
		else
			xfs_inode_clear_cowblocks_tag(ip);
		if (tip->i_cowfp && tip->i_cowfp->if_bytes)
			xfs_inode_set_cowblocks_tag(tip);
		else
			xfs_inode_clear_cowblocks_tag(tip);
	}

	xfs_trans_log_inode(tp, ip,  src_log_flags);
	xfs_trans_log_inode(tp, tip, target_log_flags);

	/*
	 * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
	 * have inode number owner values in the bmbt blocks that still refer to
	 * the old inode. Scan each bmbt to fix up the owner values with the
	 * inode number of the current inode.
	 */
	if (src_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(&tp, ip, tip);
		if (error)
			goto out_trans_cancel;
	}
	if (target_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(&tp, tip, ip);
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * If this is a synchronous mount, make sure that the
	 * transaction goes to disk before returning to the user.
	 */
	if (xfs_has_wsync(mp))
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp);

	trace_xfs_swap_extent_after(ip, 0);
	trace_xfs_swap_extent_after(tip, 1);

	/* Reached after commit (fall through) and after cancel (via goto). */
out_unlock_ilock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	xfs_iunlock(tip, XFS_ILOCK_EXCL);
out_unlock:
	filemap_invalidate_unlock_two(VFS_I(ip)->i_mapping,
				      VFS_I(tip)->i_mapping);
	unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock_ilock;
}