cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

xfs_iomap.c (37300B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
      4 * Copyright (c) 2016-2018 Christoph Hellwig.
      5 * All Rights Reserved.
      6 */
      7#include "xfs.h"
      8#include "xfs_fs.h"
      9#include "xfs_shared.h"
     10#include "xfs_format.h"
     11#include "xfs_log_format.h"
     12#include "xfs_trans_resv.h"
     13#include "xfs_mount.h"
     14#include "xfs_inode.h"
     15#include "xfs_btree.h"
     16#include "xfs_bmap_btree.h"
     17#include "xfs_bmap.h"
     18#include "xfs_bmap_util.h"
     19#include "xfs_errortag.h"
     20#include "xfs_error.h"
     21#include "xfs_trans.h"
     22#include "xfs_trans_space.h"
     23#include "xfs_inode_item.h"
     24#include "xfs_iomap.h"
     25#include "xfs_trace.h"
     26#include "xfs_quota.h"
     27#include "xfs_dquot_item.h"
     28#include "xfs_dquot.h"
     29#include "xfs_reflink.h"
     30
     31#define XFS_ALLOC_ALIGN(mp, off) \
     32	(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
     33
     34static int
     35xfs_alert_fsblock_zero(
     36	xfs_inode_t	*ip,
     37	xfs_bmbt_irec_t	*imap)
     38{
     39	xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
     40			"Access to block zero in inode %llu "
     41			"start_block: %llx start_off: %llx "
     42			"blkcnt: %llx extent-state: %x",
     43		(unsigned long long)ip->i_ino,
     44		(unsigned long long)imap->br_startblock,
     45		(unsigned long long)imap->br_startoff,
     46		(unsigned long long)imap->br_blockcount,
     47		imap->br_state);
     48	return -EFSCORRUPTED;
     49}
     50
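        /*
         * Translate a single xfs_bmbt_irec extent record into the generic
         * struct iomap consumed by the iomap layer: holes, delalloc,
         * unwritten and written extents map to the corresponding iomap
         * types, and the block/DAX device and byte ranges are filled in.
         */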
     51int
     52xfs_bmbt_to_iomap(
     53	struct xfs_inode	*ip,
     54	struct iomap		*iomap,
     55	struct xfs_bmbt_irec	*imap,
     56	unsigned int		mapping_flags,
     57	u16			iomap_flags)
     58{
     59	struct xfs_mount	*mp = ip->i_mount;
     60	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
     61
     62	if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
     63		return xfs_alert_fsblock_zero(ip, imap);
     64
     65	if (imap->br_startblock == HOLESTARTBLOCK) {
     66		iomap->addr = IOMAP_NULL_ADDR;
     67		iomap->type = IOMAP_HOLE;
     68	} else if (imap->br_startblock == DELAYSTARTBLOCK ||
     69		   isnullstartblock(imap->br_startblock)) {
     70		iomap->addr = IOMAP_NULL_ADDR;
     71		iomap->type = IOMAP_DELALLOC;
     72	} else {
     73		iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock));
     74		if (mapping_flags & IOMAP_DAX)
     75			iomap->addr += target->bt_dax_part_off;
     76
     77		if (imap->br_state == XFS_EXT_UNWRITTEN)
     78			iomap->type = IOMAP_UNWRITTEN;
     79		else
     80			iomap->type = IOMAP_MAPPED;
     81
     82	}
     83	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
     84	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
     85	if (mapping_flags & IOMAP_DAX)
     86		iomap->dax_dev = target->bt_daxdev;
     87	else
     88		iomap->bdev = target->bt_bdev;
     89	iomap->flags = iomap_flags;
     90
     91	if (xfs_ipincount(ip) &&
     92	    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
     93		iomap->flags |= IOMAP_F_DIRTY;
     94	return 0;
     95}
     96
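        /* Report a hole covering [offset_fsb, end_fsb) to the iomap layer. */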
     97static void
     98xfs_hole_to_iomap(
     99	struct xfs_inode	*ip,
    100	struct iomap		*iomap,
    101	xfs_fileoff_t		offset_fsb,
    102	xfs_fileoff_t		end_fsb)
    103{
    104	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
    105
    106	iomap->addr = IOMAP_NULL_ADDR;
    107	iomap->type = IOMAP_HOLE;
    108	iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb);
    109	iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb);
    110	iomap->bdev = target->bt_bdev;
    111	iomap->dax_dev = target->bt_daxdev;
    112}
    113
    114static inline xfs_fileoff_t
    115xfs_iomap_end_fsb(
    116	struct xfs_mount	*mp,
    117	loff_t			offset,
    118	loff_t			count)
    119{
    120	ASSERT(offset <= mp->m_super->s_maxbytes);
    121	return min(XFS_B_TO_FSB(mp, offset + count),
    122		   XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
    123}
    124
    125static xfs_extlen_t
    126xfs_eof_alignment(
    127	struct xfs_inode	*ip)
    128{
    129	struct xfs_mount	*mp = ip->i_mount;
    130	xfs_extlen_t		align = 0;
    131
    132	if (!XFS_IS_REALTIME_INODE(ip)) {
    133		/*
    134		 * Round up the allocation request to a stripe unit
    135		 * (m_dalign) boundary if the file size is >= stripe unit
    136		 * size, and we are allocating past the allocation eof.
    137		 *
    138		 * If mounted with the "-o swalloc" option the alignment is
     139		 * increased from the stripe unit size to the stripe width.
    140		 */
    141		if (mp->m_swidth && xfs_has_swalloc(mp))
    142			align = mp->m_swidth;
    143		else if (mp->m_dalign)
    144			align = mp->m_dalign;
    145
    146		if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align))
    147			align = 0;
    148	}
    149
    150	return align;
    151}
    152
    153/*
    154 * Check if last_fsb is outside the last extent, and if so grow it to the next
    155 * stripe unit boundary.
    156 */
    157xfs_fileoff_t
    158xfs_iomap_eof_align_last_fsb(
    159	struct xfs_inode	*ip,
    160	xfs_fileoff_t		end_fsb)
    161{
    162	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
    163	xfs_extlen_t		extsz = xfs_get_extsz_hint(ip);
    164	xfs_extlen_t		align = xfs_eof_alignment(ip);
    165	struct xfs_bmbt_irec	irec;
    166	struct xfs_iext_cursor	icur;
    167
    168	ASSERT(!xfs_need_iread_extents(ifp));
    169
    170	/*
    171	 * Always round up the allocation request to the extent hint boundary.
    172	 */
    173	if (extsz) {
    174		if (align)
    175			align = roundup_64(align, extsz);
    176		else
    177			align = extsz;
    178	}
    179
    180	if (align) {
    181		xfs_fileoff_t	aligned_end_fsb = roundup_64(end_fsb, align);
    182
    183		xfs_iext_last(ifp, &icur);
    184		if (!xfs_iext_get_extent(ifp, &icur, &irec) ||
    185		    aligned_end_fsb >= irec.br_startoff + irec.br_blockcount)
    186			return aligned_end_fsb;
    187	}
    188
    189	return end_fsb;
    190}
    191
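        /*
         * Allocate real blocks for a direct I/O or DAX write.  For DAX the
         * new blocks are zeroed and returned as written extents rather than
         * unwritten ones, because the zeroing has to happen before the
         * allocation transaction commits.
         */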
    192int
    193xfs_iomap_write_direct(
    194	struct xfs_inode	*ip,
    195	xfs_fileoff_t		offset_fsb,
    196	xfs_fileoff_t		count_fsb,
    197	unsigned int		flags,
    198	struct xfs_bmbt_irec	*imap)
    199{
    200	struct xfs_mount	*mp = ip->i_mount;
    201	struct xfs_trans	*tp;
    202	xfs_filblks_t		resaligned;
    203	int			nimaps;
    204	unsigned int		dblocks, rblocks;
    205	bool			force = false;
    206	int			error;
    207	int			bmapi_flags = XFS_BMAPI_PREALLOC;
    208	int			nr_exts = XFS_IEXT_ADD_NOSPLIT_CNT;
    209
    210	ASSERT(count_fsb > 0);
    211
    212	resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
    213					   xfs_get_extsz_hint(ip));
    214	if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
    215		dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
    216		rblocks = resaligned;
    217	} else {
    218		dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
    219		rblocks = 0;
    220	}
    221
    222	error = xfs_qm_dqattach(ip);
    223	if (error)
    224		return error;
    225
    226	/*
    227	 * For DAX, we do not allocate unwritten extents, but instead we zero
    228	 * the block before we commit the transaction.  Ideally we'd like to do
    229	 * this outside the transaction context, but if we commit and then crash
    230	 * we may not have zeroed the blocks and this will be exposed on
    231	 * recovery of the allocation. Hence we must zero before commit.
    232	 *
    233	 * Further, if we are mapping unwritten extents here, we need to zero
    234	 * and convert them to written so that we don't need an unwritten extent
    235	 * callback for DAX. This also means that we need to be able to dip into
    236	 * the reserve block pool for bmbt block allocation if there is no space
    237	 * left but we need to do unwritten extent conversion.
    238	 */
    239	if (flags & IOMAP_DAX) {
    240		bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
    241		if (imap->br_state == XFS_EXT_UNWRITTEN) {
    242			force = true;
    243			nr_exts = XFS_IEXT_WRITE_UNWRITTEN_CNT;
    244			dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
    245		}
    246	}
    247
    248	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
    249			rblocks, force, &tp);
    250	if (error)
    251		return error;
    252
    253	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts);
    254	if (error == -EFBIG)
    255		error = xfs_iext_count_upgrade(tp, ip, nr_exts);
    256	if (error)
    257		goto out_trans_cancel;
    258
    259	/*
    260	 * From this point onwards we overwrite the imap pointer that the
    261	 * caller gave to us.
    262	 */
    263	nimaps = 1;
    264	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0,
    265				imap, &nimaps);
    266	if (error)
    267		goto out_trans_cancel;
    268
    269	/*
    270	 * Complete the transaction
    271	 */
    272	error = xfs_trans_commit(tp);
    273	if (error)
    274		goto out_unlock;
    275
    276	/*
    277	 * Copy any maps to caller's array and return any error.
    278	 */
    279	if (nimaps == 0) {
    280		error = -ENOSPC;
    281		goto out_unlock;
    282	}
    283
    284	if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
    285		error = xfs_alert_fsblock_zero(ip, imap);
    286
    287out_unlock:
    288	xfs_iunlock(ip, XFS_ILOCK_EXCL);
    289	return error;
    290
    291out_trans_cancel:
    292	xfs_trans_cancel(tp);
    293	goto out_unlock;
    294}
    295
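        /*
         * Should this quota throttle speculative preallocation?  Only if the
         * quota is active, a high watermark is set, and the current
         * reservation plus this allocation reaches the low watermark.
         */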
    296STATIC bool
    297xfs_quota_need_throttle(
    298	struct xfs_inode	*ip,
    299	xfs_dqtype_t		type,
    300	xfs_fsblock_t		alloc_blocks)
    301{
    302	struct xfs_dquot	*dq = xfs_inode_dquot(ip, type);
    303
    304	if (!dq || !xfs_this_quota_on(ip->i_mount, type))
    305		return false;
    306
    307	/* no hi watermark, no throttle */
    308	if (!dq->q_prealloc_hi_wmark)
    309		return false;
    310
    311	/* under the lo watermark, no throttle */
    312	if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark)
    313		return false;
    314
    315	return true;
    316}
    317
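        /*
         * Work out how aggressively to throttle preallocation for one quota:
         * squash it entirely at or above the high watermark, otherwise derive
         * a shift from how close the reservation is to the high watermark and
         * clamp the caller's free space estimate to this quota's headroom.
         */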
    318STATIC void
    319xfs_quota_calc_throttle(
    320	struct xfs_inode	*ip,
    321	xfs_dqtype_t		type,
    322	xfs_fsblock_t		*qblocks,
    323	int			*qshift,
    324	int64_t			*qfreesp)
    325{
    326	struct xfs_dquot	*dq = xfs_inode_dquot(ip, type);
    327	int64_t			freesp;
    328	int			shift = 0;
    329
    330	/* no dq, or over hi wmark, squash the prealloc completely */
    331	if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) {
    332		*qblocks = 0;
    333		*qfreesp = 0;
    334		return;
    335	}
    336
    337	freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved;
    338	if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
    339		shift = 2;
    340		if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
    341			shift += 2;
    342		if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT])
    343			shift += 2;
    344	}
    345
    346	if (freesp < *qfreesp)
    347		*qfreesp = freesp;
    348
    349	/* only overwrite the throttle values if we are more aggressive */
    350	if ((freesp >> shift) < (*qblocks >> *qshift)) {
    351		*qblocks = freesp;
    352		*qshift = shift;
    353	}
    354}
    355
    356/*
    357 * If we don't have a user specified preallocation size, dynamically increase
    358 * the preallocation size as the size of the file grows.  Cap the maximum size
    359 * at a single extent or less if the filesystem is near full. The closer the
    360 * filesystem is to being full, the smaller the maximum preallocation.
    361 */
    362STATIC xfs_fsblock_t
    363xfs_iomap_prealloc_size(
    364	struct xfs_inode	*ip,
    365	int			whichfork,
    366	loff_t			offset,
    367	loff_t			count,
    368	struct xfs_iext_cursor	*icur)
    369{
    370	struct xfs_iext_cursor	ncur = *icur;
    371	struct xfs_bmbt_irec	prev, got;
    372	struct xfs_mount	*mp = ip->i_mount;
    373	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
    374	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
    375	int64_t			freesp;
    376	xfs_fsblock_t		qblocks;
    377	xfs_fsblock_t		alloc_blocks = 0;
    378	xfs_extlen_t		plen;
    379	int			shift = 0;
    380	int			qshift = 0;
    381
    382	/*
    383	 * As an exception we don't do any preallocation at all if the file is
    384	 * smaller than the minimum preallocation and we are using the default
    385	 * dynamic preallocation scheme, as it is likely this is the only write
    386	 * to the file that is going to be done.
    387	 */
    388	if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks))
    389		return 0;
    390
    391	/*
    392	 * Use the minimum preallocation size for small files or if we are
    393	 * writing right after a hole.
    394	 */
    395	if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
    396	    !xfs_iext_prev_extent(ifp, &ncur, &prev) ||
    397	    prev.br_startoff + prev.br_blockcount < offset_fsb)
    398		return mp->m_allocsize_blocks;
    399
    400	/*
    401	 * Take the size of the preceding data extents as the basis for the
    402	 * preallocation size. Note that we don't care if the previous extents
    403	 * are written or not.
    404	 */
    405	plen = prev.br_blockcount;
    406	while (xfs_iext_prev_extent(ifp, &ncur, &got)) {
    407		if (plen > XFS_MAX_BMBT_EXTLEN / 2 ||
    408		    isnullstartblock(got.br_startblock) ||
    409		    got.br_startoff + got.br_blockcount != prev.br_startoff ||
    410		    got.br_startblock + got.br_blockcount != prev.br_startblock)
    411			break;
    412		plen += got.br_blockcount;
    413		prev = got;
    414	}
    415
    416	/*
    417	 * If the size of the extents is greater than half the maximum extent
    418	 * length, then use the current offset as the basis.  This ensures that
    419	 * for large files the preallocation size always extends to
     420	 * XFS_MAX_BMBT_EXTLEN rather than falling short due to things like stripe
    421	 * unit/width alignment of real extents.
    422	 */
    423	alloc_blocks = plen * 2;
    424	if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
    425		alloc_blocks = XFS_B_TO_FSB(mp, offset);
    426	qblocks = alloc_blocks;
    427
    428	/*
     429	 * XFS_MAX_BMBT_EXTLEN is not a power of two value but we round the prealloc
    430	 * down to the nearest power of two value after throttling. To prevent
    431	 * the round down from unconditionally reducing the maximum supported
    432	 * prealloc size, we round up first, apply appropriate throttling, round
     433	 * down and cap the value to XFS_MAX_BMBT_EXTLEN.
    434	 */
    435	alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN),
    436				       alloc_blocks);
    437
    438	freesp = percpu_counter_read_positive(&mp->m_fdblocks);
    439	if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
    440		shift = 2;
    441		if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
    442			shift++;
    443		if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
    444			shift++;
    445		if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
    446			shift++;
    447		if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
    448			shift++;
    449	}
    450
    451	/*
    452	 * Check each quota to cap the prealloc size, provide a shift value to
    453	 * throttle with and adjust amount of available space.
    454	 */
    455	if (xfs_quota_need_throttle(ip, XFS_DQTYPE_USER, alloc_blocks))
    456		xfs_quota_calc_throttle(ip, XFS_DQTYPE_USER, &qblocks, &qshift,
    457					&freesp);
    458	if (xfs_quota_need_throttle(ip, XFS_DQTYPE_GROUP, alloc_blocks))
    459		xfs_quota_calc_throttle(ip, XFS_DQTYPE_GROUP, &qblocks, &qshift,
    460					&freesp);
    461	if (xfs_quota_need_throttle(ip, XFS_DQTYPE_PROJ, alloc_blocks))
    462		xfs_quota_calc_throttle(ip, XFS_DQTYPE_PROJ, &qblocks, &qshift,
    463					&freesp);
    464
    465	/*
    466	 * The final prealloc size is set to the minimum of free space available
    467	 * in each of the quotas and the overall filesystem.
    468	 *
    469	 * The shift throttle value is set to the maximum value as determined by
    470	 * the global low free space values and per-quota low free space values.
    471	 */
    472	alloc_blocks = min(alloc_blocks, qblocks);
    473	shift = max(shift, qshift);
    474
    475	if (shift)
    476		alloc_blocks >>= shift;
    477	/*
    478	 * rounddown_pow_of_two() returns an undefined result if we pass in
    479	 * alloc_blocks = 0.
    480	 */
    481	if (alloc_blocks)
    482		alloc_blocks = rounddown_pow_of_two(alloc_blocks);
    483	if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
    484		alloc_blocks = XFS_MAX_BMBT_EXTLEN;
    485
    486	/*
    487	 * If we are still trying to allocate more space than is
    488	 * available, squash the prealloc hard. This can happen if we
    489	 * have a large file on a small filesystem and the above
     490	 * lowspace thresholds are smaller than XFS_MAX_BMBT_EXTLEN.
    491	 */
    492	while (alloc_blocks && alloc_blocks >= freesp)
    493		alloc_blocks >>= 4;
    494	if (alloc_blocks < mp->m_allocsize_blocks)
    495		alloc_blocks = mp->m_allocsize_blocks;
    496	trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
    497				      mp->m_allocsize_blocks);
    498	return alloc_blocks;
    499}
    500
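        /*
         * Convert unwritten extents in the given byte range to written,
         * looping one allocation transaction per returned mapping and logging
         * the on-disk inode size as the conversion proceeds.
         */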
    501int
    502xfs_iomap_write_unwritten(
    503	xfs_inode_t	*ip,
    504	xfs_off_t	offset,
    505	xfs_off_t	count,
    506	bool		update_isize)
    507{
    508	xfs_mount_t	*mp = ip->i_mount;
    509	xfs_fileoff_t	offset_fsb;
    510	xfs_filblks_t	count_fsb;
    511	xfs_filblks_t	numblks_fsb;
    512	int		nimaps;
    513	xfs_trans_t	*tp;
    514	xfs_bmbt_irec_t imap;
    515	struct inode	*inode = VFS_I(ip);
    516	xfs_fsize_t	i_size;
    517	uint		resblks;
    518	int		error;
    519
    520	trace_xfs_unwritten_convert(ip, offset, count);
    521
    522	offset_fsb = XFS_B_TO_FSBT(mp, offset);
    523	count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
    524	count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
    525
    526	/*
    527	 * Reserve enough blocks in this transaction for two complete extent
    528	 * btree splits.  We may be converting the middle part of an unwritten
    529	 * extent and in this case we will insert two new extents in the btree
    530	 * each of which could cause a full split.
    531	 *
    532	 * This reservation amount will be used in the first call to
    533	 * xfs_bmbt_split() to select an AG with enough space to satisfy the
    534	 * rest of the operation.
    535	 */
    536	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
    537
    538	/* Attach dquots so that bmbt splits are accounted correctly. */
    539	error = xfs_qm_dqattach(ip);
    540	if (error)
    541		return error;
    542
    543	do {
    544		/*
    545		 * Set up a transaction to convert the range of extents
    546		 * from unwritten to real. Do allocations in a loop until
    547		 * we have covered the range passed in.
    548		 *
     549		 * Note that we can't risk recursing back into the filesystem
    550		 * here as we might be asked to write out the same inode that we
    551		 * complete here and might deadlock on the iolock.
    552		 */
    553		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks,
    554				0, true, &tp);
    555		if (error)
    556			return error;
    557
    558		error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
    559				XFS_IEXT_WRITE_UNWRITTEN_CNT);
    560		if (error == -EFBIG)
    561			error = xfs_iext_count_upgrade(tp, ip,
    562					XFS_IEXT_WRITE_UNWRITTEN_CNT);
    563		if (error)
    564			goto error_on_bmapi_transaction;
    565
    566		/*
    567		 * Modify the unwritten extent state of the buffer.
    568		 */
    569		nimaps = 1;
    570		error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
    571					XFS_BMAPI_CONVERT, resblks, &imap,
    572					&nimaps);
    573		if (error)
    574			goto error_on_bmapi_transaction;
    575
    576		/*
    577		 * Log the updated inode size as we go.  We have to be careful
    578		 * to only log it up to the actual write offset if it is
    579		 * halfway into a block.
    580		 */
    581		i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
    582		if (i_size > offset + count)
    583			i_size = offset + count;
    584		if (update_isize && i_size > i_size_read(inode))
    585			i_size_write(inode, i_size);
    586		i_size = xfs_new_eof(ip, i_size);
    587		if (i_size) {
    588			ip->i_disk_size = i_size;
    589			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
    590		}
    591
    592		error = xfs_trans_commit(tp);
    593		xfs_iunlock(ip, XFS_ILOCK_EXCL);
    594		if (error)
    595			return error;
    596
    597		if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
    598			return xfs_alert_fsblock_zero(ip, &imap);
    599
    600		if ((numblks_fsb = imap.br_blockcount) == 0) {
    601			/*
    602			 * The numblks_fsb value should always get
    603			 * smaller, otherwise the loop is stuck.
    604			 */
    605			ASSERT(imap.br_blockcount);
    606			break;
    607		}
    608		offset_fsb += numblks_fsb;
    609		count_fsb -= numblks_fsb;
    610	} while (count_fsb > 0);
    611
    612	return 0;
    613
    614error_on_bmapi_transaction:
    615	xfs_trans_cancel(tp);
    616	xfs_iunlock(ip, XFS_ILOCK_EXCL);
    617	return error;
    618}
    619
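        /* Does this mapping require a real block allocation before writing? */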
    620static inline bool
    621imap_needs_alloc(
    622	struct inode		*inode,
    623	unsigned		flags,
    624	struct xfs_bmbt_irec	*imap,
    625	int			nimaps)
    626{
    627	/* don't allocate blocks when just zeroing */
    628	if (flags & IOMAP_ZERO)
    629		return false;
    630	if (!nimaps ||
    631	    imap->br_startblock == HOLESTARTBLOCK ||
    632	    imap->br_startblock == DELAYSTARTBLOCK)
    633		return true;
    634	/* we convert unwritten extents before copying the data for DAX */
    635	if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN)
    636		return true;
    637	return false;
    638}
    639
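        /* Does this write to a COW-enabled inode need COW fork handling? */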
    640static inline bool
    641imap_needs_cow(
    642	struct xfs_inode	*ip,
    643	unsigned int		flags,
    644	struct xfs_bmbt_irec	*imap,
    645	int			nimaps)
    646{
    647	if (!xfs_is_cow_inode(ip))
    648		return false;
    649
    650	/* when zeroing we don't have to COW holes or unwritten extents */
    651	if (flags & IOMAP_ZERO) {
    652		if (!nimaps ||
    653		    imap->br_startblock == HOLESTARTBLOCK ||
    654		    imap->br_state == XFS_EXT_UNWRITTEN)
    655			return false;
    656	}
    657
    658	return true;
    659}
    660
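        /*
         * Pick and take the ILOCK mode needed for an iomap operation: shared
         * by default, exclusive when the extent list still has to be read in
         * or when a write may touch COW state, honouring IOMAP_NOWAIT.
         */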
    661static int
    662xfs_ilock_for_iomap(
    663	struct xfs_inode	*ip,
    664	unsigned		flags,
    665	unsigned		*lockmode)
    666{
    667	unsigned		mode = XFS_ILOCK_SHARED;
    668	bool			is_write = flags & (IOMAP_WRITE | IOMAP_ZERO);
    669
    670	/*
    671	 * COW writes may allocate delalloc space or convert unwritten COW
    672	 * extents, so we need to make sure to take the lock exclusively here.
    673	 */
    674	if (xfs_is_cow_inode(ip) && is_write)
    675		mode = XFS_ILOCK_EXCL;
    676
    677	/*
     678	 * Extents not yet cached require exclusive access, don't block.  This
    679	 * is an opencoded xfs_ilock_data_map_shared() call but with
    680	 * non-blocking behaviour.
    681	 */
    682	if (xfs_need_iread_extents(&ip->i_df)) {
    683		if (flags & IOMAP_NOWAIT)
    684			return -EAGAIN;
    685		mode = XFS_ILOCK_EXCL;
    686	}
    687
    688relock:
    689	if (flags & IOMAP_NOWAIT) {
    690		if (!xfs_ilock_nowait(ip, mode))
    691			return -EAGAIN;
    692	} else {
    693		xfs_ilock(ip, mode);
    694	}
    695
    696	/*
    697	 * The reflink iflag could have changed since the earlier unlocked
     698	 * check, so if we got ILOCK_SHARED for a write but we're now a
    699	 * reflink inode we have to switch to ILOCK_EXCL and relock.
    700	 */
    701	if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
    702		xfs_iunlock(ip, mode);
    703		mode = XFS_ILOCK_EXCL;
    704		goto relock;
    705	}
    706
    707	*lockmode = mode;
    708	return 0;
    709}
    710
    711/*
    712 * Check that the imap we are going to return to the caller spans the entire
    713 * range that the caller requested for the IO.
    714 */
    715static bool
    716imap_spans_range(
    717	struct xfs_bmbt_irec	*imap,
    718	xfs_fileoff_t		offset_fsb,
    719	xfs_fileoff_t		end_fsb)
    720{
    721	if (imap->br_startoff > offset_fsb)
    722		return false;
    723	if (imap->br_startoff + imap->br_blockcount < end_fsb)
    724		return false;
    725	return true;
    726}
    727
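        /*
         * iomap_begin method for direct I/O and DAX writes: map the requested
         * range, allocating data blocks or COW fork extents as needed, and
         * return -EAGAIN for NOWAIT/OVERWRITE_ONLY requests that cannot be
         * served without blocking or sub-block zeroing.
         */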
    728static int
    729xfs_direct_write_iomap_begin(
    730	struct inode		*inode,
    731	loff_t			offset,
    732	loff_t			length,
    733	unsigned		flags,
    734	struct iomap		*iomap,
    735	struct iomap		*srcmap)
    736{
    737	struct xfs_inode	*ip = XFS_I(inode);
    738	struct xfs_mount	*mp = ip->i_mount;
    739	struct xfs_bmbt_irec	imap, cmap;
    740	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
    741	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
    742	int			nimaps = 1, error = 0;
    743	bool			shared = false;
    744	u16			iomap_flags = 0;
    745	unsigned		lockmode;
    746
    747	ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
    748
    749	if (xfs_is_shutdown(mp))
    750		return -EIO;
    751
    752	/*
    753	 * Writes that span EOF might trigger an IO size update on completion,
    754	 * so consider them to be dirty for the purposes of O_DSYNC even if
     755	 * no other metadata changes are pending or have been made here.
    756	 */
    757	if (offset + length > i_size_read(inode))
    758		iomap_flags |= IOMAP_F_DIRTY;
    759
    760	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
    761	if (error)
    762		return error;
    763
    764	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
    765			       &nimaps, 0);
    766	if (error)
    767		goto out_unlock;
    768
    769	if (imap_needs_cow(ip, flags, &imap, nimaps)) {
    770		error = -EAGAIN;
    771		if (flags & IOMAP_NOWAIT)
    772			goto out_unlock;
    773
    774		/* may drop and re-acquire the ilock */
    775		error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
    776				&lockmode, flags & IOMAP_DIRECT);
    777		if (error)
    778			goto out_unlock;
    779		if (shared)
    780			goto out_found_cow;
    781		end_fsb = imap.br_startoff + imap.br_blockcount;
    782		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
    783	}
    784
    785	if (imap_needs_alloc(inode, flags, &imap, nimaps))
    786		goto allocate_blocks;
    787
    788	/*
     789	 * NOWAIT and OVERWRITE I/O need to span the entire requested I/O with
    790	 * a single map so that we avoid partial IO failures due to the rest of
    791	 * the I/O range not covered by this map triggering an EAGAIN condition
    792	 * when it is subsequently mapped and aborting the I/O.
    793	 */
    794	if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) {
    795		error = -EAGAIN;
    796		if (!imap_spans_range(&imap, offset_fsb, end_fsb))
    797			goto out_unlock;
    798	}
    799
    800	/*
    801	 * For overwrite only I/O, we cannot convert unwritten extents without
    802	 * requiring sub-block zeroing.  This can only be done under an
    803	 * exclusive IOLOCK, hence return -EAGAIN if this is not a written
    804	 * extent to tell the caller to try again.
    805	 */
    806	if (flags & IOMAP_OVERWRITE_ONLY) {
    807		error = -EAGAIN;
    808		if (imap.br_state != XFS_EXT_NORM &&
    809	            ((offset | length) & mp->m_blockmask))
    810			goto out_unlock;
    811	}
    812
    813	xfs_iunlock(ip, lockmode);
    814	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
    815	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
    816
    817allocate_blocks:
    818	error = -EAGAIN;
    819	if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY))
    820		goto out_unlock;
    821
    822	/*
     823	 * We cap the maximum length we map to a sane size to keep the chunks
     824	 * of work done here somewhat symmetric with the work writeback does.
    825	 * This is a completely arbitrary number pulled out of thin air as a
    826	 * best guess for initial testing.
    827	 *
     828	 * Note that the value needs to be less than 32 bits wide until the
    829	 * lower level functions are updated.
    830	 */
    831	length = min_t(loff_t, length, 1024 * PAGE_SIZE);
    832	end_fsb = xfs_iomap_end_fsb(mp, offset, length);
    833
    834	if (offset + length > XFS_ISIZE(ip))
    835		end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
    836	else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
    837		end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
    838	xfs_iunlock(ip, lockmode);
    839
    840	error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
    841			flags, &imap);
    842	if (error)
    843		return error;
    844
    845	trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
    846	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
    847				 iomap_flags | IOMAP_F_NEW);
    848
    849out_found_cow:
    850	xfs_iunlock(ip, lockmode);
    851	length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
    852	trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
    853	if (imap.br_startblock != HOLESTARTBLOCK) {
    854		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
    855		if (error)
    856			return error;
    857	}
    858	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
    859
    860out_unlock:
    861	if (lockmode)
    862		xfs_iunlock(ip, lockmode);
    863	return error;
    864}
    865
    866const struct iomap_ops xfs_direct_write_iomap_ops = {
    867	.iomap_begin		= xfs_direct_write_iomap_begin,
    868};
    869
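        /*
         * iomap_begin method for buffered writes: reuse existing data or COW
         * fork mappings where possible, otherwise reserve delalloc blocks in
         * the appropriate fork, with speculative preallocation beyond EOF.
         */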
    870static int
    871xfs_buffered_write_iomap_begin(
    872	struct inode		*inode,
    873	loff_t			offset,
    874	loff_t			count,
    875	unsigned		flags,
    876	struct iomap		*iomap,
    877	struct iomap		*srcmap)
    878{
    879	struct xfs_inode	*ip = XFS_I(inode);
    880	struct xfs_mount	*mp = ip->i_mount;
    881	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
    882	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, count);
    883	struct xfs_bmbt_irec	imap, cmap;
    884	struct xfs_iext_cursor	icur, ccur;
    885	xfs_fsblock_t		prealloc_blocks = 0;
    886	bool			eof = false, cow_eof = false, shared = false;
    887	int			allocfork = XFS_DATA_FORK;
    888	int			error = 0;
    889
    890	if (xfs_is_shutdown(mp))
    891		return -EIO;
    892
    893	/* we can't use delayed allocations when using extent size hints */
    894	if (xfs_get_extsz_hint(ip))
    895		return xfs_direct_write_iomap_begin(inode, offset, count,
    896				flags, iomap, srcmap);
    897
    898	ASSERT(!XFS_IS_REALTIME_INODE(ip));
    899
    900	xfs_ilock(ip, XFS_ILOCK_EXCL);
    901
    902	if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
    903	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
    904		error = -EFSCORRUPTED;
    905		goto out_unlock;
    906	}
    907
    908	XFS_STATS_INC(mp, xs_blk_mapw);
    909
    910	error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
    911	if (error)
    912		goto out_unlock;
    913
    914	/*
    915	 * Search the data fork first to look up our source mapping.  We
    916	 * always need the data fork map, as we have to return it to the
    917	 * iomap code so that the higher level write code can read data in to
    918	 * perform read-modify-write cycles for unaligned writes.
    919	 */
    920	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
    921	if (eof)
    922		imap.br_startoff = end_fsb; /* fake hole until the end */
    923
    924	/* We never need to allocate blocks for zeroing a hole. */
    925	if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
    926		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
    927		goto out_unlock;
    928	}
    929
    930	/*
    931	 * Search the COW fork extent list even if we did not find a data fork
    932	 * extent.  This serves two purposes: first this implements the
    933	 * speculative preallocation using cowextsize, so that we also unshare
     934	 * blocks adjacent to shared blocks instead of just the shared blocks
    935	 * themselves.  Second the lookup in the extent list is generally faster
    936	 * than going out to the shared extent tree.
    937	 */
    938	if (xfs_is_cow_inode(ip)) {
    939		if (!ip->i_cowfp) {
    940			ASSERT(!xfs_is_reflink_inode(ip));
    941			xfs_ifork_init_cow(ip);
    942		}
    943		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
    944				&ccur, &cmap);
    945		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
    946			trace_xfs_reflink_cow_found(ip, &cmap);
    947			goto found_cow;
    948		}
    949	}
    950
    951	if (imap.br_startoff <= offset_fsb) {
    952		/*
    953		 * For reflink files we may need a delalloc reservation when
    954		 * overwriting shared extents.   This includes zeroing of
    955		 * existing extents that contain data.
    956		 */
    957		if (!xfs_is_cow_inode(ip) ||
    958		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
    959			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
    960					&imap);
    961			goto found_imap;
    962		}
    963
    964		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
    965
    966		/* Trim the mapping to the nearest shared extent boundary. */
    967		error = xfs_bmap_trim_cow(ip, &imap, &shared);
    968		if (error)
    969			goto out_unlock;
    970
    971		/* Not shared?  Just report the (potentially capped) extent. */
    972		if (!shared) {
    973			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
    974					&imap);
    975			goto found_imap;
    976		}
    977
    978		/*
    979		 * Fork all the shared blocks from our write offset until the
    980		 * end of the extent.
    981		 */
    982		allocfork = XFS_COW_FORK;
    983		end_fsb = imap.br_startoff + imap.br_blockcount;
    984	} else {
    985		/*
    986		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
     987		 * pages to keep the chunks of work done here somewhat
    988		 * symmetric with the work writeback does.  This is a completely
    989		 * arbitrary number pulled out of thin air.
    990		 *
     991		 * Note that the value needs to be less than 32 bits wide until
    992		 * the lower level functions are updated.
    993		 */
    994		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
    995		end_fsb = xfs_iomap_end_fsb(mp, offset, count);
    996
    997		if (xfs_is_always_cow_inode(ip))
    998			allocfork = XFS_COW_FORK;
    999	}
   1000
   1001	error = xfs_qm_dqattach_locked(ip, false);
   1002	if (error)
   1003		goto out_unlock;
   1004
   1005	if (eof && offset + count > XFS_ISIZE(ip)) {
   1006		/*
   1007		 * Determine the initial size of the preallocation.
   1008		 * We clean up any extra preallocation when the file is closed.
   1009		 */
   1010		if (xfs_has_allocsize(mp))
   1011			prealloc_blocks = mp->m_allocsize_blocks;
   1012		else
   1013			prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork,
   1014						offset, count, &icur);
   1015		if (prealloc_blocks) {
   1016			xfs_extlen_t	align;
   1017			xfs_off_t	end_offset;
   1018			xfs_fileoff_t	p_end_fsb;
   1019
   1020			end_offset = XFS_ALLOC_ALIGN(mp, offset + count - 1);
   1021			p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
   1022					prealloc_blocks;
   1023
   1024			align = xfs_eof_alignment(ip);
   1025			if (align)
   1026				p_end_fsb = roundup_64(p_end_fsb, align);
   1027
   1028			p_end_fsb = min(p_end_fsb,
   1029				XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
   1030			ASSERT(p_end_fsb > offset_fsb);
   1031			prealloc_blocks = p_end_fsb - end_fsb;
   1032		}
   1033	}
   1034
   1035retry:
   1036	error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
   1037			end_fsb - offset_fsb, prealloc_blocks,
   1038			allocfork == XFS_DATA_FORK ? &imap : &cmap,
   1039			allocfork == XFS_DATA_FORK ? &icur : &ccur,
   1040			allocfork == XFS_DATA_FORK ? eof : cow_eof);
   1041	switch (error) {
   1042	case 0:
   1043		break;
   1044	case -ENOSPC:
   1045	case -EDQUOT:
   1046		/* retry without any preallocation */
   1047		trace_xfs_delalloc_enospc(ip, offset, count);
   1048		if (prealloc_blocks) {
   1049			prealloc_blocks = 0;
   1050			goto retry;
   1051		}
   1052		fallthrough;
   1053	default:
   1054		goto out_unlock;
   1055	}
   1056
   1057	if (allocfork == XFS_COW_FORK) {
   1058		trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap);
   1059		goto found_cow;
   1060	}
   1061
   1062	/*
   1063	 * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
   1064	 * them out if the write happens to fail.
   1065	 */
   1066	xfs_iunlock(ip, XFS_ILOCK_EXCL);
   1067	trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
   1068	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
   1069
   1070found_imap:
   1071	xfs_iunlock(ip, XFS_ILOCK_EXCL);
   1072	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
   1073
   1074found_cow:
   1075	xfs_iunlock(ip, XFS_ILOCK_EXCL);
   1076	if (imap.br_startoff <= offset_fsb) {
   1077		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
   1078		if (error)
   1079			return error;
   1080		return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
   1081					 IOMAP_F_SHARED);
   1082	}
   1083
   1084	xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
   1085	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
   1086
   1087out_unlock:
   1088	xfs_iunlock(ip, XFS_ILOCK_EXCL);
   1089	return error;
   1090}
   1091
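        /*
         * iomap_end method for buffered writes: punch out delalloc blocks
         * that this write reserved (IOMAP_F_NEW) but did not end up covering,
         * so no stale reservations are left behind.
         */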
   1092static int
   1093xfs_buffered_write_iomap_end(
   1094	struct inode		*inode,
   1095	loff_t			offset,
   1096	loff_t			length,
   1097	ssize_t			written,
   1098	unsigned		flags,
   1099	struct iomap		*iomap)
   1100{
   1101	struct xfs_inode	*ip = XFS_I(inode);
   1102	struct xfs_mount	*mp = ip->i_mount;
   1103	xfs_fileoff_t		start_fsb;
   1104	xfs_fileoff_t		end_fsb;
   1105	int			error = 0;
   1106
   1107	if (iomap->type != IOMAP_DELALLOC)
   1108		return 0;
   1109
   1110	/*
   1111	 * Behave as if the write failed if drop writes is enabled. Set the NEW
   1112	 * flag to force delalloc cleanup.
   1113	 */
   1114	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) {
   1115		iomap->flags |= IOMAP_F_NEW;
   1116		written = 0;
   1117	}
   1118
   1119	/*
   1120	 * start_fsb refers to the first unused block after a short write. If
   1121	 * nothing was written, round offset down to point at the first block in
   1122	 * the range.
   1123	 */
   1124	if (unlikely(!written))
   1125		start_fsb = XFS_B_TO_FSBT(mp, offset);
   1126	else
   1127		start_fsb = XFS_B_TO_FSB(mp, offset + written);
   1128	end_fsb = XFS_B_TO_FSB(mp, offset + length);
   1129
   1130	/*
   1131	 * Trim delalloc blocks if they were allocated by this write and we
   1132	 * didn't manage to write the whole range.
   1133	 *
   1134	 * We don't need to care about racing delalloc as we hold i_mutex
   1135	 * across the reserve/allocate/unreserve calls. If there are delalloc
   1136	 * blocks in the range, they are ours.
   1137	 */
   1138	if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
   1139		truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
   1140					 XFS_FSB_TO_B(mp, end_fsb) - 1);
   1141
   1142		error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
   1143					       end_fsb - start_fsb);
   1144		if (error && !xfs_is_shutdown(mp)) {
   1145			xfs_alert(mp, "%s: unable to clean up ino %lld",
   1146				__func__, ip->i_ino);
   1147			return error;
   1148		}
   1149	}
   1150
   1151	return 0;
   1152}
   1153
   1154const struct iomap_ops xfs_buffered_write_iomap_ops = {
   1155	.iomap_begin		= xfs_buffered_write_iomap_begin,
   1156	.iomap_end		= xfs_buffered_write_iomap_end,
   1157};
   1158
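        /*
         * iomap_begin method for reads: a plain data fork lookup, trimmed to
         * shared extent boundaries when IOMAP_REPORT asks for sharing
         * information.
         */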
   1159static int
   1160xfs_read_iomap_begin(
   1161	struct inode		*inode,
   1162	loff_t			offset,
   1163	loff_t			length,
   1164	unsigned		flags,
   1165	struct iomap		*iomap,
   1166	struct iomap		*srcmap)
   1167{
   1168	struct xfs_inode	*ip = XFS_I(inode);
   1169	struct xfs_mount	*mp = ip->i_mount;
   1170	struct xfs_bmbt_irec	imap;
   1171	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
   1172	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
   1173	int			nimaps = 1, error = 0;
   1174	bool			shared = false;
   1175	unsigned		lockmode;
   1176
   1177	ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
   1178
   1179	if (xfs_is_shutdown(mp))
   1180		return -EIO;
   1181
   1182	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
   1183	if (error)
   1184		return error;
   1185	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
   1186			       &nimaps, 0);
   1187	if (!error && (flags & IOMAP_REPORT))
   1188		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
   1189	xfs_iunlock(ip, lockmode);
   1190
   1191	if (error)
   1192		return error;
   1193	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
   1194	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
   1195				 shared ? IOMAP_F_SHARED : 0);
   1196}
   1197
   1198const struct iomap_ops xfs_read_iomap_ops = {
   1199	.iomap_begin		= xfs_read_iomap_begin,
   1200};
   1201
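        /*
         * iomap_begin method backing SEEK_HOLE/SEEK_DATA: report data fork
         * extents directly, expose COW fork blocks over holes as unwritten
         * mappings (they may be backed by dirty page cache), and otherwise
         * synthesise a hole capped at the next data or COW extent.
         */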
   1202static int
   1203xfs_seek_iomap_begin(
   1204	struct inode		*inode,
   1205	loff_t			offset,
   1206	loff_t			length,
   1207	unsigned		flags,
   1208	struct iomap		*iomap,
   1209	struct iomap		*srcmap)
   1210{
   1211	struct xfs_inode	*ip = XFS_I(inode);
   1212	struct xfs_mount	*mp = ip->i_mount;
   1213	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
   1214	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + length);
   1215	xfs_fileoff_t		cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
   1216	struct xfs_iext_cursor	icur;
   1217	struct xfs_bmbt_irec	imap, cmap;
   1218	int			error = 0;
   1219	unsigned		lockmode;
   1220
   1221	if (xfs_is_shutdown(mp))
   1222		return -EIO;
   1223
   1224	lockmode = xfs_ilock_data_map_shared(ip);
   1225	error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
   1226	if (error)
   1227		goto out_unlock;
   1228
   1229	if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
   1230		/*
   1231		 * If we found a data extent we are done.
   1232		 */
   1233		if (imap.br_startoff <= offset_fsb)
   1234			goto done;
   1235		data_fsb = imap.br_startoff;
   1236	} else {
   1237		/*
   1238		 * Fake a hole until the end of the file.
   1239		 */
   1240		data_fsb = xfs_iomap_end_fsb(mp, offset, length);
   1241	}
   1242
   1243	/*
   1244	 * If a COW fork extent covers the hole, report it - capped to the next
   1245	 * data fork extent:
   1246	 */
   1247	if (xfs_inode_has_cow_data(ip) &&
   1248	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
   1249		cow_fsb = cmap.br_startoff;
   1250	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
   1251		if (data_fsb < cow_fsb + cmap.br_blockcount)
   1252			end_fsb = min(end_fsb, data_fsb);
   1253		xfs_trim_extent(&cmap, offset_fsb, end_fsb);
   1254		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
   1255					  IOMAP_F_SHARED);
   1256		/*
   1257		 * This is a COW extent, so we must probe the page cache
   1258		 * because there could be dirty page cache being backed
   1259		 * by this extent.
   1260		 */
   1261		iomap->type = IOMAP_UNWRITTEN;
   1262		goto out_unlock;
   1263	}
   1264
   1265	/*
   1266	 * Else report a hole, capped to the next found data or COW extent.
   1267	 */
   1268	if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
   1269		imap.br_blockcount = cow_fsb - offset_fsb;
   1270	else
   1271		imap.br_blockcount = data_fsb - offset_fsb;
   1272	imap.br_startoff = offset_fsb;
   1273	imap.br_startblock = HOLESTARTBLOCK;
   1274	imap.br_state = XFS_EXT_NORM;
   1275done:
   1276	xfs_trim_extent(&imap, offset_fsb, end_fsb);
   1277	error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
   1278out_unlock:
   1279	xfs_iunlock(ip, lockmode);
   1280	return error;
   1281}
   1282
   1283const struct iomap_ops xfs_seek_iomap_ops = {
   1284	.iomap_begin		= xfs_seek_iomap_begin,
   1285};
   1286
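        /*
         * iomap_begin method for the attribute fork (used for xattr FIEMAP
         * reporting): fail with -ENOENT when there is nothing to map.
         */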
   1287static int
   1288xfs_xattr_iomap_begin(
   1289	struct inode		*inode,
   1290	loff_t			offset,
   1291	loff_t			length,
   1292	unsigned		flags,
   1293	struct iomap		*iomap,
   1294	struct iomap		*srcmap)
   1295{
   1296	struct xfs_inode	*ip = XFS_I(inode);
   1297	struct xfs_mount	*mp = ip->i_mount;
   1298	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
   1299	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + length);
   1300	struct xfs_bmbt_irec	imap;
   1301	int			nimaps = 1, error = 0;
   1302	unsigned		lockmode;
   1303
   1304	if (xfs_is_shutdown(mp))
   1305		return -EIO;
   1306
   1307	lockmode = xfs_ilock_attr_map_shared(ip);
   1308
   1309	/* if there are no attribute fork or extents, return ENOENT */
   1310	if (!XFS_IFORK_Q(ip) || !ip->i_afp->if_nextents) {
   1311		error = -ENOENT;
   1312		goto out_unlock;
   1313	}
   1314
   1315	ASSERT(ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL);
   1316	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
   1317			       &nimaps, XFS_BMAPI_ATTRFORK);
   1318out_unlock:
   1319	xfs_iunlock(ip, lockmode);
   1320
   1321	if (error)
   1322		return error;
   1323	ASSERT(nimaps);
   1324	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
   1325}
   1326
   1327const struct iomap_ops xfs_xattr_iomap_ops = {
   1328	.iomap_begin		= xfs_xattr_iomap_begin,
   1329};
   1330
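        /*
         * Zero a byte range of a file, going through the DAX or the buffered
         * write iomap path as appropriate.
         */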
   1331int
   1332xfs_zero_range(
   1333	struct xfs_inode	*ip,
   1334	loff_t			pos,
   1335	loff_t			len,
   1336	bool			*did_zero)
   1337{
   1338	struct inode		*inode = VFS_I(ip);
   1339
   1340	if (IS_DAX(inode))
   1341		return dax_zero_range(inode, pos, len, did_zero,
   1342				      &xfs_direct_write_iomap_ops);
   1343	return iomap_zero_range(inode, pos, len, did_zero,
   1344				&xfs_buffered_write_iomap_ops);
   1345}
   1346
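        /*
         * Zero the remainder of the block containing @pos, via the DAX or
         * the buffered write iomap path as appropriate.
         */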
   1347int
   1348xfs_truncate_page(
   1349	struct xfs_inode	*ip,
   1350	loff_t			pos,
   1351	bool			*did_zero)
   1352{
   1353	struct inode		*inode = VFS_I(ip);
   1354
   1355	if (IS_DAX(inode))
   1356		return dax_truncate_page(inode, pos, did_zero,
   1357					&xfs_direct_write_iomap_ops);
   1358	return iomap_truncate_page(inode, pos, did_zero,
   1359				   &xfs_buffered_write_iomap_ops);
   1360}