cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

file.c (70338B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * file.c
      4 *
      5 * File open, close, extend, truncate
      6 *
      7 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
      8 */
      9
     10#include <linux/capability.h>
     11#include <linux/fs.h>
     12#include <linux/types.h>
     13#include <linux/slab.h>
     14#include <linux/highmem.h>
     15#include <linux/pagemap.h>
     16#include <linux/uio.h>
     17#include <linux/sched.h>
     18#include <linux/splice.h>
     19#include <linux/mount.h>
     20#include <linux/writeback.h>
     21#include <linux/falloc.h>
     22#include <linux/quotaops.h>
     23#include <linux/blkdev.h>
     24#include <linux/backing-dev.h>
     25
     26#include <cluster/masklog.h>
     27
     28#include "ocfs2.h"
     29
     30#include "alloc.h"
     31#include "aops.h"
     32#include "dir.h"
     33#include "dlmglue.h"
     34#include "extent_map.h"
     35#include "file.h"
     36#include "sysfile.h"
     37#include "inode.h"
     38#include "ioctl.h"
     39#include "journal.h"
     40#include "locks.h"
     41#include "mmap.h"
     42#include "suballoc.h"
     43#include "super.h"
     44#include "xattr.h"
     45#include "acl.h"
     46#include "quota.h"
     47#include "refcounttree.h"
     48#include "ocfs2_trace.h"
     49
     50#include "buffer_head_io.h"
     51
     52static int ocfs2_init_file_private(struct inode *inode, struct file *file)
     53{
     54	struct ocfs2_file_private *fp;
     55
     56	fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
     57	if (!fp)
     58		return -ENOMEM;
     59
     60	fp->fp_file = file;
     61	mutex_init(&fp->fp_mutex);
     62	ocfs2_file_lock_res_init(&fp->fp_flock, fp);
     63	file->private_data = fp;
     64
     65	return 0;
     66}
     67
     68static void ocfs2_free_file_private(struct inode *inode, struct file *file)
     69{
     70	struct ocfs2_file_private *fp = file->private_data;
     71	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
     72
     73	if (fp) {
     74		ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
     75		ocfs2_lock_res_free(&fp->fp_flock);
     76		kfree(fp);
     77		file->private_data = NULL;
     78	}
     79}
     80
     81static int ocfs2_file_open(struct inode *inode, struct file *file)
     82{
     83	int status;
     84	int mode = file->f_flags;
     85	struct ocfs2_inode_info *oi = OCFS2_I(inode);
     86
     87	trace_ocfs2_file_open(inode, file, file->f_path.dentry,
     88			      (unsigned long long)oi->ip_blkno,
     89			      file->f_path.dentry->d_name.len,
     90			      file->f_path.dentry->d_name.name, mode);
     91
     92	if (file->f_mode & FMODE_WRITE) {
     93		status = dquot_initialize(inode);
     94		if (status)
     95			goto leave;
     96	}
     97
     98	spin_lock(&oi->ip_lock);
     99
    100	/* Check that the inode hasn't been wiped from disk by another
    101	 * node. If it hasn't then we're safe as long as we hold the
    102	 * spin lock until our increment of open count. */
    103	if (oi->ip_flags & OCFS2_INODE_DELETED) {
    104		spin_unlock(&oi->ip_lock);
    105
    106		status = -ENOENT;
    107		goto leave;
    108	}
    109
    110	if (mode & O_DIRECT)
    111		oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
    112
    113	oi->ip_open_count++;
    114	spin_unlock(&oi->ip_lock);
    115
    116	status = ocfs2_init_file_private(inode, file);
    117	if (status) {
    118		/*
    119		 * We want to set open count back if we're failing the
    120		 * open.
    121		 */
    122		spin_lock(&oi->ip_lock);
    123		oi->ip_open_count--;
    124		spin_unlock(&oi->ip_lock);
    125	}
    126
    127	file->f_mode |= FMODE_NOWAIT;
    128
    129leave:
    130	return status;
    131}
    132
    133static int ocfs2_file_release(struct inode *inode, struct file *file)
    134{
    135	struct ocfs2_inode_info *oi = OCFS2_I(inode);
    136
    137	spin_lock(&oi->ip_lock);
    138	if (!--oi->ip_open_count)
    139		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
    140
    141	trace_ocfs2_file_release(inode, file, file->f_path.dentry,
    142				 oi->ip_blkno,
    143				 file->f_path.dentry->d_name.len,
    144				 file->f_path.dentry->d_name.name,
    145				 oi->ip_open_count);
    146	spin_unlock(&oi->ip_lock);
    147
    148	ocfs2_free_file_private(inode, file);
    149
    150	return 0;
    151}
    152
    153static int ocfs2_dir_open(struct inode *inode, struct file *file)
    154{
    155	return ocfs2_init_file_private(inode, file);
    156}
    157
    158static int ocfs2_dir_release(struct inode *inode, struct file *file)
    159{
    160	ocfs2_free_file_private(inode, file);
    161	return 0;
    162}
    163
    164static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
    165			   int datasync)
    166{
    167	int err = 0;
    168	struct inode *inode = file->f_mapping->host;
    169	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    170	struct ocfs2_inode_info *oi = OCFS2_I(inode);
    171	journal_t *journal = osb->journal->j_journal;
    172	int ret;
    173	tid_t commit_tid;
    174	bool needs_barrier = false;
    175
    176	trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
    177			      oi->ip_blkno,
    178			      file->f_path.dentry->d_name.len,
    179			      file->f_path.dentry->d_name.name,
    180			      (unsigned long long)datasync);
    181
    182	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
    183		return -EROFS;
    184
    185	err = file_write_and_wait_range(file, start, end);
    186	if (err)
    187		return err;
    188
    189	commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
    190	if (journal->j_flags & JBD2_BARRIER &&
    191	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
    192		needs_barrier = true;
    193	err = jbd2_complete_transaction(journal, commit_tid);
    194	if (needs_barrier) {
    195		ret = blkdev_issue_flush(inode->i_sb->s_bdev);
    196		if (!err)
    197			err = ret;
    198	}
    199
    200	if (err)
    201		mlog_errno(err);
    202
    203	return (err < 0) ? -EIO : 0;
    204}
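
/*
 * Roughly: a datasync of 1 corresponds to fdatasync(2) and only has to
 * wait for the transaction recorded in i_datasync_tid, while fsync(2)
 * (datasync == 0) waits for i_sync_tid, which also covers pure
 * metadata updates.  The explicit blkdev_issue_flush() is only needed
 * when JBD2 will not send a cache barrier for that transaction itself.
 */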
    205
    206int ocfs2_should_update_atime(struct inode *inode,
    207			      struct vfsmount *vfsmnt)
    208{
    209	struct timespec64 now;
    210	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    211
    212	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
    213		return 0;
    214
    215	if ((inode->i_flags & S_NOATIME) ||
    216	    ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)))
    217		return 0;
    218
    219	/*
    220	 * We can be called with no vfsmnt structure - NFSD will
    221	 * sometimes do this.
    222	 *
    223	 * Note that our action here is different than touch_atime() -
    224	 * if we can't tell whether this is a noatime mount, then we
    225	 * don't know whether to trust the value of s_atime_quantum.
    226	 */
    227	if (vfsmnt == NULL)
    228		return 0;
    229
    230	if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
    231	    ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
    232		return 0;
    233
    234	if (vfsmnt->mnt_flags & MNT_RELATIME) {
    235		if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
    236		    (timespec64_compare(&inode->i_atime, &inode->i_ctime) <= 0))
    237			return 1;
    238
    239		return 0;
    240	}
    241
    242	now = current_time(inode);
    243	if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
    244		return 0;
    245	else
    246		return 1;
    247}
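
/*
 * Worked example for the quantum check above: with atime_quantum=60
 * (the documented mount-option default, in seconds), an inode read
 * twice within a minute only triggers one cluster-wide atime update;
 * a read whose "now - i_atime" gap exceeds 60s returns 1 and causes
 * ocfs2_update_inode_atime() to be called.
 */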
    248
    249int ocfs2_update_inode_atime(struct inode *inode,
    250			     struct buffer_head *bh)
    251{
    252	int ret;
    253	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    254	handle_t *handle;
    255	struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
    256
    257	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
    258	if (IS_ERR(handle)) {
    259		ret = PTR_ERR(handle);
    260		mlog_errno(ret);
    261		goto out;
    262	}
    263
    264	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
    265				      OCFS2_JOURNAL_ACCESS_WRITE);
    266	if (ret) {
    267		mlog_errno(ret);
    268		goto out_commit;
    269	}
    270
    271	/*
    272	 * Don't use ocfs2_mark_inode_dirty() here as we don't always
    273	 * have i_rwsem to guard against concurrent changes to other
    274	 * inode fields.
    275	 */
    276	inode->i_atime = current_time(inode);
    277	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
    278	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
    279	ocfs2_update_inode_fsync_trans(handle, inode, 0);
    280	ocfs2_journal_dirty(handle, bh);
    281
    282out_commit:
    283	ocfs2_commit_trans(osb, handle);
    284out:
    285	return ret;
    286}
    287
    288int ocfs2_set_inode_size(handle_t *handle,
    289				struct inode *inode,
    290				struct buffer_head *fe_bh,
    291				u64 new_i_size)
    292{
    293	int status;
    294
    295	i_size_write(inode, new_i_size);
    296	inode->i_blocks = ocfs2_inode_sector_count(inode);
    297	inode->i_ctime = inode->i_mtime = current_time(inode);
    298
    299	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
    300	if (status < 0) {
    301		mlog_errno(status);
    302		goto bail;
    303	}
    304
    305bail:
    306	return status;
    307}
    308
    309int ocfs2_simple_size_update(struct inode *inode,
    310			     struct buffer_head *di_bh,
    311			     u64 new_i_size)
    312{
    313	int ret;
    314	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    315	handle_t *handle = NULL;
    316
    317	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
    318	if (IS_ERR(handle)) {
    319		ret = PTR_ERR(handle);
    320		mlog_errno(ret);
    321		goto out;
    322	}
    323
    324	ret = ocfs2_set_inode_size(handle, inode, di_bh,
    325				   new_i_size);
    326	if (ret < 0)
    327		mlog_errno(ret);
    328
    329	ocfs2_update_inode_fsync_trans(handle, inode, 0);
    330	ocfs2_commit_trans(osb, handle);
    331out:
    332	return ret;
    333}
    334
    335static int ocfs2_cow_file_pos(struct inode *inode,
    336			      struct buffer_head *fe_bh,
    337			      u64 offset)
    338{
    339	int status;
    340	u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
    341	unsigned int num_clusters = 0;
    342	unsigned int ext_flags = 0;
    343
    344	/*
     345	 * If the new offset is cluster-aligned, there is no space for
     346	 * ocfs2_zero_range_for_truncate to fill, so there is no need to
     347	 * CoW either.
    348	 */
    349	if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
    350		return 0;
    351
    352	status = ocfs2_get_clusters(inode, cpos, &phys,
    353				    &num_clusters, &ext_flags);
    354	if (status) {
    355		mlog_errno(status);
    356		goto out;
    357	}
    358
    359	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
    360		goto out;
    361
    362	return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
    363
    364out:
    365	return status;
    366}
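
/*
 * Example with a 1MB cluster size (s_clustersize_bits == 20): a
 * truncate to offset 0x180000 leaves (offset & (clustersize - 1)) ==
 * 0x80000, so cluster cpos == 1 has a tail to zero and is CoW'd first
 * if it is refcounted.  A truncate to 0x200000 is cluster-aligned and
 * returns 0 straight away.
 */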
    367
    368static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
    369				     struct inode *inode,
    370				     struct buffer_head *fe_bh,
    371				     u64 new_i_size)
    372{
    373	int status;
    374	handle_t *handle;
    375	struct ocfs2_dinode *di;
    376	u64 cluster_bytes;
    377
    378	/*
     379	 * We need to CoW the cluster that contains the offset if it is
     380	 * reflinked, since ocfs2_zero_range_for_truncate will later write
     381	 * zeros from the offset to the end of the cluster.
    382	 */
    383	status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
    384	if (status) {
    385		mlog_errno(status);
    386		return status;
    387	}
    388
    389	/* TODO: This needs to actually orphan the inode in this
    390	 * transaction. */
    391
    392	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
    393	if (IS_ERR(handle)) {
    394		status = PTR_ERR(handle);
    395		mlog_errno(status);
    396		goto out;
    397	}
    398
    399	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
    400					 OCFS2_JOURNAL_ACCESS_WRITE);
    401	if (status < 0) {
    402		mlog_errno(status);
    403		goto out_commit;
    404	}
    405
    406	/*
    407	 * Do this before setting i_size.
    408	 */
    409	cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
    410	status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
    411					       cluster_bytes);
    412	if (status) {
    413		mlog_errno(status);
    414		goto out_commit;
    415	}
    416
    417	i_size_write(inode, new_i_size);
    418	inode->i_ctime = inode->i_mtime = current_time(inode);
    419
    420	di = (struct ocfs2_dinode *) fe_bh->b_data;
    421	di->i_size = cpu_to_le64(new_i_size);
    422	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
    423	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
    424	ocfs2_update_inode_fsync_trans(handle, inode, 0);
    425
    426	ocfs2_journal_dirty(handle, fe_bh);
    427
    428out_commit:
    429	ocfs2_commit_trans(osb, handle);
    430out:
    431	return status;
    432}
    433
    434int ocfs2_truncate_file(struct inode *inode,
    435			       struct buffer_head *di_bh,
    436			       u64 new_i_size)
    437{
    438	int status = 0;
    439	struct ocfs2_dinode *fe = NULL;
    440	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    441
    442	/* We trust di_bh because it comes from ocfs2_inode_lock(), which
    443	 * already validated it */
    444	fe = (struct ocfs2_dinode *) di_bh->b_data;
    445
    446	trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
    447				  (unsigned long long)le64_to_cpu(fe->i_size),
    448				  (unsigned long long)new_i_size);
    449
    450	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
    451			"Inode %llu, inode i_size = %lld != di "
    452			"i_size = %llu, i_flags = 0x%x\n",
    453			(unsigned long long)OCFS2_I(inode)->ip_blkno,
    454			i_size_read(inode),
    455			(unsigned long long)le64_to_cpu(fe->i_size),
    456			le32_to_cpu(fe->i_flags));
    457
    458	if (new_i_size > le64_to_cpu(fe->i_size)) {
    459		trace_ocfs2_truncate_file_error(
    460			(unsigned long long)le64_to_cpu(fe->i_size),
    461			(unsigned long long)new_i_size);
    462		status = -EINVAL;
    463		mlog_errno(status);
    464		goto bail;
    465	}
    466
    467	down_write(&OCFS2_I(inode)->ip_alloc_sem);
    468
    469	ocfs2_resv_discard(&osb->osb_la_resmap,
    470			   &OCFS2_I(inode)->ip_la_data_resv);
    471
    472	/*
    473	 * The inode lock forced other nodes to sync and drop their
    474	 * pages, which (correctly) happens even if we have a truncate
    475	 * without allocation change - ocfs2 cluster sizes can be much
    476	 * greater than page size, so we have to truncate them
    477	 * anyway.
    478	 */
    479
    480	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
    481		unmap_mapping_range(inode->i_mapping,
    482				    new_i_size + PAGE_SIZE - 1, 0, 1);
    483		truncate_inode_pages(inode->i_mapping, new_i_size);
    484		status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
    485					       i_size_read(inode), 1);
    486		if (status)
    487			mlog_errno(status);
    488
    489		goto bail_unlock_sem;
    490	}
    491
    492	/* alright, we're going to need to do a full blown alloc size
    493	 * change. Orphan the inode so that recovery can complete the
    494	 * truncate if necessary. This does the task of marking
    495	 * i_size. */
    496	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
    497	if (status < 0) {
    498		mlog_errno(status);
    499		goto bail_unlock_sem;
    500	}
    501
    502	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
    503	truncate_inode_pages(inode->i_mapping, new_i_size);
    504
    505	status = ocfs2_commit_truncate(osb, inode, di_bh);
    506	if (status < 0) {
    507		mlog_errno(status);
    508		goto bail_unlock_sem;
    509	}
    510
    511	/* TODO: orphan dir cleanup here. */
    512bail_unlock_sem:
    513	up_write(&OCFS2_I(inode)->ip_alloc_sem);
    514
    515bail:
    516	if (!status && OCFS2_I(inode)->ip_clusters == 0)
    517		status = ocfs2_try_remove_refcount_tree(inode, di_bh);
    518
    519	return status;
    520}
    521
    522/*
     523 * Extend file allocation only here.
     524 * We'll update all the on-disk state, and oip->alloc_size.
     525 *
     526 * The caller is expected to hold the locks, have a transaction started,
     527 * and have enough data / metadata reservations in the contexts.
     528 *
     529 * Will return -EAGAIN, and a reason if a restart is needed.
     530 * If passed in, *reason_ret will always be set, even on error.
    531 */
    532int ocfs2_add_inode_data(struct ocfs2_super *osb,
    533			 struct inode *inode,
    534			 u32 *logical_offset,
    535			 u32 clusters_to_add,
    536			 int mark_unwritten,
    537			 struct buffer_head *fe_bh,
    538			 handle_t *handle,
    539			 struct ocfs2_alloc_context *data_ac,
    540			 struct ocfs2_alloc_context *meta_ac,
    541			 enum ocfs2_alloc_restarted *reason_ret)
    542{
    543	struct ocfs2_extent_tree et;
    544
    545	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
    546	return ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
    547					   clusters_to_add, mark_unwritten,
    548					   data_ac, meta_ac, reason_ret);
    549}
    550
    551static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
    552				   u32 clusters_to_add, int mark_unwritten)
    553{
    554	int status = 0;
    555	int restart_func = 0;
    556	int credits;
    557	u32 prev_clusters;
    558	struct buffer_head *bh = NULL;
    559	struct ocfs2_dinode *fe = NULL;
    560	handle_t *handle = NULL;
    561	struct ocfs2_alloc_context *data_ac = NULL;
    562	struct ocfs2_alloc_context *meta_ac = NULL;
    563	enum ocfs2_alloc_restarted why = RESTART_NONE;
    564	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    565	struct ocfs2_extent_tree et;
    566	int did_quota = 0;
    567
    568	/*
     569	 * Unwritten extents only exist on file systems which
     570	 * support holes.
    571	 */
    572	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
    573
    574	status = ocfs2_read_inode_block(inode, &bh);
    575	if (status < 0) {
    576		mlog_errno(status);
    577		goto leave;
    578	}
    579	fe = (struct ocfs2_dinode *) bh->b_data;
    580
    581restart_all:
    582	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
    583
    584	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
    585	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
    586				       &data_ac, &meta_ac);
    587	if (status) {
    588		mlog_errno(status);
    589		goto leave;
    590	}
    591
    592	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
    593	handle = ocfs2_start_trans(osb, credits);
    594	if (IS_ERR(handle)) {
    595		status = PTR_ERR(handle);
    596		handle = NULL;
    597		mlog_errno(status);
    598		goto leave;
    599	}
    600
    601restarted_transaction:
    602	trace_ocfs2_extend_allocation(
    603		(unsigned long long)OCFS2_I(inode)->ip_blkno,
    604		(unsigned long long)i_size_read(inode),
    605		le32_to_cpu(fe->i_clusters), clusters_to_add,
    606		why, restart_func);
    607
    608	status = dquot_alloc_space_nodirty(inode,
    609			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
    610	if (status)
    611		goto leave;
    612	did_quota = 1;
    613
     614	/* reserve a write to the file entry early on - so that if we
    615	 * run out of credits in the allocation path, we can still
    616	 * update i_size. */
    617	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
    618					 OCFS2_JOURNAL_ACCESS_WRITE);
    619	if (status < 0) {
    620		mlog_errno(status);
    621		goto leave;
    622	}
    623
    624	prev_clusters = OCFS2_I(inode)->ip_clusters;
    625
    626	status = ocfs2_add_inode_data(osb,
    627				      inode,
    628				      &logical_start,
    629				      clusters_to_add,
    630				      mark_unwritten,
    631				      bh,
    632				      handle,
    633				      data_ac,
    634				      meta_ac,
    635				      &why);
    636	if ((status < 0) && (status != -EAGAIN)) {
    637		if (status != -ENOSPC)
    638			mlog_errno(status);
    639		goto leave;
    640	}
    641	ocfs2_update_inode_fsync_trans(handle, inode, 1);
    642	ocfs2_journal_dirty(handle, bh);
    643
    644	spin_lock(&OCFS2_I(inode)->ip_lock);
    645	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
    646	spin_unlock(&OCFS2_I(inode)->ip_lock);
    647	/* Release unused quota reservation */
    648	dquot_free_space(inode,
    649			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
    650	did_quota = 0;
    651
    652	if (why != RESTART_NONE && clusters_to_add) {
    653		if (why == RESTART_META) {
    654			restart_func = 1;
    655			status = 0;
    656		} else {
    657			BUG_ON(why != RESTART_TRANS);
    658
    659			status = ocfs2_allocate_extend_trans(handle, 1);
    660			if (status < 0) {
    661				/* handle still has to be committed at
    662				 * this point. */
    663				status = -ENOMEM;
    664				mlog_errno(status);
    665				goto leave;
    666			}
    667			goto restarted_transaction;
    668		}
    669	}
    670
    671	trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
    672	     le32_to_cpu(fe->i_clusters),
    673	     (unsigned long long)le64_to_cpu(fe->i_size),
    674	     OCFS2_I(inode)->ip_clusters,
    675	     (unsigned long long)i_size_read(inode));
    676
    677leave:
    678	if (status < 0 && did_quota)
    679		dquot_free_space(inode,
    680			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
    681	if (handle) {
    682		ocfs2_commit_trans(osb, handle);
    683		handle = NULL;
    684	}
    685	if (data_ac) {
    686		ocfs2_free_alloc_context(data_ac);
    687		data_ac = NULL;
    688	}
    689	if (meta_ac) {
    690		ocfs2_free_alloc_context(meta_ac);
    691		meta_ac = NULL;
    692	}
    693	if ((!status) && restart_func) {
    694		restart_func = 0;
    695		goto restart_all;
    696	}
    697	brelse(bh);
    698	bh = NULL;
    699
    700	return status;
    701}
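
/*
 * Sketch of the restart protocol used above: when
 * ocfs2_add_inode_data() cannot finish, "why" selects the retry path.
 * RESTART_META commits the transaction, frees both allocation
 * contexts and re-runs from restart_all with fresh reservations;
 * RESTART_TRANS keeps the allocators and just extends the running
 * transaction before jumping back to restarted_transaction.
 */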
    702
    703/*
    704 * While a write will already be ordering the data, a truncate will not.
    705 * Thus, we need to explicitly order the zeroed pages.
    706 */
    707static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
    708						      struct buffer_head *di_bh,
    709						      loff_t start_byte,
    710						      loff_t length)
    711{
    712	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    713	handle_t *handle = NULL;
    714	int ret = 0;
    715
    716	if (!ocfs2_should_order_data(inode))
    717		goto out;
    718
    719	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
    720	if (IS_ERR(handle)) {
    721		ret = -ENOMEM;
    722		mlog_errno(ret);
    723		goto out;
    724	}
    725
    726	ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
    727	if (ret < 0) {
    728		mlog_errno(ret);
    729		goto out;
    730	}
    731
    732	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
    733				      OCFS2_JOURNAL_ACCESS_WRITE);
    734	if (ret)
    735		mlog_errno(ret);
    736	ocfs2_update_inode_fsync_trans(handle, inode, 1);
    737
    738out:
    739	if (ret) {
    740		if (!IS_ERR(handle))
    741			ocfs2_commit_trans(osb, handle);
    742		handle = ERR_PTR(ret);
    743	}
    744	return handle;
    745}
    746
     747/* Some parts of this were taken from generic_cont_expand, which turned out
    748 * to be too fragile to do exactly what we need without us having to
    749 * worry about recursive locking in ->write_begin() and ->write_end(). */
    750static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
    751				 u64 abs_to, struct buffer_head *di_bh)
    752{
    753	struct address_space *mapping = inode->i_mapping;
    754	struct page *page;
    755	unsigned long index = abs_from >> PAGE_SHIFT;
    756	handle_t *handle;
    757	int ret = 0;
    758	unsigned zero_from, zero_to, block_start, block_end;
    759	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
    760
    761	BUG_ON(abs_from >= abs_to);
    762	BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
    763	BUG_ON(abs_from & (inode->i_blkbits - 1));
    764
    765	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh,
    766						      abs_from,
    767						      abs_to - abs_from);
    768	if (IS_ERR(handle)) {
    769		ret = PTR_ERR(handle);
    770		goto out;
    771	}
    772
    773	page = find_or_create_page(mapping, index, GFP_NOFS);
    774	if (!page) {
    775		ret = -ENOMEM;
    776		mlog_errno(ret);
    777		goto out_commit_trans;
    778	}
    779
    780	/* Get the offsets within the page that we want to zero */
    781	zero_from = abs_from & (PAGE_SIZE - 1);
    782	zero_to = abs_to & (PAGE_SIZE - 1);
    783	if (!zero_to)
    784		zero_to = PAGE_SIZE;
    785
    786	trace_ocfs2_write_zero_page(
    787			(unsigned long long)OCFS2_I(inode)->ip_blkno,
    788			(unsigned long long)abs_from,
    789			(unsigned long long)abs_to,
    790			index, zero_from, zero_to);
    791
    792	/* We know that zero_from is block aligned */
    793	for (block_start = zero_from; block_start < zero_to;
    794	     block_start = block_end) {
    795		block_end = block_start + i_blocksize(inode);
    796
    797		/*
    798		 * block_start is block-aligned.  Bump it by one to force
    799		 * __block_write_begin and block_commit_write to zero the
    800		 * whole block.
    801		 */
    802		ret = __block_write_begin(page, block_start + 1, 0,
    803					  ocfs2_get_block);
    804		if (ret < 0) {
    805			mlog_errno(ret);
    806			goto out_unlock;
    807		}
    808
    809
    810		/* must not update i_size! */
    811		ret = block_commit_write(page, block_start + 1,
    812					 block_start + 1);
    813		if (ret < 0)
    814			mlog_errno(ret);
    815		else
    816			ret = 0;
    817	}
    818
    819	/*
     820	 * fs-writeback will release dirty pages whose offsets are beyond
     821	 * the inode size without taking the page lock; the release happens
     822	 * at block_write_full_page().
    823	 */
    824	i_size_write(inode, abs_to);
    825	inode->i_blocks = ocfs2_inode_sector_count(inode);
    826	di->i_size = cpu_to_le64((u64)i_size_read(inode));
    827	inode->i_mtime = inode->i_ctime = current_time(inode);
    828	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
    829	di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
    830	di->i_mtime_nsec = di->i_ctime_nsec;
    831	if (handle) {
    832		ocfs2_journal_dirty(handle, di_bh);
    833		ocfs2_update_inode_fsync_trans(handle, inode, 1);
    834	}
    835
    836out_unlock:
    837	unlock_page(page);
    838	put_page(page);
    839out_commit_trans:
    840	if (handle)
    841		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
    842out:
    843	return ret;
    844}
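
/*
 * Example with 4KB pages and 512-byte blocks: zeroing abs_from =
 * 0x1200 to abs_to = 0x2000 lands in page index 1 with zero_from =
 * 0x200 and zero_to = 0x1000 (an aligned abs_to folds zero_to from 0
 * to PAGE_SIZE).  The loop then walks [zero_from, zero_to) one block
 * at a time, passing block_start + 1 so that each block is zeroed in
 * full rather than partially written.
 */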
    845
    846/*
    847 * Find the next range to zero.  We do this in terms of bytes because
    848 * that's what ocfs2_zero_extend() wants, and it is dealing with the
    849 * pagecache.  We may return multiple extents.
    850 *
     851 * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what
    852 * needs to be zeroed.  range_start and range_end return the next zeroing
    853 * range.  A subsequent call should pass the previous range_end as its
    854 * zero_start.  If range_end is 0, there's nothing to do.
    855 *
     856 * Unwritten extents are skipped over.  Refcounted extents are CoW'd.
    857 */
    858static int ocfs2_zero_extend_get_range(struct inode *inode,
    859				       struct buffer_head *di_bh,
    860				       u64 zero_start, u64 zero_end,
    861				       u64 *range_start, u64 *range_end)
    862{
    863	int rc = 0, needs_cow = 0;
    864	u32 p_cpos, zero_clusters = 0;
    865	u32 zero_cpos =
    866		zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
    867	u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
    868	unsigned int num_clusters = 0;
    869	unsigned int ext_flags = 0;
    870
    871	while (zero_cpos < last_cpos) {
    872		rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
    873					&num_clusters, &ext_flags);
    874		if (rc) {
    875			mlog_errno(rc);
    876			goto out;
    877		}
    878
    879		if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
    880			zero_clusters = num_clusters;
    881			if (ext_flags & OCFS2_EXT_REFCOUNTED)
    882				needs_cow = 1;
    883			break;
    884		}
    885
    886		zero_cpos += num_clusters;
    887	}
    888	if (!zero_clusters) {
    889		*range_end = 0;
    890		goto out;
    891	}
    892
    893	while ((zero_cpos + zero_clusters) < last_cpos) {
    894		rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
    895					&p_cpos, &num_clusters,
    896					&ext_flags);
    897		if (rc) {
    898			mlog_errno(rc);
    899			goto out;
    900		}
    901
    902		if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
    903			break;
    904		if (ext_flags & OCFS2_EXT_REFCOUNTED)
    905			needs_cow = 1;
    906		zero_clusters += num_clusters;
    907	}
    908	if ((zero_cpos + zero_clusters) > last_cpos)
    909		zero_clusters = last_cpos - zero_cpos;
    910
    911	if (needs_cow) {
    912		rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
    913					zero_clusters, UINT_MAX);
    914		if (rc) {
    915			mlog_errno(rc);
    916			goto out;
    917		}
    918	}
    919
    920	*range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
    921	*range_end = ocfs2_clusters_to_bytes(inode->i_sb,
    922					     zero_cpos + zero_clusters);
    923
    924out:
    925	return rc;
    926}
    927
    928/*
    929 * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
    930 * has made sure that the entire range needs zeroing.
    931 */
    932static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
    933				   u64 range_end, struct buffer_head *di_bh)
    934{
    935	int rc = 0;
    936	u64 next_pos;
    937	u64 zero_pos = range_start;
    938
    939	trace_ocfs2_zero_extend_range(
    940			(unsigned long long)OCFS2_I(inode)->ip_blkno,
    941			(unsigned long long)range_start,
    942			(unsigned long long)range_end);
    943	BUG_ON(range_start >= range_end);
    944
    945	while (zero_pos < range_end) {
    946		next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE;
    947		if (next_pos > range_end)
    948			next_pos = range_end;
    949		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
    950		if (rc < 0) {
    951			mlog_errno(rc);
    952			break;
    953		}
    954		zero_pos = next_pos;
    955
    956		/*
    957		 * Very large extends have the potential to lock up
    958		 * the cpu for extended periods of time.
    959		 */
    960		cond_resched();
    961	}
    962
    963	return rc;
    964}
    965
    966int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
    967		      loff_t zero_to_size)
    968{
    969	int ret = 0;
    970	u64 zero_start, range_start = 0, range_end = 0;
    971	struct super_block *sb = inode->i_sb;
    972
    973	zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
    974	trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
    975				(unsigned long long)zero_start,
    976				(unsigned long long)i_size_read(inode));
    977	while (zero_start < zero_to_size) {
    978		ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
    979						  zero_to_size,
    980						  &range_start,
    981						  &range_end);
    982		if (ret) {
    983			mlog_errno(ret);
    984			break;
    985		}
    986		if (!range_end)
    987			break;
    988		/* Trim the ends */
    989		if (range_start < zero_start)
    990			range_start = zero_start;
    991		if (range_end > zero_to_size)
    992			range_end = zero_to_size;
    993
    994		ret = ocfs2_zero_extend_range(inode, range_start,
    995					      range_end, di_bh);
    996		if (ret) {
    997			mlog_errno(ret);
    998			break;
    999		}
   1000		zero_start = range_end;
   1001	}
   1002
   1003	return ret;
   1004}
   1005
   1006int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
   1007			  u64 new_i_size, u64 zero_to)
   1008{
   1009	int ret;
   1010	u32 clusters_to_add;
   1011	struct ocfs2_inode_info *oi = OCFS2_I(inode);
   1012
   1013	/*
   1014	 * Only quota files call this without a bh, and they can't be
   1015	 * refcounted.
   1016	 */
   1017	BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode));
   1018	BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
   1019
   1020	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
   1021	if (clusters_to_add < oi->ip_clusters)
   1022		clusters_to_add = 0;
   1023	else
   1024		clusters_to_add -= oi->ip_clusters;
   1025
   1026	if (clusters_to_add) {
   1027		ret = ocfs2_extend_allocation(inode, oi->ip_clusters,
   1028					      clusters_to_add, 0);
   1029		if (ret) {
   1030			mlog_errno(ret);
   1031			goto out;
   1032		}
   1033	}
   1034
   1035	/*
   1036	 * Call this even if we don't add any clusters to the tree. We
   1037	 * still need to zero the area between the old i_size and the
   1038	 * new i_size.
   1039	 */
   1040	ret = ocfs2_zero_extend(inode, di_bh, zero_to);
   1041	if (ret < 0)
   1042		mlog_errno(ret);
   1043
   1044out:
   1045	return ret;
   1046}
   1047
   1048static int ocfs2_extend_file(struct inode *inode,
   1049			     struct buffer_head *di_bh,
   1050			     u64 new_i_size)
   1051{
   1052	int ret = 0;
   1053	struct ocfs2_inode_info *oi = OCFS2_I(inode);
   1054
   1055	BUG_ON(!di_bh);
   1056
   1057	/* setattr sometimes calls us like this. */
   1058	if (new_i_size == 0)
   1059		goto out;
   1060
   1061	if (i_size_read(inode) == new_i_size)
   1062		goto out;
   1063	BUG_ON(new_i_size < i_size_read(inode));
   1064
   1065	/*
   1066	 * The alloc sem blocks people in read/write from reading our
   1067	 * allocation until we're done changing it. We depend on
   1068	 * i_rwsem to block other extend/truncate calls while we're
   1069	 * here.  We even have to hold it for sparse files because there
   1070	 * might be some tail zeroing.
   1071	 */
   1072	down_write(&oi->ip_alloc_sem);
   1073
   1074	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
   1075		/*
    1076		 * We can optimize small extends by keeping the inode's
    1077		 * inline data.
   1078		 */
   1079		if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
   1080			up_write(&oi->ip_alloc_sem);
   1081			goto out_update_size;
   1082		}
   1083
   1084		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
   1085		if (ret) {
   1086			up_write(&oi->ip_alloc_sem);
   1087			mlog_errno(ret);
   1088			goto out;
   1089		}
   1090	}
   1091
   1092	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
   1093		ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
   1094	else
   1095		ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
   1096					    new_i_size);
   1097
   1098	up_write(&oi->ip_alloc_sem);
   1099
   1100	if (ret < 0) {
   1101		mlog_errno(ret);
   1102		goto out;
   1103	}
   1104
   1105out_update_size:
   1106	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
   1107	if (ret < 0)
   1108		mlog_errno(ret);
   1109
   1110out:
   1111	return ret;
   1112}
   1113
   1114int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
   1115		  struct iattr *attr)
   1116{
   1117	int status = 0, size_change;
   1118	int inode_locked = 0;
   1119	struct inode *inode = d_inode(dentry);
   1120	struct super_block *sb = inode->i_sb;
   1121	struct ocfs2_super *osb = OCFS2_SB(sb);
   1122	struct buffer_head *bh = NULL;
   1123	handle_t *handle = NULL;
   1124	struct dquot *transfer_to[MAXQUOTAS] = { };
   1125	int qtype;
   1126	int had_lock;
   1127	struct ocfs2_lock_holder oh;
   1128
   1129	trace_ocfs2_setattr(inode, dentry,
   1130			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
   1131			    dentry->d_name.len, dentry->d_name.name,
   1132			    attr->ia_valid, attr->ia_mode,
   1133			    from_kuid(&init_user_ns, attr->ia_uid),
   1134			    from_kgid(&init_user_ns, attr->ia_gid));
   1135
   1136	/* ensuring we don't even attempt to truncate a symlink */
   1137	if (S_ISLNK(inode->i_mode))
   1138		attr->ia_valid &= ~ATTR_SIZE;
   1139
   1140#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
   1141			   | ATTR_GID | ATTR_UID | ATTR_MODE)
   1142	if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
   1143		return 0;
   1144
   1145	status = setattr_prepare(&init_user_ns, dentry, attr);
   1146	if (status)
   1147		return status;
   1148
   1149	if (is_quota_modification(inode, attr)) {
   1150		status = dquot_initialize(inode);
   1151		if (status)
   1152			return status;
   1153	}
   1154	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
   1155	if (size_change) {
   1156		/*
    1157		 * Here we should wait for dio to finish before taking the
    1158		 * inode lock to avoid a deadlock between ocfs2_setattr()
    1159		 * and ocfs2_dio_end_io_write().
    1160		 */
   1161		inode_dio_wait(inode);
   1162
   1163		status = ocfs2_rw_lock(inode, 1);
   1164		if (status < 0) {
   1165			mlog_errno(status);
   1166			goto bail;
   1167		}
   1168	}
   1169
   1170	had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
   1171	if (had_lock < 0) {
   1172		status = had_lock;
   1173		goto bail_unlock_rw;
   1174	} else if (had_lock) {
   1175		/*
    1176		 * As far as we know, ocfs2_setattr() could only be the first
    1177		 * VFS entry point in a call chain that triggers the recursive
    1178		 * cluster locking issue.
   1179		 *
   1180		 * For instance:
   1181		 * chmod_common()
   1182		 *  notify_change()
   1183		 *   ocfs2_setattr()
   1184		 *    posix_acl_chmod()
   1185		 *     ocfs2_iop_get_acl()
   1186		 *
   1187		 * But, we're not 100% sure if it's always true, because the
   1188		 * ordering of the VFS entry points in the call chain is out
   1189		 * of our control. So, we'd better dump the stack here to
   1190		 * catch the other cases of recursive locking.
   1191		 */
   1192		mlog(ML_ERROR, "Another case of recursive locking:\n");
   1193		dump_stack();
   1194	}
   1195	inode_locked = 1;
   1196
   1197	if (size_change) {
   1198		status = inode_newsize_ok(inode, attr->ia_size);
   1199		if (status)
   1200			goto bail_unlock;
   1201
   1202		if (i_size_read(inode) >= attr->ia_size) {
   1203			if (ocfs2_should_order_data(inode)) {
   1204				status = ocfs2_begin_ordered_truncate(inode,
   1205								      attr->ia_size);
   1206				if (status)
   1207					goto bail_unlock;
   1208			}
   1209			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
   1210		} else
   1211			status = ocfs2_extend_file(inode, bh, attr->ia_size);
   1212		if (status < 0) {
   1213			if (status != -ENOSPC)
   1214				mlog_errno(status);
   1215			status = -ENOSPC;
   1216			goto bail_unlock;
   1217		}
   1218	}
   1219
   1220	if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
   1221	    (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
   1222		/*
   1223		 * Gather pointers to quota structures so that allocation /
   1224		 * freeing of quota structures happens here and not inside
   1225		 * dquot_transfer() where we have problems with lock ordering
   1226		 */
   1227		if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)
   1228		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
   1229		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
   1230			transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
   1231			if (IS_ERR(transfer_to[USRQUOTA])) {
   1232				status = PTR_ERR(transfer_to[USRQUOTA]);
   1233				transfer_to[USRQUOTA] = NULL;
   1234				goto bail_unlock;
   1235			}
   1236		}
   1237		if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)
   1238		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
   1239		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
   1240			transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
   1241			if (IS_ERR(transfer_to[GRPQUOTA])) {
   1242				status = PTR_ERR(transfer_to[GRPQUOTA]);
   1243				transfer_to[GRPQUOTA] = NULL;
   1244				goto bail_unlock;
   1245			}
   1246		}
   1247		down_write(&OCFS2_I(inode)->ip_alloc_sem);
   1248		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
   1249					   2 * ocfs2_quota_trans_credits(sb));
   1250		if (IS_ERR(handle)) {
   1251			status = PTR_ERR(handle);
   1252			mlog_errno(status);
   1253			goto bail_unlock_alloc;
   1254		}
   1255		status = __dquot_transfer(inode, transfer_to);
   1256		if (status < 0)
   1257			goto bail_commit;
   1258	} else {
   1259		down_write(&OCFS2_I(inode)->ip_alloc_sem);
   1260		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
   1261		if (IS_ERR(handle)) {
   1262			status = PTR_ERR(handle);
   1263			mlog_errno(status);
   1264			goto bail_unlock_alloc;
   1265		}
   1266	}
   1267
   1268	setattr_copy(&init_user_ns, inode, attr);
   1269	mark_inode_dirty(inode);
   1270
   1271	status = ocfs2_mark_inode_dirty(handle, inode, bh);
   1272	if (status < 0)
   1273		mlog_errno(status);
   1274
   1275bail_commit:
   1276	ocfs2_commit_trans(osb, handle);
   1277bail_unlock_alloc:
   1278	up_write(&OCFS2_I(inode)->ip_alloc_sem);
   1279bail_unlock:
   1280	if (status && inode_locked) {
   1281		ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
   1282		inode_locked = 0;
   1283	}
   1284bail_unlock_rw:
   1285	if (size_change)
   1286		ocfs2_rw_unlock(inode, 1);
   1287bail:
   1288
   1289	/* Release quota pointers in case we acquired them */
   1290	for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
   1291		dqput(transfer_to[qtype]);
   1292
   1293	if (!status && attr->ia_valid & ATTR_MODE) {
   1294		status = ocfs2_acl_chmod(inode, bh);
   1295		if (status < 0)
   1296			mlog_errno(status);
   1297	}
   1298	if (inode_locked)
   1299		ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
   1300
   1301	brelse(bh);
   1302	return status;
   1303}
   1304
   1305int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path,
   1306		  struct kstat *stat, u32 request_mask, unsigned int flags)
   1307{
   1308	struct inode *inode = d_inode(path->dentry);
   1309	struct super_block *sb = path->dentry->d_sb;
   1310	struct ocfs2_super *osb = sb->s_fs_info;
   1311	int err;
   1312
   1313	err = ocfs2_inode_revalidate(path->dentry);
   1314	if (err) {
   1315		if (err != -ENOENT)
   1316			mlog_errno(err);
   1317		goto bail;
   1318	}
   1319
   1320	generic_fillattr(&init_user_ns, inode, stat);
   1321	/*
   1322	 * If there is inline data in the inode, the inode will normally not
   1323	 * have data blocks allocated (it may have an external xattr block).
   1324	 * Report at least one sector for such files, so tools like tar, rsync,
    1325	 * and others don't incorrectly think the file is completely sparse.
   1326	 */
   1327	if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
   1328		stat->blocks += (stat->size + 511)>>9;
   1329
   1330	/* We set the blksize from the cluster size for performance */
   1331	stat->blksize = osb->s_clustersize;
   1332
   1333bail:
   1334	return err;
   1335}
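
/*
 * Example for the inline-data fixup above: a 700-byte file stored
 * entirely in the dinode has no data blocks allocated, so
 * generic_fillattr() would report 0 blocks; the (700 + 511) >> 9 == 2
 * sector correction keeps "du"-style tools from treating the file as
 * fully sparse.
 */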
   1336
   1337int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode,
   1338		     int mask)
   1339{
   1340	int ret, had_lock;
   1341	struct ocfs2_lock_holder oh;
   1342
   1343	if (mask & MAY_NOT_BLOCK)
   1344		return -ECHILD;
   1345
   1346	had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh);
   1347	if (had_lock < 0) {
   1348		ret = had_lock;
   1349		goto out;
   1350	} else if (had_lock) {
   1351		/* See comments in ocfs2_setattr() for details.
   1352		 * The call chain of this case could be:
   1353		 * do_sys_open()
   1354		 *  may_open()
   1355		 *   inode_permission()
   1356		 *    ocfs2_permission()
   1357		 *     ocfs2_iop_get_acl()
   1358		 */
   1359		mlog(ML_ERROR, "Another case of recursive locking:\n");
   1360		dump_stack();
   1361	}
   1362
   1363	ret = generic_permission(&init_user_ns, inode, mask);
   1364
   1365	ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
   1366out:
   1367	return ret;
   1368}
   1369
   1370static int __ocfs2_write_remove_suid(struct inode *inode,
   1371				     struct buffer_head *bh)
   1372{
   1373	int ret;
   1374	handle_t *handle;
   1375	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
   1376	struct ocfs2_dinode *di;
   1377
   1378	trace_ocfs2_write_remove_suid(
   1379			(unsigned long long)OCFS2_I(inode)->ip_blkno,
   1380			inode->i_mode);
   1381
   1382	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
   1383	if (IS_ERR(handle)) {
   1384		ret = PTR_ERR(handle);
   1385		mlog_errno(ret);
   1386		goto out;
   1387	}
   1388
   1389	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
   1390				      OCFS2_JOURNAL_ACCESS_WRITE);
   1391	if (ret < 0) {
   1392		mlog_errno(ret);
   1393		goto out_trans;
   1394	}
   1395
   1396	inode->i_mode &= ~S_ISUID;
   1397	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
   1398		inode->i_mode &= ~S_ISGID;
   1399
   1400	di = (struct ocfs2_dinode *) bh->b_data;
   1401	di->i_mode = cpu_to_le16(inode->i_mode);
   1402	ocfs2_update_inode_fsync_trans(handle, inode, 0);
   1403
   1404	ocfs2_journal_dirty(handle, bh);
   1405
   1406out_trans:
   1407	ocfs2_commit_trans(osb, handle);
   1408out:
   1409	return ret;
   1410}
   1411
   1412static int ocfs2_write_remove_suid(struct inode *inode)
   1413{
   1414	int ret;
   1415	struct buffer_head *bh = NULL;
   1416
   1417	ret = ocfs2_read_inode_block(inode, &bh);
   1418	if (ret < 0) {
   1419		mlog_errno(ret);
   1420		goto out;
   1421	}
   1422
   1423	ret =  __ocfs2_write_remove_suid(inode, bh);
   1424out:
   1425	brelse(bh);
   1426	return ret;
   1427}
   1428
   1429/*
   1430 * Allocate enough extents to cover the region starting at byte offset
    1431 * start for len bytes. Existing extents are skipped; any extents
   1432 * added are marked as "unwritten".
   1433 */
   1434static int ocfs2_allocate_unwritten_extents(struct inode *inode,
   1435					    u64 start, u64 len)
   1436{
   1437	int ret;
   1438	u32 cpos, phys_cpos, clusters, alloc_size;
   1439	u64 end = start + len;
   1440	struct buffer_head *di_bh = NULL;
   1441
   1442	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
   1443		ret = ocfs2_read_inode_block(inode, &di_bh);
   1444		if (ret) {
   1445			mlog_errno(ret);
   1446			goto out;
   1447		}
   1448
   1449		/*
   1450		 * Nothing to do if the requested reservation range
   1451		 * fits within the inode.
   1452		 */
   1453		if (ocfs2_size_fits_inline_data(di_bh, end))
   1454			goto out;
   1455
   1456		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
   1457		if (ret) {
   1458			mlog_errno(ret);
   1459			goto out;
   1460		}
   1461	}
   1462
   1463	/*
   1464	 * We consider both start and len to be inclusive.
   1465	 */
   1466	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
   1467	clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
   1468	clusters -= cpos;
   1469
   1470	while (clusters) {
   1471		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
   1472					 &alloc_size, NULL);
   1473		if (ret) {
   1474			mlog_errno(ret);
   1475			goto out;
   1476		}
   1477
   1478		/*
   1479		 * Hole or existing extent len can be arbitrary, so
   1480		 * cap it to our own allocation request.
   1481		 */
   1482		if (alloc_size > clusters)
   1483			alloc_size = clusters;
   1484
   1485		if (phys_cpos) {
   1486			/*
   1487			 * We already have an allocation at this
   1488			 * region so we can safely skip it.
   1489			 */
   1490			goto next;
   1491		}
   1492
   1493		ret = ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
   1494		if (ret) {
   1495			if (ret != -ENOSPC)
   1496				mlog_errno(ret);
   1497			goto out;
   1498		}
   1499
   1500next:
   1501		cpos += alloc_size;
   1502		clusters -= alloc_size;
   1503	}
   1504
   1505	ret = 0;
   1506out:
   1507
   1508	brelse(di_bh);
   1509	return ret;
   1510}
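
/*
 * Usage sketch: preallocating 10 clusters over a file that already
 * has clusters [2, 4) allocated results in two unwritten allocations,
 * [0, 2) and [4, 10); the existing extent is skipped via the "next"
 * label, and each gap is capped to the remaining request before
 * calling ocfs2_extend_allocation(..., 1).
 */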
   1511
   1512/*
   1513 * Truncate a byte range, avoiding pages within partial clusters. This
   1514 * preserves those pages for the zeroing code to write to.
   1515 */
   1516static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
   1517					 u64 byte_len)
   1518{
   1519	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
   1520	loff_t start, end;
   1521	struct address_space *mapping = inode->i_mapping;
   1522
   1523	start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
   1524	end = byte_start + byte_len;
   1525	end = end & ~(osb->s_clustersize - 1);
   1526
   1527	if (start < end) {
   1528		unmap_mapping_range(mapping, start, end - start, 0);
   1529		truncate_inode_pages_range(mapping, start, end - 1);
   1530	}
   1531}
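
/*
 * Example with a 32KB cluster size: punching byte_start = 10KB,
 * byte_len = 100KB rounds start up to 32KB and end down to 96KB, so
 * only pages of fully covered clusters are unmapped and truncated;
 * the partial pages at both ends are kept for
 * ocfs2_zero_partial_clusters() to write zeros into.
 */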
   1532
   1533/*
    1534 * Zero out partial blocks of one cluster.
    1535 *
    1536 * start: file offset where zeroing starts; it will be rounded up to block
    1537 *        alignment.
    1538 * len: trimmed to the end of the current cluster if "start + len" goes past it.
   1539 */
   1540static int ocfs2_zeroout_partial_cluster(struct inode *inode,
   1541					u64 start, u64 len)
   1542{
   1543	int ret;
   1544	u64 start_block, end_block, nr_blocks;
   1545	u64 p_block, offset;
   1546	u32 cluster, p_cluster, nr_clusters;
   1547	struct super_block *sb = inode->i_sb;
   1548	u64 end = ocfs2_align_bytes_to_clusters(sb, start);
   1549
   1550	if (start + len < end)
   1551		end = start + len;
   1552
   1553	start_block = ocfs2_blocks_for_bytes(sb, start);
   1554	end_block = ocfs2_blocks_for_bytes(sb, end);
   1555	nr_blocks = end_block - start_block;
   1556	if (!nr_blocks)
   1557		return 0;
   1558
   1559	cluster = ocfs2_bytes_to_clusters(sb, start);
   1560	ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
   1561				&nr_clusters, NULL);
   1562	if (ret)
   1563		return ret;
   1564	if (!p_cluster)
   1565		return 0;
   1566
   1567	offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
   1568	p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
   1569	return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
   1570}
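
/*
 * Example with 4KB blocks and 32KB clusters: start = 0x9000 and
 * len = 0x20000 give end = 0x10000 (start rounded up to its cluster
 * boundary, which start + len already passes), so blocks 9..15 of
 * that cluster are zeroed on disk through sb_issue_zeroout(),
 * bypassing the page cache entirely.
 */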
   1571
   1572static int ocfs2_zero_partial_clusters(struct inode *inode,
   1573				       u64 start, u64 len)
   1574{
   1575	int ret = 0;
   1576	u64 tmpend = 0;
   1577	u64 end = start + len;
   1578	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
   1579	unsigned int csize = osb->s_clustersize;
   1580	handle_t *handle;
   1581	loff_t isize = i_size_read(inode);
   1582
   1583	/*
   1584	 * The "start" and "end" values are NOT necessarily part of
   1585	 * the range whose allocation is being deleted. Rather, this
   1586	 * is what the user passed in with the request. We must zero
   1587	 * partial clusters here. There's no need to worry about
   1588	 * physical allocation - the zeroing code knows to skip holes.
   1589	 */
   1590	trace_ocfs2_zero_partial_clusters(
   1591		(unsigned long long)OCFS2_I(inode)->ip_blkno,
   1592		(unsigned long long)start, (unsigned long long)end);
   1593
   1594	/*
   1595	 * If both edges are on a cluster boundary then there's no
   1596	 * zeroing required as the region is part of the allocation to
   1597	 * be truncated.
   1598	 */
   1599	if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
   1600		goto out;
   1601
   1602	/* No page cache for EOF blocks, issue zero out to disk. */
   1603	if (end > isize) {
   1604		/*
    1605		 * Zero out eof blocks in the last cluster starting from
    1606		 * "isize", even if "start" > "isize", because it is
    1607		 * complicated to zero out just at "start": "start" may
    1608		 * not be aligned with the block size, a buffer write
    1609		 * would be required for that, and out-of-eof buffer
    1610		 * writes are not supported.
   1611		 */
   1612		ret = ocfs2_zeroout_partial_cluster(inode, isize,
   1613					end - isize);
   1614		if (ret) {
   1615			mlog_errno(ret);
   1616			goto out;
   1617		}
   1618		if (start >= isize)
   1619			goto out;
   1620		end = isize;
   1621	}
   1622	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
   1623	if (IS_ERR(handle)) {
   1624		ret = PTR_ERR(handle);
   1625		mlog_errno(ret);
   1626		goto out;
   1627	}
   1628
   1629	/*
   1630	 * If start is on a cluster boundary and end is somewhere in another
   1631	 * cluster, we have not COWed the cluster starting at start, unless
   1632	 * end is also within the same cluster. So, in this case, we skip this
    1633	 * first call to ocfs2_zero_range_for_truncate() and move on
   1634	 * to the next one.
   1635	 */
   1636	if ((start & (csize - 1)) != 0) {
   1637		/*
   1638		 * We want to get the byte offset of the end of the 1st
   1639		 * cluster.
   1640		 */
   1641		tmpend = (u64)osb->s_clustersize +
   1642			(start & ~(osb->s_clustersize - 1));
   1643		if (tmpend > end)
   1644			tmpend = end;
   1645
   1646		trace_ocfs2_zero_partial_clusters_range1(
   1647			(unsigned long long)start,
   1648			(unsigned long long)tmpend);
   1649
   1650		ret = ocfs2_zero_range_for_truncate(inode, handle, start,
   1651						    tmpend);
   1652		if (ret)
   1653			mlog_errno(ret);
   1654	}
   1655
   1656	if (tmpend < end) {
   1657		/*
   1658		 * This may make start and end equal, but the zeroing
   1659		 * code will skip any work in that case so there's no
   1660		 * need to catch it up here.
   1661		 */
   1662		start = end & ~(osb->s_clustersize - 1);
   1663
   1664		trace_ocfs2_zero_partial_clusters_range2(
   1665			(unsigned long long)start, (unsigned long long)end);
   1666
   1667		ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
   1668		if (ret)
   1669			mlog_errno(ret);
   1670	}
   1671	ocfs2_update_inode_fsync_trans(handle, inode, 1);
   1672
   1673	ocfs2_commit_trans(osb, handle);
   1674out:
   1675	return ret;
   1676}
   1677
   1678static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
   1679{
   1680	int i;
   1681	struct ocfs2_extent_rec *rec = NULL;
   1682
   1683	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
   1684
   1685		rec = &el->l_recs[i];
   1686
   1687		if (le32_to_cpu(rec->e_cpos) < pos)
   1688			break;
   1689	}
   1690
   1691	return i;
   1692}
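
/*
 * In other words: the backwards scan returns the index of the last
 * record whose e_cpos lies strictly below pos, or -1 when pos sits in
 * front of every record in the list.
 */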
   1693
   1694/*
    1695 * Helper to calculate the punching position and length in one run. We
    1696 * handle the following three cases in order:
    1697 *
    1698 * - remove the entire record
    1699 * - remove a partial record
    1700 * - no record needs to be removed (hole-punching completed)
    1701 */
   1702static void ocfs2_calc_trunc_pos(struct inode *inode,
   1703				 struct ocfs2_extent_list *el,
   1704				 struct ocfs2_extent_rec *rec,
   1705				 u32 trunc_start, u32 *trunc_cpos,
   1706				 u32 *trunc_len, u32 *trunc_end,
   1707				 u64 *blkno, int *done)
   1708{
   1709	int ret = 0;
   1710	u32 coff, range;
   1711
   1712	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
   1713
   1714	if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
   1715		/*
   1716		 * remove an entire extent record.
   1717		 */
   1718		*trunc_cpos = le32_to_cpu(rec->e_cpos);
   1719		/*
   1720		 * Skip holes if any.
   1721		 */
   1722		if (range < *trunc_end)
   1723			*trunc_end = range;
   1724		*trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
   1725		*blkno = le64_to_cpu(rec->e_blkno);
   1726		*trunc_end = le32_to_cpu(rec->e_cpos);
   1727	} else if (range > trunc_start) {
   1728		/*
   1729		 * remove a partial extent record, which means we're
   1730		 * removing the last extent record.
   1731		 */
   1732		*trunc_cpos = trunc_start;
   1733		/*
   1734		 * skip hole if any.
   1735		 */
   1736		if (range < *trunc_end)
   1737			*trunc_end = range;
   1738		*trunc_len = *trunc_end - trunc_start;
   1739		coff = trunc_start - le32_to_cpu(rec->e_cpos);
   1740		*blkno = le64_to_cpu(rec->e_blkno) +
   1741				ocfs2_clusters_to_blocks(inode->i_sb, coff);
   1742		*trunc_end = trunc_start;
   1743	} else {
		/*
		 * There are two possibilities:
		 *
		 * - the last record has been removed
		 * - trunc_start was within a hole
		 *
		 * Either case means hole punching is complete.
		 */
   1752		ret = 1;
   1753	}
   1754
   1755	*done = ret;
   1756}
   1757
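/*
 * Punch a hole of @byte_len bytes starting at @byte_start. Inline
 * data is simply truncated in the dinode; otherwise the partial edge
 * clusters are zeroed (and CoWed first on refcounted files) and every
 * whole cluster in the range is removed from the extent tree, walking
 * the leaves right to left.
 */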
   1758int ocfs2_remove_inode_range(struct inode *inode,
   1759			     struct buffer_head *di_bh, u64 byte_start,
   1760			     u64 byte_len)
   1761{
   1762	int ret = 0, flags = 0, done = 0, i;
   1763	u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
   1764	u32 cluster_in_el;
   1765	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
   1766	struct ocfs2_cached_dealloc_ctxt dealloc;
   1767	struct address_space *mapping = inode->i_mapping;
   1768	struct ocfs2_extent_tree et;
   1769	struct ocfs2_path *path = NULL;
   1770	struct ocfs2_extent_list *el = NULL;
   1771	struct ocfs2_extent_rec *rec = NULL;
   1772	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
   1773	u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
   1774
   1775	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
   1776	ocfs2_init_dealloc_ctxt(&dealloc);
   1777
   1778	trace_ocfs2_remove_inode_range(
   1779			(unsigned long long)OCFS2_I(inode)->ip_blkno,
   1780			(unsigned long long)byte_start,
   1781			(unsigned long long)byte_len);
   1782
   1783	if (byte_len == 0)
   1784		return 0;
   1785
   1786	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
   1787		ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
   1788					    byte_start + byte_len, 0);
   1789		if (ret) {
   1790			mlog_errno(ret);
   1791			goto out;
   1792		}
   1793		/*
   1794		 * There's no need to get fancy with the page cache
   1795		 * truncate of an inline-data inode. We're talking
   1796		 * about less than a page here, which will be cached
   1797		 * in the dinode buffer anyway.
   1798		 */
   1799		unmap_mapping_range(mapping, 0, 0, 0);
   1800		truncate_inode_pages(mapping, 0);
   1801		goto out;
   1802	}
   1803
	/*
	 * For reflinked files, we may need to CoW the two boundary
	 * clusters, which might be partially zeroed later, if the hole's
	 * start or end offset falls within a cluster (i.e. is not
	 * exactly aligned to the cluster size).
	 */
   1809
   1810	if (ocfs2_is_refcount_inode(inode)) {
   1811		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
   1812		if (ret) {
   1813			mlog_errno(ret);
   1814			goto out;
   1815		}
   1816
   1817		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
   1818		if (ret) {
   1819			mlog_errno(ret);
   1820			goto out;
   1821		}
   1822	}
   1823
   1824	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
   1825	trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
   1826	cluster_in_el = trunc_end;
   1827
   1828	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
   1829	if (ret) {
   1830		mlog_errno(ret);
   1831		goto out;
   1832	}
   1833
   1834	path = ocfs2_new_path_from_et(&et);
   1835	if (!path) {
   1836		ret = -ENOMEM;
   1837		mlog_errno(ret);
   1838		goto out;
   1839	}
   1840
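	/*
	 * Walk the extent tree from the tail of the range toward
	 * trunc_start, removing one extent record (or the tail end of
	 * one) per iteration.
	 */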
   1841	while (trunc_end > trunc_start) {
   1842
   1843		ret = ocfs2_find_path(INODE_CACHE(inode), path,
   1844				      cluster_in_el);
   1845		if (ret) {
   1846			mlog_errno(ret);
   1847			goto out;
   1848		}
   1849
   1850		el = path_leaf_el(path);
   1851
   1852		i = ocfs2_find_rec(el, trunc_end);
   1853		/*
   1854		 * Need to go to previous extent block.
   1855		 */
   1856		if (i < 0) {
   1857			if (path->p_tree_depth == 0)
   1858				break;
   1859
   1860			ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
   1861							    path,
   1862							    &cluster_in_el);
   1863			if (ret) {
   1864				mlog_errno(ret);
   1865				goto out;
   1866			}
   1867
			/*
			 * We've reached the leftmost extent block;
			 * it's safe to stop.
			 */
   1872			if (cluster_in_el == 0)
   1873				break;
   1874
			/*
			 * The cpos found for the previous extent block
			 * is always one cluster less than the actual
			 * trunc_end, hence the increment below.
			 */
   1879			trunc_end = cluster_in_el + 1;
   1880
   1881			ocfs2_reinit_path(path, 1);
   1882
   1883			continue;
   1884
		} else {
			rec = &el->l_recs[i];
		}
   1887
   1888		ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
   1889				     &trunc_len, &trunc_end, &blkno, &done);
   1890		if (done)
   1891			break;
   1892
   1893		flags = rec->e_flags;
   1894		phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
   1895
   1896		ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
   1897					       phys_cpos, trunc_len, flags,
   1898					       &dealloc, refcount_loc, false);
   1899		if (ret < 0) {
   1900			mlog_errno(ret);
   1901			goto out;
   1902		}
   1903
   1904		cluster_in_el = trunc_end;
   1905
   1906		ocfs2_reinit_path(path, 1);
   1907	}
   1908
   1909	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
   1910
   1911out:
   1912	ocfs2_free_path(path);
   1913	ocfs2_schedule_truncate_log_flush(osb, 1);
   1914	ocfs2_run_deallocs(osb, &dealloc);
   1915
   1916	return ret;
   1917}
   1918
   1919/*
   1920 * Parts of this function taken from xfs_change_file_space()
   1921 */
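/*
 * Common implementation of the space reservation ioctls and
 * fallocate(): validate the request, then take inode_lock, the rw
 * lock, the cluster inode lock and ip_alloc_sem, in that order,
 * before allocating unwritten extents or punching the requested
 * range. i_size and c/mtime are updated afterwards as needed.
 */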
   1922static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
   1923				     loff_t f_pos, unsigned int cmd,
   1924				     struct ocfs2_space_resv *sr,
   1925				     int change_size)
   1926{
   1927	int ret;
   1928	s64 llen;
   1929	loff_t size, orig_isize;
   1930	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
   1931	struct buffer_head *di_bh = NULL;
   1932	handle_t *handle;
   1933	unsigned long long max_off = inode->i_sb->s_maxbytes;
   1934
   1935	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
   1936		return -EROFS;
   1937
   1938	inode_lock(inode);
   1939
   1940	/*
   1941	 * This prevents concurrent writes on other nodes
   1942	 */
   1943	ret = ocfs2_rw_lock(inode, 1);
   1944	if (ret) {
   1945		mlog_errno(ret);
   1946		goto out;
   1947	}
   1948
   1949	ret = ocfs2_inode_lock(inode, &di_bh, 1);
   1950	if (ret) {
   1951		mlog_errno(ret);
   1952		goto out_rw_unlock;
   1953	}
   1954
   1955	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
   1956		ret = -EPERM;
   1957		goto out_inode_unlock;
   1958	}
   1959
   1960	switch (sr->l_whence) {
   1961	case 0: /*SEEK_SET*/
   1962		break;
   1963	case 1: /*SEEK_CUR*/
   1964		sr->l_start += f_pos;
   1965		break;
   1966	case 2: /*SEEK_END*/
   1967		sr->l_start += i_size_read(inode);
   1968		break;
   1969	default:
   1970		ret = -EINVAL;
   1971		goto out_inode_unlock;
   1972	}
   1973	sr->l_whence = 0;
   1974
   1975	llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
   1976
   1977	if (sr->l_start < 0
   1978	    || sr->l_start > max_off
   1979	    || (sr->l_start + llen) < 0
   1980	    || (sr->l_start + llen) > max_off) {
   1981		ret = -EINVAL;
   1982		goto out_inode_unlock;
   1983	}
   1984	size = sr->l_start + sr->l_len;
   1985
   1986	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
   1987	    cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
   1988		if (sr->l_len <= 0) {
   1989			ret = -EINVAL;
   1990			goto out_inode_unlock;
   1991		}
   1992	}
   1993
   1994	if (file && should_remove_suid(file->f_path.dentry)) {
   1995		ret = __ocfs2_write_remove_suid(inode, di_bh);
   1996		if (ret) {
   1997			mlog_errno(ret);
   1998			goto out_inode_unlock;
   1999		}
   2000	}
   2001
   2002	down_write(&OCFS2_I(inode)->ip_alloc_sem);
   2003	switch (cmd) {
   2004	case OCFS2_IOC_RESVSP:
   2005	case OCFS2_IOC_RESVSP64:
   2006		/*
   2007		 * This takes unsigned offsets, but the signed ones we
   2008		 * pass have been checked against overflow above.
   2009		 */
   2010		ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
   2011						       sr->l_len);
   2012		break;
   2013	case OCFS2_IOC_UNRESVSP:
   2014	case OCFS2_IOC_UNRESVSP64:
   2015		ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
   2016					       sr->l_len);
   2017		break;
   2018	default:
   2019		ret = -EINVAL;
   2020	}
   2021
   2022	orig_isize = i_size_read(inode);
   2023	/* zeroout eof blocks in the cluster. */
   2024	if (!ret && change_size && orig_isize < size) {
   2025		ret = ocfs2_zeroout_partial_cluster(inode, orig_isize,
   2026					size - orig_isize);
   2027		if (!ret)
   2028			i_size_write(inode, size);
   2029	}
   2030	up_write(&OCFS2_I(inode)->ip_alloc_sem);
   2031	if (ret) {
   2032		mlog_errno(ret);
   2033		goto out_inode_unlock;
   2034	}
   2035
   2036	/*
   2037	 * We update c/mtime for these changes
   2038	 */
   2039	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
   2040	if (IS_ERR(handle)) {
   2041		ret = PTR_ERR(handle);
   2042		mlog_errno(ret);
   2043		goto out_inode_unlock;
   2044	}
   2045
   2046	inode->i_ctime = inode->i_mtime = current_time(inode);
   2047	ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
   2048	if (ret < 0)
   2049		mlog_errno(ret);
   2050
   2051	if (file && (file->f_flags & O_SYNC))
   2052		handle->h_sync = 1;
   2053
   2054	ocfs2_commit_trans(osb, handle);
   2055
   2056out_inode_unlock:
   2057	brelse(di_bh);
   2058	ocfs2_inode_unlock(inode, 1);
   2059out_rw_unlock:
   2060	ocfs2_rw_unlock(inode, 1);
   2061
   2062out:
   2063	inode_unlock(inode);
   2064	return ret;
   2065}
   2066
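/*
 * Entry point for the legacy OCFS2_IOC_RESVSP/OCFS2_IOC_UNRESVSP
 * ioctls (and their 64-bit variants), which predate fallocate(2).
 * The -ENOTTY checks mirror what the superblock feature flags can
 * actually support: unwritten extents for reservation, sparse
 * allocation for unreservation.
 */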
   2067int ocfs2_change_file_space(struct file *file, unsigned int cmd,
   2068			    struct ocfs2_space_resv *sr)
   2069{
   2070	struct inode *inode = file_inode(file);
   2071	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
   2072	int ret;
   2073
   2074	if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
   2075	    !ocfs2_writes_unwritten_extents(osb))
   2076		return -ENOTTY;
   2077	else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
   2078		 !ocfs2_sparse_alloc(osb))
   2079		return -ENOTTY;
   2080
   2081	if (!S_ISREG(inode->i_mode))
   2082		return -EINVAL;
   2083
   2084	if (!(file->f_mode & FMODE_WRITE))
   2085		return -EBADF;
   2086
   2087	ret = mnt_want_write_file(file);
   2088	if (ret)
   2089		return ret;
   2090	ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
   2091	mnt_drop_write_file(file);
   2092	return ret;
   2093}
   2094
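/*
 * fallocate() entry point. FALLOC_FL_PUNCH_HOLE maps onto the
 * UNRESVSP path and plain preallocation onto RESVSP, while
 * FALLOC_FL_KEEP_SIZE merely suppresses the i_size update in
 * __ocfs2_change_file_space().
 *
 * A hypothetical userspace sketch of how this is reached (the VFS
 * requires KEEP_SIZE whenever PUNCH_HOLE is set):
 *
 *	int fd = open("/mnt/ocfs2/data", O_RDWR);
 *	// punch a 1 MiB hole at offset 4 MiB, leaving i_size alone
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *		  4 << 20, 1 << 20);
 */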
   2095static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
   2096			    loff_t len)
   2097{
   2098	struct inode *inode = file_inode(file);
   2099	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
   2100	struct ocfs2_space_resv sr;
   2101	int change_size = 1;
   2102	int cmd = OCFS2_IOC_RESVSP64;
   2103
   2104	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
   2105		return -EOPNOTSUPP;
   2106	if (!ocfs2_writes_unwritten_extents(osb))
   2107		return -EOPNOTSUPP;
   2108
   2109	if (mode & FALLOC_FL_KEEP_SIZE)
   2110		change_size = 0;
   2111
   2112	if (mode & FALLOC_FL_PUNCH_HOLE)
   2113		cmd = OCFS2_IOC_UNRESVSP64;
   2114
   2115	sr.l_whence = 0;
   2116	sr.l_start = (s64)offset;
   2117	sr.l_len = (s64)len;
   2118
   2119	return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
   2120					 change_size);
   2121}
   2122
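/*
 * Returns 1 if any extent in the byte range [pos, pos + count) is
 * marked OCFS2_EXT_REFCOUNTED (i.e. shared, and in need of CoW before
 * being overwritten), 0 if none are, or a negative error code.
 */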
   2123int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
   2124				   size_t count)
   2125{
   2126	int ret = 0;
   2127	unsigned int extent_flags;
   2128	u32 cpos, clusters, extent_len, phys_cpos;
   2129	struct super_block *sb = inode->i_sb;
   2130
   2131	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
   2132	    !ocfs2_is_refcount_inode(inode) ||
   2133	    OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
   2134		return 0;
   2135
   2136	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
   2137	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
   2138
   2139	while (clusters) {
   2140		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
   2141					 &extent_flags);
   2142		if (ret < 0) {
   2143			mlog_errno(ret);
   2144			goto out;
   2145		}
   2146
   2147		if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
   2148			ret = 1;
   2149			break;
   2150		}
   2151
   2152		if (extent_len > clusters)
   2153			extent_len = clusters;
   2154
   2155		clusters -= extent_len;
   2156		cpos += extent_len;
   2157	}
   2158out:
   2159	return ret;
   2160}
   2161
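/*
 * An I/O is "unaligned" when either end of it falls inside a
 * filesystem block. The write path below downgrades unaligned direct
 * AIO to synchronous I/O.
 */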
   2162static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
   2163{
   2164	int blockmask = inode->i_sb->s_blocksize - 1;
   2165	loff_t final_size = pos + count;
   2166
   2167	if ((pos & blockmask) || (final_size & blockmask))
   2168		return 1;
   2169	return 0;
   2170}
   2171
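/*
 * Take the cluster inode lock at @meta_level together with
 * ip_alloc_sem (read or write, per @write_sem). When @wait is zero,
 * trylock variants are used and -EAGAIN is returned on contention so
 * that IOCB_NOWAIT callers can back off.
 */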
   2172static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
   2173					    struct buffer_head **di_bh,
   2174					    int meta_level,
   2175					    int write_sem,
   2176					    int wait)
   2177{
   2178	int ret = 0;
   2179
   2180	if (wait)
   2181		ret = ocfs2_inode_lock(inode, di_bh, meta_level);
   2182	else
   2183		ret = ocfs2_try_inode_lock(inode, di_bh, meta_level);
   2184	if (ret < 0)
   2185		goto out;
   2186
   2187	if (wait) {
   2188		if (write_sem)
   2189			down_write(&OCFS2_I(inode)->ip_alloc_sem);
   2190		else
   2191			down_read(&OCFS2_I(inode)->ip_alloc_sem);
   2192	} else {
   2193		if (write_sem)
   2194			ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
   2195		else
   2196			ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
   2197
   2198		if (!ret) {
   2199			ret = -EAGAIN;
   2200			goto out_unlock;
   2201		}
   2202	}
   2203
   2204	return ret;
   2205
   2206out_unlock:
   2207	brelse(*di_bh);
   2208	*di_bh = NULL;
   2209	ocfs2_inode_unlock(inode, meta_level);
   2210out:
   2211	return ret;
   2212}
   2213
   2214static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
   2215					       struct buffer_head **di_bh,
   2216					       int meta_level,
   2217					       int write_sem)
   2218{
   2219	if (write_sem)
   2220		up_write(&OCFS2_I(inode)->ip_alloc_sem);
   2221	else
   2222		up_read(&OCFS2_I(inode)->ip_alloc_sem);
   2223
   2224	brelse(*di_bh);
   2225	*di_bh = NULL;
   2226
   2227	if (meta_level >= 0)
   2228		ocfs2_inode_unlock(inode, meta_level);
   2229}
   2230
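/*
 * Sort out the cluster-level state before a write: start with a
 * shared meta lock and restart with an exclusive one whenever we must
 * strip suid/sgid bits or CoW refcounted extents in the target range.
 */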
   2231static int ocfs2_prepare_inode_for_write(struct file *file,
   2232					 loff_t pos, size_t count, int wait)
   2233{
   2234	int ret = 0, meta_level = 0, overwrite_io = 0;
   2235	int write_sem = 0;
   2236	struct dentry *dentry = file->f_path.dentry;
   2237	struct inode *inode = d_inode(dentry);
   2238	struct buffer_head *di_bh = NULL;
   2239	u32 cpos;
   2240	u32 clusters;
   2241
   2242	/*
   2243	 * We start with a read level meta lock and only jump to an ex
   2244	 * if we need to make modifications here.
   2245	 */
	for (;;) {
   2247		ret = ocfs2_inode_lock_for_extent_tree(inode,
   2248						       &di_bh,
   2249						       meta_level,
   2250						       write_sem,
   2251						       wait);
   2252		if (ret < 0) {
   2253			if (ret != -EAGAIN)
   2254				mlog_errno(ret);
   2255			goto out;
   2256		}
   2257
   2258		/*
   2259		 * Check if IO will overwrite allocated blocks in case
   2260		 * IOCB_NOWAIT flag is set.
   2261		 */
   2262		if (!wait && !overwrite_io) {
   2263			overwrite_io = 1;
   2264
   2265			ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
   2266			if (ret < 0) {
   2267				if (ret != -EAGAIN)
   2268					mlog_errno(ret);
   2269				goto out_unlock;
   2270			}
   2271		}
   2272
		/* Clear suid / sgid if necessary. We do this here
		 * instead of later in the write path because
		 * remove_suid() calls ->setattr without any hint that
		 * we may have already done our cluster locking. Since
		 * ocfs2_setattr() *must* take cluster locks to
		 * proceed, this would lead us to recursively lock the
		 * inode. There's also the dinode i_size state, which
		 * can be lost via setattr during extending writes (we
		 * set inode->i_size at the end of a write). */
   2282		if (should_remove_suid(dentry)) {
   2283			if (meta_level == 0) {
   2284				ocfs2_inode_unlock_for_extent_tree(inode,
   2285								   &di_bh,
   2286								   meta_level,
   2287								   write_sem);
   2288				meta_level = 1;
   2289				continue;
   2290			}
   2291
   2292			ret = ocfs2_write_remove_suid(inode);
   2293			if (ret < 0) {
   2294				mlog_errno(ret);
   2295				goto out_unlock;
   2296			}
   2297		}
   2298
   2299		ret = ocfs2_check_range_for_refcount(inode, pos, count);
   2300		if (ret == 1) {
   2301			ocfs2_inode_unlock_for_extent_tree(inode,
   2302							   &di_bh,
   2303							   meta_level,
   2304							   write_sem);
   2305			meta_level = 1;
   2306			write_sem = 1;
   2307			ret = ocfs2_inode_lock_for_extent_tree(inode,
   2308							       &di_bh,
   2309							       meta_level,
   2310							       write_sem,
   2311							       wait);
   2312			if (ret < 0) {
   2313				if (ret != -EAGAIN)
   2314					mlog_errno(ret);
   2315				goto out;
   2316			}
   2317
   2318			cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
   2319			clusters =
   2320				ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
   2321			ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
   2322		}
   2323
   2324		if (ret < 0) {
   2325			if (ret != -EAGAIN)
   2326				mlog_errno(ret);
   2327			goto out_unlock;
   2328		}
   2329
   2330		break;
   2331	}
   2332
   2333out_unlock:
   2334	trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
   2335					    pos, count, wait);
   2336
   2337	ocfs2_inode_unlock_for_extent_tree(inode,
   2338					   &di_bh,
   2339					   meta_level,
   2340					   write_sem);
   2341
   2342out:
   2343	return ret;
   2344}
   2345
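/*
 * Write path. rw_level picks the cluster rw lock mode: shared (0) is
 * sufficient only for non-appending O_DIRECT writes under
 * "coherency=buffered"; everything else takes it exclusive (1). For
 * queued async O_DIRECT the lock is released by ocfs2_dio_end_io() at
 * completion rather than here.
 */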
   2346static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
   2347				    struct iov_iter *from)
   2348{
   2349	int rw_level;
   2350	ssize_t written = 0;
   2351	ssize_t ret;
   2352	size_t count = iov_iter_count(from);
   2353	struct file *file = iocb->ki_filp;
   2354	struct inode *inode = file_inode(file);
   2355	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
   2356	int full_coherency = !(osb->s_mount_opt &
   2357			       OCFS2_MOUNT_COHERENCY_BUFFERED);
   2358	void *saved_ki_complete = NULL;
   2359	int append_write = ((iocb->ki_pos + count) >=
   2360			i_size_read(inode) ? 1 : 0);
   2361	int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
   2362	int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
   2363
   2364	trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry,
   2365		(unsigned long long)OCFS2_I(inode)->ip_blkno,
   2366		file->f_path.dentry->d_name.len,
   2367		file->f_path.dentry->d_name.name,
   2368		(unsigned int)from->nr_segs);	/* GRRRRR */
   2369
   2370	if (!direct_io && nowait)
   2371		return -EOPNOTSUPP;
   2372
   2373	if (count == 0)
   2374		return 0;
   2375
   2376	if (nowait) {
   2377		if (!inode_trylock(inode))
   2378			return -EAGAIN;
   2379	} else
   2380		inode_lock(inode);
   2381
	/*
	 * Concurrent O_DIRECT writes are allowed with the mount option
	 * "coherency=buffered". For append writes, we must take the rw
	 * lock EX.
	 */
   2387	rw_level = (!direct_io || full_coherency || append_write);
   2388
   2389	if (nowait)
   2390		ret = ocfs2_try_rw_lock(inode, rw_level);
   2391	else
   2392		ret = ocfs2_rw_lock(inode, rw_level);
   2393	if (ret < 0) {
   2394		if (ret != -EAGAIN)
   2395			mlog_errno(ret);
   2396		goto out_mutex;
   2397	}
   2398
   2399	/*
   2400	 * O_DIRECT writes with "coherency=full" need to take EX cluster
   2401	 * inode_lock to guarantee coherency.
   2402	 */
   2403	if (direct_io && full_coherency) {
   2404		/*
   2405		 * We need to take and drop the inode lock to force
   2406		 * other nodes to drop their caches.  Buffered I/O
   2407		 * already does this in write_begin().
   2408		 */
   2409		if (nowait)
   2410			ret = ocfs2_try_inode_lock(inode, NULL, 1);
   2411		else
   2412			ret = ocfs2_inode_lock(inode, NULL, 1);
   2413		if (ret < 0) {
   2414			if (ret != -EAGAIN)
   2415				mlog_errno(ret);
   2416			goto out;
   2417		}
   2418
   2419		ocfs2_inode_unlock(inode, 1);
   2420	}
   2421
   2422	ret = generic_write_checks(iocb, from);
   2423	if (ret <= 0) {
   2424		if (ret)
   2425			mlog_errno(ret);
   2426		goto out;
   2427	}
   2428	count = ret;
   2429
   2430	ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait);
   2431	if (ret < 0) {
   2432		if (ret != -EAGAIN)
   2433			mlog_errno(ret);
   2434		goto out;
   2435	}
   2436
   2437	if (direct_io && !is_sync_kiocb(iocb) &&
   2438	    ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
		/*
		 * Make it a sync I/O if it's an unaligned AIO.
		 */
   2442		saved_ki_complete = xchg(&iocb->ki_complete, NULL);
   2443	}
   2444
   2445	/* communicate with ocfs2_dio_end_io */
   2446	ocfs2_iocb_set_rw_locked(iocb, rw_level);
   2447
   2448	written = __generic_file_write_iter(iocb, from);
   2449	/* buffered aio wouldn't have proper lock coverage today */
   2450	BUG_ON(written == -EIOCBQUEUED && !direct_io);
   2451
	/*
	 * Deep in __generic_file_write_iter()->ocfs2_direct_IO() we pass in
	 * an ocfs2_dio_end_io completion which is called when O_DIRECT I/O
	 * completes so that it can unlock our rw lock.
	 * Unfortunately there are error cases which call end_io and others
	 * that don't, so we don't have to unlock the rw_lock if either an
	 * async dio is going to do it in the future or an end_io after an
	 * error has already done it.
	 */
   2461	if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
   2462		rw_level = -1;
   2463	}
   2464
   2465	if (unlikely(written <= 0))
   2466		goto out;
   2467
   2468	if (((file->f_flags & O_DSYNC) && !direct_io) ||
   2469	    IS_SYNC(inode)) {
   2470		ret = filemap_fdatawrite_range(file->f_mapping,
   2471					       iocb->ki_pos - written,
   2472					       iocb->ki_pos - 1);
   2473		if (ret < 0)
   2474			written = ret;
   2475
   2476		if (!ret) {
   2477			ret = jbd2_journal_force_commit(osb->journal->j_journal);
   2478			if (ret < 0)
   2479				written = ret;
   2480		}
   2481
   2482		if (!ret)
   2483			ret = filemap_fdatawait_range(file->f_mapping,
   2484						      iocb->ki_pos - written,
   2485						      iocb->ki_pos - 1);
   2486	}
   2487
   2488out:
   2489	if (saved_ki_complete)
   2490		xchg(&iocb->ki_complete, saved_ki_complete);
   2491
   2492	if (rw_level != -1)
   2493		ocfs2_rw_unlock(inode, rw_level);
   2494
   2495out_mutex:
   2496	inode_unlock(inode);
   2497
   2498	if (written)
   2499		ret = written;
   2500	return ret;
   2501}
   2502
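/*
 * Read path. Buffered reads rely on ->read_folio() locking, so only
 * O_DIRECT reads take the (shared) cluster rw lock here; the inode
 * meta lock is taken and dropped once to refresh fields like i_size
 * and to handle atime.
 */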
   2503static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
   2504				   struct iov_iter *to)
   2505{
   2506	int ret = 0, rw_level = -1, lock_level = 0;
   2507	struct file *filp = iocb->ki_filp;
   2508	struct inode *inode = file_inode(filp);
   2509	int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
   2510	int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
   2511
   2512	trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry,
   2513			(unsigned long long)OCFS2_I(inode)->ip_blkno,
   2514			filp->f_path.dentry->d_name.len,
   2515			filp->f_path.dentry->d_name.name,
   2516			to->nr_segs);	/* GRRRRR */
   2517
   2519	if (!inode) {
   2520		ret = -EINVAL;
   2521		mlog_errno(ret);
   2522		goto bail;
   2523	}
   2524
   2525	if (!direct_io && nowait)
   2526		return -EOPNOTSUPP;
   2527
   2528	/*
   2529	 * buffered reads protect themselves in ->read_folio().  O_DIRECT reads
   2530	 * need locks to protect pending reads from racing with truncate.
   2531	 */
   2532	if (direct_io) {
   2533		if (nowait)
   2534			ret = ocfs2_try_rw_lock(inode, 0);
   2535		else
   2536			ret = ocfs2_rw_lock(inode, 0);
   2537
   2538		if (ret < 0) {
   2539			if (ret != -EAGAIN)
   2540				mlog_errno(ret);
   2541			goto bail;
   2542		}
   2543		rw_level = 0;
   2544		/* communicate with ocfs2_dio_end_io */
   2545		ocfs2_iocb_set_rw_locked(iocb, rw_level);
   2546	}
   2547
	/*
	 * We're fine letting folks race truncates and extending
	 * writes with read across the cluster, just like they can
	 * locally. Hence no rw_lock during read.
	 *
	 * Take and drop the meta data lock to update inode fields
	 * like i_size. This gives the checks in
	 * generic_file_read_iter() below a chance of actually working.
	 */
   2557	ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level,
   2558				     !nowait);
   2559	if (ret < 0) {
   2560		if (ret != -EAGAIN)
   2561			mlog_errno(ret);
   2562		goto bail;
   2563	}
   2564	ocfs2_inode_unlock(inode, lock_level);
   2565
   2566	ret = generic_file_read_iter(iocb, to);
   2567	trace_generic_file_read_iter_ret(ret);
   2568
   2569	/* buffered aio wouldn't have proper lock coverage today */
   2570	BUG_ON(ret == -EIOCBQUEUED && !direct_io);
   2571
   2572	/* see ocfs2_file_write_iter */
   2573	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
   2574		rw_level = -1;
   2575	}
   2576
   2577bail:
   2578	if (rw_level != -1)
   2579		ocfs2_rw_unlock(inode, rw_level);
   2580
   2581	return ret;
   2582}
   2583
/* See generic_file_llseek_unlocked(). */
   2585static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
   2586{
   2587	struct inode *inode = file->f_mapping->host;
   2588	int ret = 0;
   2589
   2590	inode_lock(inode);
   2591
   2592	switch (whence) {
   2593	case SEEK_SET:
   2594		break;
   2595	case SEEK_END:
   2596		/* SEEK_END requires the OCFS2 inode lock for the file
   2597		 * because it references the file's size.
   2598		 */
   2599		ret = ocfs2_inode_lock(inode, NULL, 0);
   2600		if (ret < 0) {
   2601			mlog_errno(ret);
   2602			goto out;
   2603		}
   2604		offset += i_size_read(inode);
   2605		ocfs2_inode_unlock(inode, 0);
   2606		break;
   2607	case SEEK_CUR:
   2608		if (offset == 0) {
   2609			offset = file->f_pos;
   2610			goto out;
   2611		}
   2612		offset += file->f_pos;
   2613		break;
   2614	case SEEK_DATA:
   2615	case SEEK_HOLE:
   2616		ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
   2617		if (ret)
   2618			goto out;
   2619		break;
   2620	default:
   2621		ret = -EINVAL;
   2622		goto out;
   2623	}
   2624
   2625	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
   2626
   2627out:
   2628	inode_unlock(inode);
   2629	if (ret)
   2630		return ret;
   2631	return offset;
   2632}
   2633
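/*
 * reflink/dedupe entry point. Both inodes are locked against I/O, the
 * destination's page cache for the affected range is dropped, and the
 * clusters are shared through the refcount tree rather than copied;
 * the extent maps are then emptied so stale cached records aren't
 * reused.
 */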
   2634static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in,
   2635				     struct file *file_out, loff_t pos_out,
   2636				     loff_t len, unsigned int remap_flags)
   2637{
   2638	struct inode *inode_in = file_inode(file_in);
   2639	struct inode *inode_out = file_inode(file_out);
   2640	struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
   2641	struct buffer_head *in_bh = NULL, *out_bh = NULL;
   2642	bool same_inode = (inode_in == inode_out);
   2643	loff_t remapped = 0;
   2644	ssize_t ret;
   2645
   2646	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
   2647		return -EINVAL;
   2648	if (!ocfs2_refcount_tree(osb))
   2649		return -EOPNOTSUPP;
   2650	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
   2651		return -EROFS;
   2652
   2653	/* Lock both files against IO */
   2654	ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
   2655	if (ret)
   2656		return ret;
   2657
   2658	/* Check file eligibility and prepare for block sharing. */
   2659	ret = -EINVAL;
   2660	if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
   2661	    (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
   2662		goto out_unlock;
   2663
   2664	ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
   2665			&len, remap_flags);
   2666	if (ret < 0 || len == 0)
   2667		goto out_unlock;
   2668
   2669	/* Lock out changes to the allocation maps and remap. */
   2670	down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
   2671	if (!same_inode)
   2672		down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
   2673				  SINGLE_DEPTH_NESTING);
   2674
   2675	/* Zap any page cache for the destination file's range. */
   2676	truncate_inode_pages_range(&inode_out->i_data,
   2677				   round_down(pos_out, PAGE_SIZE),
   2678				   round_up(pos_out + len, PAGE_SIZE) - 1);
   2679
   2680	remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in,
   2681			inode_out, out_bh, pos_out, len);
   2682	up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
   2683	if (!same_inode)
   2684		up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
   2685	if (remapped < 0) {
   2686		ret = remapped;
   2687		mlog_errno(ret);
   2688		goto out_unlock;
   2689	}
   2690
   2691	/*
   2692	 * Empty the extent map so that we may get the right extent
   2693	 * record from the disk.
   2694	 */
   2695	ocfs2_extent_map_trunc(inode_in, 0);
   2696	ocfs2_extent_map_trunc(inode_out, 0);
   2697
   2698	ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
   2699	if (ret) {
   2700		mlog_errno(ret);
   2701		goto out_unlock;
   2702	}
   2703
   2704out_unlock:
   2705	ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
   2706	return remapped > 0 ? remapped : ret;
   2707}
   2708
   2709const struct inode_operations ocfs2_file_iops = {
   2710	.setattr	= ocfs2_setattr,
   2711	.getattr	= ocfs2_getattr,
   2712	.permission	= ocfs2_permission,
   2713	.listxattr	= ocfs2_listxattr,
   2714	.fiemap		= ocfs2_fiemap,
   2715	.get_acl	= ocfs2_iop_get_acl,
   2716	.set_acl	= ocfs2_iop_set_acl,
   2717	.fileattr_get	= ocfs2_fileattr_get,
   2718	.fileattr_set	= ocfs2_fileattr_set,
   2719};
   2720
   2721const struct inode_operations ocfs2_special_file_iops = {
   2722	.setattr	= ocfs2_setattr,
   2723	.getattr	= ocfs2_getattr,
   2724	.permission	= ocfs2_permission,
   2725	.get_acl	= ocfs2_iop_get_acl,
   2726	.set_acl	= ocfs2_iop_set_acl,
   2727};
   2728
   2729/*
   2730 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
   2731 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
   2732 */
   2733const struct file_operations ocfs2_fops = {
   2734	.llseek		= ocfs2_file_llseek,
   2735	.mmap		= ocfs2_mmap,
   2736	.fsync		= ocfs2_sync_file,
   2737	.release	= ocfs2_file_release,
   2738	.open		= ocfs2_file_open,
   2739	.read_iter	= ocfs2_file_read_iter,
   2740	.write_iter	= ocfs2_file_write_iter,
   2741	.unlocked_ioctl	= ocfs2_ioctl,
   2742#ifdef CONFIG_COMPAT
   2743	.compat_ioctl   = ocfs2_compat_ioctl,
   2744#endif
   2745	.lock		= ocfs2_lock,
   2746	.flock		= ocfs2_flock,
   2747	.splice_read	= generic_file_splice_read,
   2748	.splice_write	= iter_file_splice_write,
   2749	.fallocate	= ocfs2_fallocate,
   2750	.remap_file_range = ocfs2_remap_file_range,
   2751};
   2752
   2753const struct file_operations ocfs2_dops = {
   2754	.llseek		= generic_file_llseek,
   2755	.read		= generic_read_dir,
   2756	.iterate	= ocfs2_readdir,
   2757	.fsync		= ocfs2_sync_file,
   2758	.release	= ocfs2_dir_release,
   2759	.open		= ocfs2_dir_open,
   2760	.unlocked_ioctl	= ocfs2_ioctl,
   2761#ifdef CONFIG_COMPAT
   2762	.compat_ioctl   = ocfs2_compat_ioctl,
   2763#endif
   2764	.lock		= ocfs2_lock,
   2765	.flock		= ocfs2_flock,
   2766};
   2767
   2768/*
   2769 * POSIX-lockless variants of our file_operations.
   2770 *
   2771 * These will be used if the underlying cluster stack does not support
   2772 * posix file locking, if the user passes the "localflocks" mount
   2773 * option, or if we have a local-only fs.
   2774 *
   2775 * ocfs2_flock is in here because all stacks handle UNIX file locks,
   2776 * so we still want it in the case of no stack support for
   2777 * plocks. Internally, it will do the right thing when asked to ignore
   2778 * the cluster.
   2779 */
   2780const struct file_operations ocfs2_fops_no_plocks = {
   2781	.llseek		= ocfs2_file_llseek,
   2782	.mmap		= ocfs2_mmap,
   2783	.fsync		= ocfs2_sync_file,
   2784	.release	= ocfs2_file_release,
   2785	.open		= ocfs2_file_open,
   2786	.read_iter	= ocfs2_file_read_iter,
   2787	.write_iter	= ocfs2_file_write_iter,
   2788	.unlocked_ioctl	= ocfs2_ioctl,
   2789#ifdef CONFIG_COMPAT
   2790	.compat_ioctl   = ocfs2_compat_ioctl,
   2791#endif
   2792	.flock		= ocfs2_flock,
   2793	.splice_read	= generic_file_splice_read,
   2794	.splice_write	= iter_file_splice_write,
   2795	.fallocate	= ocfs2_fallocate,
   2796	.remap_file_range = ocfs2_remap_file_range,
   2797};
   2798
   2799const struct file_operations ocfs2_dops_no_plocks = {
   2800	.llseek		= generic_file_llseek,
   2801	.read		= generic_read_dir,
   2802	.iterate	= ocfs2_readdir,
   2803	.fsync		= ocfs2_sync_file,
   2804	.release	= ocfs2_dir_release,
   2805	.open		= ocfs2_dir_open,
   2806	.unlocked_ioctl	= ocfs2_ioctl,
   2807#ifdef CONFIG_COMPAT
   2808	.compat_ioctl   = ocfs2_compat_ioctl,
   2809#endif
   2810	.flock		= ocfs2_flock,
   2811};