vfs.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
vfs.c (56776B)
      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * File operations used by nfsd. Some of these have been ripped from
      4 * other parts of the kernel because they weren't exported, others
      5 * are partial duplicates with added or changed functionality.
      6 *
      7 * Note that several functions dget() the dentry upon which they want
      8 * to act, most notably those that create directory entries. Response
      9 * dentries are dput()'d if necessary in the release callback.
     10 * So if you notice code paths that apparently fail to dput() the
     11 * dentry, don't worry--they have been taken care of.
     12 *
     13 * Copyright (C) 1995-1999 Olaf Kirch <okir@monad.swb.de>
     14 * Zerocopy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp>
     15 */
     16
     17#include <linux/fs.h>
     18#include <linux/file.h>
     19#include <linux/splice.h>
     20#include <linux/falloc.h>
     21#include <linux/fcntl.h>
     22#include <linux/namei.h>
     23#include <linux/delay.h>
     24#include <linux/fsnotify.h>
     25#include <linux/posix_acl_xattr.h>
     26#include <linux/xattr.h>
     27#include <linux/jhash.h>
     28#include <linux/ima.h>
     29#include <linux/pagemap.h>
     30#include <linux/slab.h>
     31#include <linux/uaccess.h>
     32#include <linux/exportfs.h>
     33#include <linux/writeback.h>
     34#include <linux/security.h>
     35
     36#include "xdr3.h"
     37
     38#ifdef CONFIG_NFSD_V4
     39#include "../internal.h"
     40#include "acl.h"
     41#include "idmap.h"
     42#include "xdr4.h"
     43#endif /* CONFIG_NFSD_V4 */
     44
     45#include "nfsd.h"
     46#include "vfs.h"
     47#include "filecache.h"
     48#include "trace.h"
     49
     50#define NFSDDBG_FACILITY		NFSDDBG_FILEOP
     51
     52/* 
     53 * Called from nfsd_lookup and encode_dirent. Check if we have crossed 
     54 * a mount point.
     55 * Returns -EAGAIN or -ETIMEDOUT leaving *dpp and *expp unchanged,
     56 *  or nfs_ok having possibly changed *dpp and *expp
     57 */
     58int
     59nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, 
     60		        struct svc_export **expp)
     61{
     62	struct svc_export *exp = *expp, *exp2 = NULL;
     63	struct dentry *dentry = *dpp;
     64	struct path path = {.mnt = mntget(exp->ex_path.mnt),
     65			    .dentry = dget(dentry)};
     66	int err = 0;
     67
     68	err = follow_down(&path);
     69	if (err < 0)
     70		goto out;
     71	if (path.mnt == exp->ex_path.mnt && path.dentry == dentry &&
     72	    nfsd_mountpoint(dentry, exp) == 2) {
     73		/* This is only a mountpoint in some other namespace */
     74		path_put(&path);
     75		goto out;
     76	}
     77
     78	exp2 = rqst_exp_get_by_name(rqstp, &path);
     79	if (IS_ERR(exp2)) {
     80		err = PTR_ERR(exp2);
     81		/*
     82		 * We normally allow NFS clients to continue
     83		 * "underneath" a mountpoint that is not exported.
     84		 * The exception is V4ROOT, where no traversal is ever
     85		 * allowed without an explicit export of the new
     86		 * directory.
     87		 */
     88		if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT))
     89			err = 0;
     90		path_put(&path);
     91		goto out;
     92	}
     93	if (nfsd_v4client(rqstp) ||
     94		(exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
     95		/* successfully crossed mount point */
     96		/*
     97		 * This is subtle: path.dentry is *not* on path.mnt
     98		 * at this point.  The only reason we are safe is that
     99		 * original mnt is pinned down by exp, so we should
    100		 * put path *before* putting exp
    101		 */
    102		*dpp = path.dentry;
    103		path.dentry = dentry;
    104		*expp = exp2;
    105		exp2 = exp;
    106	}
    107	path_put(&path);
    108	exp_put(exp2);
    109out:
    110	return err;
    111}
    112
    113static void follow_to_parent(struct path *path)
    114{
    115	struct dentry *dp;
    116
    117	while (path->dentry == path->mnt->mnt_root && follow_up(path))
    118		;
    119	dp = dget_parent(path->dentry);
    120	dput(path->dentry);
    121	path->dentry = dp;
    122}
    123
    124static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp)
    125{
    126	struct svc_export *exp2;
    127	struct path path = {.mnt = mntget((*exp)->ex_path.mnt),
    128			    .dentry = dget(dparent)};
    129
    130	follow_to_parent(&path);
    131
    132	exp2 = rqst_exp_parent(rqstp, &path);
    133	if (PTR_ERR(exp2) == -ENOENT) {
    134		*dentryp = dget(dparent);
    135	} else if (IS_ERR(exp2)) {
    136		path_put(&path);
    137		return PTR_ERR(exp2);
    138	} else {
    139		*dentryp = dget(path.dentry);
    140		exp_put(*exp);
    141		*exp = exp2;
    142	}
    143	path_put(&path);
    144	return 0;
    145}
    146
    147/*
    148 * For nfsd purposes, we treat V4ROOT exports as though there was an
    149 * export at *every* directory.
    150 * We return:
    151 * '1' if this dentry *must* be an export point,
    152 * '2' if it might be, if there is really a mount here, and
    153 * '0' if there is no chance of an export point here.
    154 */
    155int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
    156{
    157	if (!d_inode(dentry))
    158		return 0;
    159	if (exp->ex_flags & NFSEXP_V4ROOT)
    160		return 1;
    161	if (nfsd4_is_junction(dentry))
    162		return 1;
    163	if (d_mountpoint(dentry))
    164		/*
    165		 * Might only be a mountpoint in a different namespace,
    166		 * but we need to check.
    167		 */
    168		return 2;
    169	return 0;
    170}
    171
    172__be32
    173nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
    174		   const char *name, unsigned int len,
    175		   struct svc_export **exp_ret, struct dentry **dentry_ret)
    176{
    177	struct svc_export	*exp;
    178	struct dentry		*dparent;
    179	struct dentry		*dentry;
    180	int			host_err;
    181
    182	dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
    183
    184	dparent = fhp->fh_dentry;
    185	exp = exp_get(fhp->fh_export);
    186
    187	/* Lookup the name, but don't follow links */
    188	if (isdotent(name, len)) {
    189		if (len==1)
    190			dentry = dget(dparent);
    191		else if (dparent != exp->ex_path.dentry)
    192			dentry = dget_parent(dparent);
    193		else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp))
    194			dentry = dget(dparent); /* .. == . just like at / */
    195		else {
    196			/* checking mountpoint crossing is very different when stepping up */
    197			host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry);
    198			if (host_err)
    199				goto out_nfserr;
    200		}
    201	} else {
    202		/*
    203		 * In the nfsd4_open() case, this may be held across
    204		 * subsequent open and delegation acquisition which may
    205		 * need to take the child's i_mutex:
    206		 */
    207		fh_lock_nested(fhp, I_MUTEX_PARENT);
    208		dentry = lookup_one_len(name, dparent, len);
    209		host_err = PTR_ERR(dentry);
    210		if (IS_ERR(dentry))
    211			goto out_nfserr;
    212		if (nfsd_mountpoint(dentry, exp)) {
    213			/*
    214			 * We don't need the i_mutex after all.  It's
    215			 * still possible we could open this (regular
    216			 * files can be mountpoints too), but the
    217			 * i_mutex is just there to prevent renames of
    218			 * something that we might be about to delegate,
    219			 * and a mountpoint won't be renamed:
    220			 */
    221			fh_unlock(fhp);
    222			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
    223				dput(dentry);
    224				goto out_nfserr;
    225			}
    226		}
    227	}
    228	*dentry_ret = dentry;
    229	*exp_ret = exp;
    230	return 0;
    231
    232out_nfserr:
    233	exp_put(exp);
    234	return nfserrno(host_err);
    235}
    236
    237/*
    238 * Look up one component of a pathname.
    239 * N.B. After this call _both_ fhp and resfh need an fh_put
    240 *
    241 * If the lookup would cross a mountpoint, and the mounted filesystem
    242 * is exported to the client with NFSEXP_NOHIDE, then the lookup is
    243 * accepted as it stands and the mounted directory is
    244 * returned. Otherwise the covered directory is returned.
    245 * NOTE: this mountpoint crossing is not supported properly by all
    246 *   clients and is explicitly disallowed for NFSv3
    247 *      NeilBrown <neilb@cse.unsw.edu.au>
    248 */
    249__be32
    250nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
    251				unsigned int len, struct svc_fh *resfh)
    252{
    253	struct svc_export	*exp;
    254	struct dentry		*dentry;
    255	__be32 err;
    256
    257	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
    258	if (err)
    259		return err;
    260	err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
    261	if (err)
    262		return err;
    263	err = check_nfsd_access(exp, rqstp);
    264	if (err)
    265		goto out;
    266	/*
    267	 * Note: we compose the file handle now, but as the
    268	 * dentry may be negative, it may need to be updated.
    269	 */
    270	err = fh_compose(resfh, exp, dentry, fhp);
    271	if (!err && d_really_is_negative(dentry))
    272		err = nfserr_noent;
    273out:
    274	dput(dentry);
    275	exp_put(exp);
    276	return err;
    277}
    278
    279/*
    280 * Commit metadata changes to stable storage.
    281 */
    282static int
    283commit_inode_metadata(struct inode *inode)
    284{
    285	const struct export_operations *export_ops = inode->i_sb->s_export_op;
    286
    287	if (export_ops->commit_metadata)
    288		return export_ops->commit_metadata(inode);
    289	return sync_inode_metadata(inode, 1);
    290}
    291
    292static int
    293commit_metadata(struct svc_fh *fhp)
    294{
    295	struct inode *inode = d_inode(fhp->fh_dentry);
    296
    297	if (!EX_ISSYNC(fhp->fh_export))
    298		return 0;
    299	return commit_inode_metadata(inode);
    300}
    301
    302/*
    303 * Go over the attributes and take care of the small differences between
    304 * NFS semantics and what Linux expects.
    305 */
    306static void
    307nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
    308{
    309	/* sanitize the mode change */
    310	if (iap->ia_valid & ATTR_MODE) {
    311		iap->ia_mode &= S_IALLUGO;
    312		iap->ia_mode |= (inode->i_mode & ~S_IALLUGO);
    313	}
    314
    315	/* Revoke setuid/setgid on chown */
    316	if (!S_ISDIR(inode->i_mode) &&
    317	    ((iap->ia_valid & ATTR_UID) || (iap->ia_valid & ATTR_GID))) {
    318		iap->ia_valid |= ATTR_KILL_PRIV;
    319		if (iap->ia_valid & ATTR_MODE) {
    320			/* we're setting mode too, just clear the s*id bits */
    321			iap->ia_mode &= ~S_ISUID;
    322			if (iap->ia_mode & S_IXGRP)
    323				iap->ia_mode &= ~S_ISGID;
    324		} else {
    325			/* set ATTR_KILL_* bits and let VFS handle it */
    326			iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID);
    327		}
    328	}
    329}
    330
    331static __be32
    332nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
    333		struct iattr *iap)
    334{
    335	struct inode *inode = d_inode(fhp->fh_dentry);
    336
    337	if (iap->ia_size < inode->i_size) {
    338		__be32 err;
    339
    340		err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
    341				NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE);
    342		if (err)
    343			return err;
    344	}
    345	return nfserrno(get_write_access(inode));
    346}
    347
    348/*
    349 * Set various file attributes.  After this call fhp needs an fh_put.
    350 */
    351__be32
    352nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
    353	     int check_guard, time64_t guardtime)
    354{
    355	struct dentry	*dentry;
    356	struct inode	*inode;
    357	int		accmode = NFSD_MAY_SATTR;
    358	umode_t		ftype = 0;
    359	__be32		err;
    360	int		host_err;
    361	bool		get_write_count;
    362	bool		size_change = (iap->ia_valid & ATTR_SIZE);
    363
    364	if (iap->ia_valid & ATTR_SIZE) {
    365		accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
    366		ftype = S_IFREG;
    367	}
    368
    369	/*
    370	 * If utimes(2) and friends are called with times not NULL, we should
    371	 * not set NFSD_MAY_WRITE bit. Otherwise fh_verify->nfsd_permission
    372	 * will return EACCES, when the caller's effective UID does not match
    373	 * the owner of the file, and the caller is not privileged. In this
    374	 * situation, we should return EPERM(notify_change will return this).
    375	 */
    376	if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME)) {
    377		accmode |= NFSD_MAY_OWNER_OVERRIDE;
    378		if (!(iap->ia_valid & (ATTR_ATIME_SET | ATTR_MTIME_SET)))
    379			accmode |= NFSD_MAY_WRITE;
    380	}
    381
    382	/* Callers that do fh_verify should do the fh_want_write: */
    383	get_write_count = !fhp->fh_dentry;
    384
    385	/* Get inode */
    386	err = fh_verify(rqstp, fhp, ftype, accmode);
    387	if (err)
    388		return err;
    389	if (get_write_count) {
    390		host_err = fh_want_write(fhp);
    391		if (host_err)
    392			goto out;
    393	}
    394
    395	dentry = fhp->fh_dentry;
    396	inode = d_inode(dentry);
    397
    398	/* Ignore any mode updates on symlinks */
    399	if (S_ISLNK(inode->i_mode))
    400		iap->ia_valid &= ~ATTR_MODE;
    401
    402	if (!iap->ia_valid)
    403		return 0;
    404
    405	nfsd_sanitize_attrs(inode, iap);
    406
    407	if (check_guard && guardtime != inode->i_ctime.tv_sec)
    408		return nfserr_notsync;
    409
    410	/*
    411	 * The size case is special, it changes the file in addition to the
    412	 * attributes, and file systems don't expect it to be mixed with
    413	 * "random" attribute changes.  We thus split out the size change
    414	 * into a separate call to ->setattr, and do the rest as a separate
    415	 * setattr call.
    416	 */
    417	if (size_change) {
    418		err = nfsd_get_write_access(rqstp, fhp, iap);
    419		if (err)
    420			return err;
    421	}
    422
    423	fh_lock(fhp);
    424	if (size_change) {
    425		/*
    426		 * RFC5661, Section 18.30.4:
    427		 *   Changing the size of a file with SETATTR indirectly
    428		 *   changes the time_modify and change attributes.
    429		 *
    430		 * (and similar for the older RFCs)
    431		 */
    432		struct iattr size_attr = {
    433			.ia_valid	= ATTR_SIZE | ATTR_CTIME | ATTR_MTIME,
    434			.ia_size	= iap->ia_size,
    435		};
    436
    437		host_err = -EFBIG;
    438		if (iap->ia_size < 0)
    439			goto out_unlock;
    440
    441		host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL);
    442		if (host_err)
    443			goto out_unlock;
    444		iap->ia_valid &= ~ATTR_SIZE;
    445
    446		/*
    447		 * Avoid the additional setattr call below if the only other
    448		 * attribute that the client sends is the mtime, as we update
    449		 * it as part of the size change above.
    450		 */
    451		if ((iap->ia_valid & ~ATTR_MTIME) == 0)
    452			goto out_unlock;
    453	}
    454
    455	iap->ia_valid |= ATTR_CTIME;
    456	host_err = notify_change(&init_user_ns, dentry, iap, NULL);
    457
    458out_unlock:
    459	fh_unlock(fhp);
    460	if (size_change)
    461		put_write_access(inode);
    462out:
    463	if (!host_err)
    464		host_err = commit_metadata(fhp);
    465	return nfserrno(host_err);
    466}
    467
    468#if defined(CONFIG_NFSD_V4)
    469/*
    470 * NFS junction information is stored in an extended attribute.
    471 */
    472#define NFSD_JUNCTION_XATTR_NAME	XATTR_TRUSTED_PREFIX "junction.nfs"
    473
    474/**
    475 * nfsd4_is_junction - Test if an object could be an NFS junction
    476 *
    477 * @dentry: object to test
    478 *
    479 * Returns 1 if "dentry" appears to contain NFS junction information.
    480 * Otherwise 0 is returned.
    481 */
    482int nfsd4_is_junction(struct dentry *dentry)
    483{
    484	struct inode *inode = d_inode(dentry);
    485
    486	if (inode == NULL)
    487		return 0;
    488	if (inode->i_mode & S_IXUGO)
    489		return 0;
    490	if (!(inode->i_mode & S_ISVTX))
    491		return 0;
    492	if (vfs_getxattr(&init_user_ns, dentry, NFSD_JUNCTION_XATTR_NAME,
    493			 NULL, 0) <= 0)
    494		return 0;
    495	return 1;
    496}
    497#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
    498__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
    499		struct xdr_netobj *label)
    500{
    501	__be32 error;
    502	int host_error;
    503	struct dentry *dentry;
    504
    505	error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
    506	if (error)
    507		return error;
    508
    509	dentry = fhp->fh_dentry;
    510
    511	inode_lock(d_inode(dentry));
    512	host_error = security_inode_setsecctx(dentry, label->data, label->len);
    513	inode_unlock(d_inode(dentry));
    514	return nfserrno(host_error);
    515}
    516#else
    517__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
    518		struct xdr_netobj *label)
    519{
    520	return nfserr_notsupp;
    521}
    522#endif
    523
    524static struct nfsd4_compound_state *nfsd4_get_cstate(struct svc_rqst *rqstp)
    525{
    526	return &((struct nfsd4_compoundres *)rqstp->rq_resp)->cstate;
    527}
    528
    529__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
    530		struct nfsd_file *nf_src, u64 src_pos,
    531		struct nfsd_file *nf_dst, u64 dst_pos,
    532		u64 count, bool sync)
    533{
    534	struct file *src = nf_src->nf_file;
    535	struct file *dst = nf_dst->nf_file;
    536	errseq_t since;
    537	loff_t cloned;
    538	__be32 ret = 0;
    539
    540	since = READ_ONCE(dst->f_wb_err);
    541	cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
    542	if (cloned < 0) {
    543		ret = nfserrno(cloned);
    544		goto out_err;
    545	}
    546	if (count && cloned != count) {
    547		ret = nfserrno(-EINVAL);
    548		goto out_err;
    549	}
    550	if (sync) {
    551		loff_t dst_end = count ? dst_pos + count - 1 : LLONG_MAX;
    552		int status = vfs_fsync_range(dst, dst_pos, dst_end, 0);
    553
    554		if (!status)
    555			status = filemap_check_wb_err(dst->f_mapping, since);
    556		if (!status)
    557			status = commit_inode_metadata(file_inode(src));
    558		if (status < 0) {
    559			struct nfsd_net *nn = net_generic(nf_dst->nf_net,
    560							  nfsd_net_id);
    561
    562			trace_nfsd_clone_file_range_err(rqstp,
    563					&nfsd4_get_cstate(rqstp)->save_fh,
    564					src_pos,
    565					&nfsd4_get_cstate(rqstp)->current_fh,
    566					dst_pos,
    567					count, status);
    568			nfsd_reset_write_verifier(nn);
    569			trace_nfsd_writeverf_reset(nn, rqstp, status);
    570			ret = nfserrno(status);
    571		}
    572	}
    573out_err:
    574	return ret;
    575}
    576
    577ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
    578			     u64 dst_pos, u64 count)
    579{
    580	ssize_t ret;
    581
    582	/*
    583	 * Limit copy to 4MB to prevent indefinitely blocking an nfsd
    584	 * thread and client rpc slot.  The choice of 4MB is somewhat
    585	 * arbitrary.  We might instead base this on r/wsize, or make it
    586	 * tunable, or use a time instead of a byte limit, or implement
    587	 * asynchronous copy.  In theory a client could also recognize a
    588	 * limit like this and pipeline multiple COPY requests.
    589	 */
    590	count = min_t(u64, count, 1 << 22);
    591	ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
    592
    593	if (ret == -EOPNOTSUPP || ret == -EXDEV)
    594		ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
    595					      count, 0);
    596	return ret;
    597}
    598
    599__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
    600			   struct file *file, loff_t offset, loff_t len,
    601			   int flags)
    602{
    603	int error;
    604
    605	if (!S_ISREG(file_inode(file)->i_mode))
    606		return nfserr_inval;
    607
    608	error = vfs_fallocate(file, flags, offset, len);
    609	if (!error)
    610		error = commit_metadata(fhp);
    611
    612	return nfserrno(error);
    613}
    614#endif /* defined(CONFIG_NFSD_V4) */
    615
    616/*
    617 * Check server access rights to a file system object
    618 */
    619struct accessmap {
    620	u32		access;
    621	int		how;
    622};
    623static struct accessmap	nfs3_regaccess[] = {
    624    {	NFS3_ACCESS_READ,	NFSD_MAY_READ			},
    625    {	NFS3_ACCESS_EXECUTE,	NFSD_MAY_EXEC			},
    626    {	NFS3_ACCESS_MODIFY,	NFSD_MAY_WRITE|NFSD_MAY_TRUNC	},
    627    {	NFS3_ACCESS_EXTEND,	NFSD_MAY_WRITE			},
    628
    629#ifdef CONFIG_NFSD_V4
    630    {	NFS4_ACCESS_XAREAD,	NFSD_MAY_READ			},
    631    {	NFS4_ACCESS_XAWRITE,	NFSD_MAY_WRITE			},
    632    {	NFS4_ACCESS_XALIST,	NFSD_MAY_READ			},
    633#endif
    634
    635    {	0,			0				}
    636};
    637
    638static struct accessmap	nfs3_diraccess[] = {
    639    {	NFS3_ACCESS_READ,	NFSD_MAY_READ			},
    640    {	NFS3_ACCESS_LOOKUP,	NFSD_MAY_EXEC			},
    641    {	NFS3_ACCESS_MODIFY,	NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC},
    642    {	NFS3_ACCESS_EXTEND,	NFSD_MAY_EXEC|NFSD_MAY_WRITE	},
    643    {	NFS3_ACCESS_DELETE,	NFSD_MAY_REMOVE			},
    644
    645#ifdef CONFIG_NFSD_V4
    646    {	NFS4_ACCESS_XAREAD,	NFSD_MAY_READ			},
    647    {	NFS4_ACCESS_XAWRITE,	NFSD_MAY_WRITE			},
    648    {	NFS4_ACCESS_XALIST,	NFSD_MAY_READ			},
    649#endif
    650
    651    {	0,			0				}
    652};
    653
    654static struct accessmap	nfs3_anyaccess[] = {
    655	/* Some clients - Solaris 2.6 at least, make an access call
    656	 * to the server to check for access for things like /dev/null
    657	 * (which really, the server doesn't care about).  So
    658	 * We provide simple access checking for them, looking
    659	 * mainly at mode bits, and we make sure to ignore read-only
    660	 * filesystem checks
    661	 */
    662    {	NFS3_ACCESS_READ,	NFSD_MAY_READ			},
    663    {	NFS3_ACCESS_EXECUTE,	NFSD_MAY_EXEC			},
    664    {	NFS3_ACCESS_MODIFY,	NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS	},
    665    {	NFS3_ACCESS_EXTEND,	NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS	},
    666
    667    {	0,			0				}
    668};
    669
    670__be32
    671nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported)
    672{
    673	struct accessmap	*map;
    674	struct svc_export	*export;
    675	struct dentry		*dentry;
    676	u32			query, result = 0, sresult = 0;
    677	__be32			error;
    678
    679	error = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
    680	if (error)
    681		goto out;
    682
    683	export = fhp->fh_export;
    684	dentry = fhp->fh_dentry;
    685
    686	if (d_is_reg(dentry))
    687		map = nfs3_regaccess;
    688	else if (d_is_dir(dentry))
    689		map = nfs3_diraccess;
    690	else
    691		map = nfs3_anyaccess;
    692
    693
    694	query = *access;
    695	for  (; map->access; map++) {
    696		if (map->access & query) {
    697			__be32 err2;
    698
    699			sresult |= map->access;
    700
    701			err2 = nfsd_permission(rqstp, export, dentry, map->how);
    702			switch (err2) {
    703			case nfs_ok:
    704				result |= map->access;
    705				break;
    706				
    707			/* the following error codes just mean the access was not allowed,
    708			 * rather than an error occurred */
    709			case nfserr_rofs:
    710			case nfserr_acces:
    711			case nfserr_perm:
    712				/* simply don't "or" in the access bit. */
    713				break;
    714			default:
    715				error = err2;
    716				goto out;
    717			}
    718		}
    719	}
    720	*access = result;
    721	if (supported)
    722		*supported = sresult;
    723
    724 out:
    725	return error;
    726}
    727
    728int nfsd_open_break_lease(struct inode *inode, int access)
    729{
    730	unsigned int mode;
    731
    732	if (access & NFSD_MAY_NOT_BREAK_LEASE)
    733		return 0;
    734	mode = (access & NFSD_MAY_WRITE) ? O_WRONLY : O_RDONLY;
    735	return break_lease(inode, mode | O_NONBLOCK);
    736}
    737
    738/*
    739 * Open an existing file or directory.
    740 * The may_flags argument indicates the type of open (read/write/lock)
    741 * and additional flags.
    742 * N.B. After this call fhp needs an fh_put
    743 */
    744static __be32
    745__nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
    746			int may_flags, struct file **filp)
    747{
    748	struct path	path;
    749	struct inode	*inode;
    750	struct file	*file;
    751	int		flags = O_RDONLY|O_LARGEFILE;
    752	__be32		err;
    753	int		host_err = 0;
    754
    755	path.mnt = fhp->fh_export->ex_path.mnt;
    756	path.dentry = fhp->fh_dentry;
    757	inode = d_inode(path.dentry);
    758
    759	err = nfserr_perm;
    760	if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE))
    761		goto out;
    762
    763	if (!inode->i_fop)
    764		goto out;
    765
    766	host_err = nfsd_open_break_lease(inode, may_flags);
    767	if (host_err) /* NOMEM or WOULDBLOCK */
    768		goto out_nfserr;
    769
    770	if (may_flags & NFSD_MAY_WRITE) {
    771		if (may_flags & NFSD_MAY_READ)
    772			flags = O_RDWR|O_LARGEFILE;
    773		else
    774			flags = O_WRONLY|O_LARGEFILE;
    775	}
    776
    777	file = dentry_open(&path, flags, current_cred());
    778	if (IS_ERR(file)) {
    779		host_err = PTR_ERR(file);
    780		goto out_nfserr;
    781	}
    782
    783	host_err = ima_file_check(file, may_flags);
    784	if (host_err) {
    785		fput(file);
    786		goto out_nfserr;
    787	}
    788
    789	if (may_flags & NFSD_MAY_64BIT_COOKIE)
    790		file->f_mode |= FMODE_64BITHASH;
    791	else
    792		file->f_mode |= FMODE_32BITHASH;
    793
    794	*filp = file;
    795out_nfserr:
    796	err = nfserrno(host_err);
    797out:
    798	return err;
    799}
    800
    801__be32
    802nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
    803		int may_flags, struct file **filp)
    804{
    805	__be32 err;
    806	bool retried = false;
    807
    808	validate_process_creds();
    809	/*
    810	 * If we get here, then the client has already done an "open",
    811	 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
    812	 * in case a chmod has now revoked permission.
    813	 *
    814	 * Arguably we should also allow the owner override for
    815	 * directories, but we never have and it doesn't seem to have
    816	 * caused anyone a problem.  If we were to change this, note
    817	 * also that our filldir callbacks would need a variant of
    818	 * lookup_one_len that doesn't check permissions.
    819	 */
    820	if (type == S_IFREG)
    821		may_flags |= NFSD_MAY_OWNER_OVERRIDE;
    822retry:
    823	err = fh_verify(rqstp, fhp, type, may_flags);
    824	if (!err) {
    825		err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
    826		if (err == nfserr_stale && !retried) {
    827			retried = true;
    828			fh_put(fhp);
    829			goto retry;
    830		}
    831	}
    832	validate_process_creds();
    833	return err;
    834}
    835
    836/**
    837 * nfsd_open_verified - Open a regular file for the filecache
    838 * @rqstp: RPC request
    839 * @fhp: NFS filehandle of the file to open
    840 * @may_flags: internal permission flags
    841 * @filp: OUT: open "struct file *"
    842 *
    843 * Returns an nfsstat value in network byte order.
    844 */
    845__be32
    846nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
    847		   struct file **filp)
    848{
    849	__be32 err;
    850
    851	validate_process_creds();
    852	err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp);
    853	validate_process_creds();
    854	return err;
    855}
    856
    857/*
    858 * Grab and keep cached pages associated with a file in the svc_rqst
    859 * so that they can be passed to the network sendmsg/sendpage routines
    860 * directly. They will be released after the sending has completed.
    861 */
    862static int
    863nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
    864		  struct splice_desc *sd)
    865{
    866	struct svc_rqst *rqstp = sd->u.data;
    867
    868	svc_rqst_replace_page(rqstp, buf->page);
    869	if (rqstp->rq_res.page_len == 0)
    870		rqstp->rq_res.page_base = buf->offset;
    871	rqstp->rq_res.page_len += sd->len;
    872	return sd->len;
    873}
    874
    875static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
    876				    struct splice_desc *sd)
    877{
    878	return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
    879}
    880
    881static u32 nfsd_eof_on_read(struct file *file, loff_t offset, ssize_t len,
    882		size_t expected)
    883{
    884	if (expected != 0 && len == 0)
    885		return 1;
    886	if (offset+len >= i_size_read(file_inode(file)))
    887		return 1;
    888	return 0;
    889}
    890
    891static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
    892			       struct file *file, loff_t offset,
    893			       unsigned long *count, u32 *eof, ssize_t host_err)
    894{
    895	if (host_err >= 0) {
    896		nfsd_stats_io_read_add(fhp->fh_export, host_err);
    897		*eof = nfsd_eof_on_read(file, offset, host_err, *count);
    898		*count = host_err;
    899		fsnotify_access(file);
    900		trace_nfsd_read_io_done(rqstp, fhp, offset, *count);
    901		return 0;
    902	} else {
    903		trace_nfsd_read_err(rqstp, fhp, offset, host_err);
    904		return nfserrno(host_err);
    905	}
    906}
    907
    908__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
    909			struct file *file, loff_t offset, unsigned long *count,
    910			u32 *eof)
    911{
    912	struct splice_desc sd = {
    913		.len		= 0,
    914		.total_len	= *count,
    915		.pos		= offset,
    916		.u.data		= rqstp,
    917	};
    918	ssize_t host_err;
    919
    920	trace_nfsd_read_splice(rqstp, fhp, offset, *count);
    921	rqstp->rq_next_page = rqstp->rq_respages + 1;
    922	host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
    923	return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
    924}
    925
    926__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
    927		  struct file *file, loff_t offset,
    928		  struct kvec *vec, int vlen, unsigned long *count,
    929		  u32 *eof)
    930{
    931	struct iov_iter iter;
    932	loff_t ppos = offset;
    933	ssize_t host_err;
    934
    935	trace_nfsd_read_vector(rqstp, fhp, offset, *count);
    936	iov_iter_kvec(&iter, READ, vec, vlen, *count);
    937	host_err = vfs_iter_read(file, &iter, &ppos, 0);
    938	return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
    939}
    940
    941/*
    942 * Gathered writes: If another process is currently writing to the file,
    943 * there's a high chance this is another nfsd (triggered by a bulk write
    944 * from a client's biod). Rather than syncing the file with each write
    945 * request, we sleep for 10 msec.
    946 *
    947 * I don't know if this roughly approximates C. Juszak's idea of
    948 * gathered writes, but it's a nice and simple solution (IMHO), and it
    949 * seems to work:-)
    950 *
    951 * Note: we do this only in the NFSv2 case, since v3 and higher have a
    952 * better tool (separate unstable writes and commits) for solving this
    953 * problem.
    954 */
    955static int wait_for_concurrent_writes(struct file *file)
    956{
    957	struct inode *inode = file_inode(file);
    958	static ino_t last_ino;
    959	static dev_t last_dev;
    960	int err = 0;
    961
    962	if (atomic_read(&inode->i_writecount) > 1
    963	    || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
    964		dprintk("nfsd: write defer %d\n", task_pid_nr(current));
    965		msleep(10);
    966		dprintk("nfsd: write resume %d\n", task_pid_nr(current));
    967	}
    968
    969	if (inode->i_state & I_DIRTY) {
    970		dprintk("nfsd: write sync %d\n", task_pid_nr(current));
    971		err = vfs_fsync(file, 0);
    972	}
    973	last_ino = inode->i_ino;
    974	last_dev = inode->i_sb->s_dev;
    975	return err;
    976}
    977
/*
 * Write the data described by @vec/@vlen to the file held by @nf,
 * starting at @offset.
 *
 * On entry *@cnt is the number of bytes to write. Once vfs_iter_write()
 * has succeeded it is overwritten with the number of bytes written.
 * If @verf is non-NULL it is filled in by nfsd_copy_write_verifier()
 * before the write is issued.
 *
 * Returns nfs_ok, or the nfserrno() translation of the host error.
 */
__be32
nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
				loff_t offset, struct kvec *vec, int vlen,
				unsigned long *cnt, int stable,
				__be32 *verf)
{
	struct nfsd_net		*nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
	struct file		*file = nf->nf_file;
	struct super_block	*sb = file_inode(file)->i_sb;
	struct svc_export	*exp;
	struct iov_iter		iter;
	errseq_t		since;
	__be32			nfserr;
	int			host_err;
	int			use_wgather;
	loff_t			pos = offset;
	unsigned long		exp_op_flags = 0;
	/* Saved so that PF_LOCAL_THROTTLE can be restored on exit. */
	unsigned int		pflags = current->flags;
	rwf_t			flags = 0;
	bool			restore_flags = false;

	trace_nfsd_write_opened(rqstp, fhp, offset, *cnt);

	/* exp_op_flags stays 0 when the superblock has no export ops. */
	if (sb->s_export_op)
		exp_op_flags = sb->s_export_op->flags;

	if (test_bit(RQ_LOCAL, &rqstp->rq_flags) &&
	    !(exp_op_flags & EXPORT_OP_REMOTE_FS)) {
		/*
		 * We want throttling in balance_dirty_pages()
		 * and shrink_inactive_list() to only consider
		 * the backingdev we are writing to, so that nfs to
		 * localhost doesn't cause nfsd to lock up due to all
		 * the client's dirty pages or its congested queue.
		 */
		current->flags |= PF_LOCAL_THROTTLE;
		restore_flags = true;
	}

	exp = fhp->fh_export;
	/* Write gathering is only considered for NFSv2 requests. */
	use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);

	/* If EX_ISSYNC() is false for this export, downgrade to unstable. */
	if (!EX_ISSYNC(exp))
		stable = NFS_UNSTABLE;

	/*
	 * A stable write without write gathering is issued with RWF_SYNC.
	 * With write gathering the sync is done further down instead.
	 */
	if (stable && !use_wgather)
		flags |= RWF_SYNC;

	iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
	/* Sampled before the write; checked by filemap_check_wb_err() below. */
	since = READ_ONCE(file->f_wb_err);
	if (verf)
		nfsd_copy_write_verifier(verf, nn);
	host_err = vfs_iter_write(file, &iter, &pos, flags);
	if (host_err < 0) {
		/* The write failed: reset the write verifier. */
		nfsd_reset_write_verifier(nn);
		trace_nfsd_writeverf_reset(nn, rqstp, host_err);
		goto out_nfserr;
	}
	*cnt = host_err;
	nfsd_stats_io_write_add(exp, *cnt);
	fsnotify_modify(file);
	/* *cnt and the statistics are already updated if this fails. */
	host_err = filemap_check_wb_err(file->f_mapping, since);
	if (host_err < 0)
		goto out_nfserr;

	if (stable && use_wgather) {
		/* May sleep and fsync; its error becomes the result below. */
		host_err = wait_for_concurrent_writes(file);
		if (host_err < 0) {
			nfsd_reset_write_verifier(nn);
			trace_nfsd_writeverf_reset(nn, rqstp, host_err);
		}
	}

out_nfserr:
	if (host_err >= 0) {
		trace_nfsd_write_io_done(rqstp, fhp, offset, *cnt);
		nfserr = nfs_ok;
	} else {
		trace_nfsd_write_err(rqstp, fhp, offset, host_err);
		nfserr = nfserrno(host_err);
	}
	if (restore_flags)
		current_restore_flags(pflags, PF_LOCAL_THROTTLE);
	return nfserr;
}
   1063
   1064/*
   1065 * Read data from a file. count must contain the requested read count
   1066 * on entry. On return, *count contains the number of bytes actually read.
   1067 * N.B. After this call fhp needs an fh_put
   1068 */
   1069__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
   1070	loff_t offset, struct kvec *vec, int vlen, unsigned long *count,
   1071	u32 *eof)
   1072{
   1073	struct nfsd_file	*nf;
   1074	struct file *file;
   1075	__be32 err;
   1076
   1077	trace_nfsd_read_start(rqstp, fhp, offset, *count);
   1078	err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
   1079	if (err)
   1080		return err;
   1081
   1082	file = nf->nf_file;
   1083	if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
   1084		err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
   1085	else
   1086		err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof);
   1087
   1088	nfsd_file_put(nf);
   1089
   1090	trace_nfsd_read_done(rqstp, fhp, offset, *count);
   1091
   1092	return err;
   1093}
   1094
   1095/*
   1096 * Write data to a file.
   1097 * The stable flag requests synchronous writes.
   1098 * N.B. After this call fhp needs an fh_put
   1099 */
   1100__be32
   1101nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
   1102	   struct kvec *vec, int vlen, unsigned long *cnt, int stable,
   1103	   __be32 *verf)
   1104{
   1105	struct nfsd_file *nf;
   1106	__be32 err;
   1107
   1108	trace_nfsd_write_start(rqstp, fhp, offset, *cnt);
   1109
   1110	err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf);
   1111	if (err)
   1112		goto out;
   1113
   1114	err = nfsd_vfs_write(rqstp, fhp, nf, offset, vec,
   1115			vlen, cnt, stable, verf);
   1116	nfsd_file_put(nf);
   1117out:
   1118	trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
   1119	return err;
   1120}
   1121
   1122/**
   1123 * nfsd_commit - Commit pending writes to stable storage
   1124 * @rqstp: RPC request being processed
   1125 * @fhp: NFS filehandle
   1126 * @offset: raw offset from beginning of file
   1127 * @count: raw count of bytes to sync
   1128 * @verf: filled in with the server's current write verifier
   1129 *
   1130 * Note: we guarantee that data that lies within the range specified
   1131 * by the 'offset' and 'count' parameters will be synced. The server
   1132 * is permitted to sync data that lies outside this range at the
   1133 * same time.
   1134 *
   1135 * Unfortunately we cannot lock the file to make sure we return full WCC
   1136 * data to the client, as locking happens lower down in the filesystem.
   1137 *
   1138 * Return values:
   1139 *   An nfsstat value in network byte order.
   1140 */
   1141__be32
   1142nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
   1143	    u32 count, __be32 *verf)
   1144{
   1145	u64			maxbytes;
   1146	loff_t			start, end;
   1147	struct nfsd_net		*nn;
   1148	struct nfsd_file	*nf;
   1149	__be32			err;
   1150
   1151	err = nfsd_file_acquire(rqstp, fhp,
   1152			NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf);
   1153	if (err)
   1154		goto out;
   1155
   1156	/*
   1157	 * Convert the client-provided (offset, count) range to a
   1158	 * (start, end) range. If the client-provided range falls
   1159	 * outside the maximum file size of the underlying FS,
   1160	 * clamp the sync range appropriately.
   1161	 */
   1162	start = 0;
   1163	end = LLONG_MAX;
   1164	maxbytes = (u64)fhp->fh_dentry->d_sb->s_maxbytes;
   1165	if (offset < maxbytes) {
   1166		start = offset;
   1167		if (count && (offset + count - 1 < maxbytes))
   1168			end = offset + count - 1;
   1169	}
   1170
   1171	nn = net_generic(nf->nf_net, nfsd_net_id);
   1172	if (EX_ISSYNC(fhp->fh_export)) {
   1173		errseq_t since = READ_ONCE(nf->nf_file->f_wb_err);
   1174		int err2;
   1175
   1176		err2 = vfs_fsync_range(nf->nf_file, start, end, 0);
   1177		switch (err2) {
   1178		case 0:
   1179			nfsd_copy_write_verifier(verf, nn);
   1180			err2 = filemap_check_wb_err(nf->nf_file->f_mapping,
   1181						    since);
   1182			err = nfserrno(err2);
   1183			break;
   1184		case -EINVAL:
   1185			err = nfserr_notsupp;
   1186			break;
   1187		default:
   1188			nfsd_reset_write_verifier(nn);
   1189			trace_nfsd_writeverf_reset(nn, rqstp, err2);
   1190			err = nfserrno(err2);
   1191		}
   1192	} else
   1193		nfsd_copy_write_verifier(verf, nn);
   1194
   1195	nfsd_file_put(nf);
   1196out:
   1197	return err;
   1198}
   1199
   1200/**
   1201 * nfsd_create_setattr - Set a created file's attributes
   1202 * @rqstp: RPC transaction being executed
   1203 * @fhp: NFS filehandle of parent directory
   1204 * @resfhp: NFS filehandle of new object
   1205 * @iap: requested attributes of new object
   1206 *
   1207 * Returns nfs_ok on success, or an nfsstat in network byte order.
   1208 */
   1209__be32
   1210nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
   1211		    struct svc_fh *resfhp, struct iattr *iap)
   1212{
   1213	__be32 status;
   1214
   1215	/*
   1216	 * Mode has already been set by file creation.
   1217	 */
   1218	iap->ia_valid &= ~ATTR_MODE;
   1219
   1220	/*
   1221	 * Setting uid/gid works only for root.  Irix appears to
   1222	 * send along the gid on create when it tries to implement
   1223	 * setgid directories via NFS:
   1224	 */
   1225	if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID))
   1226		iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
   1227
   1228	/*
   1229	 * Callers expect new file metadata to be committed even
   1230	 * if the attributes have not changed.
   1231	 */
   1232	if (iap->ia_valid)
   1233		status = nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0);
   1234	else
   1235		status = nfserrno(commit_metadata(resfhp));
   1236
   1237	/*
   1238	 * Transactional filesystems had a chance to commit changes
   1239	 * for both parent and child simultaneously making the
   1240	 * following commit_metadata a noop in many cases.
   1241	 */
   1242	if (!status)
   1243		status = nfserrno(commit_metadata(fhp));
   1244
   1245	/*
   1246	 * Update the new filehandle to pick up the new attributes.
   1247	 */
   1248	if (!status)
   1249		status = fh_update(resfhp);
   1250
   1251	return status;
   1252}
   1253
   1254/* HPUX client sometimes creates a file in mode 000, and sets size to 0.
   1255 * setting size to 0 may fail for some specific file systems by the permission
   1256 * checking which requires WRITE permission but the mode is 000.
   1257 * we ignore the resizing(to 0) on the just new created file, since the size is
   1258 * 0 after file created.
   1259 *
   1260 * call this only after vfs_create() is called.
   1261 * */
   1262static void
   1263nfsd_check_ignore_resizing(struct iattr *iap)
   1264{
   1265	if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
   1266		iap->ia_valid &= ~ATTR_SIZE;
   1267}
   1268
/*
 * Create an object of @type (regular file, directory or special file)
 * on the dentry already held in @resfhp, inside the directory @fhp,
 * then apply @iap through nfsd_create_setattr().
 *
 * @fname and @flen are not referenced in this function.
 *
 * The parent directory should already be locked:
 */
__be32
nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
		char *fname, int flen, struct iattr *iap,
		int type, dev_t rdev, struct svc_fh *resfhp)
{
	struct dentry	*dentry, *dchild;
	struct inode	*dirp;
	__be32		err;
	int		host_err;

	dentry = fhp->fh_dentry;
	dirp = d_inode(dentry);

	/* Local reference on the child; released by the dput() at out. */
	dchild = dget(resfhp->fh_dentry);
	if (!fhp->fh_locked) {
		WARN_ONCE(1, "nfsd_create: parent %pd2 not locked!\n",
				dentry);
		err = nfserr_io;
		goto out;
	}

	err = nfsd_permission(rqstp, fhp->fh_export, dentry, NFSD_MAY_CREATE);
	if (err)
		goto out;

	/* No mode supplied: start from 0, then merge in the file type. */
	if (!(iap->ia_valid & ATTR_MODE))
		iap->ia_mode = 0;
	iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;

	/* The umask is only applied when the directory is not IS_POSIXACL. */
	if (!IS_POSIXACL(dirp))
		iap->ia_mode &= ~current_umask();

	err = 0;
	host_err = 0;
	switch (type) {
	case S_IFREG:
		host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true);
		if (!host_err)
			nfsd_check_ignore_resizing(iap);
		break;
	case S_IFDIR:
		host_err = vfs_mkdir(&init_user_ns, dirp, dchild, iap->ia_mode);
		if (!host_err && unlikely(d_unhashed(dchild))) {
			/*
			 * mkdir succeeded but dchild is unhashed: look the
			 * name up again and switch over to the dentry found.
			 */
			struct dentry *d;
			d = lookup_one_len(dchild->d_name.name,
					   dchild->d_parent,
					   dchild->d_name.len);
			if (IS_ERR(d)) {
				host_err = PTR_ERR(d);
				break;
			}
			if (unlikely(d_is_negative(d))) {
				dput(d);
				err = nfserr_serverfault;
				goto out;
			}
			/* resfhp drops the old dentry and takes its own ref on d. */
			dput(resfhp->fh_dentry);
			resfhp->fh_dentry = dget(d);
			err = fh_update(resfhp);
			/* Our local reference moves to d as well. */
			dput(dchild);
			dchild = d;
			if (err)
				goto out;
		}
		break;
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
		host_err = vfs_mknod(&init_user_ns, dirp, dchild,
				     iap->ia_mode, rdev);
		break;
	default:
		printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
		       type);
		host_err = -EINVAL;
	}
	if (host_err < 0)
		goto out_nfserr;

	err = nfsd_create_setattr(rqstp, fhp, resfhp, iap);

out:
	dput(dchild);
	return err;

out_nfserr:
	err = nfserrno(host_err);
	goto out;
}
   1360
   1361/*
   1362 * Create a filesystem object (regular, directory, special).
   1363 * Note that the parent directory is left locked.
   1364 *
   1365 * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
   1366 */
   1367__be32
   1368nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
   1369		char *fname, int flen, struct iattr *iap,
   1370		int type, dev_t rdev, struct svc_fh *resfhp)
   1371{
   1372	struct dentry	*dentry, *dchild = NULL;
   1373	__be32		err;
   1374	int		host_err;
   1375
   1376	if (isdotent(fname, flen))
   1377		return nfserr_exist;
   1378
   1379	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_NOP);
   1380	if (err)
   1381		return err;
   1382
   1383	dentry = fhp->fh_dentry;
   1384
   1385	host_err = fh_want_write(fhp);
   1386	if (host_err)
   1387		return nfserrno(host_err);
   1388
   1389	fh_lock_nested(fhp, I_MUTEX_PARENT);
   1390	dchild = lookup_one_len(fname, dentry, flen);
   1391	host_err = PTR_ERR(dchild);
   1392	if (IS_ERR(dchild))
   1393		return nfserrno(host_err);
   1394	err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
   1395	/*
   1396	 * We unconditionally drop our ref to dchild as fh_compose will have
   1397	 * already grabbed its own ref for it.
   1398	 */
   1399	dput(dchild);
   1400	if (err)
   1401		return err;
   1402	return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type,
   1403					rdev, resfhp);
   1404}
   1405
   1406/*
   1407 * Read a symlink. On entry, *lenp must contain the maximum path length that
   1408 * fits into the buffer. On return, it contains the true length.
   1409 * N.B. After this call fhp needs an fh_put
   1410 */
   1411__be32
   1412nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
   1413{
   1414	__be32		err;
   1415	const char *link;
   1416	struct path path;
   1417	DEFINE_DELAYED_CALL(done);
   1418	int len;
   1419
   1420	err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP);
   1421	if (unlikely(err))
   1422		return err;
   1423
   1424	path.mnt = fhp->fh_export->ex_path.mnt;
   1425	path.dentry = fhp->fh_dentry;
   1426
   1427	if (unlikely(!d_is_symlink(path.dentry)))
   1428		return nfserr_inval;
   1429
   1430	touch_atime(&path);
   1431
   1432	link = vfs_get_link(path.dentry, &done);
   1433	if (IS_ERR(link))
   1434		return nfserrno(PTR_ERR(link));
   1435
   1436	len = strlen(link);
   1437	if (len < *lenp)
   1438		*lenp = len;
   1439	memcpy(buf, link, *lenp);
   1440	do_delayed_call(&done);
   1441	return 0;
   1442}
   1443
   1444/*
   1445 * Create a symlink and look up its inode
   1446 * N.B. After this call _both_ fhp and resfhp need an fh_put
   1447 */
   1448__be32
   1449nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
   1450				char *fname, int flen,
   1451				char *path,
   1452				struct svc_fh *resfhp)
   1453{
   1454	struct dentry	*dentry, *dnew;
   1455	__be32		err, cerr;
   1456	int		host_err;
   1457
   1458	err = nfserr_noent;
   1459	if (!flen || path[0] == '\0')
   1460		goto out;
   1461	err = nfserr_exist;
   1462	if (isdotent(fname, flen))
   1463		goto out;
   1464
   1465	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
   1466	if (err)
   1467		goto out;
   1468
   1469	host_err = fh_want_write(fhp);
   1470	if (host_err)
   1471		goto out_nfserr;
   1472
   1473	fh_lock(fhp);
   1474	dentry = fhp->fh_dentry;
   1475	dnew = lookup_one_len(fname, dentry, flen);
   1476	host_err = PTR_ERR(dnew);
   1477	if (IS_ERR(dnew))
   1478		goto out_nfserr;
   1479
   1480	host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path);
   1481	err = nfserrno(host_err);
   1482	fh_unlock(fhp);
   1483	if (!err)
   1484		err = nfserrno(commit_metadata(fhp));
   1485
   1486	fh_drop_write(fhp);
   1487
   1488	cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
   1489	dput(dnew);
   1490	if (err==0) err = cerr;
   1491out:
   1492	return err;
   1493
   1494out_nfserr:
   1495	err = nfserrno(host_err);
   1496	goto out;
   1497}
   1498
/*
 * Create a hardlink
 *
 * @ffhp is the directory that receives the new name @name/@len;
 * @tfhp is the existing object being linked to.
 *
 * N.B. After this call _both_ ffhp and tfhp need an fh_put
 */
__be32
nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
				char *name, int len, struct svc_fh *tfhp)
{
	struct dentry	*ddir, *dnew, *dold;
	struct inode	*dirp;
	__be32		err;
	int		host_err;

	err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE);
	if (err)
		goto out;
	err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP);
	if (err)
		goto out;
	/* Argument checks: err is preset to the status of the next test. */
	err = nfserr_isdir;
	if (d_is_dir(tfhp->fh_dentry))
		goto out;
	err = nfserr_perm;
	if (!len)
		goto out;
	err = nfserr_exist;
	if (isdotent(name, len))
		goto out;

	/* Write access is taken on tfhp and dropped again at out_unlock. */
	host_err = fh_want_write(tfhp);
	if (host_err) {
		err = nfserrno(host_err);
		goto out;
	}

	fh_lock_nested(ffhp, I_MUTEX_PARENT);
	ddir = ffhp->fh_dentry;
	dirp = d_inode(ddir);

	dnew = lookup_one_len(name, ddir, len);
	host_err = PTR_ERR(dnew);
	if (IS_ERR(dnew))
		goto out_nfserr;

	dold = tfhp->fh_dentry;

	err = nfserr_noent;
	if (d_really_is_negative(dold))
		goto out_dput;
	host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL);
	/* Unlock before committing; out_unlock calls fh_unlock() again. */
	fh_unlock(ffhp);
	if (!host_err) {
		err = nfserrno(commit_metadata(ffhp));
		if (!err)
			err = nfserrno(commit_metadata(tfhp));
	} else {
		/* For NFSv2 requests -EXDEV is reported as nfserr_acces. */
		if (host_err == -EXDEV && rqstp->rq_vers == 2)
			err = nfserr_acces;
		else
			err = nfserrno(host_err);
	}
out_dput:
	dput(dnew);
out_unlock:
	fh_unlock(ffhp);
	fh_drop_write(tfhp);
out:
	return err;

out_nfserr:
	err = nfserrno(host_err);
	goto out_unlock;
}
   1572
   1573static void
   1574nfsd_close_cached_files(struct dentry *dentry)
   1575{
   1576	struct inode *inode = d_inode(dentry);
   1577
   1578	if (inode && S_ISREG(inode->i_mode))
   1579		nfsd_file_close_inode_sync(inode);
   1580}
   1581
   1582static bool
   1583nfsd_has_cached_files(struct dentry *dentry)
   1584{
   1585	bool		ret = false;
   1586	struct inode *inode = d_inode(dentry);
   1587
   1588	if (inode && S_ISREG(inode->i_mode))
   1589		ret = nfsd_file_is_cached(inode);
   1590	return ret;
   1591}
   1592
/*
 * Rename a file
 *
 * @ffhp/@fname/@flen name the existing entry and @tfhp/@tname/@tlen
 * the new one. Empty names and "."/".." are refused with nfserr_perm.
 *
 * If the target filesystem sets EXPORT_OP_CLOSE_BEFORE_UNLINK and the
 * target still has cached open files, the attempt is abandoned, the
 * cached files are closed with all locks dropped, and the whole
 * operation is retried from the "retry" label.
 *
 * N.B. After this call _both_ ffhp and tfhp need an fh_put
 */
__be32
nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
			    struct svc_fh *tfhp, char *tname, int tlen)
{
	struct dentry	*fdentry, *tdentry, *odentry, *ndentry, *trap;
	struct inode	*fdir, *tdir;
	__be32		err;
	int		host_err;
	bool		close_cached = false;

	err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
	if (err)
		goto out;
	err = fh_verify(rqstp, tfhp, S_IFDIR, NFSD_MAY_CREATE);
	if (err)
		goto out;

	fdentry = ffhp->fh_dentry;
	fdir = d_inode(fdentry);

	tdentry = tfhp->fh_dentry;
	tdir = d_inode(tdentry);

	err = nfserr_perm;
	if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
		goto out;

retry:
	host_err = fh_want_write(ffhp);
	if (host_err) {
		err = nfserrno(host_err);
		goto out;
	}

	/* cannot use fh_lock as we need deadlock protective ordering
	 * so do it by hand */
	trap = lock_rename(tdentry, fdentry);
	ffhp->fh_locked = tfhp->fh_locked = true;
	fh_fill_pre_attrs(ffhp);
	fh_fill_pre_attrs(tfhp);

	odentry = lookup_one_len(fname, fdentry, flen);
	host_err = PTR_ERR(odentry);
	if (IS_ERR(odentry))
		goto out_nfserr;

	host_err = -ENOENT;
	if (d_really_is_negative(odentry))
		goto out_dput_old;
	/* The source must not be the dentry returned by lock_rename(). */
	host_err = -EINVAL;
	if (odentry == trap)
		goto out_dput_old;

	ndentry = lookup_one_len(tname, tdentry, tlen);
	host_err = PTR_ERR(ndentry);
	if (IS_ERR(ndentry))
		goto out_dput_old;
	/* Nor may the target be that dentry. */
	host_err = -ENOTEMPTY;
	if (ndentry == trap)
		goto out_dput_new;

	/* Both filehandles must belong to the same export path. */
	host_err = -EXDEV;
	if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
		goto out_dput_new;
	if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
		goto out_dput_new;

	if ((ndentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) &&
	    nfsd_has_cached_files(ndentry)) {
		/*
		 * Skip out_dput_new on purpose: ndentry is still needed
		 * by, and released in, the close_cached block below.
		 * host_err is still -EXDEV here, but err is recomputed
		 * by the retry.
		 */
		close_cached = true;
		goto out_dput_old;
	} else {
		struct renamedata rd = {
			.old_mnt_userns	= &init_user_ns,
			.old_dir	= fdir,
			.old_dentry	= odentry,
			.new_mnt_userns	= &init_user_ns,
			.new_dir	= tdir,
			.new_dentry	= ndentry,
		};
		host_err = vfs_rename(&rd);
		if (!host_err) {
			host_err = commit_metadata(tfhp);
			if (!host_err)
				host_err = commit_metadata(ffhp);
		}
	}
 out_dput_new:
	dput(ndentry);
 out_dput_old:
	dput(odentry);
 out_nfserr:
	err = nfserrno(host_err);
	/*
	 * We cannot rely on fh_unlock on the two filehandles,
	 * as that would do the wrong thing if the two directories
	 * were the same, so again we do it by hand.
	 */
	if (!close_cached) {
		fh_fill_post_attrs(ffhp);
		fh_fill_post_attrs(tfhp);
	}
	unlock_rename(tdentry, fdentry);
	ffhp->fh_locked = tfhp->fh_locked = false;
	fh_drop_write(ffhp);

	/*
	 * If the target dentry has cached open files, then we need to try to
	 * close them prior to doing the rename. Flushing delayed fput
	 * shouldn't be done with locks held however, so we delay it until this
	 * point and then reattempt the whole shebang.
	 */
	if (close_cached) {
		close_cached = false;
		nfsd_close_cached_files(ndentry);
		dput(ndentry);
		goto retry;
	}
out:
	return err;
}
   1718
/*
 * Unlink a file or directory
 *
 * @type selects between vfs_rmdir() (S_IFDIR) and vfs_unlink()
 * (anything else). A @type of 0 means "use the type of the object
 * found under @fname".
 *
 * N.B. After this call fhp needs an fh_put
 */
__be32
nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
				char *fname, int flen)
{
	struct dentry	*dentry, *rdentry;
	struct inode	*dirp;
	struct inode	*rinode;
	__be32		err;
	int		host_err;

	err = nfserr_acces;
	if (!flen || isdotent(fname, flen))
		goto out;
	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_REMOVE);
	if (err)
		goto out;

	host_err = fh_want_write(fhp);
	if (host_err)
		goto out_nfserr;

	fh_lock_nested(fhp, I_MUTEX_PARENT);
	dentry = fhp->fh_dentry;
	dirp = d_inode(dentry);

	/*
	 * NOTE(review): the two "goto out_drop_write" paths below do not
	 * call fh_unlock(); presumably the caller's fh_put() releases the
	 * lock -- confirm.
	 */
	rdentry = lookup_one_len(fname, dentry, flen);
	host_err = PTR_ERR(rdentry);
	if (IS_ERR(rdentry))
		goto out_drop_write;

	if (d_really_is_negative(rdentry)) {
		dput(rdentry);
		host_err = -ENOENT;
		goto out_drop_write;
	}
	/* Extra inode reference, dropped by the iput() further down. */
	rinode = d_inode(rdentry);
	ihold(rinode);

	/* No type given by the caller: take it from the inode's mode. */
	if (!type)
		type = d_inode(rdentry)->i_mode & S_IFMT;

	if (type != S_IFDIR) {
		if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK)
			nfsd_close_cached_files(rdentry);
		host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL);
	} else {
		host_err = vfs_rmdir(&init_user_ns, dirp, rdentry);
	}

	fh_unlock(fhp);
	if (!host_err)
		host_err = commit_metadata(fhp);
	dput(rdentry);
	iput(rinode);    /* truncate the inode here */

out_drop_write:
	fh_drop_write(fhp);
out_nfserr:
	if (host_err == -EBUSY) {
		/* name is mounted-on. There is no perfect
		 * error status.
		 */
		if (nfsd_v4client(rqstp))
			err = nfserr_file_open;
		else
			err = nfserr_acces;
	} else {
		err = nfserrno(host_err);
	}
out:
	return err;
}
   1795
/*
 * We do this buffering because we must not call back into the file
 * system's ->lookup() method from the filldir callback. That may well
 * deadlock a number of file systems.
 *
 * This is based heavily on the implementation of same in XFS.
 *
 * One record per directory entry, as stored by nfsd_buffered_filldir().
 * Records are packed back to back, each padded to a multiple of
 * sizeof(u64).
 */
struct buffered_dirent {
	u64		ino;		/* ino passed to the filldir callback */
	loff_t		offset;		/* offset passed to the callback */
	int		namlen;		/* number of bytes stored in name[] */
	unsigned int	d_type;		/* d_type passed to the callback */
	char		name[];		/* namlen bytes, no NUL is appended */
};
   1810
/*
 * State shared between nfsd_buffered_readdir() and its filldir
 * callback, which recovers it from @ctx with container_of().
 */
struct readdir_data {
	struct dir_context ctx;		/* handed to iterate_dir() */
	char		*dirent;	/* one page of buffered_dirent records */
	size_t		used;		/* bytes of dirent filled so far */
	int		full;		/* set when an entry did not fit */
};
   1817
   1818static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
   1819				 int namlen, loff_t offset, u64 ino,
   1820				 unsigned int d_type)
   1821{
   1822	struct readdir_data *buf =
   1823		container_of(ctx, struct readdir_data, ctx);
   1824	struct buffered_dirent *de = (void *)(buf->dirent + buf->used);
   1825	unsigned int reclen;
   1826
   1827	reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64));
   1828	if (buf->used + reclen > PAGE_SIZE) {
   1829		buf->full = 1;
   1830		return -EINVAL;
   1831	}
   1832
   1833	de->namlen = namlen;
   1834	de->offset = offset;
   1835	de->ino = ino;
   1836	de->d_type = d_type;
   1837	memcpy(de->name, name, namlen);
   1838	buf->used += reclen;
   1839
   1840	return 0;
   1841}
   1842
/*
 * Read directory entries from @file in page-sized batches and feed
 * them to @func one at a time.
 *
 * Each pass fills one page through iterate_dir() with
 * nfsd_buffered_filldir(), then walks the buffered records and calls
 * @func for each. The walk stops early when @func returns non-zero or
 * when cdp->err is no longer nfs_ok.
 *
 * On return without a host error, *@offsetp is set to the offset of
 * the entry at which processing stopped, or to the file position after
 * the last complete batch, and cdp->err is returned. On a host error
 * *@offsetp is left alone and the nfserrno() translation is returned.
 */
static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp,
				    nfsd_filldir_t func, struct readdir_cd *cdp,
				    loff_t *offsetp)
{
	struct buffered_dirent *de;
	int host_err;
	int size;
	loff_t offset;
	struct readdir_data buf = {
		.ctx.actor = nfsd_buffered_filldir,
		.dirent = (void *)__get_free_page(GFP_KERNEL)
	};

	if (!buf.dirent)
		return nfserrno(-ENOMEM);

	offset = *offsetp;

	while (1) {
		unsigned int reclen;

		cdp->err = nfserr_eof; /* will be cleared on successful read */
		buf.used = 0;
		buf.full = 0;

		host_err = iterate_dir(file, &buf.ctx);
		/* If the page filled up, disregard iterate_dir()'s result. */
		if (buf.full)
			host_err = 0;

		if (host_err < 0)
			break;

		size = buf.used;

		/* Nothing was buffered: no more entries. */
		if (!size)
			break;

		de = (struct buffered_dirent *)buf.dirent;
		while (size > 0) {
			/* Recorded first, so it is kept if we stop at this entry. */
			offset = de->offset;

			if (func(cdp, de->name, de->namlen, de->offset,
				 de->ino, de->d_type))
				break;

			if (cdp->err != nfs_ok)
				break;

			trace_nfsd_dirent(fhp, de->ino, de->name, de->namlen);

			/* Same record size computation as in the filldir callback. */
			reclen = ALIGN(sizeof(*de) + de->namlen,
				       sizeof(u64));
			size -= reclen;
			de = (struct buffered_dirent *)((char *)de + reclen);
		}
		if (size > 0) /* We bailed out early */
			break;

		/* Whole batch consumed: continue from the current file position. */
		offset = vfs_llseek(file, 0, SEEK_CUR);
	}

	free_page((unsigned long)(buf.dirent));

	if (host_err)
		return nfserrno(host_err);

	*offsetp = offset;
	return cdp->err;
}
   1912
   1913/*
   1914 * Read entries from a directory.
   1915 * The  NFSv3/4 verifier we ignore for now.
   1916 */
   1917__be32
   1918nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, 
   1919	     struct readdir_cd *cdp, nfsd_filldir_t func)
   1920{
   1921	__be32		err;
   1922	struct file	*file;
   1923	loff_t		offset = *offsetp;
   1924	int             may_flags = NFSD_MAY_READ;
   1925
   1926	/* NFSv2 only supports 32 bit cookies */
   1927	if (rqstp->rq_vers > 2)
   1928		may_flags |= NFSD_MAY_64BIT_COOKIE;
   1929
   1930	err = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &file);
   1931	if (err)
   1932		goto out;
   1933
   1934	offset = vfs_llseek(file, offset, SEEK_SET);
   1935	if (offset < 0) {
   1936		err = nfserrno((int)offset);
   1937		goto out_close;
   1938	}
   1939
   1940	err = nfsd_buffered_readdir(file, fhp, func, cdp, offsetp);
   1941
   1942	if (err == nfserr_eof || err == nfserr_toosmall)
   1943		err = nfs_ok; /* can still be found in ->err */
   1944out_close:
   1945	fput(file);
   1946out:
   1947	return err;
   1948}
   1949
   1950/*
   1951 * Get file system stats
   1952 * N.B. After this call fhp needs an fh_put
   1953 */
   1954__be32
   1955nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
   1956{
   1957	__be32 err;
   1958
   1959	err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
   1960	if (!err) {
   1961		struct path path = {
   1962			.mnt	= fhp->fh_export->ex_path.mnt,
   1963			.dentry	= fhp->fh_dentry,
   1964		};
   1965		if (vfs_statfs(&path, stat))
   1966			err = nfserr_io;
   1967	}
   1968	return err;
   1969}
   1970
   1971static int exp_rdonly(struct svc_rqst *rqstp, struct svc_export *exp)
   1972{
   1973	return nfsexp_flags(rqstp, exp) & NFSEXP_READONLY;
   1974}
   1975
   1976#ifdef CONFIG_NFSD_V4
   1977/*
   1978 * Helper function to translate error numbers. In the case of xattr operations,
   1979 * some error codes need to be translated outside of the standard translations.
   1980 *
   1981 * ENODATA needs to be translated to nfserr_noxattr.
   1982 * E2BIG to nfserr_xattr2big.
   1983 *
   1984 * Additionally, vfs_listxattr can return -ERANGE. This means that the
   1985 * file has too many extended attributes to retrieve inside an
   1986 * XATTR_LIST_MAX sized buffer. This is a bug in the xattr implementation:
   1987 * filesystems will allow the adding of extended attributes until they hit
   1988 * their own internal limit. This limit may be larger than XATTR_LIST_MAX.
   1989 * So, at that point, the attributes are present and valid, but can't
   1990 * be retrieved using listxattr, since the upper level xattr code enforces
   1991 * the XATTR_LIST_MAX limit.
   1992 *
   1993 * This bug means that we need to deal with listxattr returning -ERANGE. The
   1994 * best mapping is to return TOOSMALL.
   1995 */
   1996static __be32
   1997nfsd_xattr_errno(int err)
   1998{
   1999	switch (err) {
   2000	case -ENODATA:
   2001		return nfserr_noxattr;
   2002	case -E2BIG:
   2003		return nfserr_xattr2big;
   2004	case -ERANGE:
   2005		return nfserr_toosmall;
   2006	}
   2007	return nfserrno(err);
   2008}
   2009
   2010/*
   2011 * Retrieve the specified user extended attribute. To avoid always
   2012 * having to allocate the maximum size (since we are not getting
   2013 * a maximum size from the RPC), do a probe + alloc. Hold a reader
   2014 * lock on i_rwsem to prevent the extended attribute from changing
   2015 * size while we're doing this.
   2016 */
   2017__be32
   2018nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
   2019	      void **bufp, int *lenp)
   2020{
   2021	ssize_t len;
   2022	__be32 err;
   2023	char *buf;
   2024	struct inode *inode;
   2025	struct dentry *dentry;
   2026
   2027	err = fh_verify(rqstp, fhp, 0, NFSD_MAY_READ);
   2028	if (err)
   2029		return err;
   2030
   2031	err = nfs_ok;
   2032	dentry = fhp->fh_dentry;
   2033	inode = d_inode(dentry);
   2034
   2035	inode_lock_shared(inode);
   2036
   2037	len = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0);
   2038
   2039	/*
   2040	 * Zero-length attribute, just return.
   2041	 */
   2042	if (len == 0) {
   2043		*bufp = NULL;
   2044		*lenp = 0;
   2045		goto out;
   2046	}
   2047
   2048	if (len < 0) {
   2049		err = nfsd_xattr_errno(len);
   2050		goto out;
   2051	}
   2052
   2053	if (len > *lenp) {
   2054		err = nfserr_toosmall;
   2055		goto out;
   2056	}
   2057
   2058	buf = kvmalloc(len, GFP_KERNEL | GFP_NOFS);
   2059	if (buf == NULL) {
   2060		err = nfserr_jukebox;
   2061		goto out;
   2062	}
   2063
   2064	len = vfs_getxattr(&init_user_ns, dentry, name, buf, len);
   2065	if (len <= 0) {
   2066		kvfree(buf);
   2067		buf = NULL;
   2068		err = nfsd_xattr_errno(len);
   2069	}
   2070
   2071	*lenp = len;
   2072	*bufp = buf;
   2073
   2074out:
   2075	inode_unlock_shared(inode);
   2076
   2077	return err;
   2078}
   2079
   2080/*
   2081 * Retrieve the xattr names. Since we can't know how many are
   2082 * user extended attributes, we must get all attributes here,
   2083 * and have the XDR encode filter out the "user." ones.
   2084 *
   2085 * While this could always just allocate an XATTR_LIST_MAX
   2086 * buffer, that's a waste, so do a probe + allocate. To
   2087 * avoid any changes between the probe and allocate, wrap
   2088 * this in inode_lock.
   2089 */
   2090__be32
   2091nfsd_listxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char **bufp,
   2092	       int *lenp)
   2093{
   2094	ssize_t len;
   2095	__be32 err;
   2096	char *buf;
   2097	struct inode *inode;
   2098	struct dentry *dentry;
   2099
   2100	err = fh_verify(rqstp, fhp, 0, NFSD_MAY_READ);
   2101	if (err)
   2102		return err;
   2103
   2104	dentry = fhp->fh_dentry;
   2105	inode = d_inode(dentry);
   2106	*lenp = 0;
   2107
   2108	inode_lock_shared(inode);
   2109
   2110	len = vfs_listxattr(dentry, NULL, 0);
   2111	if (len <= 0) {
   2112		err = nfsd_xattr_errno(len);
   2113		goto out;
   2114	}
   2115
   2116	if (len > XATTR_LIST_MAX) {
   2117		err = nfserr_xattr2big;
   2118		goto out;
   2119	}
   2120
   2121	/*
   2122	 * We're holding i_rwsem - use GFP_NOFS.
   2123	 */
   2124	buf = kvmalloc(len, GFP_KERNEL | GFP_NOFS);
   2125	if (buf == NULL) {
   2126		err = nfserr_jukebox;
   2127		goto out;
   2128	}
   2129
   2130	len = vfs_listxattr(dentry, buf, len);
   2131	if (len <= 0) {
   2132		kvfree(buf);
   2133		err = nfsd_xattr_errno(len);
   2134		goto out;
   2135	}
   2136
   2137	*lenp = len;
   2138	*bufp = buf;
   2139
   2140	err = nfs_ok;
   2141out:
   2142	inode_unlock_shared(inode);
   2143
   2144	return err;
   2145}
   2146
   2147/*
   2148 * Removexattr and setxattr need to call fh_lock to both lock the inode
   2149 * and set the change attribute. Since the top-level vfs_removexattr
   2150 * and vfs_setxattr calls already do their own inode_lock calls, call
   2151 * the _locked variant. Pass in a NULL pointer for delegated_inode,
   2152 * and let the client deal with NFS4ERR_DELAY (same as with e.g.
   2153 * setattr and remove).
   2154 */
   2155__be32
   2156nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name)
   2157{
   2158	__be32 err;
   2159	int ret;
   2160
   2161	err = fh_verify(rqstp, fhp, 0, NFSD_MAY_WRITE);
   2162	if (err)
   2163		return err;
   2164
   2165	ret = fh_want_write(fhp);
   2166	if (ret)
   2167		return nfserrno(ret);
   2168
   2169	fh_lock(fhp);
   2170
   2171	ret = __vfs_removexattr_locked(&init_user_ns, fhp->fh_dentry,
   2172				       name, NULL);
   2173
   2174	fh_unlock(fhp);
   2175	fh_drop_write(fhp);
   2176
   2177	return nfsd_xattr_errno(ret);
   2178}
   2179
   2180__be32
   2181nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
   2182	      void *buf, u32 len, u32 flags)
   2183{
   2184	__be32 err;
   2185	int ret;
   2186
   2187	err = fh_verify(rqstp, fhp, 0, NFSD_MAY_WRITE);
   2188	if (err)
   2189		return err;
   2190
   2191	ret = fh_want_write(fhp);
   2192	if (ret)
   2193		return nfserrno(ret);
   2194	fh_lock(fhp);
   2195
   2196	ret = __vfs_setxattr_locked(&init_user_ns, fhp->fh_dentry, name, buf,
   2197				    len, flags, NULL);
   2198
   2199	fh_unlock(fhp);
   2200	fh_drop_write(fhp);
   2201
   2202	return nfsd_xattr_errno(ret);
   2203}
   2204#endif
   2205
   2206/*
   2207 * Check for a user's access permissions to this inode.
   2208 */
   2209__be32
   2210nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
   2211					struct dentry *dentry, int acc)
   2212{
   2213	struct inode	*inode = d_inode(dentry);
   2214	int		err;
   2215
   2216	if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP)
   2217		return 0;
   2218#if 0
   2219	dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
   2220		acc,
   2221		(acc & NFSD_MAY_READ)?	" read"  : "",
   2222		(acc & NFSD_MAY_WRITE)?	" write" : "",
   2223		(acc & NFSD_MAY_EXEC)?	" exec"  : "",
   2224		(acc & NFSD_MAY_SATTR)?	" sattr" : "",
   2225		(acc & NFSD_MAY_TRUNC)?	" trunc" : "",
   2226		(acc & NFSD_MAY_LOCK)?	" lock"  : "",
   2227		(acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "",
   2228		inode->i_mode,
   2229		IS_IMMUTABLE(inode)?	" immut" : "",
   2230		IS_APPEND(inode)?	" append" : "",
   2231		__mnt_is_readonly(exp->ex_path.mnt)?	" ro" : "");
   2232	dprintk("      owner %d/%d user %d/%d\n",
   2233		inode->i_uid, inode->i_gid, current_fsuid(), current_fsgid());
   2234#endif
   2235
   2236	/* Normally we reject any write/sattr etc access on a read-only file
   2237	 * system.  But if it is IRIX doing check on write-access for a 
   2238	 * device special file, we ignore rofs.
   2239	 */
   2240	if (!(acc & NFSD_MAY_LOCAL_ACCESS))
   2241		if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) {
   2242			if (exp_rdonly(rqstp, exp) ||
   2243			    __mnt_is_readonly(exp->ex_path.mnt))
   2244				return nfserr_rofs;
   2245			if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode))
   2246				return nfserr_perm;
   2247		}
   2248	if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode))
   2249		return nfserr_perm;
   2250
   2251	if (acc & NFSD_MAY_LOCK) {
   2252		/* If we cannot rely on authentication in NLM requests,
   2253		 * just allow locks, otherwise require read permission, or
   2254		 * ownership
   2255		 */
   2256		if (exp->ex_flags & NFSEXP_NOAUTHNLM)
   2257			return 0;
   2258		else
   2259			acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE;
   2260	}
   2261	/*
   2262	 * The file owner always gets access permission for accesses that
   2263	 * would normally be checked at open time. This is to make
   2264	 * file access work even when the client has done a fchmod(fd, 0).
   2265	 *
   2266	 * However, `cp foo bar' should fail nevertheless when bar is
   2267	 * readonly. A sensible way to do this might be to reject all
   2268	 * attempts to truncate a read-only file, because a creat() call
   2269	 * always implies file truncation.
   2270	 * ... but this isn't really fair.  A process may reasonably call
   2271	 * ftruncate on an open file descriptor on a file with perm 000.
   2272	 * We must trust the client to do permission checking - using "ACCESS"
   2273	 * with NFSv3.
   2274	 */
   2275	if ((acc & NFSD_MAY_OWNER_OVERRIDE) &&
   2276	    uid_eq(inode->i_uid, current_fsuid()))
   2277		return 0;
   2278
   2279	/* This assumes  NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */
   2280	err = inode_permission(&init_user_ns, inode,
   2281			       acc & (MAY_READ | MAY_WRITE | MAY_EXEC));
   2282
   2283	/* Allow read access to binaries even when mode 111 */
   2284	if (err == -EACCES && S_ISREG(inode->i_mode) &&
   2285	     (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) ||
   2286	      acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC)))
   2287		err = inode_permission(&init_user_ns, inode, MAY_EXEC);
   2288
   2289	return err? nfserrno(err) : 0;
   2290}