cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

namei.c (138468B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 *  linux/fs/namei.c
      4 *
      5 *  Copyright (C) 1991, 1992  Linus Torvalds
      6 */
      7
      8/*
      9 * Some corrections by tytso.
     10 */
     11
     12/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
     13 * lookup logic.
     14 */
     15/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
     16 */
     17
     18#include <linux/init.h>
     19#include <linux/export.h>
     20#include <linux/kernel.h>
     21#include <linux/slab.h>
     22#include <linux/fs.h>
     23#include <linux/namei.h>
     24#include <linux/pagemap.h>
     25#include <linux/sched/mm.h>
     26#include <linux/fsnotify.h>
     27#include <linux/personality.h>
     28#include <linux/security.h>
     29#include <linux/ima.h>
     30#include <linux/syscalls.h>
     31#include <linux/mount.h>
     32#include <linux/audit.h>
     33#include <linux/capability.h>
     34#include <linux/file.h>
     35#include <linux/fcntl.h>
     36#include <linux/device_cgroup.h>
     37#include <linux/fs_struct.h>
     38#include <linux/posix_acl.h>
     39#include <linux/hash.h>
     40#include <linux/bitops.h>
     41#include <linux/init_task.h>
     42#include <linux/uaccess.h>
     43
     44#include "internal.h"
     45#include "mount.h"
     46
     47/* [Feb-1997 T. Schoebel-Theuer]
     48 * Fundamental changes in the pathname lookup mechanisms (namei)
     49 * were necessary because of omirr.  The reason is that omirr needs
     50 * to know the _real_ pathname, not the user-supplied one, in case
     51 * of symlinks (and also when transname replacements occur).
     52 *
     53 * The new code replaces the old recursive symlink resolution with
     54 * an iterative one (in case of non-nested symlink chains).  It does
     55 * this with calls to <fs>_follow_link().
     56 * As a side effect, dir_namei(), _namei() and follow_link() are now 
     57 * replaced with a single function lookup_dentry() that can handle all 
     58 * the special cases of the former code.
     59 *
     60 * With the new dcache, the pathname is stored at each inode, at least as
     61 * long as the refcount of the inode is positive.  As a side effect, the
     62 * size of the dcache depends on the inode cache and thus is dynamic.
     63 *
     64 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
     65 * resolution to correspond with current state of the code.
     66 *
     67 * Note that the symlink resolution is not *completely* iterative.
     68 * There is still a significant amount of tail- and mid- recursion in
     69 * the algorithm.  Also, note that <fs>_readlink() is not used in
     70 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
     71 * may return different results than <fs>_follow_link().  Many virtual
     72 * filesystems (including /proc) exhibit this behavior.
     73 */
     74
     75/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
     76 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
     77 * and the name already exists in form of a symlink, try to create the new
     78 * name indicated by the symlink. The old code always complained that the
     79 * name already exists, due to not following the symlink even if its target
     80 * is nonexistent.  The new semantics affects also mknod() and link() when
     81 * the name is a symlink pointing to a non-existent name.
     82 *
     83 * I don't know which semantics is the right one, since I have no access
     84 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
     85 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
     86 * "old" one. Personally, I think the new semantics is much more logical.
     87 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
     88 * file does succeed in both HP-UX and SunOS, but not in Solaris
     89 * and in the old Linux semantics.
     90 */
     91
     92/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
     93 * semantics.  See the comments in "open_namei" and "do_link" below.
     94 *
     95 * [10-Sep-98 Alan Modra] Another symlink change.
     96 */
     97
     98/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
     99 *	inside the path - always follow.
    100 *	in the last component in creation/removal/renaming - never follow.
    101 *	if LOOKUP_FOLLOW passed - follow.
    102 *	if the pathname has trailing slashes - follow.
    103 *	otherwise - don't follow.
    104 * (applied in that order).
    105 *
    106 * [Jun 2000 AV] Inconsistent behaviour of open() in the case of flags==O_CREAT
    107 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
    108 * During the 2.4 we need to fix the userland stuff depending on it -
    109 * hopefully we will be able to get rid of that wart in 2.5. So far only
    110 * XEmacs seems to be relying on it...
    111 */
    112/*
    113 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
    114 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
    115 * any extra contention...
    116 */
    117
    118/* In order to reduce some races, while at the same time doing additional
    119 * checking and hopefully speeding things up, we copy filenames to the
    120 * kernel data space before using them.
    121 *
    122 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
    123 * PATH_MAX includes the nul terminator --RR.
    124 */
    125
    126#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
    127
    128struct filename *
    129getname_flags(const char __user *filename, int flags, int *empty)
    130{
    131	struct filename *result;
    132	char *kname;
    133	int len;
    134
    135	result = audit_reusename(filename);
    136	if (result)
    137		return result;
    138
    139	result = __getname();
    140	if (unlikely(!result))
    141		return ERR_PTR(-ENOMEM);
    142
    143	/*
    144	 * First, try to embed the struct filename inside the names_cache
    145	 * allocation
    146	 */
    147	kname = (char *)result->iname;
    148	result->name = kname;
    149
    150	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
    151	if (unlikely(len < 0)) {
    152		__putname(result);
    153		return ERR_PTR(len);
    154	}
    155
    156	/*
    157	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
    158	 * separate struct filename so we can dedicate the entire
    159	 * names_cache allocation for the pathname, and re-do the copy from
    160	 * userland.
    161	 */
    162	if (unlikely(len == EMBEDDED_NAME_MAX)) {
    163		const size_t size = offsetof(struct filename, iname[1]);
    164		kname = (char *)result;
    165
    166		/*
    167		 * size is chosen that way we to guarantee that
    168		 * result->iname[0] is within the same object and that
    169		 * kname can't be equal to result->iname, no matter what.
    170		 */
    171		result = kzalloc(size, GFP_KERNEL);
    172		if (unlikely(!result)) {
    173			__putname(kname);
    174			return ERR_PTR(-ENOMEM);
    175		}
    176		result->name = kname;
    177		len = strncpy_from_user(kname, filename, PATH_MAX);
    178		if (unlikely(len < 0)) {
    179			__putname(kname);
    180			kfree(result);
    181			return ERR_PTR(len);
    182		}
    183		if (unlikely(len == PATH_MAX)) {
    184			__putname(kname);
    185			kfree(result);
    186			return ERR_PTR(-ENAMETOOLONG);
    187		}
    188	}
    189
    190	result->refcnt = 1;
    191	/* The empty path is special. */
    192	if (unlikely(!len)) {
    193		if (empty)
    194			*empty = 1;
    195		if (!(flags & LOOKUP_EMPTY)) {
    196			putname(result);
    197			return ERR_PTR(-ENOENT);
    198		}
    199	}
    200
    201	result->uptr = filename;
    202	result->aname = NULL;
    203	audit_getname(result);
    204	return result;
    205}
    206
    207struct filename *
    208getname_uflags(const char __user *filename, int uflags)
    209{
    210	int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
    211
    212	return getname_flags(filename, flags, NULL);
    213}
    214
    215struct filename *
    216getname(const char __user * filename)
    217{
    218	return getname_flags(filename, 0, NULL);
    219}
    220
    221struct filename *
    222getname_kernel(const char * filename)
    223{
    224	struct filename *result;
    225	int len = strlen(filename) + 1;
    226
    227	result = __getname();
    228	if (unlikely(!result))
    229		return ERR_PTR(-ENOMEM);
    230
    231	if (len <= EMBEDDED_NAME_MAX) {
    232		result->name = (char *)result->iname;
    233	} else if (len <= PATH_MAX) {
    234		const size_t size = offsetof(struct filename, iname[1]);
    235		struct filename *tmp;
    236
    237		tmp = kmalloc(size, GFP_KERNEL);
    238		if (unlikely(!tmp)) {
    239			__putname(result);
    240			return ERR_PTR(-ENOMEM);
    241		}
    242		tmp->name = (char *)result;
    243		result = tmp;
    244	} else {
    245		__putname(result);
    246		return ERR_PTR(-ENAMETOOLONG);
    247	}
    248	memcpy((char *)result->name, filename, len);
    249	result->uptr = NULL;
    250	result->aname = NULL;
    251	result->refcnt = 1;
    252	audit_getname(result);
    253
    254	return result;
    255}
    256
    257void putname(struct filename *name)
    258{
    259	if (IS_ERR(name))
    260		return;
    261
    262	BUG_ON(name->refcnt <= 0);
    263
    264	if (--name->refcnt > 0)
    265		return;
    266
    267	if (name->name != name->iname) {
    268		__putname(name->name);
    269		kfree(name);
    270	} else
    271		__putname(name);
    272}
    273
    274/**
    275 * check_acl - perform ACL permission checking
    276 * @mnt_userns:	user namespace of the mount the inode was found from
    277 * @inode:	inode to check permissions on
    278 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
    279 *
    280 * This function performs the ACL permission checking. Since this function
    281 * retrieve POSIX acls it needs to know whether it is called from a blocking or
    282 * non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
    283 *
    284 * If the inode has been found through an idmapped mount the user namespace of
    285 * the vfsmount must be passed through @mnt_userns. This function will then take
    286 * care to map the inode according to @mnt_userns before checking permissions.
    287 * On non-idmapped mounts or if permission checking is to be performed on the
    288 * raw inode simply passs init_user_ns.
    289 */
    290static int check_acl(struct user_namespace *mnt_userns,
    291		     struct inode *inode, int mask)
    292{
    293#ifdef CONFIG_FS_POSIX_ACL
    294	struct posix_acl *acl;
    295
    296	if (mask & MAY_NOT_BLOCK) {
    297		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
    298	        if (!acl)
    299	                return -EAGAIN;
    300		/* no ->get_acl() calls in RCU mode... */
    301		if (is_uncached_acl(acl))
    302			return -ECHILD;
    303	        return posix_acl_permission(mnt_userns, inode, acl, mask);
    304	}
    305
    306	acl = get_acl(inode, ACL_TYPE_ACCESS);
    307	if (IS_ERR(acl))
    308		return PTR_ERR(acl);
    309	if (acl) {
    310	        int error = posix_acl_permission(mnt_userns, inode, acl, mask);
    311	        posix_acl_release(acl);
    312	        return error;
    313	}
    314#endif
    315
    316	return -EAGAIN;
    317}
    318
    319/**
    320 * acl_permission_check - perform basic UNIX permission checking
    321 * @mnt_userns:	user namespace of the mount the inode was found from
    322 * @inode:	inode to check permissions on
    323 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
    324 *
    325 * This function performs the basic UNIX permission checking. Since this
    326 * function may retrieve POSIX acls it needs to know whether it is called from a
    327 * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
    328 *
    329 * If the inode has been found through an idmapped mount the user namespace of
    330 * the vfsmount must be passed through @mnt_userns. This function will then take
    331 * care to map the inode according to @mnt_userns before checking permissions.
    332 * On non-idmapped mounts or if permission checking is to be performed on the
    333 * raw inode simply passs init_user_ns.
    334 */
    335static int acl_permission_check(struct user_namespace *mnt_userns,
    336				struct inode *inode, int mask)
    337{
    338	unsigned int mode = inode->i_mode;
    339	kuid_t i_uid;
    340
    341	/* Are we the owner? If so, ACL's don't matter */
    342	i_uid = i_uid_into_mnt(mnt_userns, inode);
    343	if (likely(uid_eq(current_fsuid(), i_uid))) {
    344		mask &= 7;
    345		mode >>= 6;
    346		return (mask & ~mode) ? -EACCES : 0;
    347	}
    348
    349	/* Do we have ACL's? */
    350	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
    351		int error = check_acl(mnt_userns, inode, mask);
    352		if (error != -EAGAIN)
    353			return error;
    354	}
    355
    356	/* Only RWX matters for group/other mode bits */
    357	mask &= 7;
    358
    359	/*
    360	 * Are the group permissions different from
    361	 * the other permissions in the bits we care
    362	 * about? Need to check group ownership if so.
    363	 */
    364	if (mask & (mode ^ (mode >> 3))) {
    365		kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
    366		if (in_group_p(kgid))
    367			mode >>= 3;
    368	}
    369
    370	/* Bits in 'mode' clear that we require? */
    371	return (mask & ~mode) ? -EACCES : 0;
    372}
    373
    374/**
    375 * generic_permission -  check for access rights on a Posix-like filesystem
    376 * @mnt_userns:	user namespace of the mount the inode was found from
    377 * @inode:	inode to check access rights for
    378 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
    379 *		%MAY_NOT_BLOCK ...)
    380 *
    381 * Used to check for read/write/execute permissions on a file.
    382 * We use "fsuid" for this, letting us set arbitrary permissions
    383 * for filesystem access without changing the "normal" uids which
    384 * are used for other things.
    385 *
    386 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
    387 * request cannot be satisfied (eg. requires blocking or too much complexity).
    388 * It would then be called again in ref-walk mode.
    389 *
    390 * If the inode has been found through an idmapped mount the user namespace of
    391 * the vfsmount must be passed through @mnt_userns. This function will then take
    392 * care to map the inode according to @mnt_userns before checking permissions.
    393 * On non-idmapped mounts or if permission checking is to be performed on the
    394 * raw inode simply passs init_user_ns.
    395 */
    396int generic_permission(struct user_namespace *mnt_userns, struct inode *inode,
    397		       int mask)
    398{
    399	int ret;
    400
    401	/*
    402	 * Do the basic permission checks.
    403	 */
    404	ret = acl_permission_check(mnt_userns, inode, mask);
    405	if (ret != -EACCES)
    406		return ret;
    407
    408	if (S_ISDIR(inode->i_mode)) {
    409		/* DACs are overridable for directories */
    410		if (!(mask & MAY_WRITE))
    411			if (capable_wrt_inode_uidgid(mnt_userns, inode,
    412						     CAP_DAC_READ_SEARCH))
    413				return 0;
    414		if (capable_wrt_inode_uidgid(mnt_userns, inode,
    415					     CAP_DAC_OVERRIDE))
    416			return 0;
    417		return -EACCES;
    418	}
    419
    420	/*
    421	 * Searching includes executable on directories, else just read.
    422	 */
    423	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
    424	if (mask == MAY_READ)
    425		if (capable_wrt_inode_uidgid(mnt_userns, inode,
    426					     CAP_DAC_READ_SEARCH))
    427			return 0;
    428	/*
    429	 * Read/write DACs are always overridable.
    430	 * Executable DACs are overridable when there is
    431	 * at least one exec bit set.
    432	 */
    433	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
    434		if (capable_wrt_inode_uidgid(mnt_userns, inode,
    435					     CAP_DAC_OVERRIDE))
    436			return 0;
    437
    438	return -EACCES;
    439}
    440EXPORT_SYMBOL(generic_permission);
    441
    442/**
    443 * do_inode_permission - UNIX permission checking
    444 * @mnt_userns:	user namespace of the mount the inode was found from
    445 * @inode:	inode to check permissions on
    446 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
    447 *
    448 * We _really_ want to just do "generic_permission()" without
    449 * even looking at the inode->i_op values. So we keep a cache
    450 * flag in inode->i_opflags, that says "this has not special
    451 * permission function, use the fast case".
    452 */
    453static inline int do_inode_permission(struct user_namespace *mnt_userns,
    454				      struct inode *inode, int mask)
    455{
    456	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
    457		if (likely(inode->i_op->permission))
    458			return inode->i_op->permission(mnt_userns, inode, mask);
    459
    460		/* This gets set once for the inode lifetime */
    461		spin_lock(&inode->i_lock);
    462		inode->i_opflags |= IOP_FASTPERM;
    463		spin_unlock(&inode->i_lock);
    464	}
    465	return generic_permission(mnt_userns, inode, mask);
    466}
    467
    468/**
    469 * sb_permission - Check superblock-level permissions
    470 * @sb: Superblock of inode to check permission on
    471 * @inode: Inode to check permission on
    472 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
    473 *
    474 * Separate out file-system wide checks from inode-specific permission checks.
    475 */
    476static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
    477{
    478	if (unlikely(mask & MAY_WRITE)) {
    479		umode_t mode = inode->i_mode;
    480
    481		/* Nobody gets write access to a read-only fs. */
    482		if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
    483			return -EROFS;
    484	}
    485	return 0;
    486}
    487
    488/**
    489 * inode_permission - Check for access rights to a given inode
    490 * @mnt_userns:	User namespace of the mount the inode was found from
    491 * @inode:	Inode to check permission on
    492 * @mask:	Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
    493 *
    494 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
    495 * this, letting us set arbitrary permissions for filesystem access without
    496 * changing the "normal" UIDs which are used for other things.
    497 *
    498 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
    499 */
    500int inode_permission(struct user_namespace *mnt_userns,
    501		     struct inode *inode, int mask)
    502{
    503	int retval;
    504
    505	retval = sb_permission(inode->i_sb, inode, mask);
    506	if (retval)
    507		return retval;
    508
    509	if (unlikely(mask & MAY_WRITE)) {
    510		/*
    511		 * Nobody gets write access to an immutable file.
    512		 */
    513		if (IS_IMMUTABLE(inode))
    514			return -EPERM;
    515
    516		/*
    517		 * Updating mtime will likely cause i_uid and i_gid to be
    518		 * written back improperly if their true value is unknown
    519		 * to the vfs.
    520		 */
    521		if (HAS_UNMAPPED_ID(mnt_userns, inode))
    522			return -EACCES;
    523	}
    524
    525	retval = do_inode_permission(mnt_userns, inode, mask);
    526	if (retval)
    527		return retval;
    528
    529	retval = devcgroup_inode_permission(inode, mask);
    530	if (retval)
    531		return retval;
    532
    533	return security_inode_permission(inode, mask);
    534}
    535EXPORT_SYMBOL(inode_permission);
    536
    537/**
    538 * path_get - get a reference to a path
    539 * @path: path to get the reference to
    540 *
    541 * Given a path increment the reference count to the dentry and the vfsmount.
    542 */
    543void path_get(const struct path *path)
    544{
    545	mntget(path->mnt);
    546	dget(path->dentry);
    547}
    548EXPORT_SYMBOL(path_get);
    549
    550/**
    551 * path_put - put a reference to a path
    552 * @path: path to put the reference to
    553 *
    554 * Given a path decrement the reference count to the dentry and the vfsmount.
    555 */
    556void path_put(const struct path *path)
    557{
    558	dput(path->dentry);
    559	mntput(path->mnt);
    560}
    561EXPORT_SYMBOL(path_put);
    562
    563#define EMBEDDED_LEVELS 2
    564struct nameidata {
    565	struct path	path;
    566	struct qstr	last;
    567	struct path	root;
    568	struct inode	*inode; /* path.dentry.d_inode */
    569	unsigned int	flags, state;
    570	unsigned	seq, m_seq, r_seq;
    571	int		last_type;
    572	unsigned	depth;
    573	int		total_link_count;
    574	struct saved {
    575		struct path link;
    576		struct delayed_call done;
    577		const char *name;
    578		unsigned seq;
    579	} *stack, internal[EMBEDDED_LEVELS];
    580	struct filename	*name;
    581	struct nameidata *saved;
    582	unsigned	root_seq;
    583	int		dfd;
    584	kuid_t		dir_uid;
    585	umode_t		dir_mode;
    586} __randomize_layout;
    587
    588#define ND_ROOT_PRESET 1
    589#define ND_ROOT_GRABBED 2
    590#define ND_JUMPED 4
    591
    592static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
    593{
    594	struct nameidata *old = current->nameidata;
    595	p->stack = p->internal;
    596	p->depth = 0;
    597	p->dfd = dfd;
    598	p->name = name;
    599	p->path.mnt = NULL;
    600	p->path.dentry = NULL;
    601	p->total_link_count = old ? old->total_link_count : 0;
    602	p->saved = old;
    603	current->nameidata = p;
    604}
    605
    606static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name,
    607			  const struct path *root)
    608{
    609	__set_nameidata(p, dfd, name);
    610	p->state = 0;
    611	if (unlikely(root)) {
    612		p->state = ND_ROOT_PRESET;
    613		p->root = *root;
    614	}
    615}
    616
    617static void restore_nameidata(void)
    618{
    619	struct nameidata *now = current->nameidata, *old = now->saved;
    620
    621	current->nameidata = old;
    622	if (old)
    623		old->total_link_count = now->total_link_count;
    624	if (now->stack != now->internal)
    625		kfree(now->stack);
    626}
    627
    628static bool nd_alloc_stack(struct nameidata *nd)
    629{
    630	struct saved *p;
    631
    632	p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
    633			 nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
    634	if (unlikely(!p))
    635		return false;
    636	memcpy(p, nd->internal, sizeof(nd->internal));
    637	nd->stack = p;
    638	return true;
    639}
    640
    641/**
    642 * path_connected - Verify that a dentry is below mnt.mnt_root
    643 *
    644 * Rename can sometimes move a file or directory outside of a bind
    645 * mount, path_connected allows those cases to be detected.
    646 */
    647static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
    648{
    649	struct super_block *sb = mnt->mnt_sb;
    650
    651	/* Bind mounts can have disconnected paths */
    652	if (mnt->mnt_root == sb->s_root)
    653		return true;
    654
    655	return is_subdir(dentry, mnt->mnt_root);
    656}
    657
    658static void drop_links(struct nameidata *nd)
    659{
    660	int i = nd->depth;
    661	while (i--) {
    662		struct saved *last = nd->stack + i;
    663		do_delayed_call(&last->done);
    664		clear_delayed_call(&last->done);
    665	}
    666}
    667
    668static void terminate_walk(struct nameidata *nd)
    669{
    670	drop_links(nd);
    671	if (!(nd->flags & LOOKUP_RCU)) {
    672		int i;
    673		path_put(&nd->path);
    674		for (i = 0; i < nd->depth; i++)
    675			path_put(&nd->stack[i].link);
    676		if (nd->state & ND_ROOT_GRABBED) {
    677			path_put(&nd->root);
    678			nd->state &= ~ND_ROOT_GRABBED;
    679		}
    680	} else {
    681		nd->flags &= ~LOOKUP_RCU;
    682		rcu_read_unlock();
    683	}
    684	nd->depth = 0;
    685	nd->path.mnt = NULL;
    686	nd->path.dentry = NULL;
    687}
    688
    689/* path_put is needed afterwards regardless of success or failure */
    690static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
    691{
    692	int res = __legitimize_mnt(path->mnt, mseq);
    693	if (unlikely(res)) {
    694		if (res > 0)
    695			path->mnt = NULL;
    696		path->dentry = NULL;
    697		return false;
    698	}
    699	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
    700		path->dentry = NULL;
    701		return false;
    702	}
    703	return !read_seqcount_retry(&path->dentry->d_seq, seq);
    704}
    705
    706static inline bool legitimize_path(struct nameidata *nd,
    707			    struct path *path, unsigned seq)
    708{
    709	return __legitimize_path(path, seq, nd->m_seq);
    710}
    711
    712static bool legitimize_links(struct nameidata *nd)
    713{
    714	int i;
    715	if (unlikely(nd->flags & LOOKUP_CACHED)) {
    716		drop_links(nd);
    717		nd->depth = 0;
    718		return false;
    719	}
    720	for (i = 0; i < nd->depth; i++) {
    721		struct saved *last = nd->stack + i;
    722		if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
    723			drop_links(nd);
    724			nd->depth = i + 1;
    725			return false;
    726		}
    727	}
    728	return true;
    729}
    730
    731static bool legitimize_root(struct nameidata *nd)
    732{
    733	/* Nothing to do if nd->root is zero or is managed by the VFS user. */
    734	if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
    735		return true;
    736	nd->state |= ND_ROOT_GRABBED;
    737	return legitimize_path(nd, &nd->root, nd->root_seq);
    738}
    739
    740/*
    741 * Path walking has 2 modes, rcu-walk and ref-walk (see
    742 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
    743 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
    744 * normal reference counts on dentries and vfsmounts to transition to ref-walk
    745 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
    746 * got stuck, so ref-walk may continue from there. If this is not successful
    747 * (eg. a seqcount has changed), then failure is returned and it's up to caller
    748 * to restart the path walk from the beginning in ref-walk mode.
    749 */
    750
    751/**
    752 * try_to_unlazy - try to switch to ref-walk mode.
    753 * @nd: nameidata pathwalk data
    754 * Returns: true on success, false on failure
    755 *
    756 * try_to_unlazy attempts to legitimize the current nd->path and nd->root
    757 * for ref-walk mode.
    758 * Must be called from rcu-walk context.
    759 * Nothing should touch nameidata between try_to_unlazy() failure and
    760 * terminate_walk().
    761 */
    762static bool try_to_unlazy(struct nameidata *nd)
    763{
    764	struct dentry *parent = nd->path.dentry;
    765
    766	BUG_ON(!(nd->flags & LOOKUP_RCU));
    767
    768	nd->flags &= ~LOOKUP_RCU;
    769	if (unlikely(!legitimize_links(nd)))
    770		goto out1;
    771	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
    772		goto out;
    773	if (unlikely(!legitimize_root(nd)))
    774		goto out;
    775	rcu_read_unlock();
    776	BUG_ON(nd->inode != parent->d_inode);
    777	return true;
    778
    779out1:
    780	nd->path.mnt = NULL;
    781	nd->path.dentry = NULL;
    782out:
    783	rcu_read_unlock();
    784	return false;
    785}
    786
    787/**
    788 * try_to_unlazy_next - try to switch to ref-walk mode.
    789 * @nd: nameidata pathwalk data
    790 * @dentry: next dentry to step into
    791 * @seq: seq number to check @dentry against
    792 * Returns: true on success, false on failure
    793 *
    794 * Similar to try_to_unlazy(), but here we have the next dentry already
    795 * picked by rcu-walk and want to legitimize that in addition to the current
    796 * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
    797 * Nothing should touch nameidata between try_to_unlazy_next() failure and
    798 * terminate_walk().
    799 */
    800static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq)
    801{
    802	BUG_ON(!(nd->flags & LOOKUP_RCU));
    803
    804	nd->flags &= ~LOOKUP_RCU;
    805	if (unlikely(!legitimize_links(nd)))
    806		goto out2;
    807	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
    808		goto out2;
    809	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
    810		goto out1;
    811
    812	/*
    813	 * We need to move both the parent and the dentry from the RCU domain
    814	 * to be properly refcounted. And the sequence number in the dentry
    815	 * validates *both* dentry counters, since we checked the sequence
    816	 * number of the parent after we got the child sequence number. So we
    817	 * know the parent must still be valid if the child sequence number is
    818	 */
    819	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
    820		goto out;
    821	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
    822		goto out_dput;
    823	/*
    824	 * Sequence counts matched. Now make sure that the root is
    825	 * still valid and get it if required.
    826	 */
    827	if (unlikely(!legitimize_root(nd)))
    828		goto out_dput;
    829	rcu_read_unlock();
    830	return true;
    831
    832out2:
    833	nd->path.mnt = NULL;
    834out1:
    835	nd->path.dentry = NULL;
    836out:
    837	rcu_read_unlock();
    838	return false;
    839out_dput:
    840	rcu_read_unlock();
    841	dput(dentry);
    842	return false;
    843}
    844
    845static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
    846{
    847	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
    848		return dentry->d_op->d_revalidate(dentry, flags);
    849	else
    850		return 1;
    851}
    852
    853/**
    854 * complete_walk - successful completion of path walk
    855 * @nd:  pointer nameidata
    856 *
    857 * If we had been in RCU mode, drop out of it and legitimize nd->path.
    858 * Revalidate the final result, unless we'd already done that during
    859 * the path walk or the filesystem doesn't ask for it.  Return 0 on
    860 * success, -error on failure.  In case of failure caller does not
    861 * need to drop nd->path.
    862 */
    863static int complete_walk(struct nameidata *nd)
    864{
    865	struct dentry *dentry = nd->path.dentry;
    866	int status;
    867
    868	if (nd->flags & LOOKUP_RCU) {
    869		/*
    870		 * We don't want to zero nd->root for scoped-lookups or
    871		 * externally-managed nd->root.
    872		 */
    873		if (!(nd->state & ND_ROOT_PRESET))
    874			if (!(nd->flags & LOOKUP_IS_SCOPED))
    875				nd->root.mnt = NULL;
    876		nd->flags &= ~LOOKUP_CACHED;
    877		if (!try_to_unlazy(nd))
    878			return -ECHILD;
    879	}
    880
    881	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
    882		/*
    883		 * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
    884		 * ever step outside the root during lookup" and should already
    885		 * be guaranteed by the rest of namei, we want to avoid a namei
    886		 * BUG resulting in userspace being given a path that was not
    887		 * scoped within the root at some point during the lookup.
    888		 *
    889		 * So, do a final sanity-check to make sure that in the
    890		 * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
    891		 * we won't silently return an fd completely outside of the
    892		 * requested root to userspace.
    893		 *
    894		 * Userspace could move the path outside the root after this
    895		 * check, but as discussed elsewhere this is not a concern (the
    896		 * resolved file was inside the root at some point).
    897		 */
    898		if (!path_is_under(&nd->path, &nd->root))
    899			return -EXDEV;
    900	}
    901
    902	if (likely(!(nd->state & ND_JUMPED)))
    903		return 0;
    904
    905	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
    906		return 0;
    907
    908	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
    909	if (status > 0)
    910		return 0;
    911
    912	if (!status)
    913		status = -ESTALE;
    914
    915	return status;
    916}
    917
    918static int set_root(struct nameidata *nd)
    919{
    920	struct fs_struct *fs = current->fs;
    921
    922	/*
    923	 * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
    924	 * still have to ensure it doesn't happen because it will cause a breakout
    925	 * from the dirfd.
    926	 */
    927	if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
    928		return -ENOTRECOVERABLE;
    929
    930	if (nd->flags & LOOKUP_RCU) {
    931		unsigned seq;
    932
    933		do {
    934			seq = read_seqcount_begin(&fs->seq);
    935			nd->root = fs->root;
    936			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
    937		} while (read_seqcount_retry(&fs->seq, seq));
    938	} else {
    939		get_fs_root(fs, &nd->root);
    940		nd->state |= ND_ROOT_GRABBED;
    941	}
    942	return 0;
    943}
    944
    945static int nd_jump_root(struct nameidata *nd)
    946{
    947	if (unlikely(nd->flags & LOOKUP_BENEATH))
    948		return -EXDEV;
    949	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
    950		/* Absolute path arguments to path_init() are allowed. */
    951		if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
    952			return -EXDEV;
    953	}
    954	if (!nd->root.mnt) {
    955		int error = set_root(nd);
    956		if (error)
    957			return error;
    958	}
    959	if (nd->flags & LOOKUP_RCU) {
    960		struct dentry *d;
    961		nd->path = nd->root;
    962		d = nd->path.dentry;
    963		nd->inode = d->d_inode;
    964		nd->seq = nd->root_seq;
    965		if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
    966			return -ECHILD;
    967	} else {
    968		path_put(&nd->path);
    969		nd->path = nd->root;
    970		path_get(&nd->path);
    971		nd->inode = nd->path.dentry->d_inode;
    972	}
    973	nd->state |= ND_JUMPED;
    974	return 0;
    975}
    976
    977/*
    978 * Helper to directly jump to a known parsed path from ->get_link,
    979 * caller must have taken a reference to path beforehand.
    980 */
    981int nd_jump_link(struct path *path)
    982{
    983	int error = -ELOOP;
    984	struct nameidata *nd = current->nameidata;
    985
    986	if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
    987		goto err;
    988
    989	error = -EXDEV;
    990	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
    991		if (nd->path.mnt != path->mnt)
    992			goto err;
    993	}
    994	/* Not currently safe for scoped-lookups. */
    995	if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
    996		goto err;
    997
    998	path_put(&nd->path);
    999	nd->path = *path;
   1000	nd->inode = nd->path.dentry->d_inode;
   1001	nd->state |= ND_JUMPED;
   1002	return 0;
   1003
   1004err:
   1005	path_put(path);
   1006	return error;
   1007}
   1008
   1009static inline void put_link(struct nameidata *nd)
   1010{
   1011	struct saved *last = nd->stack + --nd->depth;
   1012	do_delayed_call(&last->done);
   1013	if (!(nd->flags & LOOKUP_RCU))
   1014		path_put(&last->link);
   1015}
   1016
   1017static int sysctl_protected_symlinks __read_mostly;
   1018static int sysctl_protected_hardlinks __read_mostly;
   1019static int sysctl_protected_fifos __read_mostly;
   1020static int sysctl_protected_regular __read_mostly;
   1021
   1022#ifdef CONFIG_SYSCTL
   1023static struct ctl_table namei_sysctls[] = {
   1024	{
   1025		.procname	= "protected_symlinks",
   1026		.data		= &sysctl_protected_symlinks,
   1027		.maxlen		= sizeof(int),
   1028		.mode		= 0644,
   1029		.proc_handler	= proc_dointvec_minmax,
   1030		.extra1		= SYSCTL_ZERO,
   1031		.extra2		= SYSCTL_ONE,
   1032	},
   1033	{
   1034		.procname	= "protected_hardlinks",
   1035		.data		= &sysctl_protected_hardlinks,
   1036		.maxlen		= sizeof(int),
   1037		.mode		= 0644,
   1038		.proc_handler	= proc_dointvec_minmax,
   1039		.extra1		= SYSCTL_ZERO,
   1040		.extra2		= SYSCTL_ONE,
   1041	},
   1042	{
   1043		.procname	= "protected_fifos",
   1044		.data		= &sysctl_protected_fifos,
   1045		.maxlen		= sizeof(int),
   1046		.mode		= 0644,
   1047		.proc_handler	= proc_dointvec_minmax,
   1048		.extra1		= SYSCTL_ZERO,
   1049		.extra2		= SYSCTL_TWO,
   1050	},
   1051	{
   1052		.procname	= "protected_regular",
   1053		.data		= &sysctl_protected_regular,
   1054		.maxlen		= sizeof(int),
   1055		.mode		= 0644,
   1056		.proc_handler	= proc_dointvec_minmax,
   1057		.extra1		= SYSCTL_ZERO,
   1058		.extra2		= SYSCTL_TWO,
   1059	},
   1060	{ }
   1061};
   1062
   1063static int __init init_fs_namei_sysctls(void)
   1064{
   1065	register_sysctl_init("fs", namei_sysctls);
   1066	return 0;
   1067}
   1068fs_initcall(init_fs_namei_sysctls);
   1069
   1070#endif /* CONFIG_SYSCTL */
   1071
   1072/**
   1073 * may_follow_link - Check symlink following for unsafe situations
   1074 * @nd: nameidata pathwalk data
   1075 *
   1076 * In the case of the sysctl_protected_symlinks sysctl being enabled,
   1077 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
   1078 * in a sticky world-writable directory. This is to protect privileged
   1079 * processes from failing races against path names that may change out
   1080 * from under them by way of other users creating malicious symlinks.
   1081 * It will permit symlinks to be followed only when outside a sticky
   1082 * world-writable directory, or when the uid of the symlink and follower
   1083 * match, or when the directory owner matches the symlink's owner.
   1084 *
   1085 * Returns 0 if following the symlink is allowed, -ve on error.
   1086 */
   1087static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
   1088{
   1089	struct user_namespace *mnt_userns;
   1090	kuid_t i_uid;
   1091
   1092	if (!sysctl_protected_symlinks)
   1093		return 0;
   1094
   1095	mnt_userns = mnt_user_ns(nd->path.mnt);
   1096	i_uid = i_uid_into_mnt(mnt_userns, inode);
   1097	/* Allowed if owner and follower match. */
   1098	if (uid_eq(current_cred()->fsuid, i_uid))
   1099		return 0;
   1100
   1101	/* Allowed if parent directory not sticky and world-writable. */
   1102	if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
   1103		return 0;
   1104
   1105	/* Allowed if parent directory and link owner match. */
   1106	if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid))
   1107		return 0;
   1108
   1109	if (nd->flags & LOOKUP_RCU)
   1110		return -ECHILD;
   1111
   1112	audit_inode(nd->name, nd->stack[0].link.dentry, 0);
   1113	audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
   1114	return -EACCES;
   1115}
   1116
   1117/**
   1118 * safe_hardlink_source - Check for safe hardlink conditions
   1119 * @mnt_userns:	user namespace of the mount the inode was found from
   1120 * @inode: the source inode to hardlink from
   1121 *
   1122 * Return false if at least one of the following conditions:
   1123 *    - inode is not a regular file
   1124 *    - inode is setuid
   1125 *    - inode is setgid and group-exec
   1126 *    - access failure for read and write
   1127 *
   1128 * Otherwise returns true.
   1129 */
   1130static bool safe_hardlink_source(struct user_namespace *mnt_userns,
   1131				 struct inode *inode)
   1132{
   1133	umode_t mode = inode->i_mode;
   1134
   1135	/* Special files should not get pinned to the filesystem. */
   1136	if (!S_ISREG(mode))
   1137		return false;
   1138
   1139	/* Setuid files should not get pinned to the filesystem. */
   1140	if (mode & S_ISUID)
   1141		return false;
   1142
   1143	/* Executable setgid files should not get pinned to the filesystem. */
   1144	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
   1145		return false;
   1146
   1147	/* Hardlinking to unreadable or unwritable sources is dangerous. */
   1148	if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE))
   1149		return false;
   1150
   1151	return true;
   1152}
   1153
   1154/**
   1155 * may_linkat - Check permissions for creating a hardlink
   1156 * @mnt_userns:	user namespace of the mount the inode was found from
   1157 * @link: the source to hardlink from
   1158 *
   1159 * Block hardlink when all of:
   1160 *  - sysctl_protected_hardlinks enabled
   1161 *  - fsuid does not match inode
   1162 *  - hardlink source is unsafe (see safe_hardlink_source() above)
   1163 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
   1164 *
   1165 * If the inode has been found through an idmapped mount the user namespace of
   1166 * the vfsmount must be passed through @mnt_userns. This function will then take
   1167 * care to map the inode according to @mnt_userns before checking permissions.
   1168 * On non-idmapped mounts or if permission checking is to be performed on the
   1169 * raw inode simply passs init_user_ns.
   1170 *
   1171 * Returns 0 if successful, -ve on error.
   1172 */
   1173int may_linkat(struct user_namespace *mnt_userns, struct path *link)
   1174{
   1175	struct inode *inode = link->dentry->d_inode;
   1176
   1177	/* Inode writeback is not safe when the uid or gid are invalid. */
   1178	if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
   1179	    !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
   1180		return -EOVERFLOW;
   1181
   1182	if (!sysctl_protected_hardlinks)
   1183		return 0;
   1184
   1185	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
   1186	 * otherwise, it must be a safe source.
   1187	 */
   1188	if (safe_hardlink_source(mnt_userns, inode) ||
   1189	    inode_owner_or_capable(mnt_userns, inode))
   1190		return 0;
   1191
   1192	audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
   1193	return -EPERM;
   1194}
   1195
   1196/**
   1197 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
   1198 *			  should be allowed, or not, on files that already
   1199 *			  exist.
   1200 * @mnt_userns:	user namespace of the mount the inode was found from
   1201 * @nd: nameidata pathwalk data
   1202 * @inode: the inode of the file to open
   1203 *
   1204 * Block an O_CREAT open of a FIFO (or a regular file) when:
   1205 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
   1206 *   - the file already exists
   1207 *   - we are in a sticky directory
   1208 *   - we don't own the file
   1209 *   - the owner of the directory doesn't own the file
   1210 *   - the directory is world writable
   1211 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
   1212 * the directory doesn't have to be world writable: being group writable will
   1213 * be enough.
   1214 *
   1215 * If the inode has been found through an idmapped mount the user namespace of
   1216 * the vfsmount must be passed through @mnt_userns. This function will then take
   1217 * care to map the inode according to @mnt_userns before checking permissions.
   1218 * On non-idmapped mounts or if permission checking is to be performed on the
   1219 * raw inode simply passs init_user_ns.
   1220 *
   1221 * Returns 0 if the open is allowed, -ve on error.
   1222 */
   1223static int may_create_in_sticky(struct user_namespace *mnt_userns,
   1224				struct nameidata *nd, struct inode *const inode)
   1225{
   1226	umode_t dir_mode = nd->dir_mode;
   1227	kuid_t dir_uid = nd->dir_uid;
   1228
   1229	if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
   1230	    (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
   1231	    likely(!(dir_mode & S_ISVTX)) ||
   1232	    uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) ||
   1233	    uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
   1234		return 0;
   1235
   1236	if (likely(dir_mode & 0002) ||
   1237	    (dir_mode & 0020 &&
   1238	     ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
   1239	      (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
   1240		const char *operation = S_ISFIFO(inode->i_mode) ?
   1241					"sticky_create_fifo" :
   1242					"sticky_create_regular";
   1243		audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
   1244		return -EACCES;
   1245	}
   1246	return 0;
   1247}
   1248
   1249/*
   1250 * follow_up - Find the mountpoint of path's vfsmount
   1251 *
   1252 * Given a path, find the mountpoint of its source file system.
   1253 * Replace @path with the path of the mountpoint in the parent mount.
   1254 * Up is towards /.
   1255 *
   1256 * Return 1 if we went up a level and 0 if we were already at the
   1257 * root.
   1258 */
   1259int follow_up(struct path *path)
   1260{
   1261	struct mount *mnt = real_mount(path->mnt);
   1262	struct mount *parent;
   1263	struct dentry *mountpoint;
   1264
   1265	read_seqlock_excl(&mount_lock);
   1266	parent = mnt->mnt_parent;
   1267	if (parent == mnt) {
   1268		read_sequnlock_excl(&mount_lock);
   1269		return 0;
   1270	}
   1271	mntget(&parent->mnt);
   1272	mountpoint = dget(mnt->mnt_mountpoint);
   1273	read_sequnlock_excl(&mount_lock);
   1274	dput(path->dentry);
   1275	path->dentry = mountpoint;
   1276	mntput(path->mnt);
   1277	path->mnt = &parent->mnt;
   1278	return 1;
   1279}
   1280EXPORT_SYMBOL(follow_up);
   1281
   1282static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
   1283				  struct path *path, unsigned *seqp)
   1284{
   1285	while (mnt_has_parent(m)) {
   1286		struct dentry *mountpoint = m->mnt_mountpoint;
   1287
   1288		m = m->mnt_parent;
   1289		if (unlikely(root->dentry == mountpoint &&
   1290			     root->mnt == &m->mnt))
   1291			break;
   1292		if (mountpoint != m->mnt.mnt_root) {
   1293			path->mnt = &m->mnt;
   1294			path->dentry = mountpoint;
   1295			*seqp = read_seqcount_begin(&mountpoint->d_seq);
   1296			return true;
   1297		}
   1298	}
   1299	return false;
   1300}
   1301
   1302static bool choose_mountpoint(struct mount *m, const struct path *root,
   1303			      struct path *path)
   1304{
   1305	bool found;
   1306
   1307	rcu_read_lock();
   1308	while (1) {
   1309		unsigned seq, mseq = read_seqbegin(&mount_lock);
   1310
   1311		found = choose_mountpoint_rcu(m, root, path, &seq);
   1312		if (unlikely(!found)) {
   1313			if (!read_seqretry(&mount_lock, mseq))
   1314				break;
   1315		} else {
   1316			if (likely(__legitimize_path(path, seq, mseq)))
   1317				break;
   1318			rcu_read_unlock();
   1319			path_put(path);
   1320			rcu_read_lock();
   1321		}
   1322	}
   1323	rcu_read_unlock();
   1324	return found;
   1325}
   1326
   1327/*
   1328 * Perform an automount
   1329 * - return -EISDIR to tell follow_managed() to stop and return the path we
   1330 *   were called with.
   1331 */
   1332static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
   1333{
   1334	struct dentry *dentry = path->dentry;
   1335
   1336	/* We don't want to mount if someone's just doing a stat -
   1337	 * unless they're stat'ing a directory and appended a '/' to
   1338	 * the name.
   1339	 *
   1340	 * We do, however, want to mount if someone wants to open or
   1341	 * create a file of any type under the mountpoint, wants to
   1342	 * traverse through the mountpoint or wants to open the
   1343	 * mounted directory.  Also, autofs may mark negative dentries
   1344	 * as being automount points.  These will need the attentions
   1345	 * of the daemon to instantiate them before they can be used.
   1346	 */
   1347	if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
   1348			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
   1349	    dentry->d_inode)
   1350		return -EISDIR;
   1351
   1352	if (count && (*count)++ >= MAXSYMLINKS)
   1353		return -ELOOP;
   1354
   1355	return finish_automount(dentry->d_op->d_automount(path), path);
   1356}
   1357
   1358/*
   1359 * mount traversal - out-of-line part.  One note on ->d_flags accesses -
   1360 * dentries are pinned but not locked here, so negative dentry can go
   1361 * positive right under us.  Use of smp_load_acquire() provides a barrier
   1362 * sufficient for ->d_inode and ->d_flags consistency.
   1363 */
   1364static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
   1365			     int *count, unsigned lookup_flags)
   1366{
   1367	struct vfsmount *mnt = path->mnt;
   1368	bool need_mntput = false;
   1369	int ret = 0;
   1370
   1371	while (flags & DCACHE_MANAGED_DENTRY) {
   1372		/* Allow the filesystem to manage the transit without i_mutex
   1373		 * being held. */
   1374		if (flags & DCACHE_MANAGE_TRANSIT) {
   1375			ret = path->dentry->d_op->d_manage(path, false);
   1376			flags = smp_load_acquire(&path->dentry->d_flags);
   1377			if (ret < 0)
   1378				break;
   1379		}
   1380
   1381		if (flags & DCACHE_MOUNTED) {	// something's mounted on it..
   1382			struct vfsmount *mounted = lookup_mnt(path);
   1383			if (mounted) {		// ... in our namespace
   1384				dput(path->dentry);
   1385				if (need_mntput)
   1386					mntput(path->mnt);
   1387				path->mnt = mounted;
   1388				path->dentry = dget(mounted->mnt_root);
   1389				// here we know it's positive
   1390				flags = path->dentry->d_flags;
   1391				need_mntput = true;
   1392				continue;
   1393			}
   1394		}
   1395
   1396		if (!(flags & DCACHE_NEED_AUTOMOUNT))
   1397			break;
   1398
   1399		// uncovered automount point
   1400		ret = follow_automount(path, count, lookup_flags);
   1401		flags = smp_load_acquire(&path->dentry->d_flags);
   1402		if (ret < 0)
   1403			break;
   1404	}
   1405
   1406	if (ret == -EISDIR)
   1407		ret = 0;
   1408	// possible if you race with several mount --move
   1409	if (need_mntput && path->mnt == mnt)
   1410		mntput(path->mnt);
   1411	if (!ret && unlikely(d_flags_negative(flags)))
   1412		ret = -ENOENT;
   1413	*jumped = need_mntput;
   1414	return ret;
   1415}
   1416
   1417static inline int traverse_mounts(struct path *path, bool *jumped,
   1418				  int *count, unsigned lookup_flags)
   1419{
   1420	unsigned flags = smp_load_acquire(&path->dentry->d_flags);
   1421
   1422	/* fastpath */
   1423	if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
   1424		*jumped = false;
   1425		if (unlikely(d_flags_negative(flags)))
   1426			return -ENOENT;
   1427		return 0;
   1428	}
   1429	return __traverse_mounts(path, flags, jumped, count, lookup_flags);
   1430}
   1431
   1432int follow_down_one(struct path *path)
   1433{
   1434	struct vfsmount *mounted;
   1435
   1436	mounted = lookup_mnt(path);
   1437	if (mounted) {
   1438		dput(path->dentry);
   1439		mntput(path->mnt);
   1440		path->mnt = mounted;
   1441		path->dentry = dget(mounted->mnt_root);
   1442		return 1;
   1443	}
   1444	return 0;
   1445}
   1446EXPORT_SYMBOL(follow_down_one);
   1447
   1448/*
   1449 * Follow down to the covering mount currently visible to userspace.  At each
   1450 * point, the filesystem owning that dentry may be queried as to whether the
   1451 * caller is permitted to proceed or not.
   1452 */
   1453int follow_down(struct path *path)
   1454{
   1455	struct vfsmount *mnt = path->mnt;
   1456	bool jumped;
   1457	int ret = traverse_mounts(path, &jumped, NULL, 0);
   1458
   1459	if (path->mnt != mnt)
   1460		mntput(mnt);
   1461	return ret;
   1462}
   1463EXPORT_SYMBOL(follow_down);
   1464
   1465/*
   1466 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
   1467 * we meet a managed dentry that would need blocking.
   1468 */
   1469static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
   1470			       struct inode **inode, unsigned *seqp)
   1471{
   1472	struct dentry *dentry = path->dentry;
   1473	unsigned int flags = dentry->d_flags;
   1474
   1475	if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
   1476		return true;
   1477
   1478	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
   1479		return false;
   1480
   1481	for (;;) {
   1482		/*
   1483		 * Don't forget we might have a non-mountpoint managed dentry
   1484		 * that wants to block transit.
   1485		 */
   1486		if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
   1487			int res = dentry->d_op->d_manage(path, true);
   1488			if (res)
   1489				return res == -EISDIR;
   1490			flags = dentry->d_flags;
   1491		}
   1492
   1493		if (flags & DCACHE_MOUNTED) {
   1494			struct mount *mounted = __lookup_mnt(path->mnt, dentry);
   1495			if (mounted) {
   1496				path->mnt = &mounted->mnt;
   1497				dentry = path->dentry = mounted->mnt.mnt_root;
   1498				nd->state |= ND_JUMPED;
   1499				*seqp = read_seqcount_begin(&dentry->d_seq);
   1500				*inode = dentry->d_inode;
   1501				/*
   1502				 * We don't need to re-check ->d_seq after this
   1503				 * ->d_inode read - there will be an RCU delay
   1504				 * between mount hash removal and ->mnt_root
   1505				 * becoming unpinned.
   1506				 */
   1507				flags = dentry->d_flags;
   1508				continue;
   1509			}
   1510			if (read_seqretry(&mount_lock, nd->m_seq))
   1511				return false;
   1512		}
   1513		return !(flags & DCACHE_NEED_AUTOMOUNT);
   1514	}
   1515}
   1516
   1517static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
   1518			  struct path *path, struct inode **inode,
   1519			  unsigned int *seqp)
   1520{
   1521	bool jumped;
   1522	int ret;
   1523
   1524	path->mnt = nd->path.mnt;
   1525	path->dentry = dentry;
   1526	if (nd->flags & LOOKUP_RCU) {
   1527		unsigned int seq = *seqp;
   1528		if (unlikely(!*inode))
   1529			return -ENOENT;
   1530		if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
   1531			return 0;
   1532		if (!try_to_unlazy_next(nd, dentry, seq))
   1533			return -ECHILD;
   1534		// *path might've been clobbered by __follow_mount_rcu()
   1535		path->mnt = nd->path.mnt;
   1536		path->dentry = dentry;
   1537	}
   1538	ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
   1539	if (jumped) {
   1540		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
   1541			ret = -EXDEV;
   1542		else
   1543			nd->state |= ND_JUMPED;
   1544	}
   1545	if (unlikely(ret)) {
   1546		dput(path->dentry);
   1547		if (path->mnt != nd->path.mnt)
   1548			mntput(path->mnt);
   1549	} else {
   1550		*inode = d_backing_inode(path->dentry);
   1551		*seqp = 0; /* out of RCU mode, so the value doesn't matter */
   1552	}
   1553	return ret;
   1554}
   1555
   1556/*
   1557 * This looks up the name in dcache and possibly revalidates the found dentry.
   1558 * NULL is returned if the dentry does not exist in the cache.
   1559 */
   1560static struct dentry *lookup_dcache(const struct qstr *name,
   1561				    struct dentry *dir,
   1562				    unsigned int flags)
   1563{
   1564	struct dentry *dentry = d_lookup(dir, name);
   1565	if (dentry) {
   1566		int error = d_revalidate(dentry, flags);
   1567		if (unlikely(error <= 0)) {
   1568			if (!error)
   1569				d_invalidate(dentry);
   1570			dput(dentry);
   1571			return ERR_PTR(error);
   1572		}
   1573	}
   1574	return dentry;
   1575}
   1576
   1577/*
   1578 * Parent directory has inode locked exclusive.  This is one
   1579 * and only case when ->lookup() gets called on non in-lookup
   1580 * dentries - as the matter of fact, this only gets called
   1581 * when directory is guaranteed to have no in-lookup children
   1582 * at all.
   1583 */
   1584static struct dentry *__lookup_hash(const struct qstr *name,
   1585		struct dentry *base, unsigned int flags)
   1586{
   1587	struct dentry *dentry = lookup_dcache(name, base, flags);
   1588	struct dentry *old;
   1589	struct inode *dir = base->d_inode;
   1590
   1591	if (dentry)
   1592		return dentry;
   1593
   1594	/* Don't create child dentry for a dead directory. */
   1595	if (unlikely(IS_DEADDIR(dir)))
   1596		return ERR_PTR(-ENOENT);
   1597
   1598	dentry = d_alloc(base, name);
   1599	if (unlikely(!dentry))
   1600		return ERR_PTR(-ENOMEM);
   1601
   1602	old = dir->i_op->lookup(dir, dentry, flags);
   1603	if (unlikely(old)) {
   1604		dput(dentry);
   1605		dentry = old;
   1606	}
   1607	return dentry;
   1608}
   1609
   1610static struct dentry *lookup_fast(struct nameidata *nd,
   1611				  struct inode **inode,
   1612			          unsigned *seqp)
   1613{
   1614	struct dentry *dentry, *parent = nd->path.dentry;
   1615	int status = 1;
   1616
   1617	/*
   1618	 * Rename seqlock is not required here because in the off chance
   1619	 * of a false negative due to a concurrent rename, the caller is
   1620	 * going to fall back to non-racy lookup.
   1621	 */
   1622	if (nd->flags & LOOKUP_RCU) {
   1623		unsigned seq;
   1624		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
   1625		if (unlikely(!dentry)) {
   1626			if (!try_to_unlazy(nd))
   1627				return ERR_PTR(-ECHILD);
   1628			return NULL;
   1629		}
   1630
   1631		/*
   1632		 * This sequence count validates that the inode matches
   1633		 * the dentry name information from lookup.
   1634		 */
   1635		*inode = d_backing_inode(dentry);
   1636		if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
   1637			return ERR_PTR(-ECHILD);
   1638
   1639		/*
   1640		 * This sequence count validates that the parent had no
   1641		 * changes while we did the lookup of the dentry above.
   1642		 *
   1643		 * The memory barrier in read_seqcount_begin of child is
   1644		 *  enough, we can use __read_seqcount_retry here.
   1645		 */
   1646		if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
   1647			return ERR_PTR(-ECHILD);
   1648
   1649		*seqp = seq;
   1650		status = d_revalidate(dentry, nd->flags);
   1651		if (likely(status > 0))
   1652			return dentry;
   1653		if (!try_to_unlazy_next(nd, dentry, seq))
   1654			return ERR_PTR(-ECHILD);
   1655		if (status == -ECHILD)
   1656			/* we'd been told to redo it in non-rcu mode */
   1657			status = d_revalidate(dentry, nd->flags);
   1658	} else {
   1659		dentry = __d_lookup(parent, &nd->last);
   1660		if (unlikely(!dentry))
   1661			return NULL;
   1662		status = d_revalidate(dentry, nd->flags);
   1663	}
   1664	if (unlikely(status <= 0)) {
   1665		if (!status)
   1666			d_invalidate(dentry);
   1667		dput(dentry);
   1668		return ERR_PTR(status);
   1669	}
   1670	return dentry;
   1671}
   1672
   1673/* Fast lookup failed, do it the slow way */
   1674static struct dentry *__lookup_slow(const struct qstr *name,
   1675				    struct dentry *dir,
   1676				    unsigned int flags)
   1677{
   1678	struct dentry *dentry, *old;
   1679	struct inode *inode = dir->d_inode;
   1680	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
   1681
   1682	/* Don't go there if it's already dead */
   1683	if (unlikely(IS_DEADDIR(inode)))
   1684		return ERR_PTR(-ENOENT);
   1685again:
   1686	dentry = d_alloc_parallel(dir, name, &wq);
   1687	if (IS_ERR(dentry))
   1688		return dentry;
   1689	if (unlikely(!d_in_lookup(dentry))) {
   1690		int error = d_revalidate(dentry, flags);
   1691		if (unlikely(error <= 0)) {
   1692			if (!error) {
   1693				d_invalidate(dentry);
   1694				dput(dentry);
   1695				goto again;
   1696			}
   1697			dput(dentry);
   1698			dentry = ERR_PTR(error);
   1699		}
   1700	} else {
   1701		old = inode->i_op->lookup(inode, dentry, flags);
   1702		d_lookup_done(dentry);
   1703		if (unlikely(old)) {
   1704			dput(dentry);
   1705			dentry = old;
   1706		}
   1707	}
   1708	return dentry;
   1709}
   1710
   1711static struct dentry *lookup_slow(const struct qstr *name,
   1712				  struct dentry *dir,
   1713				  unsigned int flags)
   1714{
   1715	struct inode *inode = dir->d_inode;
   1716	struct dentry *res;
   1717	inode_lock_shared(inode);
   1718	res = __lookup_slow(name, dir, flags);
   1719	inode_unlock_shared(inode);
   1720	return res;
   1721}
   1722
   1723static inline int may_lookup(struct user_namespace *mnt_userns,
   1724			     struct nameidata *nd)
   1725{
   1726	if (nd->flags & LOOKUP_RCU) {
   1727		int err = inode_permission(mnt_userns, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
   1728		if (err != -ECHILD || !try_to_unlazy(nd))
   1729			return err;
   1730	}
   1731	return inode_permission(mnt_userns, nd->inode, MAY_EXEC);
   1732}
   1733
   1734static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
   1735{
   1736	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
   1737		return -ELOOP;
   1738
   1739	if (likely(nd->depth != EMBEDDED_LEVELS))
   1740		return 0;
   1741	if (likely(nd->stack != nd->internal))
   1742		return 0;
   1743	if (likely(nd_alloc_stack(nd)))
   1744		return 0;
   1745
   1746	if (nd->flags & LOOKUP_RCU) {
   1747		// we need to grab link before we do unlazy.  And we can't skip
   1748		// unlazy even if we fail to grab the link - cleanup needs it
   1749		bool grabbed_link = legitimize_path(nd, link, seq);
   1750
   1751		if (!try_to_unlazy(nd) || !grabbed_link)
   1752			return -ECHILD;
   1753
   1754		if (nd_alloc_stack(nd))
   1755			return 0;
   1756	}
   1757	return -ENOMEM;
   1758}
   1759
   1760enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
   1761
   1762static const char *pick_link(struct nameidata *nd, struct path *link,
   1763		     struct inode *inode, unsigned seq, int flags)
   1764{
   1765	struct saved *last;
   1766	const char *res;
   1767	int error = reserve_stack(nd, link, seq);
   1768
   1769	if (unlikely(error)) {
   1770		if (!(nd->flags & LOOKUP_RCU))
   1771			path_put(link);
   1772		return ERR_PTR(error);
   1773	}
   1774	last = nd->stack + nd->depth++;
   1775	last->link = *link;
   1776	clear_delayed_call(&last->done);
   1777	last->seq = seq;
   1778
   1779	if (flags & WALK_TRAILING) {
   1780		error = may_follow_link(nd, inode);
   1781		if (unlikely(error))
   1782			return ERR_PTR(error);
   1783	}
   1784
   1785	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
   1786			unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
   1787		return ERR_PTR(-ELOOP);
   1788
   1789	if (!(nd->flags & LOOKUP_RCU)) {
   1790		touch_atime(&last->link);
   1791		cond_resched();
   1792	} else if (atime_needs_update(&last->link, inode)) {
   1793		if (!try_to_unlazy(nd))
   1794			return ERR_PTR(-ECHILD);
   1795		touch_atime(&last->link);
   1796	}
   1797
   1798	error = security_inode_follow_link(link->dentry, inode,
   1799					   nd->flags & LOOKUP_RCU);
   1800	if (unlikely(error))
   1801		return ERR_PTR(error);
   1802
   1803	res = READ_ONCE(inode->i_link);
   1804	if (!res) {
   1805		const char * (*get)(struct dentry *, struct inode *,
   1806				struct delayed_call *);
   1807		get = inode->i_op->get_link;
   1808		if (nd->flags & LOOKUP_RCU) {
   1809			res = get(NULL, inode, &last->done);
   1810			if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
   1811				res = get(link->dentry, inode, &last->done);
   1812		} else {
   1813			res = get(link->dentry, inode, &last->done);
   1814		}
   1815		if (!res)
   1816			goto all_done;
   1817		if (IS_ERR(res))
   1818			return res;
   1819	}
   1820	if (*res == '/') {
   1821		error = nd_jump_root(nd);
   1822		if (unlikely(error))
   1823			return ERR_PTR(error);
   1824		while (unlikely(*++res == '/'))
   1825			;
   1826	}
   1827	if (*res)
   1828		return res;
   1829all_done: // pure jump
   1830	put_link(nd);
   1831	return NULL;
   1832}
   1833
   1834/*
   1835 * Do we need to follow links? We _really_ want to be able
   1836 * to do this check without having to look at inode->i_op,
   1837 * so we keep a cache of "no, this doesn't need follow_link"
   1838 * for the common case.
   1839 */
   1840static const char *step_into(struct nameidata *nd, int flags,
   1841		     struct dentry *dentry, struct inode *inode, unsigned seq)
   1842{
   1843	struct path path;
   1844	int err = handle_mounts(nd, dentry, &path, &inode, &seq);
   1845
   1846	if (err < 0)
   1847		return ERR_PTR(err);
   1848	if (likely(!d_is_symlink(path.dentry)) ||
   1849	   ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
   1850	   (flags & WALK_NOFOLLOW)) {
   1851		/* not a symlink or should not follow */
   1852		if (!(nd->flags & LOOKUP_RCU)) {
   1853			dput(nd->path.dentry);
   1854			if (nd->path.mnt != path.mnt)
   1855				mntput(nd->path.mnt);
   1856		}
   1857		nd->path = path;
   1858		nd->inode = inode;
   1859		nd->seq = seq;
   1860		return NULL;
   1861	}
   1862	if (nd->flags & LOOKUP_RCU) {
   1863		/* make sure that d_is_symlink above matches inode */
   1864		if (read_seqcount_retry(&path.dentry->d_seq, seq))
   1865			return ERR_PTR(-ECHILD);
   1866	} else {
   1867		if (path.mnt == nd->path.mnt)
   1868			mntget(path.mnt);
   1869	}
   1870	return pick_link(nd, &path, inode, seq, flags);
   1871}
   1872
/*
 * RCU-walk handling of "..": find the parent of nd->path.dentry without
 * taking any references.
 *
 * On success returns the parent dentry and stores its inode in *inodep and
 * a freshly sampled d_seq value in *seqp.  Returns NULL when the walk is
 * already at nd->root, or when choose_mountpoint_rcu() reports nothing to
 * climb to.  Every failure, including the LOOKUP_NO_XDEV and LOOKUP_BENEATH
 * cases, is reported as ERR_PTR(-ECHILD).
 * NOTE(review): presumably -ECHILD is used so that the caller retries in
 * ref-walk mode, where follow_dotdot() returns the precise error - confirm.
 */
static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
					struct inode **inodep,
					unsigned *seqp)
{
	struct dentry *parent, *old;

	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	/* at the root of a mount: step over to the mountpoint first */
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
		struct path path;
		unsigned seq;
		if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
					   &nd->root, &path, &seq))
			goto in_root;
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-ECHILD);
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		nd->seq = seq;
		/* a mount change since nd->m_seq was sampled invalidates the above */
		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
			return ERR_PTR(-ECHILD);
		/* we know that mountpoint was pinned */
	}
	old = nd->path.dentry;
	parent = old->d_parent;
	*inodep = parent->d_inode;
	*seqp = read_seqcount_begin(&parent->d_seq);
	/* recheck the child only after the parent has been sampled */
	if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
		return ERR_PTR(-ECHILD);
	if (unlikely(!path_connected(nd->path.mnt, parent)))
		return ERR_PTR(-ECHILD);
	return parent;
in_root:
	if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
		return ERR_PTR(-ECHILD);
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-ECHILD);
	return NULL;
}
   1912
   1913static struct dentry *follow_dotdot(struct nameidata *nd,
   1914				 struct inode **inodep,
   1915				 unsigned *seqp)
   1916{
   1917	struct dentry *parent;
   1918
   1919	if (path_equal(&nd->path, &nd->root))
   1920		goto in_root;
   1921	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
   1922		struct path path;
   1923
   1924		if (!choose_mountpoint(real_mount(nd->path.mnt),
   1925				       &nd->root, &path))
   1926			goto in_root;
   1927		path_put(&nd->path);
   1928		nd->path = path;
   1929		nd->inode = path.dentry->d_inode;
   1930		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
   1931			return ERR_PTR(-EXDEV);
   1932	}
   1933	/* rare case of legitimate dget_parent()... */
   1934	parent = dget_parent(nd->path.dentry);
   1935	if (unlikely(!path_connected(nd->path.mnt, parent))) {
   1936		dput(parent);
   1937		return ERR_PTR(-ENOENT);
   1938	}
   1939	*seqp = 0;
   1940	*inodep = parent->d_inode;
   1941	return parent;
   1942
   1943in_root:
   1944	if (unlikely(nd->flags & LOOKUP_BENEATH))
   1945		return ERR_PTR(-EXDEV);
   1946	dget(nd->path.dentry);
   1947	return NULL;
   1948}
   1949
   1950static const char *handle_dots(struct nameidata *nd, int type)
   1951{
   1952	if (type == LAST_DOTDOT) {
   1953		const char *error = NULL;
   1954		struct dentry *parent;
   1955		struct inode *inode;
   1956		unsigned seq;
   1957
   1958		if (!nd->root.mnt) {
   1959			error = ERR_PTR(set_root(nd));
   1960			if (error)
   1961				return error;
   1962		}
   1963		if (nd->flags & LOOKUP_RCU)
   1964			parent = follow_dotdot_rcu(nd, &inode, &seq);
   1965		else
   1966			parent = follow_dotdot(nd, &inode, &seq);
   1967		if (IS_ERR(parent))
   1968			return ERR_CAST(parent);
   1969		if (unlikely(!parent))
   1970			error = step_into(nd, WALK_NOFOLLOW,
   1971					 nd->path.dentry, nd->inode, nd->seq);
   1972		else
   1973			error = step_into(nd, WALK_NOFOLLOW,
   1974					 parent, inode, seq);
   1975		if (unlikely(error))
   1976			return error;
   1977
   1978		if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
   1979			/*
   1980			 * If there was a racing rename or mount along our
   1981			 * path, then we can't be sure that ".." hasn't jumped
   1982			 * above nd->root (and so userspace should retry or use
   1983			 * some fallback).
   1984			 */
   1985			smp_rmb();
   1986			if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
   1987				return ERR_PTR(-EAGAIN);
   1988			if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
   1989				return ERR_PTR(-EAGAIN);
   1990		}
   1991	}
   1992	return NULL;
   1993}
   1994
   1995static const char *walk_component(struct nameidata *nd, int flags)
   1996{
   1997	struct dentry *dentry;
   1998	struct inode *inode;
   1999	unsigned seq;
   2000	/*
   2001	 * "." and ".." are special - ".." especially so because it has
   2002	 * to be able to know about the current root directory and
   2003	 * parent relationships.
   2004	 */
   2005	if (unlikely(nd->last_type != LAST_NORM)) {
   2006		if (!(flags & WALK_MORE) && nd->depth)
   2007			put_link(nd);
   2008		return handle_dots(nd, nd->last_type);
   2009	}
   2010	dentry = lookup_fast(nd, &inode, &seq);
   2011	if (IS_ERR(dentry))
   2012		return ERR_CAST(dentry);
   2013	if (unlikely(!dentry)) {
   2014		dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
   2015		if (IS_ERR(dentry))
   2016			return ERR_CAST(dentry);
   2017	}
   2018	if (!(flags & WALK_MORE) && nd->depth)
   2019		put_link(nd);
   2020	return step_into(nd, flags, dentry, inode, seq);
   2021}
   2022
   2023/*
   2024 * We can do the critical dentry name comparison and hashing
   2025 * operations one word at a time, but we are limited to:
   2026 *
   2027 * - Architectures with fast unaligned word accesses. We could
   2028 *   do a "get_unaligned()" if this helps and is sufficiently
   2029 *   fast.
   2030 *
   2031 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
   2032 *   do not trap on the (extremely unlikely) case of a page
   2033 *   crossing operation.
   2034 *
   2035 * - Furthermore, we need an efficient 64-bit compile for the
   2036 *   64-bit case in order to generate the "number of bytes in
   2037 *   the final mask". Again, that could be replaced with an
   2038 *   efficient population count instruction or similar.
   2039 */
   2040#ifdef CONFIG_DCACHE_WORD_ACCESS
   2041
   2042#include <asm/word-at-a-time.h>
   2043
   2044#ifdef HASH_MIX
   2045
   2046/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
   2047
   2048#elif defined(CONFIG_64BIT)
   2049/*
   2050 * Register pressure in the mixing function is an issue, particularly
   2051 * on 32-bit x86, but almost any function requires one state value and
   2052 * one temporary.  Instead, use a function designed for two state values
   2053 * and no temporaries.
   2054 *
   2055 * This function cannot create a collision in only two iterations, so
   2056 * we have two iterations to achieve avalanche.  In those two iterations,
   2057 * we have six layers of mixing, which is enough to spread one bit's
   2058 * influence out to 2^6 = 64 state bits.
   2059 *
   2060 * Rotate constants are scored by considering either 64 one-bit input
   2061 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
   2062 * probability of that delta causing a change to each of the 128 output
   2063 * bits, using a sample of random initial states.
   2064 *
   2065 * The Shannon entropy of the computed probabilities is then summed
   2066 * to produce a score.  Ideally, any input change has a 50% chance of
   2067 * toggling any given output bit.
   2068 *
   2069 * Mixing scores (in bits) for (12,45):
   2070 * Input delta: 1-bit      2-bit
   2071 * 1 round:     713.3    42542.6
   2072 * 2 rounds:   2753.7   140389.8
   2073 * 3 rounds:   5954.1   233458.2
   2074 * 4 rounds:   7862.6   256672.2
   2075 * Perfect:    8192     258048
   2076 *            (64*128) (64*63/2 * 128)
   2077 */
/*
 * Mix the word @a into the two-word state (@x, @y).  @x and @y are expanded
 * and assigned several times, so pass plain lvalues without side effects.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol64(x,12),\
	x += y,	y = rol64(y,45),\
	y *= 9			)
   2083
   2084/*
   2085 * Fold two longs into one 32-bit hash value.  This must be fast, but
   2086 * latency isn't quite as critical, as there is a fair bit of additional
   2087 * work done before the hash value is used.
   2088 */
   2089static inline unsigned int fold_hash(unsigned long x, unsigned long y)
   2090{
   2091	y ^= x * GOLDEN_RATIO_64;
   2092	y *= GOLDEN_RATIO_64;
   2093	return y >> 32;
   2094}
   2095
   2096#else	/* 32-bit case */
   2097
   2098/*
   2099 * Mixing scores (in bits) for (7,20):
   2100 * Input delta: 1-bit      2-bit
   2101 * 1 round:     330.3     9201.6
   2102 * 2 rounds:   1246.4    25475.4
   2103 * 3 rounds:   1907.1    31295.1
   2104 * 4 rounds:   2042.3    31718.6
   2105 * Perfect:    2048      31744
   2106 *            (32*64)   (32*31/2 * 64)
   2107 */
/*
 * Mix the word @a into the two-word state (@x, @y).  @x and @y are expanded
 * and assigned several times, so pass plain lvalues without side effects.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol32(x, 7),\
	x += y,	y = rol32(y,20),\
	y *= 9			)
   2113
   2114static inline unsigned int fold_hash(unsigned long x, unsigned long y)
   2115{
   2116	/* Use arch-optimized multiply if one exists */
   2117	return __hash_32(y ^ __hash_32(x));
   2118}
   2119
   2120#endif
   2121
   2122/*
   2123 * Return the hash of a string of known length.  This is carfully
   2124 * designed to match hash_name(), which is the more critical function.
   2125 * In particular, we must end by hashing a final word containing 0..7
   2126 * payload bytes, to match the way that hash_name() iterates until it
   2127 * finds the delimiter after the name.
   2128 */
   2129unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
   2130{
   2131	unsigned long a, x = 0, y = (unsigned long)salt;
   2132
   2133	for (;;) {
   2134		if (!len)
   2135			goto done;
   2136		a = load_unaligned_zeropad(name);
   2137		if (len < sizeof(unsigned long))
   2138			break;
   2139		HASH_MIX(x, y, a);
   2140		name += sizeof(unsigned long);
   2141		len -= sizeof(unsigned long);
   2142	}
   2143	x ^= a & bytemask_from_count(len);
   2144done:
   2145	return fold_hash(x, y);
   2146}
   2147EXPORT_SYMBOL(full_name_hash);
   2148
   2149/* Return the "hash_len" (hash and length) of a null-terminated string */
   2150u64 hashlen_string(const void *salt, const char *name)
   2151{
   2152	unsigned long a = 0, x = 0, y = (unsigned long)salt;
   2153	unsigned long adata, mask, len;
   2154	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
   2155
   2156	len = 0;
   2157	goto inside;
   2158
   2159	do {
   2160		HASH_MIX(x, y, a);
   2161		len += sizeof(unsigned long);
   2162inside:
   2163		a = load_unaligned_zeropad(name+len);
   2164	} while (!has_zero(a, &adata, &constants));
   2165
   2166	adata = prep_zero_mask(a, adata, &constants);
   2167	mask = create_zero_mask(adata);
   2168	x ^= a & zero_bytemask(mask);
   2169
   2170	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
   2171}
   2172EXPORT_SYMBOL(hashlen_string);
   2173
   2174/*
   2175 * Calculate the length and hash of the path component, and
   2176 * return the "hash_len" as the result.
   2177 */
   2178static inline u64 hash_name(const void *salt, const char *name)
   2179{
   2180	unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
   2181	unsigned long adata, bdata, mask, len;
   2182	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
   2183
   2184	len = 0;
   2185	goto inside;
   2186
   2187	do {
   2188		HASH_MIX(x, y, a);
   2189		len += sizeof(unsigned long);
   2190inside:
   2191		a = load_unaligned_zeropad(name+len);
   2192		b = a ^ REPEAT_BYTE('/');
   2193	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
   2194
   2195	adata = prep_zero_mask(a, adata, &constants);
   2196	bdata = prep_zero_mask(b, bdata, &constants);
   2197	mask = create_zero_mask(adata | bdata);
   2198	x ^= a & zero_bytemask(mask);
   2199
   2200	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
   2201}
   2202
   2203#else	/* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
   2204
   2205/* Return the hash of a string of known length */
   2206unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
   2207{
   2208	unsigned long hash = init_name_hash(salt);
   2209	while (len--)
   2210		hash = partial_name_hash((unsigned char)*name++, hash);
   2211	return end_name_hash(hash);
   2212}
   2213EXPORT_SYMBOL(full_name_hash);
   2214
   2215/* Return the "hash_len" (hash and length) of a null-terminated string */
   2216u64 hashlen_string(const void *salt, const char *name)
   2217{
   2218	unsigned long hash = init_name_hash(salt);
   2219	unsigned long len = 0, c;
   2220
   2221	c = (unsigned char)*name;
   2222	while (c) {
   2223		len++;
   2224		hash = partial_name_hash(c, hash);
   2225		c = (unsigned char)name[len];
   2226	}
   2227	return hashlen_create(end_name_hash(hash), len);
   2228}
   2229EXPORT_SYMBOL(hashlen_string);
   2230
   2231/*
   2232 * We know there's a real path component here of at least
   2233 * one character.
   2234 */
   2235static inline u64 hash_name(const void *salt, const char *name)
   2236{
   2237	unsigned long hash = init_name_hash(salt);
   2238	unsigned long len = 0, c;
   2239
   2240	c = (unsigned char)*name;
   2241	do {
   2242		len++;
   2243		hash = partial_name_hash(c, hash);
   2244		c = (unsigned char)name[len];
   2245	} while (c && c != '/');
   2246	return hashlen_create(end_name_hash(hash), len);
   2247}
   2248
   2249#endif
   2250
/*
 * Name resolution.
 * This is the basic name resolution function: it walks @name one component
 * at a time, starting from nd->path, and stops in front of the final
 * component.  That component is left in nd->last / nd->last_type for the
 * caller to deal with.
 *
 * An ERR_PTR() passed as @name is returned as its error code.  A name that
 * is empty or consists only of slashes returns 0 with nd->last_type set to
 * LAST_ROOT.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns a negative error on failure.
 * NOTE(review): no references are dropped in this function itself; the
 * callers visible here always call terminate_walk() afterwards - confirm
 * that this is where cleanup is meant to happen.
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
	/* number of symlink bodies currently being walked by this call */
	int depth = 0; // depth <= nd->depth
	int err;

	nd->last_type = LAST_ROOT;
	nd->flags |= LOOKUP_PARENT;
	if (IS_ERR(name))
		return PTR_ERR(name);
	while (*name=='/')
		name++;
	if (!*name) {
		nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
		return 0;
	}

	/* At this point we know we have a real path component. */
	for(;;) {
		struct user_namespace *mnt_userns;
		const char *link;
		u64 hash_len;
		int type;

		/* permission to search the current directory comes first */
		mnt_userns = mnt_user_ns(nd->path.mnt);
		err = may_lookup(mnt_userns, nd);
		if (err)
			return err;

		/* hash and length of the component, salted with the parent dentry */
		hash_len = hash_name(nd->path.dentry, name);

		/* classify "." and ".."; everything else is LAST_NORM */
		type = LAST_NORM;
		if (name[0] == '.') switch (hashlen_len(hash_len)) {
			case 2:
				if (name[1] == '.') {
					type = LAST_DOTDOT;
					nd->state |= ND_JUMPED;
				}
				break;
			case 1:
				type = LAST_DOT;
		}
		if (likely(type == LAST_NORM)) {
			struct dentry *parent = nd->path.dentry;
			nd->state &= ~ND_JUMPED;
			/* the filesystem supplies its own hash: let it recompute */
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
				struct qstr this = { { .hash_len = hash_len }, .name = name };
				err = parent->d_op->d_hash(parent, &this);
				if (err < 0)
					return err;
				hash_len = this.hash_len;
				name = this.name;
			}
		}

		/* publish the component for walk_component() and the caller */
		nd->last.hash_len = hash_len;
		nd->last.name = name;
		nd->last_type = type;

		name += hashlen_len(hash_len);
		if (!*name)
			goto OK;
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
			name++;
		} while (unlikely(*name == '/'));
		if (unlikely(!*name)) {
OK:
			/* pathname or trailing symlink, done */
			if (!depth) {
				nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode);
				nd->dir_mode = nd->inode->i_mode;
				nd->flags &= ~LOOKUP_PARENT;
				return 0;
			}
			/* last component of nested symlink */
			/* resume in the path that contained the symlink */
			name = nd->stack[--depth].name;
			link = walk_component(nd, 0);
		} else {
			/* not the last component */
			link = walk_component(nd, WALK_MORE);
		}
		if (unlikely(link)) {
			if (IS_ERR(link))
				return PTR_ERR(link);
			/* a symlink to follow */
			/* remember the rest of the current path, then walk the link body */
			nd->stack[depth++].name = name;
			name = link;
			continue;
		}
		/* more components follow, so we must be standing on a directory */
		if (unlikely(!d_can_lookup(nd->path.dentry))) {
			if (nd->flags & LOOKUP_RCU) {
				if (!try_to_unlazy(nd))
					return -ECHILD;
			}
			return -ENOTDIR;
		}
	}
}
   2360
/*
 * Prepare @nd for a walk of nd->name: store @flags in nd->flags, sample the
 * mount_lock and rename_lock sequence counts, and choose the starting
 * point (preset root, root for an absolute path, current working directory
 * for AT_FDCWD, or the directory behind nd->dfd).
 *
 * Returns the pathname string to walk, or an ERR_PTR() on failure.
 *
 * must be paired with terminate_walk()
 */
static const char *path_init(struct nameidata *nd, unsigned flags)
{
	int error;
	const char *s = nd->name->name;

	/* LOOKUP_CACHED requires RCU, ask caller to retry */
	if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
		return ERR_PTR(-EAGAIN);

	/* an empty pathname is never walked in RCU mode */
	if (!*s)
		flags &= ~LOOKUP_RCU;
	if (flags & LOOKUP_RCU)
		rcu_read_lock();

	nd->flags = flags;
	nd->state |= ND_JUMPED;

	/* these samples are rechecked later, e.g. by handle_dots() for scoped lookups */
	nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
	nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
	smp_rmb();

	/* the caller supplied the root: the walk starts there */
	if (nd->state & ND_ROOT_PRESET) {
		struct dentry *root = nd->root.dentry;
		struct inode *inode = root->d_inode;
		if (*s && unlikely(!d_can_lookup(root)))
			return ERR_PTR(-ENOTDIR);
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
			nd->root_seq = nd->seq;
		} else {
			path_get(&nd->path);
		}
		return s;
	}

	nd->root.mnt = NULL;

	/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
	if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		return s;
	}

	/* Relative pathname -- get the starting-point it is relative to. */
	if (nd->dfd == AT_FDCWD) {
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;

			/* lockless snapshot of the cwd; retry if fs->seq changed meanwhile */
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
				nd->inode = nd->path.dentry->d_inode;
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
			nd->inode = nd->path.dentry->d_inode;
		}
	} else {
		/* Caller must check execute permissions on the starting path component */
		struct fd f = fdget_raw(nd->dfd);
		struct dentry *dentry;

		if (!f.file)
			return ERR_PTR(-EBADF);

		dentry = f.file->f_path.dentry;

		/* a non-empty path can only be resolved relative to a directory */
		if (*s && unlikely(!d_can_lookup(dentry))) {
			fdput(f);
			return ERR_PTR(-ENOTDIR);
		}

		nd->path = f.file->f_path;
		if (flags & LOOKUP_RCU) {
			nd->inode = nd->path.dentry->d_inode;
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
		} else {
			path_get(&nd->path);
			nd->inode = nd->path.dentry->d_inode;
		}
		fdput(f);
	}

	/* For scoped-lookups we need to set the root to the dirfd as well. */
	if (flags & LOOKUP_IS_SCOPED) {
		nd->root = nd->path;
		if (flags & LOOKUP_RCU) {
			nd->root_seq = nd->seq;
		} else {
			/* ref-walk: take a reference on the root and record that we did */
			path_get(&nd->root);
			nd->state |= ND_ROOT_GRABBED;
		}
	}
	return s;
}
   2463
   2464static inline const char *lookup_last(struct nameidata *nd)
   2465{
   2466	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
   2467		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
   2468
   2469	return walk_component(nd, WALK_TRAILING);
   2470}
   2471
   2472static int handle_lookup_down(struct nameidata *nd)
   2473{
   2474	if (!(nd->flags & LOOKUP_RCU))
   2475		dget(nd->path.dentry);
   2476	return PTR_ERR(step_into(nd, WALK_NOFOLLOW,
   2477			nd->path.dentry, nd->inode, nd->seq));
   2478}
   2479
   2480/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
   2481static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
   2482{
   2483	const char *s = path_init(nd, flags);
   2484	int err;
   2485
   2486	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
   2487		err = handle_lookup_down(nd);
   2488		if (unlikely(err < 0))
   2489			s = ERR_PTR(err);
   2490	}
   2491
   2492	while (!(err = link_path_walk(s, nd)) &&
   2493	       (s = lookup_last(nd)) != NULL)
   2494		;
   2495	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
   2496		err = handle_lookup_down(nd);
   2497		nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
   2498	}
   2499	if (!err)
   2500		err = complete_walk(nd);
   2501
   2502	if (!err && nd->flags & LOOKUP_DIRECTORY)
   2503		if (!d_can_lookup(nd->path.dentry))
   2504			err = -ENOTDIR;
   2505	if (!err) {
   2506		*path = nd->path;
   2507		nd->path.mnt = NULL;
   2508		nd->path.dentry = NULL;
   2509	}
   2510	terminate_walk(nd);
   2511	return err;
   2512}
   2513
   2514int filename_lookup(int dfd, struct filename *name, unsigned flags,
   2515		    struct path *path, struct path *root)
   2516{
   2517	int retval;
   2518	struct nameidata nd;
   2519	if (IS_ERR(name))
   2520		return PTR_ERR(name);
   2521	set_nameidata(&nd, dfd, name, root);
   2522	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
   2523	if (unlikely(retval == -ECHILD))
   2524		retval = path_lookupat(&nd, flags, path);
   2525	if (unlikely(retval == -ESTALE))
   2526		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
   2527
   2528	if (likely(!retval))
   2529		audit_inode(name, path->dentry,
   2530			    flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
   2531	restore_nameidata();
   2532	return retval;
   2533}
   2534
   2535/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
   2536static int path_parentat(struct nameidata *nd, unsigned flags,
   2537				struct path *parent)
   2538{
   2539	const char *s = path_init(nd, flags);
   2540	int err = link_path_walk(s, nd);
   2541	if (!err)
   2542		err = complete_walk(nd);
   2543	if (!err) {
   2544		*parent = nd->path;
   2545		nd->path.mnt = NULL;
   2546		nd->path.dentry = NULL;
   2547	}
   2548	terminate_walk(nd);
   2549	return err;
   2550}
   2551
   2552/* Note: this does not consume "name" */
   2553static int filename_parentat(int dfd, struct filename *name,
   2554			     unsigned int flags, struct path *parent,
   2555			     struct qstr *last, int *type)
   2556{
   2557	int retval;
   2558	struct nameidata nd;
   2559
   2560	if (IS_ERR(name))
   2561		return PTR_ERR(name);
   2562	set_nameidata(&nd, dfd, name, NULL);
   2563	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
   2564	if (unlikely(retval == -ECHILD))
   2565		retval = path_parentat(&nd, flags, parent);
   2566	if (unlikely(retval == -ESTALE))
   2567		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
   2568	if (likely(!retval)) {
   2569		*last = nd.last;
   2570		*type = nd.last_type;
   2571		audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
   2572	}
   2573	restore_nameidata();
   2574	return retval;
   2575}
   2576
   2577/* does lookup, returns the object with parent locked */
   2578static struct dentry *__kern_path_locked(struct filename *name, struct path *path)
   2579{
   2580	struct dentry *d;
   2581	struct qstr last;
   2582	int type, error;
   2583
   2584	error = filename_parentat(AT_FDCWD, name, 0, path, &last, &type);
   2585	if (error)
   2586		return ERR_PTR(error);
   2587	if (unlikely(type != LAST_NORM)) {
   2588		path_put(path);
   2589		return ERR_PTR(-EINVAL);
   2590	}
   2591	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
   2592	d = __lookup_hash(&last, path->dentry, 0);
   2593	if (IS_ERR(d)) {
   2594		inode_unlock(path->dentry->d_inode);
   2595		path_put(path);
   2596	}
   2597	return d;
   2598}
   2599
   2600struct dentry *kern_path_locked(const char *name, struct path *path)
   2601{
   2602	struct filename *filename = getname_kernel(name);
   2603	struct dentry *res = __kern_path_locked(filename, path);
   2604
   2605	putname(filename);
   2606	return res;
   2607}
   2608
   2609int kern_path(const char *name, unsigned int flags, struct path *path)
   2610{
   2611	struct filename *filename = getname_kernel(name);
   2612	int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL);
   2613
   2614	putname(filename);
   2615	return ret;
   2616
   2617}
   2618EXPORT_SYMBOL(kern_path);
   2619
   2620/**
   2621 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
   2622 * @dentry:  pointer to dentry of the base directory
   2623 * @mnt: pointer to vfs mount of the base directory
   2624 * @name: pointer to file name
   2625 * @flags: lookup flags
   2626 * @path: pointer to struct path to fill
   2627 */
   2628int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
   2629		    const char *name, unsigned int flags,
   2630		    struct path *path)
   2631{
   2632	struct filename *filename;
   2633	struct path root = {.mnt = mnt, .dentry = dentry};
   2634	int ret;
   2635
   2636	filename = getname_kernel(name);
   2637	/* the first argument of filename_lookup() is ignored with root */
   2638	ret = filename_lookup(AT_FDCWD, filename, flags, path, &root);
   2639	putname(filename);
   2640	return ret;
   2641}
   2642EXPORT_SYMBOL(vfs_path_lookup);
   2643
   2644static int lookup_one_common(struct user_namespace *mnt_userns,
   2645			     const char *name, struct dentry *base, int len,
   2646			     struct qstr *this)
   2647{
   2648	this->name = name;
   2649	this->len = len;
   2650	this->hash = full_name_hash(base, name, len);
   2651	if (!len)
   2652		return -EACCES;
   2653
   2654	if (unlikely(name[0] == '.')) {
   2655		if (len < 2 || (len == 2 && name[1] == '.'))
   2656			return -EACCES;
   2657	}
   2658
   2659	while (len--) {
   2660		unsigned int c = *(const unsigned char *)name++;
   2661		if (c == '/' || c == '\0')
   2662			return -EACCES;
   2663	}
   2664	/*
   2665	 * See if the low-level filesystem might want
   2666	 * to use its own hash..
   2667	 */
   2668	if (base->d_flags & DCACHE_OP_HASH) {
   2669		int err = base->d_op->d_hash(base, this);
   2670		if (err < 0)
   2671			return err;
   2672	}
   2673
   2674	return inode_permission(mnt_userns, base->d_inode, MAY_EXEC);
   2675}
   2676
   2677/**
   2678 * try_lookup_one_len - filesystem helper to lookup single pathname component
   2679 * @name:	pathname component to lookup
   2680 * @base:	base directory to lookup from
   2681 * @len:	maximum length @len should be interpreted to
   2682 *
   2683 * Look up a dentry by name in the dcache, returning NULL if it does not
   2684 * currently exist.  The function does not try to create a dentry.
   2685 *
   2686 * Note that this routine is purely a helper for filesystem usage and should
   2687 * not be called by generic code.
   2688 *
   2689 * The caller must hold base->i_mutex.
   2690 */
   2691struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
   2692{
   2693	struct qstr this;
   2694	int err;
   2695
   2696	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
   2697
   2698	err = lookup_one_common(&init_user_ns, name, base, len, &this);
   2699	if (err)
   2700		return ERR_PTR(err);
   2701
   2702	return lookup_dcache(&this, base, 0);
   2703}
   2704EXPORT_SYMBOL(try_lookup_one_len);
   2705
   2706/**
   2707 * lookup_one_len - filesystem helper to lookup single pathname component
   2708 * @name:	pathname component to lookup
   2709 * @base:	base directory to lookup from
   2710 * @len:	maximum length @len should be interpreted to
   2711 *
   2712 * Note that this routine is purely a helper for filesystem usage and should
   2713 * not be called by generic code.
   2714 *
   2715 * The caller must hold base->i_mutex.
   2716 */
   2717struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
   2718{
   2719	struct dentry *dentry;
   2720	struct qstr this;
   2721	int err;
   2722
   2723	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
   2724
   2725	err = lookup_one_common(&init_user_ns, name, base, len, &this);
   2726	if (err)
   2727		return ERR_PTR(err);
   2728
   2729	dentry = lookup_dcache(&this, base, 0);
   2730	return dentry ? dentry : __lookup_slow(&this, base, 0);
   2731}
   2732EXPORT_SYMBOL(lookup_one_len);
   2733
   2734/**
   2735 * lookup_one - filesystem helper to lookup single pathname component
   2736 * @mnt_userns:	user namespace of the mount the lookup is performed from
   2737 * @name:	pathname component to lookup
   2738 * @base:	base directory to lookup from
   2739 * @len:	maximum length @len should be interpreted to
   2740 *
   2741 * Note that this routine is purely a helper for filesystem usage and should
   2742 * not be called by generic code.
   2743 *
   2744 * The caller must hold base->i_mutex.
   2745 */
   2746struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name,
   2747			  struct dentry *base, int len)
   2748{
   2749	struct dentry *dentry;
   2750	struct qstr this;
   2751	int err;
   2752
   2753	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
   2754
   2755	err = lookup_one_common(mnt_userns, name, base, len, &this);
   2756	if (err)
   2757		return ERR_PTR(err);
   2758
   2759	dentry = lookup_dcache(&this, base, 0);
   2760	return dentry ? dentry : __lookup_slow(&this, base, 0);
   2761}
   2762EXPORT_SYMBOL(lookup_one);
   2763
   2764/**
   2765 * lookup_one_unlocked - filesystem helper to lookup single pathname component
   2766 * @mnt_userns:	idmapping of the mount the lookup is performed from
   2767 * @name:	pathname component to lookup
   2768 * @base:	base directory to lookup from
   2769 * @len:	maximum length @len should be interpreted to
   2770 *
   2771 * Note that this routine is purely a helper for filesystem usage and should
   2772 * not be called by generic code.
   2773 *
   2774 * Unlike lookup_one_len, it should be called without the parent
   2775 * i_mutex held, and will take the i_mutex itself if necessary.
   2776 */
   2777struct dentry *lookup_one_unlocked(struct user_namespace *mnt_userns,
   2778				   const char *name, struct dentry *base,
   2779				   int len)
   2780{
   2781	struct qstr this;
   2782	int err;
   2783	struct dentry *ret;
   2784
   2785	err = lookup_one_common(mnt_userns, name, base, len, &this);
   2786	if (err)
   2787		return ERR_PTR(err);
   2788
   2789	ret = lookup_dcache(&this, base, 0);
   2790	if (!ret)
   2791		ret = lookup_slow(&this, base, 0);
   2792	return ret;
   2793}
   2794EXPORT_SYMBOL(lookup_one_unlocked);
   2795
   2796/**
   2797 * lookup_one_positive_unlocked - filesystem helper to lookup single
   2798 *				  pathname component
   2799 * @mnt_userns:	idmapping of the mount the lookup is performed from
   2800 * @name:	pathname component to lookup
   2801 * @base:	base directory to lookup from
   2802 * @len:	maximum length @len should be interpreted to
   2803 *
   2804 * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
   2805 * known positive or ERR_PTR(). This is what most of the users want.
   2806 *
   2807 * Note that pinned negative with unlocked parent _can_ become positive at any
   2808 * time, so callers of lookup_one_unlocked() need to be very careful; pinned
   2809 * positives have >d_inode stable, so this one avoids such problems.
   2810 *
   2811 * Note that this routine is purely a helper for filesystem usage and should
   2812 * not be called by generic code.
   2813 *
   2814 * The helper should be called without i_mutex held.
   2815 */
   2816struct dentry *lookup_one_positive_unlocked(struct user_namespace *mnt_userns,
   2817					    const char *name,
   2818					    struct dentry *base, int len)
   2819{
   2820	struct dentry *ret = lookup_one_unlocked(mnt_userns, name, base, len);
   2821
   2822	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
   2823		dput(ret);
   2824		ret = ERR_PTR(-ENOENT);
   2825	}
   2826	return ret;
   2827}
   2828EXPORT_SYMBOL(lookup_one_positive_unlocked);
   2829
   2830/**
   2831 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
   2832 * @name:	pathname component to lookup
   2833 * @base:	base directory to lookup from
   2834 * @len:	maximum length @len should be interpreted to
   2835 *
   2836 * Note that this routine is purely a helper for filesystem usage and should
   2837 * not be called by generic code.
   2838 *
   2839 * Unlike lookup_one_len, it should be called without the parent
   2840 * i_mutex held, and will take the i_mutex itself if necessary.
   2841 */
   2842struct dentry *lookup_one_len_unlocked(const char *name,
   2843				       struct dentry *base, int len)
   2844{
   2845	return lookup_one_unlocked(&init_user_ns, name, base, len);
   2846}
   2847EXPORT_SYMBOL(lookup_one_len_unlocked);
   2848
   2849/*
   2850 * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
   2851 * on negatives.  Returns known positive or ERR_PTR(); that's what
   2852 * most of the users want.  Note that pinned negative with unlocked parent
   2853 * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
   2854 * need to be very careful; pinned positives have ->d_inode stable, so
   2855 * this one avoids such problems.
   2856 */
   2857struct dentry *lookup_positive_unlocked(const char *name,
   2858				       struct dentry *base, int len)
   2859{
   2860	return lookup_one_positive_unlocked(&init_user_ns, name, base, len);
   2861}
   2862EXPORT_SYMBOL(lookup_positive_unlocked);
   2863
   2864#ifdef CONFIG_UNIX98_PTYS
   2865int path_pts(struct path *path)
   2866{
   2867	/* Find something mounted on "pts" in the same directory as
   2868	 * the input path.
   2869	 */
   2870	struct dentry *parent = dget_parent(path->dentry);
   2871	struct dentry *child;
   2872	struct qstr this = QSTR_INIT("pts", 3);
   2873
   2874	if (unlikely(!path_connected(path->mnt, parent))) {
   2875		dput(parent);
   2876		return -ENOENT;
   2877	}
   2878	dput(path->dentry);
   2879	path->dentry = parent;
   2880	child = d_hash_and_lookup(parent, &this);
   2881	if (!child)
   2882		return -ENOENT;
   2883
   2884	path->dentry = child;
   2885	dput(parent);
   2886	follow_down(path);
   2887	return 0;
   2888}
   2889#endif
   2890
   2891int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
   2892		 struct path *path, int *empty)
   2893{
   2894	struct filename *filename = getname_flags(name, flags, empty);
   2895	int ret = filename_lookup(dfd, filename, flags, path, NULL);
   2896
   2897	putname(filename);
   2898	return ret;
   2899}
   2900EXPORT_SYMBOL(user_path_at_empty);
   2901
   2902int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
   2903		   struct inode *inode)
   2904{
   2905	kuid_t fsuid = current_fsuid();
   2906
   2907	if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid))
   2908		return 0;
   2909	if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid))
   2910		return 0;
   2911	return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER);
   2912}
   2913EXPORT_SYMBOL(__check_sticky);
   2914
   2915/*
   2916 *	Check whether we can remove a link victim from directory dir, check
   2917 *  whether the type of victim is right.
   2918 *  1. We can't do it if dir is read-only (done in permission())
   2919 *  2. We should have write and exec permissions on dir
   2920 *  3. We can't remove anything from append-only dir
   2921 *  4. We can't do anything with immutable dir (done in permission())
   2922 *  5. If the sticky bit on dir is set we should either
   2923 *	a. be owner of dir, or
   2924 *	b. be owner of victim, or
   2925 *	c. have CAP_FOWNER capability
   2926 *  6. If the victim is append-only or immutable we can't do antyhing with
   2927 *     links pointing to it.
   2928 *  7. If the victim has an unknown uid or gid we can't change the inode.
   2929 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
   2930 *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
   2931 * 10. We can't remove a root or mountpoint.
   2932 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
   2933 *     nfs_async_unlink().
   2934 */
   2935static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
   2936		      struct dentry *victim, bool isdir)
   2937{
   2938	struct inode *inode = d_backing_inode(victim);
   2939	int error;
   2940
   2941	if (d_is_negative(victim))
   2942		return -ENOENT;
   2943	BUG_ON(!inode);
   2944
   2945	BUG_ON(victim->d_parent->d_inode != dir);
   2946
   2947	/* Inode writeback is not safe when the uid or gid are invalid. */
   2948	if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
   2949	    !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
   2950		return -EOVERFLOW;
   2951
   2952	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
   2953
   2954	error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
   2955	if (error)
   2956		return error;
   2957	if (IS_APPEND(dir))
   2958		return -EPERM;
   2959
   2960	if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) ||
   2961	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
   2962	    HAS_UNMAPPED_ID(mnt_userns, inode))
   2963		return -EPERM;
   2964	if (isdir) {
   2965		if (!d_is_dir(victim))
   2966			return -ENOTDIR;
   2967		if (IS_ROOT(victim))
   2968			return -EBUSY;
   2969	} else if (d_is_dir(victim))
   2970		return -EISDIR;
   2971	if (IS_DEADDIR(dir))
   2972		return -ENOENT;
   2973	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
   2974		return -EBUSY;
   2975	return 0;
   2976}
   2977
   2978/*	Check whether we can create an object with dentry child in directory
   2979 *  dir.
   2980 *  1. We can't do it if child already exists (open has special treatment for
   2981 *     this case, but since we are inlined it's OK)
   2982 *  2. We can't do it if dir is read-only (done in permission())
   2983 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
   2984 *  4. We should have write and exec permissions on dir
   2985 *  5. We can't do it if dir is immutable (done in permission())
   2986 */
   2987static inline int may_create(struct user_namespace *mnt_userns,
   2988			     struct inode *dir, struct dentry *child)
   2989{
   2990	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
   2991	if (child->d_inode)
   2992		return -EEXIST;
   2993	if (IS_DEADDIR(dir))
   2994		return -ENOENT;
   2995	if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
   2996		return -EOVERFLOW;
   2997
   2998	return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
   2999}
   3000
   3001/*
   3002 * p1 and p2 should be directories on the same fs.
   3003 */
   3004struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
   3005{
   3006	struct dentry *p;
   3007
   3008	if (p1 == p2) {
   3009		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
   3010		return NULL;
   3011	}
   3012
   3013	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
   3014
   3015	p = d_ancestor(p2, p1);
   3016	if (p) {
   3017		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
   3018		inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
   3019		return p;
   3020	}
   3021
   3022	p = d_ancestor(p1, p2);
   3023	if (p) {
   3024		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
   3025		inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
   3026		return p;
   3027	}
   3028
   3029	inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
   3030	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
   3031	return NULL;
   3032}
   3033EXPORT_SYMBOL(lock_rename);
   3034
   3035void unlock_rename(struct dentry *p1, struct dentry *p2)
   3036{
   3037	inode_unlock(p1->d_inode);
   3038	if (p1 != p2) {
   3039		inode_unlock(p2->d_inode);
   3040		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
   3041	}
   3042}
   3043EXPORT_SYMBOL(unlock_rename);
   3044
   3045/**
   3046 * vfs_create - create new file
   3047 * @mnt_userns:	user namespace of the mount the inode was found from
   3048 * @dir:	inode of @dentry
   3049 * @dentry:	pointer to dentry of the base directory
   3050 * @mode:	mode of the new file
   3051 * @want_excl:	whether the file must not yet exist
   3052 *
   3053 * Create a new file.
   3054 *
   3055 * If the inode has been found through an idmapped mount the user namespace of
   3056 * the vfsmount must be passed through @mnt_userns. This function will then take
   3057 * care to map the inode according to @mnt_userns before checking permissions.
   3058 * On non-idmapped mounts or if permission checking is to be performed on the
   3059 * raw inode simply passs init_user_ns.
   3060 */
   3061int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
   3062	       struct dentry *dentry, umode_t mode, bool want_excl)
   3063{
   3064	int error = may_create(mnt_userns, dir, dentry);
   3065	if (error)
   3066		return error;
   3067
   3068	if (!dir->i_op->create)
   3069		return -EACCES;	/* shouldn't it be ENOSYS? */
   3070	mode &= S_IALLUGO;
   3071	mode |= S_IFREG;
   3072	error = security_inode_create(dir, dentry, mode);
   3073	if (error)
   3074		return error;
   3075	error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl);
   3076	if (!error)
   3077		fsnotify_create(dir, dentry);
   3078	return error;
   3079}
   3080EXPORT_SYMBOL(vfs_create);
   3081
   3082int vfs_mkobj(struct dentry *dentry, umode_t mode,
   3083		int (*f)(struct dentry *, umode_t, void *),
   3084		void *arg)
   3085{
   3086	struct inode *dir = dentry->d_parent->d_inode;
   3087	int error = may_create(&init_user_ns, dir, dentry);
   3088	if (error)
   3089		return error;
   3090
   3091	mode &= S_IALLUGO;
   3092	mode |= S_IFREG;
   3093	error = security_inode_create(dir, dentry, mode);
   3094	if (error)
   3095		return error;
   3096	error = f(dentry, mode, arg);
   3097	if (!error)
   3098		fsnotify_create(dir, dentry);
   3099	return error;
   3100}
   3101EXPORT_SYMBOL(vfs_mkobj);
   3102
   3103bool may_open_dev(const struct path *path)
   3104{
   3105	return !(path->mnt->mnt_flags & MNT_NODEV) &&
   3106		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
   3107}
   3108
   3109static int may_open(struct user_namespace *mnt_userns, const struct path *path,
   3110		    int acc_mode, int flag)
   3111{
   3112	struct dentry *dentry = path->dentry;
   3113	struct inode *inode = dentry->d_inode;
   3114	int error;
   3115
   3116	if (!inode)
   3117		return -ENOENT;
   3118
   3119	switch (inode->i_mode & S_IFMT) {
   3120	case S_IFLNK:
   3121		return -ELOOP;
   3122	case S_IFDIR:
   3123		if (acc_mode & MAY_WRITE)
   3124			return -EISDIR;
   3125		if (acc_mode & MAY_EXEC)
   3126			return -EACCES;
   3127		break;
   3128	case S_IFBLK:
   3129	case S_IFCHR:
   3130		if (!may_open_dev(path))
   3131			return -EACCES;
   3132		fallthrough;
   3133	case S_IFIFO:
   3134	case S_IFSOCK:
   3135		if (acc_mode & MAY_EXEC)
   3136			return -EACCES;
   3137		flag &= ~O_TRUNC;
   3138		break;
   3139	case S_IFREG:
   3140		if ((acc_mode & MAY_EXEC) && path_noexec(path))
   3141			return -EACCES;
   3142		break;
   3143	}
   3144
   3145	error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode);
   3146	if (error)
   3147		return error;
   3148
   3149	/*
   3150	 * An append-only file must be opened in append mode for writing.
   3151	 */
   3152	if (IS_APPEND(inode)) {
   3153		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
   3154			return -EPERM;
   3155		if (flag & O_TRUNC)
   3156			return -EPERM;
   3157	}
   3158
   3159	/* O_NOATIME can only be set by the owner or superuser */
   3160	if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode))
   3161		return -EPERM;
   3162
   3163	return 0;
   3164}
   3165
   3166static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
   3167{
   3168	const struct path *path = &filp->f_path;
   3169	struct inode *inode = path->dentry->d_inode;
   3170	int error = get_write_access(inode);
   3171	if (error)
   3172		return error;
   3173
   3174	error = security_path_truncate(path);
   3175	if (!error) {
   3176		error = do_truncate(mnt_userns, path->dentry, 0,
   3177				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
   3178				    filp);
   3179	}
   3180	put_write_access(inode);
   3181	return error;
   3182}
   3183
   3184static inline int open_to_namei_flags(int flag)
   3185{
   3186	if ((flag & O_ACCMODE) == 3)
   3187		flag--;
   3188	return flag;
   3189}
   3190
   3191static int may_o_create(struct user_namespace *mnt_userns,
   3192			const struct path *dir, struct dentry *dentry,
   3193			umode_t mode)
   3194{
   3195	int error = security_path_mknod(dir, dentry, mode, 0);
   3196	if (error)
   3197		return error;
   3198
   3199	if (!fsuidgid_has_mapping(dir->dentry->d_sb, mnt_userns))
   3200		return -EOVERFLOW;
   3201
   3202	error = inode_permission(mnt_userns, dir->dentry->d_inode,
   3203				 MAY_WRITE | MAY_EXEC);
   3204	if (error)
   3205		return error;
   3206
   3207	return security_inode_create(dir->dentry->d_inode, dentry, mode);
   3208}
   3209
/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry.
 *
 * Calls the parent directory's ->atomic_open() method with
 * @file->f_path.mnt preset to the mount of the parent and
 * @file->f_path.dentry preset to a sentinel value.
 *
 * Returns a dentry on success.  If FMODE_OPENED is set in @file->f_mode the
 * file will have been created and attached to @file by the filesystem
 * calling finish_open(), and the returned dentry is the one in
 * @file->f_path.
 *
 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
 * be set.  The caller will need to perform the open themselves.  The
 * returned dentry is then the one the filesystem left in
 * @file->f_path.dentry, or @dentry itself if that is NULL; a negative
 * result is reported as -ENOENT.
 *
 * Returns an ERR_PTR() otherwise; the dentry reference is dropped in that
 * case.
 */
static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
				  struct file *file,
				  int open_flag, umode_t mode)
{
	/* Sentinel used to detect that ->atomic_open() never set the dentry. */
	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
	struct inode *dir =  nd->path.dentry->d_inode;
	int error;

	if (nd->flags & LOOKUP_DIRECTORY)
		open_flag |= O_DIRECTORY;

	file->f_path.dentry = DENTRY_NOT_SET;
	file->f_path.mnt = nd->path.mnt;
	error = dir->i_op->atomic_open(dir, dentry, file,
				       open_to_namei_flags(open_flag), mode);
	d_lookup_done(dentry);
	if (!error) {
		if (file->f_mode & FMODE_OPENED) {
			/* Opened: switch to the dentry attached to the file. */
			if (unlikely(dentry != file->f_path.dentry)) {
				dput(dentry);
				dentry = dget(file->f_path.dentry);
			}
		} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
			/* Neither opened nor a dentry supplied. */
			error = -EIO;
		} else {
			/* Lookup only: adopt the dentry left in the file, if any. */
			if (file->f_path.dentry) {
				dput(dentry);
				dentry = file->f_path.dentry;
			}
			if (unlikely(d_is_negative(dentry)))
				error = -ENOENT;
		}
	}
	if (error) {
		dput(dentry);
		dentry = ERR_PTR(error);
	}
	return dentry;
}
   3262
/*
 * Look up and maybe create and open the last component.
 *
 * Must be called with parent locked (exclusive in O_CREAT case).
 *
 * Returns the dentry of the last component on success, that is, if
 *  the file was successfully atomically created (if necessary) and opened, or
 *  the file was not completely opened at this time, though lookups and
 *  creations were performed.
 * These cases are distinguished by presence of FMODE_OPENED on file->f_mode.
 * In the latter case the returned dentry might be negative if O_CREAT
 * hadn't been specified.
 *
 * FMODE_CREATED is cleared on entry and set again when this function takes
 * the ->create() path.
 *
 * An ERR_PTR() is returned on failure.
 */
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
				  const struct open_flags *op,
				  bool got_write)
{
	struct user_namespace *mnt_userns;
	struct dentry *dir = nd->path.dentry;
	struct inode *dir_inode = dir->d_inode;
	int open_flag = op->open_flag;
	struct dentry *dentry;
	int error, create_error = 0;
	umode_t mode = op->mode;
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

	if (unlikely(IS_DEADDIR(dir_inode)))
		return ERR_PTR(-ENOENT);

	file->f_mode &= ~FMODE_CREATED;
	dentry = d_lookup(dir, &nd->last);
	/*
	 * Loop until we hold either a dentry that is still in lookup or one
	 * that passed revalidation; dentries failing revalidation with 0 are
	 * invalidated and a fresh one is allocated.
	 */
	for (;;) {
		if (!dentry) {
			dentry = d_alloc_parallel(dir, &nd->last, &wq);
			if (IS_ERR(dentry))
				return dentry;
		}
		if (d_in_lookup(dentry))
			break;

		error = d_revalidate(dentry, nd->flags);
		if (likely(error > 0))
			break;
		if (error)
			goto out_dput;
		d_invalidate(dentry);
		dput(dentry);
		dentry = NULL;
	}
	if (dentry->d_inode) {
		/* Cached positive dentry: will open in f_op->open */
		return dentry;
	}

	/*
	 * Checking write permission is tricky, because we don't know if we are
	 * going to actually need it: O_CREAT opens should work as long as the
	 * file exists.  But checking existence breaks atomicity.  The trick is
	 * to check access and if not granted clear O_CREAT from the flags.
	 *
	 * Another problem is returning the "right" error value (e.g. for an
	 * O_EXCL open we want to return EEXIST not EROFS).
	 */
	if (unlikely(!got_write))
		open_flag &= ~O_TRUNC;
	mnt_userns = mnt_user_ns(nd->path.mnt);
	if (open_flag & O_CREAT) {
		if (open_flag & O_EXCL)
			open_flag &= ~O_TRUNC;
		if (!IS_POSIXACL(dir->d_inode))
			mode &= ~current_umask();
		if (likely(got_write))
			create_error = may_o_create(mnt_userns, &nd->path,
						    dentry, mode);
		else
			create_error = -EROFS;
	}
	/* Creation not permitted: fall back to a plain lookup/open. */
	if (create_error)
		open_flag &= ~O_CREAT;
	if (dir_inode->i_op->atomic_open) {
		dentry = atomic_open(nd, dentry, file, open_flag, mode);
		/* A missing file is reported with the creation error instead. */
		if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
			dentry = ERR_PTR(create_error);
		return dentry;
	}

	if (d_in_lookup(dentry)) {
		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
							     nd->flags);
		d_lookup_done(dentry);
		if (unlikely(res)) {
			if (IS_ERR(res)) {
				error = PTR_ERR(res);
				goto out_dput;
			}
			/* ->lookup() handed back a different dentry: use it. */
			dput(dentry);
			dentry = res;
		}
	}

	/* Negative dentry, just create the file */
	if (!dentry->d_inode && (open_flag & O_CREAT)) {
		file->f_mode |= FMODE_CREATED;
		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
		if (!dir_inode->i_op->create) {
			error = -EACCES;
			goto out_dput;
		}

		error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry,
						mode, open_flag & O_EXCL);
		if (error)
			goto out_dput;
	}
	/* Still negative and creation was refused: report why. */
	if (unlikely(create_error) && !dentry->d_inode) {
		error = create_error;
		goto out_dput;
	}
	return dentry;

out_dput:
	dput(dentry);
	return ERR_PTR(error);
}
   3389
/*
 * Resolve the last component of the path for an open.
 *
 * Returns NULL once nd->path refers to the object to open (or the file has
 * already been created/opened by lookup_open()), an ERR_PTR() on failure,
 * or otherwise whatever handle_dots() or step_into() returned; path_openat()
 * feeds a non-NULL result back into link_path_walk().
 */
static const char *open_last_lookups(struct nameidata *nd,
		   struct file *file, const struct open_flags *op)
{
	struct dentry *dir = nd->path.dentry;
	int open_flag = op->open_flag;
	bool got_write = false;
	unsigned seq;
	struct inode *inode;
	struct dentry *dentry;
	const char *res;

	nd->flags |= op->intent;

	/* Last component is not a normal name: let handle_dots() deal with it. */
	if (nd->last_type != LAST_NORM) {
		if (nd->depth)
			put_link(nd);
		return handle_dots(nd, nd->last_type);
	}

	if (!(open_flag & O_CREAT)) {
		/* A character follows the last component (trailing slash). */
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
		/* we _can_ be in RCU mode here */
		dentry = lookup_fast(nd, &inode, &seq);
		if (IS_ERR(dentry))
			return ERR_CAST(dentry);
		if (likely(dentry))
			goto finish_lookup;

		BUG_ON(nd->flags & LOOKUP_RCU);
	} else {
		/* create side of things */
		if (nd->flags & LOOKUP_RCU) {
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
		}
		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
		/* trailing slashes? */
		if (unlikely(nd->last.name[nd->last.len]))
			return ERR_PTR(-EISDIR);
	}

	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
		got_write = !mnt_want_write(nd->path.mnt);
		/*
		 * do _not_ fail yet - we might not need that or fail with
		 * a different error; let lookup_open() decide; we'll be
		 * dropping this one anyway.
		 */
	}
	/* Exclusive lock on the parent when creating, shared otherwise. */
	if (open_flag & O_CREAT)
		inode_lock(dir->d_inode);
	else
		inode_lock_shared(dir->d_inode);
	dentry = lookup_open(nd, file, op, got_write);
	if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
		fsnotify_create(dir->d_inode, dentry);
	if (open_flag & O_CREAT)
		inode_unlock(dir->d_inode);
	else
		inode_unlock_shared(dir->d_inode);

	if (got_write)
		mnt_drop_write(nd->path.mnt);

	if (IS_ERR(dentry))
		return ERR_CAST(dentry);

	/* Already opened or freshly created: move nd->path onto the result. */
	if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
		dput(nd->path.dentry);
		nd->path.dentry = dentry;
		return NULL;
	}

finish_lookup:
	if (nd->depth)
		put_link(nd);
	res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
	/* step_into() returned something: drop the open intent flags. */
	if (unlikely(res))
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
	return res;
}
   3472
/*
 * Handle the last step of open()
 *
 * Runs after the final component has been resolved into nd->path: completes
 * the walk if the file was neither opened nor created yet, applies the
 * O_CREAT/O_EXCL and directory checks, then performs may_open(), vfs_open()
 * (unless already opened), ima_file_check() and, if requested, truncation.
 *
 * Returns 0 on success or a negative errno; a positive value from the
 * helpers triggers a warning and is turned into -EINVAL.
 */
static int do_open(struct nameidata *nd,
		   struct file *file, const struct open_flags *op)
{
	struct user_namespace *mnt_userns;
	int open_flag = op->open_flag;
	/* Local flag; shadows the do_truncate() function inside this scope. */
	bool do_truncate;
	int acc_mode;
	int error;

	if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
		error = complete_walk(nd);
		if (error)
			return error;
	}
	if (!(file->f_mode & FMODE_CREATED))
		audit_inode(nd->name, nd->path.dentry, 0);
	mnt_userns = mnt_user_ns(nd->path.mnt);
	if (open_flag & O_CREAT) {
		/* O_EXCL demands that this very open created the file. */
		if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
			return -EEXIST;
		if (d_is_dir(nd->path.dentry))
			return -EISDIR;
		error = may_create_in_sticky(mnt_userns, nd,
					     d_backing_inode(nd->path.dentry));
		if (unlikely(error))
			return error;
	}
	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
		return -ENOTDIR;

	do_truncate = false;
	acc_mode = op->acc_mode;
	if (file->f_mode & FMODE_CREATED) {
		/* Don't check for write permission, don't truncate */
		open_flag &= ~O_TRUNC;
		acc_mode = 0;
	} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
		/* Write access to the mount is held until the end of the function. */
		error = mnt_want_write(nd->path.mnt);
		if (error)
			return error;
		do_truncate = true;
	}
	error = may_open(mnt_userns, &nd->path, acc_mode, open_flag);
	if (!error && !(file->f_mode & FMODE_OPENED))
		error = vfs_open(&nd->path, file);
	if (!error)
		error = ima_file_check(file, op->acc_mode);
	if (!error && do_truncate)
		error = handle_truncate(mnt_userns, file);
	if (unlikely(error > 0)) {
		WARN_ON(1);
		error = -EINVAL;
	}
	if (do_truncate)
		mnt_drop_write(nd->path.mnt);
	return error;
}
   3533
   3534/**
   3535 * vfs_tmpfile - create tmpfile
   3536 * @mnt_userns:	user namespace of the mount the inode was found from
   3537 * @dentry:	pointer to dentry of the base directory
   3538 * @mode:	mode of the new tmpfile
   3539 * @open_flag:	flags
   3540 *
   3541 * Create a temporary file.
   3542 *
   3543 * If the inode has been found through an idmapped mount the user namespace of
   3544 * the vfsmount must be passed through @mnt_userns. This function will then take
   3545 * care to map the inode according to @mnt_userns before checking permissions.
   3546 * On non-idmapped mounts or if permission checking is to be performed on the
   3547 * raw inode simply passs init_user_ns.
   3548 */
   3549struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
   3550			   struct dentry *dentry, umode_t mode, int open_flag)
   3551{
   3552	struct dentry *child = NULL;
   3553	struct inode *dir = dentry->d_inode;
   3554	struct inode *inode;
   3555	int error;
   3556
   3557	/* we want directory to be writable */
   3558	error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
   3559	if (error)
   3560		goto out_err;
   3561	error = -EOPNOTSUPP;
   3562	if (!dir->i_op->tmpfile)
   3563		goto out_err;
   3564	error = -ENOMEM;
   3565	child = d_alloc(dentry, &slash_name);
   3566	if (unlikely(!child))
   3567		goto out_err;
   3568	error = dir->i_op->tmpfile(mnt_userns, dir, child, mode);
   3569	if (error)
   3570		goto out_err;
   3571	error = -ENOENT;
   3572	inode = child->d_inode;
   3573	if (unlikely(!inode))
   3574		goto out_err;
   3575	if (!(open_flag & O_EXCL)) {
   3576		spin_lock(&inode->i_lock);
   3577		inode->i_state |= I_LINKABLE;
   3578		spin_unlock(&inode->i_lock);
   3579	}
   3580	ima_post_create_tmpfile(mnt_userns, inode);
   3581	return child;
   3582
   3583out_err:
   3584	dput(child);
   3585	return ERR_PTR(error);
   3586}
   3587EXPORT_SYMBOL(vfs_tmpfile);
   3588
   3589static int do_tmpfile(struct nameidata *nd, unsigned flags,
   3590		const struct open_flags *op,
   3591		struct file *file)
   3592{
   3593	struct user_namespace *mnt_userns;
   3594	struct dentry *child;
   3595	struct path path;
   3596	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
   3597	if (unlikely(error))
   3598		return error;
   3599	error = mnt_want_write(path.mnt);
   3600	if (unlikely(error))
   3601		goto out;
   3602	mnt_userns = mnt_user_ns(path.mnt);
   3603	child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag);
   3604	error = PTR_ERR(child);
   3605	if (IS_ERR(child))
   3606		goto out2;
   3607	dput(path.dentry);
   3608	path.dentry = child;
   3609	audit_inode(nd->name, child, 0);
   3610	/* Don't check for other permissions, the inode was just created */
   3611	error = may_open(mnt_userns, &path, 0, op->open_flag);
   3612	if (!error)
   3613		error = vfs_open(&path, file);
   3614out2:
   3615	mnt_drop_write(path.mnt);
   3616out:
   3617	path_put(&path);
   3618	return error;
   3619}
   3620
   3621static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
   3622{
   3623	struct path path;
   3624	int error = path_lookupat(nd, flags, &path);
   3625	if (!error) {
   3626		audit_inode(nd->name, path.dentry, 0);
   3627		error = vfs_open(&path, file);
   3628		path_put(&path);
   3629	}
   3630	return error;
   3631}
   3632
/*
 * Perform one attempt at opening the path described by @nd with lookup
 * flags @flags.
 *
 * Allocates the struct file, then dispatches to do_tmpfile(), do_o_path()
 * or the regular walk (path_init() / link_path_walk() /
 * open_last_lookups() / do_open()).
 *
 * Returns the opened file, or an ERR_PTR().  -EOPENSTALE is translated to
 * -ECHILD when LOOKUP_RCU was requested and to -ESTALE otherwise.
 */
static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
{
	struct file *file;
	int error;

	file = alloc_empty_file(op->open_flag, current_cred());
	if (IS_ERR(file))
		return file;

	if (unlikely(file->f_flags & __O_TMPFILE)) {
		error = do_tmpfile(nd, flags, op, file);
	} else if (unlikely(file->f_flags & O_PATH)) {
		error = do_o_path(nd, flags, file);
	} else {
		const char *s = path_init(nd, flags);
		/*
		 * Keep walking for as long as open_last_lookups() hands back
		 * another string to process; a NULL result ends the loop with
		 * error == 0.
		 */
		while (!(error = link_path_walk(s, nd)) &&
		       (s = open_last_lookups(nd, file, op)) != NULL)
			;
		if (!error)
			error = do_open(nd, file, op);
		terminate_walk(nd);
	}
	if (likely(!error)) {
		if (likely(file->f_mode & FMODE_OPENED))
			return file;
		/* Success reported but the file was never opened. */
		WARN_ON(1);
		error = -EINVAL;
	}
	fput(file);
	if (error == -EOPENSTALE) {
		if (flags & LOOKUP_RCU)
			error = -ECHILD;
		else
			error = -ESTALE;
	}
	return ERR_PTR(error);
}
   3671
   3672struct file *do_filp_open(int dfd, struct filename *pathname,
   3673		const struct open_flags *op)
   3674{
   3675	struct nameidata nd;
   3676	int flags = op->lookup_flags;
   3677	struct file *filp;
   3678
   3679	set_nameidata(&nd, dfd, pathname, NULL);
   3680	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
   3681	if (unlikely(filp == ERR_PTR(-ECHILD)))
   3682		filp = path_openat(&nd, op, flags);
   3683	if (unlikely(filp == ERR_PTR(-ESTALE)))
   3684		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
   3685	restore_nameidata();
   3686	return filp;
   3687}
   3688
   3689struct file *do_file_open_root(const struct path *root,
   3690		const char *name, const struct open_flags *op)
   3691{
   3692	struct nameidata nd;
   3693	struct file *file;
   3694	struct filename *filename;
   3695	int flags = op->lookup_flags;
   3696
   3697	if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN)
   3698		return ERR_PTR(-ELOOP);
   3699
   3700	filename = getname_kernel(name);
   3701	if (IS_ERR(filename))
   3702		return ERR_CAST(filename);
   3703
   3704	set_nameidata(&nd, -1, filename, root);
   3705	file = path_openat(&nd, op, flags | LOOKUP_RCU);
   3706	if (unlikely(file == ERR_PTR(-ECHILD)))
   3707		file = path_openat(&nd, op, flags);
   3708	if (unlikely(file == ERR_PTR(-ESTALE)))
   3709		file = path_openat(&nd, op, flags | LOOKUP_REVAL);
   3710	restore_nameidata();
   3711	putname(filename);
   3712	return file;
   3713}
   3714
   3715static struct dentry *filename_create(int dfd, struct filename *name,
   3716				      struct path *path, unsigned int lookup_flags)
   3717{
   3718	struct dentry *dentry = ERR_PTR(-EEXIST);
   3719	struct qstr last;
   3720	bool want_dir = lookup_flags & LOOKUP_DIRECTORY;
   3721	unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
   3722	unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL;
   3723	int type;
   3724	int err2;
   3725	int error;
   3726
   3727	error = filename_parentat(dfd, name, reval_flag, path, &last, &type);
   3728	if (error)
   3729		return ERR_PTR(error);
   3730
   3731	/*
   3732	 * Yucky last component or no last component at all?
   3733	 * (foo/., foo/.., /////)
   3734	 */
   3735	if (unlikely(type != LAST_NORM))
   3736		goto out;
   3737
   3738	/* don't fail immediately if it's r/o, at least try to report other errors */
   3739	err2 = mnt_want_write(path->mnt);
   3740	/*
   3741	 * Do the final lookup.  Suppress 'create' if there is a trailing
   3742	 * '/', and a directory wasn't requested.
   3743	 */
   3744	if (last.name[last.len] && !want_dir)
   3745		create_flags = 0;
   3746	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
   3747	dentry = __lookup_hash(&last, path->dentry, reval_flag | create_flags);
   3748	if (IS_ERR(dentry))
   3749		goto unlock;
   3750
   3751	error = -EEXIST;
   3752	if (d_is_positive(dentry))
   3753		goto fail;
   3754
   3755	/*
   3756	 * Special case - lookup gave negative, but... we had foo/bar/
   3757	 * From the vfs_mknod() POV we just have a negative dentry -
   3758	 * all is fine. Let's be bastards - you had / on the end, you've
   3759	 * been asking for (non-existent) directory. -ENOENT for you.
   3760	 */
   3761	if (unlikely(!create_flags)) {
   3762		error = -ENOENT;
   3763		goto fail;
   3764	}
   3765	if (unlikely(err2)) {
   3766		error = err2;
   3767		goto fail;
   3768	}
   3769	return dentry;
   3770fail:
   3771	dput(dentry);
   3772	dentry = ERR_PTR(error);
   3773unlock:
   3774	inode_unlock(path->dentry->d_inode);
   3775	if (!err2)
   3776		mnt_drop_write(path->mnt);
   3777out:
   3778	path_put(path);
   3779	return dentry;
   3780}
   3781
   3782struct dentry *kern_path_create(int dfd, const char *pathname,
   3783				struct path *path, unsigned int lookup_flags)
   3784{
   3785	struct filename *filename = getname_kernel(pathname);
   3786	struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
   3787
   3788	putname(filename);
   3789	return res;
   3790}
   3791EXPORT_SYMBOL(kern_path_create);
   3792
   3793void done_path_create(struct path *path, struct dentry *dentry)
   3794{
   3795	dput(dentry);
   3796	inode_unlock(path->dentry->d_inode);
   3797	mnt_drop_write(path->mnt);
   3798	path_put(path);
   3799}
   3800EXPORT_SYMBOL(done_path_create);
   3801
   3802inline struct dentry *user_path_create(int dfd, const char __user *pathname,
   3803				struct path *path, unsigned int lookup_flags)
   3804{
   3805	struct filename *filename = getname(pathname);
   3806	struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
   3807
   3808	putname(filename);
   3809	return res;
   3810}
   3811EXPORT_SYMBOL(user_path_create);
   3812
   3813/**
   3814 * vfs_mknod - create device node or file
   3815 * @mnt_userns:	user namespace of the mount the inode was found from
   3816 * @dir:	inode of @dentry
   3817 * @dentry:	pointer to dentry of the base directory
   3818 * @mode:	mode of the new device node or file
   3819 * @dev:	device number of device to create
   3820 *
   3821 * Create a device node or file.
   3822 *
   3823 * If the inode has been found through an idmapped mount the user namespace of
   3824 * the vfsmount must be passed through @mnt_userns. This function will then take
   3825 * care to map the inode according to @mnt_userns before checking permissions.
   3826 * On non-idmapped mounts or if permission checking is to be performed on the
   3827 * raw inode simply passs init_user_ns.
   3828 */
   3829int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
   3830	      struct dentry *dentry, umode_t mode, dev_t dev)
   3831{
   3832	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
   3833	int error = may_create(mnt_userns, dir, dentry);
   3834
   3835	if (error)
   3836		return error;
   3837
   3838	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
   3839	    !capable(CAP_MKNOD))
   3840		return -EPERM;
   3841
   3842	if (!dir->i_op->mknod)
   3843		return -EPERM;
   3844
   3845	error = devcgroup_inode_mknod(mode, dev);
   3846	if (error)
   3847		return error;
   3848
   3849	error = security_inode_mknod(dir, dentry, mode, dev);
   3850	if (error)
   3851		return error;
   3852
   3853	error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev);
   3854	if (!error)
   3855		fsnotify_create(dir, dentry);
   3856	return error;
   3857}
   3858EXPORT_SYMBOL(vfs_mknod);
   3859
   3860static int may_mknod(umode_t mode)
   3861{
   3862	switch (mode & S_IFMT) {
   3863	case S_IFREG:
   3864	case S_IFCHR:
   3865	case S_IFBLK:
   3866	case S_IFIFO:
   3867	case S_IFSOCK:
   3868	case 0: /* zero mode translates to S_IFREG */
   3869		return 0;
   3870	case S_IFDIR:
   3871		return -EPERM;
   3872	default:
   3873		return -EINVAL;
   3874	}
   3875}
   3876
   3877static int do_mknodat(int dfd, struct filename *name, umode_t mode,
   3878		unsigned int dev)
   3879{
   3880	struct user_namespace *mnt_userns;
   3881	struct dentry *dentry;
   3882	struct path path;
   3883	int error;
   3884	unsigned int lookup_flags = 0;
   3885
   3886	error = may_mknod(mode);
   3887	if (error)
   3888		goto out1;
   3889retry:
   3890	dentry = filename_create(dfd, name, &path, lookup_flags);
   3891	error = PTR_ERR(dentry);
   3892	if (IS_ERR(dentry))
   3893		goto out1;
   3894
   3895	if (!IS_POSIXACL(path.dentry->d_inode))
   3896		mode &= ~current_umask();
   3897	error = security_path_mknod(&path, dentry, mode, dev);
   3898	if (error)
   3899		goto out2;
   3900
   3901	mnt_userns = mnt_user_ns(path.mnt);
   3902	switch (mode & S_IFMT) {
   3903		case 0: case S_IFREG:
   3904			error = vfs_create(mnt_userns, path.dentry->d_inode,
   3905					   dentry, mode, true);
   3906			if (!error)
   3907				ima_post_path_mknod(mnt_userns, dentry);
   3908			break;
   3909		case S_IFCHR: case S_IFBLK:
   3910			error = vfs_mknod(mnt_userns, path.dentry->d_inode,
   3911					  dentry, mode, new_decode_dev(dev));
   3912			break;
   3913		case S_IFIFO: case S_IFSOCK:
   3914			error = vfs_mknod(mnt_userns, path.dentry->d_inode,
   3915					  dentry, mode, 0);
   3916			break;
   3917	}
   3918out2:
   3919	done_path_create(&path, dentry);
   3920	if (retry_estale(error, lookup_flags)) {
   3921		lookup_flags |= LOOKUP_REVAL;
   3922		goto retry;
   3923	}
   3924out1:
   3925	putname(name);
   3926	return error;
   3927}
   3928
   3929SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
   3930		unsigned int, dev)
   3931{
   3932	return do_mknodat(dfd, getname(filename), mode, dev);
   3933}
   3934
   3935SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
   3936{
   3937	return do_mknodat(AT_FDCWD, getname(filename), mode, dev);
   3938}
   3939
   3940/**
   3941 * vfs_mkdir - create directory
   3942 * @mnt_userns:	user namespace of the mount the inode was found from
   3943 * @dir:	inode of @dentry
   3944 * @dentry:	pointer to dentry of the base directory
   3945 * @mode:	mode of the new directory
   3946 *
   3947 * Create a directory.
   3948 *
   3949 * If the inode has been found through an idmapped mount the user namespace of
   3950 * the vfsmount must be passed through @mnt_userns. This function will then take
   3951 * care to map the inode according to @mnt_userns before checking permissions.
   3952 * On non-idmapped mounts or if permission checking is to be performed on the
   3953 * raw inode simply passs init_user_ns.
   3954 */
   3955int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
   3956	      struct dentry *dentry, umode_t mode)
   3957{
   3958	int error = may_create(mnt_userns, dir, dentry);
   3959	unsigned max_links = dir->i_sb->s_max_links;
   3960
   3961	if (error)
   3962		return error;
   3963
   3964	if (!dir->i_op->mkdir)
   3965		return -EPERM;
   3966
   3967	mode &= (S_IRWXUGO|S_ISVTX);
   3968	error = security_inode_mkdir(dir, dentry, mode);
   3969	if (error)
   3970		return error;
   3971
   3972	if (max_links && dir->i_nlink >= max_links)
   3973		return -EMLINK;
   3974
   3975	error = dir->i_op->mkdir(mnt_userns, dir, dentry, mode);
   3976	if (!error)
   3977		fsnotify_mkdir(dir, dentry);
   3978	return error;
   3979}
   3980EXPORT_SYMBOL(vfs_mkdir);
   3981
   3982int do_mkdirat(int dfd, struct filename *name, umode_t mode)
   3983{
   3984	struct dentry *dentry;
   3985	struct path path;
   3986	int error;
   3987	unsigned int lookup_flags = LOOKUP_DIRECTORY;
   3988
   3989retry:
   3990	dentry = filename_create(dfd, name, &path, lookup_flags);
   3991	error = PTR_ERR(dentry);
   3992	if (IS_ERR(dentry))
   3993		goto out_putname;
   3994
   3995	if (!IS_POSIXACL(path.dentry->d_inode))
   3996		mode &= ~current_umask();
   3997	error = security_path_mkdir(&path, dentry, mode);
   3998	if (!error) {
   3999		struct user_namespace *mnt_userns;
   4000		mnt_userns = mnt_user_ns(path.mnt);
   4001		error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry,
   4002				  mode);
   4003	}
   4004	done_path_create(&path, dentry);
   4005	if (retry_estale(error, lookup_flags)) {
   4006		lookup_flags |= LOOKUP_REVAL;
   4007		goto retry;
   4008	}
   4009out_putname:
   4010	putname(name);
   4011	return error;
   4012}
   4013
   4014SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
   4015{
   4016	return do_mkdirat(dfd, getname(pathname), mode);
   4017}
   4018
   4019SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
   4020{
   4021	return do_mkdirat(AT_FDCWD, getname(pathname), mode);
   4022}
   4023
   4024/**
   4025 * vfs_rmdir - remove directory
   4026 * @mnt_userns:	user namespace of the mount the inode was found from
   4027 * @dir:	inode of @dentry
   4028 * @dentry:	pointer to dentry of the base directory
   4029 *
   4030 * Remove a directory.
   4031 *
   4032 * If the inode has been found through an idmapped mount the user namespace of
   4033 * the vfsmount must be passed through @mnt_userns. This function will then take
   4034 * care to map the inode according to @mnt_userns before checking permissions.
   4035 * On non-idmapped mounts or if permission checking is to be performed on the
   4036 * raw inode simply passs init_user_ns.
   4037 */
   4038int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
   4039		     struct dentry *dentry)
   4040{
   4041	int error = may_delete(mnt_userns, dir, dentry, 1);
   4042
   4043	if (error)
   4044		return error;
   4045
   4046	if (!dir->i_op->rmdir)
   4047		return -EPERM;
   4048
   4049	dget(dentry);
   4050	inode_lock(dentry->d_inode);
   4051
   4052	error = -EBUSY;
   4053	if (is_local_mountpoint(dentry) ||
   4054	    (dentry->d_inode->i_flags & S_KERNEL_FILE))
   4055		goto out;
   4056
   4057	error = security_inode_rmdir(dir, dentry);
   4058	if (error)
   4059		goto out;
   4060
   4061	error = dir->i_op->rmdir(dir, dentry);
   4062	if (error)
   4063		goto out;
   4064
   4065	shrink_dcache_parent(dentry);
   4066	dentry->d_inode->i_flags |= S_DEAD;
   4067	dont_mount(dentry);
   4068	detach_mounts(dentry);
   4069
   4070out:
   4071	inode_unlock(dentry->d_inode);
   4072	dput(dentry);
   4073	if (!error)
   4074		d_delete_notify(dir, dentry);
   4075	return error;
   4076}
   4077EXPORT_SYMBOL(vfs_rmdir);
   4078
   4079int do_rmdir(int dfd, struct filename *name)
   4080{
   4081	struct user_namespace *mnt_userns;
   4082	int error;
   4083	struct dentry *dentry;
   4084	struct path path;
   4085	struct qstr last;
   4086	int type;
   4087	unsigned int lookup_flags = 0;
   4088retry:
   4089	error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
   4090	if (error)
   4091		goto exit1;
   4092
   4093	switch (type) {
   4094	case LAST_DOTDOT:
   4095		error = -ENOTEMPTY;
   4096		goto exit2;
   4097	case LAST_DOT:
   4098		error = -EINVAL;
   4099		goto exit2;
   4100	case LAST_ROOT:
   4101		error = -EBUSY;
   4102		goto exit2;
   4103	}
   4104
   4105	error = mnt_want_write(path.mnt);
   4106	if (error)
   4107		goto exit2;
   4108
   4109	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
   4110	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
   4111	error = PTR_ERR(dentry);
   4112	if (IS_ERR(dentry))
   4113		goto exit3;
   4114	if (!dentry->d_inode) {
   4115		error = -ENOENT;
   4116		goto exit4;
   4117	}
   4118	error = security_path_rmdir(&path, dentry);
   4119	if (error)
   4120		goto exit4;
   4121	mnt_userns = mnt_user_ns(path.mnt);
   4122	error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry);
   4123exit4:
   4124	dput(dentry);
   4125exit3:
   4126	inode_unlock(path.dentry->d_inode);
   4127	mnt_drop_write(path.mnt);
   4128exit2:
   4129	path_put(&path);
   4130	if (retry_estale(error, lookup_flags)) {
   4131		lookup_flags |= LOOKUP_REVAL;
   4132		goto retry;
   4133	}
   4134exit1:
   4135	putname(name);
   4136	return error;
   4137}
   4138
   4139SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
   4140{
   4141	return do_rmdir(AT_FDCWD, getname(pathname));
   4142}
   4143
   4144/**
   4145 * vfs_unlink - unlink a filesystem object
   4146 * @mnt_userns:	user namespace of the mount the inode was found from
   4147 * @dir:	parent directory
   4148 * @dentry:	victim
   4149 * @delegated_inode: returns victim inode, if the inode is delegated.
   4150 *
   4151 * The caller must hold dir->i_mutex.
   4152 *
   4153 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
   4154 * return a reference to the inode in delegated_inode.  The caller
   4155 * should then break the delegation on that inode and retry.  Because
   4156 * breaking a delegation may take a long time, the caller should drop
   4157 * dir->i_mutex before doing so.
   4158 *
   4159 * Alternatively, a caller may pass NULL for delegated_inode.  This may
   4160 * be appropriate for callers that expect the underlying filesystem not
   4161 * to be NFS exported.
   4162 *
   4163 * If the inode has been found through an idmapped mount the user namespace of
   4164 * the vfsmount must be passed through @mnt_userns. This function will then take
   4165 * care to map the inode according to @mnt_userns before checking permissions.
   4166 * On non-idmapped mounts or if permission checking is to be performed on the
   4167 * raw inode simply passs init_user_ns.
   4168 */
   4169int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
   4170	       struct dentry *dentry, struct inode **delegated_inode)
   4171{
   4172	struct inode *target = dentry->d_inode;
   4173	int error = may_delete(mnt_userns, dir, dentry, 0);
   4174
   4175	if (error)
   4176		return error;
   4177
   4178	if (!dir->i_op->unlink)
   4179		return -EPERM;
   4180
   4181	inode_lock(target);
   4182	if (IS_SWAPFILE(target))
   4183		error = -EPERM;
   4184	else if (is_local_mountpoint(dentry))
   4185		error = -EBUSY;
   4186	else {
   4187		error = security_inode_unlink(dir, dentry);
   4188		if (!error) {
   4189			error = try_break_deleg(target, delegated_inode);
   4190			if (error)
   4191				goto out;
   4192			error = dir->i_op->unlink(dir, dentry);
   4193			if (!error) {
   4194				dont_mount(dentry);
   4195				detach_mounts(dentry);
   4196			}
   4197		}
   4198	}
   4199out:
   4200	inode_unlock(target);
   4201
   4202	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
   4203	if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
   4204		fsnotify_unlink(dir, dentry);
   4205	} else if (!error) {
   4206		fsnotify_link_count(target);
   4207		d_delete_notify(dir, dentry);
   4208	}
   4209
   4210	return error;
   4211}
   4212EXPORT_SYMBOL(vfs_unlink);
   4213
   4214/*
   4215 * Make sure that the actual truncation of the file will occur outside its
   4216 * directory's i_mutex.  Truncate can take a long time if there is a lot of
   4217 * writeout happening, and we don't want to prevent access to the directory
   4218 * while waiting on the I/O.
   4219 */
   4220int do_unlinkat(int dfd, struct filename *name)
   4221{
   4222	int error;
   4223	struct dentry *dentry;
   4224	struct path path;
   4225	struct qstr last;
   4226	int type;
   4227	struct inode *inode = NULL;
   4228	struct inode *delegated_inode = NULL;
   4229	unsigned int lookup_flags = 0;
   4230retry:
   4231	error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
   4232	if (error)
   4233		goto exit1;
   4234
   4235	error = -EISDIR;
   4236	if (type != LAST_NORM)
   4237		goto exit2;
   4238
   4239	error = mnt_want_write(path.mnt);
   4240	if (error)
   4241		goto exit2;
   4242retry_deleg:
   4243	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
   4244	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
   4245	error = PTR_ERR(dentry);
   4246	if (!IS_ERR(dentry)) {
   4247		struct user_namespace *mnt_userns;
   4248
   4249		/* Why not before? Because we want correct error value */
   4250		if (last.name[last.len])
   4251			goto slashes;
   4252		inode = dentry->d_inode;
   4253		if (d_is_negative(dentry))
   4254			goto slashes;
   4255		ihold(inode);
   4256		error = security_path_unlink(&path, dentry);
   4257		if (error)
   4258			goto exit3;
   4259		mnt_userns = mnt_user_ns(path.mnt);
   4260		error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry,
   4261				   &delegated_inode);
   4262exit3:
   4263		dput(dentry);
   4264	}
   4265	inode_unlock(path.dentry->d_inode);
   4266	if (inode)
   4267		iput(inode);	/* truncate the inode here */
   4268	inode = NULL;
   4269	if (delegated_inode) {
   4270		error = break_deleg_wait(&delegated_inode);
   4271		if (!error)
   4272			goto retry_deleg;
   4273	}
   4274	mnt_drop_write(path.mnt);
   4275exit2:
   4276	path_put(&path);
   4277	if (retry_estale(error, lookup_flags)) {
   4278		lookup_flags |= LOOKUP_REVAL;
   4279		inode = NULL;
   4280		goto retry;
   4281	}
   4282exit1:
   4283	putname(name);
   4284	return error;
   4285
   4286slashes:
   4287	if (d_is_negative(dentry))
   4288		error = -ENOENT;
   4289	else if (d_is_dir(dentry))
   4290		error = -EISDIR;
   4291	else
   4292		error = -ENOTDIR;
   4293	goto exit3;
   4294}
   4295
   4296SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
   4297{
   4298	if ((flag & ~AT_REMOVEDIR) != 0)
   4299		return -EINVAL;
   4300
   4301	if (flag & AT_REMOVEDIR)
   4302		return do_rmdir(dfd, getname(pathname));
   4303	return do_unlinkat(dfd, getname(pathname));
   4304}
   4305
   4306SYSCALL_DEFINE1(unlink, const char __user *, pathname)
   4307{
   4308	return do_unlinkat(AT_FDCWD, getname(pathname));
   4309}
   4310
   4311/**
   4312 * vfs_symlink - create symlink
   4313 * @mnt_userns:	user namespace of the mount the inode was found from
   4314 * @dir:	inode of @dentry
   4315 * @dentry:	pointer to dentry of the base directory
   4316 * @oldname:	name of the file to link to
   4317 *
   4318 * Create a symlink.
   4319 *
   4320 * If the inode has been found through an idmapped mount the user namespace of
   4321 * the vfsmount must be passed through @mnt_userns. This function will then take
   4322 * care to map the inode according to @mnt_userns before checking permissions.
   4323 * On non-idmapped mounts or if permission checking is to be performed on the
   4324 * raw inode simply passs init_user_ns.
   4325 */
   4326int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
   4327		struct dentry *dentry, const char *oldname)
   4328{
   4329	int error = may_create(mnt_userns, dir, dentry);
   4330
   4331	if (error)
   4332		return error;
   4333
   4334	if (!dir->i_op->symlink)
   4335		return -EPERM;
   4336
   4337	error = security_inode_symlink(dir, dentry, oldname);
   4338	if (error)
   4339		return error;
   4340
   4341	error = dir->i_op->symlink(mnt_userns, dir, dentry, oldname);
   4342	if (!error)
   4343		fsnotify_create(dir, dentry);
   4344	return error;
   4345}
   4346EXPORT_SYMBOL(vfs_symlink);
   4347
   4348int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
   4349{
   4350	int error;
   4351	struct dentry *dentry;
   4352	struct path path;
   4353	unsigned int lookup_flags = 0;
   4354
   4355	if (IS_ERR(from)) {
   4356		error = PTR_ERR(from);
   4357		goto out_putnames;
   4358	}
   4359retry:
   4360	dentry = filename_create(newdfd, to, &path, lookup_flags);
   4361	error = PTR_ERR(dentry);
   4362	if (IS_ERR(dentry))
   4363		goto out_putnames;
   4364
   4365	error = security_path_symlink(&path, dentry, from->name);
   4366	if (!error) {
   4367		struct user_namespace *mnt_userns;
   4368
   4369		mnt_userns = mnt_user_ns(path.mnt);
   4370		error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry,
   4371				    from->name);
   4372	}
   4373	done_path_create(&path, dentry);
   4374	if (retry_estale(error, lookup_flags)) {
   4375		lookup_flags |= LOOKUP_REVAL;
   4376		goto retry;
   4377	}
   4378out_putnames:
   4379	putname(to);
   4380	putname(from);
   4381	return error;
   4382}
   4383
/*
 * symlinkat(2): copy in both names and let do_symlinkat() do the work.
 * Neither getname() result is checked here; both are handed straight to
 * do_symlinkat().
 */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	return do_symlinkat(getname(oldname), newdfd, getname(newname));
}
   4389
/* symlink(2): same as symlinkat(2) with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
	return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname));
}
   4394
   4395/**
   4396 * vfs_link - create a new link
   4397 * @old_dentry:	object to be linked
   4398 * @mnt_userns:	the user namespace of the mount
   4399 * @dir:	new parent
   4400 * @new_dentry:	where to create the new link
   4401 * @delegated_inode: returns inode needing a delegation break
   4402 *
   4403 * The caller must hold dir->i_mutex
   4404 *
   4405 * If vfs_link discovers a delegation on the to-be-linked file in need
   4406 * of breaking, it will return -EWOULDBLOCK and return a reference to the
   4407 * inode in delegated_inode.  The caller should then break the delegation
   4408 * and retry.  Because breaking a delegation may take a long time, the
   4409 * caller should drop the i_mutex before doing so.
   4410 *
   4411 * Alternatively, a caller may pass NULL for delegated_inode.  This may
   4412 * be appropriate for callers that expect the underlying filesystem not
   4413 * to be NFS exported.
   4414 *
   4415 * If the inode has been found through an idmapped mount the user namespace of
   4416 * the vfsmount must be passed through @mnt_userns. This function will then take
   4417 * care to map the inode according to @mnt_userns before checking permissions.
   4418 * On non-idmapped mounts or if permission checking is to be performed on the
   4419 * raw inode simply passs init_user_ns.
   4420 */
   4421int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns,
   4422	     struct inode *dir, struct dentry *new_dentry,
   4423	     struct inode **delegated_inode)
   4424{
   4425	struct inode *inode = old_dentry->d_inode;
   4426	unsigned max_links = dir->i_sb->s_max_links;
   4427	int error;
   4428
   4429	if (!inode)
   4430		return -ENOENT;
   4431
   4432	error = may_create(mnt_userns, dir, new_dentry);
   4433	if (error)
   4434		return error;
   4435
   4436	if (dir->i_sb != inode->i_sb)
   4437		return -EXDEV;
   4438
   4439	/*
   4440	 * A link to an append-only or immutable file cannot be created.
   4441	 */
   4442	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
   4443		return -EPERM;
   4444	/*
   4445	 * Updating the link count will likely cause i_uid and i_gid to
   4446	 * be writen back improperly if their true value is unknown to
   4447	 * the vfs.
   4448	 */
   4449	if (HAS_UNMAPPED_ID(mnt_userns, inode))
   4450		return -EPERM;
   4451	if (!dir->i_op->link)
   4452		return -EPERM;
   4453	if (S_ISDIR(inode->i_mode))
   4454		return -EPERM;
   4455
   4456	error = security_inode_link(old_dentry, dir, new_dentry);
   4457	if (error)
   4458		return error;
   4459
   4460	inode_lock(inode);
   4461	/* Make sure we don't allow creating hardlink to an unlinked file */
   4462	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
   4463		error =  -ENOENT;
   4464	else if (max_links && inode->i_nlink >= max_links)
   4465		error = -EMLINK;
   4466	else {
   4467		error = try_break_deleg(inode, delegated_inode);
   4468		if (!error)
   4469			error = dir->i_op->link(old_dentry, dir, new_dentry);
   4470	}
   4471
   4472	if (!error && (inode->i_state & I_LINKABLE)) {
   4473		spin_lock(&inode->i_lock);
   4474		inode->i_state &= ~I_LINKABLE;
   4475		spin_unlock(&inode->i_lock);
   4476	}
   4477	inode_unlock(inode);
   4478	if (!error)
   4479		fsnotify_link(dir, inode, new_dentry);
   4480	return error;
   4481}
   4482EXPORT_SYMBOL(vfs_link);
   4483
   4484/*
   4485 * Hardlinks are often used in delicate situations.  We avoid
   4486 * security-related surprises by not following symlinks on the
   4487 * newname.  --KAB
   4488 *
   4489 * We don't follow them on the oldname either to be compatible
   4490 * with linux 2.0, and to avoid hard-linking to directories
   4491 * and other special files.  --ADM
   4492 */
   4493int do_linkat(int olddfd, struct filename *old, int newdfd,
   4494	      struct filename *new, int flags)
   4495{
   4496	struct user_namespace *mnt_userns;
   4497	struct dentry *new_dentry;
   4498	struct path old_path, new_path;
   4499	struct inode *delegated_inode = NULL;
   4500	int how = 0;
   4501	int error;
   4502
   4503	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) {
   4504		error = -EINVAL;
   4505		goto out_putnames;
   4506	}
   4507	/*
   4508	 * To use null names we require CAP_DAC_READ_SEARCH
   4509	 * This ensures that not everyone will be able to create
   4510	 * handlink using the passed filedescriptor.
   4511	 */
   4512	if (flags & AT_EMPTY_PATH && !capable(CAP_DAC_READ_SEARCH)) {
   4513		error = -ENOENT;
   4514		goto out_putnames;
   4515	}
   4516
   4517	if (flags & AT_SYMLINK_FOLLOW)
   4518		how |= LOOKUP_FOLLOW;
   4519retry:
   4520	error = filename_lookup(olddfd, old, how, &old_path, NULL);
   4521	if (error)
   4522		goto out_putnames;
   4523
   4524	new_dentry = filename_create(newdfd, new, &new_path,
   4525					(how & LOOKUP_REVAL));
   4526	error = PTR_ERR(new_dentry);
   4527	if (IS_ERR(new_dentry))
   4528		goto out_putpath;
   4529
   4530	error = -EXDEV;
   4531	if (old_path.mnt != new_path.mnt)
   4532		goto out_dput;
   4533	mnt_userns = mnt_user_ns(new_path.mnt);
   4534	error = may_linkat(mnt_userns, &old_path);
   4535	if (unlikely(error))
   4536		goto out_dput;
   4537	error = security_path_link(old_path.dentry, &new_path, new_dentry);
   4538	if (error)
   4539		goto out_dput;
   4540	error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode,
   4541			 new_dentry, &delegated_inode);
   4542out_dput:
   4543	done_path_create(&new_path, new_dentry);
   4544	if (delegated_inode) {
   4545		error = break_deleg_wait(&delegated_inode);
   4546		if (!error) {
   4547			path_put(&old_path);
   4548			goto retry;
   4549		}
   4550	}
   4551	if (retry_estale(error, how)) {
   4552		path_put(&old_path);
   4553		how |= LOOKUP_REVAL;
   4554		goto retry;
   4555	}
   4556out_putpath:
   4557	path_put(&old_path);
   4558out_putnames:
   4559	putname(old);
   4560	putname(new);
   4561
   4562	return error;
   4563}
   4564
/*
 * linkat(2): copy in both names and let do_linkat() do the work.  The
 * old name goes through getname_uflags() with @flags, the new name
 * through plain getname().  Flag validation happens in do_linkat().
 */
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, int, flags)
{
	return do_linkat(olddfd, getname_uflags(oldname, flags),
		newdfd, getname(newname), flags);
}
   4571
/*
 * link(2): same as linkat(2) with AT_FDCWD for both directory fds and no
 * flags.
 */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
	return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0);
}
   4576
   4577/**
   4578 * vfs_rename - rename a filesystem object
   4579 * @rd:		pointer to &struct renamedata info
   4580 *
   4581 * The caller must hold multiple mutexes--see lock_rename()).
   4582 *
   4583 * If vfs_rename discovers a delegation in need of breaking at either
   4584 * the source or destination, it will return -EWOULDBLOCK and return a
   4585 * reference to the inode in delegated_inode.  The caller should then
   4586 * break the delegation and retry.  Because breaking a delegation may
   4587 * take a long time, the caller should drop all locks before doing
   4588 * so.
   4589 *
   4590 * Alternatively, a caller may pass NULL for delegated_inode.  This may
   4591 * be appropriate for callers that expect the underlying filesystem not
   4592 * to be NFS exported.
   4593 *
   4594 * The worst of all namespace operations - renaming directory. "Perverted"
   4595 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
   4596 * Problems:
   4597 *
   4598 *	a) we can get into loop creation.
   4599 *	b) race potential - two innocent renames can create a loop together.
   4600 *	   That's where 4.4 screws up. Current fix: serialization on
   4601 *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
   4602 *	   story.
   4603 *	c) we have to lock _four_ objects - parents and victim (if it exists),
   4604 *	   and source (if it is not a directory).
   4605 *	   And that - after we got ->i_mutex on parents (until then we don't know
   4606 *	   whether the target exists).  Solution: try to be smart with locking
   4607 *	   order for inodes.  We rely on the fact that tree topology may change
   4608 *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
   4609 *	   move will be locked.  Thus we can rank directories by the tree
   4610 *	   (ancestors first) and rank all non-directories after them.
   4611 *	   That works since everybody except rename does "lock parent, lookup,
   4612 *	   lock child" and rename is under ->s_vfs_rename_mutex.
   4613 *	   HOWEVER, it relies on the assumption that any object with ->lookup()
   4614 *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
   4615 *	   we'd better make sure that there's no link(2) for them.
   4616 *	d) conversion from fhandle to dentry may come in the wrong moment - when
   4617 *	   we are removing the target. Solution: we will have to grab ->i_mutex
   4618 *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
   4619 *	   ->i_mutex on parents, which works but leads to some truly excessive
   4620 *	   locking].
   4621 */
   4622int vfs_rename(struct renamedata *rd)
   4623{
   4624	int error;
   4625	struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
   4626	struct dentry *old_dentry = rd->old_dentry;
   4627	struct dentry *new_dentry = rd->new_dentry;
   4628	struct inode **delegated_inode = rd->delegated_inode;
   4629	unsigned int flags = rd->flags;
   4630	bool is_dir = d_is_dir(old_dentry);
   4631	struct inode *source = old_dentry->d_inode;
   4632	struct inode *target = new_dentry->d_inode;
   4633	bool new_is_dir = false;
   4634	unsigned max_links = new_dir->i_sb->s_max_links;
   4635	struct name_snapshot old_name;
   4636
   4637	if (source == target)
   4638		return 0;
   4639
   4640	error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir);
   4641	if (error)
   4642		return error;
   4643
   4644	if (!target) {
   4645		error = may_create(rd->new_mnt_userns, new_dir, new_dentry);
   4646	} else {
   4647		new_is_dir = d_is_dir(new_dentry);
   4648
   4649		if (!(flags & RENAME_EXCHANGE))
   4650			error = may_delete(rd->new_mnt_userns, new_dir,
   4651					   new_dentry, is_dir);
   4652		else
   4653			error = may_delete(rd->new_mnt_userns, new_dir,
   4654					   new_dentry, new_is_dir);
   4655	}
   4656	if (error)
   4657		return error;
   4658
   4659	if (!old_dir->i_op->rename)
   4660		return -EPERM;
   4661
   4662	/*
   4663	 * If we are going to change the parent - check write permissions,
   4664	 * we'll need to flip '..'.
   4665	 */
   4666	if (new_dir != old_dir) {
   4667		if (is_dir) {
   4668			error = inode_permission(rd->old_mnt_userns, source,
   4669						 MAY_WRITE);
   4670			if (error)
   4671				return error;
   4672		}
   4673		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
   4674			error = inode_permission(rd->new_mnt_userns, target,
   4675						 MAY_WRITE);
   4676			if (error)
   4677				return error;
   4678		}
   4679	}
   4680
   4681	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
   4682				      flags);
   4683	if (error)
   4684		return error;
   4685
   4686	take_dentry_name_snapshot(&old_name, old_dentry);
   4687	dget(new_dentry);
   4688	if (!is_dir || (flags & RENAME_EXCHANGE))
   4689		lock_two_nondirectories(source, target);
   4690	else if (target)
   4691		inode_lock(target);
   4692
   4693	error = -EPERM;
   4694	if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
   4695		goto out;
   4696
   4697	error = -EBUSY;
   4698	if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
   4699		goto out;
   4700
   4701	if (max_links && new_dir != old_dir) {
   4702		error = -EMLINK;
   4703		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
   4704			goto out;
   4705		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
   4706		    old_dir->i_nlink >= max_links)
   4707			goto out;
   4708	}
   4709	if (!is_dir) {
   4710		error = try_break_deleg(source, delegated_inode);
   4711		if (error)
   4712			goto out;
   4713	}
   4714	if (target && !new_is_dir) {
   4715		error = try_break_deleg(target, delegated_inode);
   4716		if (error)
   4717			goto out;
   4718	}
   4719	error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry,
   4720				      new_dir, new_dentry, flags);
   4721	if (error)
   4722		goto out;
   4723
   4724	if (!(flags & RENAME_EXCHANGE) && target) {
   4725		if (is_dir) {
   4726			shrink_dcache_parent(new_dentry);
   4727			target->i_flags |= S_DEAD;
   4728		}
   4729		dont_mount(new_dentry);
   4730		detach_mounts(new_dentry);
   4731	}
   4732	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
   4733		if (!(flags & RENAME_EXCHANGE))
   4734			d_move(old_dentry, new_dentry);
   4735		else
   4736			d_exchange(old_dentry, new_dentry);
   4737	}
   4738out:
   4739	if (!is_dir || (flags & RENAME_EXCHANGE))
   4740		unlock_two_nondirectories(source, target);
   4741	else if (target)
   4742		inode_unlock(target);
   4743	dput(new_dentry);
   4744	if (!error) {
   4745		fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
   4746			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
   4747		if (flags & RENAME_EXCHANGE) {
   4748			fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
   4749				      new_is_dir, NULL, new_dentry);
   4750		}
   4751	}
   4752	release_dentry_name_snapshot(&old_name);
   4753
   4754	return error;
   4755}
   4756EXPORT_SYMBOL(vfs_rename);
   4757
   4758int do_renameat2(int olddfd, struct filename *from, int newdfd,
   4759		 struct filename *to, unsigned int flags)
   4760{
   4761	struct renamedata rd;
   4762	struct dentry *old_dentry, *new_dentry;
   4763	struct dentry *trap;
   4764	struct path old_path, new_path;
   4765	struct qstr old_last, new_last;
   4766	int old_type, new_type;
   4767	struct inode *delegated_inode = NULL;
   4768	unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
   4769	bool should_retry = false;
   4770	int error = -EINVAL;
   4771
   4772	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
   4773		goto put_names;
   4774
   4775	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
   4776	    (flags & RENAME_EXCHANGE))
   4777		goto put_names;
   4778
   4779	if (flags & RENAME_EXCHANGE)
   4780		target_flags = 0;
   4781
   4782retry:
   4783	error = filename_parentat(olddfd, from, lookup_flags, &old_path,
   4784				  &old_last, &old_type);
   4785	if (error)
   4786		goto put_names;
   4787
   4788	error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
   4789				  &new_type);
   4790	if (error)
   4791		goto exit1;
   4792
   4793	error = -EXDEV;
   4794	if (old_path.mnt != new_path.mnt)
   4795		goto exit2;
   4796
   4797	error = -EBUSY;
   4798	if (old_type != LAST_NORM)
   4799		goto exit2;
   4800
   4801	if (flags & RENAME_NOREPLACE)
   4802		error = -EEXIST;
   4803	if (new_type != LAST_NORM)
   4804		goto exit2;
   4805
   4806	error = mnt_want_write(old_path.mnt);
   4807	if (error)
   4808		goto exit2;
   4809
   4810retry_deleg:
   4811	trap = lock_rename(new_path.dentry, old_path.dentry);
   4812
   4813	old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
   4814	error = PTR_ERR(old_dentry);
   4815	if (IS_ERR(old_dentry))
   4816		goto exit3;
   4817	/* source must exist */
   4818	error = -ENOENT;
   4819	if (d_is_negative(old_dentry))
   4820		goto exit4;
   4821	new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
   4822	error = PTR_ERR(new_dentry);
   4823	if (IS_ERR(new_dentry))
   4824		goto exit4;
   4825	error = -EEXIST;
   4826	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
   4827		goto exit5;
   4828	if (flags & RENAME_EXCHANGE) {
   4829		error = -ENOENT;
   4830		if (d_is_negative(new_dentry))
   4831			goto exit5;
   4832
   4833		if (!d_is_dir(new_dentry)) {
   4834			error = -ENOTDIR;
   4835			if (new_last.name[new_last.len])
   4836				goto exit5;
   4837		}
   4838	}
   4839	/* unless the source is a directory trailing slashes give -ENOTDIR */
   4840	if (!d_is_dir(old_dentry)) {
   4841		error = -ENOTDIR;
   4842		if (old_last.name[old_last.len])
   4843			goto exit5;
   4844		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
   4845			goto exit5;
   4846	}
   4847	/* source should not be ancestor of target */
   4848	error = -EINVAL;
   4849	if (old_dentry == trap)
   4850		goto exit5;
   4851	/* target should not be an ancestor of source */
   4852	if (!(flags & RENAME_EXCHANGE))
   4853		error = -ENOTEMPTY;
   4854	if (new_dentry == trap)
   4855		goto exit5;
   4856
   4857	error = security_path_rename(&old_path, old_dentry,
   4858				     &new_path, new_dentry, flags);
   4859	if (error)
   4860		goto exit5;
   4861
   4862	rd.old_dir	   = old_path.dentry->d_inode;
   4863	rd.old_dentry	   = old_dentry;
   4864	rd.old_mnt_userns  = mnt_user_ns(old_path.mnt);
   4865	rd.new_dir	   = new_path.dentry->d_inode;
   4866	rd.new_dentry	   = new_dentry;
   4867	rd.new_mnt_userns  = mnt_user_ns(new_path.mnt);
   4868	rd.delegated_inode = &delegated_inode;
   4869	rd.flags	   = flags;
   4870	error = vfs_rename(&rd);
   4871exit5:
   4872	dput(new_dentry);
   4873exit4:
   4874	dput(old_dentry);
   4875exit3:
   4876	unlock_rename(new_path.dentry, old_path.dentry);
   4877	if (delegated_inode) {
   4878		error = break_deleg_wait(&delegated_inode);
   4879		if (!error)
   4880			goto retry_deleg;
   4881	}
   4882	mnt_drop_write(old_path.mnt);
   4883exit2:
   4884	if (retry_estale(error, lookup_flags))
   4885		should_retry = true;
   4886	path_put(&new_path);
   4887exit1:
   4888	path_put(&old_path);
   4889	if (should_retry) {
   4890		should_retry = false;
   4891		lookup_flags |= LOOKUP_REVAL;
   4892		goto retry;
   4893	}
   4894put_names:
   4895	putname(from);
   4896	putname(to);
   4897	return error;
   4898}
   4899
   4900SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
   4901		int, newdfd, const char __user *, newname, unsigned int, flags)
   4902{
   4903	return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
   4904				flags);
   4905}
   4906
   4907SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
   4908		int, newdfd, const char __user *, newname)
   4909{
   4910	return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
   4911				0);
   4912}
   4913
   4914SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
   4915{
   4916	return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
   4917				getname(newname), 0);
   4918}
   4919
   4920int readlink_copy(char __user *buffer, int buflen, const char *link)
   4921{
   4922	int len = PTR_ERR(link);
   4923	if (IS_ERR(link))
   4924		goto out;
   4925
   4926	len = strlen(link);
   4927	if (len > (unsigned) buflen)
   4928		len = buflen;
   4929	if (copy_to_user(buffer, link, len))
   4930		len = -EFAULT;
   4931out:
   4932	return len;
   4933}
   4934
   4935/**
   4936 * vfs_readlink - copy symlink body into userspace buffer
   4937 * @dentry: dentry on which to get symbolic link
   4938 * @buffer: user memory pointer
   4939 * @buflen: size of buffer
   4940 *
   4941 * Does not touch atime.  That's up to the caller if necessary
   4942 *
   4943 * Does not call security hook.
   4944 */
   4945int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
   4946{
   4947	struct inode *inode = d_inode(dentry);
   4948	DEFINE_DELAYED_CALL(done);
   4949	const char *link;
   4950	int res;
   4951
   4952	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
   4953		if (unlikely(inode->i_op->readlink))
   4954			return inode->i_op->readlink(dentry, buffer, buflen);
   4955
   4956		if (!d_is_symlink(dentry))
   4957			return -EINVAL;
   4958
   4959		spin_lock(&inode->i_lock);
   4960		inode->i_opflags |= IOP_DEFAULT_READLINK;
   4961		spin_unlock(&inode->i_lock);
   4962	}
   4963
   4964	link = READ_ONCE(inode->i_link);
   4965	if (!link) {
   4966		link = inode->i_op->get_link(dentry, inode, &done);
   4967		if (IS_ERR(link))
   4968			return PTR_ERR(link);
   4969	}
   4970	res = readlink_copy(buffer, buflen, link);
   4971	do_delayed_call(&done);
   4972	return res;
   4973}
   4974EXPORT_SYMBOL(vfs_readlink);
   4975
   4976/**
   4977 * vfs_get_link - get symlink body
   4978 * @dentry: dentry on which to get symbolic link
   4979 * @done: caller needs to free returned data with this
   4980 *
   4981 * Calls security hook and i_op->get_link() on the supplied inode.
   4982 *
   4983 * It does not touch atime.  That's up to the caller if necessary.
   4984 *
   4985 * Does not work on "special" symlinks like /proc/$$/fd/N
   4986 */
   4987const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
   4988{
   4989	const char *res = ERR_PTR(-EINVAL);
   4990	struct inode *inode = d_inode(dentry);
   4991
   4992	if (d_is_symlink(dentry)) {
   4993		res = ERR_PTR(security_inode_readlink(dentry));
   4994		if (!res)
   4995			res = inode->i_op->get_link(dentry, inode, done);
   4996	}
   4997	return res;
   4998}
   4999EXPORT_SYMBOL(vfs_get_link);
   5000
   5001/* get the link contents into pagecache */
   5002const char *page_get_link(struct dentry *dentry, struct inode *inode,
   5003			  struct delayed_call *callback)
   5004{
   5005	char *kaddr;
   5006	struct page *page;
   5007	struct address_space *mapping = inode->i_mapping;
   5008
   5009	if (!dentry) {
   5010		page = find_get_page(mapping, 0);
   5011		if (!page)
   5012			return ERR_PTR(-ECHILD);
   5013		if (!PageUptodate(page)) {
   5014			put_page(page);
   5015			return ERR_PTR(-ECHILD);
   5016		}
   5017	} else {
   5018		page = read_mapping_page(mapping, 0, NULL);
   5019		if (IS_ERR(page))
   5020			return (char*)page;
   5021	}
   5022	set_delayed_call(callback, page_put_link, page);
   5023	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
   5024	kaddr = page_address(page);
   5025	nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
   5026	return kaddr;
   5027}
   5028
   5029EXPORT_SYMBOL(page_get_link);
   5030
   5031void page_put_link(void *arg)
   5032{
   5033	put_page(arg);
   5034}
   5035EXPORT_SYMBOL(page_put_link);
   5036
   5037int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
   5038{
   5039	DEFINE_DELAYED_CALL(done);
   5040	int res = readlink_copy(buffer, buflen,
   5041				page_get_link(dentry, d_inode(dentry),
   5042					      &done));
   5043	do_delayed_call(&done);
   5044	return res;
   5045}
   5046EXPORT_SYMBOL(page_readlink);
   5047
   5048int page_symlink(struct inode *inode, const char *symname, int len)
   5049{
   5050	struct address_space *mapping = inode->i_mapping;
   5051	const struct address_space_operations *aops = mapping->a_ops;
   5052	bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
   5053	struct page *page;
   5054	void *fsdata;
   5055	int err;
   5056	unsigned int flags;
   5057
   5058retry:
   5059	if (nofs)
   5060		flags = memalloc_nofs_save();
   5061	err = aops->write_begin(NULL, mapping, 0, len-1, &page, &fsdata);
   5062	if (nofs)
   5063		memalloc_nofs_restore(flags);
   5064	if (err)
   5065		goto fail;
   5066
   5067	memcpy(page_address(page), symname, len-1);
   5068
   5069	err = aops->write_end(NULL, mapping, 0, len-1, len-1,
   5070							page, fsdata);
   5071	if (err < 0)
   5072		goto fail;
   5073	if (err < len-1)
   5074		goto retry;
   5075
   5076	mark_inode_dirty(inode);
   5077	return 0;
   5078fail:
   5079	return err;
   5080}
   5081EXPORT_SYMBOL(page_symlink);
   5082
   5083const struct inode_operations page_symlink_inode_operations = {
   5084	.get_link	= page_get_link,
   5085};
   5086EXPORT_SYMBOL(page_symlink_inode_operations);