fanotify_user.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
fanotify_user.c (49186B)
      1// SPDX-License-Identifier: GPL-2.0
      2#include <linux/fanotify.h>
      3#include <linux/fcntl.h>
      4#include <linux/fdtable.h>
      5#include <linux/file.h>
      6#include <linux/fs.h>
      7#include <linux/anon_inodes.h>
      8#include <linux/fsnotify_backend.h>
      9#include <linux/init.h>
     10#include <linux/mount.h>
     11#include <linux/namei.h>
     12#include <linux/poll.h>
     13#include <linux/security.h>
     14#include <linux/syscalls.h>
     15#include <linux/slab.h>
     16#include <linux/types.h>
     17#include <linux/uaccess.h>
     18#include <linux/compat.h>
     19#include <linux/sched/signal.h>
     20#include <linux/memcontrol.h>
     21#include <linux/statfs.h>
     22#include <linux/exportfs.h>
     23
     24#include <asm/ioctls.h>
     25
     26#include "../../mount.h"
     27#include "../fdinfo.h"
     28#include "fanotify.h"
     29
     30#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
     31#define FANOTIFY_OLD_DEFAULT_MAX_MARKS	8192
     32#define FANOTIFY_DEFAULT_MAX_GROUPS	128
     33#define FANOTIFY_DEFAULT_FEE_POOL_SIZE	32
     34
     35/*
     36 * Legacy fanotify marks limits (8192) is per group and we introduced a tunable
     37 * limit of marks per user, similar to inotify.  Effectively, the legacy limit
     38 * of fanotify marks per user is <max marks per group> * <max groups per user>.
     39 * This default limit (1M) also happens to match the increased limit of inotify
     40 * max_user_watches since v5.10.
     41 */
     42#define FANOTIFY_DEFAULT_MAX_USER_MARKS	\
     43	(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)
     44
     45/*
     46 * Most of the memory cost of adding an inode mark is pinning the marked inode.
     47 * The size of the filesystem inode struct is not uniform across filesystems,
     48 * so double the size of a VFS inode is used as a conservative approximation.
     49 */
     50#define INODE_MARK_COST	(2 * sizeof(struct inode))
     51
     52/* configurable via /proc/sys/fs/fanotify/ */
     53static int fanotify_max_queued_events __read_mostly;
     54
     55#ifdef CONFIG_SYSCTL
     56
     57#include <linux/sysctl.h>
     58
     59static long ft_zero = 0;
     60static long ft_int_max = INT_MAX;
     61
     62static struct ctl_table fanotify_table[] = {
     63	{
     64		.procname	= "max_user_groups",
     65		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
     66		.maxlen		= sizeof(long),
     67		.mode		= 0644,
     68		.proc_handler	= proc_doulongvec_minmax,
     69		.extra1		= &ft_zero,
     70		.extra2		= &ft_int_max,
     71	},
     72	{
     73		.procname	= "max_user_marks",
     74		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
     75		.maxlen		= sizeof(long),
     76		.mode		= 0644,
     77		.proc_handler	= proc_doulongvec_minmax,
     78		.extra1		= &ft_zero,
     79		.extra2		= &ft_int_max,
     80	},
     81	{
     82		.procname	= "max_queued_events",
     83		.data		= &fanotify_max_queued_events,
     84		.maxlen		= sizeof(int),
     85		.mode		= 0644,
     86		.proc_handler	= proc_dointvec_minmax,
     87		.extra1		= SYSCTL_ZERO
     88	},
     89	{ }
     90};
     91
     92static void __init fanotify_sysctls_init(void)
     93{
     94	register_sysctl("fs/fanotify", fanotify_table);
     95}
     96#else
     97#define fanotify_sysctls_init() do { } while (0)
     98#endif /* CONFIG_SYSCTL */
     99
    100/*
    101 * All flags that may be specified in parameter event_f_flags of fanotify_init.
    102 *
    103 * Internal and external open flags are stored together in field f_flags of
    104 * struct file. Only external open flags shall be allowed in event_f_flags.
    105 * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
    106 * excluded.
    107 */
    108#define	FANOTIFY_INIT_ALL_EVENT_F_BITS				( \
    109		O_ACCMODE	| O_APPEND	| O_NONBLOCK	| \
    110		__O_SYNC	| O_DSYNC	| O_CLOEXEC     | \
    111		O_LARGEFILE	| O_NOATIME	)
    112
    113extern const struct fsnotify_ops fanotify_fsnotify_ops;
    114
    115struct kmem_cache *fanotify_mark_cache __read_mostly;
    116struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
    117struct kmem_cache *fanotify_path_event_cachep __read_mostly;
    118struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
    119
    120#define FANOTIFY_EVENT_ALIGN 4
    121#define FANOTIFY_FID_INFO_HDR_LEN \
    122	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
    123#define FANOTIFY_PIDFD_INFO_HDR_LEN \
    124	sizeof(struct fanotify_event_info_pidfd)
    125#define FANOTIFY_ERROR_INFO_LEN \
    126	(sizeof(struct fanotify_event_info_error))
    127
    128static int fanotify_fid_info_len(int fh_len, int name_len)
    129{
    130	int info_len = fh_len;
    131
    132	if (name_len)
    133		info_len += name_len + 1;
    134
    135	return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
    136		       FANOTIFY_EVENT_ALIGN);
    137}
    138
    139/* FAN_RENAME may have one or two dir+name info records */
    140static int fanotify_dir_name_info_len(struct fanotify_event *event)
    141{
    142	struct fanotify_info *info = fanotify_event_info(event);
    143	int dir_fh_len = fanotify_event_dir_fh_len(event);
    144	int dir2_fh_len = fanotify_event_dir2_fh_len(event);
    145	int info_len = 0;
    146
    147	if (dir_fh_len)
    148		info_len += fanotify_fid_info_len(dir_fh_len,
    149						  info->name_len);
    150	if (dir2_fh_len)
    151		info_len += fanotify_fid_info_len(dir2_fh_len,
    152						  info->name2_len);
    153
    154	return info_len;
    155}
    156
    157static size_t fanotify_event_len(unsigned int info_mode,
    158				 struct fanotify_event *event)
    159{
    160	size_t event_len = FAN_EVENT_METADATA_LEN;
    161	int fh_len;
    162	int dot_len = 0;
    163
    164	if (!info_mode)
    165		return event_len;
    166
    167	if (fanotify_is_error_event(event->mask))
    168		event_len += FANOTIFY_ERROR_INFO_LEN;
    169
    170	if (fanotify_event_has_any_dir_fh(event)) {
    171		event_len += fanotify_dir_name_info_len(event);
    172	} else if ((info_mode & FAN_REPORT_NAME) &&
    173		   (event->mask & FAN_ONDIR)) {
    174		/*
    175		 * With group flag FAN_REPORT_NAME, if name was not recorded in
    176		 * event on a directory, we will report the name ".".
    177		 */
    178		dot_len = 1;
    179	}
    180
    181	if (info_mode & FAN_REPORT_PIDFD)
    182		event_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
    183
    184	if (fanotify_event_has_object_fh(event)) {
    185		fh_len = fanotify_event_object_fh_len(event);
    186		event_len += fanotify_fid_info_len(fh_len, dot_len);
    187	}
    188
    189	return event_len;
    190}
    191
    192/*
    193 * Remove an hashed event from merge hash table.
    194 */
    195static void fanotify_unhash_event(struct fsnotify_group *group,
    196				  struct fanotify_event *event)
    197{
    198	assert_spin_locked(&group->notification_lock);
    199
    200	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
    201		 group, event, fanotify_event_hash_bucket(group, event));
    202
    203	if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
    204		return;
    205
    206	hlist_del_init(&event->merge_list);
    207}
    208
    209/*
    210 * Get an fanotify notification event if one exists and is small
    211 * enough to fit in "count". Return an error pointer if the count
    212 * is not large enough. When permission event is dequeued, its state is
    213 * updated accordingly.
    214 */
    215static struct fanotify_event *get_one_event(struct fsnotify_group *group,
    216					    size_t count)
    217{
    218	size_t event_size;
    219	struct fanotify_event *event = NULL;
    220	struct fsnotify_event *fsn_event;
    221	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
    222
    223	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
    224
    225	spin_lock(&group->notification_lock);
    226	fsn_event = fsnotify_peek_first_event(group);
    227	if (!fsn_event)
    228		goto out;
    229
    230	event = FANOTIFY_E(fsn_event);
    231	event_size = fanotify_event_len(info_mode, event);
    232
    233	if (event_size > count) {
    234		event = ERR_PTR(-EINVAL);
    235		goto out;
    236	}
    237
    238	/*
    239	 * Held the notification_lock the whole time, so this is the
    240	 * same event we peeked above.
    241	 */
    242	fsnotify_remove_first_event(group);
    243	if (fanotify_is_perm_event(event->mask))
    244		FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
    245	if (fanotify_is_hashed_event(event->mask))
    246		fanotify_unhash_event(group, event);
    247out:
    248	spin_unlock(&group->notification_lock);
    249	return event;
    250}
    251
    252static int create_fd(struct fsnotify_group *group, struct path *path,
    253		     struct file **file)
    254{
    255	int client_fd;
    256	struct file *new_file;
    257
    258	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
    259	if (client_fd < 0)
    260		return client_fd;
    261
    262	/*
    263	 * we need a new file handle for the userspace program so it can read even if it was
    264	 * originally opened O_WRONLY.
    265	 */
    266	new_file = dentry_open(path,
    267			       group->fanotify_data.f_flags | __FMODE_NONOTIFY,
    268			       current_cred());
    269	if (IS_ERR(new_file)) {
    270		/*
    271		 * we still send an event even if we can't open the file.  this
    272		 * can happen when say tasks are gone and we try to open their
    273		 * /proc files or we try to open a WRONLY file like in sysfs
    274		 * we just send the errno to userspace since there isn't much
    275		 * else we can do.
    276		 */
    277		put_unused_fd(client_fd);
    278		client_fd = PTR_ERR(new_file);
    279	} else {
    280		*file = new_file;
    281	}
    282
    283	return client_fd;
    284}
    285
    286/*
    287 * Finish processing of permission event by setting it to ANSWERED state and
    288 * drop group->notification_lock.
    289 */
    290static void finish_permission_event(struct fsnotify_group *group,
    291				    struct fanotify_perm_event *event,
    292				    unsigned int response)
    293				    __releases(&group->notification_lock)
    294{
    295	bool destroy = false;
    296
    297	assert_spin_locked(&group->notification_lock);
    298	event->response = response;
    299	if (event->state == FAN_EVENT_CANCELED)
    300		destroy = true;
    301	else
    302		event->state = FAN_EVENT_ANSWERED;
    303	spin_unlock(&group->notification_lock);
    304	if (destroy)
    305		fsnotify_destroy_event(group, &event->fae.fse);
    306}
    307
    308static int process_access_response(struct fsnotify_group *group,
    309				   struct fanotify_response *response_struct)
    310{
    311	struct fanotify_perm_event *event;
    312	int fd = response_struct->fd;
    313	int response = response_struct->response;
    314
    315	pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
    316		 fd, response);
    317	/*
    318	 * make sure the response is valid, if invalid we do nothing and either
    319	 * userspace can send a valid response or we will clean it up after the
    320	 * timeout
    321	 */
    322	switch (response & ~FAN_AUDIT) {
    323	case FAN_ALLOW:
    324	case FAN_DENY:
    325		break;
    326	default:
    327		return -EINVAL;
    328	}
    329
    330	if (fd < 0)
    331		return -EINVAL;
    332
    333	if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
    334		return -EINVAL;
    335
    336	spin_lock(&group->notification_lock);
    337	list_for_each_entry(event, &group->fanotify_data.access_list,
    338			    fae.fse.list) {
    339		if (event->fd != fd)
    340			continue;
    341
    342		list_del_init(&event->fae.fse.list);
    343		finish_permission_event(group, event, response);
    344		wake_up(&group->fanotify_data.access_waitq);
    345		return 0;
    346	}
    347	spin_unlock(&group->notification_lock);
    348
    349	return -ENOENT;
    350}
    351
    352static size_t copy_error_info_to_user(struct fanotify_event *event,
    353				      char __user *buf, int count)
    354{
    355	struct fanotify_event_info_error info = { };
    356	struct fanotify_error_event *fee = FANOTIFY_EE(event);
    357
    358	info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
    359	info.hdr.len = FANOTIFY_ERROR_INFO_LEN;
    360
    361	if (WARN_ON(count < info.hdr.len))
    362		return -EFAULT;
    363
    364	info.error = fee->error;
    365	info.error_count = fee->err_count;
    366
    367	if (copy_to_user(buf, &info, sizeof(info)))
    368		return -EFAULT;
    369
    370	return info.hdr.len;
    371}
    372
    373static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
    374				 int info_type, const char *name,
    375				 size_t name_len,
    376				 char __user *buf, size_t count)
    377{
    378	struct fanotify_event_info_fid info = { };
    379	struct file_handle handle = { };
    380	unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
    381	size_t fh_len = fh ? fh->len : 0;
    382	size_t info_len = fanotify_fid_info_len(fh_len, name_len);
    383	size_t len = info_len;
    384
    385	pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
    386		 __func__, fh_len, name_len, info_len, count);
    387
    388	if (WARN_ON_ONCE(len < sizeof(info) || len > count))
    389		return -EFAULT;
    390
    391	/*
    392	 * Copy event info fid header followed by variable sized file handle
    393	 * and optionally followed by variable sized filename.
    394	 */
    395	switch (info_type) {
    396	case FAN_EVENT_INFO_TYPE_FID:
    397	case FAN_EVENT_INFO_TYPE_DFID:
    398		if (WARN_ON_ONCE(name_len))
    399			return -EFAULT;
    400		break;
    401	case FAN_EVENT_INFO_TYPE_DFID_NAME:
    402	case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME:
    403	case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME:
    404		if (WARN_ON_ONCE(!name || !name_len))
    405			return -EFAULT;
    406		break;
    407	default:
    408		return -EFAULT;
    409	}
    410
    411	info.hdr.info_type = info_type;
    412	info.hdr.len = len;
    413	info.fsid = *fsid;
    414	if (copy_to_user(buf, &info, sizeof(info)))
    415		return -EFAULT;
    416
    417	buf += sizeof(info);
    418	len -= sizeof(info);
    419	if (WARN_ON_ONCE(len < sizeof(handle)))
    420		return -EFAULT;
    421
    422	handle.handle_type = fh->type;
    423	handle.handle_bytes = fh_len;
    424
    425	/* Mangle handle_type for bad file_handle */
    426	if (!fh_len)
    427		handle.handle_type = FILEID_INVALID;
    428
    429	if (copy_to_user(buf, &handle, sizeof(handle)))
    430		return -EFAULT;
    431
    432	buf += sizeof(handle);
    433	len -= sizeof(handle);
    434	if (WARN_ON_ONCE(len < fh_len))
    435		return -EFAULT;
    436
    437	/*
    438	 * For an inline fh and inline file name, copy through stack to exclude
    439	 * the copy from usercopy hardening protections.
    440	 */
    441	fh_buf = fanotify_fh_buf(fh);
    442	if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
    443		memcpy(bounce, fh_buf, fh_len);
    444		fh_buf = bounce;
    445	}
    446	if (copy_to_user(buf, fh_buf, fh_len))
    447		return -EFAULT;
    448
    449	buf += fh_len;
    450	len -= fh_len;
    451
    452	if (name_len) {
    453		/* Copy the filename with terminating null */
    454		name_len++;
    455		if (WARN_ON_ONCE(len < name_len))
    456			return -EFAULT;
    457
    458		if (copy_to_user(buf, name, name_len))
    459			return -EFAULT;
    460
    461		buf += name_len;
    462		len -= name_len;
    463	}
    464
    465	/* Pad with 0's */
    466	WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
    467	if (len > 0 && clear_user(buf, len))
    468		return -EFAULT;
    469
    470	return info_len;
    471}
    472
    473static int copy_pidfd_info_to_user(int pidfd,
    474				   char __user *buf,
    475				   size_t count)
    476{
    477	struct fanotify_event_info_pidfd info = { };
    478	size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
    479
    480	if (WARN_ON_ONCE(info_len > count))
    481		return -EFAULT;
    482
    483	info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
    484	info.hdr.len = info_len;
    485	info.pidfd = pidfd;
    486
    487	if (copy_to_user(buf, &info, info_len))
    488		return -EFAULT;
    489
    490	return info_len;
    491}
    492
    493static int copy_info_records_to_user(struct fanotify_event *event,
    494				     struct fanotify_info *info,
    495				     unsigned int info_mode, int pidfd,
    496				     char __user *buf, size_t count)
    497{
    498	int ret, total_bytes = 0, info_type = 0;
    499	unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
    500	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
    501
    502	/*
    503	 * Event info records order is as follows:
    504	 * 1. dir fid + name
    505	 * 2. (optional) new dir fid + new name
    506	 * 3. (optional) child fid
    507	 */
    508	if (fanotify_event_has_dir_fh(event)) {
    509		info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
    510					     FAN_EVENT_INFO_TYPE_DFID;
    511
    512		/* FAN_RENAME uses special info types */
    513		if (event->mask & FAN_RENAME)
    514			info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME;
    515
    516		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
    517					    fanotify_info_dir_fh(info),
    518					    info_type,
    519					    fanotify_info_name(info),
    520					    info->name_len, buf, count);
    521		if (ret < 0)
    522			return ret;
    523
    524		buf += ret;
    525		count -= ret;
    526		total_bytes += ret;
    527	}
    528
    529	/* New dir fid+name may be reported in addition to old dir fid+name */
    530	if (fanotify_event_has_dir2_fh(event)) {
    531		info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME;
    532		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
    533					    fanotify_info_dir2_fh(info),
    534					    info_type,
    535					    fanotify_info_name2(info),
    536					    info->name2_len, buf, count);
    537		if (ret < 0)
    538			return ret;
    539
    540		buf += ret;
    541		count -= ret;
    542		total_bytes += ret;
    543	}
    544
    545	if (fanotify_event_has_object_fh(event)) {
    546		const char *dot = NULL;
    547		int dot_len = 0;
    548
    549		if (fid_mode == FAN_REPORT_FID || info_type) {
    550			/*
    551			 * With only group flag FAN_REPORT_FID only type FID is
    552			 * reported. Second info record type is always FID.
    553			 */
    554			info_type = FAN_EVENT_INFO_TYPE_FID;
    555		} else if ((fid_mode & FAN_REPORT_NAME) &&
    556			   (event->mask & FAN_ONDIR)) {
    557			/*
    558			 * With group flag FAN_REPORT_NAME, if name was not
    559			 * recorded in an event on a directory, report the name
    560			 * "." with info type DFID_NAME.
    561			 */
    562			info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
    563			dot = ".";
    564			dot_len = 1;
    565		} else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
    566			   (event->mask & FAN_ONDIR)) {
    567			/*
    568			 * With group flag FAN_REPORT_DIR_FID, a single info
    569			 * record has type DFID for directory entry modification
    570			 * event and for event on a directory.
    571			 */
    572			info_type = FAN_EVENT_INFO_TYPE_DFID;
    573		} else {
    574			/*
    575			 * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
    576			 * a single info record has type FID for event on a
    577			 * non-directory, when there is no directory to report.
    578			 * For example, on FAN_DELETE_SELF event.
    579			 */
    580			info_type = FAN_EVENT_INFO_TYPE_FID;
    581		}
    582
    583		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
    584					    fanotify_event_object_fh(event),
    585					    info_type, dot, dot_len,
    586					    buf, count);
    587		if (ret < 0)
    588			return ret;
    589
    590		buf += ret;
    591		count -= ret;
    592		total_bytes += ret;
    593	}
    594
    595	if (pidfd_mode) {
    596		ret = copy_pidfd_info_to_user(pidfd, buf, count);
    597		if (ret < 0)
    598			return ret;
    599
    600		buf += ret;
    601		count -= ret;
    602		total_bytes += ret;
    603	}
    604
    605	if (fanotify_is_error_event(event->mask)) {
    606		ret = copy_error_info_to_user(event, buf, count);
    607		if (ret < 0)
    608			return ret;
    609		buf += ret;
    610		count -= ret;
    611		total_bytes += ret;
    612	}
    613
    614	return total_bytes;
    615}
    616
    617static ssize_t copy_event_to_user(struct fsnotify_group *group,
    618				  struct fanotify_event *event,
    619				  char __user *buf, size_t count)
    620{
    621	struct fanotify_event_metadata metadata;
    622	struct path *path = fanotify_event_path(event);
    623	struct fanotify_info *info = fanotify_event_info(event);
    624	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
    625	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
    626	struct file *f = NULL;
    627	int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD;
    628
    629	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
    630
    631	metadata.event_len = fanotify_event_len(info_mode, event);
    632	metadata.metadata_len = FAN_EVENT_METADATA_LEN;
    633	metadata.vers = FANOTIFY_METADATA_VERSION;
    634	metadata.reserved = 0;
    635	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
    636	metadata.pid = pid_vnr(event->pid);
    637	/*
    638	 * For an unprivileged listener, event->pid can be used to identify the
    639	 * events generated by the listener process itself, without disclosing
    640	 * the pids of other processes.
    641	 */
    642	if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
    643	    task_tgid(current) != event->pid)
    644		metadata.pid = 0;
    645
    646	/*
    647	 * For now, fid mode is required for an unprivileged listener and
    648	 * fid mode does not report fd in events.  Keep this check anyway
    649	 * for safety in case fid mode requirement is relaxed in the future
    650	 * to allow unprivileged listener to get events with no fd and no fid.
    651	 */
    652	if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
    653	    path && path->mnt && path->dentry) {
    654		fd = create_fd(group, path, &f);
    655		if (fd < 0)
    656			return fd;
    657	}
    658	metadata.fd = fd;
    659
    660	if (pidfd_mode) {
    661		/*
    662		 * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
    663		 * exclusion is ever lifted. At the time of incoporating pidfd
    664		 * support within fanotify, the pidfd API only supported the
    665		 * creation of pidfds for thread-group leaders.
    666		 */
    667		WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));
    668
    669		/*
    670		 * The PIDTYPE_TGID check for an event->pid is performed
    671		 * preemptively in an attempt to catch out cases where the event
    672		 * listener reads events after the event generating process has
    673		 * already terminated. Report FAN_NOPIDFD to the event listener
    674		 * in those cases, with all other pidfd creation errors being
    675		 * reported as FAN_EPIDFD.
    676		 */
    677		if (metadata.pid == 0 ||
    678		    !pid_has_task(event->pid, PIDTYPE_TGID)) {
    679			pidfd = FAN_NOPIDFD;
    680		} else {
    681			pidfd = pidfd_create(event->pid, 0);
    682			if (pidfd < 0)
    683				pidfd = FAN_EPIDFD;
    684		}
    685	}
    686
    687	ret = -EFAULT;
    688	/*
    689	 * Sanity check copy size in case get_one_event() and
    690	 * event_len sizes ever get out of sync.
    691	 */
    692	if (WARN_ON_ONCE(metadata.event_len > count))
    693		goto out_close_fd;
    694
    695	if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
    696		goto out_close_fd;
    697
    698	buf += FAN_EVENT_METADATA_LEN;
    699	count -= FAN_EVENT_METADATA_LEN;
    700
    701	if (fanotify_is_perm_event(event->mask))
    702		FANOTIFY_PERM(event)->fd = fd;
    703
    704	if (info_mode) {
    705		ret = copy_info_records_to_user(event, info, info_mode, pidfd,
    706						buf, count);
    707		if (ret < 0)
    708			goto out_close_fd;
    709	}
    710
    711	if (f)
    712		fd_install(fd, f);
    713
    714	return metadata.event_len;
    715
    716out_close_fd:
    717	if (fd != FAN_NOFD) {
    718		put_unused_fd(fd);
    719		fput(f);
    720	}
    721
    722	if (pidfd >= 0)
    723		close_fd(pidfd);
    724
    725	return ret;
    726}
    727
    728/* fanotify userspace file descriptor functions */
    729static __poll_t fanotify_poll(struct file *file, poll_table *wait)
    730{
    731	struct fsnotify_group *group = file->private_data;
    732	__poll_t ret = 0;
    733
    734	poll_wait(file, &group->notification_waitq, wait);
    735	spin_lock(&group->notification_lock);
    736	if (!fsnotify_notify_queue_is_empty(group))
    737		ret = EPOLLIN | EPOLLRDNORM;
    738	spin_unlock(&group->notification_lock);
    739
    740	return ret;
    741}
    742
    743static ssize_t fanotify_read(struct file *file, char __user *buf,
    744			     size_t count, loff_t *pos)
    745{
    746	struct fsnotify_group *group;
    747	struct fanotify_event *event;
    748	char __user *start;
    749	int ret;
    750	DEFINE_WAIT_FUNC(wait, woken_wake_function);
    751
    752	start = buf;
    753	group = file->private_data;
    754
    755	pr_debug("%s: group=%p\n", __func__, group);
    756
    757	add_wait_queue(&group->notification_waitq, &wait);
    758	while (1) {
    759		/*
    760		 * User can supply arbitrarily large buffer. Avoid softlockups
    761		 * in case there are lots of available events.
    762		 */
    763		cond_resched();
    764		event = get_one_event(group, count);
    765		if (IS_ERR(event)) {
    766			ret = PTR_ERR(event);
    767			break;
    768		}
    769
    770		if (!event) {
    771			ret = -EAGAIN;
    772			if (file->f_flags & O_NONBLOCK)
    773				break;
    774
    775			ret = -ERESTARTSYS;
    776			if (signal_pending(current))
    777				break;
    778
    779			if (start != buf)
    780				break;
    781
    782			wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
    783			continue;
    784		}
    785
    786		ret = copy_event_to_user(group, event, buf, count);
    787		if (unlikely(ret == -EOPENSTALE)) {
    788			/*
    789			 * We cannot report events with stale fd so drop it.
    790			 * Setting ret to 0 will continue the event loop and
    791			 * do the right thing if there are no more events to
    792			 * read (i.e. return bytes read, -EAGAIN or wait).
    793			 */
    794			ret = 0;
    795		}
    796
    797		/*
    798		 * Permission events get queued to wait for response.  Other
    799		 * events can be destroyed now.
    800		 */
    801		if (!fanotify_is_perm_event(event->mask)) {
    802			fsnotify_destroy_event(group, &event->fse);
    803		} else {
    804			if (ret <= 0) {
    805				spin_lock(&group->notification_lock);
    806				finish_permission_event(group,
    807					FANOTIFY_PERM(event), FAN_DENY);
    808				wake_up(&group->fanotify_data.access_waitq);
    809			} else {
    810				spin_lock(&group->notification_lock);
    811				list_add_tail(&event->fse.list,
    812					&group->fanotify_data.access_list);
    813				spin_unlock(&group->notification_lock);
    814			}
    815		}
    816		if (ret < 0)
    817			break;
    818		buf += ret;
    819		count -= ret;
    820	}
    821	remove_wait_queue(&group->notification_waitq, &wait);
    822
    823	if (start != buf && ret != -EFAULT)
    824		ret = buf - start;
    825	return ret;
    826}
    827
    828static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
    829{
    830	struct fanotify_response response = { .fd = -1, .response = -1 };
    831	struct fsnotify_group *group;
    832	int ret;
    833
    834	if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
    835		return -EINVAL;
    836
    837	group = file->private_data;
    838
    839	if (count < sizeof(response))
    840		return -EINVAL;
    841
    842	count = sizeof(response);
    843
    844	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
    845
    846	if (copy_from_user(&response, buf, count))
    847		return -EFAULT;
    848
    849	ret = process_access_response(group, &response);
    850	if (ret < 0)
    851		count = ret;
    852
    853	return count;
    854}
    855
    856static int fanotify_release(struct inode *ignored, struct file *file)
    857{
    858	struct fsnotify_group *group = file->private_data;
    859	struct fsnotify_event *fsn_event;
    860
    861	/*
    862	 * Stop new events from arriving in the notification queue. since
    863	 * userspace cannot use fanotify fd anymore, no event can enter or
    864	 * leave access_list by now either.
    865	 */
    866	fsnotify_group_stop_queueing(group);
    867
    868	/*
    869	 * Process all permission events on access_list and notification queue
    870	 * and simulate reply from userspace.
    871	 */
    872	spin_lock(&group->notification_lock);
    873	while (!list_empty(&group->fanotify_data.access_list)) {
    874		struct fanotify_perm_event *event;
    875
    876		event = list_first_entry(&group->fanotify_data.access_list,
    877				struct fanotify_perm_event, fae.fse.list);
    878		list_del_init(&event->fae.fse.list);
    879		finish_permission_event(group, event, FAN_ALLOW);
    880		spin_lock(&group->notification_lock);
    881	}
    882
    883	/*
    884	 * Destroy all non-permission events. For permission events just
    885	 * dequeue them and set the response. They will be freed once the
    886	 * response is consumed and fanotify_get_response() returns.
    887	 */
    888	while ((fsn_event = fsnotify_remove_first_event(group))) {
    889		struct fanotify_event *event = FANOTIFY_E(fsn_event);
    890
    891		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
    892			spin_unlock(&group->notification_lock);
    893			fsnotify_destroy_event(group, fsn_event);
    894		} else {
    895			finish_permission_event(group, FANOTIFY_PERM(event),
    896						FAN_ALLOW);
    897		}
    898		spin_lock(&group->notification_lock);
    899	}
    900	spin_unlock(&group->notification_lock);
    901
    902	/* Response for all permission events it set, wakeup waiters */
    903	wake_up(&group->fanotify_data.access_waitq);
    904
    905	/* matches the fanotify_init->fsnotify_alloc_group */
    906	fsnotify_destroy_group(group);
    907
    908	return 0;
    909}
    910
    911static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
    912{
    913	struct fsnotify_group *group;
    914	struct fsnotify_event *fsn_event;
    915	void __user *p;
    916	int ret = -ENOTTY;
    917	size_t send_len = 0;
    918
    919	group = file->private_data;
    920
    921	p = (void __user *) arg;
    922
    923	switch (cmd) {
    924	case FIONREAD:
    925		spin_lock(&group->notification_lock);
    926		list_for_each_entry(fsn_event, &group->notification_list, list)
    927			send_len += FAN_EVENT_METADATA_LEN;
    928		spin_unlock(&group->notification_lock);
    929		ret = put_user(send_len, (int __user *) p);
    930		break;
    931	}
    932
    933	return ret;
    934}
    935
    936static const struct file_operations fanotify_fops = {
    937	.show_fdinfo	= fanotify_show_fdinfo,
    938	.poll		= fanotify_poll,
    939	.read		= fanotify_read,
    940	.write		= fanotify_write,
    941	.fasync		= NULL,
    942	.release	= fanotify_release,
    943	.unlocked_ioctl	= fanotify_ioctl,
    944	.compat_ioctl	= compat_ptr_ioctl,
    945	.llseek		= noop_llseek,
    946};
    947
    948static int fanotify_find_path(int dfd, const char __user *filename,
    949			      struct path *path, unsigned int flags, __u64 mask,
    950			      unsigned int obj_type)
    951{
    952	int ret;
    953
    954	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
    955		 dfd, filename, flags);
    956
    957	if (filename == NULL) {
    958		struct fd f = fdget(dfd);
    959
    960		ret = -EBADF;
    961		if (!f.file)
    962			goto out;
    963
    964		ret = -ENOTDIR;
    965		if ((flags & FAN_MARK_ONLYDIR) &&
    966		    !(S_ISDIR(file_inode(f.file)->i_mode))) {
    967			fdput(f);
    968			goto out;
    969		}
    970
    971		*path = f.file->f_path;
    972		path_get(path);
    973		fdput(f);
    974	} else {
    975		unsigned int lookup_flags = 0;
    976
    977		if (!(flags & FAN_MARK_DONT_FOLLOW))
    978			lookup_flags |= LOOKUP_FOLLOW;
    979		if (flags & FAN_MARK_ONLYDIR)
    980			lookup_flags |= LOOKUP_DIRECTORY;
    981
    982		ret = user_path_at(dfd, filename, lookup_flags, path);
    983		if (ret)
    984			goto out;
    985	}
    986
    987	/* you can only watch an inode if you have read permissions on it */
    988	ret = path_permission(path, MAY_READ);
    989	if (ret) {
    990		path_put(path);
    991		goto out;
    992	}
    993
    994	ret = security_path_notify(path, mask, obj_type);
    995	if (ret)
    996		path_put(path);
    997
    998out:
    999	return ret;
   1000}
   1001
   1002static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
   1003					    __u32 mask, unsigned int flags,
   1004					    __u32 umask, int *destroy)
   1005{
   1006	__u32 oldmask, newmask;
   1007
   1008	/* umask bits cannot be removed by user */
   1009	mask &= ~umask;
   1010	spin_lock(&fsn_mark->lock);
   1011	oldmask = fsnotify_calc_mask(fsn_mark);
   1012	if (!(flags & FAN_MARK_IGNORED_MASK)) {
   1013		fsn_mark->mask &= ~mask;
   1014	} else {
   1015		fsn_mark->ignored_mask &= ~mask;
   1016	}
   1017	newmask = fsnotify_calc_mask(fsn_mark);
   1018	/*
   1019	 * We need to keep the mark around even if remaining mask cannot
   1020	 * result in any events (e.g. mask == FAN_ONDIR) to support incremenal
   1021	 * changes to the mask.
   1022	 * Destroy mark when only umask bits remain.
   1023	 */
   1024	*destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
   1025	spin_unlock(&fsn_mark->lock);
   1026
   1027	return oldmask & ~newmask;
   1028}
   1029
   1030static int fanotify_remove_mark(struct fsnotify_group *group,
   1031				fsnotify_connp_t *connp, __u32 mask,
   1032				unsigned int flags, __u32 umask)
   1033{
   1034	struct fsnotify_mark *fsn_mark = NULL;
   1035	__u32 removed;
   1036	int destroy_mark;
   1037
   1038	fsnotify_group_lock(group);
   1039	fsn_mark = fsnotify_find_mark(connp, group);
   1040	if (!fsn_mark) {
   1041		fsnotify_group_unlock(group);
   1042		return -ENOENT;
   1043	}
   1044
   1045	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
   1046						 umask, &destroy_mark);
   1047	if (removed & fsnotify_conn_mask(fsn_mark->connector))
   1048		fsnotify_recalc_mask(fsn_mark->connector);
   1049	if (destroy_mark)
   1050		fsnotify_detach_mark(fsn_mark);
   1051	fsnotify_group_unlock(group);
   1052	if (destroy_mark)
   1053		fsnotify_free_mark(fsn_mark);
   1054
   1055	/* matches the fsnotify_find_mark() */
   1056	fsnotify_put_mark(fsn_mark);
   1057	return 0;
   1058}
   1059
   1060static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
   1061					 struct vfsmount *mnt, __u32 mask,
   1062					 unsigned int flags, __u32 umask)
   1063{
   1064	return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
   1065				    mask, flags, umask);
   1066}
   1067
   1068static int fanotify_remove_sb_mark(struct fsnotify_group *group,
   1069				   struct super_block *sb, __u32 mask,
   1070				   unsigned int flags, __u32 umask)
   1071{
   1072	return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask,
   1073				    flags, umask);
   1074}
   1075
   1076static int fanotify_remove_inode_mark(struct fsnotify_group *group,
   1077				      struct inode *inode, __u32 mask,
   1078				      unsigned int flags, __u32 umask)
   1079{
   1080	return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask,
   1081				    flags, umask);
   1082}
   1083
   1084static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
   1085				       unsigned int fan_flags)
   1086{
   1087	bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE);
   1088	bool recalc = false;
   1089
   1090	/*
   1091	 * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
   1092	 * the removal of the FS_MODIFY bit in calculated mask if it was set
   1093	 * because of an ignored mask that is now going to survive FS_MODIFY.
   1094	 */
   1095	if ((fan_flags & FAN_MARK_IGNORED_MASK) &&
   1096	    (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
   1097	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
   1098		fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
   1099		if (!(fsn_mark->mask & FS_MODIFY))
   1100			recalc = true;
   1101	}
   1102
   1103	if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE ||
   1104	    want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
   1105		return recalc;
   1106
   1107	/*
   1108	 * NO_IREF may be removed from a mark, but not added.
   1109	 * When removed, fsnotify_recalc_mask() will take the inode ref.
   1110	 */
   1111	WARN_ON_ONCE(!want_iref);
   1112	fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF;
   1113
   1114	return true;
   1115}
   1116
   1117static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
   1118				      __u32 mask, unsigned int fan_flags)
   1119{
   1120	bool recalc;
   1121
   1122	spin_lock(&fsn_mark->lock);
   1123	if (!(fan_flags & FAN_MARK_IGNORED_MASK))
   1124		fsn_mark->mask |= mask;
   1125	else
   1126		fsn_mark->ignored_mask |= mask;
   1127
   1128	recalc = fsnotify_calc_mask(fsn_mark) &
   1129		~fsnotify_conn_mask(fsn_mark->connector);
   1130
   1131	recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags);
   1132	spin_unlock(&fsn_mark->lock);
   1133
   1134	return recalc;
   1135}
   1136
   1137static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
   1138						   fsnotify_connp_t *connp,
   1139						   unsigned int obj_type,
   1140						   unsigned int fan_flags,
   1141						   __kernel_fsid_t *fsid)
   1142{
   1143	struct ucounts *ucounts = group->fanotify_data.ucounts;
   1144	struct fsnotify_mark *mark;
   1145	int ret;
   1146
   1147	/*
   1148	 * Enforce per user marks limits per user in all containing user ns.
   1149	 * A group with FAN_UNLIMITED_MARKS does not contribute to mark count
   1150	 * in the limited groups account.
   1151	 */
   1152	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
   1153	    !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
   1154		return ERR_PTR(-ENOSPC);
   1155
   1156	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
   1157	if (!mark) {
   1158		ret = -ENOMEM;
   1159		goto out_dec_ucounts;
   1160	}
   1161
   1162	fsnotify_init_mark(mark, group);
   1163	if (fan_flags & FAN_MARK_EVICTABLE)
   1164		mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
   1165
   1166	ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid);
   1167	if (ret) {
   1168		fsnotify_put_mark(mark);
   1169		goto out_dec_ucounts;
   1170	}
   1171
   1172	return mark;
   1173
   1174out_dec_ucounts:
   1175	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
   1176		dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
   1177	return ERR_PTR(ret);
   1178}
   1179
   1180static int fanotify_group_init_error_pool(struct fsnotify_group *group)
   1181{
   1182	if (mempool_initialized(&group->fanotify_data.error_events_pool))
   1183		return 0;
   1184
   1185	return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool,
   1186					 FANOTIFY_DEFAULT_FEE_POOL_SIZE,
   1187					 sizeof(struct fanotify_error_event));
   1188}
   1189
   1190static int fanotify_add_mark(struct fsnotify_group *group,
   1191			     fsnotify_connp_t *connp, unsigned int obj_type,
   1192			     __u32 mask, unsigned int fan_flags,
   1193			     __kernel_fsid_t *fsid)
   1194{
   1195	struct fsnotify_mark *fsn_mark;
   1196	bool recalc;
   1197	int ret = 0;
   1198
   1199	fsnotify_group_lock(group);
   1200	fsn_mark = fsnotify_find_mark(connp, group);
   1201	if (!fsn_mark) {
   1202		fsn_mark = fanotify_add_new_mark(group, connp, obj_type,
   1203						 fan_flags, fsid);
   1204		if (IS_ERR(fsn_mark)) {
   1205			fsnotify_group_unlock(group);
   1206			return PTR_ERR(fsn_mark);
   1207		}
   1208	}
   1209
   1210	/*
   1211	 * Non evictable mark cannot be downgraded to evictable mark.
   1212	 */
   1213	if (fan_flags & FAN_MARK_EVICTABLE &&
   1214	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) {
   1215		ret = -EEXIST;
   1216		goto out;
   1217	}
   1218
   1219	/*
   1220	 * Error events are pre-allocated per group, only if strictly
   1221	 * needed (i.e. FAN_FS_ERROR was requested).
   1222	 */
   1223	if (!(fan_flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) {
   1224		ret = fanotify_group_init_error_pool(group);
   1225		if (ret)
   1226			goto out;
   1227	}
   1228
   1229	recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags);
   1230	if (recalc)
   1231		fsnotify_recalc_mask(fsn_mark->connector);
   1232
   1233out:
   1234	fsnotify_group_unlock(group);
   1235
   1236	fsnotify_put_mark(fsn_mark);
   1237	return ret;
   1238}
   1239
   1240static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
   1241				      struct vfsmount *mnt, __u32 mask,
   1242				      unsigned int flags, __kernel_fsid_t *fsid)
   1243{
   1244	return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
   1245				 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
   1246}
   1247
   1248static int fanotify_add_sb_mark(struct fsnotify_group *group,
   1249				struct super_block *sb, __u32 mask,
   1250				unsigned int flags, __kernel_fsid_t *fsid)
   1251{
   1252	return fanotify_add_mark(group, &sb->s_fsnotify_marks,
   1253				 FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
   1254}
   1255
   1256static int fanotify_add_inode_mark(struct fsnotify_group *group,
   1257				   struct inode *inode, __u32 mask,
   1258				   unsigned int flags, __kernel_fsid_t *fsid)
   1259{
   1260	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
   1261
   1262	/*
   1263	 * If some other task has this inode open for write we should not add
   1264	 * an ignored mark, unless that ignored mark is supposed to survive
   1265	 * modification changes anyway.
   1266	 */
   1267	if ((flags & FAN_MARK_IGNORED_MASK) &&
   1268	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
   1269	    inode_is_open_for_write(inode))
   1270		return 0;
   1271
   1272	return fanotify_add_mark(group, &inode->i_fsnotify_marks,
   1273				 FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
   1274}
   1275
   1276static struct fsnotify_event *fanotify_alloc_overflow_event(void)
   1277{
   1278	struct fanotify_event *oevent;
   1279
   1280	oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
   1281	if (!oevent)
   1282		return NULL;
   1283
   1284	fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
   1285	oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;
   1286
   1287	return &oevent->fse;
   1288}
   1289
   1290static struct hlist_head *fanotify_alloc_merge_hash(void)
   1291{
   1292	struct hlist_head *hash;
   1293
   1294	hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
   1295		       GFP_KERNEL_ACCOUNT);
   1296	if (!hash)
   1297		return NULL;
   1298
   1299	__hash_init(hash, FANOTIFY_HTABLE_SIZE);
   1300
   1301	return hash;
   1302}
   1303
   1304/* fanotify syscalls */
/*
 * fanotify_init - create an fanotify group and return a file descriptor for it
 * @flags:         FAN_* init flags (class, reporting mode, limits, fd flags)
 * @event_f_flags: open flags stored in the group as f_flags (presumably
 *                 applied to the fds reported with events - confirm against
 *                 the event read path)
 *
 * The argument checks run before any allocation and return directly.  Once
 * the group exists, every failure goes through out_destroy_group so that
 * the group is destroyed.
 *
 * Returns the new file descriptor, or a negative errno.
 */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
	struct fsnotify_group *group;
	int f_flags, fd;
	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
	unsigned int class = flags & FANOTIFY_CLASS_BITS;
	unsigned int internal_flags = 0;

	pr_debug("%s: flags=%x event_f_flags=%x\n",
		 __func__, flags, event_f_flags);

	if (!capable(CAP_SYS_ADMIN)) {
		/*
		 * An unprivileged user can setup an fanotify group with
		 * limited functionality - an unprivileged group is limited to
		 * notification events with file handles and it cannot use
		 * unlimited queue/marks.
		 */
		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
			return -EPERM;

		/*
		 * Setting the internal flag FANOTIFY_UNPRIV on the group
		 * prevents setting mount/filesystem marks on this group and
		 * prevents reporting pid and open fd in events.
		 */
		internal_flags |= FANOTIFY_UNPRIV;
	}

	/* FAN_ENABLE_AUDIT is only an accepted flag with CONFIG_AUDITSYSCALL */
#ifdef CONFIG_AUDITSYSCALL
	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
#else
	if (flags & ~FANOTIFY_INIT_FLAGS)
#endif
		return -EINVAL;

	/*
	 * A pidfd can only be returned for a thread-group leader; thus
	 * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
	 * exclusive.
	 */
	if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
		return -EINVAL;

	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
		return -EINVAL;

	/* Only the three regular access modes are accepted */
	switch (event_f_flags & O_ACCMODE) {
	case O_RDONLY:
	case O_RDWR:
	case O_WRONLY:
		break;
	default:
		return -EINVAL;
	}

	/* Reporting fids is only available to FAN_CLASS_NOTIF groups */
	if (fid_mode && class != FAN_CLASS_NOTIF)
		return -EINVAL;

	/*
	 * Child name is reported with parent fid so requires dir fid.
	 * We can report both child fid and dir fid with or without name.
	 */
	if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
		return -EINVAL;

	/*
	 * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID
	 * and is used as an indication to report both dir and child fid on all
	 * dirent events.
	 */
	if ((fid_mode & FAN_REPORT_TARGET_FID) &&
	    (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
		return -EINVAL;

	/* Flags for the fanotify fd itself (not for event fds) */
	f_flags = O_RDWR | __FMODE_NONOTIFY;
	if (flags & FAN_CLOEXEC)
		f_flags |= O_CLOEXEC;
	if (flags & FAN_NONBLOCK)
		f_flags |= O_NONBLOCK;

	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
	group = fsnotify_alloc_group(&fanotify_fsnotify_ops,
				     FSNOTIFY_GROUP_USER | FSNOTIFY_GROUP_NOFS);
	if (IS_ERR(group)) {
		return PTR_ERR(group);
	}

	/* Enforce groups limits per user in all containing user ns */
	group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
						  current_euid(),
						  UCOUNT_FANOTIFY_GROUPS);
	if (!group->fanotify_data.ucounts) {
		fd = -EMFILE;
		goto out_destroy_group;
	}

	/* The group remembers the user flags plus FANOTIFY_UNPRIV if set above */
	group->fanotify_data.flags = flags | internal_flags;
	group->memcg = get_mem_cgroup_from_mm(current->mm);

	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
	if (!group->fanotify_data.merge_hash) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	group->overflow_event = fanotify_alloc_overflow_event();
	if (unlikely(!group->overflow_event)) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	if (force_o_largefile())
		event_f_flags |= O_LARGEFILE;
	group->fanotify_data.f_flags = event_f_flags;
	init_waitqueue_head(&group->fanotify_data.access_waitq);
	INIT_LIST_HEAD(&group->fanotify_data.access_list);
	/* Map the requested class to the group priority */
	switch (class) {
	case FAN_CLASS_NOTIF:
		group->priority = FS_PRIO_0;
		break;
	case FAN_CLASS_CONTENT:
		group->priority = FS_PRIO_1;
		break;
	case FAN_CLASS_PRE_CONTENT:
		group->priority = FS_PRIO_2;
		break;
	default:
		fd = -EINVAL;
		goto out_destroy_group;
	}

	/*
	 * For the three checks below fd is preset to -EPERM so that a failed
	 * capability check can jump straight to the cleanup path.
	 */
	if (flags & FAN_UNLIMITED_QUEUE) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
		group->max_events = UINT_MAX;
	} else {
		group->max_events = fanotify_max_queued_events;
	}

	if (flags & FAN_UNLIMITED_MARKS) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
	}

	if (flags & FAN_ENABLE_AUDIT) {
		fd = -EPERM;
		if (!capable(CAP_AUDIT_WRITE))
			goto out_destroy_group;
	}

	/* The group becomes the private_data of the new anonymous inode file */
	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
	if (fd < 0)
		goto out_destroy_group;

	return fd;

out_destroy_group:
	fsnotify_destroy_group(group);
	return fd;
}
   1468
   1469static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
   1470{
   1471	__kernel_fsid_t root_fsid;
   1472	int err;
   1473
   1474	/*
   1475	 * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
   1476	 */
   1477	err = vfs_get_fsid(dentry, fsid);
   1478	if (err)
   1479		return err;
   1480
   1481	if (!fsid->val[0] && !fsid->val[1])
   1482		return -ENODEV;
   1483
   1484	/*
   1485	 * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
   1486	 * which uses a different fsid than sb root.
   1487	 */
   1488	err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid);
   1489	if (err)
   1490		return err;
   1491
   1492	if (root_fsid.val[0] != fsid->val[0] ||
   1493	    root_fsid.val[1] != fsid->val[1])
   1494		return -EXDEV;
   1495
   1496	return 0;
   1497}
   1498
   1499/* Check if filesystem can encode a unique fid */
   1500static int fanotify_test_fid(struct dentry *dentry)
   1501{
   1502	/*
   1503	 * We need to make sure that the file system supports at least
   1504	 * encoding a file handle so user can use name_to_handle_at() to
   1505	 * compare fid returned with event to the file handle of watched
   1506	 * objects. However, name_to_handle_at() requires that the
   1507	 * filesystem also supports decoding file handles.
   1508	 */
   1509	if (!dentry->d_sb->s_export_op ||
   1510	    !dentry->d_sb->s_export_op->fh_to_dentry)
   1511		return -EOPNOTSUPP;
   1512
   1513	return 0;
   1514}
   1515
   1516static int fanotify_events_supported(struct fsnotify_group *group,
   1517				     struct path *path, __u64 mask,
   1518				     unsigned int flags)
   1519{
   1520	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
   1521	/* Strict validation of events in non-dir inode mask with v5.17+ APIs */
   1522	bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
   1523				 (mask & FAN_RENAME);
   1524
   1525	/*
   1526	 * Some filesystems such as 'proc' acquire unusual locks when opening
   1527	 * files. For them fanotify permission events have high chances of
   1528	 * deadlocking the system - open done when reporting fanotify event
   1529	 * blocks on this "unusual" lock while another process holding the lock
   1530	 * waits for fanotify permission event to be answered. Just disallow
   1531	 * permission events for such filesystems.
   1532	 */
   1533	if (mask & FANOTIFY_PERM_EVENTS &&
   1534	    path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
   1535		return -EINVAL;
   1536
   1537	/*
   1538	 * We shouldn't have allowed setting dirent events and the directory
   1539	 * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode,
   1540	 * but because we always allowed it, error only when using new APIs.
   1541	 */
   1542	if (strict_dir_events && mark_type == FAN_MARK_INODE &&
   1543	    !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
   1544		return -ENOTDIR;
   1545
   1546	return 0;
   1547}
   1548
/*
 * do_fanotify_mark - common implementation of the fanotify_mark() entry points
 * @fanotify_fd: file descriptor of the fanotify group
 * @flags:       FAN_MARK_* flags: operation (add/remove/flush), mark type
 *               and modifiers
 * @mask:        events (and event flags) to add or remove; only the lower
 *               32 bits may be set
 * @dfd:         directory fd for the path lookup, or the object itself when
 *               @pathname is NULL
 * @pathname:    user pointer to the path of the object to mark, may be NULL
 *
 * Validates the request, then either flushes all marks of the given type
 * from the group, or resolves the target object and adds/removes the mask
 * on its inode, mount or super block mark.
 *
 * Returns 0 on success or a negative errno.
 */
static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
			    int dfd, const char  __user *pathname)
{
	struct inode *inode = NULL;
	struct vfsmount *mnt = NULL;
	struct fsnotify_group *group;
	struct fd f;
	struct path path;
	__kernel_fsid_t __fsid, *fsid = NULL;
	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	bool ignored = flags & FAN_MARK_IGNORED_MASK;
	unsigned int obj_type, fid_mode;
	u32 umask = 0;
	int ret;

	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
		 __func__, fanotify_fd, flags, dfd, pathname, mask);

	/* we only use the lower 32 bits as of right now. */
	if (upper_32_bits(mask))
		return -EINVAL;

	if (flags & ~FANOTIFY_MARK_FLAGS)
		return -EINVAL;

	/* Translate the user visible mark type to the fsnotify object type */
	switch (mark_type) {
	case FAN_MARK_INODE:
		obj_type = FSNOTIFY_OBJ_TYPE_INODE;
		break;
	case FAN_MARK_MOUNT:
		obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
		break;
	case FAN_MARK_FILESYSTEM:
		obj_type = FSNOTIFY_OBJ_TYPE_SB;
		break;
	default:
		return -EINVAL;
	}

	/*
	 * Exactly one of ADD, REMOVE and FLUSH must be given.  ADD and REMOVE
	 * need a non-empty mask; FLUSH accepts no flags besides the mark type.
	 */
	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
	case FAN_MARK_ADD:
	case FAN_MARK_REMOVE:
		if (!mask)
			return -EINVAL;
		break;
	case FAN_MARK_FLUSH:
		if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	/* Permission events are valid only when support is compiled in */
	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
		valid_mask |= FANOTIFY_PERM_EVENTS;

	if (mask & ~valid_mask)
		return -EINVAL;

	/* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */
	if (ignored)
		mask &= ~FANOTIFY_EVENT_FLAGS;

	f = fdget(fanotify_fd);
	if (unlikely(!f.file))
		return -EBADF;

	/* verify that this is indeed an fanotify instance */
	ret = -EINVAL;
	if (unlikely(f.file->f_op != &fanotify_fops))
		goto fput_and_out;
	group = f.file->private_data;

	/*
	 * An unprivileged user is not allowed to setup mount nor filesystem
	 * marks.  This also includes setting up such marks by a group that
	 * was initialized by an unprivileged user.
	 */
	ret = -EPERM;
	if ((!capable(CAP_SYS_ADMIN) ||
	     FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
	    mark_type != FAN_MARK_INODE)
		goto fput_and_out;

	/*
	 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
	 * allowed to set permissions events.
	 */
	ret = -EINVAL;
	if (mask & FANOTIFY_PERM_EVENTS &&
	    group->priority == FS_PRIO_0)
		goto fput_and_out;

	/* FAN_FS_ERROR can only be requested on a filesystem mark */
	if (mask & FAN_FS_ERROR &&
	    mark_type != FAN_MARK_FILESYSTEM)
		goto fput_and_out;

	/*
	 * Evictable is only relevant for inode marks, because only inode object
	 * can be evicted on memory pressure.
	 */
	if (flags & FAN_MARK_EVICTABLE &&
	     mark_type != FAN_MARK_INODE)
		goto fput_and_out;

	/*
	 * Events that do not carry enough information to report
	 * event->fd require a group that supports reporting fid.  Those
	 * events are not supported on a mount mark, because they do not
	 * carry enough information (i.e. path) to be filtered by mount
	 * point.
	 */
	fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
	if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
	    (!fid_mode || mark_type == FAN_MARK_MOUNT))
		goto fput_and_out;

	/*
	 * FAN_RENAME uses special info type records to report the old and
	 * new parent+name.  Reporting only old and new parent id is less
	 * useful and was not implemented.
	 */
	if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
		goto fput_and_out;

	/* Flush needs no path lookup: drop all marks of the requested type */
	if (flags & FAN_MARK_FLUSH) {
		ret = 0;
		if (mark_type == FAN_MARK_MOUNT)
			fsnotify_clear_vfsmount_marks_by_group(group);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			fsnotify_clear_sb_marks_by_group(group);
		else
			fsnotify_clear_inode_marks_by_group(group);
		goto fput_and_out;
	}

	/* On success a reference on path is held until path_put_and_out */
	ret = fanotify_find_path(dfd, pathname, &path, flags,
			(mask & ALL_FSNOTIFY_EVENTS), obj_type);
	if (ret)
		goto fput_and_out;

	if (flags & FAN_MARK_ADD) {
		ret = fanotify_events_supported(group, &path, mask, flags);
		if (ret)
			goto path_put_and_out;
	}

	/* Groups reporting fids need a usable fsid and file handle support */
	if (fid_mode) {
		ret = fanotify_test_fsid(path.dentry, &__fsid);
		if (ret)
			goto path_put_and_out;

		ret = fanotify_test_fid(path.dentry);
		if (ret)
			goto path_put_and_out;

		fsid = &__fsid;
	}

	/* inode held in place by reference to path; group by fget on fd */
	if (mark_type == FAN_MARK_INODE)
		inode = path.dentry->d_inode;
	else
		mnt = path.mnt;

	/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
	if (mnt || !S_ISDIR(inode->i_mode)) {
		mask &= ~FAN_EVENT_ON_CHILD;
		umask = FAN_EVENT_ON_CHILD;
		/*
		 * If group needs to report parent fid, register for getting
		 * events with parent/name info for non-directory.
		 */
		if ((fid_mode & FAN_REPORT_DIR_FID) &&
		    (flags & FAN_MARK_ADD) && !ignored)
			mask |= FAN_EVENT_ON_CHILD;
	}

	/* create/update an inode mark */
	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
	case FAN_MARK_ADD:
		if (mark_type == FAN_MARK_MOUNT)
			ret = fanotify_add_vfsmount_mark(group, mnt, mask,
							 flags, fsid);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
						   flags, fsid);
		else
			ret = fanotify_add_inode_mark(group, inode, mask,
						      flags, fsid);
		break;
	case FAN_MARK_REMOVE:
		if (mark_type == FAN_MARK_MOUNT)
			ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
							    flags, umask);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
						      flags, umask);
		else
			ret = fanotify_remove_inode_mark(group, inode, mask,
							 flags, umask);
		break;
	default:
		ret = -EINVAL;
	}

path_put_and_out:
	path_put(&path);
fput_and_out:
	fdput(f);
	return ret;
}
   1762
#ifndef CONFIG_ARCH_SPLIT_ARG64
/*
 * fanotify_mark() entry point built when CONFIG_ARCH_SPLIT_ARG64 is not set:
 * the 64-bit @mask arrives as a single argument and all five arguments are
 * passed unchanged to do_fanotify_mark().
 */
SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
			      __u64, mask, int, dfd,
			      const char  __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
}
#endif
   1771
#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
/*
 * fanotify_mark() entry point built when CONFIG_ARCH_SPLIT_ARG64 or
 * CONFIG_COMPAT is defined.  @mask is declared through SC_ARG64() and
 * converted back to a __u64 with SC_VAL64() before do_fanotify_mark() is
 * called (presumably it is passed as two 32-bit halves - confirm against
 * the SC_ARG64 definition).
 */
SYSCALL32_DEFINE6(fanotify_mark,
				int, fanotify_fd, unsigned int, flags,
				SC_ARG64(mask), int, dfd,
				const char  __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask),
				dfd, pathname);
}
#endif
   1782
   1783/*
   1784 * fanotify_user_setup - Our initialization function.  Note that we cannot return
   1785 * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
   1786 * must result in panic().
   1787 */
   1788static int __init fanotify_user_setup(void)
   1789{
   1790	struct sysinfo si;
   1791	int max_marks;
   1792
   1793	si_meminfo(&si);
   1794	/*
   1795	 * Allow up to 1% of addressable memory to be accounted for per user
   1796	 * marks limited to the range [8192, 1048576]. mount and sb marks are
   1797	 * a lot cheaper than inode marks, but there is no reason for a user
   1798	 * to have many of those, so calculate by the cost of inode marks.
   1799	 */
   1800	max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
   1801		    INODE_MARK_COST;
   1802	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
   1803				     FANOTIFY_DEFAULT_MAX_USER_MARKS);
   1804
   1805	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
   1806	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
   1807	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 10);
   1808
   1809	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
   1810					 SLAB_PANIC|SLAB_ACCOUNT);
   1811	fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
   1812					       SLAB_PANIC);
   1813	fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
   1814						SLAB_PANIC);
   1815	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
   1816		fanotify_perm_event_cachep =
   1817			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
   1818	}
   1819
   1820	fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
   1821	init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
   1822					FANOTIFY_DEFAULT_MAX_GROUPS;
   1823	init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
   1824	fanotify_sysctls_init();
   1825
   1826	return 0;
   1827}
   1828device_initcall(fanotify_user_setup);