cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

eventpoll.c (65595B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 *  fs/eventpoll.c (Efficient event retrieval implementation)
      4 *  Copyright (C) 2001,...,2009	 Davide Libenzi
      5 *
      6 *  Davide Libenzi <davidel@xmailserver.org>
      7 */
      8
      9#include <linux/init.h>
     10#include <linux/kernel.h>
     11#include <linux/sched/signal.h>
     12#include <linux/fs.h>
     13#include <linux/file.h>
     14#include <linux/signal.h>
     15#include <linux/errno.h>
     16#include <linux/mm.h>
     17#include <linux/slab.h>
     18#include <linux/poll.h>
     19#include <linux/string.h>
     20#include <linux/list.h>
     21#include <linux/hash.h>
     22#include <linux/spinlock.h>
     23#include <linux/syscalls.h>
     24#include <linux/rbtree.h>
     25#include <linux/wait.h>
     26#include <linux/eventpoll.h>
     27#include <linux/mount.h>
     28#include <linux/bitops.h>
     29#include <linux/mutex.h>
     30#include <linux/anon_inodes.h>
     31#include <linux/device.h>
     32#include <linux/uaccess.h>
     33#include <asm/io.h>
     34#include <asm/mman.h>
     35#include <linux/atomic.h>
     36#include <linux/proc_fs.h>
     37#include <linux/seq_file.h>
     38#include <linux/compat.h>
     39#include <linux/rculist.h>
     40#include <net/busy_poll.h>
     41
     42/*
     43 * LOCKING:
      44 * There are three levels of locking required by epoll:
     45 *
     46 * 1) epmutex (mutex)
     47 * 2) ep->mtx (mutex)
     48 * 3) ep->lock (rwlock)
     49 *
     50 * The acquire order is the one listed above, from 1 to 3.
     51 * We need a rwlock (ep->lock) because we manipulate objects
     52 * from inside the poll callback, that might be triggered from
     53 * a wake_up() that in turn might be called from IRQ context.
     54 * So we can't sleep inside the poll callback and hence we need
     55 * a spinlock. During the event transfer loop (from kernel to
      56 * user space) we could end up sleeping due to a copy_to_user(), so
     57 * we need a lock that will allow us to sleep. This lock is a
     58 * mutex (ep->mtx). It is acquired during the event transfer loop,
     59 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
     60 * Then we also need a global mutex to serialize eventpoll_release_file()
     61 * and ep_free().
     62 * This mutex is acquired by ep_free() during the epoll file
     63 * cleanup path and it is also acquired by eventpoll_release_file()
     64 * if a file has been pushed inside an epoll set and it is then
     65 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
     66 * It is also acquired when inserting an epoll fd onto another epoll
     67 * fd. We do this so that we walk the epoll tree and ensure that this
     68 * insertion does not create a cycle of epoll file descriptors, which
     69 * could lead to deadlock. We need a global mutex to prevent two
     70 * simultaneous inserts (A into B and B into A) from racing and
     71 * constructing a cycle without either insert observing that it is
      72 * about to do so.
     73 * It is necessary to acquire multiple "ep->mtx"es at once in the
     74 * case when one epoll fd is added to another. In this case, we
     75 * always acquire the locks in the order of nesting (i.e. after
     76 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
     77 * before e2->mtx). Since we disallow cycles of epoll file
     78 * descriptors, this ensures that the mutexes are well-ordered. In
     79 * order to communicate this nesting to lockdep, when walking a tree
     80 * of epoll file descriptors, we use the current recursion depth as
     81 * the lockdep subkey.
     82 * It is possible to drop the "ep->mtx" and to use the global
     83 * mutex "epmutex" (together with "ep->lock") to have it working,
     84 * but having "ep->mtx" will make the interface more scalable.
     85 * Events that require holding "epmutex" are very rare, while for
     86 * normal operations the epoll private "ep->mtx" will guarantee
     87 * a better scalability.
     88 */
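/*
 * A minimal sketch of the acquire order documented above (illustration
 * only, not code that is built as part of this file), using the same lock
 * primitives that appear later in this file. Taking these locks in any
 * other order would risk deadlock.
 */
#if 0
	mutex_lock(&epmutex);			/* 1) global mutex, rare paths only */
	mutex_lock_nested(&ep->mtx, 0);		/* 2) per-eventpoll mutex */
	write_lock_irq(&ep->lock);		/* 3) innermost rwlock for the lists */

	/* ... manipulate ep->rdllist / ep->ovflist here ... */

	write_unlock_irq(&ep->lock);
	mutex_unlock(&ep->mtx);
	mutex_unlock(&epmutex);
#endif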
     89
     90/* Epoll private bits inside the event mask */
     91#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
     92
     93#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)
     94
     95#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
     96				EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
     97
      98/* Maximum nesting depth allowed inside epoll sets */
     99#define EP_MAX_NESTS 4
    100
    101#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
    102
    103#define EP_UNACTIVE_PTR ((void *) -1L)
    104
    105#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
    106
    107struct epoll_filefd {
    108	struct file *file;
    109	int fd;
    110} __packed;
    111
    112/* Wait structure used by the poll hooks */
    113struct eppoll_entry {
    114	/* List header used to link this structure to the "struct epitem" */
    115	struct eppoll_entry *next;
    116
    117	/* The "base" pointer is set to the container "struct epitem" */
    118	struct epitem *base;
    119
    120	/*
    121	 * Wait queue item that will be linked to the target file wait
    122	 * queue head.
    123	 */
    124	wait_queue_entry_t wait;
    125
     126	/* The wait queue head to which the "wait" wait queue item is linked */
    127	wait_queue_head_t *whead;
    128};
    129
    130/*
    131 * Each file descriptor added to the eventpoll interface will
    132 * have an entry of this type linked to the "rbr" RB tree.
    133 * Avoid increasing the size of this struct, there can be many thousands
    134 * of these on a server and we do not want this to take another cache line.
    135 */
    136struct epitem {
    137	union {
    138		/* RB tree node links this structure to the eventpoll RB tree */
    139		struct rb_node rbn;
    140		/* Used to free the struct epitem */
    141		struct rcu_head rcu;
    142	};
    143
    144	/* List header used to link this structure to the eventpoll ready list */
    145	struct list_head rdllink;
    146
    147	/*
     148	 * Works together with "struct eventpoll"->ovflist in keeping the
     149	 * singly linked chain of items.
    150	 */
    151	struct epitem *next;
    152
    153	/* The file descriptor information this item refers to */
    154	struct epoll_filefd ffd;
    155
    156	/* List containing poll wait queues */
    157	struct eppoll_entry *pwqlist;
    158
    159	/* The "container" of this item */
    160	struct eventpoll *ep;
    161
    162	/* List header used to link this item to the "struct file" items list */
    163	struct hlist_node fllink;
    164
    165	/* wakeup_source used when EPOLLWAKEUP is set */
    166	struct wakeup_source __rcu *ws;
    167
     168	/* The structure that describes the interested events and the source fd */
    169	struct epoll_event event;
    170};
    171
    172/*
    173 * This structure is stored inside the "private_data" member of the file
    174 * structure and represents the main data structure for the eventpoll
    175 * interface.
    176 */
    177struct eventpoll {
    178	/*
    179	 * This mutex is used to ensure that files are not removed
    180	 * while epoll is using them. This is held during the event
    181	 * collection loop, the file cleanup path, the epoll file exit
    182	 * code and the ctl operations.
    183	 */
    184	struct mutex mtx;
    185
    186	/* Wait queue used by sys_epoll_wait() */
    187	wait_queue_head_t wq;
    188
    189	/* Wait queue used by file->poll() */
    190	wait_queue_head_t poll_wait;
    191
    192	/* List of ready file descriptors */
    193	struct list_head rdllist;
    194
    195	/* Lock which protects rdllist and ovflist */
    196	rwlock_t lock;
    197
    198	/* RB tree root used to store monitored fd structs */
    199	struct rb_root_cached rbr;
    200
    201	/*
     202	 * This is a singly linked list that chains all the "struct epitem" whose
     203	 * events were triggered while we were transferring ready events to
     204	 * userspace without holding ->lock.
    205	 */
    206	struct epitem *ovflist;
    207
    208	/* wakeup_source used when ep_scan_ready_list is running */
    209	struct wakeup_source *ws;
    210
    211	/* The user that created the eventpoll descriptor */
    212	struct user_struct *user;
    213
    214	struct file *file;
    215
    216	/* used to optimize loop detection check */
    217	u64 gen;
    218	struct hlist_head refs;
    219
    220#ifdef CONFIG_NET_RX_BUSY_POLL
    221	/* used to track busy poll napi_id */
    222	unsigned int napi_id;
    223#endif
    224
    225#ifdef CONFIG_DEBUG_LOCK_ALLOC
    226	/* tracks wakeup nests for lockdep validation */
    227	u8 nests;
    228#endif
    229};
    230
    231/* Wrapper struct used by poll queueing */
    232struct ep_pqueue {
    233	poll_table pt;
    234	struct epitem *epi;
    235};
    236
    237/*
    238 * Configuration options available inside /proc/sys/fs/epoll/
    239 */
    240/* Maximum number of epoll watched descriptors, per user */
    241static long max_user_watches __read_mostly;
    242
    243/*
    244 * This mutex is used to serialize ep_free() and eventpoll_release_file().
    245 */
    246static DEFINE_MUTEX(epmutex);
    247
    248static u64 loop_check_gen = 0;
    249
    250/* Used to check for epoll file descriptor inclusion loops */
    251static struct eventpoll *inserting_into;
    252
    253/* Slab cache used to allocate "struct epitem" */
    254static struct kmem_cache *epi_cache __read_mostly;
    255
    256/* Slab cache used to allocate "struct eppoll_entry" */
    257static struct kmem_cache *pwq_cache __read_mostly;
    258
    259/*
    260 * List of files with newly added links, where we may need to limit the number
    261 * of emanating paths. Protected by the epmutex.
    262 */
    263struct epitems_head {
    264	struct hlist_head epitems;
    265	struct epitems_head *next;
    266};
    267static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
    268
    269static struct kmem_cache *ephead_cache __read_mostly;
    270
    271static inline void free_ephead(struct epitems_head *head)
    272{
    273	if (head)
    274		kmem_cache_free(ephead_cache, head);
    275}
    276
    277static void list_file(struct file *file)
    278{
    279	struct epitems_head *head;
    280
    281	head = container_of(file->f_ep, struct epitems_head, epitems);
    282	if (!head->next) {
    283		head->next = tfile_check_list;
    284		tfile_check_list = head;
    285	}
    286}
    287
    288static void unlist_file(struct epitems_head *head)
    289{
    290	struct epitems_head *to_free = head;
    291	struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
    292	if (p) {
     293		struct epitem *epi = container_of(p, struct epitem, fllink);
    294		spin_lock(&epi->ffd.file->f_lock);
    295		if (!hlist_empty(&head->epitems))
    296			to_free = NULL;
    297		head->next = NULL;
    298		spin_unlock(&epi->ffd.file->f_lock);
    299	}
    300	free_ephead(to_free);
    301}
    302
    303#ifdef CONFIG_SYSCTL
    304
    305#include <linux/sysctl.h>
    306
    307static long long_zero;
    308static long long_max = LONG_MAX;
    309
    310static struct ctl_table epoll_table[] = {
    311	{
    312		.procname	= "max_user_watches",
    313		.data		= &max_user_watches,
    314		.maxlen		= sizeof(max_user_watches),
    315		.mode		= 0644,
    316		.proc_handler	= proc_doulongvec_minmax,
    317		.extra1		= &long_zero,
    318		.extra2		= &long_max,
    319	},
    320	{ }
    321};
    322
    323static void __init epoll_sysctls_init(void)
    324{
    325	register_sysctl("fs/epoll", epoll_table);
    326}
    327#else
    328#define epoll_sysctls_init() do { } while (0)
    329#endif /* CONFIG_SYSCTL */
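/*
 * Illustration only (a standalone userspace sketch, not part of this file):
 * the "max_user_watches" entry registered above shows up as
 * /proc/sys/fs/epoll/max_user_watches and can simply be read as text.
 */
#if 0
#include <stdio.h>

int main(void)
{
	long watches;
	FILE *f = fopen("/proc/sys/fs/epoll/max_user_watches", "r");

	if (!f || fscanf(f, "%ld", &watches) != 1)
		return 1;
	fclose(f);
	printf("per-user epoll watch limit: %ld\n", watches);
	return 0;
}
#endif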
    330
    331static const struct file_operations eventpoll_fops;
    332
    333static inline int is_file_epoll(struct file *f)
    334{
    335	return f->f_op == &eventpoll_fops;
    336}
    337
    338/* Setup the structure that is used as key for the RB tree */
    339static inline void ep_set_ffd(struct epoll_filefd *ffd,
    340			      struct file *file, int fd)
    341{
    342	ffd->file = file;
    343	ffd->fd = fd;
    344}
    345
    346/* Compare RB tree keys */
    347static inline int ep_cmp_ffd(struct epoll_filefd *p1,
    348			     struct epoll_filefd *p2)
    349{
    350	return (p1->file > p2->file ? +1:
    351	        (p1->file < p2->file ? -1 : p1->fd - p2->fd));
    352}
    353
    354/* Tells us if the item is currently linked */
    355static inline int ep_is_linked(struct epitem *epi)
    356{
    357	return !list_empty(&epi->rdllink);
    358}
    359
    360static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
    361{
    362	return container_of(p, struct eppoll_entry, wait);
    363}
    364
    365/* Get the "struct epitem" from a wait queue pointer */
    366static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
    367{
    368	return container_of(p, struct eppoll_entry, wait)->base;
    369}
    370
    371/**
    372 * ep_events_available - Checks if ready events might be available.
    373 *
    374 * @ep: Pointer to the eventpoll context.
    375 *
     376 * Return: a value different from %zero if ready events are available,
    377 *          or %zero otherwise.
    378 */
    379static inline int ep_events_available(struct eventpoll *ep)
    380{
    381	return !list_empty_careful(&ep->rdllist) ||
    382		READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
    383}
    384
    385#ifdef CONFIG_NET_RX_BUSY_POLL
    386static bool ep_busy_loop_end(void *p, unsigned long start_time)
    387{
    388	struct eventpoll *ep = p;
    389
    390	return ep_events_available(ep) || busy_loop_timeout(start_time);
    391}
    392
    393/*
     394 * Busy poll if globally on, a supporting socket was found and there are
     395 * no events; the busy loop will return on need_resched() or when
     396 * ep_events_available() becomes true.
     397 * We must do our busy polling with irqs enabled.
    398 */
    399static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
    400{
    401	unsigned int napi_id = READ_ONCE(ep->napi_id);
    402
    403	if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
    404		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
    405			       BUSY_POLL_BUDGET);
    406		if (ep_events_available(ep))
    407			return true;
    408		/*
    409		 * Busy poll timed out.  Drop NAPI ID for now, we can add
    410		 * it back in when we have moved a socket with a valid NAPI
    411		 * ID onto the ready list.
    412		 */
    413		ep->napi_id = 0;
    414		return false;
    415	}
    416	return false;
    417}
    418
    419/*
    420 * Set epoll busy poll NAPI ID from sk.
    421 */
    422static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
    423{
    424	struct eventpoll *ep;
    425	unsigned int napi_id;
    426	struct socket *sock;
    427	struct sock *sk;
    428
    429	if (!net_busy_loop_on())
    430		return;
    431
    432	sock = sock_from_file(epi->ffd.file);
    433	if (!sock)
    434		return;
    435
    436	sk = sock->sk;
    437	if (!sk)
    438		return;
    439
    440	napi_id = READ_ONCE(sk->sk_napi_id);
    441	ep = epi->ep;
    442
     443	/*
     444	 * Reject non-NAPI IDs, and there is nothing to do if
     445	 * we already have this ID.
     446	 */
    447	if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
    448		return;
    449
    450	/* record NAPI ID for use in next busy poll */
    451	ep->napi_id = napi_id;
    452}
    453
    454#else
    455
    456static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
    457{
    458	return false;
    459}
    460
    461static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
    462{
    463}
    464
    465#endif /* CONFIG_NET_RX_BUSY_POLL */
    466
    467/*
     468 * As described in commit 0ccf831cb ("lockdep: annotate epoll"),
    469 * the use of wait queues used by epoll is done in a very controlled
    470 * manner. Wake ups can nest inside each other, but are never done
    471 * with the same locking. For example:
    472 *
    473 *   dfd = socket(...);
    474 *   efd1 = epoll_create();
    475 *   efd2 = epoll_create();
    476 *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
    477 *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
    478 *
     479 * When a packet arrives at the device underneath "dfd", the net code will
    480 * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
    481 * callback wakeup entry on that queue, and the wake_up() performed by the
    482 * "dfd" net code will end up in ep_poll_callback(). At this point epoll
    483 * (efd1) notices that it may have some event ready, so it needs to wake up
    484 * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
     485 * that ends up in another wake_up(), after having checked the recursion
     486 * constraints. That is, no more than EP_MAX_POLLWAKE_NESTS levels, to
     487 * avoid stack blasting.
    488 *
    489 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
    490 * this special case of epoll.
    491 */
    492#ifdef CONFIG_DEBUG_LOCK_ALLOC
    493
    494static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
    495{
    496	struct eventpoll *ep_src;
    497	unsigned long flags;
    498	u8 nests = 0;
    499
    500	/*
    501	 * To set the subclass or nesting level for spin_lock_irqsave_nested()
    502	 * it might be natural to create a per-cpu nest count. However, since
    503	 * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
     504	 * schedule() in the -rt kernel, the per-cpu variables are no longer
     505	 * protected. Thus, we are introducing a per-eventpoll nest field.
     506	 * If we are not being called from ep_poll_callback(), epi is NULL and
    507	 * we are at the first level of nesting, 0. Otherwise, we are being
    508	 * called from ep_poll_callback() and if a previous wakeup source is
    509	 * not an epoll file itself, we are at depth 1 since the wakeup source
    510	 * is depth 0. If the wakeup source is a previous epoll file in the
    511	 * wakeup chain then we use its nests value and record ours as
     512	 * nests + 1. The previous epoll file nests value is stable since it's
    513	 * already holding its own poll_wait.lock.
    514	 */
    515	if (epi) {
    516		if ((is_file_epoll(epi->ffd.file))) {
    517			ep_src = epi->ffd.file->private_data;
    518			nests = ep_src->nests;
    519		} else {
    520			nests = 1;
    521		}
    522	}
    523	spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
    524	ep->nests = nests + 1;
    525	wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
    526	ep->nests = 0;
    527	spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
    528}
    529
    530#else
    531
    532static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
    533{
    534	wake_up_poll(&ep->poll_wait, EPOLLIN);
    535}
    536
    537#endif
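/*
 * Illustration only (a standalone userspace sketch, not part of this file;
 * the helper name is made up): the nested wakeup scenario from the comment
 * above spelled out with the real epoll API. efd2 watches efd1, which
 * watches a socket, so readiness on the socket propagates through
 * ep_poll_callback() and ep_poll_safewake() up to waiters on efd2.
 */
#if 0
#include <sys/epoll.h>
#include <sys/socket.h>

static int nested_epoll_example(void)
{
	struct epoll_event ev = { .events = EPOLLIN };
	int dfd, efd1, efd2;

	dfd = socket(AF_INET, SOCK_DGRAM, 0);	/* error handling omitted */
	efd1 = epoll_create1(0);
	efd2 = epoll_create1(0);

	ev.data.fd = dfd;
	epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, &ev);	/* efd1 watches dfd */
	ev.data.fd = efd1;
	epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, &ev);	/* efd2 watches efd1 */

	/* a packet arriving on dfd wakes efd1 and, through it, efd2 */
	return epoll_wait(efd2, &ev, 1, -1);
}
#endif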
    538
    539static void ep_remove_wait_queue(struct eppoll_entry *pwq)
    540{
    541	wait_queue_head_t *whead;
    542
    543	rcu_read_lock();
    544	/*
    545	 * If it is cleared by POLLFREE, it should be rcu-safe.
    546	 * If we read NULL we need a barrier paired with
    547	 * smp_store_release() in ep_poll_callback(), otherwise
    548	 * we rely on whead->lock.
    549	 */
    550	whead = smp_load_acquire(&pwq->whead);
    551	if (whead)
    552		remove_wait_queue(whead, &pwq->wait);
    553	rcu_read_unlock();
    554}
    555
    556/*
    557 * This function unregisters poll callbacks from the associated file
    558 * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
    559 * ep_free).
    560 */
    561static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
    562{
    563	struct eppoll_entry **p = &epi->pwqlist;
    564	struct eppoll_entry *pwq;
    565
    566	while ((pwq = *p) != NULL) {
    567		*p = pwq->next;
    568		ep_remove_wait_queue(pwq);
    569		kmem_cache_free(pwq_cache, pwq);
    570	}
    571}
    572
    573/* call only when ep->mtx is held */
    574static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
    575{
    576	return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
    577}
    578
    579/* call only when ep->mtx is held */
    580static inline void ep_pm_stay_awake(struct epitem *epi)
    581{
    582	struct wakeup_source *ws = ep_wakeup_source(epi);
    583
    584	if (ws)
    585		__pm_stay_awake(ws);
    586}
    587
    588static inline bool ep_has_wakeup_source(struct epitem *epi)
    589{
    590	return rcu_access_pointer(epi->ws) ? true : false;
    591}
    592
    593/* call when ep->mtx cannot be held (ep_poll_callback) */
    594static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
    595{
    596	struct wakeup_source *ws;
    597
    598	rcu_read_lock();
    599	ws = rcu_dereference(epi->ws);
    600	if (ws)
    601		__pm_stay_awake(ws);
    602	rcu_read_unlock();
    603}
    604
    605
    606/*
     607 * ep->mtx needs to be held because we could be hit by
    608 * eventpoll_release_file() and epoll_ctl().
    609 */
    610static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
    611{
    612	/*
    613	 * Steal the ready list, and re-init the original one to the
    614	 * empty list. Also, set ep->ovflist to NULL so that events
     615	 * happening while looping without locks are not lost. We cannot
     616	 * have the poll callback queue directly on ep->rdllist,
    617	 * because we want the "sproc" callback to be able to do it
    618	 * in a lockless way.
    619	 */
    620	lockdep_assert_irqs_enabled();
    621	write_lock_irq(&ep->lock);
    622	list_splice_init(&ep->rdllist, txlist);
    623	WRITE_ONCE(ep->ovflist, NULL);
    624	write_unlock_irq(&ep->lock);
    625}
    626
    627static void ep_done_scan(struct eventpoll *ep,
    628			 struct list_head *txlist)
    629{
    630	struct epitem *epi, *nepi;
    631
    632	write_lock_irq(&ep->lock);
    633	/*
    634	 * During the time we spent inside the "sproc" callback, some
    635	 * other events might have been queued by the poll callback.
    636	 * We re-insert them inside the main ready-list here.
    637	 */
    638	for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
    639	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
    640		/*
    641		 * We need to check if the item is already in the list.
    642		 * During the "sproc" callback execution time, items are
    643		 * queued into ->ovflist but the "txlist" might already
    644		 * contain them, and the list_splice() below takes care of them.
    645		 */
    646		if (!ep_is_linked(epi)) {
    647			/*
    648			 * ->ovflist is LIFO, so we have to reverse it in order
     649			 * to keep it in FIFO order.
    650			 */
    651			list_add(&epi->rdllink, &ep->rdllist);
    652			ep_pm_stay_awake(epi);
    653		}
    654	}
    655	/*
    656	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
    657	 * releasing the lock, events will be queued in the normal way inside
    658	 * ep->rdllist.
    659	 */
    660	WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
    661
    662	/*
    663	 * Quickly re-inject items left on "txlist".
    664	 */
    665	list_splice(txlist, &ep->rdllist);
    666	__pm_relax(ep->ws);
    667
    668	if (!list_empty(&ep->rdllist)) {
    669		if (waitqueue_active(&ep->wq))
    670			wake_up(&ep->wq);
    671	}
    672
    673	write_unlock_irq(&ep->lock);
    674}
    675
    676static void epi_rcu_free(struct rcu_head *head)
    677{
    678	struct epitem *epi = container_of(head, struct epitem, rcu);
    679	kmem_cache_free(epi_cache, epi);
    680}
    681
    682/*
    683 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
    684 * all the associated resources. Must be called with "mtx" held.
    685 */
    686static int ep_remove(struct eventpoll *ep, struct epitem *epi)
    687{
    688	struct file *file = epi->ffd.file;
    689	struct epitems_head *to_free;
    690	struct hlist_head *head;
    691
    692	lockdep_assert_irqs_enabled();
    693
    694	/*
    695	 * Removes poll wait queue hooks.
    696	 */
    697	ep_unregister_pollwait(ep, epi);
    698
    699	/* Remove the current item from the list of epoll hooks */
    700	spin_lock(&file->f_lock);
    701	to_free = NULL;
    702	head = file->f_ep;
    703	if (head->first == &epi->fllink && !epi->fllink.next) {
    704		file->f_ep = NULL;
    705		if (!is_file_epoll(file)) {
    706			struct epitems_head *v;
    707			v = container_of(head, struct epitems_head, epitems);
    708			if (!smp_load_acquire(&v->next))
    709				to_free = v;
    710		}
    711	}
    712	hlist_del_rcu(&epi->fllink);
    713	spin_unlock(&file->f_lock);
    714	free_ephead(to_free);
    715
    716	rb_erase_cached(&epi->rbn, &ep->rbr);
    717
    718	write_lock_irq(&ep->lock);
    719	if (ep_is_linked(epi))
    720		list_del_init(&epi->rdllink);
    721	write_unlock_irq(&ep->lock);
    722
    723	wakeup_source_unregister(ep_wakeup_source(epi));
    724	/*
    725	 * At this point it is safe to free the eventpoll item. Use the union
    726	 * field epi->rcu, since we are trying to minimize the size of
    727	 * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
    728	 * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
    729	 * use of the rbn field.
    730	 */
    731	call_rcu(&epi->rcu, epi_rcu_free);
    732
    733	percpu_counter_dec(&ep->user->epoll_watches);
    734
    735	return 0;
    736}
    737
    738static void ep_free(struct eventpoll *ep)
    739{
    740	struct rb_node *rbp;
    741	struct epitem *epi;
    742
     743	/* We need to release all tasks waiting on this file */
    744	if (waitqueue_active(&ep->poll_wait))
    745		ep_poll_safewake(ep, NULL);
    746
    747	/*
    748	 * We need to lock this because we could be hit by
    749	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
    750	 * We do not need to hold "ep->mtx" here because the epoll file
    751	 * is on the way to be removed and no one has references to it
    752	 * anymore. The only hit might come from eventpoll_release_file() but
    753	 * holding "epmutex" is sufficient here.
    754	 */
    755	mutex_lock(&epmutex);
    756
    757	/*
     758	 * Walks through the whole tree, unregistering poll callbacks.
    759	 */
    760	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
    761		epi = rb_entry(rbp, struct epitem, rbn);
    762
    763		ep_unregister_pollwait(ep, epi);
    764		cond_resched();
    765	}
    766
    767	/*
     768	 * Walks through the whole tree, freeing each "struct epitem". At this
    769	 * point we are sure no poll callbacks will be lingering around, and also by
    770	 * holding "epmutex" we can be sure that no file cleanup code will hit
    771	 * us during this operation. So we can avoid the lock on "ep->lock".
     772	 * We do not need to lock ep->mtx, either; we only do it to prevent
    773	 * a lockdep warning.
    774	 */
    775	mutex_lock(&ep->mtx);
    776	while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
    777		epi = rb_entry(rbp, struct epitem, rbn);
    778		ep_remove(ep, epi);
    779		cond_resched();
    780	}
    781	mutex_unlock(&ep->mtx);
    782
    783	mutex_unlock(&epmutex);
    784	mutex_destroy(&ep->mtx);
    785	free_uid(ep->user);
    786	wakeup_source_unregister(ep->ws);
    787	kfree(ep);
    788}
    789
    790static int ep_eventpoll_release(struct inode *inode, struct file *file)
    791{
    792	struct eventpoll *ep = file->private_data;
    793
    794	if (ep)
    795		ep_free(ep);
    796
    797	return 0;
    798}
    799
    800static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);
    801
    802static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
    803{
    804	struct eventpoll *ep = file->private_data;
    805	LIST_HEAD(txlist);
    806	struct epitem *epi, *tmp;
    807	poll_table pt;
    808	__poll_t res = 0;
    809
    810	init_poll_funcptr(&pt, NULL);
    811
    812	/* Insert inside our poll wait queue */
    813	poll_wait(file, &ep->poll_wait, wait);
    814
    815	/*
    816	 * Proceed to find out if wanted events are really available inside
    817	 * the ready list.
    818	 */
    819	mutex_lock_nested(&ep->mtx, depth);
    820	ep_start_scan(ep, &txlist);
    821	list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
    822		if (ep_item_poll(epi, &pt, depth + 1)) {
    823			res = EPOLLIN | EPOLLRDNORM;
    824			break;
    825		} else {
    826			/*
    827			 * Item has been dropped into the ready list by the poll
    828			 * callback, but it's not actually ready, as far as
     829			 * the caller-requested events go. We can remove it here.
    830			 */
    831			__pm_relax(ep_wakeup_source(epi));
    832			list_del_init(&epi->rdllink);
    833		}
    834	}
    835	ep_done_scan(ep, &txlist);
    836	mutex_unlock(&ep->mtx);
    837	return res;
    838}
    839
    840/*
    841 * Differs from ep_eventpoll_poll() in that internal callers already have
    842 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
    843 * is correctly annotated.
    844 */
    845static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
    846				 int depth)
    847{
    848	struct file *file = epi->ffd.file;
    849	__poll_t res;
    850
    851	pt->_key = epi->event.events;
    852	if (!is_file_epoll(file))
    853		res = vfs_poll(file, pt);
    854	else
    855		res = __ep_eventpoll_poll(file, pt, depth);
    856	return res & epi->event.events;
    857}
    858
    859static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
    860{
    861	return __ep_eventpoll_poll(file, wait, 0);
    862}
    863
    864#ifdef CONFIG_PROC_FS
    865static void ep_show_fdinfo(struct seq_file *m, struct file *f)
    866{
    867	struct eventpoll *ep = f->private_data;
    868	struct rb_node *rbp;
    869
    870	mutex_lock(&ep->mtx);
    871	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
    872		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
    873		struct inode *inode = file_inode(epi->ffd.file);
    874
    875		seq_printf(m, "tfd: %8d events: %8x data: %16llx "
    876			   " pos:%lli ino:%lx sdev:%x\n",
    877			   epi->ffd.fd, epi->event.events,
    878			   (long long)epi->event.data,
    879			   (long long)epi->ffd.file->f_pos,
    880			   inode->i_ino, inode->i_sb->s_dev);
    881		if (seq_has_overflowed(m))
    882			break;
    883	}
    884	mutex_unlock(&ep->mtx);
    885}
    886#endif
    887
    888/* File callbacks that implement the eventpoll file behaviour */
    889static const struct file_operations eventpoll_fops = {
    890#ifdef CONFIG_PROC_FS
    891	.show_fdinfo	= ep_show_fdinfo,
    892#endif
    893	.release	= ep_eventpoll_release,
    894	.poll		= ep_eventpoll_poll,
    895	.llseek		= noop_llseek,
    896};
    897
    898/*
    899 * This is called from eventpoll_release() to unlink files from the eventpoll
     900 * interface. We need this facility to correctly clean up files that are
    901 * closed without being removed from the eventpoll interface.
    902 */
    903void eventpoll_release_file(struct file *file)
    904{
    905	struct eventpoll *ep;
    906	struct epitem *epi;
    907	struct hlist_node *next;
    908
    909	/*
     910	 * We don't need to take "file->f_lock" here, because we're in the
     911	 * "struct file" cleanup path, which means that no one is using this
     912	 * file anymore.
     913	 * So, for example, epoll_ctl() cannot hit here since if we reach this
     914	 * point, the file counter already went to zero and fget() would fail.
     915	 * The only hit might come from ep_free() but holding the mutex
    916	 * will correctly serialize the operation. We do need to acquire
    917	 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
    918	 * from anywhere but ep_free().
    919	 *
    920	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
    921	 */
    922	mutex_lock(&epmutex);
    923	if (unlikely(!file->f_ep)) {
    924		mutex_unlock(&epmutex);
    925		return;
    926	}
    927	hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) {
    928		ep = epi->ep;
    929		mutex_lock_nested(&ep->mtx, 0);
    930		ep_remove(ep, epi);
    931		mutex_unlock(&ep->mtx);
    932	}
    933	mutex_unlock(&epmutex);
    934}
    935
    936static int ep_alloc(struct eventpoll **pep)
    937{
    938	int error;
    939	struct user_struct *user;
    940	struct eventpoll *ep;
    941
    942	user = get_current_user();
    943	error = -ENOMEM;
    944	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
    945	if (unlikely(!ep))
    946		goto free_uid;
    947
    948	mutex_init(&ep->mtx);
    949	rwlock_init(&ep->lock);
    950	init_waitqueue_head(&ep->wq);
    951	init_waitqueue_head(&ep->poll_wait);
    952	INIT_LIST_HEAD(&ep->rdllist);
    953	ep->rbr = RB_ROOT_CACHED;
    954	ep->ovflist = EP_UNACTIVE_PTR;
    955	ep->user = user;
    956
    957	*pep = ep;
    958
    959	return 0;
    960
    961free_uid:
    962	free_uid(user);
    963	return error;
    964}
    965
    966/*
    967 * Search the file inside the eventpoll tree. The RB tree operations
    968 * are protected by the "mtx" mutex, and ep_find() must be called with
    969 * "mtx" held.
    970 */
    971static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
    972{
    973	int kcmp;
    974	struct rb_node *rbp;
    975	struct epitem *epi, *epir = NULL;
    976	struct epoll_filefd ffd;
    977
    978	ep_set_ffd(&ffd, file, fd);
    979	for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
    980		epi = rb_entry(rbp, struct epitem, rbn);
    981		kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
    982		if (kcmp > 0)
    983			rbp = rbp->rb_right;
    984		else if (kcmp < 0)
    985			rbp = rbp->rb_left;
    986		else {
    987			epir = epi;
    988			break;
    989		}
    990	}
    991
    992	return epir;
    993}
    994
    995#ifdef CONFIG_KCMP
    996static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
    997{
    998	struct rb_node *rbp;
    999	struct epitem *epi;
   1000
   1001	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
   1002		epi = rb_entry(rbp, struct epitem, rbn);
   1003		if (epi->ffd.fd == tfd) {
   1004			if (toff == 0)
   1005				return epi;
   1006			else
   1007				toff--;
   1008		}
   1009		cond_resched();
   1010	}
   1011
   1012	return NULL;
   1013}
   1014
   1015struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
   1016				     unsigned long toff)
   1017{
   1018	struct file *file_raw;
   1019	struct eventpoll *ep;
   1020	struct epitem *epi;
   1021
   1022	if (!is_file_epoll(file))
   1023		return ERR_PTR(-EINVAL);
   1024
   1025	ep = file->private_data;
   1026
   1027	mutex_lock(&ep->mtx);
   1028	epi = ep_find_tfd(ep, tfd, toff);
   1029	if (epi)
   1030		file_raw = epi->ffd.file;
   1031	else
   1032		file_raw = ERR_PTR(-ENOENT);
   1033	mutex_unlock(&ep->mtx);
   1034
   1035	return file_raw;
   1036}
   1037#endif /* CONFIG_KCMP */
   1038
   1039/*
   1040 * Adds a new entry to the tail of the list in a lockless way, i.e.
   1041 * multiple CPUs are allowed to call this function concurrently.
   1042 *
   1043 * Beware: it is necessary to prevent any other modifications of the
   1044 *         existing list until all changes are completed, in other words
   1045 *         concurrent list_add_tail_lockless() calls should be protected
   1046 *         with a read lock, where write lock acts as a barrier which
   1047 *         makes sure all list_add_tail_lockless() calls are fully
   1048 *         completed.
   1049 *
   1050 *        Also an element can be locklessly added to the list only in one
   1051 *        direction i.e. either to the tail or to the head, otherwise
   1052 *        concurrent access will corrupt the list.
   1053 *
   1054 * Return: %false if element has been already added to the list, %true
   1055 * otherwise.
   1056 */
   1057static inline bool list_add_tail_lockless(struct list_head *new,
   1058					  struct list_head *head)
   1059{
   1060	struct list_head *prev;
   1061
   1062	/*
    1063	 * This is a simple 'new->next = head' operation, but cmpxchg()
    1064	 * is used in order to detect that the same element has just been
   1065	 * added to the list from another CPU: the winner observes
   1066	 * new->next == new.
   1067	 */
   1068	if (cmpxchg(&new->next, new, head) != new)
   1069		return false;
   1070
   1071	/*
   1072	 * Initially ->next of a new element must be updated with the head
   1073	 * (we are inserting to the tail) and only then pointers are atomically
   1074	 * exchanged.  XCHG guarantees memory ordering, thus ->next should be
   1075	 * updated before pointers are actually swapped and pointers are
   1076	 * swapped before prev->next is updated.
   1077	 */
   1078
   1079	prev = xchg(&head->prev, new);
   1080
   1081	/*
   1082	 * It is safe to modify prev->next and new->prev, because a new element
   1083	 * is added only to the tail and new->next is updated before XCHG.
   1084	 */
   1085
   1086	prev->next = new;
   1087	new->prev = prev;
   1088
   1089	return true;
   1090}
   1091
   1092/*
   1093 * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
   1094 * i.e. multiple CPUs are allowed to call this function concurrently.
   1095 *
   1096 * Return: %false if epi element has been already chained, %true otherwise.
   1097 */
   1098static inline bool chain_epi_lockless(struct epitem *epi)
   1099{
   1100	struct eventpoll *ep = epi->ep;
   1101
   1102	/* Fast preliminary check */
   1103	if (epi->next != EP_UNACTIVE_PTR)
   1104		return false;
   1105
   1106	/* Check that the same epi has not been just chained from another CPU */
   1107	if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
   1108		return false;
   1109
   1110	/* Atomically exchange tail */
   1111	epi->next = xchg(&ep->ovflist, epi);
   1112
   1113	return true;
   1114}
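/*
 * Illustration only (a standalone userspace sketch, not part of this file;
 * the type and helper names are made up): the claim-then-push pattern used
 * by chain_epi_lockless() above, expressed with C11 atomics. Only the CPU
 * that wins the compare-exchange on ->next gets to push, and the push is a
 * single atomic exchange of the list head. As in the kernel code, a consumer
 * must exclude producers (the role played by ep->lock taken for writing)
 * before it walks the chain.
 */
#if 0
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define NODE_UNACTIVE ((struct node *)-1L)	/* same idea as EP_UNACTIVE_PTR */

struct node {
	_Atomic(struct node *) next;
};

static _Atomic(struct node *) chain_head;

static bool chain_lockless(struct node *n)
{
	struct node *expected = NODE_UNACTIVE;

	/* claim: only the winner flips ->next from UNACTIVE to NULL */
	if (!atomic_compare_exchange_strong(&n->next, &expected, NULL))
		return false;

	/* push: become the new head, then point at the previous head */
	n->next = atomic_exchange(&chain_head, n);
	return true;
}
#endif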
   1115
   1116/*
   1117 * This is the callback that is passed to the wait queue wakeup
   1118 * mechanism. It is called by the stored file descriptors when they
   1119 * have events to report.
   1120 *
   1121 * This callback takes a read lock in order not to contend with concurrent
   1122 * events from another file descriptor, thus all modifications to ->rdllist
   1123 * or ->ovflist are lockless.  Read lock is paired with the write lock from
   1124 * ep_scan_ready_list(), which stops all list modifications and guarantees
   1125 * that lists state is seen correctly.
   1126 *
    1127 * Another thing worth mentioning is that ep_poll_callback() can be called
    1128 * concurrently for the same @epi from different CPUs if the poll table was
    1129 * inited with several wait queue entries.  Wakeups of a single wait queue
    1130 * from different CPUs are serialized by wq.lock, but the case where multiple
    1131 * wait queues are used must be detected separately.  This is done using the
    1132 * cmpxchg() operation.
   1133 */
   1134static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
   1135{
   1136	int pwake = 0;
   1137	struct epitem *epi = ep_item_from_wait(wait);
   1138	struct eventpoll *ep = epi->ep;
   1139	__poll_t pollflags = key_to_poll(key);
   1140	unsigned long flags;
   1141	int ewake = 0;
   1142
   1143	read_lock_irqsave(&ep->lock, flags);
   1144
   1145	ep_set_busy_poll_napi_id(epi);
   1146
   1147	/*
   1148	 * If the event mask does not contain any poll(2) event, we consider the
   1149	 * descriptor to be disabled. This condition is likely the effect of the
   1150	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
    1151	 * until the next EPOLL_CTL_MOD is issued.
   1152	 */
   1153	if (!(epi->event.events & ~EP_PRIVATE_BITS))
   1154		goto out_unlock;
   1155
   1156	/*
   1157	 * Check the events coming with the callback. At this stage, not
   1158	 * every device reports the events in the "key" parameter of the
   1159	 * callback. We need to be able to handle both cases here, hence the
   1160	 * test for "key" != NULL before the event match test.
   1161	 */
   1162	if (pollflags && !(pollflags & epi->event.events))
   1163		goto out_unlock;
   1164
   1165	/*
   1166	 * If we are transferring events to userspace, we can hold no locks
   1167	 * (because we're accessing user memory, and because of linux f_op->poll()
   1168	 * semantics). All the events that happen during that period of time are
   1169	 * chained in ep->ovflist and requeued later on.
   1170	 */
   1171	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
   1172		if (chain_epi_lockless(epi))
   1173			ep_pm_stay_awake_rcu(epi);
   1174	} else if (!ep_is_linked(epi)) {
   1175		/* In the usual case, add event to ready list. */
   1176		if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
   1177			ep_pm_stay_awake_rcu(epi);
   1178	}
   1179
   1180	/*
    1181	 * Wake up (if active) both the eventpoll wait list and the ->poll()
   1182	 * wait list.
   1183	 */
   1184	if (waitqueue_active(&ep->wq)) {
   1185		if ((epi->event.events & EPOLLEXCLUSIVE) &&
   1186					!(pollflags & POLLFREE)) {
   1187			switch (pollflags & EPOLLINOUT_BITS) {
   1188			case EPOLLIN:
   1189				if (epi->event.events & EPOLLIN)
   1190					ewake = 1;
   1191				break;
   1192			case EPOLLOUT:
   1193				if (epi->event.events & EPOLLOUT)
   1194					ewake = 1;
   1195				break;
   1196			case 0:
   1197				ewake = 1;
   1198				break;
   1199			}
   1200		}
   1201		wake_up(&ep->wq);
   1202	}
   1203	if (waitqueue_active(&ep->poll_wait))
   1204		pwake++;
   1205
   1206out_unlock:
   1207	read_unlock_irqrestore(&ep->lock, flags);
   1208
   1209	/* We have to call this outside the lock */
   1210	if (pwake)
   1211		ep_poll_safewake(ep, epi);
   1212
   1213	if (!(epi->event.events & EPOLLEXCLUSIVE))
   1214		ewake = 1;
   1215
   1216	if (pollflags & POLLFREE) {
   1217		/*
   1218		 * If we race with ep_remove_wait_queue() it can miss
   1219		 * ->whead = NULL and do another remove_wait_queue() after
   1220		 * us, so we can't use __remove_wait_queue().
   1221		 */
   1222		list_del_init(&wait->entry);
   1223		/*
   1224		 * ->whead != NULL protects us from the race with ep_free()
   1225		 * or ep_remove(), ep_remove_wait_queue() takes whead->lock
   1226		 * held by the caller. Once we nullify it, nothing protects
   1227		 * ep/epi or even wait.
   1228		 */
   1229		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
   1230	}
   1231
   1232	return ewake;
   1233}
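/*
 * Illustration only (a standalone userspace sketch, not part of this file;
 * the helper name is made up): the EPOLLEXCLUSIVE wakeup filtering handled
 * by ep_poll_callback() above, seen from userspace. When several waiters
 * add the same listening socket with EPOLLEXCLUSIVE, a single incoming
 * connection wakes only a subset of them instead of all of them.
 */
#if 0
#include <sys/epoll.h>

static int wait_exclusive(int listen_fd)
{
	struct epoll_event ev = {
		.events = EPOLLIN | EPOLLEXCLUSIVE,
		.data.fd = listen_fd,
	};
	int epfd = epoll_create1(0);	/* error handling omitted */

	epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev);
	/* not every waiter sharing listen_fd is woken for each event */
	return epoll_wait(epfd, &ev, 1, -1);
}
#endif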
   1234
   1235/*
   1236 * This is the callback that is used to add our wait queue to the
   1237 * target file wakeup lists.
   1238 */
   1239static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
   1240				 poll_table *pt)
   1241{
   1242	struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
   1243	struct epitem *epi = epq->epi;
   1244	struct eppoll_entry *pwq;
   1245
   1246	if (unlikely(!epi))	// an earlier allocation has failed
   1247		return;
   1248
   1249	pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
   1250	if (unlikely(!pwq)) {
   1251		epq->epi = NULL;
   1252		return;
   1253	}
   1254
   1255	init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
   1256	pwq->whead = whead;
   1257	pwq->base = epi;
   1258	if (epi->event.events & EPOLLEXCLUSIVE)
   1259		add_wait_queue_exclusive(whead, &pwq->wait);
   1260	else
   1261		add_wait_queue(whead, &pwq->wait);
   1262	pwq->next = epi->pwqlist;
   1263	epi->pwqlist = pwq;
   1264}
   1265
   1266static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
   1267{
   1268	int kcmp;
   1269	struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
   1270	struct epitem *epic;
   1271	bool leftmost = true;
   1272
   1273	while (*p) {
   1274		parent = *p;
   1275		epic = rb_entry(parent, struct epitem, rbn);
   1276		kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
   1277		if (kcmp > 0) {
   1278			p = &parent->rb_right;
   1279			leftmost = false;
   1280		} else
   1281			p = &parent->rb_left;
   1282	}
   1283	rb_link_node(&epi->rbn, parent, p);
   1284	rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
   1285}
   1286
   1287
   1288
   1289#define PATH_ARR_SIZE 5
   1290/*
    1291 * These are the numbers of paths of length 1 to 5 that we allow to emanate
    1292 * from a single file of interest. For example, we allow 1000 paths of length
    1293 * 1 to emanate from each file of interest. This essentially represents the
   1294 * potential wakeup paths, which need to be limited in order to avoid massive
   1295 * uncontrolled wakeup storms. The common use case should be a single ep which
   1296 * is connected to n file sources. In this case each file source has 1 path
   1297 * of length 1. Thus, the numbers below should be more than sufficient. These
   1298 * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
   1299 * and delete can't add additional paths. Protected by the epmutex.
   1300 */
   1301static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
   1302static int path_count[PATH_ARR_SIZE];
   1303
   1304static int path_count_inc(int nests)
   1305{
   1306	/* Allow an arbitrary number of depth 1 paths */
   1307	if (nests == 0)
   1308		return 0;
   1309
   1310	if (++path_count[nests] > path_limits[nests])
   1311		return -1;
   1312	return 0;
   1313}
   1314
   1315static void path_count_init(void)
   1316{
   1317	int i;
   1318
   1319	for (i = 0; i < PATH_ARR_SIZE; i++)
   1320		path_count[i] = 0;
   1321}
   1322
   1323static int reverse_path_check_proc(struct hlist_head *refs, int depth)
   1324{
   1325	int error = 0;
   1326	struct epitem *epi;
   1327
   1328	if (depth > EP_MAX_NESTS) /* too deep nesting */
   1329		return -1;
   1330
   1331	/* CTL_DEL can remove links here, but that can't increase our count */
   1332	hlist_for_each_entry_rcu(epi, refs, fllink) {
   1333		struct hlist_head *refs = &epi->ep->refs;
   1334		if (hlist_empty(refs))
   1335			error = path_count_inc(depth);
   1336		else
   1337			error = reverse_path_check_proc(refs, depth + 1);
   1338		if (error != 0)
   1339			break;
   1340	}
   1341	return error;
   1342}
   1343
   1344/**
    1345 * reverse_path_check - The tfile_check_list is a list of epitems_head, which have
   1346 *                      links that are proposed to be newly added. We need to
   1347 *                      make sure that those added links don't add too many
   1348 *                      paths such that we will spend all our time waking up
   1349 *                      eventpoll objects.
   1350 *
   1351 * Return: %zero if the proposed links don't create too many paths,
   1352 *	    %-1 otherwise.
   1353 */
   1354static int reverse_path_check(void)
   1355{
   1356	struct epitems_head *p;
   1357
   1358	for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
   1359		int error;
   1360		path_count_init();
   1361		rcu_read_lock();
   1362		error = reverse_path_check_proc(&p->epitems, 0);
   1363		rcu_read_unlock();
   1364		if (error)
   1365			return error;
   1366	}
   1367	return 0;
   1368}
   1369
   1370static int ep_create_wakeup_source(struct epitem *epi)
   1371{
   1372	struct name_snapshot n;
   1373	struct wakeup_source *ws;
   1374
   1375	if (!epi->ep->ws) {
   1376		epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
   1377		if (!epi->ep->ws)
   1378			return -ENOMEM;
   1379	}
   1380
   1381	take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
   1382	ws = wakeup_source_register(NULL, n.name.name);
   1383	release_dentry_name_snapshot(&n);
   1384
   1385	if (!ws)
   1386		return -ENOMEM;
   1387	rcu_assign_pointer(epi->ws, ws);
   1388
   1389	return 0;
   1390}
   1391
   1392/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
   1393static noinline void ep_destroy_wakeup_source(struct epitem *epi)
   1394{
   1395	struct wakeup_source *ws = ep_wakeup_source(epi);
   1396
   1397	RCU_INIT_POINTER(epi->ws, NULL);
   1398
   1399	/*
   1400	 * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
   1401	 * used internally by wakeup_source_remove, too (called by
   1402	 * wakeup_source_unregister), so we cannot use call_rcu
   1403	 */
   1404	synchronize_rcu();
   1405	wakeup_source_unregister(ws);
   1406}
   1407
   1408static int attach_epitem(struct file *file, struct epitem *epi)
   1409{
   1410	struct epitems_head *to_free = NULL;
   1411	struct hlist_head *head = NULL;
   1412	struct eventpoll *ep = NULL;
   1413
   1414	if (is_file_epoll(file))
   1415		ep = file->private_data;
   1416
   1417	if (ep) {
   1418		head = &ep->refs;
   1419	} else if (!READ_ONCE(file->f_ep)) {
   1420allocate:
   1421		to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
   1422		if (!to_free)
   1423			return -ENOMEM;
   1424		head = &to_free->epitems;
   1425	}
   1426	spin_lock(&file->f_lock);
   1427	if (!file->f_ep) {
   1428		if (unlikely(!head)) {
   1429			spin_unlock(&file->f_lock);
   1430			goto allocate;
   1431		}
   1432		file->f_ep = head;
   1433		to_free = NULL;
   1434	}
   1435	hlist_add_head_rcu(&epi->fllink, file->f_ep);
   1436	spin_unlock(&file->f_lock);
   1437	free_ephead(to_free);
   1438	return 0;
   1439}
   1440
   1441/*
   1442 * Must be called with "mtx" held.
   1443 */
   1444static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
   1445		     struct file *tfile, int fd, int full_check)
   1446{
   1447	int error, pwake = 0;
   1448	__poll_t revents;
   1449	struct epitem *epi;
   1450	struct ep_pqueue epq;
   1451	struct eventpoll *tep = NULL;
   1452
   1453	if (is_file_epoll(tfile))
   1454		tep = tfile->private_data;
   1455
   1456	lockdep_assert_irqs_enabled();
   1457
   1458	if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
   1459					    max_user_watches) >= 0))
   1460		return -ENOSPC;
   1461	percpu_counter_inc(&ep->user->epoll_watches);
   1462
   1463	if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
   1464		percpu_counter_dec(&ep->user->epoll_watches);
   1465		return -ENOMEM;
   1466	}
   1467
    1468	/* Item initialization follows here ... */
   1469	INIT_LIST_HEAD(&epi->rdllink);
   1470	epi->ep = ep;
   1471	ep_set_ffd(&epi->ffd, tfile, fd);
   1472	epi->event = *event;
   1473	epi->next = EP_UNACTIVE_PTR;
   1474
   1475	if (tep)
   1476		mutex_lock_nested(&tep->mtx, 1);
    1477	/* Add the current item to the list of active epoll hooks for this file */
   1478	if (unlikely(attach_epitem(tfile, epi) < 0)) {
   1479		if (tep)
   1480			mutex_unlock(&tep->mtx);
   1481		kmem_cache_free(epi_cache, epi);
   1482		percpu_counter_dec(&ep->user->epoll_watches);
   1483		return -ENOMEM;
   1484	}
   1485
   1486	if (full_check && !tep)
   1487		list_file(tfile);
   1488
   1489	/*
   1490	 * Add the current item to the RB tree. All RB tree operations are
   1491	 * protected by "mtx", and ep_insert() is called with "mtx" held.
   1492	 */
   1493	ep_rbtree_insert(ep, epi);
   1494	if (tep)
   1495		mutex_unlock(&tep->mtx);
   1496
   1497	/* now check if we've created too many backpaths */
   1498	if (unlikely(full_check && reverse_path_check())) {
   1499		ep_remove(ep, epi);
   1500		return -EINVAL;
   1501	}
   1502
   1503	if (epi->event.events & EPOLLWAKEUP) {
   1504		error = ep_create_wakeup_source(epi);
   1505		if (error) {
   1506			ep_remove(ep, epi);
   1507			return error;
   1508		}
   1509	}
   1510
   1511	/* Initialize the poll table using the queue callback */
   1512	epq.epi = epi;
   1513	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
   1514
   1515	/*
   1516	 * Attach the item to the poll hooks and get current event bits.
   1517	 * We can safely use the file* here because its usage count has
   1518	 * been increased by the caller of this function. Note that after
   1519	 * this operation completes, the poll callback can start hitting
   1520	 * the new item.
   1521	 */
   1522	revents = ep_item_poll(epi, &epq.pt, 1);
   1523
   1524	/*
   1525	 * We have to check if something went wrong during the poll wait queue
    1526	 * install process. Namely, an allocation for a wait queue failed due
    1527	 * to high memory pressure.
   1528	 */
   1529	if (unlikely(!epq.epi)) {
   1530		ep_remove(ep, epi);
   1531		return -ENOMEM;
   1532	}
   1533
   1534	/* We have to drop the new item inside our item list to keep track of it */
   1535	write_lock_irq(&ep->lock);
   1536
   1537	/* record NAPI ID of new item if present */
   1538	ep_set_busy_poll_napi_id(epi);
   1539
   1540	/* If the file is already "ready" we drop it inside the ready list */
   1541	if (revents && !ep_is_linked(epi)) {
   1542		list_add_tail(&epi->rdllink, &ep->rdllist);
   1543		ep_pm_stay_awake(epi);
   1544
   1545		/* Notify waiting tasks that events are available */
   1546		if (waitqueue_active(&ep->wq))
   1547			wake_up(&ep->wq);
   1548		if (waitqueue_active(&ep->poll_wait))
   1549			pwake++;
   1550	}
   1551
   1552	write_unlock_irq(&ep->lock);
   1553
   1554	/* We have to call this outside the lock */
   1555	if (pwake)
   1556		ep_poll_safewake(ep, NULL);
   1557
   1558	return 0;
   1559}
   1560
   1561/*
   1562 * Modify the interest event mask by dropping an event if the new mask
   1563 * has a match in the current file status. Must be called with "mtx" held.
   1564 */
   1565static int ep_modify(struct eventpoll *ep, struct epitem *epi,
   1566		     const struct epoll_event *event)
   1567{
   1568	int pwake = 0;
   1569	poll_table pt;
   1570
   1571	lockdep_assert_irqs_enabled();
   1572
   1573	init_poll_funcptr(&pt, NULL);
   1574
   1575	/*
   1576	 * Set the new event interest mask before calling f_op->poll();
   1577	 * otherwise we might miss an event that happens between the
    1578	 * f_op->poll() call and the registering of the new event set.
   1579	 */
   1580	epi->event.events = event->events; /* need barrier below */
   1581	epi->event.data = event->data; /* protected by mtx */
   1582	if (epi->event.events & EPOLLWAKEUP) {
   1583		if (!ep_has_wakeup_source(epi))
   1584			ep_create_wakeup_source(epi);
   1585	} else if (ep_has_wakeup_source(epi)) {
   1586		ep_destroy_wakeup_source(epi);
   1587	}
   1588
   1589	/*
   1590	 * The following barrier has two effects:
   1591	 *
   1592	 * 1) Flush epi changes above to other CPUs.  This ensures
   1593	 *    we do not miss events from ep_poll_callback if an
   1594	 *    event occurs immediately after we call f_op->poll().
   1595	 *    We need this because we did not take ep->lock while
   1596	 *    changing epi above (but ep_poll_callback does take
   1597	 *    ep->lock).
   1598	 *
   1599	 * 2) We also need to ensure we do not miss _past_ events
   1600	 *    when calling f_op->poll().  This barrier also
   1601	 *    pairs with the barrier in wq_has_sleeper (see
   1602	 *    comments for wq_has_sleeper).
   1603	 *
   1604	 * This barrier will now guarantee ep_poll_callback or f_op->poll
   1605	 * (or both) will notice the readiness of an item.
   1606	 */
   1607	smp_mb();
   1608
   1609	/*
   1610	 * Get current event bits. We can safely use the file* here because
   1611	 * its usage count has been increased by the caller of this function.
   1612	 * If the item is "hot" and it is not registered inside the ready
   1613	 * list, push it inside.
   1614	 */
   1615	if (ep_item_poll(epi, &pt, 1)) {
   1616		write_lock_irq(&ep->lock);
   1617		if (!ep_is_linked(epi)) {
   1618			list_add_tail(&epi->rdllink, &ep->rdllist);
   1619			ep_pm_stay_awake(epi);
   1620
   1621			/* Notify waiting tasks that events are available */
   1622			if (waitqueue_active(&ep->wq))
   1623				wake_up(&ep->wq);
   1624			if (waitqueue_active(&ep->poll_wait))
   1625				pwake++;
   1626		}
   1627		write_unlock_irq(&ep->lock);
   1628	}
   1629
   1630	/* We have to call this outside the lock */
   1631	if (pwake)
   1632		ep_poll_safewake(ep, NULL);
   1633
   1634	return 0;
   1635}
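/*
 * Illustration only (a standalone userspace sketch, not part of this file;
 * the helper name is made up): the typical EPOLLONESHOT re-arm that ends up
 * in ep_modify() above. Once a one-shot event has been delivered, the
 * descriptor stays disabled until userspace issues EPOLL_CTL_MOD with a
 * fresh event mask.
 */
#if 0
#include <sys/epoll.h>

static void rearm_oneshot(int epfd, int fd)
{
	struct epoll_event ev = {
		.events = EPOLLIN | EPOLLONESHOT,
		.data.fd = fd,
	};

	/* re-enable the (currently disabled) one-shot descriptor */
	epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
}
#endif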
   1636
   1637static int ep_send_events(struct eventpoll *ep,
   1638			  struct epoll_event __user *events, int maxevents)
   1639{
   1640	struct epitem *epi, *tmp;
   1641	LIST_HEAD(txlist);
   1642	poll_table pt;
   1643	int res = 0;
   1644
   1645	/*
   1646	 * Always short-circuit for fatal signals to allow threads to make a
   1647	 * timely exit without the chance of finding more events available and
   1648	 * fetching repeatedly.
   1649	 */
   1650	if (fatal_signal_pending(current))
   1651		return -EINTR;
   1652
   1653	init_poll_funcptr(&pt, NULL);
   1654
   1655	mutex_lock(&ep->mtx);
   1656	ep_start_scan(ep, &txlist);
   1657
   1658	/*
   1659	 * We can loop without lock because we are passed a task private list.
   1660	 * Items cannot vanish during the loop because we are holding ep->mtx.
   1661	 */
   1662	list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
   1663		struct wakeup_source *ws;
   1664		__poll_t revents;
   1665
   1666		if (res >= maxevents)
   1667			break;
   1668
   1669		/*
   1670		 * Activate ep->ws before deactivating epi->ws to prevent
   1671		 * triggering auto-suspend here (in case we reactivate epi->ws
   1672		 * below).
   1673		 *
   1674		 * This could be rearranged to delay the deactivation of epi->ws
   1675		 * instead, but then epi->ws would temporarily be out of sync
   1676		 * with ep_is_linked().
   1677		 */
   1678		ws = ep_wakeup_source(epi);
   1679		if (ws) {
   1680			if (ws->active)
   1681				__pm_stay_awake(ep->ws);
   1682			__pm_relax(ws);
   1683		}
   1684
   1685		list_del_init(&epi->rdllink);
   1686
   1687		/*
   1688		 * If the event mask intersects the caller-requested one,
   1689		 * deliver the event to userspace. Again, we are holding ep->mtx,
   1690		 * so no operations coming from userspace can change the item.
   1691		 */
   1692		revents = ep_item_poll(epi, &pt, 1);
   1693		if (!revents)
   1694			continue;
   1695
   1696		events = epoll_put_uevent(revents, epi->event.data, events);
   1697		if (!events) {
   1698			list_add(&epi->rdllink, &txlist);
   1699			ep_pm_stay_awake(epi);
   1700			if (!res)
   1701				res = -EFAULT;
   1702			break;
   1703		}
   1704		res++;
   1705		if (epi->event.events & EPOLLONESHOT)
   1706			epi->event.events &= EP_PRIVATE_BITS;
   1707		else if (!(epi->event.events & EPOLLET)) {
   1708			/*
   1709			 * If this file has been added with Level
   1710			 * Trigger mode, we need to insert back inside
   1711			 * the ready list, so that the next call to
   1712			 * epoll_wait() will check again the events
   1713			 * availability. At this point, no one can insert
   1714			 * into ep->rdllist besides us. The epoll_ctl()
   1715			 * callers are locked out by
   1716			 * ep_send_events() holding "mtx" and the
   1717			 * poll callback will queue them in ep->ovflist.
   1718			 */
   1719			list_add_tail(&epi->rdllink, &ep->rdllist);
   1720			ep_pm_stay_awake(epi);
   1721		}
   1722	}
   1723	ep_done_scan(ep, &txlist);
   1724	mutex_unlock(&ep->mtx);
   1725
   1726	return res;
   1727}
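
/*
 * Illustrative userspace sketch (assumes <sys/epoll.h>, <unistd.h>, <errno.h>
 * and a non-blocking fd registered with EPOLLIN | EPOLLET; not kernel code).
 * The re-queue at the end of ep_send_events() is what makes a level-triggered
 * descriptor show up again on every epoll_wait() while data is pending,
 * whereas an edge-triggered descriptor is reported once and must be drained:
 *
 *	char buf[4096];
 *	ssize_t n;
 *
 *	do {
 *		n = read(fd, buf, sizeof(buf));
 *	} while (n > 0);
 *	if (n < 0 && errno != EAGAIN)
 *		perror("read");		// real error, not just drained
 */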
   1728
   1729static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
   1730{
   1731	struct timespec64 now;
   1732
   1733	if (ms < 0)
   1734		return NULL;
   1735
   1736	if (!ms) {
   1737		to->tv_sec = 0;
   1738		to->tv_nsec = 0;
   1739		return to;
   1740	}
   1741
   1742	to->tv_sec = ms / MSEC_PER_SEC;
   1743	to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
   1744
   1745	ktime_get_ts64(&now);
   1746	*to = timespec64_add_safe(now, *to);
   1747	return to;
   1748}
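
/*
 * Worked example of the conversion above (values chosen for illustration):
 * a relative timeout of ms = 1250 yields tv_sec = 1250 / MSEC_PER_SEC = 1 and
 * tv_nsec = (1250 % MSEC_PER_SEC) * NSEC_PER_MSEC = 250000000, which is then
 * added to the current ktime_get_ts64() time so that ep_poll() receives an
 * absolute expiry. ms == 0 returns a zeroed timespec (non-blocking poll) and
 * ms < 0 returns NULL (block until an event or a signal).
 */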
   1749
   1750/**
   1751 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
   1752 *           event buffer.
   1753 *
   1754 * @ep: Pointer to the eventpoll context.
   1755 * @events: Pointer to the userspace buffer where the ready events should be
   1756 *          stored.
   1757 * @maxevents: Size (in terms of number of events) of the caller event buffer.
   1758 * @timeout: Maximum timeout for the ready events fetch operation, in
   1759 *           timespec. If the timeout is zero, the function will not block,
   1760 *           while if the @timeout ptr is NULL, the function will block
   1761 *           until at least one event has been retrieved (or an error
   1762 *           occurred).
   1763 *
   1764 * Return: the number of ready events which have been fetched, or an
   1765 *          error code, in case of error.
   1766 */
   1767static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
   1768		   int maxevents, struct timespec64 *timeout)
   1769{
   1770	int res, eavail, timed_out = 0;
   1771	u64 slack = 0;
   1772	wait_queue_entry_t wait;
   1773	ktime_t expires, *to = NULL;
   1774
   1775	lockdep_assert_irqs_enabled();
   1776
   1777	if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
   1778		slack = select_estimate_accuracy(timeout);
   1779		to = &expires;
   1780		*to = timespec64_to_ktime(*timeout);
   1781	} else if (timeout) {
   1782		/*
   1783		 * Avoid the unnecessary trip to the wait queue loop, if the
   1784		 * caller specified a non-blocking operation.
   1785		 */
   1786		timed_out = 1;
   1787	}
   1788
   1789	/*
   1790	 * This call is racy: We may or may not see events that are being added
   1791	 * to the ready list under the lock (e.g., in IRQ callbacks). For cases
   1792	 * with a non-zero timeout, this thread will check the ready list under
   1793	 * lock and will add to the wait queue.  For cases with a zero
   1794	 * timeout, the user by definition should not care and will have to
   1795	 * recheck again.
   1796	 */
   1797	eavail = ep_events_available(ep);
   1798
   1799	while (1) {
   1800		if (eavail) {
   1801			/*
   1802			 * Try to transfer events to user space. If we get
   1803			 * 0 events and there is still timeout left over, we
   1804			 * try again in search of more luck.
   1805			 */
   1806			res = ep_send_events(ep, events, maxevents);
   1807			if (res)
   1808				return res;
   1809		}
   1810
   1811		if (timed_out)
   1812			return 0;
   1813
   1814		eavail = ep_busy_loop(ep, timed_out);
   1815		if (eavail)
   1816			continue;
   1817
   1818		if (signal_pending(current))
   1819			return -EINTR;
   1820
   1821		/*
   1822		 * Internally init_wait() uses autoremove_wake_function(),
   1823		 * so the wait entry is removed from the wait queue on each
   1824		 * wakeup. Why is this important? With several waiters,
   1825		 * each new wakeup will hit the next waiter, giving it the
   1826		 * chance to harvest new events. Otherwise a wakeup can be
   1827		 * lost. This is also good performance-wise: on the normal
   1828		 * wakeup path there is no need to call __remove_wait_queue()
   1829		 * explicitly, so ep->lock is not taken, which would stall
   1830		 * event delivery.
   1831		 */
   1832		init_wait(&wait);
   1833
   1834		write_lock_irq(&ep->lock);
   1835		/*
   1836		 * Barrierless variant, waitqueue_active() is called under
   1837		 * the same lock on wakeup ep_poll_callback() side, so it
   1838		 * is safe to avoid an explicit barrier.
   1839		 */
   1840		__set_current_state(TASK_INTERRUPTIBLE);
   1841
   1842		/*
   1843		 * Do the final check under the lock. ep_start_scan()
   1844		 * plays with two lists (->rdllist and ->ovflist) and there
   1845		 * is always a race when both lists are empty for short
   1846		 * period of time although events are pending, so lock is
   1847		 * important.
   1848		 */
   1849		eavail = ep_events_available(ep);
   1850		if (!eavail)
   1851			__add_wait_queue_exclusive(&ep->wq, &wait);
   1852
   1853		write_unlock_irq(&ep->lock);
   1854
   1855		if (!eavail)
   1856			timed_out = !schedule_hrtimeout_range(to, slack,
   1857							      HRTIMER_MODE_ABS);
   1858		__set_current_state(TASK_RUNNING);
   1859
   1860		/*
   1861		 * We were woken up, thus go and try to harvest some events.
   1862		 * If timed out and still on the wait queue, recheck eavail
   1863		 * carefully under lock, below.
   1864		 */
   1865		eavail = 1;
   1866
   1867		if (!list_empty_careful(&wait.entry)) {
   1868			write_lock_irq(&ep->lock);
   1869			/*
   1870			 * If the thread timed out and is not on the wait queue,
   1871			 * it means that the thread was woken up after its
   1872			 * timeout expired before it could reacquire the lock.
   1873			 * Thus, when wait.entry is empty, it needs to harvest
   1874			 * events.
   1875			 */
   1876			if (timed_out)
   1877				eavail = list_empty(&wait.entry);
   1878			__remove_wait_queue(&ep->wq, &wait);
   1879			write_unlock_irq(&ep->lock);
   1880		}
   1881	}
   1882}
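
/*
 * Illustrative userspace sketch of the timeout semantics handled above
 * (assumes <sys/epoll.h> and a valid epfd; not kernel code). A timeout of 0
 * milliseconds maps to the timed_out-immediately path, while -1 maps to a
 * NULL timespec and blocks until an event or a signal arrives:
 *
 *	struct epoll_event evs[64];
 *	int n;
 *
 *	n = epoll_wait(epfd, evs, 64, 0);	// poll once, never sleep
 *	n = epoll_wait(epfd, evs, 64, -1);	// sleep until something is ready
 *	if (n < 0)
 *		perror("epoll_wait");
 */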
   1883
   1884/**
   1885 * ep_loop_check_proc - verify that adding an epoll file inside another
   1886 *                      epoll structure does not violate the constraints, in
   1887 *                      epoll structure does not violate the constraints in
   1888 *                      terms of closed loops or too-deep chains (which can
   1889 *
   1890 * @ep: the &struct eventpoll to be currently checked.
   1891 * @depth: Current depth of the path being checked.
   1892 *
   1893 * Return: %zero if adding the epoll @file inside current epoll
   1894 *          structure @ep does not violate the constraints, or %-1 otherwise.
   1895 */
   1896static int ep_loop_check_proc(struct eventpoll *ep, int depth)
   1897{
   1898	int error = 0;
   1899	struct rb_node *rbp;
   1900	struct epitem *epi;
   1901
   1902	mutex_lock_nested(&ep->mtx, depth + 1);
   1903	ep->gen = loop_check_gen;
   1904	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
   1905		epi = rb_entry(rbp, struct epitem, rbn);
   1906		if (unlikely(is_file_epoll(epi->ffd.file))) {
   1907			struct eventpoll *ep_tovisit;
   1908			ep_tovisit = epi->ffd.file->private_data;
   1909			if (ep_tovisit->gen == loop_check_gen)
   1910				continue;
   1911			if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
   1912				error = -1;
   1913			else
   1914				error = ep_loop_check_proc(ep_tovisit, depth + 1);
   1915			if (error != 0)
   1916				break;
   1917		} else {
   1918			/*
   1919			 * If we've reached a file that is not associated with
   1920			 * an ep, then we need to check if the newly added
   1921			 * links are going to add too many wakeup paths. We do
   1922			 * this by adding it to the tfile_check_list, if it's
   1923			 * not already there, and calling reverse_path_check()
   1924			 * during ep_insert().
   1925			 */
   1926			list_file(epi->ffd.file);
   1927		}
   1928	}
   1929	mutex_unlock(&ep->mtx);
   1930
   1931	return error;
   1932}
   1933
   1934/**
   1935 * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
   1936 *                 into another epoll file (represented by @ep) does not create
   1937 *                 closed loops or too deep chains.
   1938 *
   1939 * @ep: Pointer to the epoll we are inserting into.
   1940 * @to: Pointer to the epoll to be inserted.
   1941 *
   1942 * Return: %zero if adding the epoll @to inside the epoll @ep
   1943 * does not violate the constraints, or %-1 otherwise.
   1944 */
   1945static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
   1946{
   1947	inserting_into = ep;
   1948	return ep_loop_check_proc(to, 0);
   1949}
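
/*
 * Illustrative userspace sketch of what the loop check rejects (assumes
 * <sys/epoll.h>; not kernel code). Adding ep2 to ep1 and then ep1 to ep2
 * would create a cycle, so the second EPOLL_CTL_ADD fails with ELOOP:
 *
 *	int ep1 = epoll_create1(0);
 *	int ep2 = epoll_create1(0);
 *	struct epoll_event ev = { .events = EPOLLIN };
 *
 *	epoll_ctl(ep1, EPOLL_CTL_ADD, ep2, &ev);	// ok
 *	if (epoll_ctl(ep2, EPOLL_CTL_ADD, ep1, &ev) < 0)
 *		perror("epoll_ctl");			// expected: ELOOP
 */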
   1950
   1951static void clear_tfile_check_list(void)
   1952{
   1953	rcu_read_lock();
   1954	while (tfile_check_list != EP_UNACTIVE_PTR) {
   1955		struct epitems_head *head = tfile_check_list;
   1956		tfile_check_list = head->next;
   1957		unlist_file(head);
   1958	}
   1959	rcu_read_unlock();
   1960}
   1961
   1962/*
   1963 * Open an eventpoll file descriptor.
   1964 */
   1965static int do_epoll_create(int flags)
   1966{
   1967	int error, fd;
   1968	struct eventpoll *ep = NULL;
   1969	struct file *file;
   1970
   1971	/* Check the EPOLL_* constant for consistency.  */
   1972	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
   1973
   1974	if (flags & ~EPOLL_CLOEXEC)
   1975		return -EINVAL;
   1976	/*
   1977	 * Create the internal data structure ("struct eventpoll").
   1978	 */
   1979	error = ep_alloc(&ep);
   1980	if (error < 0)
   1981		return error;
   1982	/*
   1983	 * Creates all the items needed to set up an eventpoll file. That is,
   1984	 * a file structure and a free file descriptor.
   1985	 */
   1986	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
   1987	if (fd < 0) {
   1988		error = fd;
   1989		goto out_free_ep;
   1990	}
   1991	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
   1992				 O_RDWR | (flags & O_CLOEXEC));
   1993	if (IS_ERR(file)) {
   1994		error = PTR_ERR(file);
   1995		goto out_free_fd;
   1996	}
   1997	ep->file = file;
   1998	fd_install(fd, file);
   1999	return fd;
   2000
   2001out_free_fd:
   2002	put_unused_fd(fd);
   2003out_free_ep:
   2004	ep_free(ep);
   2005	return error;
   2006}
   2007
   2008SYSCALL_DEFINE1(epoll_create1, int, flags)
   2009{
   2010	return do_epoll_create(flags);
   2011}
   2012
   2013SYSCALL_DEFINE1(epoll_create, int, size)
   2014{
   2015	if (size <= 0)
   2016		return -EINVAL;
   2017
   2018	return do_epoll_create(0);
   2019}
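
/*
 * Illustrative userspace sketch (assumes <sys/epoll.h>; not kernel code).
 * epoll_create1() is the modern entry point; the size argument of the legacy
 * epoll_create() is only checked for being positive and is otherwise ignored,
 * as implemented above:
 *
 *	int epfd = epoll_create1(EPOLL_CLOEXEC);
 *	if (epfd < 0)
 *		perror("epoll_create1");
 */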
   2020
   2021static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
   2022				   bool nonblock)
   2023{
   2024	if (!nonblock) {
   2025		mutex_lock_nested(mutex, depth);
   2026		return 0;
   2027	}
   2028	if (mutex_trylock(mutex))
   2029		return 0;
   2030	return -EAGAIN;
   2031}
   2032
   2033int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
   2034		 bool nonblock)
   2035{
   2036	int error;
   2037	int full_check = 0;
   2038	struct fd f, tf;
   2039	struct eventpoll *ep;
   2040	struct epitem *epi;
   2041	struct eventpoll *tep = NULL;
   2042
   2043	error = -EBADF;
   2044	f = fdget(epfd);
   2045	if (!f.file)
   2046		goto error_return;
   2047
   2048	/* Get the "struct file *" for the target file */
   2049	tf = fdget(fd);
   2050	if (!tf.file)
   2051		goto error_fput;
   2052
   2053	/* The target file descriptor must support poll */
   2054	error = -EPERM;
   2055	if (!file_can_poll(tf.file))
   2056		goto error_tgt_fput;
   2057
   2058	/* Check if EPOLLWAKEUP is allowed */
   2059	if (ep_op_has_event(op))
   2060		ep_take_care_of_epollwakeup(epds);
   2061
   2062	/*
   2063	 * We have to check that the file structure underneath the file descriptor
   2064	 * the user passed to us _is_ an eventpoll file. We also do not permit
   2065	 * adding an epoll file descriptor inside itself.
   2066	 */
   2067	error = -EINVAL;
   2068	if (f.file == tf.file || !is_file_epoll(f.file))
   2069		goto error_tgt_fput;
   2070
   2071	/*
   2072	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
   2073	 * so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation.
   2074	 * Also, we do not currently support nested exclusive wakeups.
   2075	 */
   2076	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
   2077		if (op == EPOLL_CTL_MOD)
   2078			goto error_tgt_fput;
   2079		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
   2080				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
   2081			goto error_tgt_fput;
   2082	}
   2083
   2084	/*
   2085	 * At this point it is safe to assume that the "private_data" contains
   2086	 * our own data structure.
   2087	 */
   2088	ep = f.file->private_data;
   2089
   2090	/*
   2091	 * When we insert an epoll file descriptor inside another epoll file
   2092	 * descriptor, there is the chance of creating closed loops, which are
   2093	 * better handled here than in more critical paths. While we are
   2094	 * checking for loops we also determine the list of files reachable
   2095	 * and hang them on the tfile_check_list, so we can check that we
   2096	 * haven't created too many possible wakeup paths.
   2097	 *
   2098	 * We do not need to take the global 'epmutex' on EPOLL_CTL_ADD when
   2099	 * the epoll file descriptor is attaching directly to a wakeup source,
   2100	 * unless the epoll file descriptor is nested. The purpose of taking the
   2101	 * 'epmutex' on add is to prevent complex topologies such as loops and
   2102	 * deep wakeup paths from forming in parallel through multiple
   2103	 * EPOLL_CTL_ADD operations.
   2104	 */
   2105	error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
   2106	if (error)
   2107		goto error_tgt_fput;
   2108	if (op == EPOLL_CTL_ADD) {
   2109		if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
   2110		    is_file_epoll(tf.file)) {
   2111			mutex_unlock(&ep->mtx);
   2112			error = epoll_mutex_lock(&epmutex, 0, nonblock);
   2113			if (error)
   2114				goto error_tgt_fput;
   2115			loop_check_gen++;
   2116			full_check = 1;
   2117			if (is_file_epoll(tf.file)) {
   2118				tep = tf.file->private_data;
   2119				error = -ELOOP;
   2120				if (ep_loop_check(ep, tep) != 0)
   2121					goto error_tgt_fput;
   2122			}
   2123			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
   2124			if (error)
   2125				goto error_tgt_fput;
   2126		}
   2127	}
   2128
   2129	/*
   2130	 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
   2131	 * above, we can be sure to be able to use the item looked up by
   2132	 * ep_find() till we release the mutex.
   2133	 */
   2134	epi = ep_find(ep, tf.file, fd);
   2135
   2136	error = -EINVAL;
   2137	switch (op) {
   2138	case EPOLL_CTL_ADD:
   2139		if (!epi) {
   2140			epds->events |= EPOLLERR | EPOLLHUP;
   2141			error = ep_insert(ep, epds, tf.file, fd, full_check);
   2142		} else
   2143			error = -EEXIST;
   2144		break;
   2145	case EPOLL_CTL_DEL:
   2146		if (epi)
   2147			error = ep_remove(ep, epi);
   2148		else
   2149			error = -ENOENT;
   2150		break;
   2151	case EPOLL_CTL_MOD:
   2152		if (epi) {
   2153			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
   2154				epds->events |= EPOLLERR | EPOLLHUP;
   2155				error = ep_modify(ep, epi, epds);
   2156			}
   2157		} else
   2158			error = -ENOENT;
   2159		break;
   2160	}
   2161	mutex_unlock(&ep->mtx);
   2162
   2163error_tgt_fput:
   2164	if (full_check) {
   2165		clear_tfile_check_list();
   2166		loop_check_gen++;
   2167		mutex_unlock(&epmutex);
   2168	}
   2169
   2170	fdput(tf);
   2171error_fput:
   2172	fdput(f);
   2173error_return:
   2174
   2175	return error;
   2176}
   2177
   2178/*
   2179 * The following function implements the controller interface for
   2180 * the eventpoll file that enables the insertion/removal/change of
   2181 * file descriptors inside the interest set.
   2182 */
   2183SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
   2184		struct epoll_event __user *, event)
   2185{
   2186	struct epoll_event epds;
   2187
   2188	if (ep_op_has_event(op) &&
   2189	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
   2190		return -EFAULT;
   2191
   2192	return do_epoll_ctl(epfd, op, fd, &epds, false);
   2193}
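
/*
 * Illustrative userspace sketch (assumes <sys/epoll.h>, a valid epfd and a
 * hypothetical listening socket sockfd; not kernel code). EPOLLEXCLUSIVE is
 * only accepted at EPOLL_CTL_ADD time, as enforced in do_epoll_ctl() above;
 * requesting it with EPOLL_CTL_MOD fails with EINVAL:
 *
 *	struct epoll_event ev = {
 *		.events = EPOLLIN | EPOLLEXCLUSIVE,
 *		.data.fd = sockfd,
 *	};
 *	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev) < 0)
 *		perror("epoll_ctl(ADD)");
 */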
   2194
   2195/*
   2196 * Implement the event wait interface for the eventpoll file. It is the kernel
   2197 * part of the user space epoll_wait(2).
   2198 */
   2199static int do_epoll_wait(int epfd, struct epoll_event __user *events,
   2200			 int maxevents, struct timespec64 *to)
   2201{
   2202	int error;
   2203	struct fd f;
   2204	struct eventpoll *ep;
   2205
   2206	/* The maximum number of events must be greater than zero */
   2207	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
   2208		return -EINVAL;
   2209
   2210	/* Verify that the area passed by the user is writeable */
   2211	if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
   2212		return -EFAULT;
   2213
   2214	/* Get the "struct file *" for the eventpoll file */
   2215	f = fdget(epfd);
   2216	if (!f.file)
   2217		return -EBADF;
   2218
   2219	/*
   2220	 * We have to check that the file structure underneath the fd
   2221	 * the user passed to us _is_ an eventpoll file.
   2222	 */
   2223	error = -EINVAL;
   2224	if (!is_file_epoll(f.file))
   2225		goto error_fput;
   2226
   2227	/*
   2228	 * At this point it is safe to assume that the "private_data" contains
   2229	 * our own data structure.
   2230	 */
   2231	ep = f.file->private_data;
   2232
   2233	/* Time to fish for events ... */
   2234	error = ep_poll(ep, events, maxevents, to);
   2235
   2236error_fput:
   2237	fdput(f);
   2238	return error;
   2239}
   2240
   2241SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
   2242		int, maxevents, int, timeout)
   2243{
   2244	struct timespec64 to;
   2245
   2246	return do_epoll_wait(epfd, events, maxevents,
   2247			     ep_timeout_to_timespec(&to, timeout));
   2248}
   2249
   2250/*
   2251 * Implement the event wait interface for the eventpoll file. It is the kernel
   2252 * part of the user space epoll_pwait(2).
   2253 */
   2254static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
   2255			  int maxevents, struct timespec64 *to,
   2256			  const sigset_t __user *sigmask, size_t sigsetsize)
   2257{
   2258	int error;
   2259
   2260	/*
   2261	 * If the caller wants a certain signal mask to be set during the wait,
   2262	 * we apply it here.
   2263	 */
   2264	error = set_user_sigmask(sigmask, sigsetsize);
   2265	if (error)
   2266		return error;
   2267
   2268	error = do_epoll_wait(epfd, events, maxevents, to);
   2269
   2270	restore_saved_sigmask_unless(error == -EINTR);
   2271
   2272	return error;
   2273}
   2274
   2275SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
   2276		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
   2277		size_t, sigsetsize)
   2278{
   2279	struct timespec64 to;
   2280
   2281	return do_epoll_pwait(epfd, events, maxevents,
   2282			      ep_timeout_to_timespec(&to, timeout),
   2283			      sigmask, sigsetsize);
   2284}
   2285
   2286SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
   2287		int, maxevents, const struct __kernel_timespec __user *, timeout,
   2288		const sigset_t __user *, sigmask, size_t, sigsetsize)
   2289{
   2290	struct timespec64 ts, *to = NULL;
   2291
   2292	if (timeout) {
   2293		if (get_timespec64(&ts, timeout))
   2294			return -EFAULT;
   2295		to = &ts;
   2296		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
   2297			return -EINVAL;
   2298	}
   2299
   2300	return do_epoll_pwait(epfd, events, maxevents, to,
   2301			      sigmask, sigsetsize);
   2302}
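
/*
 * Illustrative userspace sketch (assumes <sys/epoll.h>, <signal.h>, a valid
 * epfd and a glibc recent enough to expose the epoll_pwait2() wrapper; not
 * kernel code). The timespec gives nanosecond resolution and the signal mask
 * is applied atomically around the wait, as implemented above:
 *
 *	struct epoll_event evs[16];
 *	struct timespec ts = { .tv_sec = 0, .tv_nsec = 500 * 1000 * 1000 };
 *	sigset_t mask;
 *
 *	sigemptyset(&mask);
 *	sigaddset(&mask, SIGINT);	// keep SIGINT blocked while waiting
 *	if (epoll_pwait2(epfd, evs, 16, &ts, &mask) < 0)
 *		perror("epoll_pwait2");
 */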
   2303
   2304#ifdef CONFIG_COMPAT
   2305static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
   2306				 int maxevents, struct timespec64 *timeout,
   2307				 const compat_sigset_t __user *sigmask,
   2308				 compat_size_t sigsetsize)
   2309{
   2310	long err;
   2311
   2312	/*
   2313	 * If the caller wants a certain signal mask to be set during the wait,
   2314	 * we apply it here.
   2315	 */
   2316	err = set_compat_user_sigmask(sigmask, sigsetsize);
   2317	if (err)
   2318		return err;
   2319
   2320	err = do_epoll_wait(epfd, events, maxevents, timeout);
   2321
   2322	restore_saved_sigmask_unless(err == -EINTR);
   2323
   2324	return err;
   2325}
   2326
   2327COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
   2328		       struct epoll_event __user *, events,
   2329		       int, maxevents, int, timeout,
   2330		       const compat_sigset_t __user *, sigmask,
   2331		       compat_size_t, sigsetsize)
   2332{
   2333	struct timespec64 to;
   2334
   2335	return do_compat_epoll_pwait(epfd, events, maxevents,
   2336				     ep_timeout_to_timespec(&to, timeout),
   2337				     sigmask, sigsetsize);
   2338}
   2339
   2340COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
   2341		       struct epoll_event __user *, events,
   2342		       int, maxevents,
   2343		       const struct __kernel_timespec __user *, timeout,
   2344		       const compat_sigset_t __user *, sigmask,
   2345		       compat_size_t, sigsetsize)
   2346{
   2347	struct timespec64 ts, *to = NULL;
   2348
   2349	if (timeout) {
   2350		if (get_timespec64(&ts, timeout))
   2351			return -EFAULT;
   2352		to = &ts;
   2353		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
   2354			return -EINVAL;
   2355	}
   2356
   2357	return do_compat_epoll_pwait(epfd, events, maxevents, to,
   2358				     sigmask, sigsetsize);
   2359}
   2360
   2361#endif
   2362
   2363static int __init eventpoll_init(void)
   2364{
   2365	struct sysinfo si;
   2366
   2367	si_meminfo(&si);
   2368	/*
   2369	 * Allow the top 4% of low memory to be allocated for epoll watches (per user).
   2370	 */
   2371	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
   2372		EP_ITEM_COST;
   2373	BUG_ON(max_user_watches < 0);
   2374
   2375	/*
   2376	 * We can have many thousands of epitems, so prevent this from
   2377	 * using an extra cache line on 64-bit (and smaller) CPUs
   2378	 */
   2379	BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
   2380
   2381	/* Allocates slab cache used to allocate "struct epitem" items */
   2382	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
   2383			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
   2384
   2385	/* Allocates slab cache used to allocate "struct eppoll_entry" */
   2386	pwq_cache = kmem_cache_create("eventpoll_pwq",
   2387		sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
   2388	epoll_sysctls_init();
   2389
   2390	ephead_cache = kmem_cache_create("ep_head",
   2391		sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
   2392
   2393	return 0;
   2394}
   2395fs_initcall(eventpoll_init);
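
/*
 * Worked example of the max_user_watches sizing above (illustrative numbers):
 * (totalram - totalhigh) / 25 reserves 1/25th, i.e. 4%, of low memory. With
 * 4 GiB of low memory that is about 164 MiB; divided by EP_ITEM_COST (roughly
 * the combined size of a struct epitem and a struct eppoll_entry, a couple of
 * hundred bytes on 64-bit), each user gets on the order of several hundred
 * thousand watches before ep_insert() starts failing with -ENOSPC.
 */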