cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

eventfd.c (12324B)


// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/eventfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/idr.h>
#include <linux/uio.h>

static DEFINE_IDA(eventfd_ida);

struct eventfd_ctx {
	struct kref kref;
	wait_queue_head_t wqh;
	/*
	 * Every time that a write(2) is performed on an eventfd, the
	 * value of the __u64 being written is added to "count" and a
	 * wakeup is performed on "wqh". A read(2) will return the "count"
	 * value to userspace, and will reset "count" to zero. The kernel
	 * side eventfd_signal() also adds to the "count" counter and
	 * issues a wakeup.
	 */
	__u64 count;
	unsigned int flags;
	int id;
};

/**
 * eventfd_signal - Adds @n to the eventfd counter.
 * @ctx: [in] Pointer to the eventfd context.
 * @n: [in] Value of the counter to be added to the eventfd internal counter.
 *          The value cannot be negative.
 *
 * This function is supposed to be called by the kernel in paths that do not
 * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
 * value, and we signal this as an overflow condition by returning EPOLLERR
 * to poll(2).
 *
 * Returns the amount by which the counter was incremented.  This will be less
 * than @n if the counter has overflowed.
 */
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
	unsigned long flags;

	/*
	 * Deadlock or stack overflow issues can happen if we recurse here
	 * through waitqueue wakeup handlers. If the caller uses potentially
	 * nested waitqueues with custom wakeup handlers, then it should
	 * check eventfd_signal_allowed() before calling this function. If
	 * it returns false, the eventfd_signal() call should be deferred to a
	 * safe context.
	 */
	if (WARN_ON_ONCE(current->in_eventfd_signal))
		return 0;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	current->in_eventfd_signal = 1;
	if (ULLONG_MAX - ctx->count < n)
		n = ULLONG_MAX - ctx->count;
	ctx->count += n;
	if (waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
	current->in_eventfd_signal = 0;
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);

	return n;
}
EXPORT_SYMBOL_GPL(eventfd_signal);
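
/*
 * Illustrative sketch, not part of the original file: a typical in-kernel
 * producer pins the context once with eventfd_ctx_fdget() and then calls
 * eventfd_signal() from a non-sleeping path. The descriptor number "efd"
 * and the surrounding error handling are hypothetical.
 *
 *	struct eventfd_ctx *trigger = eventfd_ctx_fdget(efd);
 *
 *	if (IS_ERR(trigger))
 *		return PTR_ERR(trigger);
 *	...
 *	eventfd_signal(trigger, 1);	// wakes poll(2)/read(2) waiters
 *	...
 *	eventfd_ctx_put(trigger);	// drop the reference when done
 */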

static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
	if (ctx->id >= 0)
		ida_simple_remove(&eventfd_ida, ctx->id);
	kfree(ctx);
}

static void eventfd_free(struct kref *kref)
{
	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);

	eventfd_free_ctx(ctx);
}

/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
	kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);

static int eventfd_release(struct inode *inode, struct file *file)
{
	struct eventfd_ctx *ctx = file->private_data;

	wake_up_poll(&ctx->wqh, EPOLLHUP);
	eventfd_ctx_put(ctx);
	return 0;
}

static __poll_t eventfd_poll(struct file *file, poll_table *wait)
{
	struct eventfd_ctx *ctx = file->private_data;
	__poll_t events = 0;
	u64 count;

	poll_wait(file, &ctx->wqh, wait);

	/*
	 * All writes to ctx->count occur within ctx->wqh.lock.  This read
	 * can be done outside ctx->wqh.lock because we know that poll_wait
	 * takes that lock (through add_wait_queue) if our caller will sleep.
	 *
	 * The read _can_ therefore seep into add_wait_queue's critical
	 * section, but cannot move above it!  add_wait_queue's spin_lock acts
	 * as an acquire barrier and ensures that the read is ordered properly
	 * against the writes.  The following CAN happen and is safe:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     count = ctx->count
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        if (waitqueue_active)
	 *                                          wake_up_locked_poll
	 *                                        unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 *
	 * but the following, which would miss a wakeup, cannot happen:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     count = ctx->count (INVALID!)
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        **waitqueue_active is false**
	 *                                        **no wake_up_locked_poll!**
	 *                                        unlock ctx->wqh.lock
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 */
	count = READ_ONCE(ctx->count);

	if (count > 0)
		events |= EPOLLIN;
	if (count == ULLONG_MAX)
		events |= EPOLLERR;
	if (ULLONG_MAX - 1 > count)
		events |= EPOLLOUT;

	return events;
}

void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
	lockdep_assert_held(&ctx->wqh.lock);

	*cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
	ctx->count -= *cnt;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);
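
/*
 * Worked example, added for clarity (not in the original file): with the
 * counter at 3, a read(2) on a plain eventfd returns 3 and resets the
 * counter to 0, while a read(2) on an EFD_SEMAPHORE eventfd returns 1 and
 * leaves 2, so three semaphore-mode reads succeed before a fourth blocks
 * (or fails with -EAGAIN under O_NONBLOCK).
 */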

/**
 * eventfd_ctx_remove_wait_queue - Reads the current counter and removes the wait queue.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue to be removed.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error codes:
 *
 * -EAGAIN      : The operation would have blocked.
 *
 * This is used to atomically remove a wait queue entry from the eventfd wait
 * queue head, and read/reset the counter value.
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
				  __u64 *cnt)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	eventfd_ctx_do_read(ctx, cnt);
	__remove_wait_queue(&ctx->wqh, wait);
	if (*cnt != 0 && waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);

	return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);

static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct eventfd_ctx *ctx = file->private_data;
	__u64 ucnt = 0;
	DECLARE_WAITQUEUE(wait, current);

	if (iov_iter_count(to) < sizeof(ucnt))
		return -EINVAL;
	spin_lock_irq(&ctx->wqh.lock);
	if (!ctx->count) {
		if ((file->f_flags & O_NONBLOCK) ||
		    (iocb->ki_flags & IOCB_NOWAIT)) {
			spin_unlock_irq(&ctx->wqh.lock);
			return -EAGAIN;
		}
		__add_wait_queue(&ctx->wqh, &wait);
		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (ctx->count)
				break;
			if (signal_pending(current)) {
				__remove_wait_queue(&ctx->wqh, &wait);
				__set_current_state(TASK_RUNNING);
				spin_unlock_irq(&ctx->wqh.lock);
				return -ERESTARTSYS;
			}
			spin_unlock_irq(&ctx->wqh.lock);
			schedule();
			spin_lock_irq(&ctx->wqh.lock);
		}
		__remove_wait_queue(&ctx->wqh, &wait);
		__set_current_state(TASK_RUNNING);
	}
	eventfd_ctx_do_read(ctx, &ucnt);
	if (waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
	spin_unlock_irq(&ctx->wqh.lock);
	if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
		return -EFAULT;

	return sizeof(ucnt);
}

static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
			     loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	ssize_t res;
	__u64 ucnt;
	DECLARE_WAITQUEUE(wait, current);

	if (count < sizeof(ucnt))
		return -EINVAL;
	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
		return -EFAULT;
	if (ucnt == ULLONG_MAX)
		return -EINVAL;
	spin_lock_irq(&ctx->wqh.lock);
	res = -EAGAIN;
	if (ULLONG_MAX - ctx->count > ucnt)
		res = sizeof(ucnt);
	else if (!(file->f_flags & O_NONBLOCK)) {
		__add_wait_queue(&ctx->wqh, &wait);
		for (res = 0;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (ULLONG_MAX - ctx->count > ucnt) {
				res = sizeof(ucnt);
				break;
			}
			if (signal_pending(current)) {
				res = -ERESTARTSYS;
				break;
			}
			spin_unlock_irq(&ctx->wqh.lock);
			schedule();
			spin_lock_irq(&ctx->wqh.lock);
		}
		__remove_wait_queue(&ctx->wqh, &wait);
		__set_current_state(TASK_RUNNING);
	}
	if (likely(res > 0)) {
		ctx->count += ucnt;
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
	}
	spin_unlock_irq(&ctx->wqh.lock);

	return res;
}
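
/*
 * Note added for clarity, not in the original file: the guard
 * "ULLONG_MAX - ctx->count > ucnt" lets a write(2) through only while
 * count + ucnt <= ULLONG_MAX - 1, so the largest value the counter can
 * hold via write(2) is 0xfffffffffffffffe; a blocking writer sleeps until
 * readers drain the counter far enough to make room.
 */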

#ifdef CONFIG_PROC_FS
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventfd_ctx *ctx = f->private_data;

	spin_lock_irq(&ctx->wqh.lock);
	seq_printf(m, "eventfd-count: %16llx\n",
		   (unsigned long long)ctx->count);
	spin_unlock_irq(&ctx->wqh.lock);
	seq_printf(m, "eventfd-id: %d\n", ctx->id);
}
#endif

static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= eventfd_show_fdinfo,
#endif
	.release	= eventfd_release,
	.poll		= eventfd_poll,
	.read_iter	= eventfd_read,
	.write		= eventfd_write,
	.llseek		= noop_llseek,
};

/**
 * eventfd_fget - Acquire a reference to an eventfd file descriptor.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the eventfd file structure in case of success, or the
 * following error pointer:
 *
 * -EBADF    : Invalid @fd file descriptor.
 * -EINVAL   : The @fd file descriptor is not an eventfd file.
 */
struct file *eventfd_fget(int fd)
{
	struct file *file;

	file = fget(fd);
	if (!file)
		return ERR_PTR(-EBADF);
	if (file->f_op != &eventfd_fops) {
		fput(file);
		return ERR_PTR(-EINVAL);
	}

	return file;
}
EXPORT_SYMBOL_GPL(eventfd_fget);

/**
 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointers returned by the following functions:
 *
 * eventfd_fget
 */
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
	struct eventfd_ctx *ctx;
	struct fd f = fdget(fd);
	if (!f.file)
		return ERR_PTR(-EBADF);
	ctx = eventfd_ctx_fileget(f.file);
	fdput(f);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);

/**
 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 * @file: [in] Eventfd file pointer.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointer:
 *
 * -EINVAL   : The @file pointer is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
	struct eventfd_ctx *ctx;

	if (file->f_op != &eventfd_fops)
		return ERR_PTR(-EINVAL);

	ctx = file->private_data;
	kref_get(&ctx->kref);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);

static int do_eventfd(unsigned int count, int flags)
{
	struct eventfd_ctx *ctx;
	struct file *file;
	int fd;

	/* Check the EFD_* constants for consistency.  */
	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);

	if (flags & ~EFD_FLAGS_SET)
		return -EINVAL;

	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	kref_init(&ctx->kref);
	init_waitqueue_head(&ctx->wqh);
	ctx->count = count;
	ctx->flags = flags;
	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);

	flags &= EFD_SHARED_FCNTL_FLAGS;
	flags |= O_RDWR;
	fd = get_unused_fd_flags(flags);
	if (fd < 0)
		goto err;

	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
	if (IS_ERR(file)) {
		put_unused_fd(fd);
		fd = PTR_ERR(file);
		goto err;
	}

	file->f_mode |= FMODE_NOWAIT;
	fd_install(fd, file);
	return fd;
err:
	eventfd_free_ctx(ctx);
	return fd;
}

SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
	return do_eventfd(count, flags);
}

SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
	return do_eventfd(count, 0);
}
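
The following is a minimal userspace sketch of the interface implemented above,
added for illustration only; it assumes the glibc eventfd() wrapper from
<sys/eventfd.h> and omits error handling for the write(2)/read(2) calls.

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	uint64_t val;
	int efd = eventfd(0, EFD_CLOEXEC);	/* serviced by do_eventfd() above */

	if (efd < 0)
		return 1;

	val = 3;
	write(efd, &val, sizeof(val));	/* counter becomes 3 */
	val = 4;
	write(efd, &val, sizeof(val));	/* counter becomes 7 */

	read(efd, &val, sizeof(val));	/* returns 7 and resets the counter to 0 */
	printf("read %llu\n", (unsigned long long)val);

	close(efd);
	return 0;
}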