cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

pipe.c (36899B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 *  linux/fs/pipe.c
      4 *
      5 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
      6 */
      7
      8#include <linux/mm.h>
      9#include <linux/file.h>
     10#include <linux/poll.h>
     11#include <linux/slab.h>
     12#include <linux/module.h>
     13#include <linux/init.h>
     14#include <linux/fs.h>
     15#include <linux/log2.h>
     16#include <linux/mount.h>
     17#include <linux/pseudo_fs.h>
     18#include <linux/magic.h>
     19#include <linux/pipe_fs_i.h>
     20#include <linux/uio.h>
     21#include <linux/highmem.h>
     22#include <linux/pagemap.h>
     23#include <linux/audit.h>
     24#include <linux/syscalls.h>
     25#include <linux/fcntl.h>
     26#include <linux/memcontrol.h>
     27#include <linux/watch_queue.h>
     28#include <linux/sysctl.h>
     29
     30#include <linux/uaccess.h>
     31#include <asm/ioctls.h>
     32
     33#include "internal.h"
     34
     35/*
     36 * New pipe buffers will be restricted to this size while the user is exceeding
     37 * their pipe buffer quota. The general pipe use case needs at least two
     38 * buffers: one for data yet to be read, and one for new data. If this is less
     39 * than two, then a write to a non-empty pipe may block even if the pipe is not
     40 * full. This can occur with GNU make jobserver or similar uses of pipes as
     41 * semaphores: multiple processes may be waiting to write tokens back to the
     42 * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
     43 *
     44 * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
     45 * own risk, namely: pipe writes to non-full pipes may block until the pipe is
     46 * emptied.
     47 */
     48#define PIPE_MIN_DEF_BUFFERS 2
     49
     50/*
     51 * The max size that a non-root user is allowed to grow the pipe. Can
     52 * be set by root in /proc/sys/fs/pipe-max-size
     53 */
     54static unsigned int pipe_max_size = 1048576;
     55
     56/* Maximum allocatable pages per user. Hard limit is unset by default, soft
     57 * matches default values.
     58 */
     59static unsigned long pipe_user_pages_hard;
     60static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
     61
     62/*
     63 * We use head and tail indices that aren't masked off, except at the point of
     64 * dereference, but rather they're allowed to wrap naturally.  This means there
     65 * isn't a dead spot in the buffer, but the ring has to be a power of two and
     66 * <= 2^31.
     67 * -- David Howells 2019-09-23.
     68 *
     69 * Reads with count = 0 should always return 0.
     70 * -- Julian Bradfield 1999-06-07.
     71 *
     72 * FIFOs and Pipes now generate SIGIO for both readers and writers.
     73 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
     74 *
     75 * pipe_read & write cleanup
     76 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
     77 */
     78
     79static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
     80{
     81	if (pipe->files)
     82		mutex_lock_nested(&pipe->mutex, subclass);
     83}
     84
     85void pipe_lock(struct pipe_inode_info *pipe)
     86{
     87	/*
     88	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
     89	 */
     90	pipe_lock_nested(pipe, I_MUTEX_PARENT);
     91}
     92EXPORT_SYMBOL(pipe_lock);
     93
     94void pipe_unlock(struct pipe_inode_info *pipe)
     95{
     96	if (pipe->files)
     97		mutex_unlock(&pipe->mutex);
     98}
     99EXPORT_SYMBOL(pipe_unlock);
    100
    101static inline void __pipe_lock(struct pipe_inode_info *pipe)
    102{
    103	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
    104}
    105
    106static inline void __pipe_unlock(struct pipe_inode_info *pipe)
    107{
    108	mutex_unlock(&pipe->mutex);
    109}
    110
    111void pipe_double_lock(struct pipe_inode_info *pipe1,
    112		      struct pipe_inode_info *pipe2)
    113{
    114	BUG_ON(pipe1 == pipe2);
    115
    116	if (pipe1 < pipe2) {
    117		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
    118		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
    119	} else {
    120		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
    121		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
    122	}
    123}
    124
    125static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
    126				  struct pipe_buffer *buf)
    127{
    128	struct page *page = buf->page;
    129
    130	/*
    131	 * If nobody else uses this page, and we don't already have a
    132	 * temporary page, let's keep track of it as a one-deep
    133	 * allocation cache. (Otherwise just release our reference to it)
    134	 */
    135	if (page_count(page) == 1 && !pipe->tmp_page)
    136		pipe->tmp_page = page;
    137	else
    138		put_page(page);
    139}
    140
    141static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
    142		struct pipe_buffer *buf)
    143{
    144	struct page *page = buf->page;
    145
    146	if (page_count(page) != 1)
    147		return false;
    148	memcg_kmem_uncharge_page(page, 0);
    149	__SetPageLocked(page);
    150	return true;
    151}
    152
    153/**
    154 * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
    155 * @pipe:	the pipe that the buffer belongs to
    156 * @buf:	the buffer to attempt to steal
    157 *
    158 * Description:
    159 *	This function attempts to steal the &struct page attached to
    160 *	@buf. If successful, this function returns 0 and returns with
    161 *	the page locked. The caller may then reuse the page for whatever
    162 *	he wishes; the typical use is insertion into a different file
    163 *	page cache.
    164 */
    165bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
    166		struct pipe_buffer *buf)
    167{
    168	struct page *page = buf->page;
    169
    170	/*
    171	 * A reference of one is golden, that means that the owner of this
    172	 * page is the only one holding a reference to it. lock the page
    173	 * and return OK.
    174	 */
    175	if (page_count(page) == 1) {
    176		lock_page(page);
    177		return true;
    178	}
    179	return false;
    180}
    181EXPORT_SYMBOL(generic_pipe_buf_try_steal);
    182
    183/**
    184 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
    185 * @pipe:	the pipe that the buffer belongs to
    186 * @buf:	the buffer to get a reference to
    187 *
    188 * Description:
    189 *	This function grabs an extra reference to @buf. It's used in
    190 *	the tee() system call, when we duplicate the buffers in one
    191 *	pipe into another.
    192 */
    193bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
    194{
    195	return try_get_page(buf->page);
    196}
    197EXPORT_SYMBOL(generic_pipe_buf_get);
    198
    199/**
    200 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
    201 * @pipe:	the pipe that the buffer belongs to
    202 * @buf:	the buffer to put a reference to
    203 *
    204 * Description:
    205 *	This function releases a reference to @buf.
    206 */
    207void generic_pipe_buf_release(struct pipe_inode_info *pipe,
    208			      struct pipe_buffer *buf)
    209{
    210	put_page(buf->page);
    211}
    212EXPORT_SYMBOL(generic_pipe_buf_release);
    213
    214static const struct pipe_buf_operations anon_pipe_buf_ops = {
    215	.release	= anon_pipe_buf_release,
    216	.try_steal	= anon_pipe_buf_try_steal,
    217	.get		= generic_pipe_buf_get,
    218};
    219
    220/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
    221static inline bool pipe_readable(const struct pipe_inode_info *pipe)
    222{
    223	unsigned int head = READ_ONCE(pipe->head);
    224	unsigned int tail = READ_ONCE(pipe->tail);
    225	unsigned int writers = READ_ONCE(pipe->writers);
    226
    227	return !pipe_empty(head, tail) || !writers;
    228}
    229
    230static ssize_t
    231pipe_read(struct kiocb *iocb, struct iov_iter *to)
    232{
    233	size_t total_len = iov_iter_count(to);
    234	struct file *filp = iocb->ki_filp;
    235	struct pipe_inode_info *pipe = filp->private_data;
    236	bool was_full, wake_next_reader = false;
    237	ssize_t ret;
    238
    239	/* Null read succeeds. */
    240	if (unlikely(total_len == 0))
    241		return 0;
    242
    243	ret = 0;
    244	__pipe_lock(pipe);
    245
    246	/*
    247	 * We only wake up writers if the pipe was full when we started
    248	 * reading in order to avoid unnecessary wakeups.
    249	 *
    250	 * But when we do wake up writers, we do so using a sync wakeup
    251	 * (WF_SYNC), because we want them to get going and generate more
    252	 * data for us.
    253	 */
    254	was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
    255	for (;;) {
    256		/* Read ->head with a barrier vs post_one_notification() */
    257		unsigned int head = smp_load_acquire(&pipe->head);
    258		unsigned int tail = pipe->tail;
    259		unsigned int mask = pipe->ring_size - 1;
    260
    261#ifdef CONFIG_WATCH_QUEUE
    262		if (pipe->note_loss) {
    263			struct watch_notification n;
    264
    265			if (total_len < 8) {
    266				if (ret == 0)
    267					ret = -ENOBUFS;
    268				break;
    269			}
    270
    271			n.type = WATCH_TYPE_META;
    272			n.subtype = WATCH_META_LOSS_NOTIFICATION;
    273			n.info = watch_sizeof(n);
    274			if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
    275				if (ret == 0)
    276					ret = -EFAULT;
    277				break;
    278			}
    279			ret += sizeof(n);
    280			total_len -= sizeof(n);
    281			pipe->note_loss = false;
    282		}
    283#endif
    284
    285		if (!pipe_empty(head, tail)) {
    286			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
    287			size_t chars = buf->len;
    288			size_t written;
    289			int error;
    290
    291			if (chars > total_len) {
    292				if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
    293					if (ret == 0)
    294						ret = -ENOBUFS;
    295					break;
    296				}
    297				chars = total_len;
    298			}
    299
    300			error = pipe_buf_confirm(pipe, buf);
    301			if (error) {
    302				if (!ret)
    303					ret = error;
    304				break;
    305			}
    306
    307			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
    308			if (unlikely(written < chars)) {
    309				if (!ret)
    310					ret = -EFAULT;
    311				break;
    312			}
    313			ret += chars;
    314			buf->offset += chars;
    315			buf->len -= chars;
    316
    317			/* Was it a packet buffer? Clean up and exit */
    318			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
    319				total_len = chars;
    320				buf->len = 0;
    321			}
    322
    323			if (!buf->len) {
    324				pipe_buf_release(pipe, buf);
    325				spin_lock_irq(&pipe->rd_wait.lock);
    326#ifdef CONFIG_WATCH_QUEUE
    327				if (buf->flags & PIPE_BUF_FLAG_LOSS)
    328					pipe->note_loss = true;
    329#endif
    330				tail++;
    331				pipe->tail = tail;
    332				spin_unlock_irq(&pipe->rd_wait.lock);
    333			}
    334			total_len -= chars;
    335			if (!total_len)
    336				break;	/* common path: read succeeded */
    337			if (!pipe_empty(head, tail))	/* More to do? */
    338				continue;
    339		}
    340
    341		if (!pipe->writers)
    342			break;
    343		if (ret)
    344			break;
    345		if (filp->f_flags & O_NONBLOCK) {
    346			ret = -EAGAIN;
    347			break;
    348		}
    349		__pipe_unlock(pipe);
    350
    351		/*
    352		 * We only get here if we didn't actually read anything.
    353		 *
    354		 * However, we could have seen (and removed) a zero-sized
    355		 * pipe buffer, and might have made space in the buffers
    356		 * that way.
    357		 *
    358		 * You can't make zero-sized pipe buffers by doing an empty
    359		 * write (not even in packet mode), but they can happen if
    360		 * the writer gets an EFAULT when trying to fill a buffer
    361		 * that already got allocated and inserted in the buffer
    362		 * array.
    363		 *
    364		 * So we still need to wake up any pending writers in the
    365		 * _very_ unlikely case that the pipe was full, but we got
    366		 * no data.
    367		 */
    368		if (unlikely(was_full))
    369			wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
    370		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
    371
    372		/*
    373		 * But because we didn't read anything, at this point we can
    374		 * just return directly with -ERESTARTSYS if we're interrupted,
    375		 * since we've done any required wakeups and there's no need
    376		 * to mark anything accessed. And we've dropped the lock.
    377		 */
    378		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
    379			return -ERESTARTSYS;
    380
    381		__pipe_lock(pipe);
    382		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
    383		wake_next_reader = true;
    384	}
    385	if (pipe_empty(pipe->head, pipe->tail))
    386		wake_next_reader = false;
    387	__pipe_unlock(pipe);
    388
    389	if (was_full)
    390		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
    391	if (wake_next_reader)
    392		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
    393	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
    394	if (ret > 0)
    395		file_accessed(filp);
    396	return ret;
    397}
    398
    399static inline int is_packetized(struct file *file)
    400{
    401	return (file->f_flags & O_DIRECT) != 0;
    402}
    403
    404/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
    405static inline bool pipe_writable(const struct pipe_inode_info *pipe)
    406{
    407	unsigned int head = READ_ONCE(pipe->head);
    408	unsigned int tail = READ_ONCE(pipe->tail);
    409	unsigned int max_usage = READ_ONCE(pipe->max_usage);
    410
    411	return !pipe_full(head, tail, max_usage) ||
    412		!READ_ONCE(pipe->readers);
    413}
    414
    415static ssize_t
    416pipe_write(struct kiocb *iocb, struct iov_iter *from)
    417{
    418	struct file *filp = iocb->ki_filp;
    419	struct pipe_inode_info *pipe = filp->private_data;
    420	unsigned int head;
    421	ssize_t ret = 0;
    422	size_t total_len = iov_iter_count(from);
    423	ssize_t chars;
    424	bool was_empty = false;
    425	bool wake_next_writer = false;
    426
    427	/* Null write succeeds. */
    428	if (unlikely(total_len == 0))
    429		return 0;
    430
    431	__pipe_lock(pipe);
    432
    433	if (!pipe->readers) {
    434		send_sig(SIGPIPE, current, 0);
    435		ret = -EPIPE;
    436		goto out;
    437	}
    438
    439#ifdef CONFIG_WATCH_QUEUE
    440	if (pipe->watch_queue) {
    441		ret = -EXDEV;
    442		goto out;
    443	}
    444#endif
    445
    446	/*
    447	 * If it wasn't empty we try to merge new data into
    448	 * the last buffer.
    449	 *
    450	 * That naturally merges small writes, but it also
    451	 * page-aligns the rest of the writes for large writes
    452	 * spanning multiple pages.
    453	 */
    454	head = pipe->head;
    455	was_empty = pipe_empty(head, pipe->tail);
    456	chars = total_len & (PAGE_SIZE-1);
    457	if (chars && !was_empty) {
    458		unsigned int mask = pipe->ring_size - 1;
    459		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
    460		int offset = buf->offset + buf->len;
    461
    462		if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
    463		    offset + chars <= PAGE_SIZE) {
    464			ret = pipe_buf_confirm(pipe, buf);
    465			if (ret)
    466				goto out;
    467
    468			ret = copy_page_from_iter(buf->page, offset, chars, from);
    469			if (unlikely(ret < chars)) {
    470				ret = -EFAULT;
    471				goto out;
    472			}
    473
    474			buf->len += ret;
    475			if (!iov_iter_count(from))
    476				goto out;
    477		}
    478	}
    479
    480	for (;;) {
    481		if (!pipe->readers) {
    482			send_sig(SIGPIPE, current, 0);
    483			if (!ret)
    484				ret = -EPIPE;
    485			break;
    486		}
    487
    488		head = pipe->head;
    489		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
    490			unsigned int mask = pipe->ring_size - 1;
    491			struct pipe_buffer *buf = &pipe->bufs[head & mask];
    492			struct page *page = pipe->tmp_page;
    493			int copied;
    494
    495			if (!page) {
    496				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
    497				if (unlikely(!page)) {
    498					ret = ret ? : -ENOMEM;
    499					break;
    500				}
    501				pipe->tmp_page = page;
    502			}
    503
    504			/* Allocate a slot in the ring in advance and attach an
    505			 * empty buffer.  If we fault or otherwise fail to use
    506			 * it, either the reader will consume it or it'll still
    507			 * be there for the next write.
    508			 */
    509			spin_lock_irq(&pipe->rd_wait.lock);
    510
    511			head = pipe->head;
    512			if (pipe_full(head, pipe->tail, pipe->max_usage)) {
    513				spin_unlock_irq(&pipe->rd_wait.lock);
    514				continue;
    515			}
    516
    517			pipe->head = head + 1;
    518			spin_unlock_irq(&pipe->rd_wait.lock);
    519
    520			/* Insert it into the buffer array */
    521			buf = &pipe->bufs[head & mask];
    522			buf->page = page;
    523			buf->ops = &anon_pipe_buf_ops;
    524			buf->offset = 0;
    525			buf->len = 0;
    526			if (is_packetized(filp))
    527				buf->flags = PIPE_BUF_FLAG_PACKET;
    528			else
    529				buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
    530			pipe->tmp_page = NULL;
    531
    532			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
    533			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
    534				if (!ret)
    535					ret = -EFAULT;
    536				break;
    537			}
    538			ret += copied;
    539			buf->offset = 0;
    540			buf->len = copied;
    541
    542			if (!iov_iter_count(from))
    543				break;
    544		}
    545
    546		if (!pipe_full(head, pipe->tail, pipe->max_usage))
    547			continue;
    548
    549		/* Wait for buffer space to become available. */
    550		if (filp->f_flags & O_NONBLOCK) {
    551			if (!ret)
    552				ret = -EAGAIN;
    553			break;
    554		}
    555		if (signal_pending(current)) {
    556			if (!ret)
    557				ret = -ERESTARTSYS;
    558			break;
    559		}
    560
    561		/*
    562		 * We're going to release the pipe lock and wait for more
    563		 * space. We wake up any readers if necessary, and then
    564		 * after waiting we need to re-check whether the pipe
    565		 * become empty while we dropped the lock.
    566		 */
    567		__pipe_unlock(pipe);
    568		if (was_empty)
    569			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
    570		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
    571		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
    572		__pipe_lock(pipe);
    573		was_empty = pipe_empty(pipe->head, pipe->tail);
    574		wake_next_writer = true;
    575	}
    576out:
    577	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
    578		wake_next_writer = false;
    579	__pipe_unlock(pipe);
    580
    581	/*
    582	 * If we do do a wakeup event, we do a 'sync' wakeup, because we
    583	 * want the reader to start processing things asap, rather than
    584	 * leave the data pending.
    585	 *
    586	 * This is particularly important for small writes, because of
    587	 * how (for example) the GNU make jobserver uses small writes to
    588	 * wake up pending jobs
    589	 *
    590	 * Epoll nonsensically wants a wakeup whether the pipe
    591	 * was already empty or not.
    592	 */
    593	if (was_empty || pipe->poll_usage)
    594		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
    595	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
    596	if (wake_next_writer)
    597		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
    598	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
    599		int err = file_update_time(filp);
    600		if (err)
    601			ret = err;
    602		sb_end_write(file_inode(filp)->i_sb);
    603	}
    604	return ret;
    605}
    606
    607static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
    608{
    609	struct pipe_inode_info *pipe = filp->private_data;
    610	unsigned int count, head, tail, mask;
    611
    612	switch (cmd) {
    613	case FIONREAD:
    614		__pipe_lock(pipe);
    615		count = 0;
    616		head = pipe->head;
    617		tail = pipe->tail;
    618		mask = pipe->ring_size - 1;
    619
    620		while (tail != head) {
    621			count += pipe->bufs[tail & mask].len;
    622			tail++;
    623		}
    624		__pipe_unlock(pipe);
    625
    626		return put_user(count, (int __user *)arg);
    627
    628#ifdef CONFIG_WATCH_QUEUE
    629	case IOC_WATCH_QUEUE_SET_SIZE: {
    630		int ret;
    631		__pipe_lock(pipe);
    632		ret = watch_queue_set_size(pipe, arg);
    633		__pipe_unlock(pipe);
    634		return ret;
    635	}
    636
    637	case IOC_WATCH_QUEUE_SET_FILTER:
    638		return watch_queue_set_filter(
    639			pipe, (struct watch_notification_filter __user *)arg);
    640#endif
    641
    642	default:
    643		return -ENOIOCTLCMD;
    644	}
    645}
    646
    647/* No kernel lock held - fine */
    648static __poll_t
    649pipe_poll(struct file *filp, poll_table *wait)
    650{
    651	__poll_t mask;
    652	struct pipe_inode_info *pipe = filp->private_data;
    653	unsigned int head, tail;
    654
    655	/* Epoll has some historical nasty semantics, this enables them */
    656	WRITE_ONCE(pipe->poll_usage, true);
    657
    658	/*
    659	 * Reading pipe state only -- no need for acquiring the semaphore.
    660	 *
    661	 * But because this is racy, the code has to add the
    662	 * entry to the poll table _first_ ..
    663	 */
    664	if (filp->f_mode & FMODE_READ)
    665		poll_wait(filp, &pipe->rd_wait, wait);
    666	if (filp->f_mode & FMODE_WRITE)
    667		poll_wait(filp, &pipe->wr_wait, wait);
    668
    669	/*
    670	 * .. and only then can you do the racy tests. That way,
    671	 * if something changes and you got it wrong, the poll
    672	 * table entry will wake you up and fix it.
    673	 */
    674	head = READ_ONCE(pipe->head);
    675	tail = READ_ONCE(pipe->tail);
    676
    677	mask = 0;
    678	if (filp->f_mode & FMODE_READ) {
    679		if (!pipe_empty(head, tail))
    680			mask |= EPOLLIN | EPOLLRDNORM;
    681		if (!pipe->writers && filp->f_version != pipe->w_counter)
    682			mask |= EPOLLHUP;
    683	}
    684
    685	if (filp->f_mode & FMODE_WRITE) {
    686		if (!pipe_full(head, tail, pipe->max_usage))
    687			mask |= EPOLLOUT | EPOLLWRNORM;
    688		/*
    689		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
    690		 * behave exactly like pipes for poll().
    691		 */
    692		if (!pipe->readers)
    693			mask |= EPOLLERR;
    694	}
    695
    696	return mask;
    697}
    698
    699static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
    700{
    701	int kill = 0;
    702
    703	spin_lock(&inode->i_lock);
    704	if (!--pipe->files) {
    705		inode->i_pipe = NULL;
    706		kill = 1;
    707	}
    708	spin_unlock(&inode->i_lock);
    709
    710	if (kill)
    711		free_pipe_info(pipe);
    712}
    713
    714static int
    715pipe_release(struct inode *inode, struct file *file)
    716{
    717	struct pipe_inode_info *pipe = file->private_data;
    718
    719	__pipe_lock(pipe);
    720	if (file->f_mode & FMODE_READ)
    721		pipe->readers--;
    722	if (file->f_mode & FMODE_WRITE)
    723		pipe->writers--;
    724
    725	/* Was that the last reader or writer, but not the other side? */
    726	if (!pipe->readers != !pipe->writers) {
    727		wake_up_interruptible_all(&pipe->rd_wait);
    728		wake_up_interruptible_all(&pipe->wr_wait);
    729		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
    730		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
    731	}
    732	__pipe_unlock(pipe);
    733
    734	put_pipe_info(inode, pipe);
    735	return 0;
    736}
    737
    738static int
    739pipe_fasync(int fd, struct file *filp, int on)
    740{
    741	struct pipe_inode_info *pipe = filp->private_data;
    742	int retval = 0;
    743
    744	__pipe_lock(pipe);
    745	if (filp->f_mode & FMODE_READ)
    746		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
    747	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
    748		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
    749		if (retval < 0 && (filp->f_mode & FMODE_READ))
    750			/* this can happen only if on == T */
    751			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
    752	}
    753	__pipe_unlock(pipe);
    754	return retval;
    755}
    756
    757unsigned long account_pipe_buffers(struct user_struct *user,
    758				   unsigned long old, unsigned long new)
    759{
    760	return atomic_long_add_return(new - old, &user->pipe_bufs);
    761}
    762
    763bool too_many_pipe_buffers_soft(unsigned long user_bufs)
    764{
    765	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
    766
    767	return soft_limit && user_bufs > soft_limit;
    768}
    769
    770bool too_many_pipe_buffers_hard(unsigned long user_bufs)
    771{
    772	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
    773
    774	return hard_limit && user_bufs > hard_limit;
    775}
    776
    777bool pipe_is_unprivileged_user(void)
    778{
    779	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
    780}
    781
    782struct pipe_inode_info *alloc_pipe_info(void)
    783{
    784	struct pipe_inode_info *pipe;
    785	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
    786	struct user_struct *user = get_current_user();
    787	unsigned long user_bufs;
    788	unsigned int max_size = READ_ONCE(pipe_max_size);
    789
    790	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
    791	if (pipe == NULL)
    792		goto out_free_uid;
    793
    794	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
    795		pipe_bufs = max_size >> PAGE_SHIFT;
    796
    797	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
    798
    799	if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
    800		user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
    801		pipe_bufs = PIPE_MIN_DEF_BUFFERS;
    802	}
    803
    804	if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
    805		goto out_revert_acct;
    806
    807	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
    808			     GFP_KERNEL_ACCOUNT);
    809
    810	if (pipe->bufs) {
    811		init_waitqueue_head(&pipe->rd_wait);
    812		init_waitqueue_head(&pipe->wr_wait);
    813		pipe->r_counter = pipe->w_counter = 1;
    814		pipe->max_usage = pipe_bufs;
    815		pipe->ring_size = pipe_bufs;
    816		pipe->nr_accounted = pipe_bufs;
    817		pipe->user = user;
    818		mutex_init(&pipe->mutex);
    819		return pipe;
    820	}
    821
    822out_revert_acct:
    823	(void) account_pipe_buffers(user, pipe_bufs, 0);
    824	kfree(pipe);
    825out_free_uid:
    826	free_uid(user);
    827	return NULL;
    828}
    829
    830void free_pipe_info(struct pipe_inode_info *pipe)
    831{
    832	unsigned int i;
    833
    834#ifdef CONFIG_WATCH_QUEUE
    835	if (pipe->watch_queue)
    836		watch_queue_clear(pipe->watch_queue);
    837#endif
    838
    839	(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
    840	free_uid(pipe->user);
    841	for (i = 0; i < pipe->ring_size; i++) {
    842		struct pipe_buffer *buf = pipe->bufs + i;
    843		if (buf->ops)
    844			pipe_buf_release(pipe, buf);
    845	}
    846#ifdef CONFIG_WATCH_QUEUE
    847	if (pipe->watch_queue)
    848		put_watch_queue(pipe->watch_queue);
    849#endif
    850	if (pipe->tmp_page)
    851		__free_page(pipe->tmp_page);
    852	kfree(pipe->bufs);
    853	kfree(pipe);
    854}
    855
    856static struct vfsmount *pipe_mnt __read_mostly;
    857
    858/*
    859 * pipefs_dname() is called from d_path().
    860 */
    861static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
    862{
    863	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
    864				d_inode(dentry)->i_ino);
    865}
    866
    867static const struct dentry_operations pipefs_dentry_operations = {
    868	.d_dname	= pipefs_dname,
    869};
    870
    871static struct inode * get_pipe_inode(void)
    872{
    873	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
    874	struct pipe_inode_info *pipe;
    875
    876	if (!inode)
    877		goto fail_inode;
    878
    879	inode->i_ino = get_next_ino();
    880
    881	pipe = alloc_pipe_info();
    882	if (!pipe)
    883		goto fail_iput;
    884
    885	inode->i_pipe = pipe;
    886	pipe->files = 2;
    887	pipe->readers = pipe->writers = 1;
    888	inode->i_fop = &pipefifo_fops;
    889
    890	/*
    891	 * Mark the inode dirty from the very beginning,
    892	 * that way it will never be moved to the dirty
    893	 * list because "mark_inode_dirty()" will think
    894	 * that it already _is_ on the dirty list.
    895	 */
    896	inode->i_state = I_DIRTY;
    897	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
    898	inode->i_uid = current_fsuid();
    899	inode->i_gid = current_fsgid();
    900	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
    901
    902	return inode;
    903
    904fail_iput:
    905	iput(inode);
    906
    907fail_inode:
    908	return NULL;
    909}
    910
    911int create_pipe_files(struct file **res, int flags)
    912{
    913	struct inode *inode = get_pipe_inode();
    914	struct file *f;
    915	int error;
    916
    917	if (!inode)
    918		return -ENFILE;
    919
    920	if (flags & O_NOTIFICATION_PIPE) {
    921		error = watch_queue_init(inode->i_pipe);
    922		if (error) {
    923			free_pipe_info(inode->i_pipe);
    924			iput(inode);
    925			return error;
    926		}
    927	}
    928
    929	f = alloc_file_pseudo(inode, pipe_mnt, "",
    930				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
    931				&pipefifo_fops);
    932	if (IS_ERR(f)) {
    933		free_pipe_info(inode->i_pipe);
    934		iput(inode);
    935		return PTR_ERR(f);
    936	}
    937
    938	f->private_data = inode->i_pipe;
    939
    940	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
    941				  &pipefifo_fops);
    942	if (IS_ERR(res[0])) {
    943		put_pipe_info(inode, inode->i_pipe);
    944		fput(f);
    945		return PTR_ERR(res[0]);
    946	}
    947	res[0]->private_data = inode->i_pipe;
    948	res[1] = f;
    949	stream_open(inode, res[0]);
    950	stream_open(inode, res[1]);
    951	return 0;
    952}
    953
    954static int __do_pipe_flags(int *fd, struct file **files, int flags)
    955{
    956	int error;
    957	int fdw, fdr;
    958
    959	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
    960		return -EINVAL;
    961
    962	error = create_pipe_files(files, flags);
    963	if (error)
    964		return error;
    965
    966	error = get_unused_fd_flags(flags);
    967	if (error < 0)
    968		goto err_read_pipe;
    969	fdr = error;
    970
    971	error = get_unused_fd_flags(flags);
    972	if (error < 0)
    973		goto err_fdr;
    974	fdw = error;
    975
    976	audit_fd_pair(fdr, fdw);
    977	fd[0] = fdr;
    978	fd[1] = fdw;
    979	return 0;
    980
    981 err_fdr:
    982	put_unused_fd(fdr);
    983 err_read_pipe:
    984	fput(files[0]);
    985	fput(files[1]);
    986	return error;
    987}
    988
    989int do_pipe_flags(int *fd, int flags)
    990{
    991	struct file *files[2];
    992	int error = __do_pipe_flags(fd, files, flags);
    993	if (!error) {
    994		fd_install(fd[0], files[0]);
    995		fd_install(fd[1], files[1]);
    996	}
    997	return error;
    998}
    999
   1000/*
   1001 * sys_pipe() is the normal C calling standard for creating
   1002 * a pipe. It's not the way Unix traditionally does this, though.
   1003 */
   1004static int do_pipe2(int __user *fildes, int flags)
   1005{
   1006	struct file *files[2];
   1007	int fd[2];
   1008	int error;
   1009
   1010	error = __do_pipe_flags(fd, files, flags);
   1011	if (!error) {
   1012		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
   1013			fput(files[0]);
   1014			fput(files[1]);
   1015			put_unused_fd(fd[0]);
   1016			put_unused_fd(fd[1]);
   1017			error = -EFAULT;
   1018		} else {
   1019			fd_install(fd[0], files[0]);
   1020			fd_install(fd[1], files[1]);
   1021		}
   1022	}
   1023	return error;
   1024}
   1025
   1026SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
   1027{
   1028	return do_pipe2(fildes, flags);
   1029}
   1030
   1031SYSCALL_DEFINE1(pipe, int __user *, fildes)
   1032{
   1033	return do_pipe2(fildes, 0);
   1034}
   1035
   1036/*
   1037 * This is the stupid "wait for pipe to be readable or writable"
   1038 * model.
   1039 *
   1040 * See pipe_read/write() for the proper kind of exclusive wait,
   1041 * but that requires that we wake up any other readers/writers
   1042 * if we then do not end up reading everything (ie the whole
   1043 * "wake_next_reader/writer" logic in pipe_read/write()).
   1044 */
   1045void pipe_wait_readable(struct pipe_inode_info *pipe)
   1046{
   1047	pipe_unlock(pipe);
   1048	wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
   1049	pipe_lock(pipe);
   1050}
   1051
   1052void pipe_wait_writable(struct pipe_inode_info *pipe)
   1053{
   1054	pipe_unlock(pipe);
   1055	wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
   1056	pipe_lock(pipe);
   1057}
   1058
   1059/*
   1060 * This depends on both the wait (here) and the wakeup (wake_up_partner)
   1061 * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
   1062 * race with the count check and waitqueue prep.
   1063 *
   1064 * Normally in order to avoid races, you'd do the prepare_to_wait() first,
   1065 * then check the condition you're waiting for, and only then sleep. But
   1066 * because of the pipe lock, we can check the condition before being on
   1067 * the wait queue.
   1068 *
   1069 * We use the 'rd_wait' waitqueue for pipe partner waiting.
   1070 */
   1071static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
   1072{
   1073	DEFINE_WAIT(rdwait);
   1074	int cur = *cnt;
   1075
   1076	while (cur == *cnt) {
   1077		prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
   1078		pipe_unlock(pipe);
   1079		schedule();
   1080		finish_wait(&pipe->rd_wait, &rdwait);
   1081		pipe_lock(pipe);
   1082		if (signal_pending(current))
   1083			break;
   1084	}
   1085	return cur == *cnt ? -ERESTARTSYS : 0;
   1086}
   1087
   1088static void wake_up_partner(struct pipe_inode_info *pipe)
   1089{
   1090	wake_up_interruptible_all(&pipe->rd_wait);
   1091}
   1092
   1093static int fifo_open(struct inode *inode, struct file *filp)
   1094{
   1095	struct pipe_inode_info *pipe;
   1096	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
   1097	int ret;
   1098
   1099	filp->f_version = 0;
   1100
   1101	spin_lock(&inode->i_lock);
   1102	if (inode->i_pipe) {
   1103		pipe = inode->i_pipe;
   1104		pipe->files++;
   1105		spin_unlock(&inode->i_lock);
   1106	} else {
   1107		spin_unlock(&inode->i_lock);
   1108		pipe = alloc_pipe_info();
   1109		if (!pipe)
   1110			return -ENOMEM;
   1111		pipe->files = 1;
   1112		spin_lock(&inode->i_lock);
   1113		if (unlikely(inode->i_pipe)) {
   1114			inode->i_pipe->files++;
   1115			spin_unlock(&inode->i_lock);
   1116			free_pipe_info(pipe);
   1117			pipe = inode->i_pipe;
   1118		} else {
   1119			inode->i_pipe = pipe;
   1120			spin_unlock(&inode->i_lock);
   1121		}
   1122	}
   1123	filp->private_data = pipe;
   1124	/* OK, we have a pipe and it's pinned down */
   1125
   1126	__pipe_lock(pipe);
   1127
   1128	/* We can only do regular read/write on fifos */
   1129	stream_open(inode, filp);
   1130
   1131	switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
   1132	case FMODE_READ:
   1133	/*
   1134	 *  O_RDONLY
   1135	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
   1136	 *  opened, even when there is no process writing the FIFO.
   1137	 */
   1138		pipe->r_counter++;
   1139		if (pipe->readers++ == 0)
   1140			wake_up_partner(pipe);
   1141
   1142		if (!is_pipe && !pipe->writers) {
   1143			if ((filp->f_flags & O_NONBLOCK)) {
   1144				/* suppress EPOLLHUP until we have
   1145				 * seen a writer */
   1146				filp->f_version = pipe->w_counter;
   1147			} else {
   1148				if (wait_for_partner(pipe, &pipe->w_counter))
   1149					goto err_rd;
   1150			}
   1151		}
   1152		break;
   1153
   1154	case FMODE_WRITE:
   1155	/*
   1156	 *  O_WRONLY
   1157	 *  POSIX.1 says that O_NONBLOCK means return -1 with
   1158	 *  errno=ENXIO when there is no process reading the FIFO.
   1159	 */
   1160		ret = -ENXIO;
   1161		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
   1162			goto err;
   1163
   1164		pipe->w_counter++;
   1165		if (!pipe->writers++)
   1166			wake_up_partner(pipe);
   1167
   1168		if (!is_pipe && !pipe->readers) {
   1169			if (wait_for_partner(pipe, &pipe->r_counter))
   1170				goto err_wr;
   1171		}
   1172		break;
   1173
   1174	case FMODE_READ | FMODE_WRITE:
   1175	/*
   1176	 *  O_RDWR
   1177	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
   1178	 *  This implementation will NEVER block on a O_RDWR open, since
   1179	 *  the process can at least talk to itself.
   1180	 */
   1181
   1182		pipe->readers++;
   1183		pipe->writers++;
   1184		pipe->r_counter++;
   1185		pipe->w_counter++;
   1186		if (pipe->readers == 1 || pipe->writers == 1)
   1187			wake_up_partner(pipe);
   1188		break;
   1189
   1190	default:
   1191		ret = -EINVAL;
   1192		goto err;
   1193	}
   1194
   1195	/* Ok! */
   1196	__pipe_unlock(pipe);
   1197	return 0;
   1198
   1199err_rd:
   1200	if (!--pipe->readers)
   1201		wake_up_interruptible(&pipe->wr_wait);
   1202	ret = -ERESTARTSYS;
   1203	goto err;
   1204
   1205err_wr:
   1206	if (!--pipe->writers)
   1207		wake_up_interruptible_all(&pipe->rd_wait);
   1208	ret = -ERESTARTSYS;
   1209	goto err;
   1210
   1211err:
   1212	__pipe_unlock(pipe);
   1213
   1214	put_pipe_info(inode, pipe);
   1215	return ret;
   1216}
   1217
   1218const struct file_operations pipefifo_fops = {
   1219	.open		= fifo_open,
   1220	.llseek		= no_llseek,
   1221	.read_iter	= pipe_read,
   1222	.write_iter	= pipe_write,
   1223	.poll		= pipe_poll,
   1224	.unlocked_ioctl	= pipe_ioctl,
   1225	.release	= pipe_release,
   1226	.fasync		= pipe_fasync,
   1227	.splice_write	= iter_file_splice_write,
   1228};
   1229
   1230/*
   1231 * Currently we rely on the pipe array holding a power-of-2 number
   1232 * of pages. Returns 0 on error.
   1233 */
   1234unsigned int round_pipe_size(unsigned long size)
   1235{
   1236	if (size > (1U << 31))
   1237		return 0;
   1238
   1239	/* Minimum pipe size, as required by POSIX */
   1240	if (size < PAGE_SIZE)
   1241		return PAGE_SIZE;
   1242
   1243	return roundup_pow_of_two(size);
   1244}
   1245
   1246/*
   1247 * Resize the pipe ring to a number of slots.
   1248 *
   1249 * Note the pipe can be reduced in capacity, but only if the current
   1250 * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
   1251 * returned instead.
   1252 */
   1253int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
   1254{
   1255	struct pipe_buffer *bufs;
   1256	unsigned int head, tail, mask, n;
   1257
   1258	bufs = kcalloc(nr_slots, sizeof(*bufs),
   1259		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
   1260	if (unlikely(!bufs))
   1261		return -ENOMEM;
   1262
   1263	spin_lock_irq(&pipe->rd_wait.lock);
   1264	mask = pipe->ring_size - 1;
   1265	head = pipe->head;
   1266	tail = pipe->tail;
   1267
   1268	n = pipe_occupancy(head, tail);
   1269	if (nr_slots < n) {
   1270		spin_unlock_irq(&pipe->rd_wait.lock);
   1271		kfree(bufs);
   1272		return -EBUSY;
   1273	}
   1274
   1275	/*
   1276	 * The pipe array wraps around, so just start the new one at zero
   1277	 * and adjust the indices.
   1278	 */
   1279	if (n > 0) {
   1280		unsigned int h = head & mask;
   1281		unsigned int t = tail & mask;
   1282		if (h > t) {
   1283			memcpy(bufs, pipe->bufs + t,
   1284			       n * sizeof(struct pipe_buffer));
   1285		} else {
   1286			unsigned int tsize = pipe->ring_size - t;
   1287			if (h > 0)
   1288				memcpy(bufs + tsize, pipe->bufs,
   1289				       h * sizeof(struct pipe_buffer));
   1290			memcpy(bufs, pipe->bufs + t,
   1291			       tsize * sizeof(struct pipe_buffer));
   1292		}
   1293	}
   1294
   1295	head = n;
   1296	tail = 0;
   1297
   1298	kfree(pipe->bufs);
   1299	pipe->bufs = bufs;
   1300	pipe->ring_size = nr_slots;
   1301	if (pipe->max_usage > nr_slots)
   1302		pipe->max_usage = nr_slots;
   1303	pipe->tail = tail;
   1304	pipe->head = head;
   1305
   1306	spin_unlock_irq(&pipe->rd_wait.lock);
   1307
   1308	/* This might have made more room for writers */
   1309	wake_up_interruptible(&pipe->wr_wait);
   1310	return 0;
   1311}
   1312
   1313/*
   1314 * Allocate a new array of pipe buffers and copy the info over. Returns the
   1315 * pipe size if successful, or return -ERROR on error.
   1316 */
   1317static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
   1318{
   1319	unsigned long user_bufs;
   1320	unsigned int nr_slots, size;
   1321	long ret = 0;
   1322
   1323#ifdef CONFIG_WATCH_QUEUE
   1324	if (pipe->watch_queue)
   1325		return -EBUSY;
   1326#endif
   1327
   1328	size = round_pipe_size(arg);
   1329	nr_slots = size >> PAGE_SHIFT;
   1330
   1331	if (!nr_slots)
   1332		return -EINVAL;
   1333
   1334	/*
   1335	 * If trying to increase the pipe capacity, check that an
   1336	 * unprivileged user is not trying to exceed various limits
   1337	 * (soft limit check here, hard limit check just below).
   1338	 * Decreasing the pipe capacity is always permitted, even
   1339	 * if the user is currently over a limit.
   1340	 */
   1341	if (nr_slots > pipe->max_usage &&
   1342			size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
   1343		return -EPERM;
   1344
   1345	user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);
   1346
   1347	if (nr_slots > pipe->max_usage &&
   1348			(too_many_pipe_buffers_hard(user_bufs) ||
   1349			 too_many_pipe_buffers_soft(user_bufs)) &&
   1350			pipe_is_unprivileged_user()) {
   1351		ret = -EPERM;
   1352		goto out_revert_acct;
   1353	}
   1354
   1355	ret = pipe_resize_ring(pipe, nr_slots);
   1356	if (ret < 0)
   1357		goto out_revert_acct;
   1358
   1359	pipe->max_usage = nr_slots;
   1360	pipe->nr_accounted = nr_slots;
   1361	return pipe->max_usage * PAGE_SIZE;
   1362
   1363out_revert_acct:
   1364	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
   1365	return ret;
   1366}
   1367
   1368/*
   1369 * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is
   1370 * not enough to verify that this is a pipe.
   1371 */
   1372struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
   1373{
   1374	struct pipe_inode_info *pipe = file->private_data;
   1375
   1376	if (file->f_op != &pipefifo_fops || !pipe)
   1377		return NULL;
   1378#ifdef CONFIG_WATCH_QUEUE
   1379	if (for_splice && pipe->watch_queue)
   1380		return NULL;
   1381#endif
   1382	return pipe;
   1383}
   1384
   1385long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
   1386{
   1387	struct pipe_inode_info *pipe;
   1388	long ret;
   1389
   1390	pipe = get_pipe_info(file, false);
   1391	if (!pipe)
   1392		return -EBADF;
   1393
   1394	__pipe_lock(pipe);
   1395
   1396	switch (cmd) {
   1397	case F_SETPIPE_SZ:
   1398		ret = pipe_set_size(pipe, arg);
   1399		break;
   1400	case F_GETPIPE_SZ:
   1401		ret = pipe->max_usage * PAGE_SIZE;
   1402		break;
   1403	default:
   1404		ret = -EINVAL;
   1405		break;
   1406	}
   1407
   1408	__pipe_unlock(pipe);
   1409	return ret;
   1410}
   1411
   1412static const struct super_operations pipefs_ops = {
   1413	.destroy_inode = free_inode_nonrcu,
   1414	.statfs = simple_statfs,
   1415};
   1416
   1417/*
   1418 * pipefs should _never_ be mounted by userland - too much of security hassle,
   1419 * no real gain from having the whole whorehouse mounted. So we don't need
   1420 * any operations on the root directory. However, we need a non-trivial
   1421 * d_name - pipe: will go nicely and kill the special-casing in procfs.
   1422 */
   1423
   1424static int pipefs_init_fs_context(struct fs_context *fc)
   1425{
   1426	struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
   1427	if (!ctx)
   1428		return -ENOMEM;
   1429	ctx->ops = &pipefs_ops;
   1430	ctx->dops = &pipefs_dentry_operations;
   1431	return 0;
   1432}
   1433
   1434static struct file_system_type pipe_fs_type = {
   1435	.name		= "pipefs",
   1436	.init_fs_context = pipefs_init_fs_context,
   1437	.kill_sb	= kill_anon_super,
   1438};
   1439
   1440#ifdef CONFIG_SYSCTL
   1441static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
   1442					unsigned int *valp,
   1443					int write, void *data)
   1444{
   1445	if (write) {
   1446		unsigned int val;
   1447
   1448		val = round_pipe_size(*lvalp);
   1449		if (val == 0)
   1450			return -EINVAL;
   1451
   1452		*valp = val;
   1453	} else {
   1454		unsigned int val = *valp;
   1455		*lvalp = (unsigned long) val;
   1456	}
   1457
   1458	return 0;
   1459}
   1460
   1461static int proc_dopipe_max_size(struct ctl_table *table, int write,
   1462				void *buffer, size_t *lenp, loff_t *ppos)
   1463{
   1464	return do_proc_douintvec(table, write, buffer, lenp, ppos,
   1465				 do_proc_dopipe_max_size_conv, NULL);
   1466}
   1467
   1468static struct ctl_table fs_pipe_sysctls[] = {
   1469	{
   1470		.procname	= "pipe-max-size",
   1471		.data		= &pipe_max_size,
   1472		.maxlen		= sizeof(pipe_max_size),
   1473		.mode		= 0644,
   1474		.proc_handler	= proc_dopipe_max_size,
   1475	},
   1476	{
   1477		.procname	= "pipe-user-pages-hard",
   1478		.data		= &pipe_user_pages_hard,
   1479		.maxlen		= sizeof(pipe_user_pages_hard),
   1480		.mode		= 0644,
   1481		.proc_handler	= proc_doulongvec_minmax,
   1482	},
   1483	{
   1484		.procname	= "pipe-user-pages-soft",
   1485		.data		= &pipe_user_pages_soft,
   1486		.maxlen		= sizeof(pipe_user_pages_soft),
   1487		.mode		= 0644,
   1488		.proc_handler	= proc_doulongvec_minmax,
   1489	},
   1490	{ }
   1491};
   1492#endif
   1493
   1494static int __init init_pipe_fs(void)
   1495{
   1496	int err = register_filesystem(&pipe_fs_type);
   1497
   1498	if (!err) {
   1499		pipe_mnt = kern_mount(&pipe_fs_type);
   1500		if (IS_ERR(pipe_mnt)) {
   1501			err = PTR_ERR(pipe_mnt);
   1502			unregister_filesystem(&pipe_fs_type);
   1503		}
   1504	}
   1505#ifdef CONFIG_SYSCTL
   1506	register_sysctl_init("fs", fs_pipe_sysctls);
   1507#endif
   1508	return err;
   1509}
   1510
   1511fs_initcall(init_pipe_fs);