cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

io_uring.c (333420B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Shared application/kernel submission and completion ring pairs, for
      4 * supporting fast/efficient IO.
      5 *
      6 * A note on the read/write ordering memory barriers that are matched between
      7 * the application and kernel side.
      8 *
      9 * After the application reads the CQ ring tail, it must use an
     10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
     11 * before writing the tail (using smp_load_acquire to read the tail will
     12 * do). It also needs a smp_mb() before updating CQ head (ordering the
     13 * entry load(s) with the head store), pairing with an implicit barrier
     14 * through a control-dependency in io_get_cqe (smp_store_release to
     15 * store head will do). Failure to do so could lead to reading invalid
     16 * CQ entries.
     17 *
     18 * Likewise, the application must use an appropriate smp_wmb() before
     19 * writing the SQ tail (ordering SQ entry stores with the tail store),
     20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
     21 * to store the tail will do). And it needs a barrier ordering the SQ
     22 * head load before writing new SQ entries (smp_load_acquire to read
     23 * head will do).
     24 *
     25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
     26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
     27 * updating the SQ tail; a full memory barrier smp_mb() is needed
     28 * between.
     29 *
     30 * Also see the examples in the liburing library:
     31 *
     32 *	git://git.kernel.dk/liburing
     33 *
     34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
     35 * from data shared between the kernel and application. This is done both
     36 * for ordering purposes, but also to ensure that once a value is loaded from
     37 * data that the application could potentially modify, it remains stable.
     38 *
     39 * Copyright (C) 2018-2019 Jens Axboe
     40 * Copyright (c) 2018-2019 Christoph Hellwig
     41 */
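/*
 * A minimal userspace-side sketch of the submission ordering described
 * above, for illustration only (it is not part of this file and uses the
 * GCC/Clang __atomic builtins instead of the kernel's smp_*() macros).
 * The pointers sq_tail, sq_flags, sq_array, sqes and the value sq_mask
 * are assumed to have been derived from the mmap()ed SQ ring via
 * struct io_sqring_offsets; example_submit_one() is a hypothetical name.
 */
static void example_submit_one(struct io_uring_sqe *sqes, unsigned *sq_array,
			       unsigned *sq_tail, unsigned *sq_flags,
			       unsigned sq_mask, int ring_fd,
			       const struct io_uring_sqe *src)
{
	unsigned tail = *sq_tail;	/* tail is owned by the application */

	/* 1) fill the SQE and publish its index (1:1 slot mapping here) */
	sqes[tail & sq_mask] = *src;
	sq_array[tail & sq_mask] = tail & sq_mask;

	/* 2) release store: orders the SQE/index stores before the tail */
	__atomic_store_n(sq_tail, tail + 1, __ATOMIC_RELEASE);

	/* 3) with IORING_SETUP_SQPOLL, full barrier before the flag check */
	__atomic_thread_fence(__ATOMIC_SEQ_CST);
	if (__atomic_load_n(sq_flags, __ATOMIC_RELAXED) & IORING_SQ_NEED_WAKEUP)
		syscall(__NR_io_uring_enter, ring_fd, 0, 0,
			IORING_ENTER_SQ_WAKEUP, NULL, 0);
}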
     42#include <linux/kernel.h>
     43#include <linux/init.h>
     44#include <linux/errno.h>
     45#include <linux/syscalls.h>
     46#include <linux/compat.h>
     47#include <net/compat.h>
     48#include <linux/refcount.h>
     49#include <linux/uio.h>
     50#include <linux/bits.h>
     51
     52#include <linux/sched/signal.h>
     53#include <linux/fs.h>
     54#include <linux/file.h>
     55#include <linux/fdtable.h>
     56#include <linux/mm.h>
     57#include <linux/mman.h>
     58#include <linux/percpu.h>
     59#include <linux/slab.h>
     60#include <linux/blk-mq.h>
     61#include <linux/bvec.h>
     62#include <linux/net.h>
     63#include <net/sock.h>
     64#include <net/af_unix.h>
     65#include <net/scm.h>
     66#include <linux/anon_inodes.h>
     67#include <linux/sched/mm.h>
     68#include <linux/uaccess.h>
     69#include <linux/nospec.h>
     70#include <linux/sizes.h>
     71#include <linux/hugetlb.h>
     72#include <linux/highmem.h>
     73#include <linux/namei.h>
     74#include <linux/fsnotify.h>
     75#include <linux/fadvise.h>
     76#include <linux/eventpoll.h>
     77#include <linux/splice.h>
     78#include <linux/task_work.h>
     79#include <linux/pagemap.h>
     80#include <linux/io_uring.h>
     81#include <linux/audit.h>
     82#include <linux/security.h>
     83#include <linux/xattr.h>
     84
     85#define CREATE_TRACE_POINTS
     86#include <trace/events/io_uring.h>
     87
     88#include <uapi/linux/io_uring.h>
     89
     90#include "internal.h"
     91#include "io-wq.h"
     92
     93#define IORING_MAX_ENTRIES	32768
     94#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
     95#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
     96
     97/* only define max */
     98#define IORING_MAX_FIXED_FILES	(1U << 20)
     99#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
    100				 IORING_REGISTER_LAST + IORING_OP_LAST)
    101
    102#define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
    103#define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
    104#define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)
    105
    106#define IORING_MAX_REG_BUFFERS	(1U << 14)
    107
    108#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
    109			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)
    110
    111#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
    112			IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
    113
    114#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
    115				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
    116				REQ_F_ASYNC_DATA)
    117
    118#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
    119				 IO_REQ_CLEAN_FLAGS)
    120
    121#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
    122
    123#define IO_TCTX_REFS_CACHE_NR	(1U << 10)
    124
    125struct io_uring {
    126	u32 head ____cacheline_aligned_in_smp;
    127	u32 tail ____cacheline_aligned_in_smp;
    128};
    129
    130/*
    131 * This data is shared with the application through the mmap at offsets
    132 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
    133 *
    134 * The offsets to the member fields are published through struct
    135 * io_sqring_offsets when calling io_uring_setup.
    136 */
    137struct io_rings {
    138	/*
    139	 * Head and tail offsets into the ring; the offsets need to be
    140	 * masked to get valid indices.
    141	 *
    142	 * The kernel controls head of the sq ring and the tail of the cq ring,
    143	 * and the application controls tail of the sq ring and the head of the
    144	 * cq ring.
    145	 */
    146	struct io_uring		sq, cq;
    147	/*
    148	 * Bitmasks to apply to head and tail offsets (constant, equals
    149	 * ring_entries - 1)
    150	 */
    151	u32			sq_ring_mask, cq_ring_mask;
    152	/* Ring sizes (constant, power of 2) */
    153	u32			sq_ring_entries, cq_ring_entries;
    154	/*
    155	 * Number of invalid entries dropped by the kernel due to
    156	 * invalid index stored in array
    157	 *
    158	 * Written by the kernel, shouldn't be modified by the
    159	 * application (i.e. get number of "new events" by comparing to
    160	 * cached value).
    161	 *
    162	 * After a new SQ head value was read by the application this
    163	 * counter includes all submissions that were dropped reaching
    164	 * the new SQ head (and possibly more).
    165	 */
    166	u32			sq_dropped;
    167	/*
    168	 * Runtime SQ flags
    169	 *
    170	 * Written by the kernel, shouldn't be modified by the
    171	 * application.
    172	 *
    173	 * The application needs a full memory barrier before checking
    174	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
    175	 */
    176	atomic_t		sq_flags;
    177	/*
    178	 * Runtime CQ flags
    179	 *
    180	 * Written by the application, shouldn't be modified by the
    181	 * kernel.
    182	 */
    183	u32			cq_flags;
    184	/*
    185	 * Number of completion events lost because the queue was full;
    186	 * this should be avoided by the application by making sure
    187	 * there are not more requests pending than there is space in
    188	 * the completion queue.
    189	 *
    190	 * Written by the kernel, shouldn't be modified by the
    191	 * application (i.e. get number of "new events" by comparing to
    192	 * cached value).
    193	 *
    194	 * As completion events come in out of order this counter is not
    195	 * ordered with any other data.
    196	 */
    197	u32			cq_overflow;
    198	/*
    199	 * Ring buffer of completion events.
    200	 *
    201	 * The kernel writes completion events fresh every time they are
    202	 * produced, so the application is allowed to modify pending
    203	 * entries.
    204	 */
    205	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
    206};
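/*
 * A minimal sketch of the matching completion side, using the fields of
 * struct io_rings defined above: the CQ tail (kernel-written) is read
 * with acquire semantics, entries are indexed modulo cq_ring_mask, and
 * the new head is published with release semantics.  handle_cqe() is a
 * placeholder; real userspace reaches these fields through the offsets
 * in struct io_cqring_offsets rather than through this struct.
 */
static void example_reap_cqes(struct io_rings *r)
{
	u32 head = r->cq.head;				/* app-owned */
	u32 tail = smp_load_acquire(&r->cq.tail);	/* kernel-owned */

	while (head != tail) {
		struct io_uring_cqe *cqe = &r->cqes[head & r->cq_ring_mask];

		handle_cqe(cqe);			/* placeholder */
		head++;
	}
	/* order the CQE loads above before making the slots reusable */
	smp_store_release(&r->cq.head, head);
}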
    207
    208struct io_mapped_ubuf {
    209	u64		ubuf;
    210	u64		ubuf_end;
    211	unsigned int	nr_bvecs;
    212	unsigned long	acct_pages;
    213	struct bio_vec	bvec[];
    214};
    215
    216struct io_ring_ctx;
    217
    218struct io_overflow_cqe {
    219	struct list_head list;
    220	struct io_uring_cqe cqe;
    221};
    222
    223/*
    224 * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0
    225 * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we
    226 * can't safely always dereference the file when the task has exited and ring
    227 * cleanup is done. If a file is tracked and part of SCM, then unix gc on
    228 * process exit may reap it before __io_sqe_files_unregister() is run.
    229 */
    230#define FFS_NOWAIT		0x1UL
    231#define FFS_ISREG		0x2UL
    232#if defined(CONFIG_64BIT)
    233#define FFS_SCM			0x4UL
    234#else
    235#define IO_URING_SCM_ALL
    236#define FFS_SCM			0x0UL
    237#endif
    238#define FFS_MASK		~(FFS_NOWAIT|FFS_ISREG|FFS_SCM)
    239
    240struct io_fixed_file {
    241	/* file * with additional FFS_* flags */
    242	unsigned long file_ptr;
    243};
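/*
 * A short sketch of the pointer-tagging scheme behind io_fixed_file: the
 * low bits of file_ptr carry the FFS_* flags and FFS_MASK strips them to
 * recover the struct file pointer.  These two helpers are illustrative
 * only; the file's own fixed-file helpers further down do the real work.
 */
static inline void example_ffs_encode(struct io_fixed_file *slot,
				      struct file *file, unsigned long flags)
{
	/* flags must be a subset of FFS_NOWAIT | FFS_ISREG | FFS_SCM */
	slot->file_ptr = (unsigned long)file | flags;
}

static inline struct file *example_ffs_decode(struct io_fixed_file *slot)
{
	return (struct file *)(slot->file_ptr & FFS_MASK);
}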
    244
    245struct io_rsrc_put {
    246	struct list_head list;
    247	u64 tag;
    248	union {
    249		void *rsrc;
    250		struct file *file;
    251		struct io_mapped_ubuf *buf;
    252	};
    253};
    254
    255struct io_file_table {
    256	struct io_fixed_file *files;
    257	unsigned long *bitmap;
    258	unsigned int alloc_hint;
    259};
    260
    261struct io_rsrc_node {
    262	struct percpu_ref		refs;
    263	struct list_head		node;
    264	struct list_head		rsrc_list;
    265	struct io_rsrc_data		*rsrc_data;
    266	struct llist_node		llist;
    267	bool				done;
    268};
    269
    270typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
    271
    272struct io_rsrc_data {
    273	struct io_ring_ctx		*ctx;
    274
    275	u64				**tags;
    276	unsigned int			nr;
    277	rsrc_put_fn			*do_put;
    278	atomic_t			refs;
    279	struct completion		done;
    280	bool				quiesce;
    281};
    282
    283#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
    284struct io_buffer_list {
    285	/*
    286	 * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
    287	 * then these are classic provided buffers and ->buf_list is used.
    288	 */
    289	union {
    290		struct list_head buf_list;
    291		struct {
    292			struct page **buf_pages;
    293			struct io_uring_buf_ring *buf_ring;
    294		};
    295	};
    296	__u16 bgid;
    297
    298	/* below is for ring provided buffers */
    299	__u16 buf_nr_pages;
    300	__u16 nr_entries;
    301	__u16 head;
    302	__u16 mask;
    303};
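/*
 * A brief sketch of how the union above is selected: buf_nr_pages != 0
 * means the group is ring mapped, otherwise the classic buf_list is in
 * use.  example_peek_ring_buf() additionally assumes the uapi bufs[]
 * layout of struct io_uring_buf_ring and that the ring fits in the first
 * mapped page; both helpers are illustrations, not this file's API.
 */
static inline bool example_bl_is_ring_mapped(struct io_buffer_list *bl)
{
	return bl->buf_nr_pages != 0;
}

static inline struct io_uring_buf *example_peek_ring_buf(struct io_buffer_list *bl)
{
	/* head counts consumed entries; mask == nr_entries - 1 */
	return &bl->buf_ring->bufs[bl->head & bl->mask];
}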
    304
    305struct io_buffer {
    306	struct list_head list;
    307	__u64 addr;
    308	__u32 len;
    309	__u16 bid;
    310	__u16 bgid;
    311};
    312
    313struct io_restriction {
    314	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
    315	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
    316	u8 sqe_flags_allowed;
    317	u8 sqe_flags_required;
    318	bool registered;
    319};
    320
    321enum {
    322	IO_SQ_THREAD_SHOULD_STOP = 0,
    323	IO_SQ_THREAD_SHOULD_PARK,
    324};
    325
    326struct io_sq_data {
    327	refcount_t		refs;
    328	atomic_t		park_pending;
    329	struct mutex		lock;
    330
    331	/* ctx's that are using this sqd */
    332	struct list_head	ctx_list;
    333
    334	struct task_struct	*thread;
    335	struct wait_queue_head	wait;
    336
    337	unsigned		sq_thread_idle;
    338	int			sq_cpu;
    339	pid_t			task_pid;
    340	pid_t			task_tgid;
    341
    342	unsigned long		state;
    343	struct completion	exited;
    344};
    345
    346#define IO_COMPL_BATCH			32
    347#define IO_REQ_CACHE_SIZE		32
    348#define IO_REQ_ALLOC_BATCH		8
    349
    350struct io_submit_link {
    351	struct io_kiocb		*head;
    352	struct io_kiocb		*last;
    353};
    354
    355struct io_submit_state {
    356	/* inline/task_work completion list, under ->uring_lock */
    357	struct io_wq_work_node	free_list;
    358	/* batch completion logic */
    359	struct io_wq_work_list	compl_reqs;
    360	struct io_submit_link	link;
    361
    362	bool			plug_started;
    363	bool			need_plug;
    364	bool			flush_cqes;
    365	unsigned short		submit_nr;
    366	struct blk_plug		plug;
    367};
    368
    369struct io_ev_fd {
    370	struct eventfd_ctx	*cq_ev_fd;
    371	unsigned int		eventfd_async: 1;
    372	struct rcu_head		rcu;
    373};
    374
    375#define BGID_ARRAY	64
    376
    377struct io_ring_ctx {
    378	/* const or read-mostly hot data */
    379	struct {
    380		struct percpu_ref	refs;
    381
    382		struct io_rings		*rings;
    383		unsigned int		flags;
    384		enum task_work_notify_mode	notify_method;
    385		unsigned int		compat: 1;
    386		unsigned int		drain_next: 1;
    387		unsigned int		restricted: 1;
    388		unsigned int		off_timeout_used: 1;
    389		unsigned int		drain_active: 1;
    390		unsigned int		drain_disabled: 1;
    391		unsigned int		has_evfd: 1;
    392		unsigned int		syscall_iopoll: 1;
    393	} ____cacheline_aligned_in_smp;
    394
    395	/* submission data */
    396	struct {
    397		struct mutex		uring_lock;
    398
    399		/*
    400		 * Ring buffer of indices into array of io_uring_sqe, which is
    401		 * mmapped by the application using the IORING_OFF_SQES offset.
    402		 *
    403		 * This indirection could e.g. be used to assign fixed
    404		 * io_uring_sqe entries to operations and only submit them to
    405		 * the queue when needed.
    406		 *
    407		 * The kernel modifies neither the indices array nor the entries
    408		 * array.
    409		 */
    410		u32			*sq_array;
    411		struct io_uring_sqe	*sq_sqes;
    412		unsigned		cached_sq_head;
    413		unsigned		sq_entries;
    414		struct list_head	defer_list;
    415
    416		/*
    417		 * Fixed resources fast path, should be accessed only under
    418		 * uring_lock, and updated through io_uring_register(2)
    419		 */
    420		struct io_rsrc_node	*rsrc_node;
    421		int			rsrc_cached_refs;
    422		atomic_t		cancel_seq;
    423		struct io_file_table	file_table;
    424		unsigned		nr_user_files;
    425		unsigned		nr_user_bufs;
    426		struct io_mapped_ubuf	**user_bufs;
    427
    428		struct io_submit_state	submit_state;
    429
    430		struct io_buffer_list	*io_bl;
    431		struct xarray		io_bl_xa;
    432		struct list_head	io_buffers_cache;
    433
    434		struct list_head	timeout_list;
    435		struct list_head	ltimeout_list;
    436		struct list_head	cq_overflow_list;
    437		struct list_head	apoll_cache;
    438		struct xarray		personalities;
    439		u32			pers_next;
    440		unsigned		sq_thread_idle;
    441	} ____cacheline_aligned_in_smp;
    442
    443	/* IRQ completion list, under ->completion_lock */
    444	struct io_wq_work_list	locked_free_list;
    445	unsigned int		locked_free_nr;
    446
    447	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
    448	struct io_sq_data	*sq_data;	/* if using sq thread polling */
    449
    450	struct wait_queue_head	sqo_sq_wait;
    451	struct list_head	sqd_list;
    452
    453	unsigned long		check_cq;
    454
    455	struct {
    456		/*
    457		 * We cache a range of free CQEs we can use, once exhausted it
    458		 * should go through a slower range setup, see __io_get_cqe()
    459		 */
    460		struct io_uring_cqe	*cqe_cached;
    461		struct io_uring_cqe	*cqe_sentinel;
    462
    463		unsigned		cached_cq_tail;
    464		unsigned		cq_entries;
    465		struct io_ev_fd	__rcu	*io_ev_fd;
    466		struct wait_queue_head	cq_wait;
    467		unsigned		cq_extra;
    468		atomic_t		cq_timeouts;
    469		unsigned		cq_last_tm_flush;
    470	} ____cacheline_aligned_in_smp;
    471
    472	struct {
    473		spinlock_t		completion_lock;
    474
    475		spinlock_t		timeout_lock;
    476
    477		/*
    478		 * ->iopoll_list is protected by the ctx->uring_lock for
    479		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
    480		 * For SQPOLL, only the single threaded io_sq_thread() will
    481		 * manipulate the list, hence no extra locking is needed there.
    482		 */
    483		struct io_wq_work_list	iopoll_list;
    484		struct hlist_head	*cancel_hash;
    485		unsigned		cancel_hash_bits;
    486		bool			poll_multi_queue;
    487
    488		struct list_head	io_buffers_comp;
    489	} ____cacheline_aligned_in_smp;
    490
    491	struct io_restriction		restrictions;
    492
     493	/* slow path rsrc auxiliary data, used by update/register */
    494	struct {
    495		struct io_rsrc_node		*rsrc_backup_node;
    496		struct io_mapped_ubuf		*dummy_ubuf;
    497		struct io_rsrc_data		*file_data;
    498		struct io_rsrc_data		*buf_data;
    499
    500		struct delayed_work		rsrc_put_work;
    501		struct llist_head		rsrc_put_llist;
    502		struct list_head		rsrc_ref_list;
    503		spinlock_t			rsrc_ref_lock;
    504
    505		struct list_head	io_buffers_pages;
    506	};
    507
    508	/* Keep this last, we don't need it for the fast path */
    509	struct {
    510		#if defined(CONFIG_UNIX)
    511			struct socket		*ring_sock;
    512		#endif
    513		/* hashed buffered write serialization */
    514		struct io_wq_hash		*hash_map;
    515
    516		/* Only used for accounting purposes */
    517		struct user_struct		*user;
    518		struct mm_struct		*mm_account;
    519
    520		/* ctx exit and cancelation */
    521		struct llist_head		fallback_llist;
    522		struct delayed_work		fallback_work;
    523		struct work_struct		exit_work;
    524		struct list_head		tctx_list;
    525		struct completion		ref_comp;
    526		u32				iowq_limits[2];
    527		bool				iowq_limits_set;
    528	};
    529};
    530
    531/*
    532 * Arbitrary limit, can be raised if need be
    533 */
    534#define IO_RINGFD_REG_MAX 16
    535
    536struct io_uring_task {
    537	/* submission side */
    538	int			cached_refs;
    539	struct xarray		xa;
    540	struct wait_queue_head	wait;
    541	const struct io_ring_ctx *last;
    542	struct io_wq		*io_wq;
    543	struct percpu_counter	inflight;
    544	atomic_t		inflight_tracked;
    545	atomic_t		in_idle;
    546
    547	spinlock_t		task_lock;
    548	struct io_wq_work_list	task_list;
    549	struct io_wq_work_list	prio_task_list;
    550	struct callback_head	task_work;
    551	struct file		**registered_rings;
    552	bool			task_running;
    553};
    554
    555/*
    556 * First field must be the file pointer in all the
    557 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
    558 */
    559struct io_poll_iocb {
    560	struct file			*file;
    561	struct wait_queue_head		*head;
    562	__poll_t			events;
    563	struct wait_queue_entry		wait;
    564};
    565
    566struct io_poll_update {
    567	struct file			*file;
    568	u64				old_user_data;
    569	u64				new_user_data;
    570	__poll_t			events;
    571	bool				update_events;
    572	bool				update_user_data;
    573};
    574
    575struct io_close {
    576	struct file			*file;
    577	int				fd;
    578	u32				file_slot;
    579};
    580
    581struct io_timeout_data {
    582	struct io_kiocb			*req;
    583	struct hrtimer			timer;
    584	struct timespec64		ts;
    585	enum hrtimer_mode		mode;
    586	u32				flags;
    587};
    588
    589struct io_accept {
    590	struct file			*file;
    591	struct sockaddr __user		*addr;
    592	int __user			*addr_len;
    593	int				flags;
    594	u32				file_slot;
    595	unsigned long			nofile;
    596};
    597
    598struct io_socket {
    599	struct file			*file;
    600	int				domain;
    601	int				type;
    602	int				protocol;
    603	int				flags;
    604	u32				file_slot;
    605	unsigned long			nofile;
    606};
    607
    608struct io_sync {
    609	struct file			*file;
    610	loff_t				len;
    611	loff_t				off;
    612	int				flags;
    613	int				mode;
    614};
    615
    616struct io_cancel {
    617	struct file			*file;
    618	u64				addr;
    619	u32				flags;
    620	s32				fd;
    621};
    622
    623struct io_timeout {
    624	struct file			*file;
    625	u32				off;
    626	u32				target_seq;
    627	struct list_head		list;
    628	/* head of the link, used by linked timeouts only */
    629	struct io_kiocb			*head;
    630	/* for linked completions */
    631	struct io_kiocb			*prev;
    632};
    633
    634struct io_timeout_rem {
    635	struct file			*file;
    636	u64				addr;
    637
    638	/* timeout update */
    639	struct timespec64		ts;
    640	u32				flags;
    641	bool				ltimeout;
    642};
    643
    644struct io_rw {
    645	/* NOTE: kiocb has the file as the first member, so don't do it here */
    646	struct kiocb			kiocb;
    647	u64				addr;
    648	u32				len;
    649	rwf_t				flags;
    650};
    651
    652struct io_connect {
    653	struct file			*file;
    654	struct sockaddr __user		*addr;
    655	int				addr_len;
    656};
    657
    658struct io_sr_msg {
    659	struct file			*file;
    660	union {
    661		struct compat_msghdr __user	*umsg_compat;
    662		struct user_msghdr __user	*umsg;
    663		void __user			*buf;
    664	};
    665	int				msg_flags;
    666	size_t				len;
    667	size_t				done_io;
    668	unsigned int			flags;
    669};
    670
    671struct io_open {
    672	struct file			*file;
    673	int				dfd;
    674	u32				file_slot;
    675	struct filename			*filename;
    676	struct open_how			how;
    677	unsigned long			nofile;
    678};
    679
    680struct io_rsrc_update {
    681	struct file			*file;
    682	u64				arg;
    683	u32				nr_args;
    684	u32				offset;
    685};
    686
    687struct io_fadvise {
    688	struct file			*file;
    689	u64				offset;
    690	u32				len;
    691	u32				advice;
    692};
    693
    694struct io_madvise {
    695	struct file			*file;
    696	u64				addr;
    697	u32				len;
    698	u32				advice;
    699};
    700
    701struct io_epoll {
    702	struct file			*file;
    703	int				epfd;
    704	int				op;
    705	int				fd;
    706	struct epoll_event		event;
    707};
    708
    709struct io_splice {
    710	struct file			*file_out;
    711	loff_t				off_out;
    712	loff_t				off_in;
    713	u64				len;
    714	int				splice_fd_in;
    715	unsigned int			flags;
    716};
    717
    718struct io_provide_buf {
    719	struct file			*file;
    720	__u64				addr;
    721	__u32				len;
    722	__u32				bgid;
    723	__u16				nbufs;
    724	__u16				bid;
    725};
    726
    727struct io_statx {
    728	struct file			*file;
    729	int				dfd;
    730	unsigned int			mask;
    731	unsigned int			flags;
    732	struct filename			*filename;
    733	struct statx __user		*buffer;
    734};
    735
    736struct io_shutdown {
    737	struct file			*file;
    738	int				how;
    739};
    740
    741struct io_rename {
    742	struct file			*file;
    743	int				old_dfd;
    744	int				new_dfd;
    745	struct filename			*oldpath;
    746	struct filename			*newpath;
    747	int				flags;
    748};
    749
    750struct io_unlink {
    751	struct file			*file;
    752	int				dfd;
    753	int				flags;
    754	struct filename			*filename;
    755};
    756
    757struct io_mkdir {
    758	struct file			*file;
    759	int				dfd;
    760	umode_t				mode;
    761	struct filename			*filename;
    762};
    763
    764struct io_symlink {
    765	struct file			*file;
    766	int				new_dfd;
    767	struct filename			*oldpath;
    768	struct filename			*newpath;
    769};
    770
    771struct io_hardlink {
    772	struct file			*file;
    773	int				old_dfd;
    774	int				new_dfd;
    775	struct filename			*oldpath;
    776	struct filename			*newpath;
    777	int				flags;
    778};
    779
    780struct io_msg {
    781	struct file			*file;
    782	u64 user_data;
    783	u32 len;
    784};
    785
    786struct io_async_connect {
    787	struct sockaddr_storage		address;
    788};
    789
    790struct io_async_msghdr {
    791	struct iovec			fast_iov[UIO_FASTIOV];
    792	/* points to an allocated iov, if NULL we use fast_iov instead */
    793	struct iovec			*free_iov;
    794	struct sockaddr __user		*uaddr;
    795	struct msghdr			msg;
    796	struct sockaddr_storage		addr;
    797};
    798
    799struct io_rw_state {
    800	struct iov_iter			iter;
    801	struct iov_iter_state		iter_state;
    802	struct iovec			fast_iov[UIO_FASTIOV];
    803};
    804
    805struct io_async_rw {
    806	struct io_rw_state		s;
    807	const struct iovec		*free_iovec;
    808	size_t				bytes_done;
    809	struct wait_page_queue		wpq;
    810};
    811
    812struct io_xattr {
    813	struct file			*file;
    814	struct xattr_ctx		ctx;
    815	struct filename			*filename;
    816};
    817
    818enum {
    819	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
    820	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
    821	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
    822	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
    823	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
    824	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
    825	REQ_F_CQE_SKIP_BIT	= IOSQE_CQE_SKIP_SUCCESS_BIT,
    826
    827	/* first byte is taken by user flags, shift it to not overlap */
    828	REQ_F_FAIL_BIT		= 8,
    829	REQ_F_INFLIGHT_BIT,
    830	REQ_F_CUR_POS_BIT,
    831	REQ_F_NOWAIT_BIT,
    832	REQ_F_LINK_TIMEOUT_BIT,
    833	REQ_F_NEED_CLEANUP_BIT,
    834	REQ_F_POLLED_BIT,
    835	REQ_F_BUFFER_SELECTED_BIT,
    836	REQ_F_BUFFER_RING_BIT,
    837	REQ_F_COMPLETE_INLINE_BIT,
    838	REQ_F_REISSUE_BIT,
    839	REQ_F_CREDS_BIT,
    840	REQ_F_REFCOUNT_BIT,
    841	REQ_F_ARM_LTIMEOUT_BIT,
    842	REQ_F_ASYNC_DATA_BIT,
    843	REQ_F_SKIP_LINK_CQES_BIT,
    844	REQ_F_SINGLE_POLL_BIT,
    845	REQ_F_DOUBLE_POLL_BIT,
    846	REQ_F_PARTIAL_IO_BIT,
    847	REQ_F_CQE32_INIT_BIT,
    848	REQ_F_APOLL_MULTISHOT_BIT,
    849	/* keep async read/write and isreg together and in order */
    850	REQ_F_SUPPORT_NOWAIT_BIT,
    851	REQ_F_ISREG_BIT,
    852
    853	/* not a real bit, just to check we're not overflowing the space */
    854	__REQ_F_LAST_BIT,
    855};
    856
    857enum {
    858	/* ctx owns file */
    859	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
    860	/* drain existing IO first */
    861	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
    862	/* linked sqes */
    863	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
    864	/* doesn't sever on completion < 0 */
    865	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
    866	/* IOSQE_ASYNC */
    867	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
    868	/* IOSQE_BUFFER_SELECT */
    869	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
    870	/* IOSQE_CQE_SKIP_SUCCESS */
    871	REQ_F_CQE_SKIP		= BIT(REQ_F_CQE_SKIP_BIT),
    872
    873	/* fail rest of links */
    874	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
    875	/* on inflight list, should be cancelled and waited on exit reliably */
    876	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
    877	/* read/write uses file position */
    878	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
    879	/* must not punt to workers */
    880	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
    881	/* has or had linked timeout */
    882	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
    883	/* needs cleanup */
    884	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
    885	/* already went through poll handler */
    886	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
    887	/* buffer already selected */
    888	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
    889	/* buffer selected from ring, needs commit */
    890	REQ_F_BUFFER_RING	= BIT(REQ_F_BUFFER_RING_BIT),
    891	/* completion is deferred through io_comp_state */
    892	REQ_F_COMPLETE_INLINE	= BIT(REQ_F_COMPLETE_INLINE_BIT),
    893	/* caller should reissue async */
    894	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
    895	/* supports async reads/writes */
    896	REQ_F_SUPPORT_NOWAIT	= BIT(REQ_F_SUPPORT_NOWAIT_BIT),
    897	/* regular file */
    898	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
    899	/* has creds assigned */
    900	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
    901	/* skip refcounting if not set */
    902	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
    903	/* there is a linked timeout that has to be armed */
    904	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),
    905	/* ->async_data allocated */
    906	REQ_F_ASYNC_DATA	= BIT(REQ_F_ASYNC_DATA_BIT),
    907	/* don't post CQEs while failing linked requests */
    908	REQ_F_SKIP_LINK_CQES	= BIT(REQ_F_SKIP_LINK_CQES_BIT),
    909	/* single poll may be active */
    910	REQ_F_SINGLE_POLL	= BIT(REQ_F_SINGLE_POLL_BIT),
     911	/* double poll may be active */
    912	REQ_F_DOUBLE_POLL	= BIT(REQ_F_DOUBLE_POLL_BIT),
    913	/* request has already done partial IO */
    914	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),
    915	/* fast poll multishot mode */
    916	REQ_F_APOLL_MULTISHOT	= BIT(REQ_F_APOLL_MULTISHOT_BIT),
    917	/* ->extra1 and ->extra2 are initialised */
    918	REQ_F_CQE32_INIT	= BIT(REQ_F_CQE32_INIT_BIT),
    919};
    920
    921struct async_poll {
    922	struct io_poll_iocb	poll;
    923	struct io_poll_iocb	*double_poll;
    924};
    925
    926typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
    927
    928struct io_task_work {
    929	union {
    930		struct io_wq_work_node	node;
    931		struct llist_node	fallback_node;
    932	};
    933	io_req_tw_func_t		func;
    934};
    935
    936enum {
    937	IORING_RSRC_FILE		= 0,
    938	IORING_RSRC_BUFFER		= 1,
    939};
    940
    941struct io_cqe {
    942	__u64	user_data;
    943	__s32	res;
    944	/* fd initially, then cflags for completion */
    945	union {
    946		__u32	flags;
    947		int	fd;
    948	};
    949};
    950
    951enum {
    952	IO_CHECK_CQ_OVERFLOW_BIT,
    953	IO_CHECK_CQ_DROPPED_BIT,
    954};
    955
    956/*
    957 * NOTE! Each of the iocb union members has the file pointer
    958 * as the first entry in their struct definition. So you can
    959 * access the file pointer through any of the sub-structs,
    960 * or directly as just 'file' in this struct.
    961 */
    962struct io_kiocb {
    963	union {
    964		struct file		*file;
    965		struct io_rw		rw;
    966		struct io_poll_iocb	poll;
    967		struct io_poll_update	poll_update;
    968		struct io_accept	accept;
    969		struct io_sync		sync;
    970		struct io_cancel	cancel;
    971		struct io_timeout	timeout;
    972		struct io_timeout_rem	timeout_rem;
    973		struct io_connect	connect;
    974		struct io_sr_msg	sr_msg;
    975		struct io_open		open;
    976		struct io_close		close;
    977		struct io_rsrc_update	rsrc_update;
    978		struct io_fadvise	fadvise;
    979		struct io_madvise	madvise;
    980		struct io_epoll		epoll;
    981		struct io_splice	splice;
    982		struct io_provide_buf	pbuf;
    983		struct io_statx		statx;
    984		struct io_shutdown	shutdown;
    985		struct io_rename	rename;
    986		struct io_unlink	unlink;
    987		struct io_mkdir		mkdir;
    988		struct io_symlink	symlink;
    989		struct io_hardlink	hardlink;
    990		struct io_msg		msg;
    991		struct io_xattr		xattr;
    992		struct io_socket	sock;
    993		struct io_uring_cmd	uring_cmd;
    994	};
    995
    996	u8				opcode;
    997	/* polled IO has completed */
    998	u8				iopoll_completed;
    999	/*
   1000	 * Can be either a fixed buffer index, or used with provided buffers.
   1001	 * For the latter, before issue it points to the buffer group ID,
   1002	 * and after selection it points to the buffer ID itself.
   1003	 */
   1004	u16				buf_index;
   1005	unsigned int			flags;
   1006
   1007	struct io_cqe			cqe;
   1008
   1009	struct io_ring_ctx		*ctx;
   1010	struct task_struct		*task;
   1011
   1012	struct io_rsrc_node		*rsrc_node;
   1013
   1014	union {
   1015		/* store used ubuf, so we can prevent reloading */
   1016		struct io_mapped_ubuf	*imu;
   1017
   1018		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
   1019		struct io_buffer	*kbuf;
   1020
   1021		/*
   1022		 * stores buffer ID for ring provided buffers, valid IFF
   1023		 * REQ_F_BUFFER_RING is set.
   1024		 */
   1025		struct io_buffer_list	*buf_list;
   1026	};
   1027
   1028	union {
   1029		/* used by request caches, completion batching and iopoll */
   1030		struct io_wq_work_node	comp_list;
   1031		/* cache ->apoll->events */
   1032		__poll_t apoll_events;
   1033	};
   1034	atomic_t			refs;
   1035	atomic_t			poll_refs;
   1036	struct io_task_work		io_task_work;
   1037	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
   1038	union {
   1039		struct hlist_node	hash_node;
   1040		struct {
   1041			u64		extra1;
   1042			u64		extra2;
   1043		};
   1044	};
   1045	/* internal polling, see IORING_FEAT_FAST_POLL */
   1046	struct async_poll		*apoll;
   1047	/* opcode allocated if it needs to store data for async defer */
   1048	void				*async_data;
   1049	/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
   1050	struct io_kiocb			*link;
   1051	/* custom credentials, valid IFF REQ_F_CREDS is set */
   1052	const struct cred		*creds;
   1053	struct io_wq_work		work;
   1054};
   1055
   1056struct io_tctx_node {
   1057	struct list_head	ctx_node;
   1058	struct task_struct	*task;
   1059	struct io_ring_ctx	*ctx;
   1060};
   1061
   1062struct io_defer_entry {
   1063	struct list_head	list;
   1064	struct io_kiocb		*req;
   1065	u32			seq;
   1066};
   1067
   1068struct io_cancel_data {
   1069	struct io_ring_ctx *ctx;
   1070	union {
   1071		u64 data;
   1072		struct file *file;
   1073	};
   1074	u32 flags;
   1075	int seq;
   1076};
   1077
   1078/*
   1079 * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into
   1080 * the following sqe if SQE128 is used.
   1081 */
   1082#define uring_cmd_pdu_size(is_sqe128)				\
   1083	((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) -	\
   1084		offsetof(struct io_uring_sqe, cmd))
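/*
 * Worked example, assuming the uapi layout where cmd starts at offset 48
 * of the 64-byte SQE: a normal ring leaves 64 - 48 = 16 bytes of inline
 * command payload, while IORING_SETUP_SQE128 leaves 2 * 64 - 48 = 80.
 */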
   1085
   1086struct io_op_def {
   1087	/* needs req->file assigned */
   1088	unsigned		needs_file : 1;
   1089	/* should block plug */
   1090	unsigned		plug : 1;
   1091	/* hash wq insertion if file is a regular file */
   1092	unsigned		hash_reg_file : 1;
   1093	/* unbound wq insertion if file is a non-regular file */
   1094	unsigned		unbound_nonreg_file : 1;
   1095	/* set if opcode supports polled "wait" */
   1096	unsigned		pollin : 1;
   1097	unsigned		pollout : 1;
   1098	unsigned		poll_exclusive : 1;
   1099	/* op supports buffer selection */
   1100	unsigned		buffer_select : 1;
    1101	/* do prep async if it is going to be punted */
   1102	unsigned		needs_async_setup : 1;
   1103	/* opcode is not supported by this kernel */
   1104	unsigned		not_supported : 1;
   1105	/* skip auditing */
   1106	unsigned		audit_skip : 1;
   1107	/* supports ioprio */
   1108	unsigned		ioprio : 1;
   1109	/* supports iopoll */
   1110	unsigned		iopoll : 1;
   1111	/* size of async data needed, if any */
   1112	unsigned short		async_size;
   1113};
   1114
   1115static const struct io_op_def io_op_defs[] = {
   1116	[IORING_OP_NOP] = {
   1117		.audit_skip		= 1,
   1118		.iopoll			= 1,
   1119	},
   1120	[IORING_OP_READV] = {
   1121		.needs_file		= 1,
   1122		.unbound_nonreg_file	= 1,
   1123		.pollin			= 1,
   1124		.buffer_select		= 1,
   1125		.needs_async_setup	= 1,
   1126		.plug			= 1,
   1127		.audit_skip		= 1,
   1128		.ioprio			= 1,
   1129		.iopoll			= 1,
   1130		.async_size		= sizeof(struct io_async_rw),
   1131	},
   1132	[IORING_OP_WRITEV] = {
   1133		.needs_file		= 1,
   1134		.hash_reg_file		= 1,
   1135		.unbound_nonreg_file	= 1,
   1136		.pollout		= 1,
   1137		.needs_async_setup	= 1,
   1138		.plug			= 1,
   1139		.audit_skip		= 1,
   1140		.ioprio			= 1,
   1141		.iopoll			= 1,
   1142		.async_size		= sizeof(struct io_async_rw),
   1143	},
   1144	[IORING_OP_FSYNC] = {
   1145		.needs_file		= 1,
   1146		.audit_skip		= 1,
   1147	},
   1148	[IORING_OP_READ_FIXED] = {
   1149		.needs_file		= 1,
   1150		.unbound_nonreg_file	= 1,
   1151		.pollin			= 1,
   1152		.plug			= 1,
   1153		.audit_skip		= 1,
   1154		.ioprio			= 1,
   1155		.iopoll			= 1,
   1156		.async_size		= sizeof(struct io_async_rw),
   1157	},
   1158	[IORING_OP_WRITE_FIXED] = {
   1159		.needs_file		= 1,
   1160		.hash_reg_file		= 1,
   1161		.unbound_nonreg_file	= 1,
   1162		.pollout		= 1,
   1163		.plug			= 1,
   1164		.audit_skip		= 1,
   1165		.ioprio			= 1,
   1166		.iopoll			= 1,
   1167		.async_size		= sizeof(struct io_async_rw),
   1168	},
   1169	[IORING_OP_POLL_ADD] = {
   1170		.needs_file		= 1,
   1171		.unbound_nonreg_file	= 1,
   1172		.audit_skip		= 1,
   1173	},
   1174	[IORING_OP_POLL_REMOVE] = {
   1175		.audit_skip		= 1,
   1176	},
   1177	[IORING_OP_SYNC_FILE_RANGE] = {
   1178		.needs_file		= 1,
   1179		.audit_skip		= 1,
   1180	},
   1181	[IORING_OP_SENDMSG] = {
   1182		.needs_file		= 1,
   1183		.unbound_nonreg_file	= 1,
   1184		.pollout		= 1,
   1185		.needs_async_setup	= 1,
   1186		.ioprio			= 1,
   1187		.async_size		= sizeof(struct io_async_msghdr),
   1188	},
   1189	[IORING_OP_RECVMSG] = {
   1190		.needs_file		= 1,
   1191		.unbound_nonreg_file	= 1,
   1192		.pollin			= 1,
   1193		.buffer_select		= 1,
   1194		.needs_async_setup	= 1,
   1195		.ioprio			= 1,
   1196		.async_size		= sizeof(struct io_async_msghdr),
   1197	},
   1198	[IORING_OP_TIMEOUT] = {
   1199		.audit_skip		= 1,
   1200		.async_size		= sizeof(struct io_timeout_data),
   1201	},
   1202	[IORING_OP_TIMEOUT_REMOVE] = {
   1203		/* used by timeout updates' prep() */
   1204		.audit_skip		= 1,
   1205	},
   1206	[IORING_OP_ACCEPT] = {
   1207		.needs_file		= 1,
   1208		.unbound_nonreg_file	= 1,
   1209		.pollin			= 1,
   1210		.poll_exclusive		= 1,
   1211		.ioprio			= 1,	/* used for flags */
   1212	},
   1213	[IORING_OP_ASYNC_CANCEL] = {
   1214		.audit_skip		= 1,
   1215	},
   1216	[IORING_OP_LINK_TIMEOUT] = {
   1217		.audit_skip		= 1,
   1218		.async_size		= sizeof(struct io_timeout_data),
   1219	},
   1220	[IORING_OP_CONNECT] = {
   1221		.needs_file		= 1,
   1222		.unbound_nonreg_file	= 1,
   1223		.pollout		= 1,
   1224		.needs_async_setup	= 1,
   1225		.async_size		= sizeof(struct io_async_connect),
   1226	},
   1227	[IORING_OP_FALLOCATE] = {
   1228		.needs_file		= 1,
   1229	},
   1230	[IORING_OP_OPENAT] = {},
   1231	[IORING_OP_CLOSE] = {},
   1232	[IORING_OP_FILES_UPDATE] = {
   1233		.audit_skip		= 1,
   1234		.iopoll			= 1,
   1235	},
   1236	[IORING_OP_STATX] = {
   1237		.audit_skip		= 1,
   1238	},
   1239	[IORING_OP_READ] = {
   1240		.needs_file		= 1,
   1241		.unbound_nonreg_file	= 1,
   1242		.pollin			= 1,
   1243		.buffer_select		= 1,
   1244		.plug			= 1,
   1245		.audit_skip		= 1,
   1246		.ioprio			= 1,
   1247		.iopoll			= 1,
   1248		.async_size		= sizeof(struct io_async_rw),
   1249	},
   1250	[IORING_OP_WRITE] = {
   1251		.needs_file		= 1,
   1252		.hash_reg_file		= 1,
   1253		.unbound_nonreg_file	= 1,
   1254		.pollout		= 1,
   1255		.plug			= 1,
   1256		.audit_skip		= 1,
   1257		.ioprio			= 1,
   1258		.iopoll			= 1,
   1259		.async_size		= sizeof(struct io_async_rw),
   1260	},
   1261	[IORING_OP_FADVISE] = {
   1262		.needs_file		= 1,
   1263		.audit_skip		= 1,
   1264	},
   1265	[IORING_OP_MADVISE] = {},
   1266	[IORING_OP_SEND] = {
   1267		.needs_file		= 1,
   1268		.unbound_nonreg_file	= 1,
   1269		.pollout		= 1,
   1270		.audit_skip		= 1,
   1271		.ioprio			= 1,
   1272	},
   1273	[IORING_OP_RECV] = {
   1274		.needs_file		= 1,
   1275		.unbound_nonreg_file	= 1,
   1276		.pollin			= 1,
   1277		.buffer_select		= 1,
   1278		.audit_skip		= 1,
   1279		.ioprio			= 1,
   1280	},
   1281	[IORING_OP_OPENAT2] = {
   1282	},
   1283	[IORING_OP_EPOLL_CTL] = {
   1284		.unbound_nonreg_file	= 1,
   1285		.audit_skip		= 1,
   1286	},
   1287	[IORING_OP_SPLICE] = {
   1288		.needs_file		= 1,
   1289		.hash_reg_file		= 1,
   1290		.unbound_nonreg_file	= 1,
   1291		.audit_skip		= 1,
   1292	},
   1293	[IORING_OP_PROVIDE_BUFFERS] = {
   1294		.audit_skip		= 1,
   1295		.iopoll			= 1,
   1296	},
   1297	[IORING_OP_REMOVE_BUFFERS] = {
   1298		.audit_skip		= 1,
   1299		.iopoll			= 1,
   1300	},
   1301	[IORING_OP_TEE] = {
   1302		.needs_file		= 1,
   1303		.hash_reg_file		= 1,
   1304		.unbound_nonreg_file	= 1,
   1305		.audit_skip		= 1,
   1306	},
   1307	[IORING_OP_SHUTDOWN] = {
   1308		.needs_file		= 1,
   1309	},
   1310	[IORING_OP_RENAMEAT] = {},
   1311	[IORING_OP_UNLINKAT] = {},
   1312	[IORING_OP_MKDIRAT] = {},
   1313	[IORING_OP_SYMLINKAT] = {},
   1314	[IORING_OP_LINKAT] = {},
   1315	[IORING_OP_MSG_RING] = {
   1316		.needs_file		= 1,
   1317		.iopoll			= 1,
   1318	},
   1319	[IORING_OP_FSETXATTR] = {
   1320		.needs_file = 1
   1321	},
   1322	[IORING_OP_SETXATTR] = {},
   1323	[IORING_OP_FGETXATTR] = {
   1324		.needs_file = 1
   1325	},
   1326	[IORING_OP_GETXATTR] = {},
   1327	[IORING_OP_SOCKET] = {
   1328		.audit_skip		= 1,
   1329	},
   1330	[IORING_OP_URING_CMD] = {
   1331		.needs_file		= 1,
   1332		.plug			= 1,
   1333		.needs_async_setup	= 1,
   1334		.async_size		= uring_cmd_pdu_size(1),
   1335	},
   1336};
   1337
   1338/* requests with any of those set should undergo io_disarm_next() */
   1339#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
   1340#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
   1341
   1342static bool io_disarm_next(struct io_kiocb *req);
   1343static void io_uring_del_tctx_node(unsigned long index);
   1344static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
   1345					 struct task_struct *task,
   1346					 bool cancel_all);
   1347static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
   1348
   1349static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags);
   1350static void io_dismantle_req(struct io_kiocb *req);
   1351static void io_queue_linked_timeout(struct io_kiocb *req);
   1352static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
   1353				     struct io_uring_rsrc_update2 *up,
   1354				     unsigned nr_args);
   1355static void io_clean_op(struct io_kiocb *req);
   1356static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
   1357					     unsigned issue_flags);
   1358static struct file *io_file_get_normal(struct io_kiocb *req, int fd);
   1359static void io_queue_sqe(struct io_kiocb *req);
   1360static void io_rsrc_put_work(struct work_struct *work);
   1361
   1362static void io_req_task_queue(struct io_kiocb *req);
   1363static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
   1364static int io_req_prep_async(struct io_kiocb *req);
   1365
   1366static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
   1367				 unsigned int issue_flags, u32 slot_index);
   1368static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags,
   1369			    unsigned int offset);
   1370static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
   1371
   1372static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
   1373static void io_eventfd_signal(struct io_ring_ctx *ctx);
   1374static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
   1375
   1376static struct kmem_cache *req_cachep;
   1377
   1378static const struct file_operations io_uring_fops;
   1379
   1380const char *io_uring_get_opcode(u8 opcode)
   1381{
   1382	switch ((enum io_uring_op)opcode) {
   1383	case IORING_OP_NOP:
   1384		return "NOP";
   1385	case IORING_OP_READV:
   1386		return "READV";
   1387	case IORING_OP_WRITEV:
   1388		return "WRITEV";
   1389	case IORING_OP_FSYNC:
   1390		return "FSYNC";
   1391	case IORING_OP_READ_FIXED:
   1392		return "READ_FIXED";
   1393	case IORING_OP_WRITE_FIXED:
   1394		return "WRITE_FIXED";
   1395	case IORING_OP_POLL_ADD:
   1396		return "POLL_ADD";
   1397	case IORING_OP_POLL_REMOVE:
   1398		return "POLL_REMOVE";
   1399	case IORING_OP_SYNC_FILE_RANGE:
   1400		return "SYNC_FILE_RANGE";
   1401	case IORING_OP_SENDMSG:
   1402		return "SENDMSG";
   1403	case IORING_OP_RECVMSG:
   1404		return "RECVMSG";
   1405	case IORING_OP_TIMEOUT:
   1406		return "TIMEOUT";
   1407	case IORING_OP_TIMEOUT_REMOVE:
   1408		return "TIMEOUT_REMOVE";
   1409	case IORING_OP_ACCEPT:
   1410		return "ACCEPT";
   1411	case IORING_OP_ASYNC_CANCEL:
   1412		return "ASYNC_CANCEL";
   1413	case IORING_OP_LINK_TIMEOUT:
   1414		return "LINK_TIMEOUT";
   1415	case IORING_OP_CONNECT:
   1416		return "CONNECT";
   1417	case IORING_OP_FALLOCATE:
   1418		return "FALLOCATE";
   1419	case IORING_OP_OPENAT:
   1420		return "OPENAT";
   1421	case IORING_OP_CLOSE:
   1422		return "CLOSE";
   1423	case IORING_OP_FILES_UPDATE:
   1424		return "FILES_UPDATE";
   1425	case IORING_OP_STATX:
   1426		return "STATX";
   1427	case IORING_OP_READ:
   1428		return "READ";
   1429	case IORING_OP_WRITE:
   1430		return "WRITE";
   1431	case IORING_OP_FADVISE:
   1432		return "FADVISE";
   1433	case IORING_OP_MADVISE:
   1434		return "MADVISE";
   1435	case IORING_OP_SEND:
   1436		return "SEND";
   1437	case IORING_OP_RECV:
   1438		return "RECV";
   1439	case IORING_OP_OPENAT2:
   1440		return "OPENAT2";
   1441	case IORING_OP_EPOLL_CTL:
   1442		return "EPOLL_CTL";
   1443	case IORING_OP_SPLICE:
   1444		return "SPLICE";
   1445	case IORING_OP_PROVIDE_BUFFERS:
   1446		return "PROVIDE_BUFFERS";
   1447	case IORING_OP_REMOVE_BUFFERS:
   1448		return "REMOVE_BUFFERS";
   1449	case IORING_OP_TEE:
   1450		return "TEE";
   1451	case IORING_OP_SHUTDOWN:
   1452		return "SHUTDOWN";
   1453	case IORING_OP_RENAMEAT:
   1454		return "RENAMEAT";
   1455	case IORING_OP_UNLINKAT:
   1456		return "UNLINKAT";
   1457	case IORING_OP_MKDIRAT:
   1458		return "MKDIRAT";
   1459	case IORING_OP_SYMLINKAT:
   1460		return "SYMLINKAT";
   1461	case IORING_OP_LINKAT:
   1462		return "LINKAT";
   1463	case IORING_OP_MSG_RING:
   1464		return "MSG_RING";
   1465	case IORING_OP_FSETXATTR:
   1466		return "FSETXATTR";
   1467	case IORING_OP_SETXATTR:
   1468		return "SETXATTR";
   1469	case IORING_OP_FGETXATTR:
   1470		return "FGETXATTR";
   1471	case IORING_OP_GETXATTR:
   1472		return "GETXATTR";
   1473	case IORING_OP_SOCKET:
   1474		return "SOCKET";
   1475	case IORING_OP_URING_CMD:
   1476		return "URING_CMD";
   1477	case IORING_OP_LAST:
   1478		return "INVALID";
   1479	}
   1480	return "INVALID";
   1481}
   1482
   1483struct sock *io_uring_get_socket(struct file *file)
   1484{
   1485#if defined(CONFIG_UNIX)
   1486	if (file->f_op == &io_uring_fops) {
   1487		struct io_ring_ctx *ctx = file->private_data;
   1488
   1489		return ctx->ring_sock->sk;
   1490	}
   1491#endif
   1492	return NULL;
   1493}
   1494EXPORT_SYMBOL(io_uring_get_socket);
   1495
   1496#if defined(CONFIG_UNIX)
   1497static inline bool io_file_need_scm(struct file *filp)
   1498{
   1499#if defined(IO_URING_SCM_ALL)
   1500	return true;
   1501#else
   1502	return !!unix_get_socket(filp);
   1503#endif
   1504}
   1505#else
   1506static inline bool io_file_need_scm(struct file *filp)
   1507{
   1508	return false;
   1509}
   1510#endif
   1511
   1512static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags)
   1513{
   1514	lockdep_assert_held(&ctx->uring_lock);
   1515	if (issue_flags & IO_URING_F_UNLOCKED)
   1516		mutex_unlock(&ctx->uring_lock);
   1517}
   1518
   1519static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags)
   1520{
   1521	/*
   1522	 * "Normal" inline submissions always hold the uring_lock, since we
   1523	 * grab it from the system call. Same is true for the SQPOLL offload.
   1524	 * The only exception is when we've detached the request and issue it
   1525	 * from an async worker thread, grab the lock for that case.
   1526	 */
   1527	if (issue_flags & IO_URING_F_UNLOCKED)
   1528		mutex_lock(&ctx->uring_lock);
   1529	lockdep_assert_held(&ctx->uring_lock);
   1530}
   1531
   1532static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
   1533{
   1534	if (!*locked) {
   1535		mutex_lock(&ctx->uring_lock);
   1536		*locked = true;
   1537	}
   1538}
   1539
   1540#define io_for_each_link(pos, head) \
   1541	for (pos = (head); pos; pos = pos->link)
   1542
   1543/*
   1544 * Shamelessly stolen from the mm implementation of page reference checking,
   1545 * see commit f958d7b528b1 for details.
   1546 */
   1547#define req_ref_zero_or_close_to_overflow(req)	\
   1548	((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
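/*
 * The check above relies on unsigned wrap-around: adding 127u maps a
 * refcount of 0 or of -1..-127 (i.e. already freed, or underflowed/close
 * to overflow) into the range 0..127, so one comparison catches both
 * "already zero" and "dangerously negative" states.
 */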
   1549
   1550static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
   1551{
   1552	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
   1553	return atomic_inc_not_zero(&req->refs);
   1554}
   1555
   1556static inline bool req_ref_put_and_test(struct io_kiocb *req)
   1557{
   1558	if (likely(!(req->flags & REQ_F_REFCOUNT)))
   1559		return true;
   1560
   1561	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
   1562	return atomic_dec_and_test(&req->refs);
   1563}
   1564
   1565static inline void req_ref_get(struct io_kiocb *req)
   1566{
   1567	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
   1568	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
   1569	atomic_inc(&req->refs);
   1570}
   1571
   1572static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
   1573{
   1574	if (!wq_list_empty(&ctx->submit_state.compl_reqs))
   1575		__io_submit_flush_completions(ctx);
   1576}
   1577
   1578static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
   1579{
   1580	if (!(req->flags & REQ_F_REFCOUNT)) {
   1581		req->flags |= REQ_F_REFCOUNT;
   1582		atomic_set(&req->refs, nr);
   1583	}
   1584}
   1585
   1586static inline void io_req_set_refcount(struct io_kiocb *req)
   1587{
   1588	__io_req_set_refcount(req, 1);
   1589}
   1590
   1591#define IO_RSRC_REF_BATCH	100
   1592
   1593static void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
   1594{
   1595	percpu_ref_put_many(&node->refs, nr);
   1596}
   1597
   1598static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
   1599					  struct io_ring_ctx *ctx)
   1600	__must_hold(&ctx->uring_lock)
   1601{
   1602	struct io_rsrc_node *node = req->rsrc_node;
   1603
   1604	if (node) {
   1605		if (node == ctx->rsrc_node)
   1606			ctx->rsrc_cached_refs++;
   1607		else
   1608			io_rsrc_put_node(node, 1);
   1609	}
   1610}
   1611
   1612static inline void io_req_put_rsrc(struct io_kiocb *req)
   1613{
   1614	if (req->rsrc_node)
   1615		io_rsrc_put_node(req->rsrc_node, 1);
   1616}
   1617
   1618static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
   1619	__must_hold(&ctx->uring_lock)
   1620{
   1621	if (ctx->rsrc_cached_refs) {
   1622		io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
   1623		ctx->rsrc_cached_refs = 0;
   1624	}
   1625}
   1626
   1627static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
   1628	__must_hold(&ctx->uring_lock)
   1629{
   1630	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
   1631	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
   1632}
   1633
   1634static inline void io_req_set_rsrc_node(struct io_kiocb *req,
   1635					struct io_ring_ctx *ctx,
   1636					unsigned int issue_flags)
   1637{
   1638	if (!req->rsrc_node) {
   1639		req->rsrc_node = ctx->rsrc_node;
   1640
   1641		if (!(issue_flags & IO_URING_F_UNLOCKED)) {
   1642			lockdep_assert_held(&ctx->uring_lock);
   1643			ctx->rsrc_cached_refs--;
   1644			if (unlikely(ctx->rsrc_cached_refs < 0))
   1645				io_rsrc_refs_refill(ctx);
   1646		} else {
   1647			percpu_ref_get(&req->rsrc_node->refs);
   1648		}
   1649	}
   1650}
   1651
   1652static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
   1653{
   1654	if (req->flags & REQ_F_BUFFER_RING) {
   1655		if (req->buf_list)
   1656			req->buf_list->head++;
   1657		req->flags &= ~REQ_F_BUFFER_RING;
   1658	} else {
   1659		list_add(&req->kbuf->list, list);
   1660		req->flags &= ~REQ_F_BUFFER_SELECTED;
   1661	}
   1662
   1663	return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
   1664}
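/*
 * The cflags value built above is what the application later sees in
 * cqe->flags; a sketch of the decode side using only uapi constants
 * (example_cqe_buffer_id() is a hypothetical helper, not liburing API):
 */
static inline unsigned int example_cqe_buffer_id(const struct io_uring_cqe *cqe)
{
	/* only meaningful when IORING_CQE_F_BUFFER is set in cqe->flags */
	return cqe->flags >> IORING_CQE_BUFFER_SHIFT;
}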
   1665
   1666static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
   1667{
   1668	lockdep_assert_held(&req->ctx->completion_lock);
   1669
   1670	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
   1671		return 0;
   1672	return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
   1673}
   1674
   1675static inline unsigned int io_put_kbuf(struct io_kiocb *req,
   1676				       unsigned issue_flags)
   1677{
   1678	unsigned int cflags;
   1679
   1680	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
   1681		return 0;
   1682
   1683	/*
   1684	 * We can add this buffer back to two lists:
   1685	 *
   1686	 * 1) The io_buffers_cache list. This one is protected by the
   1687	 *    ctx->uring_lock. If we already hold this lock, add back to this
   1688	 *    list as we can grab it from issue as well.
   1689	 * 2) The io_buffers_comp list. This one is protected by the
   1690	 *    ctx->completion_lock.
   1691	 *
   1692	 * We migrate buffers from the comp_list to the issue cache list
   1693	 * when we need one.
   1694	 */
   1695	if (req->flags & REQ_F_BUFFER_RING) {
   1696		/* no buffers to recycle for this case */
   1697		cflags = __io_put_kbuf(req, NULL);
   1698	} else if (issue_flags & IO_URING_F_UNLOCKED) {
   1699		struct io_ring_ctx *ctx = req->ctx;
   1700
   1701		spin_lock(&ctx->completion_lock);
   1702		cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
   1703		spin_unlock(&ctx->completion_lock);
   1704	} else {
   1705		lockdep_assert_held(&req->ctx->uring_lock);
   1706
   1707		cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
   1708	}
   1709
   1710	return cflags;
   1711}
   1712
   1713static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
   1714						 unsigned int bgid)
   1715{
   1716	if (ctx->io_bl && bgid < BGID_ARRAY)
   1717		return &ctx->io_bl[bgid];
   1718
   1719	return xa_load(&ctx->io_bl_xa, bgid);
   1720}
   1721
   1722static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
   1723{
   1724	struct io_ring_ctx *ctx = req->ctx;
   1725	struct io_buffer_list *bl;
   1726	struct io_buffer *buf;
   1727
   1728	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
   1729		return;
   1730	/*
   1731	 * For legacy provided buffer mode, don't recycle if we already did
   1732	 * IO to this buffer. For ring-mapped provided buffer mode, we should
   1733	 * increment ring->head to explicitly monopolize the buffer to avoid
   1734	 * multiple use.
   1735	 */
   1736	if ((req->flags & REQ_F_BUFFER_SELECTED) &&
   1737	    (req->flags & REQ_F_PARTIAL_IO))
   1738		return;
   1739
   1740	/*
   1741	 * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
   1742	 * the flag and hence ensure that bl->head doesn't get incremented.
   1743	 * If the tail has already been incremented, hang on to it.
   1744	 */
   1745	if (req->flags & REQ_F_BUFFER_RING) {
   1746		if (req->buf_list) {
   1747			if (req->flags & REQ_F_PARTIAL_IO) {
   1748				req->buf_list->head++;
   1749				req->buf_list = NULL;
   1750			} else {
   1751				req->buf_index = req->buf_list->bgid;
   1752				req->flags &= ~REQ_F_BUFFER_RING;
   1753			}
   1754		}
   1755		return;
   1756	}
   1757
   1758	io_ring_submit_lock(ctx, issue_flags);
   1759
   1760	buf = req->kbuf;
   1761	bl = io_buffer_get_list(ctx, buf->bgid);
   1762	list_add(&buf->list, &bl->buf_list);
   1763	req->flags &= ~REQ_F_BUFFER_SELECTED;
   1764	req->buf_index = buf->bgid;
   1765
   1766	io_ring_submit_unlock(ctx, issue_flags);
   1767}
   1768
   1769static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
   1770			  bool cancel_all)
   1771	__must_hold(&req->ctx->timeout_lock)
   1772{
   1773	struct io_kiocb *req;
   1774
   1775	if (task && head->task != task)
   1776		return false;
   1777	if (cancel_all)
   1778		return true;
   1779
   1780	io_for_each_link(req, head) {
   1781		if (req->flags & REQ_F_INFLIGHT)
   1782			return true;
   1783	}
   1784	return false;
   1785}
   1786
   1787static bool io_match_linked(struct io_kiocb *head)
   1788{
   1789	struct io_kiocb *req;
   1790
   1791	io_for_each_link(req, head) {
   1792		if (req->flags & REQ_F_INFLIGHT)
   1793			return true;
   1794	}
   1795	return false;
   1796}
   1797
   1798/*
   1799 * As io_match_task() but protected against racing with linked timeouts.
   1800 * User must not hold timeout_lock.
   1801 */
   1802static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
   1803			       bool cancel_all)
   1804{
   1805	bool matched;
   1806
   1807	if (task && head->task != task)
   1808		return false;
   1809	if (cancel_all)
   1810		return true;
   1811
   1812	if (head->flags & REQ_F_LINK_TIMEOUT) {
   1813		struct io_ring_ctx *ctx = head->ctx;
   1814
   1815		/* protect against races with linked timeouts */
   1816		spin_lock_irq(&ctx->timeout_lock);
   1817		matched = io_match_linked(head);
   1818		spin_unlock_irq(&ctx->timeout_lock);
   1819	} else {
   1820		matched = io_match_linked(head);
   1821	}
   1822	return matched;
   1823}
   1824
   1825static inline bool req_has_async_data(struct io_kiocb *req)
   1826{
   1827	return req->flags & REQ_F_ASYNC_DATA;
   1828}
   1829
   1830static inline void req_set_fail(struct io_kiocb *req)
   1831{
   1832	req->flags |= REQ_F_FAIL;
   1833	if (req->flags & REQ_F_CQE_SKIP) {
   1834		req->flags &= ~REQ_F_CQE_SKIP;
   1835		req->flags |= REQ_F_SKIP_LINK_CQES;
   1836	}
   1837}
   1838
   1839static inline void req_fail_link_node(struct io_kiocb *req, int res)
   1840{
   1841	req_set_fail(req);
   1842	req->cqe.res = res;
   1843}
   1844
   1845static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
   1846{
   1847	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
   1848}
   1849
   1850static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
   1851{
   1852	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
   1853
   1854	complete(&ctx->ref_comp);
   1855}
   1856
   1857static inline bool io_is_timeout_noseq(struct io_kiocb *req)
   1858{
   1859	return !req->timeout.off;
   1860}
   1861
   1862static __cold void io_fallback_req_func(struct work_struct *work)
   1863{
   1864	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
   1865						fallback_work.work);
   1866	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
   1867	struct io_kiocb *req, *tmp;
   1868	bool locked = false;
   1869
   1870	percpu_ref_get(&ctx->refs);
   1871	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
   1872		req->io_task_work.func(req, &locked);
   1873
   1874	if (locked) {
   1875		io_submit_flush_completions(ctx);
   1876		mutex_unlock(&ctx->uring_lock);
   1877	}
   1878	percpu_ref_put(&ctx->refs);
   1879}
   1880
   1881static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
   1882{
   1883	struct io_ring_ctx *ctx;
   1884	int hash_bits;
   1885
   1886	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
   1887	if (!ctx)
   1888		return NULL;
   1889
   1890	xa_init(&ctx->io_bl_xa);
   1891
   1892	/*
   1893	 * Use 5 bits less than the max cq entries; that should give us around
   1894	 * 32 entries per hash list if totally full and uniformly spread.
   1895	 */
   1896	hash_bits = ilog2(p->cq_entries);
   1897	hash_bits -= 5;
   1898	if (hash_bits <= 0)
   1899		hash_bits = 1;
   1900	ctx->cancel_hash_bits = hash_bits;
   1901	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
   1902					GFP_KERNEL);
   1903	if (!ctx->cancel_hash)
   1904		goto err;
   1905	__hash_init(ctx->cancel_hash, 1U << hash_bits);
   1906
   1907	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
   1908	if (!ctx->dummy_ubuf)
   1909		goto err;
   1910	/* set invalid range, so io_import_fixed() fails when it meets it */
   1911	ctx->dummy_ubuf->ubuf = -1UL;
   1912
   1913	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
   1914			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
   1915		goto err;
   1916
   1917	ctx->flags = p->flags;
   1918	init_waitqueue_head(&ctx->sqo_sq_wait);
   1919	INIT_LIST_HEAD(&ctx->sqd_list);
   1920	INIT_LIST_HEAD(&ctx->cq_overflow_list);
   1921	INIT_LIST_HEAD(&ctx->io_buffers_cache);
   1922	INIT_LIST_HEAD(&ctx->apoll_cache);
   1923	init_completion(&ctx->ref_comp);
   1924	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
   1925	mutex_init(&ctx->uring_lock);
   1926	init_waitqueue_head(&ctx->cq_wait);
   1927	spin_lock_init(&ctx->completion_lock);
   1928	spin_lock_init(&ctx->timeout_lock);
   1929	INIT_WQ_LIST(&ctx->iopoll_list);
   1930	INIT_LIST_HEAD(&ctx->io_buffers_pages);
   1931	INIT_LIST_HEAD(&ctx->io_buffers_comp);
   1932	INIT_LIST_HEAD(&ctx->defer_list);
   1933	INIT_LIST_HEAD(&ctx->timeout_list);
   1934	INIT_LIST_HEAD(&ctx->ltimeout_list);
   1935	spin_lock_init(&ctx->rsrc_ref_lock);
   1936	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
   1937	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
   1938	init_llist_head(&ctx->rsrc_put_llist);
   1939	INIT_LIST_HEAD(&ctx->tctx_list);
   1940	ctx->submit_state.free_list.next = NULL;
   1941	INIT_WQ_LIST(&ctx->locked_free_list);
   1942	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
   1943	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
   1944	return ctx;
   1945err:
   1946	kfree(ctx->dummy_ubuf);
   1947	kfree(ctx->cancel_hash);
   1948	kfree(ctx->io_bl);
   1949	xa_destroy(&ctx->io_bl_xa);
   1950	kfree(ctx);
   1951	return NULL;
   1952}
   1953
   1954static void io_account_cq_overflow(struct io_ring_ctx *ctx)
   1955{
   1956	struct io_rings *r = ctx->rings;
   1957
   1958	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
   1959	ctx->cq_extra--;
   1960}
   1961
   1962static bool req_need_defer(struct io_kiocb *req, u32 seq)
   1963{
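       	/*
       	 * A drained request is held back until every request submitted
       	 * before it has posted its completion; ->cq_extra adjusts for CQEs
       	 * that were posted without a corresponding SQE.
       	 */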
   1964	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
   1965		struct io_ring_ctx *ctx = req->ctx;
   1966
   1967		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
   1968	}
   1969
   1970	return false;
   1971}
   1972
   1973static inline bool io_req_ffs_set(struct io_kiocb *req)
   1974{
   1975	return req->flags & REQ_F_FIXED_FILE;
   1976}
   1977
   1978static inline void io_req_track_inflight(struct io_kiocb *req)
   1979{
   1980	if (!(req->flags & REQ_F_INFLIGHT)) {
   1981		req->flags |= REQ_F_INFLIGHT;
   1982		atomic_inc(&req->task->io_uring->inflight_tracked);
   1983	}
   1984}
   1985
   1986static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
   1987{
   1988	if (WARN_ON_ONCE(!req->link))
   1989		return NULL;
   1990
   1991	req->flags &= ~REQ_F_ARM_LTIMEOUT;
   1992	req->flags |= REQ_F_LINK_TIMEOUT;
   1993
   1994	/* linked timeouts should have two refs once prep'ed */
   1995	io_req_set_refcount(req);
   1996	__io_req_set_refcount(req->link, 2);
   1997	return req->link;
   1998}
   1999
   2000static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
   2001{
   2002	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
   2003		return NULL;
   2004	return __io_prep_linked_timeout(req);
   2005}
   2006
   2007static noinline void __io_arm_ltimeout(struct io_kiocb *req)
   2008{
   2009	io_queue_linked_timeout(__io_prep_linked_timeout(req));
   2010}
   2011
   2012static inline void io_arm_ltimeout(struct io_kiocb *req)
   2013{
   2014	if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
   2015		__io_arm_ltimeout(req);
   2016}
   2017
   2018static void io_prep_async_work(struct io_kiocb *req)
   2019{
   2020	const struct io_op_def *def = &io_op_defs[req->opcode];
   2021	struct io_ring_ctx *ctx = req->ctx;
   2022
   2023	if (!(req->flags & REQ_F_CREDS)) {
   2024		req->flags |= REQ_F_CREDS;
   2025		req->creds = get_current_cred();
   2026	}
   2027
   2028	req->work.list.next = NULL;
   2029	req->work.flags = 0;
   2030	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
   2031	if (req->flags & REQ_F_FORCE_ASYNC)
   2032		req->work.flags |= IO_WQ_WORK_CONCURRENT;
   2033
   2034	if (req->flags & REQ_F_ISREG) {
   2035		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
   2036			io_wq_hash_work(&req->work, file_inode(req->file));
   2037	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
   2038		if (def->unbound_nonreg_file)
   2039			req->work.flags |= IO_WQ_WORK_UNBOUND;
   2040	}
   2041}
   2042
   2043static void io_prep_async_link(struct io_kiocb *req)
   2044{
   2045	struct io_kiocb *cur;
   2046
   2047	if (req->flags & REQ_F_LINK_TIMEOUT) {
   2048		struct io_ring_ctx *ctx = req->ctx;
   2049
   2050		spin_lock_irq(&ctx->timeout_lock);
   2051		io_for_each_link(cur, req)
   2052			io_prep_async_work(cur);
   2053		spin_unlock_irq(&ctx->timeout_lock);
   2054	} else {
   2055		io_for_each_link(cur, req)
   2056			io_prep_async_work(cur);
   2057	}
   2058}
   2059
   2060static inline void io_req_add_compl_list(struct io_kiocb *req)
   2061{
   2062	struct io_submit_state *state = &req->ctx->submit_state;
   2063
   2064	if (!(req->flags & REQ_F_CQE_SKIP))
   2065		state->flush_cqes = true;
   2066	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
   2067}
   2068
   2069static void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
   2070{
   2071	struct io_kiocb *link = io_prep_linked_timeout(req);
   2072	struct io_uring_task *tctx = req->task->io_uring;
   2073
   2074	BUG_ON(!tctx);
   2075	BUG_ON(!tctx->io_wq);
   2076
   2077	/* init ->work of the whole link before punting */
   2078	io_prep_async_link(req);
   2079
   2080	/*
   2081	 * Not expected to happen, but if we do have a bug where this _can_
   2082	 * happen, catch it here and ensure the request is marked as
   2083	 * canceled. That will make io-wq go through the usual work cancel
   2084	 * procedure rather than attempt to run this request (or create a new
   2085	 * worker for it).
   2086	 */
   2087	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
   2088		req->work.flags |= IO_WQ_WORK_CANCEL;
   2089
   2090	trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data,
   2091					req->opcode, req->flags, &req->work,
   2092					io_wq_is_hashed(&req->work));
   2093	io_wq_enqueue(tctx->io_wq, &req->work);
   2094	if (link)
   2095		io_queue_linked_timeout(link);
   2096}
   2097
   2098static void io_kill_timeout(struct io_kiocb *req, int status)
   2099	__must_hold(&req->ctx->completion_lock)
   2100	__must_hold(&req->ctx->timeout_lock)
   2101{
   2102	struct io_timeout_data *io = req->async_data;
   2103
   2104	if (hrtimer_try_to_cancel(&io->timer) != -1) {
   2105		if (status)
   2106			req_set_fail(req);
   2107		atomic_set(&req->ctx->cq_timeouts,
   2108			atomic_read(&req->ctx->cq_timeouts) + 1);
   2109		list_del_init(&req->timeout.list);
   2110		io_req_tw_post_queue(req, status, 0);
   2111	}
   2112}
   2113
   2114static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
   2115{
   2116	while (!list_empty(&ctx->defer_list)) {
   2117		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
   2118						struct io_defer_entry, list);
   2119
   2120		if (req_need_defer(de->req, de->seq))
   2121			break;
   2122		list_del_init(&de->list);
   2123		io_req_task_queue(de->req);
   2124		kfree(de);
   2125	}
   2126}
   2127
   2128static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
   2129	__must_hold(&ctx->completion_lock)
   2130{
   2131	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
   2132	struct io_kiocb *req, *tmp;
   2133
   2134	spin_lock_irq(&ctx->timeout_lock);
   2135	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
   2136		u32 events_needed, events_got;
   2137
   2138		if (io_is_timeout_noseq(req))
   2139			break;
   2140
   2141		/*
   2142		 * Since seq can easily wrap around over time, subtract
   2143		 * the last seq at which timeouts were flushed before comparing.
   2144		 * Assuming not more than 2^31-1 events have happened since,
   2145		 * these subtractions won't have wrapped, so we can check if
   2146		 * target is in [last_seq, current_seq] by comparing the two.
   2147		 */
   2148		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
   2149		events_got = seq - ctx->cq_last_tm_flush;
   2150		if (events_got < events_needed)
   2151			break;
   2152
   2153		io_kill_timeout(req, 0);
   2154	}
   2155	ctx->cq_last_tm_flush = seq;
   2156	spin_unlock_irq(&ctx->timeout_lock);
   2157}
   2158
   2159static inline void io_commit_cqring(struct io_ring_ctx *ctx)
   2160{
   2161	/* order cqe stores with ring update */
   2162	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
   2163}
   2164
   2165static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
   2166{
   2167	if (ctx->off_timeout_used || ctx->drain_active) {
   2168		spin_lock(&ctx->completion_lock);
   2169		if (ctx->off_timeout_used)
   2170			io_flush_timeouts(ctx);
   2171		if (ctx->drain_active)
   2172			io_queue_deferred(ctx);
   2173		io_commit_cqring(ctx);
   2174		spin_unlock(&ctx->completion_lock);
   2175	}
   2176	if (ctx->has_evfd)
   2177		io_eventfd_signal(ctx);
   2178}
   2179
   2180static inline bool io_sqring_full(struct io_ring_ctx *ctx)
   2181{
   2182	struct io_rings *r = ctx->rings;
   2183
   2184	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
   2185}
   2186
   2187static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
   2188{
   2189	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
   2190}
   2191
   2192/*
   2193 * writes to the cq entry need to come after reading head; the
   2194 * control dependency is enough as we're using WRITE_ONCE to
   2195 * fill the cq entry
   2196 */
   2197static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
   2198{
   2199	struct io_rings *rings = ctx->rings;
   2200	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
   2201	unsigned int shift = 0;
   2202	unsigned int free, queued, len;
   2203
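       	/*
       	 * With IORING_SETUP_CQE32 each CQE occupies two slots in the cqes
       	 * array, hence the doubled index on the return below.
       	 */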
   2204	if (ctx->flags & IORING_SETUP_CQE32)
   2205		shift = 1;
   2206
   2207	/* userspace may cheat by modifying the tail, be safe and do min */
   2208	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
   2209	free = ctx->cq_entries - queued;
   2210	/* we need a contiguous range, limit based on the current array offset */
   2211	len = min(free, ctx->cq_entries - off);
   2212	if (!len)
   2213		return NULL;
   2214
   2215	ctx->cached_cq_tail++;
   2216	ctx->cqe_cached = &rings->cqes[off];
   2217	ctx->cqe_sentinel = ctx->cqe_cached + len;
   2218	ctx->cqe_cached++;
   2219	return &rings->cqes[off << shift];
   2220}
   2221
   2222static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
   2223{
   2224	if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
   2225		struct io_uring_cqe *cqe = ctx->cqe_cached;
   2226
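       		/*
       		 * The cached pointer advances one array slot per CQE, but
       		 * 32-byte CQEs occupy two slots, so double the offset.
       		 */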
   2227		if (ctx->flags & IORING_SETUP_CQE32) {
   2228			unsigned int off = ctx->cqe_cached - ctx->rings->cqes;
   2229
   2230			cqe += off;
   2231		}
   2232
   2233		ctx->cached_cq_tail++;
   2234		ctx->cqe_cached++;
   2235		return cqe;
   2236	}
   2237
   2238	return __io_get_cqe(ctx);
   2239}
   2240
   2241static void io_eventfd_signal(struct io_ring_ctx *ctx)
   2242{
   2243	struct io_ev_fd *ev_fd;
   2244
   2245	rcu_read_lock();
   2246	/*
   2247	 * rcu_dereference ctx->io_ev_fd once and use it both for the check
   2248	 * and for eventfd_signal()
   2249	 */
   2250	ev_fd = rcu_dereference(ctx->io_ev_fd);
   2251
   2252	/*
   2253	 * Check again if ev_fd exists in case an io_eventfd_unregister call
   2254	 * completed between the NULL check of ctx->io_ev_fd at the start of
   2255	 * the function and rcu_read_lock.
   2256	 */
   2257	if (unlikely(!ev_fd))
   2258		goto out;
   2259	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
   2260		goto out;
   2261
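       	/*
       	 * An eventfd registered as eventfd_async should only be signalled
       	 * for completions that happen from io-wq (async) context.
       	 */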
   2262	if (!ev_fd->eventfd_async || io_wq_current_is_worker())
   2263		eventfd_signal(ev_fd->cq_ev_fd, 1);
   2264out:
   2265	rcu_read_unlock();
   2266}
   2267
   2268static inline void io_cqring_wake(struct io_ring_ctx *ctx)
   2269{
   2270	/*
   2271	 * wake_up_all() may seem excessive, but io_wake_function() and
   2272	 * io_should_wake() handle the termination of the loop and only
   2273	 * wake as many waiters as we need to.
   2274	 */
   2275	if (wq_has_sleeper(&ctx->cq_wait))
   2276		wake_up_all(&ctx->cq_wait);
   2277}
   2278
   2279/*
   2280 * This should only get called when at least one event has been posted.
   2281 * Some applications rely on the eventfd notification count only changing
   2282 * IFF a new CQE has been added to the CQ ring. There is no 1:1
   2283 * relationship between how many times this function is called (and
   2284 * hence the eventfd count) and the number of CQEs posted to the CQ ring.
   2285 */
   2286static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
   2287{
   2288	if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
   2289		     ctx->has_evfd))
   2290		__io_commit_cqring_flush(ctx);
   2291
   2292	io_cqring_wake(ctx);
   2293}
   2294
   2295static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
   2296{
   2297	if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
   2298		     ctx->has_evfd))
   2299		__io_commit_cqring_flush(ctx);
   2300
   2301	if (ctx->flags & IORING_SETUP_SQPOLL)
   2302		io_cqring_wake(ctx);
   2303}
   2304
   2305/* Returns true if there are no backlogged entries after the flush */
   2306static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
   2307{
   2308	bool all_flushed, posted;
   2309	size_t cqe_size = sizeof(struct io_uring_cqe);
   2310
   2311	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
   2312		return false;
   2313
   2314	if (ctx->flags & IORING_SETUP_CQE32)
   2315		cqe_size <<= 1;
   2316
   2317	posted = false;
   2318	spin_lock(&ctx->completion_lock);
   2319	while (!list_empty(&ctx->cq_overflow_list)) {
   2320		struct io_uring_cqe *cqe = io_get_cqe(ctx);
   2321		struct io_overflow_cqe *ocqe;
   2322
   2323		if (!cqe && !force)
   2324			break;
   2325		ocqe = list_first_entry(&ctx->cq_overflow_list,
   2326					struct io_overflow_cqe, list);
   2327		if (cqe)
   2328			memcpy(cqe, &ocqe->cqe, cqe_size);
   2329		else
   2330			io_account_cq_overflow(ctx);
   2331
   2332		posted = true;
   2333		list_del(&ocqe->list);
   2334		kfree(ocqe);
   2335	}
   2336
   2337	all_flushed = list_empty(&ctx->cq_overflow_list);
   2338	if (all_flushed) {
   2339		clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
   2340		atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
   2341	}
   2342
   2343	io_commit_cqring(ctx);
   2344	spin_unlock(&ctx->completion_lock);
   2345	if (posted)
   2346		io_cqring_ev_posted(ctx);
   2347	return all_flushed;
   2348}
   2349
   2350static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
   2351{
   2352	bool ret = true;
   2353
   2354	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
   2355		/* iopoll syncs against uring_lock, not completion_lock */
   2356		if (ctx->flags & IORING_SETUP_IOPOLL)
   2357			mutex_lock(&ctx->uring_lock);
   2358		ret = __io_cqring_overflow_flush(ctx, false);
   2359		if (ctx->flags & IORING_SETUP_IOPOLL)
   2360			mutex_unlock(&ctx->uring_lock);
   2361	}
   2362
   2363	return ret;
   2364}
   2365
   2366static void __io_put_task(struct task_struct *task, int nr)
   2367{
   2368	struct io_uring_task *tctx = task->io_uring;
   2369
   2370	percpu_counter_sub(&tctx->inflight, nr);
   2371	if (unlikely(atomic_read(&tctx->in_idle)))
   2372		wake_up(&tctx->wait);
   2373	put_task_struct_many(task, nr);
   2374}
   2375
   2376/* must be called somewhat shortly after putting a request */
   2377static inline void io_put_task(struct task_struct *task, int nr)
   2378{
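       	/*
       	 * Refs dropped by the owning task go back to its local cache,
       	 * avoiding atomics in the common case.
       	 */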
   2379	if (likely(task == current))
   2380		task->io_uring->cached_refs += nr;
   2381	else
   2382		__io_put_task(task, nr);
   2383}
   2384
   2385static void io_task_refs_refill(struct io_uring_task *tctx)
   2386{
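       	/*
       	 * ->cached_refs just went negative in io_get_task_refs(), refill by
       	 * the deficit plus a full cache's worth of task references.
       	 */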
   2387	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
   2388
   2389	percpu_counter_add(&tctx->inflight, refill);
   2390	refcount_add(refill, &current->usage);
   2391	tctx->cached_refs += refill;
   2392}
   2393
   2394static inline void io_get_task_refs(int nr)
   2395{
   2396	struct io_uring_task *tctx = current->io_uring;
   2397
   2398	tctx->cached_refs -= nr;
   2399	if (unlikely(tctx->cached_refs < 0))
   2400		io_task_refs_refill(tctx);
   2401}
   2402
   2403static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
   2404{
   2405	struct io_uring_task *tctx = task->io_uring;
   2406	unsigned int refs = tctx->cached_refs;
   2407
   2408	if (refs) {
   2409		tctx->cached_refs = 0;
   2410		percpu_counter_sub(&tctx->inflight, refs);
   2411		put_task_struct_many(task, refs);
   2412	}
   2413}
   2414
   2415static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
   2416				     s32 res, u32 cflags, u64 extra1,
   2417				     u64 extra2)
   2418{
   2419	struct io_overflow_cqe *ocqe;
   2420	size_t ocq_size = sizeof(struct io_overflow_cqe);
   2421	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
   2422
   2423	if (is_cqe32)
   2424		ocq_size += sizeof(struct io_uring_cqe);
   2425
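       	/* callers may hold ->completion_lock, so the allocation must not sleep */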
   2426	ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
   2427	trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
   2428	if (!ocqe) {
   2429		/*
   2430		 * If we're in ring overflow flush mode, or in task cancel mode,
   2431		 * or cannot allocate an overflow entry, then we need to drop it
   2432		 * on the floor.
   2433		 */
   2434		io_account_cq_overflow(ctx);
   2435		set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
   2436		return false;
   2437	}
   2438	if (list_empty(&ctx->cq_overflow_list)) {
   2439		set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
   2440		atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
   2441
   2442	}
   2443	ocqe->cqe.user_data = user_data;
   2444	ocqe->cqe.res = res;
   2445	ocqe->cqe.flags = cflags;
   2446	if (is_cqe32) {
   2447		ocqe->cqe.big_cqe[0] = extra1;
   2448		ocqe->cqe.big_cqe[1] = extra2;
   2449	}
   2450	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
   2451	return true;
   2452}
   2453
   2454static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
   2455				     struct io_kiocb *req)
   2456{
   2457	struct io_uring_cqe *cqe;
   2458
   2459	if (!(ctx->flags & IORING_SETUP_CQE32)) {
   2460		trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
   2461					req->cqe.res, req->cqe.flags, 0, 0);
   2462
   2463		/*
   2464		 * If we can't get a cq entry, userspace overflowed the
   2465		 * submission (by quite a lot). Increment the overflow count in
   2466		 * the ring.
   2467		 */
   2468		cqe = io_get_cqe(ctx);
   2469		if (likely(cqe)) {
   2470			memcpy(cqe, &req->cqe, sizeof(*cqe));
   2471			return true;
   2472		}
   2473
   2474		return io_cqring_event_overflow(ctx, req->cqe.user_data,
   2475						req->cqe.res, req->cqe.flags,
   2476						0, 0);
   2477	} else {
   2478		u64 extra1 = 0, extra2 = 0;
   2479
   2480		if (req->flags & REQ_F_CQE32_INIT) {
   2481			extra1 = req->extra1;
   2482			extra2 = req->extra2;
   2483		}
   2484
   2485		trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
   2486					req->cqe.res, req->cqe.flags, extra1, extra2);
   2487
   2488		/*
   2489		 * If we can't get a cq entry, userspace overflowed the
   2490		 * submission (by quite a lot). Increment the overflow count in
   2491		 * the ring.
   2492		 */
   2493		cqe = io_get_cqe(ctx);
   2494		if (likely(cqe)) {
   2495			memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe));
   2496			WRITE_ONCE(cqe->big_cqe[0], extra1);
   2497			WRITE_ONCE(cqe->big_cqe[1], extra2);
   2498			return true;
   2499		}
   2500
   2501		return io_cqring_event_overflow(ctx, req->cqe.user_data,
   2502				req->cqe.res, req->cqe.flags,
   2503				extra1, extra2);
   2504	}
   2505}
   2506
   2507static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
   2508				     s32 res, u32 cflags)
   2509{
   2510	struct io_uring_cqe *cqe;
   2511
   2512	ctx->cq_extra++;
   2513	trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
   2514
   2515	/*
   2516	 * If we can't get a cq entry, userspace overflowed the
   2517	 * submission (by quite a lot). Increment the overflow count in
   2518	 * the ring.
   2519	 */
   2520	cqe = io_get_cqe(ctx);
   2521	if (likely(cqe)) {
   2522		WRITE_ONCE(cqe->user_data, user_data);
   2523		WRITE_ONCE(cqe->res, res);
   2524		WRITE_ONCE(cqe->flags, cflags);
   2525
   2526		if (ctx->flags & IORING_SETUP_CQE32) {
   2527			WRITE_ONCE(cqe->big_cqe[0], 0);
   2528			WRITE_ONCE(cqe->big_cqe[1], 0);
   2529		}
   2530		return true;
   2531	}
   2532	return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
   2533}
   2534
   2535static void __io_req_complete_put(struct io_kiocb *req)
   2536{
   2537	/*
   2538	 * If we're the last reference to this request, add to our locked
   2539	 * free_list cache.
   2540	 */
   2541	if (req_ref_put_and_test(req)) {
   2542		struct io_ring_ctx *ctx = req->ctx;
   2543
   2544		if (req->flags & IO_REQ_LINK_FLAGS) {
   2545			if (req->flags & IO_DISARM_MASK)
   2546				io_disarm_next(req);
   2547			if (req->link) {
   2548				io_req_task_queue(req->link);
   2549				req->link = NULL;
   2550			}
   2551		}
   2552		io_req_put_rsrc(req);
   2553		/*
   2554		 * Selected buffer deallocation in io_clean_op() assumes that
   2555		 * we don't hold ->completion_lock. Clean them here to avoid
   2556		 * deadlocks.
   2557		 */
   2558		io_put_kbuf_comp(req);
   2559		io_dismantle_req(req);
   2560		io_put_task(req->task, 1);
   2561		wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
   2562		ctx->locked_free_nr++;
   2563	}
   2564}
   2565
   2566static void __io_req_complete_post(struct io_kiocb *req, s32 res,
   2567				   u32 cflags)
   2568{
   2569	if (!(req->flags & REQ_F_CQE_SKIP)) {
   2570		req->cqe.res = res;
   2571		req->cqe.flags = cflags;
   2572		__io_fill_cqe_req(req->ctx, req);
   2573	}
   2574	__io_req_complete_put(req);
   2575}
   2576
   2577static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags)
   2578{
   2579	struct io_ring_ctx *ctx = req->ctx;
   2580
   2581	spin_lock(&ctx->completion_lock);
   2582	__io_req_complete_post(req, res, cflags);
   2583	io_commit_cqring(ctx);
   2584	spin_unlock(&ctx->completion_lock);
   2585	io_cqring_ev_posted(ctx);
   2586}
   2587
   2588static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
   2589					 u32 cflags)
   2590{
   2591	req->cqe.res = res;
   2592	req->cqe.flags = cflags;
   2593	req->flags |= REQ_F_COMPLETE_INLINE;
   2594}
   2595
   2596static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
   2597				     s32 res, u32 cflags)
   2598{
   2599	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
   2600		io_req_complete_state(req, res, cflags);
   2601	else
   2602		io_req_complete_post(req, res, cflags);
   2603}
   2604
   2605static inline void io_req_complete(struct io_kiocb *req, s32 res)
   2606{
   2607	if (res < 0)
   2608		req_set_fail(req);
   2609	__io_req_complete(req, 0, res, 0);
   2610}
   2611
   2612static void io_req_complete_failed(struct io_kiocb *req, s32 res)
   2613{
   2614	req_set_fail(req);
   2615	io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
   2616}
   2617
   2618/*
   2619 * Don't initialise the fields below on every allocation, but do that in
   2620 * advance and keep them valid across allocations.
   2621 */
   2622static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
   2623{
   2624	req->ctx = ctx;
   2625	req->link = NULL;
   2626	req->async_data = NULL;
   2627	/* not necessary, but safer to zero */
   2628	req->cqe.res = 0;
   2629}
   2630
   2631static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
   2632					struct io_submit_state *state)
   2633{
   2634	spin_lock(&ctx->completion_lock);
   2635	wq_list_splice(&ctx->locked_free_list, &state->free_list);
   2636	ctx->locked_free_nr = 0;
   2637	spin_unlock(&ctx->completion_lock);
   2638}
   2639
   2640static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
   2641{
   2642	return !ctx->submit_state.free_list.next;
   2643}
   2644
   2645/*
   2646 * A request might get retired back into the request caches even before opcode
   2647 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
   2648 * Because of that, io_alloc_req() should be called only under ->uring_lock
   2649 * and with extra caution to not get a request that is still worked on.
   2650 */
   2651static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
   2652	__must_hold(&ctx->uring_lock)
   2653{
   2654	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
   2655	void *reqs[IO_REQ_ALLOC_BATCH];
   2656	int ret, i;
   2657
   2658	/*
   2659	 * If we have more than a batch's worth of requests in our IRQ side
   2660	 * locked cache, grab the lock and move them over to our submission
   2661	 * side cache.
   2662	 */
   2663	if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
   2664		io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
   2665		if (!io_req_cache_empty(ctx))
   2666			return true;
   2667	}
   2668
   2669	ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
   2670
   2671	/*
   2672	 * Bulk alloc is all-or-nothing. If we fail to get a batch,
   2673	 * retry single alloc to be on the safe side.
   2674	 */
   2675	if (unlikely(ret <= 0)) {
   2676		reqs[0] = kmem_cache_alloc(req_cachep, gfp);
   2677		if (!reqs[0])
   2678			return false;
   2679		ret = 1;
   2680	}
   2681
   2682	percpu_ref_get_many(&ctx->refs, ret);
   2683	for (i = 0; i < ret; i++) {
   2684		struct io_kiocb *req = reqs[i];
   2685
   2686		io_preinit_req(req, ctx);
   2687		io_req_add_to_cache(req, ctx);
   2688	}
   2689	return true;
   2690}
   2691
   2692static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
   2693{
   2694	if (unlikely(io_req_cache_empty(ctx)))
   2695		return __io_alloc_req_refill(ctx);
   2696	return true;
   2697}
   2698
   2699static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
   2700{
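       	/* caller must have ensured the cache is non-empty, see io_alloc_req_refill() */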
   2701	struct io_wq_work_node *node;
   2702
   2703	node = wq_stack_extract(&ctx->submit_state.free_list);
   2704	return container_of(node, struct io_kiocb, comp_list);
   2705}
   2706
   2707static inline void io_put_file(struct file *file)
   2708{
   2709	if (file)
   2710		fput(file);
   2711}
   2712
   2713static inline void io_dismantle_req(struct io_kiocb *req)
   2714{
   2715	unsigned int flags = req->flags;
   2716
   2717	if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
   2718		io_clean_op(req);
   2719	if (!(flags & REQ_F_FIXED_FILE))
   2720		io_put_file(req->file);
   2721}
   2722
   2723static __cold void io_free_req(struct io_kiocb *req)
   2724{
   2725	struct io_ring_ctx *ctx = req->ctx;
   2726
   2727	io_req_put_rsrc(req);
   2728	io_dismantle_req(req);
   2729	io_put_task(req->task, 1);
   2730
   2731	spin_lock(&ctx->completion_lock);
   2732	wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
   2733	ctx->locked_free_nr++;
   2734	spin_unlock(&ctx->completion_lock);
   2735}
   2736
   2737static inline void io_remove_next_linked(struct io_kiocb *req)
   2738{
   2739	struct io_kiocb *nxt = req->link;
   2740
   2741	req->link = nxt->link;
   2742	nxt->link = NULL;
   2743}
   2744
   2745static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req)
   2746	__must_hold(&req->ctx->completion_lock)
   2747	__must_hold(&req->ctx->timeout_lock)
   2748{
   2749	struct io_kiocb *link = req->link;
   2750
   2751	if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
   2752		struct io_timeout_data *io = link->async_data;
   2753
   2754		io_remove_next_linked(req);
   2755		link->timeout.head = NULL;
   2756		if (hrtimer_try_to_cancel(&io->timer) != -1) {
   2757			list_del(&link->timeout.list);
   2758			return link;
   2759		}
   2760	}
   2761	return NULL;
   2762}
   2763
   2764static void io_fail_links(struct io_kiocb *req)
   2765	__must_hold(&req->ctx->completion_lock)
   2766{
   2767	struct io_kiocb *nxt, *link = req->link;
   2768	bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
   2769
   2770	req->link = NULL;
   2771	while (link) {
   2772		long res = -ECANCELED;
   2773
   2774		if (link->flags & REQ_F_FAIL)
   2775			res = link->cqe.res;
   2776
   2777		nxt = link->link;
   2778		link->link = NULL;
   2779
   2780		trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data,
   2781					req->opcode, link);
   2782
   2783		if (ignore_cqes)
   2784			link->flags |= REQ_F_CQE_SKIP;
   2785		else
   2786			link->flags &= ~REQ_F_CQE_SKIP;
   2787		__io_req_complete_post(link, res, 0);
   2788		link = nxt;
   2789	}
   2790}
   2791
   2792static bool io_disarm_next(struct io_kiocb *req)
   2793	__must_hold(&req->ctx->completion_lock)
   2794{
   2795	struct io_kiocb *link = NULL;
   2796	bool posted = false;
   2797
   2798	if (req->flags & REQ_F_ARM_LTIMEOUT) {
   2799		link = req->link;
   2800		req->flags &= ~REQ_F_ARM_LTIMEOUT;
   2801		if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
   2802			io_remove_next_linked(req);
   2803			io_req_tw_post_queue(link, -ECANCELED, 0);
   2804			posted = true;
   2805		}
   2806	} else if (req->flags & REQ_F_LINK_TIMEOUT) {
   2807		struct io_ring_ctx *ctx = req->ctx;
   2808
   2809		spin_lock_irq(&ctx->timeout_lock);
   2810		link = io_disarm_linked_timeout(req);
   2811		spin_unlock_irq(&ctx->timeout_lock);
   2812		if (link) {
   2813			posted = true;
   2814			io_req_tw_post_queue(link, -ECANCELED, 0);
   2815		}
   2816	}
   2817	if (unlikely((req->flags & REQ_F_FAIL) &&
   2818		     !(req->flags & REQ_F_HARDLINK))) {
   2819		posted |= (req->link != NULL);
   2820		io_fail_links(req);
   2821	}
   2822	return posted;
   2823}
   2824
   2825static void __io_req_find_next_prep(struct io_kiocb *req)
   2826{
   2827	struct io_ring_ctx *ctx = req->ctx;
   2828	bool posted;
   2829
   2830	spin_lock(&ctx->completion_lock);
   2831	posted = io_disarm_next(req);
   2832	io_commit_cqring(ctx);
   2833	spin_unlock(&ctx->completion_lock);
   2834	if (posted)
   2835		io_cqring_ev_posted(ctx);
   2836}
   2837
   2838static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
   2839{
   2840	struct io_kiocb *nxt;
   2841
   2842	/*
   2843	 * If LINK is set, we have dependent requests in this chain. If we
   2844	 * didn't fail this request, queue the first one up, moving any other
   2845	 * dependencies to the next request. In case of failure, fail the rest
   2846	 * of the chain.
   2847	 */
   2848	if (unlikely(req->flags & IO_DISARM_MASK))
   2849		__io_req_find_next_prep(req);
   2850	nxt = req->link;
   2851	req->link = NULL;
   2852	return nxt;
   2853}
   2854
   2855static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
   2856{
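       	/*
       	 * Flush any completions batched under ->uring_lock and drop the ctx
       	 * reference taken when task_work handling switched to this ring.
       	 */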
   2857	if (!ctx)
   2858		return;
   2859	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
   2860		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
   2861	if (*locked) {
   2862		io_submit_flush_completions(ctx);
   2863		mutex_unlock(&ctx->uring_lock);
   2864		*locked = false;
   2865	}
   2866	percpu_ref_put(&ctx->refs);
   2867}
   2868
   2869static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
   2870{
   2871	io_commit_cqring(ctx);
   2872	spin_unlock(&ctx->completion_lock);
   2873	io_cqring_ev_posted(ctx);
   2874}
   2875
   2876static void handle_prev_tw_list(struct io_wq_work_node *node,
   2877				struct io_ring_ctx **ctx, bool *uring_locked)
   2878{
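       	/*
       	 * Requests on the priority list are completed inline here; if the
       	 * ring's uring_lock cannot be taken, completions are posted directly
       	 * under ->completion_lock instead.
       	 */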
   2879	if (*ctx && !*uring_locked)
   2880		spin_lock(&(*ctx)->completion_lock);
   2881
   2882	do {
   2883		struct io_wq_work_node *next = node->next;
   2884		struct io_kiocb *req = container_of(node, struct io_kiocb,
   2885						    io_task_work.node);
   2886
   2887		prefetch(container_of(next, struct io_kiocb, io_task_work.node));
   2888
   2889		if (req->ctx != *ctx) {
   2890			if (unlikely(!*uring_locked && *ctx))
   2891				ctx_commit_and_unlock(*ctx);
   2892
   2893			ctx_flush_and_put(*ctx, uring_locked);
   2894			*ctx = req->ctx;
   2895			/* if not contended, grab and improve batching */
   2896			*uring_locked = mutex_trylock(&(*ctx)->uring_lock);
   2897			percpu_ref_get(&(*ctx)->refs);
   2898			if (unlikely(!*uring_locked))
   2899				spin_lock(&(*ctx)->completion_lock);
   2900		}
   2901		if (likely(*uring_locked))
   2902			req->io_task_work.func(req, uring_locked);
   2903		else
   2904			__io_req_complete_post(req, req->cqe.res,
   2905						io_put_kbuf_comp(req));
   2906		node = next;
   2907	} while (node);
   2908
   2909	if (unlikely(!*uring_locked))
   2910		ctx_commit_and_unlock(*ctx);
   2911}
   2912
   2913static void handle_tw_list(struct io_wq_work_node *node,
   2914			   struct io_ring_ctx **ctx, bool *locked)
   2915{
   2916	do {
   2917		struct io_wq_work_node *next = node->next;
   2918		struct io_kiocb *req = container_of(node, struct io_kiocb,
   2919						    io_task_work.node);
   2920
   2921		prefetch(container_of(next, struct io_kiocb, io_task_work.node));
   2922
   2923		if (req->ctx != *ctx) {
   2924			ctx_flush_and_put(*ctx, locked);
   2925			*ctx = req->ctx;
   2926			/* if not contended, grab and improve batching */
   2927			*locked = mutex_trylock(&(*ctx)->uring_lock);
   2928			percpu_ref_get(&(*ctx)->refs);
   2929		}
   2930		req->io_task_work.func(req, locked);
   2931		node = next;
   2932	} while (node);
   2933}
   2934
   2935static void tctx_task_work(struct callback_head *cb)
   2936{
   2937	bool uring_locked = false;
   2938	struct io_ring_ctx *ctx = NULL;
   2939	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
   2940						  task_work);
   2941
   2942	while (1) {
   2943		struct io_wq_work_node *node1, *node2;
   2944
   2945		spin_lock_irq(&tctx->task_lock);
   2946		node1 = tctx->prio_task_list.first;
   2947		node2 = tctx->task_list.first;
   2948		INIT_WQ_LIST(&tctx->task_list);
   2949		INIT_WQ_LIST(&tctx->prio_task_list);
   2950		if (!node2 && !node1)
   2951			tctx->task_running = false;
   2952		spin_unlock_irq(&tctx->task_lock);
   2953		if (!node2 && !node1)
   2954			break;
   2955
   2956		if (node1)
   2957			handle_prev_tw_list(node1, &ctx, &uring_locked);
   2958		if (node2)
   2959			handle_tw_list(node2, &ctx, &uring_locked);
   2960		cond_resched();
   2961
   2962		if (data_race(!tctx->task_list.first) &&
   2963		    data_race(!tctx->prio_task_list.first) && uring_locked)
   2964			io_submit_flush_completions(ctx);
   2965	}
   2966
   2967	ctx_flush_and_put(ctx, &uring_locked);
   2968
   2969	/* relaxed read is enough as only the task itself sets ->in_idle */
   2970	if (unlikely(atomic_read(&tctx->in_idle)))
   2971		io_uring_drop_tctx_refs(current);
   2972}
   2973
   2974static void __io_req_task_work_add(struct io_kiocb *req,
   2975				   struct io_uring_task *tctx,
   2976				   struct io_wq_work_list *list)
   2977{
   2978	struct io_ring_ctx *ctx = req->ctx;
   2979	struct io_wq_work_node *node;
   2980	unsigned long flags;
   2981	bool running;
   2982
   2983	spin_lock_irqsave(&tctx->task_lock, flags);
   2984	wq_list_add_tail(&req->io_task_work.node, list);
   2985	running = tctx->task_running;
   2986	if (!running)
   2987		tctx->task_running = true;
   2988	spin_unlock_irqrestore(&tctx->task_lock, flags);
   2989
   2990	/* task_work already pending, we're done */
   2991	if (running)
   2992		return;
   2993
   2994	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
   2995		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
   2996
   2997	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
   2998		return;
   2999
   3000	spin_lock_irqsave(&tctx->task_lock, flags);
   3001	tctx->task_running = false;
   3002	node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);
   3003	spin_unlock_irqrestore(&tctx->task_lock, flags);
   3004
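       	/*
       	 * task_work_add() failed, likely because the task is exiting; punt
       	 * the whole list to the delayed fallback work instead.
       	 */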
   3005	while (node) {
   3006		req = container_of(node, struct io_kiocb, io_task_work.node);
   3007		node = node->next;
   3008		if (llist_add(&req->io_task_work.fallback_node,
   3009			      &req->ctx->fallback_llist))
   3010			schedule_delayed_work(&req->ctx->fallback_work, 1);
   3011	}
   3012}
   3013
   3014static void io_req_task_work_add(struct io_kiocb *req)
   3015{
   3016	struct io_uring_task *tctx = req->task->io_uring;
   3017
   3018	__io_req_task_work_add(req, tctx, &tctx->task_list);
   3019}
   3020
   3021static void io_req_task_prio_work_add(struct io_kiocb *req)
   3022{
   3023	struct io_uring_task *tctx = req->task->io_uring;
   3024
   3025	if (req->ctx->flags & IORING_SETUP_SQPOLL)
   3026		__io_req_task_work_add(req, tctx, &tctx->prio_task_list);
   3027	else
   3028		__io_req_task_work_add(req, tctx, &tctx->task_list);
   3029}
   3030
   3031static void io_req_tw_post(struct io_kiocb *req, bool *locked)
   3032{
   3033	io_req_complete_post(req, req->cqe.res, req->cqe.flags);
   3034}
   3035
   3036static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags)
   3037{
   3038	req->cqe.res = res;
   3039	req->cqe.flags = cflags;
   3040	req->io_task_work.func = io_req_tw_post;
   3041	io_req_task_work_add(req);
   3042}
   3043
   3044static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
   3045{
   3046	/* not needed for normal modes, but SQPOLL depends on it */
   3047	io_tw_lock(req->ctx, locked);
   3048	io_req_complete_failed(req, req->cqe.res);
   3049}
   3050
   3051static void io_req_task_submit(struct io_kiocb *req, bool *locked)
   3052{
   3053	io_tw_lock(req->ctx, locked);
   3054	/* req->task == current here, checking PF_EXITING is safe */
   3055	if (likely(!(req->task->flags & PF_EXITING)))
   3056		io_queue_sqe(req);
   3057	else
   3058		io_req_complete_failed(req, -EFAULT);
   3059}
   3060
   3061static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
   3062{
   3063	req->cqe.res = ret;
   3064	req->io_task_work.func = io_req_task_cancel;
   3065	io_req_task_work_add(req);
   3066}
   3067
   3068static void io_req_task_queue(struct io_kiocb *req)
   3069{
   3070	req->io_task_work.func = io_req_task_submit;
   3071	io_req_task_work_add(req);
   3072}
   3073
   3074static void io_req_task_queue_reissue(struct io_kiocb *req)
   3075{
   3076	req->io_task_work.func = io_queue_iowq;
   3077	io_req_task_work_add(req);
   3078}
   3079
   3080static void io_queue_next(struct io_kiocb *req)
   3081{
   3082	struct io_kiocb *nxt = io_req_find_next(req);
   3083
   3084	if (nxt)
   3085		io_req_task_queue(nxt);
   3086}
   3087
   3088static void io_free_batch_list(struct io_ring_ctx *ctx,
   3089				struct io_wq_work_node *node)
   3090	__must_hold(&ctx->uring_lock)
   3091{
   3092	struct task_struct *task = NULL;
   3093	int task_refs = 0;
   3094
   3095	do {
   3096		struct io_kiocb *req = container_of(node, struct io_kiocb,
   3097						    comp_list);
   3098
   3099		if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
   3100			if (req->flags & REQ_F_REFCOUNT) {
   3101				node = req->comp_list.next;
   3102				if (!req_ref_put_and_test(req))
   3103					continue;
   3104			}
   3105			if ((req->flags & REQ_F_POLLED) && req->apoll) {
   3106				struct async_poll *apoll = req->apoll;
   3107
   3108				if (apoll->double_poll)
   3109					kfree(apoll->double_poll);
   3110				list_add(&apoll->poll.wait.entry,
   3111						&ctx->apoll_cache);
   3112				req->flags &= ~REQ_F_POLLED;
   3113			}
   3114			if (req->flags & IO_REQ_LINK_FLAGS)
   3115				io_queue_next(req);
   3116			if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
   3117				io_clean_op(req);
   3118		}
   3119		if (!(req->flags & REQ_F_FIXED_FILE))
   3120			io_put_file(req->file);
   3121
   3122		io_req_put_rsrc_locked(req, ctx);
   3123
   3124		if (req->task != task) {
   3125			if (task)
   3126				io_put_task(task, task_refs);
   3127			task = req->task;
   3128			task_refs = 0;
   3129		}
   3130		task_refs++;
   3131		node = req->comp_list.next;
   3132		io_req_add_to_cache(req, ctx);
   3133	} while (node);
   3134
   3135	if (task)
   3136		io_put_task(task, task_refs);
   3137}
   3138
   3139static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
   3140	__must_hold(&ctx->uring_lock)
   3141{
   3142	struct io_wq_work_node *node, *prev;
   3143	struct io_submit_state *state = &ctx->submit_state;
   3144
   3145	if (state->flush_cqes) {
   3146		spin_lock(&ctx->completion_lock);
   3147		wq_list_for_each(node, prev, &state->compl_reqs) {
   3148			struct io_kiocb *req = container_of(node, struct io_kiocb,
   3149						    comp_list);
   3150
   3151			if (!(req->flags & REQ_F_CQE_SKIP))
   3152				__io_fill_cqe_req(ctx, req);
   3153		}
   3154
   3155		io_commit_cqring(ctx);
   3156		spin_unlock(&ctx->completion_lock);
   3157		io_cqring_ev_posted(ctx);
   3158		state->flush_cqes = false;
   3159	}
   3160
   3161	io_free_batch_list(ctx, state->compl_reqs.first);
   3162	INIT_WQ_LIST(&state->compl_reqs);
   3163}
   3164
   3165/*
   3166 * Drop reference to request, return next in chain (if there is one) if this
   3167 * was the last reference to this request.
   3168 */
   3169static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
   3170{
   3171	struct io_kiocb *nxt = NULL;
   3172
   3173	if (req_ref_put_and_test(req)) {
   3174		if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
   3175			nxt = io_req_find_next(req);
   3176		io_free_req(req);
   3177	}
   3178	return nxt;
   3179}
   3180
   3181static inline void io_put_req(struct io_kiocb *req)
   3182{
   3183	if (req_ref_put_and_test(req)) {
   3184		io_queue_next(req);
   3185		io_free_req(req);
   3186	}
   3187}
   3188
   3189static unsigned io_cqring_events(struct io_ring_ctx *ctx)
   3190{
   3191	/* See comment at the top of this file */
   3192	smp_rmb();
   3193	return __io_cqring_events(ctx);
   3194}
   3195
   3196static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
   3197{
   3198	struct io_rings *rings = ctx->rings;
   3199
   3200	/* make sure SQ entry isn't read before tail */
   3201	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
   3202}
   3203
   3204static inline bool io_run_task_work(void)
   3205{
   3206	if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) {
   3207		__set_current_state(TASK_RUNNING);
   3208		clear_notify_signal();
   3209		if (task_work_pending(current))
   3210			task_work_run();
   3211		return true;
   3212	}
   3213
   3214	return false;
   3215}
   3216
   3217static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
   3218{
   3219	struct io_wq_work_node *pos, *start, *prev;
   3220	unsigned int poll_flags = BLK_POLL_NOSLEEP;
   3221	DEFINE_IO_COMP_BATCH(iob);
   3222	int nr_events = 0;
   3223
   3224	/*
   3225	 * Only spin for completions if we don't have multiple devices hanging
   3226	 * off our complete list.
   3227	 */
   3228	if (ctx->poll_multi_queue || force_nonspin)
   3229		poll_flags |= BLK_POLL_ONESHOT;
   3230
   3231	wq_list_for_each(pos, start, &ctx->iopoll_list) {
   3232		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
   3233		struct kiocb *kiocb = &req->rw.kiocb;
   3234		int ret;
   3235
   3236		/*
   3237		 * Move completed and retryable entries to our local lists.
   3238		 * If we find a request that requires polling, break out
   3239		 * and complete those lists first, if we have entries there.
   3240		 */
   3241		if (READ_ONCE(req->iopoll_completed))
   3242			break;
   3243
   3244		ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
   3245		if (unlikely(ret < 0))
   3246			return ret;
   3247		else if (ret)
   3248			poll_flags |= BLK_POLL_ONESHOT;
   3249
   3250		/* iopoll may have completed current req */
   3251		if (!rq_list_empty(iob.req_list) ||
   3252		    READ_ONCE(req->iopoll_completed))
   3253			break;
   3254	}
   3255
   3256	if (!rq_list_empty(iob.req_list))
   3257		iob.complete(&iob);
   3258	else if (!pos)
   3259		return 0;
   3260
   3261	prev = start;
   3262	wq_list_for_each_resume(pos, prev) {
   3263		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
   3264
   3265		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
   3266		if (!smp_load_acquire(&req->iopoll_completed))
   3267			break;
   3268		nr_events++;
   3269		if (unlikely(req->flags & REQ_F_CQE_SKIP))
   3270			continue;
   3271
   3272		req->cqe.flags = io_put_kbuf(req, 0);
   3273		__io_fill_cqe_req(req->ctx, req);
   3274	}
   3275
   3276	if (unlikely(!nr_events))
   3277		return 0;
   3278
   3279	io_commit_cqring(ctx);
   3280	io_cqring_ev_posted_iopoll(ctx);
   3281	pos = start ? start->next : ctx->iopoll_list.first;
   3282	wq_list_cut(&ctx->iopoll_list, prev, start);
   3283	io_free_batch_list(ctx, pos);
   3284	return nr_events;
   3285}
   3286
   3287/*
   3288 * We can't just wait for polled events to come to us, we have to actively
   3289 * find and complete them.
   3290 */
   3291static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
   3292{
   3293	if (!(ctx->flags & IORING_SETUP_IOPOLL))
   3294		return;
   3295
   3296	mutex_lock(&ctx->uring_lock);
   3297	while (!wq_list_empty(&ctx->iopoll_list)) {
   3298		/* let it sleep and repeat later if we can't complete a request */
   3299		if (io_do_iopoll(ctx, true) == 0)
   3300			break;
   3301		/*
   3302		 * Ensure we allow local-to-the-cpu processing to take place,
   3303		 * in this case we need to ensure that we reap all events.
   3304		 * Also let task_work, etc., make progress by releasing the mutex
   3305		 */
   3306		if (need_resched()) {
   3307			mutex_unlock(&ctx->uring_lock);
   3308			cond_resched();
   3309			mutex_lock(&ctx->uring_lock);
   3310		}
   3311	}
   3312	mutex_unlock(&ctx->uring_lock);
   3313}
   3314
   3315static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
   3316{
   3317	unsigned int nr_events = 0;
   3318	int ret = 0;
   3319	unsigned long check_cq;
   3320
   3321	/*
   3322	 * Don't enter poll loop if we already have events pending.
   3323	 * If we do, we can potentially be spinning for commands that
   3324	 * already triggered a CQE (eg in error).
   3325	 */
   3326	check_cq = READ_ONCE(ctx->check_cq);
   3327	if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
   3328		__io_cqring_overflow_flush(ctx, false);
   3329	if (io_cqring_events(ctx))
   3330		return 0;
   3331
   3332	/*
   3333	 * Similarly do not spin if we have not informed the user of any
   3334	 * dropped CQE.
   3335	 */
   3336	if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
   3337		return -EBADR;
   3338
   3339	do {
   3340		/*
   3341		 * If a submit got punted to a workqueue, we can have the
   3342		 * application entering polling for a command before it gets
   3343		 * issued. That app will hold the uring_lock for the duration
   3344		 * of the poll right here, so we need to take a breather every
   3345		 * now and then to ensure that the issue has a chance to add
   3346		 * the poll to the issued list. Otherwise we can spin here
   3347		 * forever, while the workqueue is stuck trying to acquire the
   3348		 * very same mutex.
   3349		 */
   3350		if (wq_list_empty(&ctx->iopoll_list)) {
   3351			u32 tail = ctx->cached_cq_tail;
   3352
   3353			mutex_unlock(&ctx->uring_lock);
   3354			io_run_task_work();
   3355			mutex_lock(&ctx->uring_lock);
   3356
   3357			/* some requests don't go through iopoll_list */
   3358			if (tail != ctx->cached_cq_tail ||
   3359			    wq_list_empty(&ctx->iopoll_list))
   3360				break;
   3361		}
   3362		ret = io_do_iopoll(ctx, !min);
   3363		if (ret < 0)
   3364			break;
   3365		nr_events += ret;
   3366		ret = 0;
   3367	} while (nr_events < min && !need_resched());
   3368
   3369	return ret;
   3370}
   3371
   3372static void kiocb_end_write(struct io_kiocb *req)
   3373{
   3374	/*
   3375	 * Tell lockdep we inherited freeze protection from submission
   3376	 * thread.
   3377	 */
   3378	if (req->flags & REQ_F_ISREG) {
   3379		struct super_block *sb = file_inode(req->file)->i_sb;
   3380
   3381		__sb_writers_acquired(sb, SB_FREEZE_WRITE);
   3382		sb_end_write(sb);
   3383	}
   3384}
   3385
   3386#ifdef CONFIG_BLOCK
   3387static bool io_resubmit_prep(struct io_kiocb *req)
   3388{
   3389	struct io_async_rw *rw = req->async_data;
   3390
   3391	if (!req_has_async_data(req))
   3392		return !io_req_prep_async(req);
   3393	iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
   3394	return true;
   3395}
   3396
   3397static bool io_rw_should_reissue(struct io_kiocb *req)
   3398{
   3399	umode_t mode = file_inode(req->file)->i_mode;
   3400	struct io_ring_ctx *ctx = req->ctx;
   3401
   3402	if (!S_ISBLK(mode) && !S_ISREG(mode))
   3403		return false;
   3404	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
   3405	    !(ctx->flags & IORING_SETUP_IOPOLL)))
   3406		return false;
   3407	/*
   3408	 * If ref is dying, we might be running poll reap from the exit work.
   3409	 * Don't attempt to reissue from that path, just let it fail with
   3410	 * -EAGAIN.
   3411	 */
   3412	if (percpu_ref_is_dying(&ctx->refs))
   3413		return false;
   3414	/*
   3415	 * Play it safe and assume it's not safe to re-import and reissue if we're
   3416	 * not in the original thread group (or in task context).
   3417	 */
   3418	if (!same_thread_group(req->task, current) || !in_task())
   3419		return false;
   3420	return true;
   3421}
   3422#else
   3423static bool io_resubmit_prep(struct io_kiocb *req)
   3424{
   3425	return false;
   3426}
   3427static bool io_rw_should_reissue(struct io_kiocb *req)
   3428{
   3429	return false;
   3430}
   3431#endif
   3432
   3433static bool __io_complete_rw_common(struct io_kiocb *req, long res)
   3434{
   3435	if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
   3436		kiocb_end_write(req);
   3437		fsnotify_modify(req->file);
   3438	} else {
   3439		fsnotify_access(req->file);
   3440	}
   3441	if (unlikely(res != req->cqe.res)) {
   3442		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
   3443		    io_rw_should_reissue(req)) {
   3444			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
   3445			return true;
   3446		}
   3447		req_set_fail(req);
   3448		req->cqe.res = res;
   3449	}
   3450	return false;
   3451}
   3452
   3453static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
   3454{
   3455	int res = req->cqe.res;
   3456
   3457	if (*locked) {
   3458		io_req_complete_state(req, res, io_put_kbuf(req, 0));
   3459		io_req_add_compl_list(req);
   3460	} else {
   3461		io_req_complete_post(req, res,
   3462					io_put_kbuf(req, IO_URING_F_UNLOCKED));
   3463	}
   3464}
   3465
   3466static void __io_complete_rw(struct io_kiocb *req, long res,
   3467			     unsigned int issue_flags)
   3468{
   3469	if (__io_complete_rw_common(req, res))
   3470		return;
   3471	__io_req_complete(req, issue_flags, req->cqe.res,
   3472				io_put_kbuf(req, issue_flags));
   3473}
   3474
   3475static void io_complete_rw(struct kiocb *kiocb, long res)
   3476{
   3477	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
   3478
   3479	if (__io_complete_rw_common(req, res))
   3480		return;
   3481	req->cqe.res = res;
   3482	req->io_task_work.func = io_req_task_complete;
   3483	io_req_task_prio_work_add(req);
   3484}
   3485
   3486static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
   3487{
   3488	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
   3489
   3490	if (kiocb->ki_flags & IOCB_WRITE)
   3491		kiocb_end_write(req);
   3492	if (unlikely(res != req->cqe.res)) {
   3493		if (res == -EAGAIN && io_rw_should_reissue(req)) {
   3494			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
   3495			return;
   3496		}
   3497		req->cqe.res = res;
   3498	}
   3499
   3500	/* order with io_iopoll_complete() checking ->iopoll_completed */
   3501	smp_store_release(&req->iopoll_completed, 1);
   3502}
   3503
   3504/*
   3505 * After the iocb has been issued, it's safe to be found on the poll list.
   3506 * Adding the kiocb to the list AFTER submission ensures that we don't
   3507 * find it from an io_do_iopoll() thread before the issuer is done
   3508 * accessing the kiocb cookie.
   3509 */
   3510static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
   3511{
   3512	struct io_ring_ctx *ctx = req->ctx;
   3513	const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
   3514
   3515	/* workqueue context doesn't hold uring_lock, grab it now */
   3516	if (unlikely(needs_lock))
   3517		mutex_lock(&ctx->uring_lock);
   3518
   3519	/*
   3520	 * Track whether we have multiple files in our lists. This will impact
   3521	 * how we do polling eventually, not spinning if we're on potentially
   3522	 * different devices.
   3523	 */
   3524	if (wq_list_empty(&ctx->iopoll_list)) {
   3525		ctx->poll_multi_queue = false;
   3526	} else if (!ctx->poll_multi_queue) {
   3527		struct io_kiocb *list_req;
   3528
   3529		list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
   3530					comp_list);
   3531		if (list_req->file != req->file)
   3532			ctx->poll_multi_queue = true;
   3533	}
   3534
   3535	/*
   3536	 * For fast devices, IO may have already completed. If it has, add
   3537	 * it to the front so we find it first.
   3538	 */
   3539	if (READ_ONCE(req->iopoll_completed))
   3540		wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
   3541	else
   3542		wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
   3543
   3544	if (unlikely(needs_lock)) {
   3545		/*
    3546		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
    3547		 * in sq thread task context or in io worker task context. If
    3548		 * the current task context is the sq thread, we don't need to
    3549		 * check whether we should wake up the sq thread.
   3550		 */
   3551		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
   3552		    wq_has_sleeper(&ctx->sq_data->wait))
   3553			wake_up(&ctx->sq_data->wait);
   3554
   3555		mutex_unlock(&ctx->uring_lock);
   3556	}
   3557}
   3558
   3559static bool io_bdev_nowait(struct block_device *bdev)
   3560{
   3561	return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
   3562}
   3563
   3564/*
   3565 * If we tracked the file through the SCM inflight mechanism, we could support
   3566 * any file. For now, just ensure that anything potentially problematic is done
   3567 * inline.
   3568 */
   3569static bool __io_file_supports_nowait(struct file *file, umode_t mode)
   3570{
   3571	if (S_ISBLK(mode)) {
   3572		if (IS_ENABLED(CONFIG_BLOCK) &&
   3573		    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
   3574			return true;
   3575		return false;
   3576	}
   3577	if (S_ISSOCK(mode))
   3578		return true;
   3579	if (S_ISREG(mode)) {
   3580		if (IS_ENABLED(CONFIG_BLOCK) &&
   3581		    io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
   3582		    file->f_op != &io_uring_fops)
   3583			return true;
   3584		return false;
   3585	}
   3586
   3587	/* any ->read/write should understand O_NONBLOCK */
   3588	if (file->f_flags & O_NONBLOCK)
   3589		return true;
   3590	return file->f_mode & FMODE_NOWAIT;
   3591}
   3592
    3593/*
    3594 * Compute the FFS_* flag bits cached alongside a file: whether it is a
    3595 * regular file, whether it supports nowait issue, and whether it needs
    3596 * SCM (unix socket) inflight tracking.
    3597 */
   3598static unsigned int io_file_get_flags(struct file *file)
   3599{
   3600	umode_t mode = file_inode(file)->i_mode;
   3601	unsigned int res = 0;
   3602
   3603	if (S_ISREG(mode))
   3604		res |= FFS_ISREG;
   3605	if (__io_file_supports_nowait(file, mode))
   3606		res |= FFS_NOWAIT;
   3607	if (io_file_need_scm(file))
   3608		res |= FFS_SCM;
   3609	return res;
   3610}
   3611
   3612static inline bool io_file_supports_nowait(struct io_kiocb *req)
   3613{
   3614	return req->flags & REQ_F_SUPPORT_NOWAIT;
   3615}
   3616
   3617static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   3618{
   3619	struct kiocb *kiocb = &req->rw.kiocb;
   3620	unsigned ioprio;
   3621	int ret;
   3622
   3623	kiocb->ki_pos = READ_ONCE(sqe->off);
   3624	/* used for fixed read/write too - just read unconditionally */
   3625	req->buf_index = READ_ONCE(sqe->buf_index);
   3626
   3627	if (req->opcode == IORING_OP_READ_FIXED ||
   3628	    req->opcode == IORING_OP_WRITE_FIXED) {
   3629		struct io_ring_ctx *ctx = req->ctx;
   3630		u16 index;
   3631
   3632		if (unlikely(req->buf_index >= ctx->nr_user_bufs))
   3633			return -EFAULT;
   3634		index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
   3635		req->imu = ctx->user_bufs[index];
   3636		io_req_set_rsrc_node(req, ctx, 0);
   3637	}
   3638
   3639	ioprio = READ_ONCE(sqe->ioprio);
   3640	if (ioprio) {
   3641		ret = ioprio_check_cap(ioprio);
   3642		if (ret)
   3643			return ret;
   3644
   3645		kiocb->ki_ioprio = ioprio;
   3646	} else {
   3647		kiocb->ki_ioprio = get_current_ioprio();
   3648	}
   3649
   3650	req->rw.addr = READ_ONCE(sqe->addr);
   3651	req->rw.len = READ_ONCE(sqe->len);
   3652	req->rw.flags = READ_ONCE(sqe->rw_flags);
   3653	return 0;
   3654}
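/*
 * Editor's note: a minimal userspace sketch (not part of this file) showing
 * where the buf_index checked above comes from. It assumes liburing is
 * available and that io_uring_register_buffers() and
 * io_uring_prep_read_fixed() behave as documented; error handling is
 * abbreviated to keep the example short.
 */
#if 0	/* userspace illustration only */
#include <liburing.h>
#include <stdlib.h>
#include <fcntl.h>

static int read_with_fixed_buffer(const char *path)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec iov;
	int fd, ret;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return -1;

	iov.iov_len = 4096;
	iov.iov_base = malloc(iov.iov_len);
	/* pins the buffer; its slot (0) is what ends up in sqe->buf_index */
	io_uring_register_buffers(&ring, &iov, 1);

	fd = open(path, O_RDONLY);
	sqe = io_uring_get_sqe(&ring);
	/* last argument selects the registered buffer index */
	io_uring_prep_read_fixed(sqe, fd, iov.iov_base, iov.iov_len, 0, 0);

	io_uring_submit(&ring);
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret)
		io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return ret;
}
#endif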
   3655
   3656static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
   3657{
   3658	switch (ret) {
   3659	case -EIOCBQUEUED:
   3660		break;
   3661	case -ERESTARTSYS:
   3662	case -ERESTARTNOINTR:
   3663	case -ERESTARTNOHAND:
   3664	case -ERESTART_RESTARTBLOCK:
   3665		/*
   3666		 * We can't just restart the syscall, since previously
   3667		 * submitted sqes may already be in progress. Just fail this
   3668		 * IO with EINTR.
   3669		 */
   3670		ret = -EINTR;
   3671		fallthrough;
   3672	default:
   3673		kiocb->ki_complete(kiocb, ret);
   3674	}
   3675}
   3676
   3677static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
   3678{
   3679	struct kiocb *kiocb = &req->rw.kiocb;
   3680
   3681	if (kiocb->ki_pos != -1)
   3682		return &kiocb->ki_pos;
   3683
   3684	if (!(req->file->f_mode & FMODE_STREAM)) {
   3685		req->flags |= REQ_F_CUR_POS;
   3686		kiocb->ki_pos = req->file->f_pos;
   3687		return &kiocb->ki_pos;
   3688	}
   3689
   3690	kiocb->ki_pos = 0;
   3691	return NULL;
   3692}
   3693
   3694static void kiocb_done(struct io_kiocb *req, ssize_t ret,
   3695		       unsigned int issue_flags)
   3696{
   3697	struct io_async_rw *io = req->async_data;
   3698
   3699	/* add previously done IO, if any */
   3700	if (req_has_async_data(req) && io->bytes_done > 0) {
   3701		if (ret < 0)
   3702			ret = io->bytes_done;
   3703		else
   3704			ret += io->bytes_done;
   3705	}
   3706
   3707	if (req->flags & REQ_F_CUR_POS)
   3708		req->file->f_pos = req->rw.kiocb.ki_pos;
   3709	if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw))
   3710		__io_complete_rw(req, ret, issue_flags);
   3711	else
   3712		io_rw_done(&req->rw.kiocb, ret);
   3713
   3714	if (req->flags & REQ_F_REISSUE) {
   3715		req->flags &= ~REQ_F_REISSUE;
   3716		if (io_resubmit_prep(req))
   3717			io_req_task_queue_reissue(req);
   3718		else
   3719			io_req_task_queue_fail(req, ret);
   3720	}
   3721}
   3722
   3723static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
   3724			     struct io_mapped_ubuf *imu)
   3725{
   3726	size_t len = req->rw.len;
   3727	u64 buf_end, buf_addr = req->rw.addr;
   3728	size_t offset;
   3729
   3730	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
   3731		return -EFAULT;
   3732	/* not inside the mapped region */
   3733	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
   3734		return -EFAULT;
   3735
   3736	/*
   3737	 * May not be a start of buffer, set size appropriately
   3738	 * and advance us to the beginning.
   3739	 */
   3740	offset = buf_addr - imu->ubuf;
   3741	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
   3742
   3743	if (offset) {
   3744		/*
   3745		 * Don't use iov_iter_advance() here, as it's really slow for
   3746		 * using the latter parts of a big fixed buffer - it iterates
   3747		 * over each segment manually. We can cheat a bit here, because
   3748		 * we know that:
   3749		 *
   3750		 * 1) it's a BVEC iter, we set it up
   3751		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
   3752		 *    first and last bvec
   3753		 *
   3754		 * So just find our index, and adjust the iterator afterwards.
    3755		 * If the offset is within the first bvec (or the whole first
    3756		 * bvec), just use iov_iter_advance(). This makes it easier
   3757		 * since we can just skip the first segment, which may not
   3758		 * be PAGE_SIZE aligned.
   3759		 */
   3760		const struct bio_vec *bvec = imu->bvec;
   3761
   3762		if (offset <= bvec->bv_len) {
   3763			iov_iter_advance(iter, offset);
   3764		} else {
   3765			unsigned long seg_skip;
   3766
   3767			/* skip first vec */
   3768			offset -= bvec->bv_len;
   3769			seg_skip = 1 + (offset >> PAGE_SHIFT);
   3770
   3771			iter->bvec = bvec + seg_skip;
   3772			iter->nr_segs -= seg_skip;
   3773			iter->count -= bvec->bv_len + offset;
   3774			iter->iov_offset = offset & ~PAGE_MASK;
   3775		}
   3776	}
   3777
   3778	return 0;
   3779}
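/*
 * Editor's note: a worked example of the fast path above, assuming
 * PAGE_SIZE == 4096 and a registered buffer whose first bvec is exactly one
 * page. For len = 512 and offset = buf_addr - imu->ubuf = 9000:
 *
 *   iov_iter_bvec() sets iter->count = offset + len = 9512
 *   offset > bvec->bv_len, so:
 *     offset     becomes 9000 - 4096 = 4904
 *     seg_skip = 1 + (4904 >> 12)    = 2     (skip bvec[0] and bvec[1])
 *     iov_offset = 4904 & 4095       = 808   (start 808 bytes into bvec[2])
 *     iter->count -= 4096 + 4904, i.e. minus the original 9000, leaving
 *     iter->count == len == 512 as expected.
 */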
   3780
   3781static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
   3782			   unsigned int issue_flags)
   3783{
   3784	if (WARN_ON_ONCE(!req->imu))
   3785		return -EFAULT;
   3786	return __io_import_fixed(req, rw, iter, req->imu);
   3787}
   3788
   3789static int io_buffer_add_list(struct io_ring_ctx *ctx,
   3790			      struct io_buffer_list *bl, unsigned int bgid)
   3791{
   3792	bl->bgid = bgid;
   3793	if (bgid < BGID_ARRAY)
   3794		return 0;
   3795
   3796	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
   3797}
   3798
   3799static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
   3800					      struct io_buffer_list *bl)
   3801{
   3802	if (!list_empty(&bl->buf_list)) {
   3803		struct io_buffer *kbuf;
   3804
   3805		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
   3806		list_del(&kbuf->list);
   3807		if (*len > kbuf->len)
   3808			*len = kbuf->len;
   3809		req->flags |= REQ_F_BUFFER_SELECTED;
   3810		req->kbuf = kbuf;
   3811		req->buf_index = kbuf->bid;
   3812		return u64_to_user_ptr(kbuf->addr);
   3813	}
   3814	return NULL;
   3815}
   3816
   3817static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
   3818					  struct io_buffer_list *bl,
   3819					  unsigned int issue_flags)
   3820{
   3821	struct io_uring_buf_ring *br = bl->buf_ring;
   3822	struct io_uring_buf *buf;
   3823	__u16 head = bl->head;
   3824
   3825	if (unlikely(smp_load_acquire(&br->tail) == head))
   3826		return NULL;
   3827
   3828	head &= bl->mask;
   3829	if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
   3830		buf = &br->bufs[head];
   3831	} else {
   3832		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
   3833		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
   3834		buf = page_address(bl->buf_pages[index]);
   3835		buf += off;
   3836	}
   3837	if (*len > buf->len)
   3838		*len = buf->len;
   3839	req->flags |= REQ_F_BUFFER_RING;
   3840	req->buf_list = bl;
   3841	req->buf_index = buf->bid;
   3842
   3843	if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
   3844		/*
   3845		 * If we came in unlocked, we have no choice but to consume the
   3846		 * buffer here. This does mean it'll be pinned until the IO
   3847		 * completes. But coming in unlocked means we're in io-wq
   3848		 * context, hence there should be no further retry. For the
   3849		 * locked case, the caller must ensure to call the commit when
   3850		 * the transfer completes (or if we get -EAGAIN and must poll
   3851		 * or retry).
   3852		 */
   3853		req->buf_list = NULL;
   3854		bl->head++;
   3855	}
   3856	return u64_to_user_ptr(buf->addr);
   3857}
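/*
 * Editor's note: a userspace sketch (not part of this file) of how a
 * ring-mapped provided-buffer group like the one consumed above is set up.
 * It assumes the liburing 2.2-era helpers (io_uring_register_buf_ring(),
 * io_uring_buf_ring_add(), io_uring_buf_ring_advance()) with the signatures
 * used below; error handling is abbreviated. Requests opt in by setting
 * IOSQE_BUFFER_SELECT and sqe->buf_group = BGID.
 */
#if 0	/* userspace illustration only */
#include <liburing.h>
#include <stdlib.h>

#define NR_BUFS		8
#define BUF_SIZE	4096
#define BGID		1

static struct io_uring_buf_ring *setup_buf_ring(struct io_uring *ring)
{
	struct io_uring_buf_reg reg = { };
	struct io_uring_buf_ring *br;
	char *bufs;
	int i;

	if (posix_memalign((void **)&br, 4096,
			   NR_BUFS * sizeof(struct io_uring_buf)))
		return NULL;
	bufs = malloc(NR_BUFS * BUF_SIZE);

	io_uring_buf_ring_init(br);
	reg.ring_addr = (unsigned long)br;
	reg.ring_entries = NR_BUFS;
	reg.bgid = BGID;
	if (io_uring_register_buf_ring(ring, &reg, 0))
		return NULL;

	/* hand every buffer to the kernel, then publish the new tail once */
	for (i = 0; i < NR_BUFS; i++)
		io_uring_buf_ring_add(br, bufs + i * BUF_SIZE, BUF_SIZE, i,
				      io_uring_buf_ring_mask(NR_BUFS), i);
	io_uring_buf_ring_advance(br, NR_BUFS);
	return br;
}
#endif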
   3858
   3859static void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
   3860				     unsigned int issue_flags)
   3861{
   3862	struct io_ring_ctx *ctx = req->ctx;
   3863	struct io_buffer_list *bl;
   3864	void __user *ret = NULL;
   3865
   3866	io_ring_submit_lock(req->ctx, issue_flags);
   3867
   3868	bl = io_buffer_get_list(ctx, req->buf_index);
   3869	if (likely(bl)) {
   3870		if (bl->buf_nr_pages)
   3871			ret = io_ring_buffer_select(req, len, bl, issue_flags);
   3872		else
   3873			ret = io_provided_buffer_select(req, len, bl);
   3874	}
   3875	io_ring_submit_unlock(req->ctx, issue_flags);
   3876	return ret;
   3877}
   3878
   3879#ifdef CONFIG_COMPAT
   3880static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
   3881				unsigned int issue_flags)
   3882{
   3883	struct compat_iovec __user *uiov;
   3884	compat_ssize_t clen;
   3885	void __user *buf;
   3886	size_t len;
   3887
   3888	uiov = u64_to_user_ptr(req->rw.addr);
   3889	if (!access_ok(uiov, sizeof(*uiov)))
   3890		return -EFAULT;
   3891	if (__get_user(clen, &uiov->iov_len))
   3892		return -EFAULT;
   3893	if (clen < 0)
   3894		return -EINVAL;
   3895
   3896	len = clen;
   3897	buf = io_buffer_select(req, &len, issue_flags);
   3898	if (!buf)
   3899		return -ENOBUFS;
   3900	req->rw.addr = (unsigned long) buf;
   3901	iov[0].iov_base = buf;
   3902	req->rw.len = iov[0].iov_len = (compat_size_t) len;
   3903	return 0;
   3904}
   3905#endif
   3906
   3907static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
   3908				      unsigned int issue_flags)
   3909{
   3910	struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
   3911	void __user *buf;
   3912	ssize_t len;
   3913
   3914	if (copy_from_user(iov, uiov, sizeof(*uiov)))
   3915		return -EFAULT;
   3916
   3917	len = iov[0].iov_len;
   3918	if (len < 0)
   3919		return -EINVAL;
   3920	buf = io_buffer_select(req, &len, issue_flags);
   3921	if (!buf)
   3922		return -ENOBUFS;
   3923	req->rw.addr = (unsigned long) buf;
   3924	iov[0].iov_base = buf;
   3925	req->rw.len = iov[0].iov_len = len;
   3926	return 0;
   3927}
   3928
   3929static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
   3930				    unsigned int issue_flags)
   3931{
   3932	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
   3933		iov[0].iov_base = u64_to_user_ptr(req->rw.addr);
   3934		iov[0].iov_len = req->rw.len;
   3935		return 0;
   3936	}
   3937	if (req->rw.len != 1)
   3938		return -EINVAL;
   3939
   3940#ifdef CONFIG_COMPAT
   3941	if (req->ctx->compat)
   3942		return io_compat_import(req, iov, issue_flags);
   3943#endif
   3944
   3945	return __io_iov_buffer_select(req, iov, issue_flags);
   3946}
   3947
   3948static inline bool io_do_buffer_select(struct io_kiocb *req)
   3949{
   3950	if (!(req->flags & REQ_F_BUFFER_SELECT))
   3951		return false;
   3952	return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING));
   3953}
   3954
   3955static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
   3956				       struct io_rw_state *s,
   3957				       unsigned int issue_flags)
   3958{
   3959	struct iov_iter *iter = &s->iter;
   3960	u8 opcode = req->opcode;
   3961	struct iovec *iovec;
   3962	void __user *buf;
   3963	size_t sqe_len;
   3964	ssize_t ret;
   3965
   3966	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
   3967		ret = io_import_fixed(req, rw, iter, issue_flags);
   3968		if (ret)
   3969			return ERR_PTR(ret);
   3970		return NULL;
   3971	}
   3972
   3973	buf = u64_to_user_ptr(req->rw.addr);
   3974	sqe_len = req->rw.len;
   3975
   3976	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
   3977		if (io_do_buffer_select(req)) {
   3978			buf = io_buffer_select(req, &sqe_len, issue_flags);
   3979			if (!buf)
   3980				return ERR_PTR(-ENOBUFS);
   3981			req->rw.addr = (unsigned long) buf;
   3982			req->rw.len = sqe_len;
   3983		}
   3984
   3985		ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
   3986		if (ret)
   3987			return ERR_PTR(ret);
   3988		return NULL;
   3989	}
   3990
   3991	iovec = s->fast_iov;
   3992	if (req->flags & REQ_F_BUFFER_SELECT) {
   3993		ret = io_iov_buffer_select(req, iovec, issue_flags);
   3994		if (ret)
   3995			return ERR_PTR(ret);
   3996		iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
   3997		return NULL;
   3998	}
   3999
   4000	ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
   4001			      req->ctx->compat);
   4002	if (unlikely(ret < 0))
   4003		return ERR_PTR(ret);
   4004	return iovec;
   4005}
   4006
   4007static inline int io_import_iovec(int rw, struct io_kiocb *req,
   4008				  struct iovec **iovec, struct io_rw_state *s,
   4009				  unsigned int issue_flags)
   4010{
   4011	*iovec = __io_import_iovec(rw, req, s, issue_flags);
   4012	if (unlikely(IS_ERR(*iovec)))
   4013		return PTR_ERR(*iovec);
   4014
   4015	iov_iter_save_state(&s->iter, &s->iter_state);
   4016	return 0;
   4017}
   4018
   4019static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
   4020{
   4021	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
   4022}
   4023
   4024/*
   4025 * For files that don't have ->read_iter() and ->write_iter(), handle them
   4026 * by looping over ->read() or ->write() manually.
   4027 */
   4028static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
   4029{
   4030	struct kiocb *kiocb = &req->rw.kiocb;
   4031	struct file *file = req->file;
   4032	ssize_t ret = 0;
   4033	loff_t *ppos;
   4034
   4035	/*
   4036	 * Don't support polled IO through this interface, and we can't
   4037	 * support non-blocking either. For the latter, this just causes
   4038	 * the kiocb to be handled from an async context.
   4039	 */
   4040	if (kiocb->ki_flags & IOCB_HIPRI)
   4041		return -EOPNOTSUPP;
   4042	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
   4043	    !(kiocb->ki_filp->f_flags & O_NONBLOCK))
   4044		return -EAGAIN;
   4045
   4046	ppos = io_kiocb_ppos(kiocb);
   4047
   4048	while (iov_iter_count(iter)) {
   4049		struct iovec iovec;
   4050		ssize_t nr;
   4051
   4052		if (!iov_iter_is_bvec(iter)) {
   4053			iovec = iov_iter_iovec(iter);
   4054		} else {
   4055			iovec.iov_base = u64_to_user_ptr(req->rw.addr);
   4056			iovec.iov_len = req->rw.len;
   4057		}
   4058
   4059		if (rw == READ) {
   4060			nr = file->f_op->read(file, iovec.iov_base,
   4061					      iovec.iov_len, ppos);
   4062		} else {
   4063			nr = file->f_op->write(file, iovec.iov_base,
   4064					       iovec.iov_len, ppos);
   4065		}
   4066
   4067		if (nr < 0) {
   4068			if (!ret)
   4069				ret = nr;
   4070			break;
   4071		}
   4072		ret += nr;
   4073		if (!iov_iter_is_bvec(iter)) {
   4074			iov_iter_advance(iter, nr);
   4075		} else {
   4076			req->rw.addr += nr;
   4077			req->rw.len -= nr;
   4078			if (!req->rw.len)
   4079				break;
   4080		}
   4081		if (nr != iovec.iov_len)
   4082			break;
   4083	}
   4084
   4085	return ret;
   4086}
   4087
   4088static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
   4089			  const struct iovec *fast_iov, struct iov_iter *iter)
   4090{
   4091	struct io_async_rw *rw = req->async_data;
   4092
   4093	memcpy(&rw->s.iter, iter, sizeof(*iter));
   4094	rw->free_iovec = iovec;
   4095	rw->bytes_done = 0;
   4096	/* can only be fixed buffers, no need to do anything */
   4097	if (iov_iter_is_bvec(iter))
   4098		return;
   4099	if (!iovec) {
   4100		unsigned iov_off = 0;
   4101
   4102		rw->s.iter.iov = rw->s.fast_iov;
   4103		if (iter->iov != fast_iov) {
   4104			iov_off = iter->iov - fast_iov;
   4105			rw->s.iter.iov += iov_off;
   4106		}
   4107		if (rw->s.fast_iov != fast_iov)
   4108			memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
   4109			       sizeof(struct iovec) * iter->nr_segs);
   4110	} else {
   4111		req->flags |= REQ_F_NEED_CLEANUP;
   4112	}
   4113}
   4114
   4115static inline bool io_alloc_async_data(struct io_kiocb *req)
   4116{
   4117	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
   4118	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
   4119	if (req->async_data) {
   4120		req->flags |= REQ_F_ASYNC_DATA;
   4121		return false;
   4122	}
   4123	return true;
   4124}
   4125
   4126static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
   4127			     struct io_rw_state *s, bool force)
   4128{
   4129	if (!force && !io_op_defs[req->opcode].needs_async_setup)
   4130		return 0;
   4131	if (!req_has_async_data(req)) {
   4132		struct io_async_rw *iorw;
   4133
   4134		if (io_alloc_async_data(req)) {
   4135			kfree(iovec);
   4136			return -ENOMEM;
   4137		}
   4138
   4139		io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
   4140		iorw = req->async_data;
   4141		/* we've copied and mapped the iter, ensure state is saved */
   4142		iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
   4143	}
   4144	return 0;
   4145}
   4146
   4147static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
   4148{
   4149	struct io_async_rw *iorw = req->async_data;
   4150	struct iovec *iov;
   4151	int ret;
   4152
   4153	/* submission path, ->uring_lock should already be taken */
   4154	ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
   4155	if (unlikely(ret < 0))
   4156		return ret;
   4157
   4158	iorw->bytes_done = 0;
   4159	iorw->free_iovec = iov;
   4160	if (iov)
   4161		req->flags |= REQ_F_NEED_CLEANUP;
   4162	return 0;
   4163}
   4164
   4165static int io_readv_prep_async(struct io_kiocb *req)
   4166{
   4167	return io_rw_prep_async(req, READ);
   4168}
   4169
   4170static int io_writev_prep_async(struct io_kiocb *req)
   4171{
   4172	return io_rw_prep_async(req, WRITE);
   4173}
   4174
   4175/*
    4176 * This is our waitqueue callback handler, registered through __folio_lock_async()
    4177 * when we initially tried to do the IO with the iocb and armed our waitqueue.
   4178 * This gets called when the page is unlocked, and we generally expect that to
   4179 * happen when the page IO is completed and the page is now uptodate. This will
   4180 * queue a task_work based retry of the operation, attempting to copy the data
   4181 * again. If the latter fails because the page was NOT uptodate, then we will
   4182 * do a thread based blocking retry of the operation. That's the unexpected
   4183 * slow path.
   4184 */
   4185static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
   4186			     int sync, void *arg)
   4187{
   4188	struct wait_page_queue *wpq;
   4189	struct io_kiocb *req = wait->private;
   4190	struct wait_page_key *key = arg;
   4191
   4192	wpq = container_of(wait, struct wait_page_queue, wait);
   4193
   4194	if (!wake_page_match(wpq, key))
   4195		return 0;
   4196
   4197	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
   4198	list_del_init(&wait->entry);
   4199	io_req_task_queue(req);
   4200	return 1;
   4201}
   4202
   4203/*
   4204 * This controls whether a given IO request should be armed for async page
   4205 * based retry. If we return false here, the request is handed to the async
   4206 * worker threads for retry. If we're doing buffered reads on a regular file,
   4207 * we prepare a private wait_page_queue entry and retry the operation. This
   4208 * will either succeed because the page is now uptodate and unlocked, or it
   4209 * will register a callback when the page is unlocked at IO completion. Through
   4210 * that callback, io_uring uses task_work to setup a retry of the operation.
   4211 * That retry will attempt the buffered read again. The retry will generally
   4212 * succeed, or in rare cases where it fails, we then fall back to using the
   4213 * async worker threads for a blocking retry.
   4214 */
   4215static bool io_rw_should_retry(struct io_kiocb *req)
   4216{
   4217	struct io_async_rw *rw = req->async_data;
   4218	struct wait_page_queue *wait = &rw->wpq;
   4219	struct kiocb *kiocb = &req->rw.kiocb;
   4220
   4221	/* never retry for NOWAIT, we just complete with -EAGAIN */
   4222	if (req->flags & REQ_F_NOWAIT)
   4223		return false;
   4224
   4225	/* Only for buffered IO */
   4226	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
   4227		return false;
   4228
   4229	/*
   4230	 * just use poll if we can, and don't attempt if the fs doesn't
   4231	 * support callback based unlocks
   4232	 */
   4233	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
   4234		return false;
   4235
   4236	wait->wait.func = io_async_buf_func;
   4237	wait->wait.private = req;
   4238	wait->wait.flags = 0;
   4239	INIT_LIST_HEAD(&wait->wait.entry);
   4240	kiocb->ki_flags |= IOCB_WAITQ;
   4241	kiocb->ki_flags &= ~IOCB_NOWAIT;
   4242	kiocb->ki_waitq = wait;
   4243	return true;
   4244}
   4245
   4246static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
   4247{
   4248	if (likely(req->file->f_op->read_iter))
   4249		return call_read_iter(req->file, &req->rw.kiocb, iter);
   4250	else if (req->file->f_op->read)
   4251		return loop_rw_iter(READ, req, iter);
   4252	else
   4253		return -EINVAL;
   4254}
   4255
   4256static bool need_read_all(struct io_kiocb *req)
   4257{
   4258	return req->flags & REQ_F_ISREG ||
   4259		S_ISBLK(file_inode(req->file)->i_mode);
   4260}
   4261
   4262static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
   4263{
   4264	struct kiocb *kiocb = &req->rw.kiocb;
   4265	struct io_ring_ctx *ctx = req->ctx;
   4266	struct file *file = req->file;
   4267	int ret;
   4268
   4269	if (unlikely(!file || !(file->f_mode & mode)))
   4270		return -EBADF;
   4271
   4272	if (!io_req_ffs_set(req))
   4273		req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
   4274
   4275	kiocb->ki_flags = iocb_flags(file);
   4276	ret = kiocb_set_rw_flags(kiocb, req->rw.flags);
   4277	if (unlikely(ret))
   4278		return ret;
   4279
   4280	/*
   4281	 * If the file is marked O_NONBLOCK, still allow retry for it if it
   4282	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
    4283	 * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
   4284	 */
   4285	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
   4286	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
   4287		req->flags |= REQ_F_NOWAIT;
   4288
   4289	if (ctx->flags & IORING_SETUP_IOPOLL) {
   4290		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
   4291			return -EOPNOTSUPP;
   4292
   4293		kiocb->private = NULL;
   4294		kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
   4295		kiocb->ki_complete = io_complete_rw_iopoll;
   4296		req->iopoll_completed = 0;
   4297	} else {
   4298		if (kiocb->ki_flags & IOCB_HIPRI)
   4299			return -EINVAL;
   4300		kiocb->ki_complete = io_complete_rw;
   4301	}
   4302
   4303	return 0;
   4304}
   4305
   4306static int io_read(struct io_kiocb *req, unsigned int issue_flags)
   4307{
   4308	struct io_rw_state __s, *s = &__s;
   4309	struct iovec *iovec;
   4310	struct kiocb *kiocb = &req->rw.kiocb;
   4311	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
   4312	struct io_async_rw *rw;
   4313	ssize_t ret, ret2;
   4314	loff_t *ppos;
   4315
   4316	if (!req_has_async_data(req)) {
   4317		ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
   4318		if (unlikely(ret < 0))
   4319			return ret;
   4320	} else {
   4321		rw = req->async_data;
   4322		s = &rw->s;
   4323
   4324		/*
   4325		 * Safe and required to re-import if we're using provided
   4326		 * buffers, as we dropped the selected one before retry.
   4327		 */
   4328		if (io_do_buffer_select(req)) {
   4329			ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
   4330			if (unlikely(ret < 0))
   4331				return ret;
   4332		}
   4333
   4334		/*
   4335		 * We come here from an earlier attempt, restore our state to
   4336		 * match in case it doesn't. It's cheap enough that we don't
   4337		 * need to make this conditional.
   4338		 */
   4339		iov_iter_restore(&s->iter, &s->iter_state);
   4340		iovec = NULL;
   4341	}
   4342	ret = io_rw_init_file(req, FMODE_READ);
   4343	if (unlikely(ret)) {
   4344		kfree(iovec);
   4345		return ret;
   4346	}
   4347	req->cqe.res = iov_iter_count(&s->iter);
   4348
   4349	if (force_nonblock) {
   4350		/* If the file doesn't support async, just async punt */
   4351		if (unlikely(!io_file_supports_nowait(req))) {
   4352			ret = io_setup_async_rw(req, iovec, s, true);
   4353			return ret ?: -EAGAIN;
   4354		}
   4355		kiocb->ki_flags |= IOCB_NOWAIT;
   4356	} else {
   4357		/* Ensure we clear previously set non-block flag */
   4358		kiocb->ki_flags &= ~IOCB_NOWAIT;
   4359	}
   4360
   4361	ppos = io_kiocb_update_pos(req);
   4362
   4363	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
   4364	if (unlikely(ret)) {
   4365		kfree(iovec);
   4366		return ret;
   4367	}
   4368
   4369	ret = io_iter_do_read(req, &s->iter);
   4370
   4371	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
   4372		req->flags &= ~REQ_F_REISSUE;
   4373		/* if we can poll, just do that */
   4374		if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
   4375			return -EAGAIN;
   4376		/* IOPOLL retry should happen for io-wq threads */
   4377		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
   4378			goto done;
   4379		/* no retry on NONBLOCK nor RWF_NOWAIT */
   4380		if (req->flags & REQ_F_NOWAIT)
   4381			goto done;
   4382		ret = 0;
   4383	} else if (ret == -EIOCBQUEUED) {
   4384		goto out_free;
   4385	} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
   4386		   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
   4387		/* read all, failed, already did sync or don't want to retry */
   4388		goto done;
   4389	}
   4390
   4391	/*
   4392	 * Don't depend on the iter state matching what was consumed, or being
   4393	 * untouched in case of error. Restore it and we'll advance it
   4394	 * manually if we need to.
   4395	 */
   4396	iov_iter_restore(&s->iter, &s->iter_state);
   4397
   4398	ret2 = io_setup_async_rw(req, iovec, s, true);
   4399	if (ret2)
   4400		return ret2;
   4401
   4402	iovec = NULL;
   4403	rw = req->async_data;
   4404	s = &rw->s;
   4405	/*
   4406	 * Now use our persistent iterator and state, if we aren't already.
   4407	 * We've restored and mapped the iter to match.
   4408	 */
   4409
   4410	do {
   4411		/*
   4412		 * We end up here because of a partial read, either from
   4413		 * above or inside this loop. Advance the iter by the bytes
   4414		 * that were consumed.
   4415		 */
   4416		iov_iter_advance(&s->iter, ret);
   4417		if (!iov_iter_count(&s->iter))
   4418			break;
   4419		rw->bytes_done += ret;
   4420		iov_iter_save_state(&s->iter, &s->iter_state);
   4421
   4422		/* if we can retry, do so with the callbacks armed */
   4423		if (!io_rw_should_retry(req)) {
   4424			kiocb->ki_flags &= ~IOCB_WAITQ;
   4425			return -EAGAIN;
   4426		}
   4427
   4428		/*
   4429		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
   4430		 * we get -EIOCBQUEUED, then we'll get a notification when the
   4431		 * desired page gets unlocked. We can also get a partial read
   4432		 * here, and if we do, then just retry at the new offset.
   4433		 */
   4434		ret = io_iter_do_read(req, &s->iter);
   4435		if (ret == -EIOCBQUEUED)
   4436			return 0;
   4437		/* we got some bytes, but not all. retry. */
   4438		kiocb->ki_flags &= ~IOCB_WAITQ;
   4439		iov_iter_restore(&s->iter, &s->iter_state);
   4440	} while (ret > 0);
   4441done:
   4442	kiocb_done(req, ret, issue_flags);
   4443out_free:
    4444	/* it's faster to check here than to delegate to kfree */
   4445	if (iovec)
   4446		kfree(iovec);
   4447	return 0;
   4448}
   4449
   4450static int io_write(struct io_kiocb *req, unsigned int issue_flags)
   4451{
   4452	struct io_rw_state __s, *s = &__s;
   4453	struct iovec *iovec;
   4454	struct kiocb *kiocb = &req->rw.kiocb;
   4455	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
   4456	ssize_t ret, ret2;
   4457	loff_t *ppos;
   4458
   4459	if (!req_has_async_data(req)) {
   4460		ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
   4461		if (unlikely(ret < 0))
   4462			return ret;
   4463	} else {
   4464		struct io_async_rw *rw = req->async_data;
   4465
   4466		s = &rw->s;
   4467		iov_iter_restore(&s->iter, &s->iter_state);
   4468		iovec = NULL;
   4469	}
   4470	ret = io_rw_init_file(req, FMODE_WRITE);
   4471	if (unlikely(ret)) {
   4472		kfree(iovec);
   4473		return ret;
   4474	}
   4475	req->cqe.res = iov_iter_count(&s->iter);
   4476
   4477	if (force_nonblock) {
   4478		/* If the file doesn't support async, just async punt */
   4479		if (unlikely(!io_file_supports_nowait(req)))
   4480			goto copy_iov;
   4481
   4482		/* file path doesn't support NOWAIT for non-direct_IO */
   4483		if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
   4484		    (req->flags & REQ_F_ISREG))
   4485			goto copy_iov;
   4486
   4487		kiocb->ki_flags |= IOCB_NOWAIT;
   4488	} else {
   4489		/* Ensure we clear previously set non-block flag */
   4490		kiocb->ki_flags &= ~IOCB_NOWAIT;
   4491	}
   4492
   4493	ppos = io_kiocb_update_pos(req);
   4494
   4495	ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
   4496	if (unlikely(ret))
   4497		goto out_free;
   4498
   4499	/*
   4500	 * Open-code file_start_write here to grab freeze protection,
   4501	 * which will be released by another thread in
   4502	 * io_complete_rw().  Fool lockdep by telling it the lock got
   4503	 * released so that it doesn't complain about the held lock when
   4504	 * we return to userspace.
   4505	 */
   4506	if (req->flags & REQ_F_ISREG) {
   4507		sb_start_write(file_inode(req->file)->i_sb);
   4508		__sb_writers_release(file_inode(req->file)->i_sb,
   4509					SB_FREEZE_WRITE);
   4510	}
   4511	kiocb->ki_flags |= IOCB_WRITE;
   4512
   4513	if (likely(req->file->f_op->write_iter))
   4514		ret2 = call_write_iter(req->file, kiocb, &s->iter);
   4515	else if (req->file->f_op->write)
   4516		ret2 = loop_rw_iter(WRITE, req, &s->iter);
   4517	else
   4518		ret2 = -EINVAL;
   4519
   4520	if (req->flags & REQ_F_REISSUE) {
   4521		req->flags &= ~REQ_F_REISSUE;
   4522		ret2 = -EAGAIN;
   4523	}
   4524
   4525	/*
   4526	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
   4527	 * retry them without IOCB_NOWAIT.
   4528	 */
   4529	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
   4530		ret2 = -EAGAIN;
   4531	/* no retry on NONBLOCK nor RWF_NOWAIT */
   4532	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
   4533		goto done;
   4534	if (!force_nonblock || ret2 != -EAGAIN) {
   4535		/* IOPOLL retry should happen for io-wq threads */
   4536		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
   4537			goto copy_iov;
   4538done:
   4539		kiocb_done(req, ret2, issue_flags);
   4540	} else {
   4541copy_iov:
   4542		iov_iter_restore(&s->iter, &s->iter_state);
   4543		ret = io_setup_async_rw(req, iovec, s, false);
   4544		return ret ?: -EAGAIN;
   4545	}
   4546out_free:
   4547	/* it's reportedly faster than delegating the null check to kfree() */
   4548	if (iovec)
   4549		kfree(iovec);
   4550	return ret;
   4551}
   4552
   4553static int io_renameat_prep(struct io_kiocb *req,
   4554			    const struct io_uring_sqe *sqe)
   4555{
   4556	struct io_rename *ren = &req->rename;
   4557	const char __user *oldf, *newf;
   4558
   4559	if (sqe->buf_index || sqe->splice_fd_in)
   4560		return -EINVAL;
   4561	if (unlikely(req->flags & REQ_F_FIXED_FILE))
   4562		return -EBADF;
   4563
   4564	ren->old_dfd = READ_ONCE(sqe->fd);
   4565	oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
   4566	newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
   4567	ren->new_dfd = READ_ONCE(sqe->len);
   4568	ren->flags = READ_ONCE(sqe->rename_flags);
   4569
   4570	ren->oldpath = getname(oldf);
   4571	if (IS_ERR(ren->oldpath))
   4572		return PTR_ERR(ren->oldpath);
   4573
   4574	ren->newpath = getname(newf);
   4575	if (IS_ERR(ren->newpath)) {
   4576		putname(ren->oldpath);
   4577		return PTR_ERR(ren->newpath);
   4578	}
   4579
   4580	req->flags |= REQ_F_NEED_CLEANUP;
   4581	return 0;
   4582}
   4583
   4584static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
   4585{
   4586	struct io_rename *ren = &req->rename;
   4587	int ret;
   4588
   4589	if (issue_flags & IO_URING_F_NONBLOCK)
   4590		return -EAGAIN;
   4591
   4592	ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
   4593				ren->newpath, ren->flags);
   4594
   4595	req->flags &= ~REQ_F_NEED_CLEANUP;
   4596	io_req_complete(req, ret);
   4597	return 0;
   4598}
   4599
   4600static inline void __io_xattr_finish(struct io_kiocb *req)
   4601{
   4602	struct io_xattr *ix = &req->xattr;
   4603
   4604	if (ix->filename)
   4605		putname(ix->filename);
   4606
   4607	kfree(ix->ctx.kname);
   4608	kvfree(ix->ctx.kvalue);
   4609}
   4610
   4611static void io_xattr_finish(struct io_kiocb *req, int ret)
   4612{
   4613	req->flags &= ~REQ_F_NEED_CLEANUP;
   4614
   4615	__io_xattr_finish(req);
   4616	io_req_complete(req, ret);
   4617}
   4618
   4619static int __io_getxattr_prep(struct io_kiocb *req,
   4620			      const struct io_uring_sqe *sqe)
   4621{
   4622	struct io_xattr *ix = &req->xattr;
   4623	const char __user *name;
   4624	int ret;
   4625
   4626	if (unlikely(req->flags & REQ_F_FIXED_FILE))
   4627		return -EBADF;
   4628
   4629	ix->filename = NULL;
   4630	ix->ctx.kvalue = NULL;
   4631	name = u64_to_user_ptr(READ_ONCE(sqe->addr));
   4632	ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
   4633	ix->ctx.size = READ_ONCE(sqe->len);
   4634	ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
   4635
   4636	if (ix->ctx.flags)
   4637		return -EINVAL;
   4638
   4639	ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
   4640	if (!ix->ctx.kname)
   4641		return -ENOMEM;
   4642
   4643	ret = strncpy_from_user(ix->ctx.kname->name, name,
   4644				sizeof(ix->ctx.kname->name));
   4645	if (!ret || ret == sizeof(ix->ctx.kname->name))
   4646		ret = -ERANGE;
   4647	if (ret < 0) {
   4648		kfree(ix->ctx.kname);
   4649		return ret;
   4650	}
   4651
   4652	req->flags |= REQ_F_NEED_CLEANUP;
   4653	return 0;
   4654}
   4655
   4656static int io_fgetxattr_prep(struct io_kiocb *req,
   4657			     const struct io_uring_sqe *sqe)
   4658{
   4659	return __io_getxattr_prep(req, sqe);
   4660}
   4661
   4662static int io_getxattr_prep(struct io_kiocb *req,
   4663			    const struct io_uring_sqe *sqe)
   4664{
   4665	struct io_xattr *ix = &req->xattr;
   4666	const char __user *path;
   4667	int ret;
   4668
   4669	ret = __io_getxattr_prep(req, sqe);
   4670	if (ret)
   4671		return ret;
   4672
   4673	path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
   4674
   4675	ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
   4676	if (IS_ERR(ix->filename)) {
   4677		ret = PTR_ERR(ix->filename);
   4678		ix->filename = NULL;
   4679	}
   4680
   4681	return ret;
   4682}
   4683
   4684static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
   4685{
   4686	struct io_xattr *ix = &req->xattr;
   4687	int ret;
   4688
   4689	if (issue_flags & IO_URING_F_NONBLOCK)
   4690		return -EAGAIN;
   4691
   4692	ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt),
   4693			req->file->f_path.dentry,
   4694			&ix->ctx);
   4695
   4696	io_xattr_finish(req, ret);
   4697	return 0;
   4698}
   4699
   4700static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
   4701{
   4702	struct io_xattr *ix = &req->xattr;
   4703	unsigned int lookup_flags = LOOKUP_FOLLOW;
   4704	struct path path;
   4705	int ret;
   4706
   4707	if (issue_flags & IO_URING_F_NONBLOCK)
   4708		return -EAGAIN;
   4709
   4710retry:
   4711	ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
   4712	if (!ret) {
   4713		ret = do_getxattr(mnt_user_ns(path.mnt),
   4714				path.dentry,
   4715				&ix->ctx);
   4716
   4717		path_put(&path);
   4718		if (retry_estale(ret, lookup_flags)) {
   4719			lookup_flags |= LOOKUP_REVAL;
   4720			goto retry;
   4721		}
   4722	}
   4723
   4724	io_xattr_finish(req, ret);
   4725	return 0;
   4726}
   4727
   4728static int __io_setxattr_prep(struct io_kiocb *req,
   4729			const struct io_uring_sqe *sqe)
   4730{
   4731	struct io_xattr *ix = &req->xattr;
   4732	const char __user *name;
   4733	int ret;
   4734
   4735	if (unlikely(req->flags & REQ_F_FIXED_FILE))
   4736		return -EBADF;
   4737
   4738	ix->filename = NULL;
   4739	name = u64_to_user_ptr(READ_ONCE(sqe->addr));
   4740	ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
   4741	ix->ctx.kvalue = NULL;
   4742	ix->ctx.size = READ_ONCE(sqe->len);
   4743	ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
   4744
   4745	ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
   4746	if (!ix->ctx.kname)
   4747		return -ENOMEM;
   4748
   4749	ret = setxattr_copy(name, &ix->ctx);
   4750	if (ret) {
   4751		kfree(ix->ctx.kname);
   4752		return ret;
   4753	}
   4754
   4755	req->flags |= REQ_F_NEED_CLEANUP;
   4756	return 0;
   4757}
   4758
   4759static int io_setxattr_prep(struct io_kiocb *req,
   4760			const struct io_uring_sqe *sqe)
   4761{
   4762	struct io_xattr *ix = &req->xattr;
   4763	const char __user *path;
   4764	int ret;
   4765
   4766	ret = __io_setxattr_prep(req, sqe);
   4767	if (ret)
   4768		return ret;
   4769
   4770	path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
   4771
   4772	ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
   4773	if (IS_ERR(ix->filename)) {
   4774		ret = PTR_ERR(ix->filename);
   4775		ix->filename = NULL;
   4776	}
   4777
   4778	return ret;
   4779}
   4780
   4781static int io_fsetxattr_prep(struct io_kiocb *req,
   4782			const struct io_uring_sqe *sqe)
   4783{
   4784	return __io_setxattr_prep(req, sqe);
   4785}
   4786
   4787static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags,
   4788			struct path *path)
   4789{
   4790	struct io_xattr *ix = &req->xattr;
   4791	int ret;
   4792
   4793	ret = mnt_want_write(path->mnt);
   4794	if (!ret) {
   4795		ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx);
   4796		mnt_drop_write(path->mnt);
   4797	}
   4798
   4799	return ret;
   4800}
   4801
   4802static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
   4803{
   4804	int ret;
   4805
   4806	if (issue_flags & IO_URING_F_NONBLOCK)
   4807		return -EAGAIN;
   4808
   4809	ret = __io_setxattr(req, issue_flags, &req->file->f_path);
   4810	io_xattr_finish(req, ret);
   4811
   4812	return 0;
   4813}
   4814
   4815static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
   4816{
   4817	struct io_xattr *ix = &req->xattr;
   4818	unsigned int lookup_flags = LOOKUP_FOLLOW;
   4819	struct path path;
   4820	int ret;
   4821
   4822	if (issue_flags & IO_URING_F_NONBLOCK)
   4823		return -EAGAIN;
   4824
   4825retry:
   4826	ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
   4827	if (!ret) {
   4828		ret = __io_setxattr(req, issue_flags, &path);
   4829		path_put(&path);
   4830		if (retry_estale(ret, lookup_flags)) {
   4831			lookup_flags |= LOOKUP_REVAL;
   4832			goto retry;
   4833		}
   4834	}
   4835
   4836	io_xattr_finish(req, ret);
   4837	return 0;
   4838}
   4839
   4840static int io_unlinkat_prep(struct io_kiocb *req,
   4841			    const struct io_uring_sqe *sqe)
   4842{
   4843	struct io_unlink *un = &req->unlink;
   4844	const char __user *fname;
   4845
   4846	if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in)
   4847		return -EINVAL;
   4848	if (unlikely(req->flags & REQ_F_FIXED_FILE))
   4849		return -EBADF;
   4850
   4851	un->dfd = READ_ONCE(sqe->fd);
   4852
   4853	un->flags = READ_ONCE(sqe->unlink_flags);
   4854	if (un->flags & ~AT_REMOVEDIR)
   4855		return -EINVAL;
   4856
   4857	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
   4858	un->filename = getname(fname);
   4859	if (IS_ERR(un->filename))
   4860		return PTR_ERR(un->filename);
   4861
   4862	req->flags |= REQ_F_NEED_CLEANUP;
   4863	return 0;
   4864}
   4865
   4866static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
   4867{
   4868	struct io_unlink *un = &req->unlink;
   4869	int ret;
   4870
   4871	if (issue_flags & IO_URING_F_NONBLOCK)
   4872		return -EAGAIN;
   4873
   4874	if (un->flags & AT_REMOVEDIR)
   4875		ret = do_rmdir(un->dfd, un->filename);
   4876	else
   4877		ret = do_unlinkat(un->dfd, un->filename);
   4878
   4879	req->flags &= ~REQ_F_NEED_CLEANUP;
   4880	io_req_complete(req, ret);
   4881	return 0;
   4882}
   4883
   4884static int io_mkdirat_prep(struct io_kiocb *req,
   4885			    const struct io_uring_sqe *sqe)
   4886{
   4887	struct io_mkdir *mkd = &req->mkdir;
   4888	const char __user *fname;
   4889
   4890	if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
   4891		return -EINVAL;
   4892	if (unlikely(req->flags & REQ_F_FIXED_FILE))
   4893		return -EBADF;
   4894
   4895	mkd->dfd = READ_ONCE(sqe->fd);
   4896	mkd->mode = READ_ONCE(sqe->len);
   4897
   4898	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
   4899	mkd->filename = getname(fname);
   4900	if (IS_ERR(mkd->filename))
   4901		return PTR_ERR(mkd->filename);
   4902
   4903	req->flags |= REQ_F_NEED_CLEANUP;
   4904	return 0;
   4905}
   4906
   4907static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
   4908{
   4909	struct io_mkdir *mkd = &req->mkdir;
   4910	int ret;
   4911
   4912	if (issue_flags & IO_URING_F_NONBLOCK)
   4913		return -EAGAIN;
   4914
   4915	ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
   4916
   4917	req->flags &= ~REQ_F_NEED_CLEANUP;
   4918	io_req_complete(req, ret);
   4919	return 0;
   4920}
   4921
   4922static int io_symlinkat_prep(struct io_kiocb *req,
   4923			    const struct io_uring_sqe *sqe)
   4924{
   4925	struct io_symlink *sl = &req->symlink;
   4926	const char __user *oldpath, *newpath;
   4927
   4928	if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
   4929		return -EINVAL;
   4930	if (unlikely(req->flags & REQ_F_FIXED_FILE))
   4931		return -EBADF;
   4932
   4933	sl->new_dfd = READ_ONCE(sqe->fd);
   4934	oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
   4935	newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
   4936
   4937	sl->oldpath = getname(oldpath);
   4938	if (IS_ERR(sl->oldpath))
   4939		return PTR_ERR(sl->oldpath);
   4940
   4941	sl->newpath = getname(newpath);
   4942	if (IS_ERR(sl->newpath)) {
   4943		putname(sl->oldpath);
   4944		return PTR_ERR(sl->newpath);
   4945	}
   4946
   4947	req->flags |= REQ_F_NEED_CLEANUP;
   4948	return 0;
   4949}
   4950
   4951static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
   4952{
   4953	struct io_symlink *sl = &req->symlink;
   4954	int ret;
   4955
   4956	if (issue_flags & IO_URING_F_NONBLOCK)
   4957		return -EAGAIN;
   4958
   4959	ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
   4960
   4961	req->flags &= ~REQ_F_NEED_CLEANUP;
   4962	io_req_complete(req, ret);
   4963	return 0;
   4964}
   4965
   4966static int io_linkat_prep(struct io_kiocb *req,
   4967			    const struct io_uring_sqe *sqe)
   4968{
   4969	struct io_hardlink *lnk = &req->hardlink;
   4970	const char __user *oldf, *newf;
   4971
   4972	if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
   4973		return -EINVAL;
   4974	if (unlikely(req->flags & REQ_F_FIXED_FILE))
   4975		return -EBADF;
   4976
   4977	lnk->old_dfd = READ_ONCE(sqe->fd);
   4978	lnk->new_dfd = READ_ONCE(sqe->len);
   4979	oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
   4980	newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
   4981	lnk->flags = READ_ONCE(sqe->hardlink_flags);
   4982
   4983	lnk->oldpath = getname(oldf);
   4984	if (IS_ERR(lnk->oldpath))
   4985		return PTR_ERR(lnk->oldpath);
   4986
   4987	lnk->newpath = getname(newf);
   4988	if (IS_ERR(lnk->newpath)) {
   4989		putname(lnk->oldpath);
   4990		return PTR_ERR(lnk->newpath);
   4991	}
   4992
   4993	req->flags |= REQ_F_NEED_CLEANUP;
   4994	return 0;
   4995}
   4996
   4997static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
   4998{
   4999	struct io_hardlink *lnk = &req->hardlink;
   5000	int ret;
   5001
   5002	if (issue_flags & IO_URING_F_NONBLOCK)
   5003		return -EAGAIN;
   5004
   5005	ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
   5006				lnk->newpath, lnk->flags);
   5007
   5008	req->flags &= ~REQ_F_NEED_CLEANUP;
   5009	io_req_complete(req, ret);
   5010	return 0;
   5011}
   5012
   5013static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
   5014{
   5015	req->uring_cmd.task_work_cb(&req->uring_cmd);
   5016}
   5017
   5018void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
   5019			void (*task_work_cb)(struct io_uring_cmd *))
   5020{
   5021	struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
   5022
   5023	req->uring_cmd.task_work_cb = task_work_cb;
   5024	req->io_task_work.func = io_uring_cmd_work;
   5025	io_req_task_work_add(req);
   5026}
   5027EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task);
   5028
   5029static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
   5030					  u64 extra1, u64 extra2)
   5031{
   5032	req->extra1 = extra1;
   5033	req->extra2 = extra2;
   5034	req->flags |= REQ_F_CQE32_INIT;
   5035}
   5036
   5037/*
   5038 * Called by consumers of io_uring_cmd, if they originally returned
   5039 * -EIOCBQUEUED upon receiving the command.
   5040 */
   5041void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
   5042{
   5043	struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
   5044
   5045	if (ret < 0)
   5046		req_set_fail(req);
   5047
   5048	if (req->ctx->flags & IORING_SETUP_CQE32)
   5049		io_req_set_cqe32_extra(req, res2, 0);
   5050	io_req_complete(req, ret);
   5051}
   5052EXPORT_SYMBOL_GPL(io_uring_cmd_done);
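/*
 * Editor's note: a hypothetical driver-side sketch (not part of this file)
 * of the contract described above: the ->uring_cmd() handler returns
 * -EIOCBQUEUED and posts the completion later via io_uring_cmd_done().
 * The foo_* names are invented for illustration, and a real handler must
 * also honour issue_flags such as IO_URING_F_NONBLOCK.
 */
#if 0	/* illustration only */
struct foo_cmd {
	struct io_uring_cmd *ioucmd;
	struct work_struct work;
};

static void foo_cmd_work(struct work_struct *work)
{
	struct foo_cmd *fc = container_of(work, struct foo_cmd, work);

	/* ... perform the device operation here ... */
	io_uring_cmd_done(fc->ioucmd, 0, 0);
	kfree(fc);
}

/* wired up as the driver's file_operations->uring_cmd */
static int foo_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
	struct foo_cmd *fc;

	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
	if (!fc)
		return -ENOMEM;
	fc->ioucmd = ioucmd;
	INIT_WORK(&fc->work, foo_cmd_work);
	schedule_work(&fc->work);

	/* completion will be posted asynchronously by foo_cmd_work() */
	return -EIOCBQUEUED;
}
#endif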
   5053
   5054static int io_uring_cmd_prep_async(struct io_kiocb *req)
   5055{
   5056	size_t cmd_size;
   5057
   5058	cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
   5059
   5060	memcpy(req->async_data, req->uring_cmd.cmd, cmd_size);
   5061	return 0;
   5062}
   5063
   5064static int io_uring_cmd_prep(struct io_kiocb *req,
   5065			     const struct io_uring_sqe *sqe)
   5066{
   5067	struct io_uring_cmd *ioucmd = &req->uring_cmd;
   5068
   5069	if (sqe->rw_flags || sqe->__pad1)
   5070		return -EINVAL;
   5071	ioucmd->cmd = sqe->cmd;
   5072	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
   5073	return 0;
   5074}
   5075
   5076static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
   5077{
   5078	struct io_uring_cmd *ioucmd = &req->uring_cmd;
   5079	struct io_ring_ctx *ctx = req->ctx;
   5080	struct file *file = req->file;
   5081	int ret;
   5082
   5083	if (!req->file->f_op->uring_cmd)
   5084		return -EOPNOTSUPP;
   5085
   5086	if (ctx->flags & IORING_SETUP_SQE128)
   5087		issue_flags |= IO_URING_F_SQE128;
   5088	if (ctx->flags & IORING_SETUP_CQE32)
   5089		issue_flags |= IO_URING_F_CQE32;
   5090	if (ctx->flags & IORING_SETUP_IOPOLL)
   5091		issue_flags |= IO_URING_F_IOPOLL;
   5092
   5093	if (req_has_async_data(req))
   5094		ioucmd->cmd = req->async_data;
   5095
   5096	ret = file->f_op->uring_cmd(ioucmd, issue_flags);
   5097	if (ret == -EAGAIN) {
   5098		if (!req_has_async_data(req)) {
   5099			if (io_alloc_async_data(req))
   5100				return -ENOMEM;
   5101			io_uring_cmd_prep_async(req);
   5102		}
   5103		return -EAGAIN;
   5104	}
   5105
   5106	if (ret != -EIOCBQUEUED)
   5107		io_uring_cmd_done(ioucmd, ret, 0);
   5108	return 0;
   5109}
   5110
   5111static int __io_splice_prep(struct io_kiocb *req,
   5112			    const struct io_uring_sqe *sqe)
   5113{
   5114	struct io_splice *sp = &req->splice;
   5115	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
   5116
   5117	sp->len = READ_ONCE(sqe->len);
   5118	sp->flags = READ_ONCE(sqe->splice_flags);
   5119	if (unlikely(sp->flags & ~valid_flags))
   5120		return -EINVAL;
   5121	sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
   5122	return 0;
   5123}
   5124
   5125static int io_tee_prep(struct io_kiocb *req,
   5126		       const struct io_uring_sqe *sqe)
   5127{
   5128	if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
   5129		return -EINVAL;
   5130	return __io_splice_prep(req, sqe);
   5131}
   5132
   5133static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
   5134{
   5135	struct io_splice *sp = &req->splice;
   5136	struct file *out = sp->file_out;
   5137	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
   5138	struct file *in;
   5139	long ret = 0;
   5140
   5141	if (issue_flags & IO_URING_F_NONBLOCK)
   5142		return -EAGAIN;
   5143
   5144	if (sp->flags & SPLICE_F_FD_IN_FIXED)
   5145		in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
   5146	else
   5147		in = io_file_get_normal(req, sp->splice_fd_in);
   5148	if (!in) {
   5149		ret = -EBADF;
   5150		goto done;
   5151	}
   5152
   5153	if (sp->len)
   5154		ret = do_tee(in, out, sp->len, flags);
   5155
   5156	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
   5157		io_put_file(in);
   5158done:
   5159	if (ret != sp->len)
   5160		req_set_fail(req);
   5161	__io_req_complete(req, 0, ret, 0);
   5162	return 0;
   5163}
   5164
   5165static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   5166{
   5167	struct io_splice *sp = &req->splice;
   5168
   5169	sp->off_in = READ_ONCE(sqe->splice_off_in);
   5170	sp->off_out = READ_ONCE(sqe->off);
   5171	return __io_splice_prep(req, sqe);
   5172}
   5173
   5174static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
   5175{
   5176	struct io_splice *sp = &req->splice;
   5177	struct file *out = sp->file_out;
   5178	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
   5179	loff_t *poff_in, *poff_out;
   5180	struct file *in;
   5181	long ret = 0;
   5182
   5183	if (issue_flags & IO_URING_F_NONBLOCK)
   5184		return -EAGAIN;
   5185
   5186	if (sp->flags & SPLICE_F_FD_IN_FIXED)
   5187		in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
   5188	else
   5189		in = io_file_get_normal(req, sp->splice_fd_in);
   5190	if (!in) {
   5191		ret = -EBADF;
   5192		goto done;
   5193	}
   5194
   5195	poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
   5196	poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
   5197
   5198	if (sp->len)
   5199		ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
   5200
   5201	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
   5202		io_put_file(in);
   5203done:
   5204	if (ret != sp->len)
   5205		req_set_fail(req);
   5206	__io_req_complete(req, 0, ret, 0);
   5207	return 0;
   5208}
   5209
   5210static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   5211{
   5212	return 0;
   5213}
   5214
   5215/*
   5216 * IORING_OP_NOP just posts a completion event, nothing else.
   5217 */
   5218static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
   5219{
   5220	__io_req_complete(req, issue_flags, 0, 0);
   5221	return 0;
   5222}
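/*
 * Editor's note: IORING_OP_NOP is the smallest possible round trip, which
 * makes it handy for illustrating the basic submit/complete flow. A minimal
 * userspace sketch (not part of this file), assuming liburing:
 */
#if 0	/* userspace illustration only */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	if (io_uring_queue_init(4, &ring, 0) < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	/* a NOP simply completes with res == 0 */
	printf("nop completed, res=%d\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}
#endif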
   5223
   5224static int io_msg_ring_prep(struct io_kiocb *req,
   5225			    const struct io_uring_sqe *sqe)
   5226{
   5227	if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in ||
   5228		     sqe->buf_index || sqe->personality))
   5229		return -EINVAL;
   5230
   5231	req->msg.user_data = READ_ONCE(sqe->off);
   5232	req->msg.len = READ_ONCE(sqe->len);
   5233	return 0;
   5234}
   5235
   5236static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
   5237{
   5238	struct io_ring_ctx *target_ctx;
   5239	struct io_msg *msg = &req->msg;
   5240	bool filled;
   5241	int ret;
   5242
   5243	ret = -EBADFD;
   5244	if (req->file->f_op != &io_uring_fops)
   5245		goto done;
   5246
   5247	ret = -EOVERFLOW;
   5248	target_ctx = req->file->private_data;
   5249
   5250	spin_lock(&target_ctx->completion_lock);
   5251	filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0);
   5252	io_commit_cqring(target_ctx);
   5253	spin_unlock(&target_ctx->completion_lock);
   5254
   5255	if (filled) {
   5256		io_cqring_ev_posted(target_ctx);
   5257		ret = 0;
   5258	}
   5259
   5260done:
   5261	if (ret < 0)
   5262		req_set_fail(req);
   5263	__io_req_complete(req, issue_flags, ret, 0);
   5264	/* put file to avoid an attempt to IOPOLL the req */
   5265	io_put_file(req->file);
   5266	req->file = NULL;
   5267	return 0;
   5268}
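/*
 * Editor's note: a userspace sketch (not part of this file) of the pattern
 * IORING_OP_MSG_RING enables: one ring posting a CQE into another, e.g. to
 * wake a thread that is waiting on the target ring. It assumes liburing
 * provides io_uring_prep_msg_ring() with the signature used below (the len
 * argument becomes cqe->res and data becomes cqe->user_data on the target).
 */
#if 0	/* userspace illustration only */
#include <liburing.h>

static void msg_other_ring(struct io_uring *src, struct io_uring *target,
			   __u64 user_data)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(src);
	struct io_uring_cqe *cqe;

	io_uring_prep_msg_ring(sqe, target->ring_fd, 0, user_data, 0);
	io_uring_submit(src);

	/* the message shows up as an ordinary CQE on the target ring */
	io_uring_wait_cqe(target, &cqe);
	io_uring_cqe_seen(target, cqe);
	/* the MSG_RING op's own completion is left pending in src's CQ */
}
#endif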
   5269
   5270static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   5271{
   5272	if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
   5273		return -EINVAL;
   5274
   5275	req->sync.flags = READ_ONCE(sqe->fsync_flags);
   5276	if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
   5277		return -EINVAL;
   5278
   5279	req->sync.off = READ_ONCE(sqe->off);
   5280	req->sync.len = READ_ONCE(sqe->len);
   5281	return 0;
   5282}
   5283
   5284static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
   5285{
   5286	loff_t end = req->sync.off + req->sync.len;
   5287	int ret;
   5288
   5289	/* fsync always requires a blocking context */
   5290	if (issue_flags & IO_URING_F_NONBLOCK)
   5291		return -EAGAIN;
   5292
   5293	ret = vfs_fsync_range(req->file, req->sync.off,
   5294				end > 0 ? end : LLONG_MAX,
   5295				req->sync.flags & IORING_FSYNC_DATASYNC);
   5296	io_req_complete(req, ret);
   5297	return 0;
   5298}
   5299
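        /*
         * Note the IORING_OP_FALLOCATE SQE field mapping below: ->addr carries
         * the length, ->len carries the fallocate mode and ->off the offset.
         */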
   5300static int io_fallocate_prep(struct io_kiocb *req,
   5301			     const struct io_uring_sqe *sqe)
   5302{
   5303	if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
   5304		return -EINVAL;
   5305
   5306	req->sync.off = READ_ONCE(sqe->off);
   5307	req->sync.len = READ_ONCE(sqe->addr);
   5308	req->sync.mode = READ_ONCE(sqe->len);
   5309	return 0;
   5310}
   5311
   5312static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
   5313{
   5314	int ret;
   5315
    5316	/* fallocate always requires a blocking context */
   5317	if (issue_flags & IO_URING_F_NONBLOCK)
   5318		return -EAGAIN;
   5319	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
   5320				req->sync.len);
   5321	if (ret >= 0)
   5322		fsnotify_modify(req->file);
   5323	io_req_complete(req, ret);
   5324	return 0;
   5325}
   5326
   5327static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   5328{
   5329	const char __user *fname;
   5330	int ret;
   5331
   5332	if (unlikely(sqe->buf_index))
   5333		return -EINVAL;
   5334	if (unlikely(req->flags & REQ_F_FIXED_FILE))
   5335		return -EBADF;
   5336
    5337	/* open.how should already be initialised */
   5338	if (!(req->open.how.flags & O_PATH) && force_o_largefile())
   5339		req->open.how.flags |= O_LARGEFILE;
   5340
   5341	req->open.dfd = READ_ONCE(sqe->fd);
   5342	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
   5343	req->open.filename = getname(fname);
   5344	if (IS_ERR(req->open.filename)) {
   5345		ret = PTR_ERR(req->open.filename);
   5346		req->open.filename = NULL;
   5347		return ret;
   5348	}
   5349
   5350	req->open.file_slot = READ_ONCE(sqe->file_index);
   5351	if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
   5352		return -EINVAL;
   5353
   5354	req->open.nofile = rlimit(RLIMIT_NOFILE);
   5355	req->flags |= REQ_F_NEED_CLEANUP;
   5356	return 0;
   5357}
   5358
   5359static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   5360{
   5361	u64 mode = READ_ONCE(sqe->len);
   5362	u64 flags = READ_ONCE(sqe->open_flags);
   5363
   5364	req->open.how = build_open_how(flags, mode);
   5365	return __io_openat_prep(req, sqe);
   5366}
   5367
   5368static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   5369{
   5370	struct open_how __user *how;
   5371	size_t len;
   5372	int ret;
   5373
   5374	how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
   5375	len = READ_ONCE(sqe->len);
   5376	if (len < OPEN_HOW_SIZE_VER0)
   5377		return -EINVAL;
   5378
   5379	ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
   5380					len);
   5381	if (ret)
   5382		return ret;
   5383
   5384	return __io_openat_prep(req, sqe);
   5385}
   5386
   5387static int io_file_bitmap_get(struct io_ring_ctx *ctx)
   5388{
   5389	struct io_file_table *table = &ctx->file_table;
   5390	unsigned long nr = ctx->nr_user_files;
   5391	int ret;
   5392
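        	/*
        	 * Look for a free slot from the allocation hint to the end of the
        	 * bitmap first; if nothing is found, wrap around and scan the
        	 * slots below the hint once before giving up with -ENFILE.
        	 */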
   5393	do {
   5394		ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint);
   5395		if (ret != nr)
   5396			return ret;
   5397
   5398		if (!table->alloc_hint)
   5399			break;
   5400
   5401		nr = table->alloc_hint;
   5402		table->alloc_hint = 0;
   5403	} while (1);
   5404
   5405	return -ENFILE;
   5406}
   5407
   5408/*
   5409 * Note when io_fixed_fd_install() returns error value, it will ensure
   5410 * fput() is called correspondingly.
   5411 */
   5412static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
   5413			       struct file *file, unsigned int file_slot)
   5414{
   5415	bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC;
   5416	struct io_ring_ctx *ctx = req->ctx;
   5417	int ret;
   5418
   5419	io_ring_submit_lock(ctx, issue_flags);
   5420
   5421	if (alloc_slot) {
   5422		ret = io_file_bitmap_get(ctx);
   5423		if (unlikely(ret < 0))
   5424			goto err;
   5425		file_slot = ret;
   5426	} else {
   5427		file_slot--;
   5428	}
   5429
   5430	ret = io_install_fixed_file(req, file, issue_flags, file_slot);
   5431	if (!ret && alloc_slot)
   5432		ret = file_slot;
   5433err:
   5434	io_ring_submit_unlock(ctx, issue_flags);
   5435	if (unlikely(ret < 0))
   5436		fput(file);
   5437	return ret;
   5438}
   5439
   5440static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
   5441{
   5442	struct open_flags op;
   5443	struct file *file;
   5444	bool resolve_nonblock, nonblock_set;
   5445	bool fixed = !!req->open.file_slot;
   5446	int ret;
   5447
   5448	ret = build_open_flags(&req->open.how, &op);
   5449	if (ret)
   5450		goto err;
   5451	nonblock_set = op.open_flag & O_NONBLOCK;
   5452	resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
   5453	if (issue_flags & IO_URING_F_NONBLOCK) {
   5454		/*
   5455		 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
   5456		 * it'll always -EAGAIN
   5457		 */
   5458		if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
   5459			return -EAGAIN;
   5460		op.lookup_flags |= LOOKUP_CACHED;
   5461		op.open_flag |= O_NONBLOCK;
   5462	}
   5463
   5464	if (!fixed) {
   5465		ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
   5466		if (ret < 0)
   5467			goto err;
   5468	}
   5469
   5470	file = do_filp_open(req->open.dfd, req->open.filename, &op);
   5471	if (IS_ERR(file)) {
   5472		/*
    5473		 * We could hang on to this 'fd' across the retry, but it seems like
    5474		 * a marginal gain for something that is now known to be a slower
    5475		 * path. So just put it, and we'll get a new one when we retry.
   5476		 */
   5477		if (!fixed)
   5478			put_unused_fd(ret);
   5479
   5480		ret = PTR_ERR(file);
   5481		/* only retry if RESOLVE_CACHED wasn't already set by application */
   5482		if (ret == -EAGAIN &&
   5483		    (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
   5484			return -EAGAIN;
   5485		goto err;
   5486	}
   5487
   5488	if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
   5489		file->f_flags &= ~O_NONBLOCK;
   5490	fsnotify_open(file);
   5491
   5492	if (!fixed)
   5493		fd_install(ret, file);
   5494	else
   5495		ret = io_fixed_fd_install(req, issue_flags, file,
   5496						req->open.file_slot);
   5497err:
   5498	putname(req->open.filename);
   5499	req->flags &= ~REQ_F_NEED_CLEANUP;
   5500	if (ret < 0)
   5501		req_set_fail(req);
   5502	__io_req_complete(req, issue_flags, ret, 0);
   5503	return 0;
   5504}
   5505
   5506static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
   5507{
   5508	return io_openat2(req, issue_flags);
   5509}
   5510
   5511static int io_remove_buffers_prep(struct io_kiocb *req,
   5512				  const struct io_uring_sqe *sqe)
   5513{
   5514	struct io_provide_buf *p = &req->pbuf;
   5515	u64 tmp;
   5516
   5517	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
   5518	    sqe->splice_fd_in)
   5519		return -EINVAL;
   5520
   5521	tmp = READ_ONCE(sqe->fd);
   5522	if (!tmp || tmp > USHRT_MAX)
   5523		return -EINVAL;
   5524
   5525	memset(p, 0, sizeof(*p));
   5526	p->nbufs = tmp;
   5527	p->bgid = READ_ONCE(sqe->buf_group);
   5528	return 0;
   5529}
   5530
   5531static int __io_remove_buffers(struct io_ring_ctx *ctx,
   5532			       struct io_buffer_list *bl, unsigned nbufs)
   5533{
   5534	unsigned i = 0;
   5535
   5536	/* shouldn't happen */
   5537	if (!nbufs)
   5538		return 0;
   5539
   5540	if (bl->buf_nr_pages) {
   5541		int j;
   5542
   5543		i = bl->buf_ring->tail - bl->head;
   5544		for (j = 0; j < bl->buf_nr_pages; j++)
   5545			unpin_user_page(bl->buf_pages[j]);
   5546		kvfree(bl->buf_pages);
   5547		bl->buf_pages = NULL;
   5548		bl->buf_nr_pages = 0;
   5549		/* make sure it's seen as empty */
   5550		INIT_LIST_HEAD(&bl->buf_list);
   5551		return i;
   5552	}
   5553
   5554	/* the head kbuf is the list itself */
   5555	while (!list_empty(&bl->buf_list)) {
   5556		struct io_buffer *nxt;
   5557
   5558		nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
   5559		list_del(&nxt->list);
   5560		if (++i == nbufs)
   5561			return i;
   5562		cond_resched();
   5563	}
   5564	i++;
   5565
   5566	return i;
   5567}
   5568
   5569static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
   5570{
   5571	struct io_provide_buf *p = &req->pbuf;
   5572	struct io_ring_ctx *ctx = req->ctx;
   5573	struct io_buffer_list *bl;
   5574	int ret = 0;
   5575
   5576	io_ring_submit_lock(ctx, issue_flags);
   5577
   5578	ret = -ENOENT;
   5579	bl = io_buffer_get_list(ctx, p->bgid);
   5580	if (bl) {
   5581		ret = -EINVAL;
   5582		/* can't use provide/remove buffers command on mapped buffers */
   5583		if (!bl->buf_nr_pages)
   5584			ret = __io_remove_buffers(ctx, bl, p->nbufs);
   5585	}
   5586	if (ret < 0)
   5587		req_set_fail(req);
   5588
   5589	/* complete before unlock, IOPOLL may need the lock */
   5590	__io_req_complete(req, issue_flags, ret, 0);
   5591	io_ring_submit_unlock(ctx, issue_flags);
   5592	return 0;
   5593}
   5594
   5595static int io_provide_buffers_prep(struct io_kiocb *req,
   5596				   const struct io_uring_sqe *sqe)
   5597{
   5598	unsigned long size, tmp_check;
   5599	struct io_provide_buf *p = &req->pbuf;
   5600	u64 tmp;
   5601
   5602	if (sqe->rw_flags || sqe->splice_fd_in)
   5603		return -EINVAL;
   5604
   5605	tmp = READ_ONCE(sqe->fd);
   5606	if (!tmp || tmp > USHRT_MAX)
   5607		return -E2BIG;
   5608	p->nbufs = tmp;
   5609	p->addr = READ_ONCE(sqe->addr);
   5610	p->len = READ_ONCE(sqe->len);
   5611
   5612	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
   5613				&size))
   5614		return -EOVERFLOW;
   5615	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
   5616		return -EOVERFLOW;
   5617
   5618	size = (unsigned long)p->len * p->nbufs;
   5619	if (!access_ok(u64_to_user_ptr(p->addr), size))
   5620		return -EFAULT;
   5621
   5622	p->bgid = READ_ONCE(sqe->buf_group);
   5623	tmp = READ_ONCE(sqe->off);
   5624	if (tmp > USHRT_MAX)
   5625		return -E2BIG;
   5626	p->bid = tmp;
   5627	return 0;
   5628}
   5629
   5630static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
   5631{
   5632	struct io_buffer *buf;
   5633	struct page *page;
   5634	int bufs_in_page;
   5635
   5636	/*
   5637	 * Completions that don't happen inline (eg not under uring_lock) will
   5638	 * add to ->io_buffers_comp. If we don't have any free buffers, check
   5639	 * the completion list and splice those entries first.
   5640	 */
   5641	if (!list_empty_careful(&ctx->io_buffers_comp)) {
   5642		spin_lock(&ctx->completion_lock);
   5643		if (!list_empty(&ctx->io_buffers_comp)) {
   5644			list_splice_init(&ctx->io_buffers_comp,
   5645						&ctx->io_buffers_cache);
   5646			spin_unlock(&ctx->completion_lock);
   5647			return 0;
   5648		}
   5649		spin_unlock(&ctx->completion_lock);
   5650	}
   5651
   5652	/*
   5653	 * No free buffers and no completion entries either. Allocate a new
   5654	 * page worth of buffer entries and add those to our freelist.
   5655	 */
   5656	page = alloc_page(GFP_KERNEL_ACCOUNT);
   5657	if (!page)
   5658		return -ENOMEM;
   5659
   5660	list_add(&page->lru, &ctx->io_buffers_pages);
   5661
   5662	buf = page_address(page);
   5663	bufs_in_page = PAGE_SIZE / sizeof(*buf);
   5664	while (bufs_in_page) {
   5665		list_add_tail(&buf->list, &ctx->io_buffers_cache);
   5666		buf++;
   5667		bufs_in_page--;
   5668	}
   5669
   5670	return 0;
   5671}
   5672
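        /*
         * Move up to pbuf->nbufs entries from the buffer cache onto the group's
         * list, assigning consecutive buffer IDs starting at pbuf->bid and
         * advancing the user address by pbuf->len for each buffer.
         */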
   5673static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
   5674			  struct io_buffer_list *bl)
   5675{
   5676	struct io_buffer *buf;
   5677	u64 addr = pbuf->addr;
   5678	int i, bid = pbuf->bid;
   5679
   5680	for (i = 0; i < pbuf->nbufs; i++) {
   5681		if (list_empty(&ctx->io_buffers_cache) &&
   5682		    io_refill_buffer_cache(ctx))
   5683			break;
   5684		buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
   5685					list);
   5686		list_move_tail(&buf->list, &bl->buf_list);
   5687		buf->addr = addr;
   5688		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
   5689		buf->bid = bid;
   5690		buf->bgid = pbuf->bgid;
   5691		addr += pbuf->len;
   5692		bid++;
   5693		cond_resched();
   5694	}
   5695
   5696	return i ? 0 : -ENOMEM;
   5697}
   5698
   5699static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
   5700{
   5701	int i;
   5702
   5703	ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
   5704				GFP_KERNEL);
   5705	if (!ctx->io_bl)
   5706		return -ENOMEM;
   5707
   5708	for (i = 0; i < BGID_ARRAY; i++) {
   5709		INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
   5710		ctx->io_bl[i].bgid = i;
   5711	}
   5712
   5713	return 0;
   5714}
   5715
   5716static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
   5717{
   5718	struct io_provide_buf *p = &req->pbuf;
   5719	struct io_ring_ctx *ctx = req->ctx;
   5720	struct io_buffer_list *bl;
   5721	int ret = 0;
   5722
   5723	io_ring_submit_lock(ctx, issue_flags);
   5724
   5725	if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
   5726		ret = io_init_bl_list(ctx);
   5727		if (ret)
   5728			goto err;
   5729	}
   5730
   5731	bl = io_buffer_get_list(ctx, p->bgid);
   5732	if (unlikely(!bl)) {
   5733		bl = kzalloc(sizeof(*bl), GFP_KERNEL);
   5734		if (!bl) {
   5735			ret = -ENOMEM;
   5736			goto err;
   5737		}
   5738		INIT_LIST_HEAD(&bl->buf_list);
   5739		ret = io_buffer_add_list(ctx, bl, p->bgid);
   5740		if (ret) {
   5741			kfree(bl);
   5742			goto err;
   5743		}
   5744	}
   5745	/* can't add buffers via this command for a mapped buffer ring */
   5746	if (bl->buf_nr_pages) {
   5747		ret = -EINVAL;
   5748		goto err;
   5749	}
   5750
   5751	ret = io_add_buffers(ctx, p, bl);
   5752err:
   5753	if (ret < 0)
   5754		req_set_fail(req);
   5755	/* complete before unlock, IOPOLL may need the lock */
   5756	__io_req_complete(req, issue_flags, ret, 0);
   5757	io_ring_submit_unlock(ctx, issue_flags);
   5758	return 0;
   5759}
   5760
   5761static int io_epoll_ctl_prep(struct io_kiocb *req,
   5762			     const struct io_uring_sqe *sqe)
   5763{
   5764#if defined(CONFIG_EPOLL)
   5765	if (sqe->buf_index || sqe->splice_fd_in)
   5766		return -EINVAL;
   5767
   5768	req->epoll.epfd = READ_ONCE(sqe->fd);
   5769	req->epoll.op = READ_ONCE(sqe->len);
   5770	req->epoll.fd = READ_ONCE(sqe->off);
   5771
   5772	if (ep_op_has_event(req->epoll.op)) {
   5773		struct epoll_event __user *ev;
   5774
   5775		ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
   5776		if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
   5777			return -EFAULT;
   5778	}
   5779
   5780	return 0;
   5781#else
   5782	return -EOPNOTSUPP;
   5783#endif
   5784}
   5785
   5786static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
   5787{
   5788#if defined(CONFIG_EPOLL)
   5789	struct io_epoll *ie = &req->epoll;
   5790	int ret;
   5791	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
   5792
   5793	ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
   5794	if (force_nonblock && ret == -EAGAIN)
   5795		return -EAGAIN;
   5796
   5797	if (ret < 0)
   5798		req_set_fail(req);
   5799	__io_req_complete(req, issue_flags, ret, 0);
   5800	return 0;
   5801#else
   5802	return -EOPNOTSUPP;
   5803#endif
   5804}
   5805
   5806static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   5807{
   5808#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
   5809	if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
   5810		return -EINVAL;
   5811
   5812	req->madvise.addr = READ_ONCE(sqe->addr);
   5813	req->madvise.len = READ_ONCE(sqe->len);
   5814	req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
   5815	return 0;
   5816#else
   5817	return -EOPNOTSUPP;
   5818#endif
   5819}
   5820
   5821static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
   5822{
   5823#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
   5824	struct io_madvise *ma = &req->madvise;
   5825	int ret;
   5826
   5827	if (issue_flags & IO_URING_F_NONBLOCK)
   5828		return -EAGAIN;
   5829
   5830	ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
   5831	io_req_complete(req, ret);
   5832	return 0;
   5833#else
   5834	return -EOPNOTSUPP;
   5835#endif
   5836}
   5837
   5838static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   5839{
   5840	if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
   5841		return -EINVAL;
   5842
   5843	req->fadvise.offset = READ_ONCE(sqe->off);
   5844	req->fadvise.len = READ_ONCE(sqe->len);
   5845	req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
   5846	return 0;
   5847}
   5848
   5849static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
   5850{
   5851	struct io_fadvise *fa = &req->fadvise;
   5852	int ret;
   5853
   5854	if (issue_flags & IO_URING_F_NONBLOCK) {
   5855		switch (fa->advice) {
   5856		case POSIX_FADV_NORMAL:
   5857		case POSIX_FADV_RANDOM:
   5858		case POSIX_FADV_SEQUENTIAL:
   5859			break;
   5860		default:
   5861			return -EAGAIN;
   5862		}
   5863	}
   5864
   5865	ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
   5866	if (ret < 0)
   5867		req_set_fail(req);
   5868	__io_req_complete(req, issue_flags, ret, 0);
   5869	return 0;
   5870}
   5871
   5872static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   5873{
   5874	const char __user *path;
   5875
   5876	if (sqe->buf_index || sqe->splice_fd_in)
   5877		return -EINVAL;
   5878	if (req->flags & REQ_F_FIXED_FILE)
   5879		return -EBADF;
   5880
   5881	req->statx.dfd = READ_ONCE(sqe->fd);
   5882	req->statx.mask = READ_ONCE(sqe->len);
   5883	path = u64_to_user_ptr(READ_ONCE(sqe->addr));
   5884	req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
   5885	req->statx.flags = READ_ONCE(sqe->statx_flags);
   5886
   5887	req->statx.filename = getname_flags(path,
   5888					getname_statx_lookup_flags(req->statx.flags),
   5889					NULL);
   5890
   5891	if (IS_ERR(req->statx.filename)) {
   5892		int ret = PTR_ERR(req->statx.filename);
   5893
   5894		req->statx.filename = NULL;
   5895		return ret;
   5896	}
   5897
   5898	req->flags |= REQ_F_NEED_CLEANUP;
   5899	return 0;
   5900}
   5901
   5902static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
   5903{
   5904	struct io_statx *ctx = &req->statx;
   5905	int ret;
   5906
   5907	if (issue_flags & IO_URING_F_NONBLOCK)
   5908		return -EAGAIN;
   5909
   5910	ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
   5911		       ctx->buffer);
   5912	io_req_complete(req, ret);
   5913	return 0;
   5914}
   5915
   5916static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   5917{
   5918	if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index)
   5919		return -EINVAL;
   5920	if (req->flags & REQ_F_FIXED_FILE)
   5921		return -EBADF;
   5922
   5923	req->close.fd = READ_ONCE(sqe->fd);
   5924	req->close.file_slot = READ_ONCE(sqe->file_index);
   5925	if (req->close.file_slot && req->close.fd)
   5926		return -EINVAL;
   5927
   5928	return 0;
   5929}
   5930
   5931static int io_close(struct io_kiocb *req, unsigned int issue_flags)
   5932{
   5933	struct files_struct *files = current->files;
   5934	struct io_close *close = &req->close;
   5935	struct fdtable *fdt;
   5936	struct file *file;
   5937	int ret = -EBADF;
   5938
   5939	if (req->close.file_slot) {
   5940		ret = io_close_fixed(req, issue_flags);
   5941		goto err;
   5942	}
   5943
   5944	spin_lock(&files->file_lock);
   5945	fdt = files_fdtable(files);
   5946	if (close->fd >= fdt->max_fds) {
   5947		spin_unlock(&files->file_lock);
   5948		goto err;
   5949	}
   5950	file = rcu_dereference_protected(fdt->fd[close->fd],
   5951			lockdep_is_held(&files->file_lock));
   5952	if (!file || file->f_op == &io_uring_fops) {
   5953		spin_unlock(&files->file_lock);
   5954		goto err;
   5955	}
   5956
   5957	/* if the file has a flush method, be safe and punt to async */
   5958	if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
   5959		spin_unlock(&files->file_lock);
   5960		return -EAGAIN;
   5961	}
   5962
   5963	file = __close_fd_get_file(close->fd);
   5964	spin_unlock(&files->file_lock);
   5965	if (!file)
   5966		goto err;
   5967
   5968	/* No ->flush() or already async, safely close from here */
   5969	ret = filp_close(file, current->files);
   5970err:
   5971	if (ret < 0)
   5972		req_set_fail(req);
   5973	__io_req_complete(req, issue_flags, ret, 0);
   5974	return 0;
   5975}
   5976
   5977static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   5978{
   5979	if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
   5980		return -EINVAL;
   5981
   5982	req->sync.off = READ_ONCE(sqe->off);
   5983	req->sync.len = READ_ONCE(sqe->len);
   5984	req->sync.flags = READ_ONCE(sqe->sync_range_flags);
   5985	return 0;
   5986}
   5987
   5988static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
   5989{
   5990	int ret;
   5991
   5992	/* sync_file_range always requires a blocking context */
   5993	if (issue_flags & IO_URING_F_NONBLOCK)
   5994		return -EAGAIN;
   5995
   5996	ret = sync_file_range(req->file, req->sync.off, req->sync.len,
   5997				req->sync.flags);
   5998	io_req_complete(req, ret);
   5999	return 0;
   6000}
   6001
   6002#if defined(CONFIG_NET)
   6003static int io_shutdown_prep(struct io_kiocb *req,
   6004			    const struct io_uring_sqe *sqe)
   6005{
   6006	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
   6007		     sqe->buf_index || sqe->splice_fd_in))
   6008		return -EINVAL;
   6009
   6010	req->shutdown.how = READ_ONCE(sqe->len);
   6011	return 0;
   6012}
   6013
   6014static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
   6015{
   6016	struct socket *sock;
   6017	int ret;
   6018
   6019	if (issue_flags & IO_URING_F_NONBLOCK)
   6020		return -EAGAIN;
   6021
   6022	sock = sock_from_file(req->file);
   6023	if (unlikely(!sock))
   6024		return -ENOTSOCK;
   6025
   6026	ret = __sys_shutdown_sock(sock, req->shutdown.how);
   6027	io_req_complete(req, ret);
   6028	return 0;
   6029}
   6030
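        /*
         * A short transfer is only worth retrying if MSG_WAITALL was requested
         * and the socket is stream-like, i.e. the remainder can be picked up
         * where the previous attempt left off.
         */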
   6031static bool io_net_retry(struct socket *sock, int flags)
   6032{
   6033	if (!(flags & MSG_WAITALL))
   6034		return false;
   6035	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
   6036}
   6037
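        /*
         * Stash the msghdr state in req->async_data so the send/recv can be
         * retried from a blocking context; returns -EAGAIN (or -ENOMEM if the
         * allocation fails) so the caller punts the request.
         */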
   6038static int io_setup_async_msg(struct io_kiocb *req,
   6039			      struct io_async_msghdr *kmsg)
   6040{
   6041	struct io_async_msghdr *async_msg = req->async_data;
   6042
   6043	if (async_msg)
   6044		return -EAGAIN;
   6045	if (io_alloc_async_data(req)) {
   6046		kfree(kmsg->free_iov);
   6047		return -ENOMEM;
   6048	}
   6049	async_msg = req->async_data;
   6050	req->flags |= REQ_F_NEED_CLEANUP;
   6051	memcpy(async_msg, kmsg, sizeof(*kmsg));
   6052	async_msg->msg.msg_name = &async_msg->addr;
    6053	/* if we were using fast_iov, set it to the new one */
   6054	if (!async_msg->free_iov)
   6055		async_msg->msg.msg_iter.iov = async_msg->fast_iov;
   6056
   6057	return -EAGAIN;
   6058}
   6059
   6060static int io_sendmsg_copy_hdr(struct io_kiocb *req,
   6061			       struct io_async_msghdr *iomsg)
   6062{
   6063	iomsg->msg.msg_name = &iomsg->addr;
   6064	iomsg->free_iov = iomsg->fast_iov;
   6065	return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
   6066				   req->sr_msg.msg_flags, &iomsg->free_iov);
   6067}
   6068
   6069static int io_sendmsg_prep_async(struct io_kiocb *req)
   6070{
   6071	int ret;
   6072
   6073	ret = io_sendmsg_copy_hdr(req, req->async_data);
   6074	if (!ret)
   6075		req->flags |= REQ_F_NEED_CLEANUP;
   6076	return ret;
   6077}
   6078
   6079static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   6080{
   6081	struct io_sr_msg *sr = &req->sr_msg;
   6082
   6083	if (unlikely(sqe->file_index || sqe->addr2))
   6084		return -EINVAL;
   6085
   6086	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
   6087	sr->len = READ_ONCE(sqe->len);
   6088	sr->flags = READ_ONCE(sqe->ioprio);
   6089	if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
   6090		return -EINVAL;
   6091	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
   6092	if (sr->msg_flags & MSG_DONTWAIT)
   6093		req->flags |= REQ_F_NOWAIT;
   6094
   6095#ifdef CONFIG_COMPAT
   6096	if (req->ctx->compat)
   6097		sr->msg_flags |= MSG_CMSG_COMPAT;
   6098#endif
   6099	sr->done_io = 0;
   6100	return 0;
   6101}
   6102
   6103static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
   6104{
   6105	struct io_async_msghdr iomsg, *kmsg;
   6106	struct io_sr_msg *sr = &req->sr_msg;
   6107	struct socket *sock;
   6108	unsigned flags;
   6109	int min_ret = 0;
   6110	int ret;
   6111
   6112	sock = sock_from_file(req->file);
   6113	if (unlikely(!sock))
   6114		return -ENOTSOCK;
   6115
   6116	if (req_has_async_data(req)) {
   6117		kmsg = req->async_data;
   6118	} else {
   6119		ret = io_sendmsg_copy_hdr(req, &iomsg);
   6120		if (ret)
   6121			return ret;
   6122		kmsg = &iomsg;
   6123	}
   6124
   6125	if (!(req->flags & REQ_F_POLLED) &&
   6126	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
   6127		return io_setup_async_msg(req, kmsg);
   6128
   6129	flags = sr->msg_flags;
   6130	if (issue_flags & IO_URING_F_NONBLOCK)
   6131		flags |= MSG_DONTWAIT;
   6132	if (flags & MSG_WAITALL)
   6133		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
   6134
   6135	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
   6136
   6137	if (ret < min_ret) {
   6138		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
   6139			return io_setup_async_msg(req, kmsg);
   6140		if (ret == -ERESTARTSYS)
   6141			ret = -EINTR;
   6142		if (ret > 0 && io_net_retry(sock, flags)) {
   6143			sr->done_io += ret;
   6144			req->flags |= REQ_F_PARTIAL_IO;
   6145			return io_setup_async_msg(req, kmsg);
   6146		}
   6147		req_set_fail(req);
   6148	}
   6149	/* fast path, check for non-NULL to avoid function call */
   6150	if (kmsg->free_iov)
   6151		kfree(kmsg->free_iov);
   6152	req->flags &= ~REQ_F_NEED_CLEANUP;
   6153	if (ret >= 0)
   6154		ret += sr->done_io;
   6155	else if (sr->done_io)
   6156		ret = sr->done_io;
   6157	__io_req_complete(req, issue_flags, ret, 0);
   6158	return 0;
   6159}
   6160
   6161static int io_send(struct io_kiocb *req, unsigned int issue_flags)
   6162{
   6163	struct io_sr_msg *sr = &req->sr_msg;
   6164	struct msghdr msg;
   6165	struct iovec iov;
   6166	struct socket *sock;
   6167	unsigned flags;
   6168	int min_ret = 0;
   6169	int ret;
   6170
   6171	if (!(req->flags & REQ_F_POLLED) &&
   6172	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
   6173		return -EAGAIN;
   6174
   6175	sock = sock_from_file(req->file);
   6176	if (unlikely(!sock))
   6177		return -ENOTSOCK;
   6178
   6179	ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
   6180	if (unlikely(ret))
   6181		return ret;
   6182
   6183	msg.msg_name = NULL;
   6184	msg.msg_control = NULL;
   6185	msg.msg_controllen = 0;
   6186	msg.msg_namelen = 0;
   6187
   6188	flags = sr->msg_flags;
   6189	if (issue_flags & IO_URING_F_NONBLOCK)
   6190		flags |= MSG_DONTWAIT;
   6191	if (flags & MSG_WAITALL)
   6192		min_ret = iov_iter_count(&msg.msg_iter);
   6193
   6194	msg.msg_flags = flags;
   6195	ret = sock_sendmsg(sock, &msg);
   6196	if (ret < min_ret) {
   6197		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
   6198			return -EAGAIN;
   6199		if (ret == -ERESTARTSYS)
   6200			ret = -EINTR;
   6201		if (ret > 0 && io_net_retry(sock, flags)) {
   6202			sr->len -= ret;
   6203			sr->buf += ret;
   6204			sr->done_io += ret;
   6205			req->flags |= REQ_F_PARTIAL_IO;
   6206			return -EAGAIN;
   6207		}
   6208		req_set_fail(req);
   6209	}
   6210	if (ret >= 0)
   6211		ret += sr->done_io;
   6212	else if (sr->done_io)
   6213		ret = sr->done_io;
   6214	__io_req_complete(req, issue_flags, ret, 0);
   6215	return 0;
   6216}
   6217
   6218static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
   6219				 struct io_async_msghdr *iomsg)
   6220{
   6221	struct io_sr_msg *sr = &req->sr_msg;
   6222	struct iovec __user *uiov;
   6223	size_t iov_len;
   6224	int ret;
   6225
   6226	ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
   6227					&iomsg->uaddr, &uiov, &iov_len);
   6228	if (ret)
   6229		return ret;
   6230
   6231	if (req->flags & REQ_F_BUFFER_SELECT) {
   6232		if (iov_len > 1)
   6233			return -EINVAL;
   6234		if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
   6235			return -EFAULT;
   6236		sr->len = iomsg->fast_iov[0].iov_len;
   6237		iomsg->free_iov = NULL;
   6238	} else {
   6239		iomsg->free_iov = iomsg->fast_iov;
   6240		ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
   6241				     &iomsg->free_iov, &iomsg->msg.msg_iter,
   6242				     false);
   6243		if (ret > 0)
   6244			ret = 0;
   6245	}
   6246
   6247	return ret;
   6248}
   6249
   6250#ifdef CONFIG_COMPAT
   6251static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
   6252					struct io_async_msghdr *iomsg)
   6253{
   6254	struct io_sr_msg *sr = &req->sr_msg;
   6255	struct compat_iovec __user *uiov;
   6256	compat_uptr_t ptr;
   6257	compat_size_t len;
   6258	int ret;
   6259
   6260	ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
   6261				  &ptr, &len);
   6262	if (ret)
   6263		return ret;
   6264
   6265	uiov = compat_ptr(ptr);
   6266	if (req->flags & REQ_F_BUFFER_SELECT) {
   6267		compat_ssize_t clen;
   6268
   6269		if (len > 1)
   6270			return -EINVAL;
   6271		if (!access_ok(uiov, sizeof(*uiov)))
   6272			return -EFAULT;
   6273		if (__get_user(clen, &uiov->iov_len))
   6274			return -EFAULT;
   6275		if (clen < 0)
   6276			return -EINVAL;
   6277		sr->len = clen;
   6278		iomsg->free_iov = NULL;
   6279	} else {
   6280		iomsg->free_iov = iomsg->fast_iov;
   6281		ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
   6282				   UIO_FASTIOV, &iomsg->free_iov,
   6283				   &iomsg->msg.msg_iter, true);
   6284		if (ret < 0)
   6285			return ret;
   6286	}
   6287
   6288	return 0;
   6289}
   6290#endif
   6291
   6292static int io_recvmsg_copy_hdr(struct io_kiocb *req,
   6293			       struct io_async_msghdr *iomsg)
   6294{
   6295	iomsg->msg.msg_name = &iomsg->addr;
   6296
   6297#ifdef CONFIG_COMPAT
   6298	if (req->ctx->compat)
   6299		return __io_compat_recvmsg_copy_hdr(req, iomsg);
   6300#endif
   6301
   6302	return __io_recvmsg_copy_hdr(req, iomsg);
   6303}
   6304
   6305static int io_recvmsg_prep_async(struct io_kiocb *req)
   6306{
   6307	int ret;
   6308
   6309	ret = io_recvmsg_copy_hdr(req, req->async_data);
   6310	if (!ret)
   6311		req->flags |= REQ_F_NEED_CLEANUP;
   6312	return ret;
   6313}
   6314
   6315static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   6316{
   6317	struct io_sr_msg *sr = &req->sr_msg;
   6318
   6319	if (unlikely(sqe->file_index || sqe->addr2))
   6320		return -EINVAL;
   6321
   6322	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
   6323	sr->len = READ_ONCE(sqe->len);
   6324	sr->flags = READ_ONCE(sqe->ioprio);
   6325	if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
   6326		return -EINVAL;
   6327	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
   6328	if (sr->msg_flags & MSG_DONTWAIT)
   6329		req->flags |= REQ_F_NOWAIT;
   6330
   6331#ifdef CONFIG_COMPAT
   6332	if (req->ctx->compat)
   6333		sr->msg_flags |= MSG_CMSG_COMPAT;
   6334#endif
   6335	sr->done_io = 0;
   6336	return 0;
   6337}
   6338
   6339static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
   6340{
   6341	struct io_async_msghdr iomsg, *kmsg;
   6342	struct io_sr_msg *sr = &req->sr_msg;
   6343	struct socket *sock;
   6344	unsigned int cflags;
   6345	unsigned flags;
   6346	int ret, min_ret = 0;
   6347	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
   6348
   6349	sock = sock_from_file(req->file);
   6350	if (unlikely(!sock))
   6351		return -ENOTSOCK;
   6352
   6353	if (req_has_async_data(req)) {
   6354		kmsg = req->async_data;
   6355	} else {
   6356		ret = io_recvmsg_copy_hdr(req, &iomsg);
   6357		if (ret)
   6358			return ret;
   6359		kmsg = &iomsg;
   6360	}
   6361
   6362	if (!(req->flags & REQ_F_POLLED) &&
   6363	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
   6364		return io_setup_async_msg(req, kmsg);
   6365
   6366	if (io_do_buffer_select(req)) {
   6367		void __user *buf;
   6368
   6369		buf = io_buffer_select(req, &sr->len, issue_flags);
   6370		if (!buf)
   6371			return -ENOBUFS;
   6372		kmsg->fast_iov[0].iov_base = buf;
   6373		kmsg->fast_iov[0].iov_len = sr->len;
   6374		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1,
   6375				sr->len);
   6376	}
   6377
   6378	flags = sr->msg_flags;
   6379	if (force_nonblock)
   6380		flags |= MSG_DONTWAIT;
   6381	if (flags & MSG_WAITALL)
   6382		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
   6383
   6384	kmsg->msg.msg_get_inq = 1;
   6385	ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags);
   6386	if (ret < min_ret) {
   6387		if (ret == -EAGAIN && force_nonblock)
   6388			return io_setup_async_msg(req, kmsg);
   6389		if (ret == -ERESTARTSYS)
   6390			ret = -EINTR;
   6391		if (ret > 0 && io_net_retry(sock, flags)) {
   6392			sr->done_io += ret;
   6393			req->flags |= REQ_F_PARTIAL_IO;
   6394			return io_setup_async_msg(req, kmsg);
   6395		}
   6396		req_set_fail(req);
   6397	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
   6398		req_set_fail(req);
   6399	}
   6400
   6401	/* fast path, check for non-NULL to avoid function call */
   6402	if (kmsg->free_iov)
   6403		kfree(kmsg->free_iov);
   6404	req->flags &= ~REQ_F_NEED_CLEANUP;
   6405	if (ret >= 0)
   6406		ret += sr->done_io;
   6407	else if (sr->done_io)
   6408		ret = sr->done_io;
   6409	cflags = io_put_kbuf(req, issue_flags);
   6410	if (kmsg->msg.msg_inq)
   6411		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
   6412	__io_req_complete(req, issue_flags, ret, cflags);
   6413	return 0;
   6414}
   6415
   6416static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
   6417{
   6418	struct io_sr_msg *sr = &req->sr_msg;
   6419	struct msghdr msg;
   6420	struct socket *sock;
   6421	struct iovec iov;
   6422	unsigned int cflags;
   6423	unsigned flags;
   6424	int ret, min_ret = 0;
   6425	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
   6426
   6427	if (!(req->flags & REQ_F_POLLED) &&
   6428	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
   6429		return -EAGAIN;
   6430
   6431	sock = sock_from_file(req->file);
   6432	if (unlikely(!sock))
   6433		return -ENOTSOCK;
   6434
   6435	if (io_do_buffer_select(req)) {
   6436		void __user *buf;
   6437
   6438		buf = io_buffer_select(req, &sr->len, issue_flags);
   6439		if (!buf)
   6440			return -ENOBUFS;
   6441		sr->buf = buf;
   6442	}
   6443
   6444	ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter);
   6445	if (unlikely(ret))
   6446		goto out_free;
   6447
   6448	msg.msg_name = NULL;
   6449	msg.msg_namelen = 0;
   6450	msg.msg_control = NULL;
   6451	msg.msg_get_inq = 1;
   6452	msg.msg_flags = 0;
   6453	msg.msg_controllen = 0;
   6454	msg.msg_iocb = NULL;
   6455
   6456	flags = sr->msg_flags;
   6457	if (force_nonblock)
   6458		flags |= MSG_DONTWAIT;
   6459	if (flags & MSG_WAITALL)
   6460		min_ret = iov_iter_count(&msg.msg_iter);
   6461
   6462	ret = sock_recvmsg(sock, &msg, flags);
   6463	if (ret < min_ret) {
   6464		if (ret == -EAGAIN && force_nonblock)
   6465			return -EAGAIN;
   6466		if (ret == -ERESTARTSYS)
   6467			ret = -EINTR;
   6468		if (ret > 0 && io_net_retry(sock, flags)) {
   6469			sr->len -= ret;
   6470			sr->buf += ret;
   6471			sr->done_io += ret;
   6472			req->flags |= REQ_F_PARTIAL_IO;
   6473			return -EAGAIN;
   6474		}
   6475		req_set_fail(req);
   6476	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
   6477out_free:
   6478		req_set_fail(req);
   6479	}
   6480
   6481	if (ret >= 0)
   6482		ret += sr->done_io;
   6483	else if (sr->done_io)
   6484		ret = sr->done_io;
   6485	cflags = io_put_kbuf(req, issue_flags);
   6486	if (msg.msg_inq)
   6487		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
   6488	__io_req_complete(req, issue_flags, ret, cflags);
   6489	return 0;
   6490}
   6491
   6492static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   6493{
   6494	struct io_accept *accept = &req->accept;
   6495	unsigned flags;
   6496
   6497	if (sqe->len || sqe->buf_index)
   6498		return -EINVAL;
   6499
   6500	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
   6501	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
   6502	accept->flags = READ_ONCE(sqe->accept_flags);
   6503	accept->nofile = rlimit(RLIMIT_NOFILE);
   6504	flags = READ_ONCE(sqe->ioprio);
   6505	if (flags & ~IORING_ACCEPT_MULTISHOT)
   6506		return -EINVAL;
   6507
   6508	accept->file_slot = READ_ONCE(sqe->file_index);
   6509	if (accept->file_slot) {
   6510		if (accept->flags & SOCK_CLOEXEC)
   6511			return -EINVAL;
   6512		if (flags & IORING_ACCEPT_MULTISHOT &&
   6513		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
   6514			return -EINVAL;
   6515	}
   6516	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
   6517		return -EINVAL;
   6518	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
   6519		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
   6520	if (flags & IORING_ACCEPT_MULTISHOT)
   6521		req->flags |= REQ_F_APOLL_MULTISHOT;
   6522	return 0;
   6523}
   6524
   6525static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
   6526{
   6527	struct io_ring_ctx *ctx = req->ctx;
   6528	struct io_accept *accept = &req->accept;
   6529	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
   6530	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
   6531	bool fixed = !!accept->file_slot;
   6532	struct file *file;
   6533	int ret, fd;
   6534
   6535retry:
   6536	if (!fixed) {
   6537		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
   6538		if (unlikely(fd < 0))
   6539			return fd;
   6540	}
   6541	file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
   6542			 accept->flags);
   6543	if (IS_ERR(file)) {
   6544		if (!fixed)
   6545			put_unused_fd(fd);
   6546		ret = PTR_ERR(file);
   6547		if (ret == -EAGAIN && force_nonblock) {
   6548			/*
   6549			 * if it's multishot and polled, we don't need to
   6550			 * return EAGAIN to arm the poll infra since it
   6551			 * has already been done
   6552			 */
   6553			if ((req->flags & IO_APOLL_MULTI_POLLED) ==
   6554			    IO_APOLL_MULTI_POLLED)
   6555				ret = 0;
   6556			return ret;
   6557		}
   6558		if (ret == -ERESTARTSYS)
   6559			ret = -EINTR;
   6560		req_set_fail(req);
   6561	} else if (!fixed) {
   6562		fd_install(fd, file);
   6563		ret = fd;
   6564	} else {
   6565		ret = io_fixed_fd_install(req, issue_flags, file,
   6566						accept->file_slot);
   6567	}
   6568
   6569	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
   6570		__io_req_complete(req, issue_flags, ret, 0);
   6571		return 0;
   6572	}
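        	/*
        	 * Multishot accept: post a CQE flagged IORING_CQE_F_MORE for this
        	 * connection and loop back for the next one; if posting the CQE
        	 * fails, end the multishot with -ECANCELED.
        	 */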
   6573	if (ret >= 0) {
   6574		bool filled;
   6575
   6576		spin_lock(&ctx->completion_lock);
   6577		filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret,
   6578					 IORING_CQE_F_MORE);
   6579		io_commit_cqring(ctx);
   6580		spin_unlock(&ctx->completion_lock);
   6581		if (filled) {
   6582			io_cqring_ev_posted(ctx);
   6583			goto retry;
   6584		}
   6585		ret = -ECANCELED;
   6586	}
   6587
   6588	return ret;
   6589}
   6590
   6591static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   6592{
   6593	struct io_socket *sock = &req->sock;
   6594
   6595	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
   6596		return -EINVAL;
   6597
   6598	sock->domain = READ_ONCE(sqe->fd);
   6599	sock->type = READ_ONCE(sqe->off);
   6600	sock->protocol = READ_ONCE(sqe->len);
   6601	sock->file_slot = READ_ONCE(sqe->file_index);
   6602	sock->nofile = rlimit(RLIMIT_NOFILE);
   6603
   6604	sock->flags = sock->type & ~SOCK_TYPE_MASK;
   6605	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
   6606		return -EINVAL;
   6607	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
   6608		return -EINVAL;
   6609	return 0;
   6610}
   6611
   6612static int io_socket(struct io_kiocb *req, unsigned int issue_flags)
   6613{
   6614	struct io_socket *sock = &req->sock;
   6615	bool fixed = !!sock->file_slot;
   6616	struct file *file;
   6617	int ret, fd;
   6618
   6619	if (!fixed) {
   6620		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
   6621		if (unlikely(fd < 0))
   6622			return fd;
   6623	}
   6624	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
   6625	if (IS_ERR(file)) {
   6626		if (!fixed)
   6627			put_unused_fd(fd);
   6628		ret = PTR_ERR(file);
   6629		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
   6630			return -EAGAIN;
   6631		if (ret == -ERESTARTSYS)
   6632			ret = -EINTR;
   6633		req_set_fail(req);
   6634	} else if (!fixed) {
   6635		fd_install(fd, file);
   6636		ret = fd;
   6637	} else {
   6638		ret = io_fixed_fd_install(req, issue_flags, file,
   6639					    sock->file_slot);
   6640	}
   6641	__io_req_complete(req, issue_flags, ret, 0);
   6642	return 0;
   6643}
   6644
   6645static int io_connect_prep_async(struct io_kiocb *req)
   6646{
   6647	struct io_async_connect *io = req->async_data;
   6648	struct io_connect *conn = &req->connect;
   6649
   6650	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
   6651}
   6652
   6653static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   6654{
   6655	struct io_connect *conn = &req->connect;
   6656
   6657	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
   6658		return -EINVAL;
   6659
   6660	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
    6661	conn->addr_len = READ_ONCE(sqe->addr2);
   6662	return 0;
   6663}
   6664
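        /*
         * A nonblocking connect attempt that returns -EAGAIN/-EINPROGRESS is
         * retried later, so the copied-in address is preserved in
         * req->async_data across the retry.
         */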
   6665static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
   6666{
   6667	struct io_async_connect __io, *io;
   6668	unsigned file_flags;
   6669	int ret;
   6670	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
   6671
   6672	if (req_has_async_data(req)) {
   6673		io = req->async_data;
   6674	} else {
   6675		ret = move_addr_to_kernel(req->connect.addr,
   6676						req->connect.addr_len,
   6677						&__io.address);
   6678		if (ret)
   6679			goto out;
   6680		io = &__io;
   6681	}
   6682
   6683	file_flags = force_nonblock ? O_NONBLOCK : 0;
   6684
   6685	ret = __sys_connect_file(req->file, &io->address,
   6686					req->connect.addr_len, file_flags);
   6687	if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
   6688		if (req_has_async_data(req))
   6689			return -EAGAIN;
   6690		if (io_alloc_async_data(req)) {
   6691			ret = -ENOMEM;
   6692			goto out;
   6693		}
   6694		memcpy(req->async_data, &__io, sizeof(__io));
   6695		return -EAGAIN;
   6696	}
   6697	if (ret == -ERESTARTSYS)
   6698		ret = -EINTR;
   6699out:
   6700	if (ret < 0)
   6701		req_set_fail(req);
   6702	__io_req_complete(req, issue_flags, ret, 0);
   6703	return 0;
   6704}
   6705#else /* !CONFIG_NET */
   6706#define IO_NETOP_FN(op)							\
   6707static int io_##op(struct io_kiocb *req, unsigned int issue_flags)	\
   6708{									\
   6709	return -EOPNOTSUPP;						\
   6710}
   6711
   6712#define IO_NETOP_PREP(op)						\
   6713IO_NETOP_FN(op)								\
   6714static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
   6715{									\
   6716	return -EOPNOTSUPP;						\
   6717}									\
   6718
   6719#define IO_NETOP_PREP_ASYNC(op)						\
   6720IO_NETOP_PREP(op)							\
   6721static int io_##op##_prep_async(struct io_kiocb *req)			\
   6722{									\
   6723	return -EOPNOTSUPP;						\
   6724}
   6725
   6726IO_NETOP_PREP_ASYNC(sendmsg);
   6727IO_NETOP_PREP_ASYNC(recvmsg);
   6728IO_NETOP_PREP_ASYNC(connect);
   6729IO_NETOP_PREP(accept);
   6730IO_NETOP_PREP(socket);
   6731IO_NETOP_PREP(shutdown);
   6732IO_NETOP_FN(send);
   6733IO_NETOP_FN(recv);
   6734#endif /* CONFIG_NET */
   6735
   6736struct io_poll_table {
   6737	struct poll_table_struct pt;
   6738	struct io_kiocb *req;
   6739	int nr_entries;
   6740	int error;
   6741};
   6742
   6743#define IO_POLL_CANCEL_FLAG	BIT(31)
   6744#define IO_POLL_REF_MASK	GENMASK(30, 0)
   6745
   6746/*
    6747 * If the refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, the request is
    6748 * free and we can bump it to acquire ownership. Modifying a request without
    6749 * owning it is disallowed, which prevents races when enqueueing task_work and
    6750 * between arming poll and the wakeup path.
   6751 */
   6752static inline bool io_poll_get_ownership(struct io_kiocb *req)
   6753{
   6754	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
   6755}
   6756
   6757static void io_poll_mark_cancelled(struct io_kiocb *req)
   6758{
   6759	atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
   6760}
   6761
   6762static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
   6763{
   6764	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
   6765	if (req->opcode == IORING_OP_POLL_ADD)
   6766		return req->async_data;
   6767	return req->apoll->double_poll;
   6768}
   6769
   6770static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
   6771{
   6772	if (req->opcode == IORING_OP_POLL_ADD)
   6773		return &req->poll;
   6774	return &req->apoll->poll;
   6775}
   6776
   6777static void io_poll_req_insert(struct io_kiocb *req)
   6778{
   6779	struct io_ring_ctx *ctx = req->ctx;
   6780	struct hlist_head *list;
   6781
   6782	list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)];
   6783	hlist_add_head(&req->hash_node, list);
   6784}
   6785
   6786static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
   6787			      wait_queue_func_t wake_func)
   6788{
   6789	poll->head = NULL;
   6790#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
   6791	/* mask in events that we always want/need */
   6792	poll->events = events | IO_POLL_UNMASK;
   6793	INIT_LIST_HEAD(&poll->wait.entry);
   6794	init_waitqueue_func_entry(&poll->wait, wake_func);
   6795}
   6796
   6797static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
   6798{
   6799	struct wait_queue_head *head = smp_load_acquire(&poll->head);
   6800
   6801	if (head) {
   6802		spin_lock_irq(&head->lock);
   6803		list_del_init(&poll->wait.entry);
   6804		poll->head = NULL;
   6805		spin_unlock_irq(&head->lock);
   6806	}
   6807}
   6808
   6809static void io_poll_remove_entries(struct io_kiocb *req)
   6810{
   6811	/*
   6812	 * Nothing to do if neither of those flags are set. Avoid dipping
   6813	 * into the poll/apoll/double cachelines if we can.
   6814	 */
   6815	if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
   6816		return;
   6817
   6818	/*
   6819	 * While we hold the waitqueue lock and the waitqueue is nonempty,
   6820	 * wake_up_pollfree() will wait for us.  However, taking the waitqueue
   6821	 * lock in the first place can race with the waitqueue being freed.
   6822	 *
   6823	 * We solve this as eventpoll does: by taking advantage of the fact that
   6824	 * all users of wake_up_pollfree() will RCU-delay the actual free.  If
   6825	 * we enter rcu_read_lock() and see that the pointer to the queue is
   6826	 * non-NULL, we can then lock it without the memory being freed out from
   6827	 * under us.
   6828	 *
   6829	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
   6830	 * case the caller deletes the entry from the queue, leaving it empty.
   6831	 * In that case, only RCU prevents the queue memory from being freed.
   6832	 */
   6833	rcu_read_lock();
   6834	if (req->flags & REQ_F_SINGLE_POLL)
   6835		io_poll_remove_entry(io_poll_get_single(req));
   6836	if (req->flags & REQ_F_DOUBLE_POLL)
   6837		io_poll_remove_entry(io_poll_get_double(req));
   6838	rcu_read_unlock();
   6839}
   6840
   6841static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags);
   6842/*
   6843 * All poll tw should go through this. Checks for poll events, manages
   6844 * references, does rewait, etc.
   6845 *
    6846 * Returns a negative error on failure. >0 when no action is required, which
    6847 * means either a spurious wakeup or a multishot CQE was posted. 0 when it's
    6848 * done with the request, in which case the mask is stored in req->cqe.res.
   6849 */
   6850static int io_poll_check_events(struct io_kiocb *req, bool *locked)
   6851{
   6852	struct io_ring_ctx *ctx = req->ctx;
   6853	int v, ret;
   6854
   6855	/* req->task == current here, checking PF_EXITING is safe */
   6856	if (unlikely(req->task->flags & PF_EXITING))
   6857		return -ECANCELED;
   6858
   6859	do {
   6860		v = atomic_read(&req->poll_refs);
   6861
   6862		/* tw handler should be the owner, and so have some references */
   6863		if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
   6864			return 0;
   6865		if (v & IO_POLL_CANCEL_FLAG)
   6866			return -ECANCELED;
   6867
   6868		if (!req->cqe.res) {
   6869			struct poll_table_struct pt = { ._key = req->apoll_events };
   6870			req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
   6871		}
   6872
    6873		if (unlikely(!req->cqe.res))
   6874			continue;
   6875		if (req->apoll_events & EPOLLONESHOT)
   6876			return 0;
   6877
   6878		/* multishot, just fill a CQE and proceed */
   6879		if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
   6880			__poll_t mask = mangle_poll(req->cqe.res &
   6881						    req->apoll_events);
   6882			bool filled;
   6883
   6884			spin_lock(&ctx->completion_lock);
   6885			filled = io_fill_cqe_aux(ctx, req->cqe.user_data,
   6886						 mask, IORING_CQE_F_MORE);
   6887			io_commit_cqring(ctx);
   6888			spin_unlock(&ctx->completion_lock);
   6889			if (filled) {
   6890				io_cqring_ev_posted(ctx);
   6891				continue;
   6892			}
   6893			return -ECANCELED;
   6894		}
   6895
   6896		io_tw_lock(req->ctx, locked);
   6897		if (unlikely(req->task->flags & PF_EXITING))
   6898			return -EFAULT;
   6899		ret = io_issue_sqe(req,
   6900				   IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
   6901		if (ret)
   6902			return ret;
   6903
   6904		/*
   6905		 * Release all references, retry if someone tried to restart
   6906		 * task_work while we were executing it.
   6907		 */
   6908	} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
   6909
   6910	return 1;
   6911}
   6912
   6913static void io_poll_task_func(struct io_kiocb *req, bool *locked)
   6914{
   6915	struct io_ring_ctx *ctx = req->ctx;
   6916	int ret;
   6917
   6918	ret = io_poll_check_events(req, locked);
   6919	if (ret > 0)
   6920		return;
   6921
   6922	if (!ret) {
   6923		req->cqe.res = mangle_poll(req->cqe.res & req->poll.events);
   6924	} else {
   6925		req->cqe.res = ret;
   6926		req_set_fail(req);
   6927	}
   6928
   6929	io_poll_remove_entries(req);
   6930	spin_lock(&ctx->completion_lock);
   6931	hash_del(&req->hash_node);
   6932	__io_req_complete_post(req, req->cqe.res, 0);
   6933	io_commit_cqring(ctx);
   6934	spin_unlock(&ctx->completion_lock);
   6935	io_cqring_ev_posted(ctx);
   6936}
   6937
   6938static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
   6939{
   6940	struct io_ring_ctx *ctx = req->ctx;
   6941	int ret;
   6942
   6943	ret = io_poll_check_events(req, locked);
   6944	if (ret > 0)
   6945		return;
   6946
   6947	io_poll_remove_entries(req);
   6948	spin_lock(&ctx->completion_lock);
   6949	hash_del(&req->hash_node);
   6950	spin_unlock(&ctx->completion_lock);
   6951
   6952	if (!ret)
   6953		io_req_task_submit(req, locked);
   6954	else
   6955		io_req_complete_failed(req, ret);
   6956}
   6957
   6958static void __io_poll_execute(struct io_kiocb *req, int mask,
   6959			      __poll_t __maybe_unused events)
   6960{
   6961	req->cqe.res = mask;
   6962	/*
   6963	 * This is useful for poll that is armed on behalf of another
   6964	 * request, and where the wakeup path could be on a different
   6965	 * CPU. We want to avoid pulling in req->apoll->events for that
   6966	 * case.
   6967	 */
   6968	if (req->opcode == IORING_OP_POLL_ADD)
   6969		req->io_task_work.func = io_poll_task_func;
   6970	else
   6971		req->io_task_work.func = io_apoll_task_func;
   6972
   6973	trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask);
   6974	io_req_task_work_add(req);
   6975}
   6976
   6977static inline void io_poll_execute(struct io_kiocb *req, int res,
   6978		__poll_t events)
   6979{
   6980	if (io_poll_get_ownership(req))
   6981		__io_poll_execute(req, res, events);
   6982}
   6983
   6984static void io_poll_cancel_req(struct io_kiocb *req)
   6985{
   6986	io_poll_mark_cancelled(req);
   6987	/* kick tw, which should complete the request */
   6988	io_poll_execute(req, 0, 0);
   6989}
   6990
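        /*
         * The wait entry's ->private field holds the owning req, with bit 0
         * used to tag double poll entries (set in __io_queue_proc()).
         */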
   6991#define wqe_to_req(wait)	((void *)((unsigned long) (wait)->private & ~1))
   6992#define wqe_is_double(wait)	((unsigned long) (wait)->private & 1)
   6993#define IO_ASYNC_POLL_COMMON	(EPOLLONESHOT | EPOLLPRI)
   6994
   6995static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
   6996			void *key)
   6997{
   6998	struct io_kiocb *req = wqe_to_req(wait);
   6999	struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
   7000						 wait);
   7001	__poll_t mask = key_to_poll(key);
   7002
   7003	if (unlikely(mask & POLLFREE)) {
   7004		io_poll_mark_cancelled(req);
   7005		/* we have to kick tw in case it's not already */
   7006		io_poll_execute(req, 0, poll->events);
   7007
   7008		/*
    7009		 * If the waitqueue is being freed early but someone already
    7010		 * holds ownership over it, we have to tear down the request as
   7011		 * best we can. That means immediately removing the request from
   7012		 * its waitqueue and preventing all further accesses to the
   7013		 * waitqueue via the request.
   7014		 */
   7015		list_del_init(&poll->wait.entry);
   7016
   7017		/*
   7018		 * Careful: this *must* be the last step, since as soon
   7019		 * as req->head is NULL'ed out, the request can be
   7020		 * completed and freed, since aio_poll_complete_work()
   7021		 * will no longer need to take the waitqueue lock.
   7022		 */
   7023		smp_store_release(&poll->head, NULL);
   7024		return 1;
   7025	}
   7026
   7027	/* for instances that support it check for an event match first */
   7028	if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON)))
   7029		return 0;
   7030
   7031	if (io_poll_get_ownership(req)) {
   7032		/* optional, saves extra locking for removal in tw handler */
   7033		if (mask && poll->events & EPOLLONESHOT) {
   7034			list_del_init(&poll->wait.entry);
   7035			poll->head = NULL;
   7036			if (wqe_is_double(wait))
   7037				req->flags &= ~REQ_F_DOUBLE_POLL;
   7038			else
   7039				req->flags &= ~REQ_F_SINGLE_POLL;
   7040		}
   7041		__io_poll_execute(req, mask, poll->events);
   7042	}
   7043	return 1;
   7044}
   7045
   7046static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
   7047			    struct wait_queue_head *head,
   7048			    struct io_poll_iocb **poll_ptr)
   7049{
   7050	struct io_kiocb *req = pt->req;
   7051	unsigned long wqe_private = (unsigned long) req;
   7052
   7053	/*
   7054	 * The file being polled uses multiple waitqueues for poll handling
    7055	 * (e.g. one for read, one for write). Set up a separate io_poll_iocb
    7056	 * if this happens.
   7057	 */
   7058	if (unlikely(pt->nr_entries)) {
   7059		struct io_poll_iocb *first = poll;
   7060
   7061		/* double add on the same waitqueue head, ignore */
   7062		if (first->head == head)
   7063			return;
   7064		/* already have a 2nd entry, fail a third attempt */
   7065		if (*poll_ptr) {
   7066			if ((*poll_ptr)->head == head)
   7067				return;
   7068			pt->error = -EINVAL;
   7069			return;
   7070		}
   7071
   7072		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
   7073		if (!poll) {
   7074			pt->error = -ENOMEM;
   7075			return;
   7076		}
   7077		/* mark as double wq entry */
   7078		wqe_private |= 1;
   7079		req->flags |= REQ_F_DOUBLE_POLL;
   7080		io_init_poll_iocb(poll, first->events, first->wait.func);
   7081		*poll_ptr = poll;
   7082		if (req->opcode == IORING_OP_POLL_ADD)
   7083			req->flags |= REQ_F_ASYNC_DATA;
   7084	}
   7085
   7086	req->flags |= REQ_F_SINGLE_POLL;
   7087	pt->nr_entries++;
   7088	poll->head = head;
   7089	poll->wait.private = (void *) wqe_private;
   7090
   7091	if (poll->events & EPOLLEXCLUSIVE)
   7092		add_wait_queue_exclusive(head, &poll->wait);
   7093	else
   7094		add_wait_queue(head, &poll->wait);
   7095}
   7096
   7097static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
   7098			       struct poll_table_struct *p)
   7099{
   7100	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
   7101
   7102	__io_queue_proc(&pt->req->poll, pt, head,
   7103			(struct io_poll_iocb **) &pt->req->async_data);
   7104}
   7105
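        /*
         * Core poll arming helper. An initial poll_refs reference is held
         * so task_work triggered by a wakeup can't complete the request
         * while it's still being set up, then vfs_poll() is called. A
         * oneshot request with an event already pending is torn down and
         * the mask returned to the caller; otherwise the request is hashed
         * for cancellation and any pending event or raced wakeup is punted
         * to task_work.
         */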
   7106static int __io_arm_poll_handler(struct io_kiocb *req,
   7107				 struct io_poll_iocb *poll,
   7108				 struct io_poll_table *ipt, __poll_t mask)
   7109{
   7110	struct io_ring_ctx *ctx = req->ctx;
   7111	int v;
   7112
   7113	INIT_HLIST_NODE(&req->hash_node);
   7114	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
   7115	io_init_poll_iocb(poll, mask, io_poll_wake);
   7116	poll->file = req->file;
   7117
   7118	req->apoll_events = poll->events;
   7119
   7120	ipt->pt._key = mask;
   7121	ipt->req = req;
   7122	ipt->error = 0;
   7123	ipt->nr_entries = 0;
   7124
   7125	/*
    7126	 * Take ownership to delay any tw execution until we're done with
    7127	 * poll arming; see io_poll_get_ownership().
   7128	 */
   7129	atomic_set(&req->poll_refs, 1);
   7130	mask = vfs_poll(req->file, &ipt->pt) & poll->events;
   7131
   7132	if (mask && (poll->events & EPOLLONESHOT)) {
   7133		io_poll_remove_entries(req);
   7134		/* no one else has access to the req, forget about the ref */
   7135		return mask;
   7136	}
   7137	if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
   7138		io_poll_remove_entries(req);
   7139		if (!ipt->error)
   7140			ipt->error = -EINVAL;
   7141		return 0;
   7142	}
   7143
   7144	spin_lock(&ctx->completion_lock);
   7145	io_poll_req_insert(req);
   7146	spin_unlock(&ctx->completion_lock);
   7147
   7148	if (mask) {
   7149		/* can't multishot if failed, just queue the event we've got */
   7150		if (unlikely(ipt->error || !ipt->nr_entries)) {
   7151			poll->events |= EPOLLONESHOT;
   7152			req->apoll_events |= EPOLLONESHOT;
   7153			ipt->error = 0;
   7154		}
   7155		__io_poll_execute(req, mask, poll->events);
   7156		return 0;
   7157	}
   7158
   7159	/*
   7160	 * Release ownership. If someone tried to queue a tw while it was
   7161	 * locked, kick it off for them.
   7162	 */
   7163	v = atomic_dec_return(&req->poll_refs);
   7164	if (unlikely(v & IO_POLL_REF_MASK))
   7165		__io_poll_execute(req, 0, poll->events);
   7166	return 0;
   7167}
   7168
   7169static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
   7170			       struct poll_table_struct *p)
   7171{
   7172	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
   7173	struct async_poll *apoll = pt->req->apoll;
   7174
   7175	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
   7176}
   7177
   7178enum {
   7179	IO_APOLL_OK,
   7180	IO_APOLL_ABORTED,
   7181	IO_APOLL_READY
   7182};
   7183
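        /*
         * Arm async poll for a request that would otherwise block: build
         * the poll mask from the opcode's pollin/pollout definition, reuse
         * or allocate an async_poll container, and arm it through
         * __io_arm_poll_handler() so the request can be retried when the
         * file becomes ready.
         */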
   7184static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
   7185{
   7186	const struct io_op_def *def = &io_op_defs[req->opcode];
   7187	struct io_ring_ctx *ctx = req->ctx;
   7188	struct async_poll *apoll;
   7189	struct io_poll_table ipt;
   7190	__poll_t mask = POLLPRI | POLLERR;
   7191	int ret;
   7192
   7193	if (!def->pollin && !def->pollout)
   7194		return IO_APOLL_ABORTED;
   7195	if (!file_can_poll(req->file))
   7196		return IO_APOLL_ABORTED;
   7197	if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
   7198		return IO_APOLL_ABORTED;
   7199	if (!(req->flags & REQ_F_APOLL_MULTISHOT))
   7200		mask |= EPOLLONESHOT;
   7201
   7202	if (def->pollin) {
   7203		mask |= EPOLLIN | EPOLLRDNORM;
   7204
   7205		/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
   7206		if ((req->opcode == IORING_OP_RECVMSG) &&
   7207		    (req->sr_msg.msg_flags & MSG_ERRQUEUE))
   7208			mask &= ~EPOLLIN;
   7209	} else {
   7210		mask |= EPOLLOUT | EPOLLWRNORM;
   7211	}
   7212	if (def->poll_exclusive)
   7213		mask |= EPOLLEXCLUSIVE;
   7214	if (req->flags & REQ_F_POLLED) {
   7215		apoll = req->apoll;
   7216		kfree(apoll->double_poll);
   7217	} else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
   7218		   !list_empty(&ctx->apoll_cache)) {
   7219		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
   7220						poll.wait.entry);
   7221		list_del_init(&apoll->poll.wait.entry);
   7222	} else {
   7223		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
   7224		if (unlikely(!apoll))
   7225			return IO_APOLL_ABORTED;
   7226	}
   7227	apoll->double_poll = NULL;
   7228	req->apoll = apoll;
   7229	req->flags |= REQ_F_POLLED;
   7230	ipt.pt._qproc = io_async_queue_proc;
   7231
   7232	io_kbuf_recycle(req, issue_flags);
   7233
   7234	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
   7235	if (ret || ipt.error)
   7236		return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
   7237
   7238	trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode,
   7239				mask, apoll->poll.events);
   7240	return IO_APOLL_OK;
   7241}
   7242
   7243/*
   7244 * Returns true if we found and killed one or more poll requests
   7245 */
   7246static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
   7247				      struct task_struct *tsk, bool cancel_all)
   7248{
   7249	struct hlist_node *tmp;
   7250	struct io_kiocb *req;
   7251	bool found = false;
   7252	int i;
   7253
   7254	spin_lock(&ctx->completion_lock);
   7255	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
   7256		struct hlist_head *list;
   7257
   7258		list = &ctx->cancel_hash[i];
   7259		hlist_for_each_entry_safe(req, tmp, list, hash_node) {
   7260			if (io_match_task_safe(req, tsk, cancel_all)) {
   7261				hlist_del_init(&req->hash_node);
   7262				io_poll_cancel_req(req);
   7263				found = true;
   7264			}
   7265		}
   7266	}
   7267	spin_unlock(&ctx->completion_lock);
   7268	return found;
   7269}
   7270
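        /*
         * Look up a pending poll request by user_data in the cancel hash.
         * With IORING_ASYNC_CANCEL_ALL the cancel_seq check prevents the
         * same request from being matched twice in one cancellation pass.
         */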
   7271static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
   7272				     struct io_cancel_data *cd)
   7273	__must_hold(&ctx->completion_lock)
   7274{
   7275	struct hlist_head *list;
   7276	struct io_kiocb *req;
   7277
   7278	list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)];
   7279	hlist_for_each_entry(req, list, hash_node) {
   7280		if (cd->data != req->cqe.user_data)
   7281			continue;
   7282		if (poll_only && req->opcode != IORING_OP_POLL_ADD)
   7283			continue;
   7284		if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
   7285			if (cd->seq == req->work.cancel_seq)
   7286				continue;
   7287			req->work.cancel_seq = cd->seq;
   7288		}
   7289		return req;
   7290	}
   7291	return NULL;
   7292}
   7293
   7294static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
   7295					  struct io_cancel_data *cd)
   7296	__must_hold(&ctx->completion_lock)
   7297{
   7298	struct io_kiocb *req;
   7299	int i;
   7300
   7301	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
   7302		struct hlist_head *list;
   7303
   7304		list = &ctx->cancel_hash[i];
   7305		hlist_for_each_entry(req, list, hash_node) {
   7306			if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
   7307			    req->file != cd->file)
   7308				continue;
   7309			if (cd->seq == req->work.cancel_seq)
   7310				continue;
   7311			req->work.cancel_seq = cd->seq;
   7312			return req;
   7313		}
   7314	}
   7315	return NULL;
   7316}
   7317
   7318static bool io_poll_disarm(struct io_kiocb *req)
   7319	__must_hold(&ctx->completion_lock)
   7320{
   7321	if (!io_poll_get_ownership(req))
   7322		return false;
   7323	io_poll_remove_entries(req);
   7324	hash_del(&req->hash_node);
   7325	return true;
   7326}
   7327
   7328static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
   7329	__must_hold(&ctx->completion_lock)
   7330{
   7331	struct io_kiocb *req;
   7332
   7333	if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY))
   7334		req = io_poll_file_find(ctx, cd);
   7335	else
   7336		req = io_poll_find(ctx, false, cd);
   7337	if (!req)
   7338		return -ENOENT;
   7339	io_poll_cancel_req(req);
   7340	return 0;
   7341}
   7342
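        /*
         * Translate the poll mask from the SQE. On big-endian the halfword
         * swap keeps the layout compatible with the original 16-bit
         * poll_events field; demangle_poll() converts userspace epoll bits
         * to kernel poll bits, and the EPOLLEXCLUSIVE/EPOLLONESHOT
         * behaviour flags are carried over as-is.
         */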
   7343static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
   7344				     unsigned int flags)
   7345{
   7346	u32 events;
   7347
   7348	events = READ_ONCE(sqe->poll32_events);
   7349#ifdef __BIG_ENDIAN
   7350	events = swahw32(events);
   7351#endif
   7352	if (!(flags & IORING_POLL_ADD_MULTI))
   7353		events |= EPOLLONESHOT;
   7354	return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
   7355}
   7356
   7357static int io_poll_remove_prep(struct io_kiocb *req,
   7358			       const struct io_uring_sqe *sqe)
   7359{
   7360	struct io_poll_update *upd = &req->poll_update;
   7361	u32 flags;
   7362
   7363	if (sqe->buf_index || sqe->splice_fd_in)
   7364		return -EINVAL;
   7365	flags = READ_ONCE(sqe->len);
   7366	if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
   7367		      IORING_POLL_ADD_MULTI))
   7368		return -EINVAL;
   7369	/* meaningless without update */
   7370	if (flags == IORING_POLL_ADD_MULTI)
   7371		return -EINVAL;
   7372
   7373	upd->old_user_data = READ_ONCE(sqe->addr);
   7374	upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
   7375	upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
   7376
   7377	upd->new_user_data = READ_ONCE(sqe->off);
   7378	if (!upd->update_user_data && upd->new_user_data)
   7379		return -EINVAL;
   7380	if (upd->update_events)
   7381		upd->events = io_poll_parse_events(sqe, flags);
   7382	else if (sqe->poll32_events)
   7383		return -EINVAL;
   7384
   7385	return 0;
   7386}
   7387
   7388static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   7389{
   7390	struct io_poll_iocb *poll = &req->poll;
   7391	u32 flags;
   7392
   7393	if (sqe->buf_index || sqe->off || sqe->addr)
   7394		return -EINVAL;
   7395	flags = READ_ONCE(sqe->len);
   7396	if (flags & ~IORING_POLL_ADD_MULTI)
   7397		return -EINVAL;
   7398	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
   7399		return -EINVAL;
   7400
   7401	io_req_set_refcount(req);
   7402	poll->events = io_poll_parse_events(sqe, flags);
   7403	return 0;
   7404}
   7405
   7406static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
   7407{
   7408	struct io_poll_iocb *poll = &req->poll;
   7409	struct io_poll_table ipt;
   7410	int ret;
   7411
   7412	ipt.pt._qproc = io_poll_queue_proc;
   7413
   7414	ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
   7415	if (!ret && ipt.error)
   7416		req_set_fail(req);
   7417	ret = ret ?: ipt.error;
   7418	if (ret)
   7419		__io_req_complete(req, issue_flags, ret, 0);
   7420	return 0;
   7421}
   7422
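        /*
         * IORING_OP_POLL_REMOVE: find the original poll request by its
         * user_data and disarm it, then either cancel it outright or re-arm
         * it with updated events and/or user_data.
         */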
   7423static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
   7424{
   7425	struct io_cancel_data cd = { .data = req->poll_update.old_user_data, };
   7426	struct io_ring_ctx *ctx = req->ctx;
   7427	struct io_kiocb *preq;
   7428	int ret2, ret = 0;
   7429	bool locked;
   7430
   7431	spin_lock(&ctx->completion_lock);
   7432	preq = io_poll_find(ctx, true, &cd);
   7433	if (!preq || !io_poll_disarm(preq)) {
   7434		spin_unlock(&ctx->completion_lock);
   7435		ret = preq ? -EALREADY : -ENOENT;
   7436		goto out;
   7437	}
   7438	spin_unlock(&ctx->completion_lock);
   7439
   7440	if (req->poll_update.update_events || req->poll_update.update_user_data) {
    7441		/* only update the event flags, keep the behavior flags */
   7442		if (req->poll_update.update_events) {
   7443			preq->poll.events &= ~0xffff;
   7444			preq->poll.events |= req->poll_update.events & 0xffff;
   7445			preq->poll.events |= IO_POLL_UNMASK;
   7446		}
   7447		if (req->poll_update.update_user_data)
   7448			preq->cqe.user_data = req->poll_update.new_user_data;
   7449
   7450		ret2 = io_poll_add(preq, issue_flags);
   7451		/* successfully updated, don't complete poll request */
   7452		if (!ret2)
   7453			goto out;
   7454	}
   7455
   7456	req_set_fail(preq);
   7457	preq->cqe.res = -ECANCELED;
   7458	locked = !(issue_flags & IO_URING_F_UNLOCKED);
   7459	io_req_task_complete(preq, &locked);
   7460out:
   7461	if (ret < 0)
   7462		req_set_fail(req);
   7463	/* complete update request, we're done with it */
   7464	__io_req_complete(req, issue_flags, ret, 0);
   7465	return 0;
   7466}
   7467
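        /*
         * hrtimer callback for a regular timeout. This runs in hard
         * interrupt context, so it only unlinks the request and accounts
         * the timeout; completion with -ETIME is punted to task_work.
         */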
   7468static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
   7469{
   7470	struct io_timeout_data *data = container_of(timer,
   7471						struct io_timeout_data, timer);
   7472	struct io_kiocb *req = data->req;
   7473	struct io_ring_ctx *ctx = req->ctx;
   7474	unsigned long flags;
   7475
   7476	spin_lock_irqsave(&ctx->timeout_lock, flags);
   7477	list_del_init(&req->timeout.list);
   7478	atomic_set(&req->ctx->cq_timeouts,
   7479		atomic_read(&req->ctx->cq_timeouts) + 1);
   7480	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
   7481
   7482	if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
   7483		req_set_fail(req);
   7484
   7485	req->cqe.res = -ETIME;
   7486	req->io_task_work.func = io_req_task_complete;
   7487	io_req_task_work_add(req);
   7488	return HRTIMER_NORESTART;
   7489}
   7490
   7491static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
   7492					   struct io_cancel_data *cd)
   7493	__must_hold(&ctx->timeout_lock)
   7494{
   7495	struct io_timeout_data *io;
   7496	struct io_kiocb *req;
   7497	bool found = false;
   7498
   7499	list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
   7500		if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
   7501		    cd->data != req->cqe.user_data)
   7502			continue;
   7503		if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
   7504			if (cd->seq == req->work.cancel_seq)
   7505				continue;
   7506			req->work.cancel_seq = cd->seq;
   7507		}
   7508		found = true;
   7509		break;
   7510	}
   7511	if (!found)
   7512		return ERR_PTR(-ENOENT);
   7513
   7514	io = req->async_data;
   7515	if (hrtimer_try_to_cancel(&io->timer) == -1)
   7516		return ERR_PTR(-EALREADY);
   7517	list_del_init(&req->timeout.list);
   7518	return req;
   7519}
   7520
   7521static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
   7522	__must_hold(&ctx->completion_lock)
   7523{
   7524	struct io_kiocb *req;
   7525
   7526	spin_lock_irq(&ctx->timeout_lock);
   7527	req = io_timeout_extract(ctx, cd);
   7528	spin_unlock_irq(&ctx->timeout_lock);
   7529
   7530	if (IS_ERR(req))
   7531		return PTR_ERR(req);
   7532	io_req_task_queue_fail(req, -ECANCELED);
   7533	return 0;
   7534}
   7535
   7536static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
   7537{
   7538	switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
   7539	case IORING_TIMEOUT_BOOTTIME:
   7540		return CLOCK_BOOTTIME;
   7541	case IORING_TIMEOUT_REALTIME:
   7542		return CLOCK_REALTIME;
   7543	default:
   7544		/* can't happen, vetted at prep time */
   7545		WARN_ON_ONCE(1);
   7546		fallthrough;
   7547	case 0:
   7548		return CLOCK_MONOTONIC;
   7549	}
   7550}
   7551
   7552static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
   7553				    struct timespec64 *ts, enum hrtimer_mode mode)
   7554	__must_hold(&ctx->timeout_lock)
   7555{
   7556	struct io_timeout_data *io;
   7557	struct io_kiocb *req;
   7558	bool found = false;
   7559
   7560	list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
   7561		found = user_data == req->cqe.user_data;
   7562		if (found)
   7563			break;
   7564	}
   7565	if (!found)
   7566		return -ENOENT;
   7567
   7568	io = req->async_data;
   7569	if (hrtimer_try_to_cancel(&io->timer) == -1)
   7570		return -EALREADY;
   7571	hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
   7572	io->timer.function = io_link_timeout_fn;
   7573	hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
   7574	return 0;
   7575}
   7576
   7577static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
   7578			     struct timespec64 *ts, enum hrtimer_mode mode)
   7579	__must_hold(&ctx->timeout_lock)
   7580{
   7581	struct io_cancel_data cd = { .data = user_data, };
   7582	struct io_kiocb *req = io_timeout_extract(ctx, &cd);
   7583	struct io_timeout_data *data;
   7584
   7585	if (IS_ERR(req))
   7586		return PTR_ERR(req);
   7587
   7588	req->timeout.off = 0; /* noseq */
   7589	data = req->async_data;
   7590	list_add_tail(&req->timeout.list, &ctx->timeout_list);
   7591	hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
   7592	data->timer.function = io_timeout_fn;
   7593	hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
   7594	return 0;
   7595}
   7596
   7597static int io_timeout_remove_prep(struct io_kiocb *req,
   7598				  const struct io_uring_sqe *sqe)
   7599{
   7600	struct io_timeout_rem *tr = &req->timeout_rem;
   7601
   7602	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
   7603		return -EINVAL;
   7604	if (sqe->buf_index || sqe->len || sqe->splice_fd_in)
   7605		return -EINVAL;
   7606
   7607	tr->ltimeout = false;
   7608	tr->addr = READ_ONCE(sqe->addr);
   7609	tr->flags = READ_ONCE(sqe->timeout_flags);
   7610	if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
   7611		if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
   7612			return -EINVAL;
   7613		if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
   7614			tr->ltimeout = true;
   7615		if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
   7616			return -EINVAL;
   7617		if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
   7618			return -EFAULT;
   7619		if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
   7620			return -EINVAL;
   7621	} else if (tr->flags) {
   7622		/* timeout removal doesn't support flags */
   7623		return -EINVAL;
   7624	}
   7625
   7626	return 0;
   7627}
   7628
   7629static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
   7630{
   7631	return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
   7632					    : HRTIMER_MODE_REL;
   7633}
   7634
   7635/*
   7636 * Remove or update an existing timeout command
   7637 */
   7638static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
   7639{
   7640	struct io_timeout_rem *tr = &req->timeout_rem;
   7641	struct io_ring_ctx *ctx = req->ctx;
   7642	int ret;
   7643
   7644	if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
   7645		struct io_cancel_data cd = { .data = tr->addr, };
   7646
   7647		spin_lock(&ctx->completion_lock);
   7648		ret = io_timeout_cancel(ctx, &cd);
   7649		spin_unlock(&ctx->completion_lock);
   7650	} else {
   7651		enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
   7652
   7653		spin_lock_irq(&ctx->timeout_lock);
   7654		if (tr->ltimeout)
   7655			ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
   7656		else
   7657			ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
   7658		spin_unlock_irq(&ctx->timeout_lock);
   7659	}
   7660
   7661	if (ret < 0)
   7662		req_set_fail(req);
   7663	io_req_complete_post(req, ret, 0);
   7664	return 0;
   7665}
   7666
   7667static int __io_timeout_prep(struct io_kiocb *req,
   7668			     const struct io_uring_sqe *sqe,
   7669			     bool is_timeout_link)
   7670{
   7671	struct io_timeout_data *data;
   7672	unsigned flags;
   7673	u32 off = READ_ONCE(sqe->off);
   7674
   7675	if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in)
   7676		return -EINVAL;
   7677	if (off && is_timeout_link)
   7678		return -EINVAL;
   7679	flags = READ_ONCE(sqe->timeout_flags);
   7680	if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
   7681		      IORING_TIMEOUT_ETIME_SUCCESS))
   7682		return -EINVAL;
   7683	/* more than one clock specified is invalid, obviously */
   7684	if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
   7685		return -EINVAL;
   7686
   7687	INIT_LIST_HEAD(&req->timeout.list);
   7688	req->timeout.off = off;
   7689	if (unlikely(off && !req->ctx->off_timeout_used))
   7690		req->ctx->off_timeout_used = true;
   7691
   7692	if (WARN_ON_ONCE(req_has_async_data(req)))
   7693		return -EFAULT;
   7694	if (io_alloc_async_data(req))
   7695		return -ENOMEM;
   7696
   7697	data = req->async_data;
   7698	data->req = req;
   7699	data->flags = flags;
   7700
   7701	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
   7702		return -EFAULT;
   7703
   7704	if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
   7705		return -EINVAL;
   7706
   7707	INIT_LIST_HEAD(&req->timeout.list);
   7708	data->mode = io_translate_timeout_mode(flags);
   7709	hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
   7710
   7711	if (is_timeout_link) {
   7712		struct io_submit_link *link = &req->ctx->submit_state.link;
   7713
   7714		if (!link->head)
   7715			return -EINVAL;
   7716		if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
   7717			return -EINVAL;
   7718		req->timeout.head = link->last;
   7719		link->last->flags |= REQ_F_ARM_LTIMEOUT;
   7720	}
   7721	return 0;
   7722}
   7723
   7724static int io_timeout_prep(struct io_kiocb *req,
   7725			   const struct io_uring_sqe *sqe)
   7726{
   7727	return __io_timeout_prep(req, sqe, false);
   7728}
   7729
   7730static int io_link_timeout_prep(struct io_kiocb *req,
   7731				const struct io_uring_sqe *sqe)
   7732{
   7733	return __io_timeout_prep(req, sqe, true);
   7734}
   7735
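        /*
         * Queue a timeout. Sequenced timeouts are insertion-sorted by how
         * many completions they still have to wait for, so io_flush_timeouts()
         * only needs to look at the front of ->timeout_list.
         */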
   7736static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
   7737{
   7738	struct io_ring_ctx *ctx = req->ctx;
   7739	struct io_timeout_data *data = req->async_data;
   7740	struct list_head *entry;
   7741	u32 tail, off = req->timeout.off;
   7742
   7743	spin_lock_irq(&ctx->timeout_lock);
   7744
   7745	/*
   7746	 * sqe->off holds how many events that need to occur for this
   7747	 * timeout event to be satisfied. If it isn't set, then this is
   7748	 * a pure timeout request, sequence isn't used.
   7749	 */
   7750	if (io_is_timeout_noseq(req)) {
   7751		entry = ctx->timeout_list.prev;
   7752		goto add;
   7753	}
   7754
   7755	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
   7756	req->timeout.target_seq = tail + off;
   7757
   7758	/* Update the last seq here in case io_flush_timeouts() hasn't.
    7759	 * This is safe because ->timeout_lock is held, and submissions
    7760	 * and completions are never mixed in the same ->timeout_lock section.
   7761	 */
   7762	ctx->cq_last_tm_flush = tail;
   7763
   7764	/*
   7765	 * Insertion sort, ensuring the first entry in the list is always
   7766	 * the one we need first.
   7767	 */
   7768	list_for_each_prev(entry, &ctx->timeout_list) {
   7769		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
   7770						  timeout.list);
   7771
   7772		if (io_is_timeout_noseq(nxt))
   7773			continue;
   7774		/* nxt.seq is behind @tail, otherwise would've been completed */
   7775		if (off >= nxt->timeout.target_seq - tail)
   7776			break;
   7777	}
   7778add:
   7779	list_add(&req->timeout.list, entry);
   7780	data->timer.function = io_timeout_fn;
   7781	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
   7782	spin_unlock_irq(&ctx->timeout_lock);
   7783	return 0;
   7784}
   7785
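        /*
         * Match callback for io-wq cancellation: requests must belong to
         * the same ring, then match by file (CANCEL_FD), by user_data, or
         * unconditionally (CANCEL_ANY). The cancel_seq check keeps ALL/ANY
         * passes from matching the same request twice.
         */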
   7786static bool io_cancel_cb(struct io_wq_work *work, void *data)
   7787{
   7788	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
   7789	struct io_cancel_data *cd = data;
   7790
   7791	if (req->ctx != cd->ctx)
   7792		return false;
   7793	if (cd->flags & IORING_ASYNC_CANCEL_ANY) {
   7794		;
   7795	} else if (cd->flags & IORING_ASYNC_CANCEL_FD) {
   7796		if (req->file != cd->file)
   7797			return false;
   7798	} else {
   7799		if (req->cqe.user_data != cd->data)
   7800			return false;
   7801	}
   7802	if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
   7803		if (cd->seq == req->work.cancel_seq)
   7804			return false;
   7805		req->work.cancel_seq = cd->seq;
   7806	}
   7807	return true;
   7808}
   7809
   7810static int io_async_cancel_one(struct io_uring_task *tctx,
   7811			       struct io_cancel_data *cd)
   7812{
   7813	enum io_wq_cancel cancel_ret;
   7814	int ret = 0;
   7815	bool all;
   7816
   7817	if (!tctx || !tctx->io_wq)
   7818		return -ENOENT;
   7819
   7820	all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
   7821	cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all);
   7822	switch (cancel_ret) {
   7823	case IO_WQ_CANCEL_OK:
   7824		ret = 0;
   7825		break;
   7826	case IO_WQ_CANCEL_RUNNING:
   7827		ret = -EALREADY;
   7828		break;
   7829	case IO_WQ_CANCEL_NOTFOUND:
   7830		ret = -ENOENT;
   7831		break;
   7832	}
   7833
   7834	return ret;
   7835}
   7836
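        /*
         * Try to cancel a request: first in the current task's io-wq, then
         * in the poll cancel hash, and finally (unless cancelling by fd) in
         * the timeout list.
         */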
   7837static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd)
   7838{
   7839	struct io_ring_ctx *ctx = req->ctx;
   7840	int ret;
   7841
   7842	WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
   7843
   7844	ret = io_async_cancel_one(req->task->io_uring, cd);
   7845	/*
    7846	 * Fall through even for -EALREADY, as we may have a poll handler
    7847	 * armed that needs unarming.
   7848	 */
   7849	if (!ret)
   7850		return 0;
   7851
   7852	spin_lock(&ctx->completion_lock);
   7853	ret = io_poll_cancel(ctx, cd);
   7854	if (ret != -ENOENT)
   7855		goto out;
   7856	if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
   7857		ret = io_timeout_cancel(ctx, cd);
   7858out:
   7859	spin_unlock(&ctx->completion_lock);
   7860	return ret;
   7861}
   7862
   7863#define CANCEL_FLAGS	(IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
   7864			 IORING_ASYNC_CANCEL_ANY)
   7865
   7866static int io_async_cancel_prep(struct io_kiocb *req,
   7867				const struct io_uring_sqe *sqe)
   7868{
   7869	if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
   7870		return -EINVAL;
   7871	if (sqe->off || sqe->len || sqe->splice_fd_in)
   7872		return -EINVAL;
   7873
   7874	req->cancel.addr = READ_ONCE(sqe->addr);
   7875	req->cancel.flags = READ_ONCE(sqe->cancel_flags);
   7876	if (req->cancel.flags & ~CANCEL_FLAGS)
   7877		return -EINVAL;
   7878	if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) {
   7879		if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY)
   7880			return -EINVAL;
   7881		req->cancel.fd = READ_ONCE(sqe->fd);
   7882	}
   7883
   7884	return 0;
   7885}
   7886
   7887static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req,
   7888			     unsigned int issue_flags)
   7889{
   7890	bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
   7891	struct io_ring_ctx *ctx = cd->ctx;
   7892	struct io_tctx_node *node;
   7893	int ret, nr = 0;
   7894
   7895	do {
   7896		ret = io_try_cancel(req, cd);
   7897		if (ret == -ENOENT)
   7898			break;
   7899		if (!all)
   7900			return ret;
   7901		nr++;
   7902	} while (1);
   7903
   7904	/* slow path, try all io-wq's */
   7905	io_ring_submit_lock(ctx, issue_flags);
   7906	ret = -ENOENT;
   7907	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
   7908		struct io_uring_task *tctx = node->task->io_uring;
   7909
   7910		ret = io_async_cancel_one(tctx, cd);
   7911		if (ret != -ENOENT) {
   7912			if (!all)
   7913				break;
   7914			nr++;
   7915		}
   7916	}
   7917	io_ring_submit_unlock(ctx, issue_flags);
   7918	return all ? nr : ret;
   7919}
   7920
   7921static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
   7922{
   7923	struct io_cancel_data cd = {
   7924		.ctx	= req->ctx,
   7925		.data	= req->cancel.addr,
   7926		.flags	= req->cancel.flags,
   7927		.seq	= atomic_inc_return(&req->ctx->cancel_seq),
   7928	};
   7929	int ret;
   7930
   7931	if (cd.flags & IORING_ASYNC_CANCEL_FD) {
   7932		if (req->flags & REQ_F_FIXED_FILE)
   7933			req->file = io_file_get_fixed(req, req->cancel.fd,
   7934							issue_flags);
   7935		else
   7936			req->file = io_file_get_normal(req, req->cancel.fd);
   7937		if (!req->file) {
   7938			ret = -EBADF;
   7939			goto done;
   7940		}
   7941		cd.file = req->file;
   7942	}
   7943
   7944	ret = __io_async_cancel(&cd, req, issue_flags);
   7945done:
   7946	if (ret < 0)
   7947		req_set_fail(req);
   7948	io_req_complete_post(req, ret, 0);
   7949	return 0;
   7950}
   7951
   7952static int io_files_update_prep(struct io_kiocb *req,
   7953				const struct io_uring_sqe *sqe)
   7954{
   7955	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
   7956		return -EINVAL;
   7957	if (sqe->rw_flags || sqe->splice_fd_in)
   7958		return -EINVAL;
   7959
   7960	req->rsrc_update.offset = READ_ONCE(sqe->off);
   7961	req->rsrc_update.nr_args = READ_ONCE(sqe->len);
   7962	if (!req->rsrc_update.nr_args)
   7963		return -EINVAL;
   7964	req->rsrc_update.arg = READ_ONCE(sqe->addr);
   7965	return 0;
   7966}
   7967
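        /*
         * IORING_FILE_INDEX_ALLOC variant of a files update: install each
         * fd into a free fixed file slot and copy the chosen slot index
         * back to the user array; if that copy fails, the slot is closed
         * again. Returns the number of fds handled, or an error if none
         * were.
         */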
   7968static int io_files_update_with_index_alloc(struct io_kiocb *req,
   7969					    unsigned int issue_flags)
   7970{
   7971	__s32 __user *fds = u64_to_user_ptr(req->rsrc_update.arg);
   7972	unsigned int done;
   7973	struct file *file;
   7974	int ret, fd;
   7975
   7976	if (!req->ctx->file_data)
   7977		return -ENXIO;
   7978
   7979	for (done = 0; done < req->rsrc_update.nr_args; done++) {
   7980		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
   7981			ret = -EFAULT;
   7982			break;
   7983		}
   7984
   7985		file = fget(fd);
   7986		if (!file) {
   7987			ret = -EBADF;
   7988			break;
   7989		}
   7990		ret = io_fixed_fd_install(req, issue_flags, file,
   7991					  IORING_FILE_INDEX_ALLOC);
   7992		if (ret < 0)
   7993			break;
   7994		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
   7995			__io_close_fixed(req, issue_flags, ret);
   7996			ret = -EFAULT;
   7997			break;
   7998		}
   7999	}
   8000
   8001	if (done)
   8002		return done;
   8003	return ret;
   8004}
   8005
   8006static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
   8007{
   8008	struct io_ring_ctx *ctx = req->ctx;
   8009	struct io_uring_rsrc_update2 up;
   8010	int ret;
   8011
   8012	up.offset = req->rsrc_update.offset;
   8013	up.data = req->rsrc_update.arg;
   8014	up.nr = 0;
   8015	up.tags = 0;
   8016	up.resv = 0;
   8017	up.resv2 = 0;
   8018
   8019	if (req->rsrc_update.offset == IORING_FILE_INDEX_ALLOC) {
   8020		ret = io_files_update_with_index_alloc(req, issue_flags);
   8021	} else {
   8022		io_ring_submit_lock(ctx, issue_flags);
   8023		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
   8024				&up, req->rsrc_update.nr_args);
   8025		io_ring_submit_unlock(ctx, issue_flags);
   8026	}
   8027
   8028	if (ret < 0)
   8029		req_set_fail(req);
   8030	__io_req_complete(req, issue_flags, ret, 0);
   8031	return 0;
   8032}
   8033
   8034static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   8035{
   8036	switch (req->opcode) {
   8037	case IORING_OP_NOP:
   8038		return io_nop_prep(req, sqe);
   8039	case IORING_OP_READV:
   8040	case IORING_OP_READ_FIXED:
   8041	case IORING_OP_READ:
   8042	case IORING_OP_WRITEV:
   8043	case IORING_OP_WRITE_FIXED:
   8044	case IORING_OP_WRITE:
   8045		return io_prep_rw(req, sqe);
   8046	case IORING_OP_POLL_ADD:
   8047		return io_poll_add_prep(req, sqe);
   8048	case IORING_OP_POLL_REMOVE:
   8049		return io_poll_remove_prep(req, sqe);
   8050	case IORING_OP_FSYNC:
   8051		return io_fsync_prep(req, sqe);
   8052	case IORING_OP_SYNC_FILE_RANGE:
   8053		return io_sfr_prep(req, sqe);
   8054	case IORING_OP_SENDMSG:
   8055	case IORING_OP_SEND:
   8056		return io_sendmsg_prep(req, sqe);
   8057	case IORING_OP_RECVMSG:
   8058	case IORING_OP_RECV:
   8059		return io_recvmsg_prep(req, sqe);
   8060	case IORING_OP_CONNECT:
   8061		return io_connect_prep(req, sqe);
   8062	case IORING_OP_TIMEOUT:
   8063		return io_timeout_prep(req, sqe);
   8064	case IORING_OP_TIMEOUT_REMOVE:
   8065		return io_timeout_remove_prep(req, sqe);
   8066	case IORING_OP_ASYNC_CANCEL:
   8067		return io_async_cancel_prep(req, sqe);
   8068	case IORING_OP_LINK_TIMEOUT:
   8069		return io_link_timeout_prep(req, sqe);
   8070	case IORING_OP_ACCEPT:
   8071		return io_accept_prep(req, sqe);
   8072	case IORING_OP_FALLOCATE:
   8073		return io_fallocate_prep(req, sqe);
   8074	case IORING_OP_OPENAT:
   8075		return io_openat_prep(req, sqe);
   8076	case IORING_OP_CLOSE:
   8077		return io_close_prep(req, sqe);
   8078	case IORING_OP_FILES_UPDATE:
   8079		return io_files_update_prep(req, sqe);
   8080	case IORING_OP_STATX:
   8081		return io_statx_prep(req, sqe);
   8082	case IORING_OP_FADVISE:
   8083		return io_fadvise_prep(req, sqe);
   8084	case IORING_OP_MADVISE:
   8085		return io_madvise_prep(req, sqe);
   8086	case IORING_OP_OPENAT2:
   8087		return io_openat2_prep(req, sqe);
   8088	case IORING_OP_EPOLL_CTL:
   8089		return io_epoll_ctl_prep(req, sqe);
   8090	case IORING_OP_SPLICE:
   8091		return io_splice_prep(req, sqe);
   8092	case IORING_OP_PROVIDE_BUFFERS:
   8093		return io_provide_buffers_prep(req, sqe);
   8094	case IORING_OP_REMOVE_BUFFERS:
   8095		return io_remove_buffers_prep(req, sqe);
   8096	case IORING_OP_TEE:
   8097		return io_tee_prep(req, sqe);
   8098	case IORING_OP_SHUTDOWN:
   8099		return io_shutdown_prep(req, sqe);
   8100	case IORING_OP_RENAMEAT:
   8101		return io_renameat_prep(req, sqe);
   8102	case IORING_OP_UNLINKAT:
   8103		return io_unlinkat_prep(req, sqe);
   8104	case IORING_OP_MKDIRAT:
   8105		return io_mkdirat_prep(req, sqe);
   8106	case IORING_OP_SYMLINKAT:
   8107		return io_symlinkat_prep(req, sqe);
   8108	case IORING_OP_LINKAT:
   8109		return io_linkat_prep(req, sqe);
   8110	case IORING_OP_MSG_RING:
   8111		return io_msg_ring_prep(req, sqe);
   8112	case IORING_OP_FSETXATTR:
   8113		return io_fsetxattr_prep(req, sqe);
   8114	case IORING_OP_SETXATTR:
   8115		return io_setxattr_prep(req, sqe);
   8116	case IORING_OP_FGETXATTR:
   8117		return io_fgetxattr_prep(req, sqe);
   8118	case IORING_OP_GETXATTR:
   8119		return io_getxattr_prep(req, sqe);
   8120	case IORING_OP_SOCKET:
   8121		return io_socket_prep(req, sqe);
   8122	case IORING_OP_URING_CMD:
   8123		return io_uring_cmd_prep(req, sqe);
   8124	}
   8125
   8126	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
   8127			req->opcode);
   8128	return -EINVAL;
   8129}
   8130
   8131static int io_req_prep_async(struct io_kiocb *req)
   8132{
   8133	const struct io_op_def *def = &io_op_defs[req->opcode];
   8134
    8135	/* assign early for deferred execution for non-fixed files */
   8136	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
   8137		req->file = io_file_get_normal(req, req->cqe.fd);
   8138	if (!def->needs_async_setup)
   8139		return 0;
   8140	if (WARN_ON_ONCE(req_has_async_data(req)))
   8141		return -EFAULT;
   8142	if (io_alloc_async_data(req))
   8143		return -EAGAIN;
   8144
   8145	switch (req->opcode) {
   8146	case IORING_OP_READV:
   8147		return io_readv_prep_async(req);
   8148	case IORING_OP_WRITEV:
   8149		return io_writev_prep_async(req);
   8150	case IORING_OP_SENDMSG:
   8151		return io_sendmsg_prep_async(req);
   8152	case IORING_OP_RECVMSG:
   8153		return io_recvmsg_prep_async(req);
   8154	case IORING_OP_CONNECT:
   8155		return io_connect_prep_async(req);
   8156	case IORING_OP_URING_CMD:
   8157		return io_uring_cmd_prep_async(req);
   8158	}
   8159	printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
   8160		    req->opcode);
   8161	return -EFAULT;
   8162}
   8163
   8164static u32 io_get_sequence(struct io_kiocb *req)
   8165{
   8166	u32 seq = req->ctx->cached_sq_head;
   8167	struct io_kiocb *cur;
   8168
   8169	/* need original cached_sq_head, but it was increased for each req */
   8170	io_for_each_link(cur, req)
   8171		seq--;
   8172	return seq;
   8173}
   8174
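        /*
         * IOSQE_IO_DRAIN handling: if nothing earlier is still pending, the
         * request is queued right away. Otherwise it's prepped for async
         * execution and parked on ->defer_list until the preceding requests
         * have completed.
         */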
   8175static __cold void io_drain_req(struct io_kiocb *req)
   8176{
   8177	struct io_ring_ctx *ctx = req->ctx;
   8178	struct io_defer_entry *de;
   8179	int ret;
   8180	u32 seq = io_get_sequence(req);
   8181
    8182	/* Still need to defer if there are pending reqs in the defer list. */
   8183	spin_lock(&ctx->completion_lock);
   8184	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
   8185		spin_unlock(&ctx->completion_lock);
   8186queue:
   8187		ctx->drain_active = false;
   8188		io_req_task_queue(req);
   8189		return;
   8190	}
   8191	spin_unlock(&ctx->completion_lock);
   8192
   8193	ret = io_req_prep_async(req);
   8194	if (ret) {
   8195fail:
   8196		io_req_complete_failed(req, ret);
   8197		return;
   8198	}
   8199	io_prep_async_link(req);
   8200	de = kmalloc(sizeof(*de), GFP_KERNEL);
   8201	if (!de) {
   8202		ret = -ENOMEM;
   8203		goto fail;
   8204	}
   8205
   8206	spin_lock(&ctx->completion_lock);
   8207	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
   8208		spin_unlock(&ctx->completion_lock);
   8209		kfree(de);
   8210		goto queue;
   8211	}
   8212
   8213	trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode);
   8214	de->req = req;
   8215	de->seq = seq;
   8216	list_add_tail(&de->list, &ctx->defer_list);
   8217	spin_unlock(&ctx->completion_lock);
   8218}
   8219
   8220static void io_clean_op(struct io_kiocb *req)
   8221{
   8222	if (req->flags & REQ_F_BUFFER_SELECTED) {
   8223		spin_lock(&req->ctx->completion_lock);
   8224		io_put_kbuf_comp(req);
   8225		spin_unlock(&req->ctx->completion_lock);
   8226	}
   8227
   8228	if (req->flags & REQ_F_NEED_CLEANUP) {
   8229		switch (req->opcode) {
   8230		case IORING_OP_READV:
   8231		case IORING_OP_READ_FIXED:
   8232		case IORING_OP_READ:
   8233		case IORING_OP_WRITEV:
   8234		case IORING_OP_WRITE_FIXED:
   8235		case IORING_OP_WRITE: {
   8236			struct io_async_rw *io = req->async_data;
   8237
   8238			kfree(io->free_iovec);
   8239			break;
   8240			}
   8241		case IORING_OP_RECVMSG:
   8242		case IORING_OP_SENDMSG: {
   8243			struct io_async_msghdr *io = req->async_data;
   8244
   8245			kfree(io->free_iov);
   8246			break;
   8247			}
   8248		case IORING_OP_OPENAT:
   8249		case IORING_OP_OPENAT2:
   8250			if (req->open.filename)
   8251				putname(req->open.filename);
   8252			break;
   8253		case IORING_OP_RENAMEAT:
   8254			putname(req->rename.oldpath);
   8255			putname(req->rename.newpath);
   8256			break;
   8257		case IORING_OP_UNLINKAT:
   8258			putname(req->unlink.filename);
   8259			break;
   8260		case IORING_OP_MKDIRAT:
   8261			putname(req->mkdir.filename);
   8262			break;
   8263		case IORING_OP_SYMLINKAT:
   8264			putname(req->symlink.oldpath);
   8265			putname(req->symlink.newpath);
   8266			break;
   8267		case IORING_OP_LINKAT:
   8268			putname(req->hardlink.oldpath);
   8269			putname(req->hardlink.newpath);
   8270			break;
   8271		case IORING_OP_STATX:
   8272			if (req->statx.filename)
   8273				putname(req->statx.filename);
   8274			break;
   8275		case IORING_OP_SETXATTR:
   8276		case IORING_OP_FSETXATTR:
   8277		case IORING_OP_GETXATTR:
   8278		case IORING_OP_FGETXATTR:
   8279			__io_xattr_finish(req);
   8280			break;
   8281		}
   8282	}
   8283	if ((req->flags & REQ_F_POLLED) && req->apoll) {
   8284		kfree(req->apoll->double_poll);
   8285		kfree(req->apoll);
   8286		req->apoll = NULL;
   8287	}
   8288	if (req->flags & REQ_F_INFLIGHT) {
   8289		struct io_uring_task *tctx = req->task->io_uring;
   8290
   8291		atomic_dec(&tctx->inflight_tracked);
   8292	}
   8293	if (req->flags & REQ_F_CREDS)
   8294		put_cred(req->creds);
   8295	if (req->flags & REQ_F_ASYNC_DATA) {
   8296		kfree(req->async_data);
   8297		req->async_data = NULL;
   8298	}
   8299	req->flags &= ~IO_REQ_CLEAN_FLAGS;
   8300}
   8301
   8302static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
   8303{
   8304	if (req->file || !io_op_defs[req->opcode].needs_file)
   8305		return true;
   8306
   8307	if (req->flags & REQ_F_FIXED_FILE)
   8308		req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
   8309	else
   8310		req->file = io_file_get_normal(req, req->cqe.fd);
   8311
   8312	return !!req->file;
   8313}
   8314
   8315static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
   8316{
   8317	const struct io_op_def *def = &io_op_defs[req->opcode];
   8318	const struct cred *creds = NULL;
   8319	int ret;
   8320
   8321	if (unlikely(!io_assign_file(req, issue_flags)))
   8322		return -EBADF;
   8323
   8324	if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
   8325		creds = override_creds(req->creds);
   8326
   8327	if (!def->audit_skip)
   8328		audit_uring_entry(req->opcode);
   8329
   8330	switch (req->opcode) {
   8331	case IORING_OP_NOP:
   8332		ret = io_nop(req, issue_flags);
   8333		break;
   8334	case IORING_OP_READV:
   8335	case IORING_OP_READ_FIXED:
   8336	case IORING_OP_READ:
   8337		ret = io_read(req, issue_flags);
   8338		break;
   8339	case IORING_OP_WRITEV:
   8340	case IORING_OP_WRITE_FIXED:
   8341	case IORING_OP_WRITE:
   8342		ret = io_write(req, issue_flags);
   8343		break;
   8344	case IORING_OP_FSYNC:
   8345		ret = io_fsync(req, issue_flags);
   8346		break;
   8347	case IORING_OP_POLL_ADD:
   8348		ret = io_poll_add(req, issue_flags);
   8349		break;
   8350	case IORING_OP_POLL_REMOVE:
   8351		ret = io_poll_remove(req, issue_flags);
   8352		break;
   8353	case IORING_OP_SYNC_FILE_RANGE:
   8354		ret = io_sync_file_range(req, issue_flags);
   8355		break;
   8356	case IORING_OP_SENDMSG:
   8357		ret = io_sendmsg(req, issue_flags);
   8358		break;
   8359	case IORING_OP_SEND:
   8360		ret = io_send(req, issue_flags);
   8361		break;
   8362	case IORING_OP_RECVMSG:
   8363		ret = io_recvmsg(req, issue_flags);
   8364		break;
   8365	case IORING_OP_RECV:
   8366		ret = io_recv(req, issue_flags);
   8367		break;
   8368	case IORING_OP_TIMEOUT:
   8369		ret = io_timeout(req, issue_flags);
   8370		break;
   8371	case IORING_OP_TIMEOUT_REMOVE:
   8372		ret = io_timeout_remove(req, issue_flags);
   8373		break;
   8374	case IORING_OP_ACCEPT:
   8375		ret = io_accept(req, issue_flags);
   8376		break;
   8377	case IORING_OP_CONNECT:
   8378		ret = io_connect(req, issue_flags);
   8379		break;
   8380	case IORING_OP_ASYNC_CANCEL:
   8381		ret = io_async_cancel(req, issue_flags);
   8382		break;
   8383	case IORING_OP_FALLOCATE:
   8384		ret = io_fallocate(req, issue_flags);
   8385		break;
   8386	case IORING_OP_OPENAT:
   8387		ret = io_openat(req, issue_flags);
   8388		break;
   8389	case IORING_OP_CLOSE:
   8390		ret = io_close(req, issue_flags);
   8391		break;
   8392	case IORING_OP_FILES_UPDATE:
   8393		ret = io_files_update(req, issue_flags);
   8394		break;
   8395	case IORING_OP_STATX:
   8396		ret = io_statx(req, issue_flags);
   8397		break;
   8398	case IORING_OP_FADVISE:
   8399		ret = io_fadvise(req, issue_flags);
   8400		break;
   8401	case IORING_OP_MADVISE:
   8402		ret = io_madvise(req, issue_flags);
   8403		break;
   8404	case IORING_OP_OPENAT2:
   8405		ret = io_openat2(req, issue_flags);
   8406		break;
   8407	case IORING_OP_EPOLL_CTL:
   8408		ret = io_epoll_ctl(req, issue_flags);
   8409		break;
   8410	case IORING_OP_SPLICE:
   8411		ret = io_splice(req, issue_flags);
   8412		break;
   8413	case IORING_OP_PROVIDE_BUFFERS:
   8414		ret = io_provide_buffers(req, issue_flags);
   8415		break;
   8416	case IORING_OP_REMOVE_BUFFERS:
   8417		ret = io_remove_buffers(req, issue_flags);
   8418		break;
   8419	case IORING_OP_TEE:
   8420		ret = io_tee(req, issue_flags);
   8421		break;
   8422	case IORING_OP_SHUTDOWN:
   8423		ret = io_shutdown(req, issue_flags);
   8424		break;
   8425	case IORING_OP_RENAMEAT:
   8426		ret = io_renameat(req, issue_flags);
   8427		break;
   8428	case IORING_OP_UNLINKAT:
   8429		ret = io_unlinkat(req, issue_flags);
   8430		break;
   8431	case IORING_OP_MKDIRAT:
   8432		ret = io_mkdirat(req, issue_flags);
   8433		break;
   8434	case IORING_OP_SYMLINKAT:
   8435		ret = io_symlinkat(req, issue_flags);
   8436		break;
   8437	case IORING_OP_LINKAT:
   8438		ret = io_linkat(req, issue_flags);
   8439		break;
   8440	case IORING_OP_MSG_RING:
   8441		ret = io_msg_ring(req, issue_flags);
   8442		break;
   8443	case IORING_OP_FSETXATTR:
   8444		ret = io_fsetxattr(req, issue_flags);
   8445		break;
   8446	case IORING_OP_SETXATTR:
   8447		ret = io_setxattr(req, issue_flags);
   8448		break;
   8449	case IORING_OP_FGETXATTR:
   8450		ret = io_fgetxattr(req, issue_flags);
   8451		break;
   8452	case IORING_OP_GETXATTR:
   8453		ret = io_getxattr(req, issue_flags);
   8454		break;
   8455	case IORING_OP_SOCKET:
   8456		ret = io_socket(req, issue_flags);
   8457		break;
   8458	case IORING_OP_URING_CMD:
   8459		ret = io_uring_cmd(req, issue_flags);
   8460		break;
   8461	default:
   8462		ret = -EINVAL;
   8463		break;
   8464	}
   8465
   8466	if (!def->audit_skip)
   8467		audit_uring_exit(!ret, ret);
   8468
   8469	if (creds)
   8470		revert_creds(creds);
   8471	if (ret)
   8472		return ret;
   8473	/* If the op doesn't have a file, we're not polling for it */
   8474	if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
   8475		io_iopoll_req_issued(req, issue_flags);
   8476
   8477	return 0;
   8478}
   8479
   8480static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
   8481{
   8482	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
   8483
   8484	req = io_put_req_find_next(req);
   8485	return req ? &req->work : NULL;
   8486}
   8487
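        /*
         * io-wq worker entry point: issue the request with blocking
         * allowed, retrying on -EAGAIN. For forced-async pollable files a
         * non-blocking attempt is made first and async poll is armed on
         * -EAGAIN, falling back to a blocking retry if arming fails.
         */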
   8488static void io_wq_submit_work(struct io_wq_work *work)
   8489{
   8490	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
   8491	const struct io_op_def *def = &io_op_defs[req->opcode];
   8492	unsigned int issue_flags = IO_URING_F_UNLOCKED;
   8493	bool needs_poll = false;
   8494	int ret = 0, err = -ECANCELED;
   8495
   8496	/* one will be dropped by ->io_free_work() after returning to io-wq */
   8497	if (!(req->flags & REQ_F_REFCOUNT))
   8498		__io_req_set_refcount(req, 2);
   8499	else
   8500		req_ref_get(req);
   8501
   8502	io_arm_ltimeout(req);
   8503
   8504	/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
   8505	if (work->flags & IO_WQ_WORK_CANCEL) {
   8506fail:
   8507		io_req_task_queue_fail(req, err);
   8508		return;
   8509	}
   8510	if (!io_assign_file(req, issue_flags)) {
   8511		err = -EBADF;
   8512		work->flags |= IO_WQ_WORK_CANCEL;
   8513		goto fail;
   8514	}
   8515
   8516	if (req->flags & REQ_F_FORCE_ASYNC) {
   8517		bool opcode_poll = def->pollin || def->pollout;
   8518
   8519		if (opcode_poll && file_can_poll(req->file)) {
   8520			needs_poll = true;
   8521			issue_flags |= IO_URING_F_NONBLOCK;
   8522		}
   8523	}
   8524
   8525	do {
   8526		ret = io_issue_sqe(req, issue_flags);
   8527		if (ret != -EAGAIN)
   8528			break;
   8529		/*
   8530		 * We can get EAGAIN for iopolled IO even though we're
   8531		 * forcing a sync submission from here, since we can't
   8532		 * wait for request slots on the block side.
   8533		 */
   8534		if (!needs_poll) {
   8535			if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
   8536				break;
   8537			cond_resched();
   8538			continue;
   8539		}
   8540
   8541		if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
   8542			return;
   8543		/* aborted or ready, in either case retry blocking */
   8544		needs_poll = false;
   8545		issue_flags &= ~IO_URING_F_NONBLOCK;
   8546	} while (1);
   8547
   8548	/* avoid locking problems by failing it from a clean context */
   8549	if (ret)
   8550		io_req_task_queue_fail(req, ret);
   8551}
   8552
   8553static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
   8554						       unsigned i)
   8555{
   8556	return &table->files[i];
   8557}
   8558
   8559static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
   8560					      int index)
   8561{
   8562	struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
   8563
   8564	return (struct file *) (slot->file_ptr & FFS_MASK);
   8565}
   8566
   8567static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
   8568{
   8569	unsigned long file_ptr = (unsigned long) file;
   8570
   8571	file_ptr |= io_file_get_flags(file);
   8572	file_slot->file_ptr = file_ptr;
   8573}
   8574
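        /*
         * Resolve a fixed file under the ring lock. The slot index is
         * clamped with array_index_nospec(), and the per-file flag bits
         * kept in the low bits of the slot pointer are folded into
         * req->flags.
         */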
   8575static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
   8576					     unsigned int issue_flags)
   8577{
   8578	struct io_ring_ctx *ctx = req->ctx;
   8579	struct file *file = NULL;
   8580	unsigned long file_ptr;
   8581
   8582	io_ring_submit_lock(ctx, issue_flags);
   8583
   8584	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
   8585		goto out;
   8586	fd = array_index_nospec(fd, ctx->nr_user_files);
   8587	file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
   8588	file = (struct file *) (file_ptr & FFS_MASK);
   8589	file_ptr &= ~FFS_MASK;
   8590	/* mask in overlapping REQ_F and FFS bits */
   8591	req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
   8592	io_req_set_rsrc_node(req, ctx, 0);
   8593	WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap));
   8594out:
   8595	io_ring_submit_unlock(ctx, issue_flags);
   8596	return file;
   8597}
   8598
   8599static struct file *io_file_get_normal(struct io_kiocb *req, int fd)
   8600{
   8601	struct file *file = fget(fd);
   8602
   8603	trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd);
   8604
   8605	/* we don't allow fixed io_uring files */
   8606	if (file && file->f_op == &io_uring_fops)
   8607		io_req_track_inflight(req);
   8608	return file;
   8609}
   8610
   8611static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
   8612{
   8613	struct io_kiocb *prev = req->timeout.prev;
   8614	int ret = -ENOENT;
   8615
   8616	if (prev) {
   8617		if (!(req->task->flags & PF_EXITING)) {
   8618			struct io_cancel_data cd = {
   8619				.ctx		= req->ctx,
   8620				.data		= prev->cqe.user_data,
   8621			};
   8622
   8623			ret = io_try_cancel(req, &cd);
   8624		}
   8625		io_req_complete_post(req, ret ?: -ETIME, 0);
   8626		io_put_req(prev);
   8627	} else {
   8628		io_req_complete_post(req, -ETIME, 0);
   8629	}
   8630}
   8631
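        /*
         * hrtimer callback for a linked timeout: detach the timeout from
         * the request it was armed against and punt to task_work, which
         * tries to cancel that request and completes the timeout.
         */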
   8632static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
   8633{
   8634	struct io_timeout_data *data = container_of(timer,
   8635						struct io_timeout_data, timer);
   8636	struct io_kiocb *prev, *req = data->req;
   8637	struct io_ring_ctx *ctx = req->ctx;
   8638	unsigned long flags;
   8639
   8640	spin_lock_irqsave(&ctx->timeout_lock, flags);
   8641	prev = req->timeout.head;
   8642	req->timeout.head = NULL;
   8643
   8644	/*
   8645	 * We don't expect the list to be empty, that will only happen if we
   8646	 * race with the completion of the linked work.
   8647	 */
   8648	if (prev) {
   8649		io_remove_next_linked(prev);
   8650		if (!req_ref_inc_not_zero(prev))
   8651			prev = NULL;
   8652	}
   8653	list_del(&req->timeout.list);
   8654	req->timeout.prev = prev;
   8655	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
   8656
   8657	req->io_task_work.func = io_req_task_link_timeout;
   8658	io_req_task_work_add(req);
   8659	return HRTIMER_NORESTART;
   8660}
   8661
   8662static void io_queue_linked_timeout(struct io_kiocb *req)
   8663{
   8664	struct io_ring_ctx *ctx = req->ctx;
   8665
   8666	spin_lock_irq(&ctx->timeout_lock);
   8667	/*
   8668	 * If the back reference is NULL, then our linked request finished
    8669	 * before we got a chance to set up the timer.
   8670	 */
   8671	if (req->timeout.head) {
   8672		struct io_timeout_data *data = req->async_data;
   8673
   8674		data->timer.function = io_link_timeout_fn;
   8675		hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
   8676				data->mode);
   8677		list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
   8678	}
   8679	spin_unlock_irq(&ctx->timeout_lock);
   8680	/* drop submission reference */
   8681	io_put_req(req);
   8682}
   8683
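        /*
         * Slow path after an inline issue attempt failed. Anything other
         * than a retryable -EAGAIN fails the request; otherwise try arming
         * async poll, queueing a task_work retry if the file is already
         * ready, and fall back to punting the request to io-wq.
         */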
   8684static void io_queue_async(struct io_kiocb *req, int ret)
   8685	__must_hold(&req->ctx->uring_lock)
   8686{
   8687	struct io_kiocb *linked_timeout;
   8688
   8689	if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
   8690		io_req_complete_failed(req, ret);
   8691		return;
   8692	}
   8693
   8694	linked_timeout = io_prep_linked_timeout(req);
   8695
   8696	switch (io_arm_poll_handler(req, 0)) {
   8697	case IO_APOLL_READY:
   8698		io_req_task_queue(req);
   8699		break;
   8700	case IO_APOLL_ABORTED:
   8701		/*
   8702		 * Queued up for async execution, worker will release
   8703		 * submit reference when the iocb is actually submitted.
   8704		 */
   8705		io_kbuf_recycle(req, 0);
   8706		io_queue_iowq(req, NULL);
   8707		break;
   8708	case IO_APOLL_OK:
   8709		break;
   8710	}
   8711
   8712	if (linked_timeout)
   8713		io_queue_linked_timeout(linked_timeout);
   8714}
   8715
   8716static inline void io_queue_sqe(struct io_kiocb *req)
   8717	__must_hold(&req->ctx->uring_lock)
   8718{
   8719	int ret;
   8720
   8721	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
   8722
   8723	if (req->flags & REQ_F_COMPLETE_INLINE) {
   8724		io_req_add_compl_list(req);
   8725		return;
   8726	}
   8727	/*
   8728	 * We async punt it if the file wasn't marked NOWAIT, or if the file
   8729	 * doesn't support non-blocking read/write attempts
   8730	 */
   8731	if (likely(!ret))
   8732		io_arm_ltimeout(req);
   8733	else
   8734		io_queue_async(req, ret);
   8735}
   8736
   8737static void io_queue_sqe_fallback(struct io_kiocb *req)
   8738	__must_hold(&req->ctx->uring_lock)
   8739{
   8740	if (unlikely(req->flags & REQ_F_FAIL)) {
   8741		/*
    8742		 * We don't submit, fail them all. For that, replace hardlinks
    8743		 * with normal links; an extra REQ_F_LINK is tolerated.
   8744		 */
   8745		req->flags &= ~REQ_F_HARDLINK;
   8746		req->flags |= REQ_F_LINK;
   8747		io_req_complete_failed(req, req->cqe.res);
   8748	} else if (unlikely(req->ctx->drain_active)) {
   8749		io_drain_req(req);
   8750	} else {
   8751		int ret = io_req_prep_async(req);
   8752
   8753		if (unlikely(ret))
   8754			io_req_complete_failed(req, ret);
   8755		else
   8756			io_queue_iowq(req, NULL);
   8757	}
   8758}
   8759
   8760/*
   8761 * Check SQE restrictions (opcode and flags).
   8762 *
   8763 * Returns 'true' if SQE is allowed, 'false' otherwise.
   8764 */
   8765static inline bool io_check_restriction(struct io_ring_ctx *ctx,
   8766					struct io_kiocb *req,
   8767					unsigned int sqe_flags)
   8768{
   8769	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
   8770		return false;
   8771
   8772	if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
   8773	    ctx->restrictions.sqe_flags_required)
   8774		return false;
   8775
   8776	if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
   8777			  ctx->restrictions.sqe_flags_required))
   8778		return false;
   8779
   8780	return true;
   8781}
   8782
   8783static void io_init_req_drain(struct io_kiocb *req)
   8784{
   8785	struct io_ring_ctx *ctx = req->ctx;
   8786	struct io_kiocb *head = ctx->submit_state.link.head;
   8787
   8788	ctx->drain_active = true;
   8789	if (head) {
   8790		/*
   8791		 * If we need to drain a request in the middle of a link, drain
   8792		 * the head request and the next request/link after the current
   8793		 * link. Considering sequential execution of links,
   8794		 * REQ_F_IO_DRAIN will be maintained for every request of our
   8795		 * link.
   8796		 */
   8797		head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
   8798		ctx->drain_next = true;
   8799	}
   8800}
   8801
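        /*
         * Initialise a request from its SQE: read opcode, flags and
         * user_data, validate them against the opcode definition and any
         * ring restrictions, handle drain and plugging, pick up personality
         * credentials, and finally run the per-opcode prep handler.
         */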
   8802static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
   8803		       const struct io_uring_sqe *sqe)
   8804	__must_hold(&ctx->uring_lock)
   8805{
   8806	const struct io_op_def *def;
   8807	unsigned int sqe_flags;
   8808	int personality;
   8809	u8 opcode;
   8810
   8811	/* req is partially pre-initialised, see io_preinit_req() */
   8812	req->opcode = opcode = READ_ONCE(sqe->opcode);
    8813	/* same numerical values as the corresponding REQ_F_*, safe to copy */
   8814	req->flags = sqe_flags = READ_ONCE(sqe->flags);
   8815	req->cqe.user_data = READ_ONCE(sqe->user_data);
   8816	req->file = NULL;
   8817	req->rsrc_node = NULL;
   8818	req->task = current;
   8819
   8820	if (unlikely(opcode >= IORING_OP_LAST)) {
   8821		req->opcode = 0;
   8822		return -EINVAL;
   8823	}
   8824	def = &io_op_defs[opcode];
   8825	if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
   8826		/* enforce forwards compatibility on users */
   8827		if (sqe_flags & ~SQE_VALID_FLAGS)
   8828			return -EINVAL;
   8829		if (sqe_flags & IOSQE_BUFFER_SELECT) {
   8830			if (!def->buffer_select)
   8831				return -EOPNOTSUPP;
   8832			req->buf_index = READ_ONCE(sqe->buf_group);
   8833		}
   8834		if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
   8835			ctx->drain_disabled = true;
   8836		if (sqe_flags & IOSQE_IO_DRAIN) {
   8837			if (ctx->drain_disabled)
   8838				return -EOPNOTSUPP;
   8839			io_init_req_drain(req);
   8840		}
   8841	}
   8842	if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
   8843		if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
   8844			return -EACCES;
   8845		/* knock it to the slow queue path, will be drained there */
   8846		if (ctx->drain_active)
   8847			req->flags |= REQ_F_FORCE_ASYNC;
   8848		/* if there is no link, we're at "next" request and need to drain */
   8849		if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
   8850			ctx->drain_next = false;
   8851			ctx->drain_active = true;
   8852			req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
   8853		}
   8854	}
   8855
   8856	if (!def->ioprio && sqe->ioprio)
   8857		return -EINVAL;
   8858	if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
   8859		return -EINVAL;
   8860
   8861	if (def->needs_file) {
   8862		struct io_submit_state *state = &ctx->submit_state;
   8863
   8864		req->cqe.fd = READ_ONCE(sqe->fd);
   8865
   8866		/*
   8867		 * Plug now if we have more than 2 IO left after this, and the
   8868		 * target is potentially a read/write to block based storage.
   8869		 */
   8870		if (state->need_plug && def->plug) {
   8871			state->plug_started = true;
   8872			state->need_plug = false;
   8873			blk_start_plug_nr_ios(&state->plug, state->submit_nr);
   8874		}
   8875	}
   8876
   8877	personality = READ_ONCE(sqe->personality);
   8878	if (personality) {
   8879		int ret;
   8880
   8881		req->creds = xa_load(&ctx->personalities, personality);
   8882		if (!req->creds)
   8883			return -EINVAL;
   8884		get_cred(req->creds);
   8885		ret = security_uring_override_creds(req->creds);
   8886		if (ret) {
   8887			put_cred(req->creds);
   8888			return ret;
   8889		}
   8890		req->flags |= REQ_F_CREDS;
   8891	}
   8892
   8893	return io_req_prep(req, sqe);
   8894}
   8895
   8896static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
   8897				      struct io_kiocb *req, int ret)
   8898{
   8899	struct io_ring_ctx *ctx = req->ctx;
   8900	struct io_submit_link *link = &ctx->submit_state.link;
   8901	struct io_kiocb *head = link->head;
   8902
   8903	trace_io_uring_req_failed(sqe, ctx, req, ret);
   8904
   8905	/*
   8906	 * Avoid breaking links in the middle as it renders links with SQPOLL
   8907	 * unusable. Instead of failing eagerly, continue assembling the link if
   8908	 * applicable and mark the head with REQ_F_FAIL. The link flushing code
   8909	 * should find the flag and handle the rest.
   8910	 */
   8911	req_fail_link_node(req, ret);
   8912	if (head && !(head->flags & REQ_F_FAIL))
   8913		req_fail_link_node(head, -ECANCELED);
   8914
   8915	if (!(req->flags & IO_REQ_LINK_FLAGS)) {
   8916		if (head) {
   8917			link->last->link = req;
   8918			link->head = NULL;
   8919			req = head;
   8920		}
   8921		io_queue_sqe_fallback(req);
   8922		return ret;
   8923	}
   8924
   8925	if (head)
   8926		link->last->link = req;
   8927	else
   8928		link->head = req;
   8929	link->last = req;
   8930	return 0;
   8931}
   8932
   8933static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
   8934			 const struct io_uring_sqe *sqe)
   8935	__must_hold(&ctx->uring_lock)
   8936{
   8937	struct io_submit_link *link = &ctx->submit_state.link;
   8938	int ret;
   8939
   8940	ret = io_init_req(ctx, req, sqe);
   8941	if (unlikely(ret))
   8942		return io_submit_fail_init(sqe, req, ret);
   8943
   8944	/* don't need @sqe from now on */
   8945	trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode,
   8946				  req->flags, true,
   8947				  ctx->flags & IORING_SETUP_SQPOLL);
   8948
   8949	/*
   8950	 * If we already have a head request, queue this one for async
   8951	 * submittal once the head completes. If we don't have a head but
   8952	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
   8953	 * submitted sync once the chain is complete. If none of those
   8954	 * conditions are true (normal request), then just queue it.
   8955	 */
   8956	if (unlikely(link->head)) {
   8957		ret = io_req_prep_async(req);
   8958		if (unlikely(ret))
   8959			return io_submit_fail_init(sqe, req, ret);
   8960
   8961		trace_io_uring_link(ctx, req, link->head);
   8962		link->last->link = req;
   8963		link->last = req;
   8964
   8965		if (req->flags & IO_REQ_LINK_FLAGS)
   8966			return 0;
   8967		/* last request of the link, flush it */
   8968		req = link->head;
   8969		link->head = NULL;
   8970		if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
   8971			goto fallback;
   8972
   8973	} else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
   8974					  REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
   8975		if (req->flags & IO_REQ_LINK_FLAGS) {
   8976			link->head = req;
   8977			link->last = req;
   8978		} else {
   8979fallback:
   8980			io_queue_sqe_fallback(req);
   8981		}
   8982		return 0;
   8983	}
   8984
   8985	io_queue_sqe(req);
   8986	return 0;
   8987}
   8988
   8989/*
   8990 * Batched submission is done, ensure local IO is flushed out.
   8991 */
   8992static void io_submit_state_end(struct io_ring_ctx *ctx)
   8993{
   8994	struct io_submit_state *state = &ctx->submit_state;
   8995
   8996	if (unlikely(state->link.head))
   8997		io_queue_sqe_fallback(state->link.head);
   8998	/* flush only after queuing links as they can generate completions */
   8999	io_submit_flush_completions(ctx);
   9000	if (state->plug_started)
   9001		blk_finish_plug(&state->plug);
   9002}
   9003
   9004/*
   9005 * Start submission side cache.
   9006 */
   9007static void io_submit_state_start(struct io_submit_state *state,
   9008				  unsigned int max_ios)
   9009{
   9010	state->plug_started = false;
   9011	state->need_plug = max_ios > 2;
   9012	state->submit_nr = max_ios;
   9013	/* set only head, no need to init link_last in advance */
   9014	state->link.head = NULL;
   9015}
   9016
   9017static void io_commit_sqring(struct io_ring_ctx *ctx)
   9018{
   9019	struct io_rings *rings = ctx->rings;
   9020
   9021	/*
   9022	 * Ensure any loads from the SQEs are done at this point,
   9023	 * since once we write the new head, the application could
   9024	 * write new data to them.
   9025	 */
   9026	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
   9027}
   9028
   9029/*
   9030 * Fetch an sqe, if one is available. Note this returns a pointer to memory
   9031 * that is mapped by userspace. This means that care needs to be taken to
   9032 * ensure that reads are stable, as we cannot rely on userspace always
   9033 * being a good citizen. If members of the sqe are validated and then later
   9034 * used, it's important that those reads are done through READ_ONCE() to
   9035 * prevent a re-load down the line.
   9036 */
   9037static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
   9038{
   9039	unsigned head, mask = ctx->sq_entries - 1;
   9040	unsigned sq_idx = ctx->cached_sq_head++ & mask;
   9041
   9042	/*
   9043	 * The cached sq head (or cq tail) serves two purposes:
   9044	 *
    9045	 * 1) allows us to batch the cost of updating the user visible
    9046	 *    head.
   9047	 * 2) allows the kernel side to track the head on its own, even
   9048	 *    though the application is the one updating it.
   9049	 */
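        	/*
        	 * Each sq_array entry is an index into the SQE array; out of
        	 * range indices are dropped and accounted in sq_dropped below.
        	 */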
   9050	head = READ_ONCE(ctx->sq_array[sq_idx]);
   9051	if (likely(head < ctx->sq_entries)) {
    9052		/* 128-byte SQEs are twice as long, so double the index */
   9053		if (ctx->flags & IORING_SETUP_SQE128)
   9054			head <<= 1;
   9055		return &ctx->sq_sqes[head];
   9056	}
   9057
   9058	/* drop invalid entries */
   9059	ctx->cq_extra--;
   9060	WRITE_ONCE(ctx->rings->sq_dropped,
   9061		   READ_ONCE(ctx->rings->sq_dropped) + 1);
   9062	return NULL;
   9063}
   9064
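        /*
         * Submit up to @nr SQEs from the SQ ring. Requests come from the
         * per-ctx request cache; submission stops early on failure unless the
         * ring was created with IORING_SETUP_SUBMIT_ALL. Returns the number of
         * SQEs consumed, or -EAGAIN if nothing was submitted and no request
         * could be allocated.
         */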
   9065static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
   9066	__must_hold(&ctx->uring_lock)
   9067{
   9068	unsigned int entries = io_sqring_entries(ctx);
   9069	unsigned int left;
   9070	int ret;
   9071
   9072	if (unlikely(!entries))
   9073		return 0;
   9074	/* make sure SQ entry isn't read before tail */
   9075	ret = left = min3(nr, ctx->sq_entries, entries);
   9076	io_get_task_refs(left);
   9077	io_submit_state_start(&ctx->submit_state, left);
   9078
   9079	do {
   9080		const struct io_uring_sqe *sqe;
   9081		struct io_kiocb *req;
   9082
   9083		if (unlikely(!io_alloc_req_refill(ctx)))
   9084			break;
   9085		req = io_alloc_req(ctx);
   9086		sqe = io_get_sqe(ctx);
   9087		if (unlikely(!sqe)) {
   9088			io_req_add_to_cache(req, ctx);
   9089			break;
   9090		}
   9091
   9092		/*
   9093		 * Continue submitting even for sqe failure if the
    9094		 * ring was set up with IORING_SETUP_SUBMIT_ALL
   9095		 */
   9096		if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
   9097		    !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
   9098			left--;
   9099			break;
   9100		}
   9101	} while (--left);
   9102
   9103	if (unlikely(left)) {
   9104		ret -= left;
   9105		/* try again if it submitted nothing and can't allocate a req */
   9106		if (!ret && io_req_cache_empty(ctx))
   9107			ret = -EAGAIN;
   9108		current->io_uring->cached_refs += left;
   9109	}
   9110
   9111	io_submit_state_end(ctx);
    9112	/* Commit SQ ring head once we've consumed and submitted all SQEs */
   9113	io_commit_sqring(ctx);
   9114	return ret;
   9115}
   9116
   9117static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
   9118{
   9119	return READ_ONCE(sqd->state);
   9120}
   9121
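        /*
         * One SQPOLL iteration for a single ring: run pending iopoll
         * completions and submit new SQEs, capping the submit count when the
         * thread serves multiple rings so each gets a fair share.
         */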
   9122static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
   9123{
   9124	unsigned int to_submit;
   9125	int ret = 0;
   9126
   9127	to_submit = io_sqring_entries(ctx);
   9128	/* if we're handling multiple rings, cap submit size for fairness */
   9129	if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
   9130		to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
   9131
   9132	if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
   9133		const struct cred *creds = NULL;
   9134
   9135		if (ctx->sq_creds != current_cred())
   9136			creds = override_creds(ctx->sq_creds);
   9137
   9138		mutex_lock(&ctx->uring_lock);
   9139		if (!wq_list_empty(&ctx->iopoll_list))
   9140			io_do_iopoll(ctx, true);
   9141
   9142		/*
    9143		 * Don't submit if refs are dying. This is good for io_uring_register(),
    9144		 * but it is also relied upon by io_ring_exit_work().
   9145		 */
   9146		if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
   9147		    !(ctx->flags & IORING_SETUP_R_DISABLED))
   9148			ret = io_submit_sqes(ctx, to_submit);
   9149		mutex_unlock(&ctx->uring_lock);
   9150
   9151		if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
   9152			wake_up(&ctx->sqo_sq_wait);
   9153		if (creds)
   9154			revert_creds(creds);
   9155	}
   9156
   9157	return ret;
   9158}
   9159
   9160static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
   9161{
   9162	struct io_ring_ctx *ctx;
   9163	unsigned sq_thread_idle = 0;
   9164
   9165	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
   9166		sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
   9167	sqd->sq_thread_idle = sq_thread_idle;
   9168}
   9169
   9170static bool io_sqd_handle_event(struct io_sq_data *sqd)
   9171{
   9172	bool did_sig = false;
   9173	struct ksignal ksig;
   9174
   9175	if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
   9176	    signal_pending(current)) {
   9177		mutex_unlock(&sqd->lock);
   9178		if (signal_pending(current))
   9179			did_sig = get_signal(&ksig);
   9180		cond_resched();
   9181		mutex_lock(&sqd->lock);
   9182	}
   9183	return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
   9184}
   9185
   9186static int io_sq_thread(void *data)
   9187{
   9188	struct io_sq_data *sqd = data;
   9189	struct io_ring_ctx *ctx;
   9190	unsigned long timeout = 0;
   9191	char buf[TASK_COMM_LEN];
   9192	DEFINE_WAIT(wait);
   9193
   9194	snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
   9195	set_task_comm(current, buf);
   9196
   9197	if (sqd->sq_cpu != -1)
   9198		set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
   9199	else
   9200		set_cpus_allowed_ptr(current, cpu_online_mask);
   9201	current->flags |= PF_NO_SETAFFINITY;
   9202
   9203	audit_alloc_kernel(current);
   9204
   9205	mutex_lock(&sqd->lock);
   9206	while (1) {
   9207		bool cap_entries, sqt_spin = false;
   9208
   9209		if (io_sqd_events_pending(sqd) || signal_pending(current)) {
   9210			if (io_sqd_handle_event(sqd))
   9211				break;
   9212			timeout = jiffies + sqd->sq_thread_idle;
   9213		}
   9214
   9215		cap_entries = !list_is_singular(&sqd->ctx_list);
   9216		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
   9217			int ret = __io_sq_thread(ctx, cap_entries);
   9218
   9219			if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
   9220				sqt_spin = true;
   9221		}
   9222		if (io_run_task_work())
   9223			sqt_spin = true;
   9224
   9225		if (sqt_spin || !time_after(jiffies, timeout)) {
   9226			cond_resched();
   9227			if (sqt_spin)
   9228				timeout = jiffies + sqd->sq_thread_idle;
   9229			continue;
   9230		}
   9231
   9232		prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
   9233		if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
   9234			bool needs_sched = true;
   9235
   9236			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
   9237				atomic_or(IORING_SQ_NEED_WAKEUP,
   9238						&ctx->rings->sq_flags);
   9239				if ((ctx->flags & IORING_SETUP_IOPOLL) &&
   9240				    !wq_list_empty(&ctx->iopoll_list)) {
   9241					needs_sched = false;
   9242					break;
   9243				}
   9244
   9245				/*
   9246				 * Ensure the store of the wakeup flag is not
   9247				 * reordered with the load of the SQ tail
   9248				 */
   9249				smp_mb__after_atomic();
   9250
   9251				if (io_sqring_entries(ctx)) {
   9252					needs_sched = false;
   9253					break;
   9254				}
   9255			}
   9256
   9257			if (needs_sched) {
   9258				mutex_unlock(&sqd->lock);
   9259				schedule();
   9260				mutex_lock(&sqd->lock);
   9261			}
   9262			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
   9263				atomic_andnot(IORING_SQ_NEED_WAKEUP,
   9264						&ctx->rings->sq_flags);
   9265		}
   9266
   9267		finish_wait(&sqd->wait, &wait);
   9268		timeout = jiffies + sqd->sq_thread_idle;
   9269	}
   9270
   9271	io_uring_cancel_generic(true, sqd);
   9272	sqd->thread = NULL;
   9273	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
   9274		atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
   9275	io_run_task_work();
   9276	mutex_unlock(&sqd->lock);
   9277
   9278	audit_free(current);
   9279
   9280	complete(&sqd->exited);
   9281	do_exit(0);
   9282}
   9283
   9284struct io_wait_queue {
   9285	struct wait_queue_entry wq;
   9286	struct io_ring_ctx *ctx;
   9287	unsigned cq_tail;
   9288	unsigned nr_timeouts;
   9289};
   9290
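        /*
         * The distance between the current cached CQ tail and the tail we are
         * waiting for is computed as a signed int, so the check stays correct
         * if the 32-bit tail counter wraps around.
         */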
   9291static inline bool io_should_wake(struct io_wait_queue *iowq)
   9292{
   9293	struct io_ring_ctx *ctx = iowq->ctx;
   9294	int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
   9295
   9296	/*
   9297	 * Wake up if we have enough events, or if a timeout occurred since we
   9298	 * started waiting. For timeouts, we always want to return to userspace,
   9299	 * regardless of event count.
   9300	 */
   9301	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
   9302}
   9303
   9304static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
   9305			    int wake_flags, void *key)
   9306{
   9307	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
   9308							wq);
   9309
   9310	/*
   9311	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
   9312	 * the task, and the next invocation will do it.
   9313	 */
   9314	if (io_should_wake(iowq) ||
   9315	    test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq))
   9316		return autoremove_wake_function(curr, mode, wake_flags, key);
   9317	return -1;
   9318}
   9319
   9320static int io_run_task_work_sig(void)
   9321{
   9322	if (io_run_task_work())
   9323		return 1;
   9324	if (test_thread_flag(TIF_NOTIFY_SIGNAL))
   9325		return -ERESTARTSYS;
   9326	if (task_sigpending(current))
   9327		return -EINTR;
   9328	return 0;
   9329}
   9330
    9331/* when this returns >0, the caller should retry */
   9332static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
   9333					  struct io_wait_queue *iowq,
   9334					  ktime_t timeout)
   9335{
   9336	int ret;
   9337	unsigned long check_cq;
   9338
   9339	/* make sure we run task_work before checking for signals */
   9340	ret = io_run_task_work_sig();
   9341	if (ret || io_should_wake(iowq))
   9342		return ret;
   9343	check_cq = READ_ONCE(ctx->check_cq);
   9344	/* let the caller flush overflows, retry */
   9345	if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
   9346		return 1;
   9347	if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
   9348		return -EBADR;
   9349	if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
   9350		return -ETIME;
   9351	return 1;
   9352}
   9353
   9354/*
   9355 * Wait until events become available, if we don't already have some. The
   9356 * application must reap them itself, as they reside on the shared cq ring.
   9357 */
   9358static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
   9359			  const sigset_t __user *sig, size_t sigsz,
   9360			  struct __kernel_timespec __user *uts)
   9361{
   9362	struct io_wait_queue iowq;
   9363	struct io_rings *rings = ctx->rings;
   9364	ktime_t timeout = KTIME_MAX;
   9365	int ret;
   9366
   9367	do {
   9368		io_cqring_overflow_flush(ctx);
   9369		if (io_cqring_events(ctx) >= min_events)
   9370			return 0;
   9371		if (!io_run_task_work())
   9372			break;
   9373	} while (1);
   9374
   9375	if (sig) {
   9376#ifdef CONFIG_COMPAT
   9377		if (in_compat_syscall())
   9378			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
   9379						      sigsz);
   9380		else
   9381#endif
   9382			ret = set_user_sigmask(sig, sigsz);
   9383
   9384		if (ret)
   9385			return ret;
   9386	}
   9387
   9388	if (uts) {
   9389		struct timespec64 ts;
   9390
   9391		if (get_timespec64(&ts, uts))
   9392			return -EFAULT;
   9393		timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
   9394	}
   9395
   9396	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
   9397	iowq.wq.private = current;
   9398	INIT_LIST_HEAD(&iowq.wq.entry);
   9399	iowq.ctx = ctx;
   9400	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
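        	/* wake once the CQ tail has advanced by at least min_events */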
   9401	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
   9402
   9403	trace_io_uring_cqring_wait(ctx, min_events);
   9404	do {
   9405		/* if we can't even flush overflow, don't wait for more */
   9406		if (!io_cqring_overflow_flush(ctx)) {
   9407			ret = -EBUSY;
   9408			break;
   9409		}
   9410		prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
   9411						TASK_INTERRUPTIBLE);
   9412		ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
   9413		cond_resched();
   9414	} while (ret > 0);
   9415
   9416	finish_wait(&ctx->cq_wait, &iowq.wq);
   9417	restore_saved_sigmask_unless(ret == -EINTR);
   9418
   9419	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
   9420}
   9421
   9422static void io_free_page_table(void **table, size_t size)
   9423{
   9424	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
   9425
   9426	for (i = 0; i < nr_tables; i++)
   9427		kfree(table[i]);
   9428	kfree(table);
   9429}
   9430
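        /*
         * Allocate @size bytes as an array of page-sized chunks, so that large
         * tables don't need a single high-order allocation. Freed with
         * io_free_page_table().
         */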
   9431static __cold void **io_alloc_page_table(size_t size)
   9432{
   9433	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
   9434	size_t init_size = size;
   9435	void **table;
   9436
   9437	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
   9438	if (!table)
   9439		return NULL;
   9440
   9441	for (i = 0; i < nr_tables; i++) {
   9442		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
   9443
   9444		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
   9445		if (!table[i]) {
   9446			io_free_page_table(table, init_size);
   9447			return NULL;
   9448		}
   9449		size -= this_size;
   9450	}
   9451	return table;
   9452}
   9453
   9454static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
   9455{
   9456	percpu_ref_exit(&ref_node->refs);
   9457	kfree(ref_node);
   9458}
   9459
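        /*
         * Called when a node's percpu refs hit zero: mark the node done, move
         * completed nodes (in order) onto the put llist, and kick
         * rsrc_put_work to actually drop the queued resources.
         */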
   9460static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
   9461{
   9462	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
   9463	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
   9464	unsigned long flags;
   9465	bool first_add = false;
   9466	unsigned long delay = HZ;
   9467
   9468	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
   9469	node->done = true;
   9470
   9471	/* if we are mid-quiesce then do not delay */
   9472	if (node->rsrc_data->quiesce)
   9473		delay = 0;
   9474
   9475	while (!list_empty(&ctx->rsrc_ref_list)) {
   9476		node = list_first_entry(&ctx->rsrc_ref_list,
   9477					    struct io_rsrc_node, node);
   9478		/* recycle ref nodes in order */
   9479		if (!node->done)
   9480			break;
   9481		list_del(&node->node);
   9482		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
   9483	}
   9484	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
   9485
   9486	if (first_add)
   9487		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
   9488}
   9489
   9490static struct io_rsrc_node *io_rsrc_node_alloc(void)
   9491{
   9492	struct io_rsrc_node *ref_node;
   9493
   9494	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
   9495	if (!ref_node)
   9496		return NULL;
   9497
   9498	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
   9499			    0, GFP_KERNEL)) {
   9500		kfree(ref_node);
   9501		return NULL;
   9502	}
   9503	INIT_LIST_HEAD(&ref_node->node);
   9504	INIT_LIST_HEAD(&ref_node->rsrc_list);
   9505	ref_node->done = false;
   9506	return ref_node;
   9507}
   9508
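        /*
         * If @data_to_kill is given, retire the active rsrc node by killing
         * its percpu refs (its queued puts run once all users are gone), then
         * install the pre-allocated backup node as the new active node.
         */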
   9509static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
   9510				struct io_rsrc_data *data_to_kill)
   9511	__must_hold(&ctx->uring_lock)
   9512{
   9513	WARN_ON_ONCE(!ctx->rsrc_backup_node);
   9514	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
   9515
   9516	io_rsrc_refs_drop(ctx);
   9517
   9518	if (data_to_kill) {
   9519		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
   9520
   9521		rsrc_node->rsrc_data = data_to_kill;
   9522		spin_lock_irq(&ctx->rsrc_ref_lock);
   9523		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
   9524		spin_unlock_irq(&ctx->rsrc_ref_lock);
   9525
   9526		atomic_inc(&data_to_kill->refs);
   9527		percpu_ref_kill(&rsrc_node->refs);
   9528		ctx->rsrc_node = NULL;
   9529	}
   9530
   9531	if (!ctx->rsrc_node) {
   9532		ctx->rsrc_node = ctx->rsrc_backup_node;
   9533		ctx->rsrc_backup_node = NULL;
   9534	}
   9535}
   9536
   9537static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
   9538{
   9539	if (ctx->rsrc_backup_node)
   9540		return 0;
   9541	ctx->rsrc_backup_node = io_rsrc_node_alloc();
   9542	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
   9543}
   9544
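        /*
         * Wait for all references to @data to be dropped. The initial ref is
         * put here and ->uring_lock is released while waiting, so the loop has
         * to cope with signals, task_work and the data being revived by
         * another thread in the meantime.
         */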
   9545static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
   9546				      struct io_ring_ctx *ctx)
   9547{
   9548	int ret;
   9549
   9550	/* As we may drop ->uring_lock, other task may have started quiesce */
   9551	if (data->quiesce)
   9552		return -ENXIO;
   9553
   9554	data->quiesce = true;
   9555	do {
   9556		ret = io_rsrc_node_switch_start(ctx);
   9557		if (ret)
   9558			break;
   9559		io_rsrc_node_switch(ctx, data);
   9560
   9561		/* kill initial ref, already quiesced if zero */
   9562		if (atomic_dec_and_test(&data->refs))
   9563			break;
   9564		mutex_unlock(&ctx->uring_lock);
   9565		flush_delayed_work(&ctx->rsrc_put_work);
   9566		ret = wait_for_completion_interruptible(&data->done);
   9567		if (!ret) {
   9568			mutex_lock(&ctx->uring_lock);
   9569			if (atomic_read(&data->refs) > 0) {
   9570				/*
   9571				 * it has been revived by another thread while
   9572				 * we were unlocked
   9573				 */
   9574				mutex_unlock(&ctx->uring_lock);
   9575			} else {
   9576				break;
   9577			}
   9578		}
   9579
   9580		atomic_inc(&data->refs);
    9581		/* wait for all work items potentially completing data->done */
   9582		flush_delayed_work(&ctx->rsrc_put_work);
   9583		reinit_completion(&data->done);
   9584
   9585		ret = io_run_task_work_sig();
   9586		mutex_lock(&ctx->uring_lock);
   9587	} while (ret >= 0);
   9588	data->quiesce = false;
   9589
   9590	return ret;
   9591}
   9592
   9593static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
   9594{
   9595	unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
   9596	unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
   9597
   9598	return &data->tags[table_idx][off];
   9599}
   9600
   9601static void io_rsrc_data_free(struct io_rsrc_data *data)
   9602{
   9603	size_t size = data->nr * sizeof(data->tags[0][0]);
   9604
   9605	if (data->tags)
   9606		io_free_page_table((void **)data->tags, size);
   9607	kfree(data);
   9608}
   9609
   9610static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
   9611				     u64 __user *utags, unsigned nr,
   9612				     struct io_rsrc_data **pdata)
   9613{
   9614	struct io_rsrc_data *data;
   9615	int ret = -ENOMEM;
   9616	unsigned i;
   9617
   9618	data = kzalloc(sizeof(*data), GFP_KERNEL);
   9619	if (!data)
   9620		return -ENOMEM;
   9621	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
   9622	if (!data->tags) {
   9623		kfree(data);
   9624		return -ENOMEM;
   9625	}
   9626
   9627	data->nr = nr;
   9628	data->ctx = ctx;
   9629	data->do_put = do_put;
   9630	if (utags) {
   9631		ret = -EFAULT;
   9632		for (i = 0; i < nr; i++) {
   9633			u64 *tag_slot = io_get_tag_slot(data, i);
   9634
   9635			if (copy_from_user(tag_slot, &utags[i],
   9636					   sizeof(*tag_slot)))
   9637				goto fail;
   9638		}
   9639	}
   9640
   9641	atomic_set(&data->refs, 1);
   9642	init_completion(&data->done);
   9643	*pdata = data;
   9644	return 0;
   9645fail:
   9646	io_rsrc_data_free(data);
   9647	return ret;
   9648}
   9649
   9650static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
   9651{
   9652	table->files = kvcalloc(nr_files, sizeof(table->files[0]),
   9653				GFP_KERNEL_ACCOUNT);
   9654	if (unlikely(!table->files))
   9655		return false;
   9656
   9657	table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
   9658	if (unlikely(!table->bitmap)) {
   9659		kvfree(table->files);
   9660		return false;
   9661	}
   9662
   9663	return true;
   9664}
   9665
   9666static void io_free_file_tables(struct io_file_table *table)
   9667{
   9668	kvfree(table->files);
   9669	bitmap_free(table->bitmap);
   9670	table->files = NULL;
   9671	table->bitmap = NULL;
   9672}
   9673
   9674static inline void io_file_bitmap_set(struct io_file_table *table, int bit)
   9675{
   9676	WARN_ON_ONCE(test_bit(bit, table->bitmap));
   9677	__set_bit(bit, table->bitmap);
   9678	table->alloc_hint = bit + 1;
   9679}
   9680
   9681static inline void io_file_bitmap_clear(struct io_file_table *table, int bit)
   9682{
   9683	__clear_bit(bit, table->bitmap);
   9684	table->alloc_hint = bit;
   9685}
   9686
   9687static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
   9688{
   9689#if !defined(IO_URING_SCM_ALL)
   9690	int i;
   9691
   9692	for (i = 0; i < ctx->nr_user_files; i++) {
   9693		struct file *file = io_file_from_index(ctx, i);
   9694
   9695		if (!file)
   9696			continue;
   9697		if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
   9698			continue;
   9699		io_file_bitmap_clear(&ctx->file_table, i);
   9700		fput(file);
   9701	}
   9702#endif
   9703
   9704#if defined(CONFIG_UNIX)
   9705	if (ctx->ring_sock) {
   9706		struct sock *sock = ctx->ring_sock->sk;
   9707		struct sk_buff *skb;
   9708
   9709		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
   9710			kfree_skb(skb);
   9711	}
   9712#endif
   9713	io_free_file_tables(&ctx->file_table);
   9714	io_rsrc_data_free(ctx->file_data);
   9715	ctx->file_data = NULL;
   9716	ctx->nr_user_files = 0;
   9717}
   9718
   9719static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
   9720{
   9721	unsigned nr = ctx->nr_user_files;
   9722	int ret;
   9723
   9724	if (!ctx->file_data)
   9725		return -ENXIO;
   9726
   9727	/*
   9728	 * Quiesce may unlock ->uring_lock, and while it's not held
    9729	 * prevent new requests from using the table.
   9730	 */
   9731	ctx->nr_user_files = 0;
   9732	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
   9733	ctx->nr_user_files = nr;
   9734	if (!ret)
   9735		__io_sqe_files_unregister(ctx);
   9736	return ret;
   9737}
   9738
   9739static void io_sq_thread_unpark(struct io_sq_data *sqd)
   9740	__releases(&sqd->lock)
   9741{
   9742	WARN_ON_ONCE(sqd->thread == current);
   9743
   9744	/*
    9745	 * Unconditionally clear the bit and re-set it if needed; a conditional
    9746	 * clear_bit() would race with other threads incrementing park_pending and setting the bit.
   9747	 */
   9748	clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
   9749	if (atomic_dec_return(&sqd->park_pending))
   9750		set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
   9751	mutex_unlock(&sqd->lock);
   9752}
   9753
   9754static void io_sq_thread_park(struct io_sq_data *sqd)
   9755	__acquires(&sqd->lock)
   9756{
   9757	WARN_ON_ONCE(sqd->thread == current);
   9758
   9759	atomic_inc(&sqd->park_pending);
   9760	set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
   9761	mutex_lock(&sqd->lock);
   9762	if (sqd->thread)
   9763		wake_up_process(sqd->thread);
   9764}
   9765
   9766static void io_sq_thread_stop(struct io_sq_data *sqd)
   9767{
   9768	WARN_ON_ONCE(sqd->thread == current);
   9769	WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
   9770
   9771	set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
   9772	mutex_lock(&sqd->lock);
   9773	if (sqd->thread)
   9774		wake_up_process(sqd->thread);
   9775	mutex_unlock(&sqd->lock);
   9776	wait_for_completion(&sqd->exited);
   9777}
   9778
   9779static void io_put_sq_data(struct io_sq_data *sqd)
   9780{
   9781	if (refcount_dec_and_test(&sqd->refs)) {
   9782		WARN_ON_ONCE(atomic_read(&sqd->park_pending));
   9783
   9784		io_sq_thread_stop(sqd);
   9785		kfree(sqd);
   9786	}
   9787}
   9788
   9789static void io_sq_thread_finish(struct io_ring_ctx *ctx)
   9790{
   9791	struct io_sq_data *sqd = ctx->sq_data;
   9792
   9793	if (sqd) {
   9794		io_sq_thread_park(sqd);
   9795		list_del_init(&ctx->sqd_list);
   9796		io_sqd_update_thread_idle(sqd);
   9797		io_sq_thread_unpark(sqd);
   9798
   9799		io_put_sq_data(sqd);
   9800		ctx->sq_data = NULL;
   9801	}
   9802}
   9803
   9804static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
   9805{
   9806	struct io_ring_ctx *ctx_attach;
   9807	struct io_sq_data *sqd;
   9808	struct fd f;
   9809
   9810	f = fdget(p->wq_fd);
   9811	if (!f.file)
   9812		return ERR_PTR(-ENXIO);
   9813	if (f.file->f_op != &io_uring_fops) {
   9814		fdput(f);
   9815		return ERR_PTR(-EINVAL);
   9816	}
   9817
   9818	ctx_attach = f.file->private_data;
   9819	sqd = ctx_attach->sq_data;
   9820	if (!sqd) {
   9821		fdput(f);
   9822		return ERR_PTR(-EINVAL);
   9823	}
   9824	if (sqd->task_tgid != current->tgid) {
   9825		fdput(f);
   9826		return ERR_PTR(-EPERM);
   9827	}
   9828
   9829	refcount_inc(&sqd->refs);
   9830	fdput(f);
   9831	return sqd;
   9832}
   9833
   9834static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
   9835					 bool *attached)
   9836{
   9837	struct io_sq_data *sqd;
   9838
   9839	*attached = false;
   9840	if (p->flags & IORING_SETUP_ATTACH_WQ) {
   9841		sqd = io_attach_sq_data(p);
   9842		if (!IS_ERR(sqd)) {
   9843			*attached = true;
   9844			return sqd;
   9845		}
   9846		/* fall through for EPERM case, setup new sqd/task */
   9847		if (PTR_ERR(sqd) != -EPERM)
   9848			return sqd;
   9849	}
   9850
   9851	sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
   9852	if (!sqd)
   9853		return ERR_PTR(-ENOMEM);
   9854
   9855	atomic_set(&sqd->park_pending, 0);
   9856	refcount_set(&sqd->refs, 1);
   9857	INIT_LIST_HEAD(&sqd->ctx_list);
   9858	mutex_init(&sqd->lock);
   9859	init_waitqueue_head(&sqd->wait);
   9860	init_completion(&sqd->exited);
   9861	return sqd;
   9862}
   9863
   9864/*
   9865 * Ensure the UNIX gc is aware of our file set, so we are certain that
   9866 * the io_uring can be safely unregistered on process exit, even if we have
    9867 * reference loops among the files. We account only files that can hold other
   9868 * files because otherwise they can't form a loop and so are not interesting
   9869 * for GC.
   9870 */
   9871static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
   9872{
   9873#if defined(CONFIG_UNIX)
   9874	struct sock *sk = ctx->ring_sock->sk;
   9875	struct sk_buff_head *head = &sk->sk_receive_queue;
   9876	struct scm_fp_list *fpl;
   9877	struct sk_buff *skb;
   9878
   9879	if (likely(!io_file_need_scm(file)))
   9880		return 0;
   9881
   9882	/*
   9883	 * See if we can merge this file into an existing skb SCM_RIGHTS
   9884	 * file set. If there's no room, fall back to allocating a new skb
   9885	 * and filling it in.
   9886	 */
   9887	spin_lock_irq(&head->lock);
   9888	skb = skb_peek(head);
   9889	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
   9890		__skb_unlink(skb, head);
   9891	else
   9892		skb = NULL;
   9893	spin_unlock_irq(&head->lock);
   9894
   9895	if (!skb) {
   9896		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
   9897		if (!fpl)
   9898			return -ENOMEM;
   9899
   9900		skb = alloc_skb(0, GFP_KERNEL);
   9901		if (!skb) {
   9902			kfree(fpl);
   9903			return -ENOMEM;
   9904		}
   9905
   9906		fpl->user = get_uid(current_user());
   9907		fpl->max = SCM_MAX_FD;
   9908		fpl->count = 0;
   9909
   9910		UNIXCB(skb).fp = fpl;
   9911		skb->sk = sk;
   9912		skb->destructor = unix_destruct_scm;
   9913		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
   9914	}
   9915
   9916	fpl = UNIXCB(skb).fp;
   9917	fpl->fp[fpl->count++] = get_file(file);
   9918	unix_inflight(fpl->user, file);
   9919	skb_queue_head(head, skb);
   9920	fput(file);
   9921#endif
   9922	return 0;
   9923}
   9924
   9925static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
   9926{
   9927	struct file *file = prsrc->file;
   9928#if defined(CONFIG_UNIX)
   9929	struct sock *sock = ctx->ring_sock->sk;
   9930	struct sk_buff_head list, *head = &sock->sk_receive_queue;
   9931	struct sk_buff *skb;
   9932	int i;
   9933
   9934	if (!io_file_need_scm(file)) {
   9935		fput(file);
   9936		return;
   9937	}
   9938
   9939	__skb_queue_head_init(&list);
   9940
   9941	/*
   9942	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
   9943	 * remove this entry and rearrange the file array.
   9944	 */
   9945	skb = skb_dequeue(head);
   9946	while (skb) {
   9947		struct scm_fp_list *fp;
   9948
   9949		fp = UNIXCB(skb).fp;
   9950		for (i = 0; i < fp->count; i++) {
   9951			int left;
   9952
   9953			if (fp->fp[i] != file)
   9954				continue;
   9955
   9956			unix_notinflight(fp->user, fp->fp[i]);
   9957			left = fp->count - 1 - i;
   9958			if (left) {
   9959				memmove(&fp->fp[i], &fp->fp[i + 1],
   9960						left * sizeof(struct file *));
   9961			}
   9962			fp->count--;
   9963			if (!fp->count) {
   9964				kfree_skb(skb);
   9965				skb = NULL;
   9966			} else {
   9967				__skb_queue_tail(&list, skb);
   9968			}
   9969			fput(file);
   9970			file = NULL;
   9971			break;
   9972		}
   9973
   9974		if (!file)
   9975			break;
   9976
   9977		__skb_queue_tail(&list, skb);
   9978
   9979		skb = skb_dequeue(head);
   9980	}
   9981
   9982	if (skb_peek(&list)) {
   9983		spin_lock_irq(&head->lock);
   9984		while ((skb = __skb_dequeue(&list)) != NULL)
   9985			__skb_queue_tail(head, skb);
   9986		spin_unlock_irq(&head->lock);
   9987	}
   9988#else
   9989	fput(file);
   9990#endif
   9991}
   9992
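        /*
         * Drop all resources queued on this node. A resource that was
         * registered with a non-zero tag gets a CQE posted with that tag as
         * user_data when it is released.
         */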
   9993static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
   9994{
   9995	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
   9996	struct io_ring_ctx *ctx = rsrc_data->ctx;
   9997	struct io_rsrc_put *prsrc, *tmp;
   9998
   9999	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
  10000		list_del(&prsrc->list);
  10001
  10002		if (prsrc->tag) {
  10003			if (ctx->flags & IORING_SETUP_IOPOLL)
  10004				mutex_lock(&ctx->uring_lock);
  10005
  10006			spin_lock(&ctx->completion_lock);
  10007			io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
  10008			io_commit_cqring(ctx);
  10009			spin_unlock(&ctx->completion_lock);
  10010			io_cqring_ev_posted(ctx);
  10011
  10012			if (ctx->flags & IORING_SETUP_IOPOLL)
  10013				mutex_unlock(&ctx->uring_lock);
  10014		}
  10015
  10016		rsrc_data->do_put(ctx, prsrc);
  10017		kfree(prsrc);
  10018	}
  10019
  10020	io_rsrc_node_destroy(ref_node);
  10021	if (atomic_dec_and_test(&rsrc_data->refs))
  10022		complete(&rsrc_data->done);
  10023}
  10024
  10025static void io_rsrc_put_work(struct work_struct *work)
  10026{
  10027	struct io_ring_ctx *ctx;
  10028	struct llist_node *node;
  10029
  10030	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
  10031	node = llist_del_all(&ctx->rsrc_put_llist);
  10032
  10033	while (node) {
  10034		struct io_rsrc_node *ref_node;
  10035		struct llist_node *next = node->next;
  10036
  10037		ref_node = llist_entry(node, struct io_rsrc_node, llist);
  10038		__io_rsrc_put_work(ref_node);
  10039		node = next;
  10040	}
  10041}
  10042
  10043static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
  10044				 unsigned nr_args, u64 __user *tags)
  10045{
  10046	__s32 __user *fds = (__s32 __user *) arg;
  10047	struct file *file;
  10048	int fd, ret;
  10049	unsigned i;
  10050
  10051	if (ctx->file_data)
  10052		return -EBUSY;
  10053	if (!nr_args)
  10054		return -EINVAL;
  10055	if (nr_args > IORING_MAX_FIXED_FILES)
  10056		return -EMFILE;
  10057	if (nr_args > rlimit(RLIMIT_NOFILE))
  10058		return -EMFILE;
  10059	ret = io_rsrc_node_switch_start(ctx);
  10060	if (ret)
  10061		return ret;
  10062	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
  10063				 &ctx->file_data);
  10064	if (ret)
  10065		return ret;
  10066
  10067	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
  10068		io_rsrc_data_free(ctx->file_data);
  10069		ctx->file_data = NULL;
  10070		return -ENOMEM;
  10071	}
  10072
  10073	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
  10074		struct io_fixed_file *file_slot;
  10075
  10076		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
  10077			ret = -EFAULT;
  10078			goto fail;
  10079		}
  10080		/* allow sparse sets */
  10081		if (!fds || fd == -1) {
  10082			ret = -EINVAL;
  10083			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
  10084				goto fail;
  10085			continue;
  10086		}
  10087
  10088		file = fget(fd);
  10089		ret = -EBADF;
  10090		if (unlikely(!file))
  10091			goto fail;
  10092
  10093		/*
  10094		 * Don't allow io_uring instances to be registered. If UNIX
  10095		 * isn't enabled, then this causes a reference cycle and this
  10096		 * instance can never get freed. If UNIX is enabled we'll
  10097		 * handle it just fine, but there's still no point in allowing
  10098		 * a ring fd as it doesn't support regular read/write anyway.
  10099		 */
  10100		if (file->f_op == &io_uring_fops) {
  10101			fput(file);
  10102			goto fail;
  10103		}
  10104		ret = io_scm_file_account(ctx, file);
  10105		if (ret) {
  10106			fput(file);
  10107			goto fail;
  10108		}
  10109		file_slot = io_fixed_file_slot(&ctx->file_table, i);
  10110		io_fixed_file_set(file_slot, file);
  10111		io_file_bitmap_set(&ctx->file_table, i);
  10112	}
  10113
  10114	io_rsrc_node_switch(ctx, NULL);
  10115	return 0;
  10116fail:
  10117	__io_sqe_files_unregister(ctx);
  10118	return ret;
  10119}
  10120
  10121static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
  10122				 struct io_rsrc_node *node, void *rsrc)
  10123{
  10124	u64 *tag_slot = io_get_tag_slot(data, idx);
  10125	struct io_rsrc_put *prsrc;
  10126
  10127	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
  10128	if (!prsrc)
  10129		return -ENOMEM;
  10130
  10131	prsrc->tag = *tag_slot;
  10132	*tag_slot = 0;
  10133	prsrc->rsrc = rsrc;
  10134	list_add(&prsrc->list, &node->rsrc_list);
  10135	return 0;
  10136}
  10137
  10138static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
  10139				 unsigned int issue_flags, u32 slot_index)
  10140	__must_hold(&req->ctx->uring_lock)
  10141{
  10142	struct io_ring_ctx *ctx = req->ctx;
  10143	bool needs_switch = false;
  10144	struct io_fixed_file *file_slot;
  10145	int ret;
  10146
  10147	if (file->f_op == &io_uring_fops)
  10148		return -EBADF;
  10149	if (!ctx->file_data)
  10150		return -ENXIO;
  10151	if (slot_index >= ctx->nr_user_files)
  10152		return -EINVAL;
  10153
  10154	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
  10155	file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
  10156
  10157	if (file_slot->file_ptr) {
  10158		struct file *old_file;
  10159
  10160		ret = io_rsrc_node_switch_start(ctx);
  10161		if (ret)
  10162			goto err;
  10163
  10164		old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
  10165		ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
  10166					    ctx->rsrc_node, old_file);
  10167		if (ret)
  10168			goto err;
  10169		file_slot->file_ptr = 0;
  10170		io_file_bitmap_clear(&ctx->file_table, slot_index);
  10171		needs_switch = true;
  10172	}
  10173
  10174	ret = io_scm_file_account(ctx, file);
  10175	if (!ret) {
  10176		*io_get_tag_slot(ctx->file_data, slot_index) = 0;
  10177		io_fixed_file_set(file_slot, file);
  10178		io_file_bitmap_set(&ctx->file_table, slot_index);
  10179	}
  10180err:
  10181	if (needs_switch)
  10182		io_rsrc_node_switch(ctx, ctx->file_data);
  10183	if (ret)
  10184		fput(file);
  10185	return ret;
  10186}
  10187
  10188static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags,
  10189			    unsigned int offset)
  10190{
  10191	struct io_ring_ctx *ctx = req->ctx;
  10192	struct io_fixed_file *file_slot;
  10193	struct file *file;
  10194	int ret;
  10195
  10196	io_ring_submit_lock(ctx, issue_flags);
  10197	ret = -ENXIO;
  10198	if (unlikely(!ctx->file_data))
  10199		goto out;
  10200	ret = -EINVAL;
  10201	if (offset >= ctx->nr_user_files)
  10202		goto out;
  10203	ret = io_rsrc_node_switch_start(ctx);
  10204	if (ret)
  10205		goto out;
  10206
  10207	offset = array_index_nospec(offset, ctx->nr_user_files);
  10208	file_slot = io_fixed_file_slot(&ctx->file_table, offset);
  10209	ret = -EBADF;
  10210	if (!file_slot->file_ptr)
  10211		goto out;
  10212
  10213	file = (struct file *)(file_slot->file_ptr & FFS_MASK);
  10214	ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
  10215	if (ret)
  10216		goto out;
  10217
  10218	file_slot->file_ptr = 0;
  10219	io_file_bitmap_clear(&ctx->file_table, offset);
  10220	io_rsrc_node_switch(ctx, ctx->file_data);
  10221	ret = 0;
  10222out:
  10223	io_ring_submit_unlock(ctx, issue_flags);
  10224	return ret;
  10225}
  10226
  10227static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
  10228{
  10229	return __io_close_fixed(req, issue_flags, req->close.file_slot - 1);
  10230}
  10231
  10232static int __io_sqe_files_update(struct io_ring_ctx *ctx,
  10233				 struct io_uring_rsrc_update2 *up,
  10234				 unsigned nr_args)
  10235{
  10236	u64 __user *tags = u64_to_user_ptr(up->tags);
  10237	__s32 __user *fds = u64_to_user_ptr(up->data);
  10238	struct io_rsrc_data *data = ctx->file_data;
  10239	struct io_fixed_file *file_slot;
  10240	struct file *file;
  10241	int fd, i, err = 0;
  10242	unsigned int done;
  10243	bool needs_switch = false;
  10244
  10245	if (!ctx->file_data)
  10246		return -ENXIO;
  10247	if (up->offset + nr_args > ctx->nr_user_files)
  10248		return -EINVAL;
  10249
  10250	for (done = 0; done < nr_args; done++) {
  10251		u64 tag = 0;
  10252
  10253		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
  10254		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
  10255			err = -EFAULT;
  10256			break;
  10257		}
  10258		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
  10259			err = -EINVAL;
  10260			break;
  10261		}
  10262		if (fd == IORING_REGISTER_FILES_SKIP)
  10263			continue;
  10264
  10265		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
  10266		file_slot = io_fixed_file_slot(&ctx->file_table, i);
  10267
  10268		if (file_slot->file_ptr) {
  10269			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
  10270			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
  10271			if (err)
  10272				break;
  10273			file_slot->file_ptr = 0;
  10274			io_file_bitmap_clear(&ctx->file_table, i);
  10275			needs_switch = true;
  10276		}
  10277		if (fd != -1) {
  10278			file = fget(fd);
  10279			if (!file) {
  10280				err = -EBADF;
  10281				break;
  10282			}
  10283			/*
  10284			 * Don't allow io_uring instances to be registered. If
  10285			 * UNIX isn't enabled, then this causes a reference
  10286			 * cycle and this instance can never get freed. If UNIX
  10287			 * is enabled we'll handle it just fine, but there's
  10288			 * still no point in allowing a ring fd as it doesn't
  10289			 * support regular read/write anyway.
  10290			 */
  10291			if (file->f_op == &io_uring_fops) {
  10292				fput(file);
  10293				err = -EBADF;
  10294				break;
  10295			}
  10296			err = io_scm_file_account(ctx, file);
  10297			if (err) {
  10298				fput(file);
  10299				break;
  10300			}
  10301			*io_get_tag_slot(data, i) = tag;
  10302			io_fixed_file_set(file_slot, file);
  10303			io_file_bitmap_set(&ctx->file_table, i);
  10304		}
  10305	}
  10306
  10307	if (needs_switch)
  10308		io_rsrc_node_switch(ctx, data);
  10309	return done ? done : err;
  10310}
  10311
  10312static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
  10313					struct task_struct *task)
  10314{
  10315	struct io_wq_hash *hash;
  10316	struct io_wq_data data;
  10317	unsigned int concurrency;
  10318
  10319	mutex_lock(&ctx->uring_lock);
  10320	hash = ctx->hash_map;
  10321	if (!hash) {
  10322		hash = kzalloc(sizeof(*hash), GFP_KERNEL);
  10323		if (!hash) {
  10324			mutex_unlock(&ctx->uring_lock);
  10325			return ERR_PTR(-ENOMEM);
  10326		}
  10327		refcount_set(&hash->refs, 1);
  10328		init_waitqueue_head(&hash->wait);
  10329		ctx->hash_map = hash;
  10330	}
  10331	mutex_unlock(&ctx->uring_lock);
  10332
  10333	data.hash = hash;
  10334	data.task = task;
  10335	data.free_work = io_wq_free_work;
  10336	data.do_work = io_wq_submit_work;
  10337
   10338	/* Do QD, or 4 * CPUS, whichever is smaller */
  10339	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
  10340
  10341	return io_wq_create(concurrency, &data);
  10342}
  10343
  10344static __cold int io_uring_alloc_task_context(struct task_struct *task,
  10345					      struct io_ring_ctx *ctx)
  10346{
  10347	struct io_uring_task *tctx;
  10348	int ret;
  10349
  10350	tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
  10351	if (unlikely(!tctx))
  10352		return -ENOMEM;
  10353
  10354	tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
  10355					 sizeof(struct file *), GFP_KERNEL);
  10356	if (unlikely(!tctx->registered_rings)) {
  10357		kfree(tctx);
  10358		return -ENOMEM;
  10359	}
  10360
  10361	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
  10362	if (unlikely(ret)) {
  10363		kfree(tctx->registered_rings);
  10364		kfree(tctx);
  10365		return ret;
  10366	}
  10367
  10368	tctx->io_wq = io_init_wq_offload(ctx, task);
  10369	if (IS_ERR(tctx->io_wq)) {
  10370		ret = PTR_ERR(tctx->io_wq);
  10371		percpu_counter_destroy(&tctx->inflight);
  10372		kfree(tctx->registered_rings);
  10373		kfree(tctx);
  10374		return ret;
  10375	}
  10376
  10377	xa_init(&tctx->xa);
  10378	init_waitqueue_head(&tctx->wait);
  10379	atomic_set(&tctx->in_idle, 0);
  10380	atomic_set(&tctx->inflight_tracked, 0);
  10381	task->io_uring = tctx;
  10382	spin_lock_init(&tctx->task_lock);
  10383	INIT_WQ_LIST(&tctx->task_list);
  10384	INIT_WQ_LIST(&tctx->prio_task_list);
  10385	init_task_work(&tctx->task_work, tctx_task_work);
  10386	return 0;
  10387}
  10388
  10389void __io_uring_free(struct task_struct *tsk)
  10390{
  10391	struct io_uring_task *tctx = tsk->io_uring;
  10392
  10393	WARN_ON_ONCE(!xa_empty(&tctx->xa));
  10394	WARN_ON_ONCE(tctx->io_wq);
  10395	WARN_ON_ONCE(tctx->cached_refs);
  10396
  10397	kfree(tctx->registered_rings);
  10398	percpu_counter_destroy(&tctx->inflight);
  10399	kfree(tctx);
  10400	tsk->io_uring = NULL;
  10401}
  10402
  10403static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
  10404				       struct io_uring_params *p)
  10405{
  10406	int ret;
  10407
  10408	/* Retain compatibility with failing for an invalid attach attempt */
  10409	if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
  10410				IORING_SETUP_ATTACH_WQ) {
  10411		struct fd f;
  10412
  10413		f = fdget(p->wq_fd);
  10414		if (!f.file)
  10415			return -ENXIO;
  10416		if (f.file->f_op != &io_uring_fops) {
  10417			fdput(f);
  10418			return -EINVAL;
  10419		}
  10420		fdput(f);
  10421	}
  10422	if (ctx->flags & IORING_SETUP_SQPOLL) {
  10423		struct task_struct *tsk;
  10424		struct io_sq_data *sqd;
  10425		bool attached;
  10426
  10427		ret = security_uring_sqpoll();
  10428		if (ret)
  10429			return ret;
  10430
  10431		sqd = io_get_sq_data(p, &attached);
  10432		if (IS_ERR(sqd)) {
  10433			ret = PTR_ERR(sqd);
  10434			goto err;
  10435		}
  10436
  10437		ctx->sq_creds = get_current_cred();
  10438		ctx->sq_data = sqd;
  10439		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
  10440		if (!ctx->sq_thread_idle)
  10441			ctx->sq_thread_idle = HZ;
  10442
  10443		io_sq_thread_park(sqd);
  10444		list_add(&ctx->sqd_list, &sqd->ctx_list);
  10445		io_sqd_update_thread_idle(sqd);
  10446		/* don't attach to a dying SQPOLL thread, would be racy */
  10447		ret = (attached && !sqd->thread) ? -ENXIO : 0;
  10448		io_sq_thread_unpark(sqd);
  10449
  10450		if (ret < 0)
  10451			goto err;
  10452		if (attached)
  10453			return 0;
  10454
  10455		if (p->flags & IORING_SETUP_SQ_AFF) {
  10456			int cpu = p->sq_thread_cpu;
  10457
  10458			ret = -EINVAL;
  10459			if (cpu >= nr_cpu_ids || !cpu_online(cpu))
  10460				goto err_sqpoll;
  10461			sqd->sq_cpu = cpu;
  10462		} else {
  10463			sqd->sq_cpu = -1;
  10464		}
  10465
  10466		sqd->task_pid = current->pid;
  10467		sqd->task_tgid = current->tgid;
  10468		tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
  10469		if (IS_ERR(tsk)) {
  10470			ret = PTR_ERR(tsk);
  10471			goto err_sqpoll;
  10472		}
  10473
  10474		sqd->thread = tsk;
  10475		ret = io_uring_alloc_task_context(tsk, ctx);
  10476		wake_up_new_task(tsk);
  10477		if (ret)
  10478			goto err;
  10479	} else if (p->flags & IORING_SETUP_SQ_AFF) {
  10480		/* Can't have SQ_AFF without SQPOLL */
  10481		ret = -EINVAL;
  10482		goto err;
  10483	}
  10484
  10485	return 0;
  10486err_sqpoll:
  10487	complete(&ctx->sq_data->exited);
  10488err:
  10489	io_sq_thread_finish(ctx);
  10490	return ret;
  10491}
  10492
  10493static inline void __io_unaccount_mem(struct user_struct *user,
  10494				      unsigned long nr_pages)
  10495{
  10496	atomic_long_sub(nr_pages, &user->locked_vm);
  10497}
  10498
  10499static inline int __io_account_mem(struct user_struct *user,
  10500				   unsigned long nr_pages)
  10501{
  10502	unsigned long page_limit, cur_pages, new_pages;
  10503
  10504	/* Don't allow more pages than we can safely lock */
  10505	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
  10506
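        	/* lock-free reservation against the limit, retried on races */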
  10507	do {
  10508		cur_pages = atomic_long_read(&user->locked_vm);
  10509		new_pages = cur_pages + nr_pages;
  10510		if (new_pages > page_limit)
  10511			return -ENOMEM;
  10512	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
  10513					new_pages) != cur_pages);
  10514
  10515	return 0;
  10516}
  10517
  10518static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
  10519{
  10520	if (ctx->user)
  10521		__io_unaccount_mem(ctx->user, nr_pages);
  10522
  10523	if (ctx->mm_account)
  10524		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
  10525}
  10526
  10527static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
  10528{
  10529	int ret;
  10530
  10531	if (ctx->user) {
  10532		ret = __io_account_mem(ctx->user, nr_pages);
  10533		if (ret)
  10534			return ret;
  10535	}
  10536
  10537	if (ctx->mm_account)
  10538		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
  10539
  10540	return 0;
  10541}
  10542
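        /*
         * Ring memory is allocated as compound pages (__GFP_COMP below), so it
         * is freed by dropping the head page reference.
         */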
  10543static void io_mem_free(void *ptr)
  10544{
  10545	struct page *page;
  10546
  10547	if (!ptr)
  10548		return;
  10549
  10550	page = virt_to_head_page(ptr);
  10551	if (put_page_testzero(page))
  10552		free_compound_page(page);
  10553}
  10554
  10555static void *io_mem_alloc(size_t size)
  10556{
  10557	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
  10558
  10559	return (void *) __get_free_pages(gfp, get_order(size));
  10560}
  10561
  10562static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
  10563				unsigned int cq_entries, size_t *sq_offset)
  10564{
  10565	struct io_rings *rings;
  10566	size_t off, sq_array_size;
  10567
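        	/*
        	 * The rings struct and its CQE array come first (CQEs are twice
        	 * the size with IORING_SETUP_CQE32), followed by the SQ index
        	 * array starting at *sq_offset.
        	 */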
  10568	off = struct_size(rings, cqes, cq_entries);
  10569	if (off == SIZE_MAX)
  10570		return SIZE_MAX;
  10571	if (ctx->flags & IORING_SETUP_CQE32) {
  10572		if (check_shl_overflow(off, 1, &off))
  10573			return SIZE_MAX;
  10574	}
  10575
  10576#ifdef CONFIG_SMP
  10577	off = ALIGN(off, SMP_CACHE_BYTES);
  10578	if (off == 0)
  10579		return SIZE_MAX;
  10580#endif
  10581
  10582	if (sq_offset)
  10583		*sq_offset = off;
  10584
  10585	sq_array_size = array_size(sizeof(u32), sq_entries);
  10586	if (sq_array_size == SIZE_MAX)
  10587		return SIZE_MAX;
  10588
  10589	if (check_add_overflow(off, sq_array_size, &off))
  10590		return SIZE_MAX;
  10591
  10592	return off;
  10593}
  10594
  10595static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
  10596{
  10597	struct io_mapped_ubuf *imu = *slot;
  10598	unsigned int i;
  10599
  10600	if (imu != ctx->dummy_ubuf) {
  10601		for (i = 0; i < imu->nr_bvecs; i++)
  10602			unpin_user_page(imu->bvec[i].bv_page);
  10603		if (imu->acct_pages)
  10604			io_unaccount_mem(ctx, imu->acct_pages);
  10605		kvfree(imu);
  10606	}
  10607	*slot = NULL;
  10608}
  10609
  10610static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
  10611{
  10612	io_buffer_unmap(ctx, &prsrc->buf);
  10613	prsrc->buf = NULL;
  10614}
  10615
  10616static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
  10617{
  10618	unsigned int i;
  10619
  10620	for (i = 0; i < ctx->nr_user_bufs; i++)
  10621		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
  10622	kfree(ctx->user_bufs);
  10623	io_rsrc_data_free(ctx->buf_data);
  10624	ctx->user_bufs = NULL;
  10625	ctx->buf_data = NULL;
  10626	ctx->nr_user_bufs = 0;
  10627}
  10628
  10629static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
  10630{
  10631	unsigned nr = ctx->nr_user_bufs;
  10632	int ret;
  10633
  10634	if (!ctx->buf_data)
  10635		return -ENXIO;
  10636
  10637	/*
  10638	 * Quiesce may unlock ->uring_lock, and while it's not held
   10639	 * prevent new requests from using the table.
  10640	 */
  10641	ctx->nr_user_bufs = 0;
  10642	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
  10643	ctx->nr_user_bufs = nr;
  10644	if (!ret)
  10645		__io_sqe_buffers_unregister(ctx);
  10646	return ret;
  10647}
  10648
  10649static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
  10650		       void __user *arg, unsigned index)
  10651{
  10652	struct iovec __user *src;
  10653
  10654#ifdef CONFIG_COMPAT
  10655	if (ctx->compat) {
  10656		struct compat_iovec __user *ciovs;
  10657		struct compat_iovec ciov;
  10658
  10659		ciovs = (struct compat_iovec __user *) arg;
  10660		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
  10661			return -EFAULT;
  10662
  10663		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
  10664		dst->iov_len = ciov.iov_len;
  10665		return 0;
  10666	}
  10667#endif
  10668	src = (struct iovec __user *) arg;
  10669	if (copy_from_user(dst, &src[index], sizeof(*dst)))
  10670		return -EFAULT;
  10671	return 0;
  10672}
  10673
  10674/*
  10675 * Not super efficient, but this is only done at registration time. And we do cache
  10676 * the last compound head, so generally we'll only do a full search if we don't
  10677 * match that one.
  10678 *
  10679 * We check if the given compound head page has already been accounted, to
  10680 * avoid double accounting it. This allows us to account the full size of the
  10681 * page, not just the constituent pages of a huge page.
  10682 */
  10683static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
  10684				  int nr_pages, struct page *hpage)
  10685{
  10686	int i, j;
  10687
  10688	/* check current page array */
  10689	for (i = 0; i < nr_pages; i++) {
  10690		if (!PageCompound(pages[i]))
  10691			continue;
  10692		if (compound_head(pages[i]) == hpage)
  10693			return true;
  10694	}
  10695
  10696	/* check previously registered pages */
  10697	for (i = 0; i < ctx->nr_user_bufs; i++) {
  10698		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
  10699
  10700		for (j = 0; j < imu->nr_bvecs; j++) {
  10701			if (!PageCompound(imu->bvec[j].bv_page))
  10702				continue;
  10703			if (compound_head(imu->bvec[j].bv_page) == hpage)
  10704				return true;
  10705		}
  10706	}
  10707
  10708	return false;
  10709}
  10710
  10711static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
  10712				 int nr_pages, struct io_mapped_ubuf *imu,
  10713				 struct page **last_hpage)
  10714{
  10715	int i, ret;
  10716
  10717	imu->acct_pages = 0;
  10718	for (i = 0; i < nr_pages; i++) {
  10719		if (!PageCompound(pages[i])) {
  10720			imu->acct_pages++;
  10721		} else {
  10722			struct page *hpage;
  10723
  10724			hpage = compound_head(pages[i]);
  10725			if (hpage == *last_hpage)
  10726				continue;
  10727			*last_hpage = hpage;
  10728			if (headpage_already_acct(ctx, pages, i, hpage))
  10729				continue;
  10730			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
  10731		}
  10732	}
  10733
  10734	if (!imu->acct_pages)
  10735		return 0;
  10736
  10737	ret = io_account_mem(ctx, imu->acct_pages);
  10738	if (ret)
  10739		imu->acct_pages = 0;
  10740	return ret;
  10741}
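
/*
 * A worked accounting example, assuming 4K base pages and a buffer fully
 * backed by a single 2MB huge page: every pages[i] shares the same compound
 * head, so the head is charged once via page_size(hpage) >> PAGE_SHIFT ==
 * 512 rather than once per constituent 4K page, and headpage_already_acct()
 * keeps a later buffer over the same huge page from being charged again.
 */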
  10742
  10743static struct page **io_pin_pages(unsigned long ubuf, unsigned long len,
  10744				  int *npages)
  10745{
  10746	unsigned long start, end, nr_pages;
  10747	struct vm_area_struct **vmas = NULL;
  10748	struct page **pages = NULL;
  10749	int i, pret, ret = -ENOMEM;
  10750
  10751	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
  10752	start = ubuf >> PAGE_SHIFT;
  10753	nr_pages = end - start;
  10754
  10755	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
  10756	if (!pages)
  10757		goto done;
  10758
  10759	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
  10760			      GFP_KERNEL);
  10761	if (!vmas)
  10762		goto done;
  10763
  10764	ret = 0;
  10765	mmap_read_lock(current->mm);
  10766	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
  10767			      pages, vmas);
  10768	if (pret == nr_pages) {
  10769		/* don't support file backed memory */
  10770		for (i = 0; i < nr_pages; i++) {
  10771			struct vm_area_struct *vma = vmas[i];
  10772
  10773			if (vma_is_shmem(vma))
  10774				continue;
  10775			if (vma->vm_file &&
  10776			    !is_file_hugepages(vma->vm_file)) {
  10777				ret = -EOPNOTSUPP;
  10778				break;
  10779			}
  10780		}
  10781		*npages = nr_pages;
  10782	} else {
  10783		ret = pret < 0 ? pret : -EFAULT;
  10784	}
  10785	mmap_read_unlock(current->mm);
  10786	if (ret) {
  10787		/*
  10788		 * if we did partial map, or found file backed vmas,
  10789		 * release any pages we did get
  10790		 */
  10791		if (pret > 0)
  10792			unpin_user_pages(pages, pret);
  10793		goto done;
  10794	}
  10795	ret = 0;
  10796done:
  10797	kvfree(vmas);
  10798	if (ret < 0) {
  10799		kvfree(pages);
  10800		pages = ERR_PTR(ret);
  10801	}
  10802	return pages;
  10803}
  10804
  10805static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
  10806				  struct io_mapped_ubuf **pimu,
  10807				  struct page **last_hpage)
  10808{
  10809	struct io_mapped_ubuf *imu = NULL;
  10810	struct page **pages = NULL;
  10811	unsigned long off;
  10812	size_t size;
  10813	int ret, nr_pages, i;
  10814
  10815	if (!iov->iov_base) {
  10816		*pimu = ctx->dummy_ubuf;
  10817		return 0;
  10818	}
  10819
  10820	*pimu = NULL;
  10821	ret = -ENOMEM;
  10822
  10823	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
  10824				&nr_pages);
  10825	if (IS_ERR(pages)) {
  10826		ret = PTR_ERR(pages);
  10827		pages = NULL;
  10828		goto done;
  10829	}
  10830
  10831	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
  10832	if (!imu)
  10833		goto done;
  10834
  10835	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
  10836	if (ret) {
  10837		unpin_user_pages(pages, nr_pages);
  10838		goto done;
  10839	}
  10840
  10841	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
  10842	size = iov->iov_len;
  10843	for (i = 0; i < nr_pages; i++) {
  10844		size_t vec_len;
  10845
  10846		vec_len = min_t(size_t, size, PAGE_SIZE - off);
  10847		imu->bvec[i].bv_page = pages[i];
  10848		imu->bvec[i].bv_len = vec_len;
  10849		imu->bvec[i].bv_offset = off;
  10850		off = 0;
  10851		size -= vec_len;
  10852	}
  10853	/* store original address for later verification */
  10854	imu->ubuf = (unsigned long) iov->iov_base;
  10855	imu->ubuf_end = imu->ubuf + iov->iov_len;
  10856	imu->nr_bvecs = nr_pages;
  10857	*pimu = imu;
  10858	ret = 0;
  10859done:
  10860	if (ret)
  10861		kvfree(imu);
  10862	kvfree(pages);
  10863	return ret;
  10864}
  10865
  10866static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
  10867{
  10868	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
  10869	return ctx->user_bufs ? 0 : -ENOMEM;
  10870}
  10871
  10872static int io_buffer_validate(struct iovec *iov)
  10873{
  10874	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
  10875
  10876	/*
  10877	 * Don't impose further limits on the size and buffer
  10878	 * constraints here, we'll -EINVAL later when IO is
  10879	 * submitted if they are wrong.
  10880	 */
  10881	if (!iov->iov_base)
  10882		return iov->iov_len ? -EFAULT : 0;
  10883	if (!iov->iov_len)
  10884		return -EFAULT;
  10885
  10886	/* arbitrary limit, but we need something */
  10887	if (iov->iov_len > SZ_1G)
  10888		return -EFAULT;
  10889
  10890	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
  10891		return -EOVERFLOW;
  10892
  10893	return 0;
  10894}
  10895
  10896static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
  10897				   unsigned int nr_args, u64 __user *tags)
  10898{
  10899	struct page *last_hpage = NULL;
  10900	struct io_rsrc_data *data;
  10901	int i, ret;
  10902	struct iovec iov;
  10903
  10904	if (ctx->user_bufs)
  10905		return -EBUSY;
  10906	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
  10907		return -EINVAL;
  10908	ret = io_rsrc_node_switch_start(ctx);
  10909	if (ret)
  10910		return ret;
  10911	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
  10912	if (ret)
  10913		return ret;
  10914	ret = io_buffers_map_alloc(ctx, nr_args);
  10915	if (ret) {
  10916		io_rsrc_data_free(data);
  10917		return ret;
  10918	}
  10919
  10920	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
  10921		if (arg) {
  10922			ret = io_copy_iov(ctx, &iov, arg, i);
  10923			if (ret)
  10924				break;
  10925			ret = io_buffer_validate(&iov);
  10926			if (ret)
  10927				break;
  10928		} else {
  10929			memset(&iov, 0, sizeof(iov));
  10930		}
  10931
  10932		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
  10933			ret = -EINVAL;
  10934			break;
  10935		}
  10936
  10937		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
  10938					     &last_hpage);
  10939		if (ret)
  10940			break;
  10941	}
  10942
  10943	WARN_ON_ONCE(ctx->buf_data);
  10944
  10945	ctx->buf_data = data;
  10946	if (ret)
  10947		__io_sqe_buffers_unregister(ctx);
  10948	else
  10949		io_rsrc_node_switch(ctx, NULL);
  10950	return ret;
  10951}
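
/*
 * A minimal userspace sketch of driving the registration path above through
 * io_uring_register(2), assuming a ring fd from io_uring_setup(2) and the
 * uapi definitions from <linux/io_uring.h>; kept under #if 0 as it is
 * illustration only, not kernel code.
 */
#if 0
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int register_one_buffer(int ring_fd, void *buf, size_t len)
{
	struct iovec iov = {
		.iov_base = buf,	/* must stay valid while registered */
		.iov_len  = len,	/* at most 1G, see io_buffer_validate() */
	};

	/* IORING_REGISTER_BUFFERS takes an array of iovecs plus a count */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_BUFFERS, &iov, 1);
}
#endif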
  10952
  10953static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
  10954				   struct io_uring_rsrc_update2 *up,
  10955				   unsigned int nr_args)
  10956{
  10957	u64 __user *tags = u64_to_user_ptr(up->tags);
  10958	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
  10959	struct page *last_hpage = NULL;
  10960	bool needs_switch = false;
  10961	__u32 done;
  10962	int i, err;
  10963
  10964	if (!ctx->buf_data)
  10965		return -ENXIO;
  10966	if (up->offset + nr_args > ctx->nr_user_bufs)
  10967		return -EINVAL;
  10968
  10969	for (done = 0; done < nr_args; done++) {
  10970		struct io_mapped_ubuf *imu;
  10971		int offset = up->offset + done;
  10972		u64 tag = 0;
  10973
  10974		err = io_copy_iov(ctx, &iov, iovs, done);
  10975		if (err)
  10976			break;
  10977		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
  10978			err = -EFAULT;
  10979			break;
  10980		}
  10981		err = io_buffer_validate(&iov);
  10982		if (err)
  10983			break;
  10984		if (!iov.iov_base && tag) {
  10985			err = -EINVAL;
  10986			break;
  10987		}
  10988		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
  10989		if (err)
  10990			break;
  10991
  10992		i = array_index_nospec(offset, ctx->nr_user_bufs);
  10993		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
  10994			err = io_queue_rsrc_removal(ctx->buf_data, i,
  10995						    ctx->rsrc_node, ctx->user_bufs[i]);
  10996			if (unlikely(err)) {
  10997				io_buffer_unmap(ctx, &imu);
  10998				break;
  10999			}
  11000			ctx->user_bufs[i] = NULL;
  11001			needs_switch = true;
  11002		}
  11003
  11004		ctx->user_bufs[i] = imu;
  11005		*io_get_tag_slot(ctx->buf_data, offset) = tag;
  11006	}
  11007
  11008	if (needs_switch)
  11009		io_rsrc_node_switch(ctx, ctx->buf_data);
  11010	return done ? done : err;
  11011}
  11012
  11013static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
  11014			       unsigned int eventfd_async)
  11015{
  11016	struct io_ev_fd *ev_fd;
  11017	__s32 __user *fds = arg;
  11018	int fd;
  11019
  11020	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
  11021					lockdep_is_held(&ctx->uring_lock));
  11022	if (ev_fd)
  11023		return -EBUSY;
  11024
  11025	if (copy_from_user(&fd, fds, sizeof(*fds)))
  11026		return -EFAULT;
  11027
  11028	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
  11029	if (!ev_fd)
  11030		return -ENOMEM;
  11031
  11032	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
  11033	if (IS_ERR(ev_fd->cq_ev_fd)) {
  11034		int ret = PTR_ERR(ev_fd->cq_ev_fd);
  11035		kfree(ev_fd);
  11036		return ret;
  11037	}
  11038	ev_fd->eventfd_async = eventfd_async;
  11039	ctx->has_evfd = true;
  11040	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
  11041	return 0;
  11042}
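
/*
 * A minimal userspace sketch of the registration above, assuming a ring fd
 * from io_uring_setup(2); kept under #if 0 as it is illustration only, not
 * kernel code. IORING_REGISTER_EVENTFD_ASYNC may be used instead to only be
 * notified for completions that did not complete inline.
 */
#if 0
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int attach_eventfd(int ring_fd)
{
	int efd = eventfd(0, EFD_CLOEXEC);

	if (efd < 0)
		return -1;
	/* arg points at the eventfd descriptor, nr_args must be 1 */
	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_EVENTFD, &efd, 1) < 0)
		return -1;
	return efd;
}
#endif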
  11043
  11044static void io_eventfd_put(struct rcu_head *rcu)
  11045{
  11046	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
  11047
  11048	eventfd_ctx_put(ev_fd->cq_ev_fd);
  11049	kfree(ev_fd);
  11050}
  11051
  11052static int io_eventfd_unregister(struct io_ring_ctx *ctx)
  11053{
  11054	struct io_ev_fd *ev_fd;
  11055
  11056	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
  11057					lockdep_is_held(&ctx->uring_lock));
  11058	if (ev_fd) {
  11059		ctx->has_evfd = false;
  11060		rcu_assign_pointer(ctx->io_ev_fd, NULL);
  11061		call_rcu(&ev_fd->rcu, io_eventfd_put);
  11062		return 0;
  11063	}
  11064
  11065	return -ENXIO;
  11066}
  11067
  11068static void io_destroy_buffers(struct io_ring_ctx *ctx)
  11069{
  11070	struct io_buffer_list *bl;
  11071	unsigned long index;
  11072	int i;
  11073
  11074	for (i = 0; i < BGID_ARRAY; i++) {
  11075		if (!ctx->io_bl)
  11076			break;
  11077		__io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
  11078	}
  11079
  11080	xa_for_each(&ctx->io_bl_xa, index, bl) {
  11081		xa_erase(&ctx->io_bl_xa, bl->bgid);
  11082		__io_remove_buffers(ctx, bl, -1U);
  11083		kfree(bl);
  11084	}
  11085
  11086	while (!list_empty(&ctx->io_buffers_pages)) {
  11087		struct page *page;
  11088
  11089		page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
  11090		list_del_init(&page->lru);
  11091		__free_page(page);
  11092	}
  11093}
  11094
  11095static void io_req_caches_free(struct io_ring_ctx *ctx)
  11096{
  11097	struct io_submit_state *state = &ctx->submit_state;
  11098	int nr = 0;
  11099
  11100	mutex_lock(&ctx->uring_lock);
  11101	io_flush_cached_locked_reqs(ctx, state);
  11102
  11103	while (!io_req_cache_empty(ctx)) {
  11104		struct io_wq_work_node *node;
  11105		struct io_kiocb *req;
  11106
  11107		node = wq_stack_extract(&state->free_list);
  11108		req = container_of(node, struct io_kiocb, comp_list);
  11109		kmem_cache_free(req_cachep, req);
  11110		nr++;
  11111	}
  11112	if (nr)
  11113		percpu_ref_put_many(&ctx->refs, nr);
  11114	mutex_unlock(&ctx->uring_lock);
  11115}
  11116
  11117static void io_wait_rsrc_data(struct io_rsrc_data *data)
  11118{
  11119	if (data && !atomic_dec_and_test(&data->refs))
  11120		wait_for_completion(&data->done);
  11121}
  11122
  11123static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
  11124{
  11125	struct async_poll *apoll;
  11126
  11127	while (!list_empty(&ctx->apoll_cache)) {
  11128		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
  11129						poll.wait.entry);
  11130		list_del(&apoll->poll.wait.entry);
  11131		kfree(apoll);
  11132	}
  11133}
  11134
  11135static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
  11136{
  11137	io_sq_thread_finish(ctx);
  11138
  11139	if (ctx->mm_account) {
  11140		mmdrop(ctx->mm_account);
  11141		ctx->mm_account = NULL;
  11142	}
  11143
  11144	io_rsrc_refs_drop(ctx);
  11145	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
  11146	io_wait_rsrc_data(ctx->buf_data);
  11147	io_wait_rsrc_data(ctx->file_data);
  11148
  11149	mutex_lock(&ctx->uring_lock);
  11150	if (ctx->buf_data)
  11151		__io_sqe_buffers_unregister(ctx);
  11152	if (ctx->file_data)
  11153		__io_sqe_files_unregister(ctx);
  11154	if (ctx->rings)
  11155		__io_cqring_overflow_flush(ctx, true);
  11156	io_eventfd_unregister(ctx);
  11157	io_flush_apoll_cache(ctx);
  11158	mutex_unlock(&ctx->uring_lock);
  11159	io_destroy_buffers(ctx);
  11160	if (ctx->sq_creds)
  11161		put_cred(ctx->sq_creds);
  11162
  11163	/* there are no registered resources left, nobody uses it */
  11164	if (ctx->rsrc_node)
  11165		io_rsrc_node_destroy(ctx->rsrc_node);
  11166	if (ctx->rsrc_backup_node)
  11167		io_rsrc_node_destroy(ctx->rsrc_backup_node);
  11168	flush_delayed_work(&ctx->rsrc_put_work);
  11169	flush_delayed_work(&ctx->fallback_work);
  11170
  11171	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
  11172	WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
  11173
  11174#if defined(CONFIG_UNIX)
  11175	if (ctx->ring_sock) {
  11176		ctx->ring_sock->file = NULL; /* so that iput() is called */
  11177		sock_release(ctx->ring_sock);
  11178	}
  11179#endif
  11180	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
  11181
  11182	io_mem_free(ctx->rings);
  11183	io_mem_free(ctx->sq_sqes);
  11184
  11185	percpu_ref_exit(&ctx->refs);
  11186	free_uid(ctx->user);
  11187	io_req_caches_free(ctx);
  11188	if (ctx->hash_map)
  11189		io_wq_put_hash(ctx->hash_map);
  11190	kfree(ctx->cancel_hash);
  11191	kfree(ctx->dummy_ubuf);
  11192	kfree(ctx->io_bl);
  11193	xa_destroy(&ctx->io_bl_xa);
  11194	kfree(ctx);
  11195}
  11196
  11197static __poll_t io_uring_poll(struct file *file, poll_table *wait)
  11198{
  11199	struct io_ring_ctx *ctx = file->private_data;
  11200	__poll_t mask = 0;
  11201
  11202	poll_wait(file, &ctx->cq_wait, wait);
  11203	/*
  11204	 * synchronizes with barrier from wq_has_sleeper call in
  11205	 * io_commit_cqring
  11206	 */
  11207	smp_rmb();
  11208	if (!io_sqring_full(ctx))
  11209		mask |= EPOLLOUT | EPOLLWRNORM;
  11210
  11211	/*
  11212	 * Don't flush cqring overflow list here, just do a simple check.
  11213	 * Otherwise there could possibly be an ABBA deadlock:
  11214	 *      CPU0                    CPU1
  11215	 *      ----                    ----
  11216	 * lock(&ctx->uring_lock);
  11217	 *                              lock(&ep->mtx);
  11218	 *                              lock(&ctx->uring_lock);
  11219	 * lock(&ep->mtx);
  11220	 *
  11221	 * Users may get EPOLLIN while seeing nothing in the cqring, which
  11222	 * pushes them to do the flush.
  11223	 */
  11224	if (io_cqring_events(ctx) ||
  11225	    test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
  11226		mask |= EPOLLIN | EPOLLRDNORM;
  11227
  11228	return mask;
  11229}
  11230
  11231static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
  11232{
  11233	const struct cred *creds;
  11234
  11235	creds = xa_erase(&ctx->personalities, id);
  11236	if (creds) {
  11237		put_cred(creds);
  11238		return 0;
  11239	}
  11240
  11241	return -EINVAL;
  11242}
  11243
  11244struct io_tctx_exit {
  11245	struct callback_head		task_work;
  11246	struct completion		completion;
  11247	struct io_ring_ctx		*ctx;
  11248};
  11249
  11250static __cold void io_tctx_exit_cb(struct callback_head *cb)
  11251{
  11252	struct io_uring_task *tctx = current->io_uring;
  11253	struct io_tctx_exit *work;
  11254
  11255	work = container_of(cb, struct io_tctx_exit, task_work);
  11256	/*
  11257	 * When @in_idle, we're in cancellation and it's racy to remove the
  11258	 * node. It'll be removed by the end of cancellation, just ignore it.
  11259	 */
  11260	if (!atomic_read(&tctx->in_idle))
  11261		io_uring_del_tctx_node((unsigned long)work->ctx);
  11262	complete(&work->completion);
  11263}
  11264
  11265static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
  11266{
  11267	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
  11268
  11269	return req->ctx == data;
  11270}
  11271
  11272static __cold void io_ring_exit_work(struct work_struct *work)
  11273{
  11274	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
  11275	unsigned long timeout = jiffies + HZ * 60 * 5;
  11276	unsigned long interval = HZ / 20;
  11277	struct io_tctx_exit exit;
  11278	struct io_tctx_node *node;
  11279	int ret;
  11280
  11281	/*
  11282	 * If we're doing polled IO and end up having requests being
  11283	 * submitted async (out-of-line), then completions can come in while
  11284	 * we're waiting for refs to drop. We need to reap these manually,
  11285	 * as nobody else will be looking for them.
  11286	 */
  11287	do {
  11288		io_uring_try_cancel_requests(ctx, NULL, true);
  11289		if (ctx->sq_data) {
  11290			struct io_sq_data *sqd = ctx->sq_data;
  11291			struct task_struct *tsk;
  11292
  11293			io_sq_thread_park(sqd);
  11294			tsk = sqd->thread;
  11295			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
  11296				io_wq_cancel_cb(tsk->io_uring->io_wq,
  11297						io_cancel_ctx_cb, ctx, true);
  11298			io_sq_thread_unpark(sqd);
  11299		}
  11300
  11301		io_req_caches_free(ctx);
  11302
  11303		if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
  11304			/* there is little hope left, don't run it too often */
  11305			interval = HZ * 60;
  11306		}
  11307	} while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
  11308
  11309	init_completion(&exit.completion);
  11310	init_task_work(&exit.task_work, io_tctx_exit_cb);
  11311	exit.ctx = ctx;
  11312	/*
  11313	 * Some may use context even when all refs and requests have been put,
  11314	 * and they are free to do so while still holding uring_lock or
  11315	 * completion_lock, see io_req_task_submit(). Apart from other work,
  11316	 * this lock/unlock section also waits for them to finish.
  11317	 */
  11318	mutex_lock(&ctx->uring_lock);
  11319	while (!list_empty(&ctx->tctx_list)) {
  11320		WARN_ON_ONCE(time_after(jiffies, timeout));
  11321
  11322		node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
  11323					ctx_node);
  11324		/* don't spin on a single task if cancellation failed */
  11325		list_rotate_left(&ctx->tctx_list);
  11326		ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
  11327		if (WARN_ON_ONCE(ret))
  11328			continue;
  11329
  11330		mutex_unlock(&ctx->uring_lock);
  11331		wait_for_completion(&exit.completion);
  11332		mutex_lock(&ctx->uring_lock);
  11333	}
  11334	mutex_unlock(&ctx->uring_lock);
  11335	spin_lock(&ctx->completion_lock);
  11336	spin_unlock(&ctx->completion_lock);
  11337
  11338	io_ring_ctx_free(ctx);
  11339}
  11340
  11341/* Returns true if we found and killed one or more timeouts */
  11342static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
  11343				    struct task_struct *tsk, bool cancel_all)
  11344{
  11345	struct io_kiocb *req, *tmp;
  11346	int canceled = 0;
  11347
  11348	spin_lock(&ctx->completion_lock);
  11349	spin_lock_irq(&ctx->timeout_lock);
  11350	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
  11351		if (io_match_task(req, tsk, cancel_all)) {
  11352			io_kill_timeout(req, -ECANCELED);
  11353			canceled++;
  11354		}
  11355	}
  11356	spin_unlock_irq(&ctx->timeout_lock);
  11357	io_commit_cqring(ctx);
  11358	spin_unlock(&ctx->completion_lock);
  11359	if (canceled != 0)
  11360		io_cqring_ev_posted(ctx);
  11361	return canceled != 0;
  11362}
  11363
  11364static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
  11365{
  11366	unsigned long index;
  11367	struct creds *creds;
  11368
  11369	mutex_lock(&ctx->uring_lock);
  11370	percpu_ref_kill(&ctx->refs);
  11371	if (ctx->rings)
  11372		__io_cqring_overflow_flush(ctx, true);
  11373	xa_for_each(&ctx->personalities, index, creds)
  11374		io_unregister_personality(ctx, index);
  11375	mutex_unlock(&ctx->uring_lock);
  11376
  11377	/* failed during ring init, it couldn't have issued any requests */
  11378	if (ctx->rings) {
  11379		io_kill_timeouts(ctx, NULL, true);
  11380		io_poll_remove_all(ctx, NULL, true);
  11381		/* if we failed setting up the ctx, we might not have any rings */
  11382		io_iopoll_try_reap_events(ctx);
  11383	}
  11384
  11385	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
  11386	/*
  11387	 * Use system_unbound_wq to avoid spawning tons of event kworkers
  11388	 * if we're exiting a ton of rings at the same time. It just adds
  11389	 * noise and overhead, there's no discernable change in runtime
  11390	 * over using system_wq.
  11391	 */
  11392	queue_work(system_unbound_wq, &ctx->exit_work);
  11393}
  11394
  11395static int io_uring_release(struct inode *inode, struct file *file)
  11396{
  11397	struct io_ring_ctx *ctx = file->private_data;
  11398
  11399	file->private_data = NULL;
  11400	io_ring_ctx_wait_and_kill(ctx);
  11401	return 0;
  11402}
  11403
  11404struct io_task_cancel {
  11405	struct task_struct *task;
  11406	bool all;
  11407};
  11408
  11409static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
  11410{
  11411	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
  11412	struct io_task_cancel *cancel = data;
  11413
  11414	return io_match_task_safe(req, cancel->task, cancel->all);
  11415}
  11416
  11417static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
  11418					 struct task_struct *task,
  11419					 bool cancel_all)
  11420{
  11421	struct io_defer_entry *de;
  11422	LIST_HEAD(list);
  11423
  11424	spin_lock(&ctx->completion_lock);
  11425	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
  11426		if (io_match_task_safe(de->req, task, cancel_all)) {
  11427			list_cut_position(&list, &ctx->defer_list, &de->list);
  11428			break;
  11429		}
  11430	}
  11431	spin_unlock(&ctx->completion_lock);
  11432	if (list_empty(&list))
  11433		return false;
  11434
  11435	while (!list_empty(&list)) {
  11436		de = list_first_entry(&list, struct io_defer_entry, list);
  11437		list_del_init(&de->list);
  11438		io_req_complete_failed(de->req, -ECANCELED);
  11439		kfree(de);
  11440	}
  11441	return true;
  11442}
  11443
  11444static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
  11445{
  11446	struct io_tctx_node *node;
  11447	enum io_wq_cancel cret;
  11448	bool ret = false;
  11449
  11450	mutex_lock(&ctx->uring_lock);
  11451	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
  11452		struct io_uring_task *tctx = node->task->io_uring;
  11453
  11454		/*
  11455		 * io_wq will stay alive while we hold uring_lock, because it's
  11456	 * killed after ctx nodes, which requires taking the lock.
  11457		 */
  11458		if (!tctx || !tctx->io_wq)
  11459			continue;
  11460		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
  11461		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
  11462	}
  11463	mutex_unlock(&ctx->uring_lock);
  11464
  11465	return ret;
  11466}
  11467
  11468static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
  11469						struct task_struct *task,
  11470						bool cancel_all)
  11471{
  11472	struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
  11473	struct io_uring_task *tctx = task ? task->io_uring : NULL;
  11474
  11475	/* failed during ring init, it couldn't have issued any requests */
  11476	if (!ctx->rings)
  11477		return;
  11478
  11479	while (1) {
  11480		enum io_wq_cancel cret;
  11481		bool ret = false;
  11482
  11483		if (!task) {
  11484			ret |= io_uring_try_cancel_iowq(ctx);
  11485		} else if (tctx && tctx->io_wq) {
  11486			/*
  11487			 * Cancels requests of all rings, not only @ctx, but
  11488			 * it's fine as the task is in exit/exec.
  11489			 */
  11490			cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
  11491					       &cancel, true);
  11492			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
  11493		}
  11494
  11495		/* SQPOLL thread does its own polling */
  11496		if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
  11497		    (ctx->sq_data && ctx->sq_data->thread == current)) {
  11498			while (!wq_list_empty(&ctx->iopoll_list)) {
  11499				io_iopoll_try_reap_events(ctx);
  11500				ret = true;
  11501			}
  11502		}
  11503
  11504		ret |= io_cancel_defer_files(ctx, task, cancel_all);
  11505		ret |= io_poll_remove_all(ctx, task, cancel_all);
  11506		ret |= io_kill_timeouts(ctx, task, cancel_all);
  11507		if (task)
  11508			ret |= io_run_task_work();
  11509		if (!ret)
  11510			break;
  11511		cond_resched();
  11512	}
  11513}
  11514
  11515static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
  11516{
  11517	struct io_uring_task *tctx = current->io_uring;
  11518	struct io_tctx_node *node;
  11519	int ret;
  11520
  11521	if (unlikely(!tctx)) {
  11522		ret = io_uring_alloc_task_context(current, ctx);
  11523		if (unlikely(ret))
  11524			return ret;
  11525
  11526		tctx = current->io_uring;
  11527		if (ctx->iowq_limits_set) {
  11528			unsigned int limits[2] = { ctx->iowq_limits[0],
  11529						   ctx->iowq_limits[1], };
  11530
  11531			ret = io_wq_max_workers(tctx->io_wq, limits);
  11532			if (ret)
  11533				return ret;
  11534		}
  11535	}
  11536	if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
  11537		node = kmalloc(sizeof(*node), GFP_KERNEL);
  11538		if (!node)
  11539			return -ENOMEM;
  11540		node->ctx = ctx;
  11541		node->task = current;
  11542
  11543		ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
  11544					node, GFP_KERNEL));
  11545		if (ret) {
  11546			kfree(node);
  11547			return ret;
  11548		}
  11549
  11550		mutex_lock(&ctx->uring_lock);
  11551		list_add(&node->ctx_node, &ctx->tctx_list);
  11552		mutex_unlock(&ctx->uring_lock);
  11553	}
  11554	tctx->last = ctx;
  11555	return 0;
  11556}
  11557
  11558/*
  11559 * Note that this task has used io_uring. We use it for cancelation purposes.
  11560 */
  11561static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
  11562{
  11563	struct io_uring_task *tctx = current->io_uring;
  11564
  11565	if (likely(tctx && tctx->last == ctx))
  11566		return 0;
  11567	return __io_uring_add_tctx_node(ctx);
  11568}
  11569
  11570/*
  11571 * Remove this io_uring_file -> task mapping.
  11572 */
  11573static __cold void io_uring_del_tctx_node(unsigned long index)
  11574{
  11575	struct io_uring_task *tctx = current->io_uring;
  11576	struct io_tctx_node *node;
  11577
  11578	if (!tctx)
  11579		return;
  11580	node = xa_erase(&tctx->xa, index);
  11581	if (!node)
  11582		return;
  11583
  11584	WARN_ON_ONCE(current != node->task);
  11585	WARN_ON_ONCE(list_empty(&node->ctx_node));
  11586
  11587	mutex_lock(&node->ctx->uring_lock);
  11588	list_del(&node->ctx_node);
  11589	mutex_unlock(&node->ctx->uring_lock);
  11590
  11591	if (tctx->last == node->ctx)
  11592		tctx->last = NULL;
  11593	kfree(node);
  11594}
  11595
  11596static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
  11597{
  11598	struct io_wq *wq = tctx->io_wq;
  11599	struct io_tctx_node *node;
  11600	unsigned long index;
  11601
  11602	xa_for_each(&tctx->xa, index, node) {
  11603		io_uring_del_tctx_node(index);
  11604		cond_resched();
  11605	}
  11606	if (wq) {
  11607		/*
  11608		 * Must be after io_uring_del_tctx_node() (removes nodes under
  11609		 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
  11610		 */
  11611		io_wq_put_and_exit(wq);
  11612		tctx->io_wq = NULL;
  11613	}
  11614}
  11615
  11616static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
  11617{
  11618	if (tracked)
  11619		return atomic_read(&tctx->inflight_tracked);
  11620	return percpu_counter_sum(&tctx->inflight);
  11621}
  11622
  11623/*
  11624 * Find any io_uring ctx that this task has registered or done IO on, and cancel
  11625 * requests. @sqd should be non-NULL iff it's an SQPOLL thread cancellation.
  11626 */
  11627static __cold void io_uring_cancel_generic(bool cancel_all,
  11628					   struct io_sq_data *sqd)
  11629{
  11630	struct io_uring_task *tctx = current->io_uring;
  11631	struct io_ring_ctx *ctx;
  11632	s64 inflight;
  11633	DEFINE_WAIT(wait);
  11634
  11635	WARN_ON_ONCE(sqd && sqd->thread != current);
  11636
  11637	if (!current->io_uring)
  11638		return;
  11639	if (tctx->io_wq)
  11640		io_wq_exit_start(tctx->io_wq);
  11641
  11642	atomic_inc(&tctx->in_idle);
  11643	do {
  11644		io_uring_drop_tctx_refs(current);
  11645		/* read completions before cancelations */
  11646		inflight = tctx_inflight(tctx, !cancel_all);
  11647		if (!inflight)
  11648			break;
  11649
  11650		if (!sqd) {
  11651			struct io_tctx_node *node;
  11652			unsigned long index;
  11653
  11654			xa_for_each(&tctx->xa, index, node) {
  11655				/* sqpoll task will cancel all its requests */
  11656				if (node->ctx->sq_data)
  11657					continue;
  11658				io_uring_try_cancel_requests(node->ctx, current,
  11659							     cancel_all);
  11660			}
  11661		} else {
  11662			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
  11663				io_uring_try_cancel_requests(ctx, current,
  11664							     cancel_all);
  11665		}
  11666
  11667		prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
  11668		io_run_task_work();
  11669		io_uring_drop_tctx_refs(current);
  11670
  11671		/*
  11672		 * If we've seen completions, retry without waiting. This
  11673		 * avoids a race where a completion comes in before we did
  11674		 * prepare_to_wait().
  11675		 */
  11676		if (inflight == tctx_inflight(tctx, !cancel_all))
  11677			schedule();
  11678		finish_wait(&tctx->wait, &wait);
  11679	} while (1);
  11680
  11681	io_uring_clean_tctx(tctx);
  11682	if (cancel_all) {
  11683		/*
  11684		 * We shouldn't run task_works after cancel, so just leave
  11685		 * ->in_idle set for normal exit.
  11686		 */
  11687		atomic_dec(&tctx->in_idle);
  11688		/* for exec all current's requests should be gone, kill tctx */
  11689		__io_uring_free(current);
  11690	}
  11691}
  11692
  11693void __io_uring_cancel(bool cancel_all)
  11694{
  11695	io_uring_cancel_generic(cancel_all, NULL);
  11696}
  11697
  11698void io_uring_unreg_ringfd(void)
  11699{
  11700	struct io_uring_task *tctx = current->io_uring;
  11701	int i;
  11702
  11703	for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
  11704		if (tctx->registered_rings[i]) {
  11705			fput(tctx->registered_rings[i]);
  11706			tctx->registered_rings[i] = NULL;
  11707		}
  11708	}
  11709}
  11710
  11711static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
  11712				     int start, int end)
  11713{
  11714	struct file *file;
  11715	int offset;
  11716
  11717	for (offset = start; offset < end; offset++) {
  11718		offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
  11719		if (tctx->registered_rings[offset])
  11720			continue;
  11721
  11722		file = fget(fd);
  11723		if (!file) {
  11724			return -EBADF;
  11725		} else if (file->f_op != &io_uring_fops) {
  11726			fput(file);
  11727			return -EOPNOTSUPP;
  11728		}
  11729		tctx->registered_rings[offset] = file;
  11730		return offset;
  11731	}
  11732
  11733	return -EBUSY;
  11734}
  11735
  11736/*
  11737 * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
  11738 * invocation. User passes in an array of struct io_uring_rsrc_update
  11739 * with ->data set to the ring_fd, and ->offset given for the desired
  11740 * index. If no index is desired, application may set ->offset == -1U
  11741 * and we'll find an available index. Returns number of entries
  11742 * successfully processed, or < 0 on error if none were processed.
  11743 */
  11744static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
  11745			      unsigned nr_args)
  11746{
  11747	struct io_uring_rsrc_update __user *arg = __arg;
  11748	struct io_uring_rsrc_update reg;
  11749	struct io_uring_task *tctx;
  11750	int ret, i;
  11751
  11752	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
  11753		return -EINVAL;
  11754
  11755	mutex_unlock(&ctx->uring_lock);
  11756	ret = io_uring_add_tctx_node(ctx);
  11757	mutex_lock(&ctx->uring_lock);
  11758	if (ret)
  11759		return ret;
  11760
  11761	tctx = current->io_uring;
  11762	for (i = 0; i < nr_args; i++) {
  11763		int start, end;
  11764
  11765		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
  11766			ret = -EFAULT;
  11767			break;
  11768		}
  11769
  11770		if (reg.resv) {
  11771			ret = -EINVAL;
  11772			break;
  11773		}
  11774
  11775		if (reg.offset == -1U) {
  11776			start = 0;
  11777			end = IO_RINGFD_REG_MAX;
  11778		} else {
  11779			if (reg.offset >= IO_RINGFD_REG_MAX) {
  11780				ret = -EINVAL;
  11781				break;
  11782			}
  11783			start = reg.offset;
  11784			end = start + 1;
  11785		}
  11786
  11787		ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
  11788		if (ret < 0)
  11789			break;
  11790
  11791		reg.offset = ret;
  11792		if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
  11793			fput(tctx->registered_rings[reg.offset]);
  11794			tctx->registered_rings[reg.offset] = NULL;
  11795			ret = -EFAULT;
  11796			break;
  11797		}
  11798	}
  11799
  11800	return i ? i : ret;
  11801}
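
/*
 * A minimal userspace sketch of the registration described above, assuming a
 * ring fd from io_uring_setup(2); kept under #if 0 as it is illustration
 * only, not kernel code. The returned offset is what io_uring_enter(2) then
 * takes as "fd" together with IORING_ENTER_REGISTERED_RING.
 */
#if 0
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int register_ring_fd(int ring_fd)
{
	struct io_uring_rsrc_update reg;
	int ret;

	memset(&reg, 0, sizeof(reg));
	reg.data = ring_fd;	/* the real ring fd */
	reg.offset = -1U;	/* let the kernel pick a free slot */

	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_RING_FDS, &reg, 1);
	if (ret < 0)
		return ret;
	/* on success, the kernel wrote the chosen index back into reg.offset */
	return reg.offset;
}
#endif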
  11802
  11803static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
  11804				unsigned nr_args)
  11805{
  11806	struct io_uring_rsrc_update __user *arg = __arg;
  11807	struct io_uring_task *tctx = current->io_uring;
  11808	struct io_uring_rsrc_update reg;
  11809	int ret = 0, i;
  11810
  11811	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
  11812		return -EINVAL;
  11813	if (!tctx)
  11814		return 0;
  11815
  11816	for (i = 0; i < nr_args; i++) {
  11817		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
  11818			ret = -EFAULT;
  11819			break;
  11820		}
  11821		if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
  11822			ret = -EINVAL;
  11823			break;
  11824		}
  11825
  11826		reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
  11827		if (tctx->registered_rings[reg.offset]) {
  11828			fput(tctx->registered_rings[reg.offset]);
  11829			tctx->registered_rings[reg.offset] = NULL;
  11830		}
  11831	}
  11832
  11833	return i ? i : ret;
  11834}
  11835
  11836static void *io_uring_validate_mmap_request(struct file *file,
  11837					    loff_t pgoff, size_t sz)
  11838{
  11839	struct io_ring_ctx *ctx = file->private_data;
  11840	loff_t offset = pgoff << PAGE_SHIFT;
  11841	struct page *page;
  11842	void *ptr;
  11843
  11844	switch (offset) {
  11845	case IORING_OFF_SQ_RING:
  11846	case IORING_OFF_CQ_RING:
  11847		ptr = ctx->rings;
  11848		break;
  11849	case IORING_OFF_SQES:
  11850		ptr = ctx->sq_sqes;
  11851		break;
  11852	default:
  11853		return ERR_PTR(-EINVAL);
  11854	}
  11855
  11856	page = virt_to_head_page(ptr);
  11857	if (sz > page_size(page))
  11858		return ERR_PTR(-EINVAL);
  11859
  11860	return ptr;
  11861}
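
/*
 * A minimal userspace sketch of the mappings validated above, assuming the
 * io_uring_params filled in by io_uring_setup(2) and default SQE/CQE sizes;
 * kept under #if 0 as it is illustration only, not kernel code. With
 * IORING_FEAT_SINGLE_MMAP the SQ and CQ rings can share one mapping; this
 * sketch maps them separately and omits MAP_FAILED handling.
 */
#if 0
#include <stddef.h>
#include <sys/mman.h>
#include <linux/io_uring.h>

static void *map_rings(int ring_fd, struct io_uring_params *p,
		       void **cq_ring, void **sqes)
{
	size_t sq_sz = p->sq_off.array + p->sq_entries * sizeof(__u32);
	size_t cq_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
	void *sq_ring;

	sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
	*cq_ring = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_CQ_RING);
	*sqes = mmap(NULL, p->sq_entries * sizeof(struct io_uring_sqe),
		     PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		     ring_fd, IORING_OFF_SQES);
	return sq_ring;
}
#endif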
  11862
  11863#ifdef CONFIG_MMU
  11864
  11865static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
  11866{
  11867	size_t sz = vma->vm_end - vma->vm_start;
  11868	unsigned long pfn;
  11869	void *ptr;
  11870
  11871	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
  11872	if (IS_ERR(ptr))
  11873		return PTR_ERR(ptr);
  11874
  11875	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
  11876	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
  11877}
  11878
  11879#else /* !CONFIG_MMU */
  11880
  11881static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
  11882{
  11883	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
  11884}
  11885
  11886static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
  11887{
  11888	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
  11889}
  11890
  11891static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
  11892	unsigned long addr, unsigned long len,
  11893	unsigned long pgoff, unsigned long flags)
  11894{
  11895	void *ptr;
  11896
  11897	ptr = io_uring_validate_mmap_request(file, pgoff, len);
  11898	if (IS_ERR(ptr))
  11899		return PTR_ERR(ptr);
  11900
  11901	return (unsigned long) ptr;
  11902}
  11903
  11904#endif /* !CONFIG_MMU */
  11905
  11906static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
  11907{
  11908	DEFINE_WAIT(wait);
  11909
  11910	do {
  11911		if (!io_sqring_full(ctx))
  11912			break;
  11913		prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
  11914
  11915		if (!io_sqring_full(ctx))
  11916			break;
  11917		schedule();
  11918	} while (!signal_pending(current));
  11919
  11920	finish_wait(&ctx->sqo_sq_wait, &wait);
  11921	return 0;
  11922}
  11923
  11924static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
  11925{
  11926	if (flags & IORING_ENTER_EXT_ARG) {
  11927		struct io_uring_getevents_arg arg;
  11928
  11929		if (argsz != sizeof(arg))
  11930			return -EINVAL;
  11931		if (copy_from_user(&arg, argp, sizeof(arg)))
  11932			return -EFAULT;
  11933	}
  11934	return 0;
  11935}
  11936
  11937static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
  11938			  struct __kernel_timespec __user **ts,
  11939			  const sigset_t __user **sig)
  11940{
  11941	struct io_uring_getevents_arg arg;
  11942
  11943	/*
  11944	 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
  11945	 * is just a pointer to the sigset_t.
  11946	 */
  11947	if (!(flags & IORING_ENTER_EXT_ARG)) {
  11948		*sig = (const sigset_t __user *) argp;
  11949		*ts = NULL;
  11950		return 0;
  11951	}
  11952
  11953	/*
  11954	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
  11955	 * timespec and sigset_t pointers if good.
  11956	 */
  11957	if (*argsz != sizeof(arg))
  11958		return -EINVAL;
  11959	if (copy_from_user(&arg, argp, sizeof(arg)))
  11960		return -EFAULT;
  11961	if (arg.pad)
  11962		return -EINVAL;
  11963	*sig = u64_to_user_ptr(arg.sigmask);
  11964	*argsz = arg.sigmask_sz;
  11965	*ts = u64_to_user_ptr(arg.ts);
  11966	return 0;
  11967}
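
/*
 * A minimal userspace sketch of the EXT_ARG convention parsed above, waiting
 * for completions with a relative timeout and no signal mask change; kept
 * under #if 0 as it is illustration only, not kernel code. Without
 * IORING_ENTER_EXT_ARG, argp would instead point directly at a sigset_t.
 */
#if 0
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>
#include <linux/time_types.h>

static int wait_cqes_timeout(int ring_fd, unsigned int min_complete)
{
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_getevents_arg arg;

	memset(&arg, 0, sizeof(arg));		/* no sigmask, pad must be 0 */
	arg.ts = (__u64)(uintptr_t)&ts;		/* relative timeout */

	return syscall(__NR_io_uring_enter, ring_fd, 0, min_complete,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		       &arg, sizeof(arg));
}
#endif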
  11968
  11969SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
  11970		u32, min_complete, u32, flags, const void __user *, argp,
  11971		size_t, argsz)
  11972{
  11973	struct io_ring_ctx *ctx;
  11974	struct fd f;
  11975	long ret;
  11976
  11977	io_run_task_work();
  11978
  11979	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
  11980			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
  11981			       IORING_ENTER_REGISTERED_RING)))
  11982		return -EINVAL;
  11983
  11984	/*
  11985	 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
  11986	 * need only dereference our task private array to find it.
  11987	 */
  11988	if (flags & IORING_ENTER_REGISTERED_RING) {
  11989		struct io_uring_task *tctx = current->io_uring;
  11990
  11991		if (!tctx || fd >= IO_RINGFD_REG_MAX)
  11992			return -EINVAL;
  11993		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
  11994		f.file = tctx->registered_rings[fd];
  11995		f.flags = 0;
  11996	} else {
  11997		f = fdget(fd);
  11998	}
  11999
  12000	if (unlikely(!f.file))
  12001		return -EBADF;
  12002
  12003	ret = -EOPNOTSUPP;
  12004	if (unlikely(f.file->f_op != &io_uring_fops))
  12005		goto out_fput;
  12006
  12007	ret = -ENXIO;
  12008	ctx = f.file->private_data;
  12009	if (unlikely(!percpu_ref_tryget(&ctx->refs)))
  12010		goto out_fput;
  12011
  12012	ret = -EBADFD;
  12013	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
  12014		goto out;
  12015
  12016	/*
  12017	 * For SQ polling, the thread will do all submissions and completions.
  12018	 * Just return the requested submit count, and wake the thread if
  12019	 * we were asked to.
  12020	 */
  12021	ret = 0;
  12022	if (ctx->flags & IORING_SETUP_SQPOLL) {
  12023		io_cqring_overflow_flush(ctx);
  12024
  12025		if (unlikely(ctx->sq_data->thread == NULL)) {
  12026			ret = -EOWNERDEAD;
  12027			goto out;
  12028		}
  12029		if (flags & IORING_ENTER_SQ_WAKEUP)
  12030			wake_up(&ctx->sq_data->wait);
  12031		if (flags & IORING_ENTER_SQ_WAIT) {
  12032			ret = io_sqpoll_wait_sq(ctx);
  12033			if (ret)
  12034				goto out;
  12035		}
  12036		ret = to_submit;
  12037	} else if (to_submit) {
  12038		ret = io_uring_add_tctx_node(ctx);
  12039		if (unlikely(ret))
  12040			goto out;
  12041
  12042		mutex_lock(&ctx->uring_lock);
  12043		ret = io_submit_sqes(ctx, to_submit);
  12044		if (ret != to_submit) {
  12045			mutex_unlock(&ctx->uring_lock);
  12046			goto out;
  12047		}
  12048		if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll)
  12049			goto iopoll_locked;
  12050		mutex_unlock(&ctx->uring_lock);
  12051	}
  12052	if (flags & IORING_ENTER_GETEVENTS) {
  12053		int ret2;
  12054		if (ctx->syscall_iopoll) {
  12055			/*
  12056			 * We disallow the app entering submit/complete with
  12057			 * polling, but we still need to lock the ring to
  12058			 * prevent racing with polled issue that got punted to
  12059			 * a workqueue.
  12060			 */
  12061			mutex_lock(&ctx->uring_lock);
  12062iopoll_locked:
  12063			ret2 = io_validate_ext_arg(flags, argp, argsz);
  12064			if (likely(!ret2)) {
  12065				min_complete = min(min_complete,
  12066						   ctx->cq_entries);
  12067				ret2 = io_iopoll_check(ctx, min_complete);
  12068			}
  12069			mutex_unlock(&ctx->uring_lock);
  12070		} else {
  12071			const sigset_t __user *sig;
  12072			struct __kernel_timespec __user *ts;
  12073
  12074			ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
  12075			if (likely(!ret2)) {
  12076				min_complete = min(min_complete,
  12077						   ctx->cq_entries);
  12078				ret2 = io_cqring_wait(ctx, min_complete, sig,
  12079						      argsz, ts);
  12080			}
  12081		}
  12082
  12083		if (!ret) {
  12084			ret = ret2;
  12085
  12086			/*
  12087			 * EBADR indicates that one or more CQEs were dropped.
  12088			 * Once the user has been informed we can clear the bit
  12089			 * as they are obviously ok with those drops.
  12090			 */
  12091			if (unlikely(ret2 == -EBADR))
  12092				clear_bit(IO_CHECK_CQ_DROPPED_BIT,
  12093					  &ctx->check_cq);
  12094		}
  12095	}
  12096
  12097out:
  12098	percpu_ref_put(&ctx->refs);
  12099out_fput:
  12100	fdput(f);
  12101	return ret;
  12102}
  12103
  12104#ifdef CONFIG_PROC_FS
  12105static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
  12106		const struct cred *cred)
  12107{
  12108	struct user_namespace *uns = seq_user_ns(m);
  12109	struct group_info *gi;
  12110	kernel_cap_t cap;
  12111	unsigned __capi;
  12112	int g;
  12113
  12114	seq_printf(m, "%5d\n", id);
  12115	seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
  12116	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
  12117	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
  12118	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
  12119	seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
  12120	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
  12121	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
  12122	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
  12123	seq_puts(m, "\n\tGroups:\t");
  12124	gi = cred->group_info;
  12125	for (g = 0; g < gi->ngroups; g++) {
  12126		seq_put_decimal_ull(m, g ? " " : "",
  12127					from_kgid_munged(uns, gi->gid[g]));
  12128	}
  12129	seq_puts(m, "\n\tCapEff:\t");
  12130	cap = cred->cap_effective;
  12131	CAP_FOR_EACH_U32(__capi)
  12132		seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
  12133	seq_putc(m, '\n');
  12134	return 0;
  12135}
  12136
  12137static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
  12138					  struct seq_file *m)
  12139{
  12140	struct io_sq_data *sq = NULL;
  12141	struct io_overflow_cqe *ocqe;
  12142	struct io_rings *r = ctx->rings;
  12143	unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
  12144	unsigned int sq_head = READ_ONCE(r->sq.head);
  12145	unsigned int sq_tail = READ_ONCE(r->sq.tail);
  12146	unsigned int cq_head = READ_ONCE(r->cq.head);
  12147	unsigned int cq_tail = READ_ONCE(r->cq.tail);
  12148	unsigned int cq_shift = 0;
  12149	unsigned int sq_entries, cq_entries;
  12150	bool has_lock;
  12151	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
  12152	unsigned int i;
  12153
  12154	if (is_cqe32)
  12155		cq_shift = 1;
  12156
  12157	/*
  12158	 * we may get imprecise sqe and cqe info if uring is actively running
  12159	 * since we get cached_sq_head and cached_cq_tail without uring_lock
  12160	 * and sq_tail and cq_head are changed by userspace. But it's ok since
  12161	 * we usually use this info when the ring is stuck.
  12162	 */
  12163	seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
  12164	seq_printf(m, "SqHead:\t%u\n", sq_head);
  12165	seq_printf(m, "SqTail:\t%u\n", sq_tail);
  12166	seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
  12167	seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
  12168	seq_printf(m, "CqHead:\t%u\n", cq_head);
  12169	seq_printf(m, "CqTail:\t%u\n", cq_tail);
  12170	seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
  12171	seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
  12172	sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
  12173	for (i = 0; i < sq_entries; i++) {
  12174		unsigned int entry = i + sq_head;
  12175		unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
  12176		struct io_uring_sqe *sqe;
  12177
  12178		if (sq_idx > sq_mask)
  12179			continue;
  12180		sqe = &ctx->sq_sqes[sq_idx];
  12181		seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
  12182			   sq_idx, sqe->opcode, sqe->fd, sqe->flags,
  12183			   sqe->user_data);
  12184	}
  12185	seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
  12186	cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
  12187	for (i = 0; i < cq_entries; i++) {
  12188		unsigned int entry = i + cq_head;
  12189		struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
  12190
  12191		if (!is_cqe32) {
  12192			seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
  12193			   entry & cq_mask, cqe->user_data, cqe->res,
  12194			   cqe->flags);
  12195		} else {
  12196			seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, "
  12197				"extra1:%llu, extra2:%llu\n",
  12198				entry & cq_mask, cqe->user_data, cqe->res,
  12199				cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]);
  12200		}
  12201	}
  12202
  12203	/*
  12204	 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
  12205	 * since fdinfo case grabs it in the opposite direction of normal use
  12206	 * cases. If we fail to get the lock, we just don't iterate any
  12207	 * structures that could be going away outside the io_uring mutex.
  12208	 */
  12209	has_lock = mutex_trylock(&ctx->uring_lock);
  12210
  12211	if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
  12212		sq = ctx->sq_data;
  12213		if (!sq->thread)
  12214			sq = NULL;
  12215	}
  12216
  12217	seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
  12218	seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
  12219	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
  12220	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
  12221		struct file *f = io_file_from_index(ctx, i);
  12222
  12223		if (f)
  12224			seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
  12225		else
  12226			seq_printf(m, "%5u: <none>\n", i);
  12227	}
  12228	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
  12229	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
  12230		struct io_mapped_ubuf *buf = ctx->user_bufs[i];
  12231		unsigned int len = buf->ubuf_end - buf->ubuf;
  12232
  12233		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
  12234	}
  12235	if (has_lock && !xa_empty(&ctx->personalities)) {
  12236		unsigned long index;
  12237		const struct cred *cred;
  12238
  12239		seq_printf(m, "Personalities:\n");
  12240		xa_for_each(&ctx->personalities, index, cred)
  12241			io_uring_show_cred(m, index, cred);
  12242	}
  12243	if (has_lock)
  12244		mutex_unlock(&ctx->uring_lock);
  12245
  12246	seq_puts(m, "PollList:\n");
  12247	spin_lock(&ctx->completion_lock);
  12248	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
  12249		struct hlist_head *list = &ctx->cancel_hash[i];
  12250		struct io_kiocb *req;
  12251
  12252		hlist_for_each_entry(req, list, hash_node)
  12253			seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
  12254					task_work_pending(req->task));
  12255	}
  12256
  12257	seq_puts(m, "CqOverflowList:\n");
  12258	list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
  12259		struct io_uring_cqe *cqe = &ocqe->cqe;
  12260
  12261		seq_printf(m, "  user_data=%llu, res=%d, flags=%x\n",
  12262			   cqe->user_data, cqe->res, cqe->flags);
  12263
  12264	}
  12265
  12266	spin_unlock(&ctx->completion_lock);
  12267}
  12268
  12269static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
  12270{
  12271	struct io_ring_ctx *ctx = f->private_data;
  12272
  12273	if (percpu_ref_tryget(&ctx->refs)) {
  12274		__io_uring_show_fdinfo(ctx, m);
  12275		percpu_ref_put(&ctx->refs);
  12276	}
  12277}
  12278#endif
  12279
  12280static const struct file_operations io_uring_fops = {
  12281	.release	= io_uring_release,
  12282	.mmap		= io_uring_mmap,
  12283#ifndef CONFIG_MMU
  12284	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
  12285	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
  12286#endif
  12287	.poll		= io_uring_poll,
  12288#ifdef CONFIG_PROC_FS
  12289	.show_fdinfo	= io_uring_show_fdinfo,
  12290#endif
  12291};
  12292
  12293static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
  12294					 struct io_uring_params *p)
  12295{
  12296	struct io_rings *rings;
  12297	size_t size, sq_array_offset;
  12298
  12299	/* make sure these are sane, as we already accounted them */
  12300	ctx->sq_entries = p->sq_entries;
  12301	ctx->cq_entries = p->cq_entries;
  12302
  12303	size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
  12304	if (size == SIZE_MAX)
  12305		return -EOVERFLOW;
  12306
  12307	rings = io_mem_alloc(size);
  12308	if (!rings)
  12309		return -ENOMEM;
  12310
  12311	ctx->rings = rings;
  12312	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
  12313	rings->sq_ring_mask = p->sq_entries - 1;
  12314	rings->cq_ring_mask = p->cq_entries - 1;
  12315	rings->sq_ring_entries = p->sq_entries;
  12316	rings->cq_ring_entries = p->cq_entries;
  12317
  12318	if (p->flags & IORING_SETUP_SQE128)
  12319		size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
  12320	else
  12321		size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
  12322	if (size == SIZE_MAX) {
  12323		io_mem_free(ctx->rings);
  12324		ctx->rings = NULL;
  12325		return -EOVERFLOW;
  12326	}
  12327
  12328	ctx->sq_sqes = io_mem_alloc(size);
  12329	if (!ctx->sq_sqes) {
  12330		io_mem_free(ctx->rings);
  12331		ctx->rings = NULL;
  12332		return -ENOMEM;
  12333	}
  12334
  12335	return 0;
  12336}
  12337
  12338static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
  12339{
  12340	int ret, fd;
  12341
  12342	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
  12343	if (fd < 0)
  12344		return fd;
  12345
  12346	ret = io_uring_add_tctx_node(ctx);
  12347	if (ret) {
  12348		put_unused_fd(fd);
  12349		return ret;
  12350	}
  12351	fd_install(fd, file);
  12352	return fd;
  12353}
  12354
  12355/*
  12356 * Allocate an anonymous fd; this is what constitutes the application-
  12357 * visible backing of an io_uring instance. The application mmaps this
  12358 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
  12359 * we have to tie this fd to a socket for file garbage collection purposes.
  12360 */
  12361static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
  12362{
  12363	struct file *file;
  12364#if defined(CONFIG_UNIX)
  12365	int ret;
  12366
  12367	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
  12368				&ctx->ring_sock);
  12369	if (ret)
  12370		return ERR_PTR(ret);
  12371#endif
  12372
  12373	file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
  12374					 O_RDWR | O_CLOEXEC, NULL);
  12375#if defined(CONFIG_UNIX)
  12376	if (IS_ERR(file)) {
  12377		sock_release(ctx->ring_sock);
  12378		ctx->ring_sock = NULL;
  12379	} else {
  12380		ctx->ring_sock->file = file;
  12381	}
  12382#endif
  12383	return file;
  12384}
  12385
  12386static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
  12387				  struct io_uring_params __user *params)
  12388{
  12389	struct io_ring_ctx *ctx;
  12390	struct file *file;
  12391	int ret;
  12392
  12393	if (!entries)
  12394		return -EINVAL;
  12395	if (entries > IORING_MAX_ENTRIES) {
  12396		if (!(p->flags & IORING_SETUP_CLAMP))
  12397			return -EINVAL;
  12398		entries = IORING_MAX_ENTRIES;
  12399	}
  12400
  12401	/*
  12402	 * Use twice as many entries for the CQ ring. It's possible for the
  12403	 * application to drive a higher depth than the size of the SQ ring,
  12404	 * since the sqes are only used at submission time. This allows for
  12405	 * some flexibility in overcommitting a bit. If the application has
  12406	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
  12407	 * of CQ ring entries manually.
  12408	 */
  12409	p->sq_entries = roundup_pow_of_two(entries);
  12410	if (p->flags & IORING_SETUP_CQSIZE) {
  12411		/*
  12412		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
  12413		 * to a power-of-two, if it isn't already. We do NOT impose
  12414		 * any cq vs sq ring sizing.
  12415		 */
  12416		if (!p->cq_entries)
  12417			return -EINVAL;
  12418		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
  12419			if (!(p->flags & IORING_SETUP_CLAMP))
  12420				return -EINVAL;
  12421			p->cq_entries = IORING_MAX_CQ_ENTRIES;
  12422		}
  12423		p->cq_entries = roundup_pow_of_two(p->cq_entries);
  12424		if (p->cq_entries < p->sq_entries)
  12425			return -EINVAL;
  12426	} else {
  12427		p->cq_entries = 2 * p->sq_entries;
  12428	}
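
	/*
	 * Worked example of the sizing above (illustrative numbers): a request
	 * for 100 entries is rounded up to sq_entries == 128 and, without
	 * IORING_SETUP_CQSIZE, doubled to cq_entries == 256. With
	 * IORING_SETUP_CQSIZE, a requested cq_entries of 200 also becomes 256,
	 * while one that rounds up to less than sq_entries fails with -EINVAL.
	 */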
  12429
  12430	ctx = io_ring_ctx_alloc(p);
  12431	if (!ctx)
  12432		return -ENOMEM;
  12433
  12434	/*
   12435	 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, userspace
   12436	 * applications don't need to poll for I/O completion events
   12437	 * themselves; they can rely on io_sq_thread to do the polling,
   12438	 * which reduces CPU usage and uring_lock contention.
  12439	 */
  12440	if (ctx->flags & IORING_SETUP_IOPOLL &&
  12441	    !(ctx->flags & IORING_SETUP_SQPOLL))
  12442		ctx->syscall_iopoll = 1;
  12443
  12444	ctx->compat = in_compat_syscall();
  12445	if (!capable(CAP_IPC_LOCK))
  12446		ctx->user = get_uid(current_user());
  12447
  12448	/*
  12449	 * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
  12450	 * COOP_TASKRUN is set, then IPIs are never needed by the app.
  12451	 */
  12452	ret = -EINVAL;
  12453	if (ctx->flags & IORING_SETUP_SQPOLL) {
  12454		/* IPI related flags don't make sense with SQPOLL */
  12455		if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
  12456				  IORING_SETUP_TASKRUN_FLAG))
  12457			goto err;
  12458		ctx->notify_method = TWA_SIGNAL_NO_IPI;
  12459	} else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
  12460		ctx->notify_method = TWA_SIGNAL_NO_IPI;
  12461	} else {
  12462		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
  12463			goto err;
  12464		ctx->notify_method = TWA_SIGNAL;
  12465	}
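
	/*
	 * Concretely: IORING_SETUP_SQPOLL | IORING_SETUP_COOP_TASKRUN is
	 * rejected above with -EINVAL, IORING_SETUP_COOP_TASKRUN on its own
	 * selects TWA_SIGNAL_NO_IPI, and the plain default stays on TWA_SIGNAL.
	 */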
  12466
  12467	/*
  12468	 * This is just grabbed for accounting purposes. When a process exits,
  12469	 * the mm is exited and dropped before the files, hence we need to hang
  12470	 * on to this mm purely for the purposes of being able to unaccount
  12471	 * memory (locked/pinned vm). It's not used for anything else.
  12472	 */
  12473	mmgrab(current->mm);
  12474	ctx->mm_account = current->mm;
  12475
  12476	ret = io_allocate_scq_urings(ctx, p);
  12477	if (ret)
  12478		goto err;
  12479
  12480	ret = io_sq_offload_create(ctx, p);
  12481	if (ret)
  12482		goto err;
  12483	/* always set a rsrc node */
  12484	ret = io_rsrc_node_switch_start(ctx);
  12485	if (ret)
  12486		goto err;
  12487	io_rsrc_node_switch(ctx, NULL);
  12488
  12489	memset(&p->sq_off, 0, sizeof(p->sq_off));
  12490	p->sq_off.head = offsetof(struct io_rings, sq.head);
  12491	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
  12492	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
  12493	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
  12494	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
  12495	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
  12496	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
  12497
  12498	memset(&p->cq_off, 0, sizeof(p->cq_off));
  12499	p->cq_off.head = offsetof(struct io_rings, cq.head);
  12500	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
  12501	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
  12502	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
  12503	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
  12504	p->cq_off.cqes = offsetof(struct io_rings, cqes);
  12505	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
  12506
  12507	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
  12508			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
  12509			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
  12510			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
  12511			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
  12512			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
  12513			IORING_FEAT_LINKED_FILE;
  12514
  12515	if (copy_to_user(params, p, sizeof(*p))) {
  12516		ret = -EFAULT;
  12517		goto err;
  12518	}
  12519
  12520	file = io_uring_get_file(ctx);
  12521	if (IS_ERR(file)) {
  12522		ret = PTR_ERR(file);
  12523		goto err;
  12524	}
  12525
  12526	/*
   12527	 * Install the ring fd as the very last thing, so we don't risk someone
   12528	 * having closed it before we finish setup.
  12529	 */
  12530	ret = io_uring_install_fd(ctx, file);
  12531	if (ret < 0) {
  12532		/* fput will clean it up */
  12533		fput(file);
  12534		return ret;
  12535	}
  12536
  12537	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
  12538	return ret;
  12539err:
  12540	io_ring_ctx_wait_and_kill(ctx);
  12541	return ret;
  12542}
  12543
  12544/*
   12545 * Sets up an io_uring context and returns the fd. The application asks for a
   12546 * ring size; we return the actual sq/cq ring sizes (among other things) in the
  12547 * params structure passed in.
  12548 */
  12549static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
  12550{
  12551	struct io_uring_params p;
  12552	int i;
  12553
  12554	if (copy_from_user(&p, params, sizeof(p)))
  12555		return -EFAULT;
  12556	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
  12557		if (p.resv[i])
  12558			return -EINVAL;
  12559	}
  12560
  12561	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
  12562			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
  12563			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
  12564			IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
  12565			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
  12566			IORING_SETUP_SQE128 | IORING_SETUP_CQE32))
  12567		return -EINVAL;
  12568
  12569	return io_uring_create(entries, &p, params);
  12570}
  12571
  12572SYSCALL_DEFINE2(io_uring_setup, u32, entries,
  12573		struct io_uring_params __user *, params)
  12574{
  12575	return io_uring_setup(entries, params);
  12576}
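
/*
 * Userspace sketch (illustrative only; a hypothetical helper, guarded with
 * #if 0 so it never enters the kernel build): one way to call the raw
 * io_uring_setup(2) syscall and mmap() the rings through the offsets that
 * io_uring_create() fills into io_uring_params, assuming the uapi
 * definitions from <linux/io_uring.h>. Real applications typically use
 * liburing instead; error handling is trimmed.
 */
#if 0
#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int setup_rings_example(unsigned int entries)
{
	struct io_uring_params p;
	size_t sq_sz, cq_sz, sqes_sz;
	void *sq_ring, *cq_ring, *sqes;
	int fd;

	memset(&p, 0, sizeof(p));
	fd = syscall(__NR_io_uring_setup, entries, &p);
	if (fd < 0)
		return -1;

	/* mapping sizes derived from the offsets returned by the kernel */
	sq_sz = p.sq_off.array + p.sq_entries * sizeof(__u32);
	cq_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
	sqes_sz = p.sq_entries * sizeof(struct io_uring_sqe);

	sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
	/*
	 * IORING_FEAT_SINGLE_MMAP is always advertised by this kernel, so the
	 * CQ ring shares the SQ ring mapping; older kernels need a second
	 * mmap() at IORING_OFF_CQ_RING.
	 */
	if (p.features & IORING_FEAT_SINGLE_MMAP)
		cq_ring = sq_ring;
	else
		cq_ring = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
			       MAP_SHARED | MAP_POPULATE, fd,
			       IORING_OFF_CQ_RING);
	sqes = mmap(NULL, sqes_sz, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
	(void)cq_ring;
	(void)sqes;
	return fd;
}
#endif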
  12577
  12578static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
  12579			   unsigned nr_args)
  12580{
  12581	struct io_uring_probe *p;
  12582	size_t size;
  12583	int i, ret;
  12584
  12585	size = struct_size(p, ops, nr_args);
  12586	if (size == SIZE_MAX)
  12587		return -EOVERFLOW;
  12588	p = kzalloc(size, GFP_KERNEL);
  12589	if (!p)
  12590		return -ENOMEM;
  12591
  12592	ret = -EFAULT;
  12593	if (copy_from_user(p, arg, size))
  12594		goto out;
  12595	ret = -EINVAL;
  12596	if (memchr_inv(p, 0, size))
  12597		goto out;
  12598
  12599	p->last_op = IORING_OP_LAST - 1;
  12600	if (nr_args > IORING_OP_LAST)
  12601		nr_args = IORING_OP_LAST;
  12602
  12603	for (i = 0; i < nr_args; i++) {
  12604		p->ops[i].op = i;
  12605		if (!io_op_defs[i].not_supported)
  12606			p->ops[i].flags = IO_URING_OP_SUPPORTED;
  12607	}
  12608	p->ops_len = i;
  12609
  12610	ret = 0;
  12611	if (copy_to_user(arg, p, size))
  12612		ret = -EFAULT;
  12613out:
  12614	kfree(p);
  12615	return ret;
  12616}
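
/*
 * Userspace sketch (illustrative only; a hypothetical helper, guarded with
 * #if 0 so it never enters the kernel build): probing which opcodes the
 * running kernel supports through IORING_REGISTER_PROBE, matching the
 * zeroed-buffer and nr_args handling in io_probe() above. Assumes the uapi
 * definitions from <linux/io_uring.h>; error handling is trimmed.
 */
#if 0
#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

static int op_is_supported(int ring_fd, int op)
{
	struct io_uring_probe *probe;
	size_t len;
	int ret, supported = 0;

	/* one probe_op slot per opcode; the buffer must be zeroed going in */
	len = sizeof(*probe) + IORING_OP_LAST * sizeof(struct io_uring_probe_op);
	probe = calloc(1, len);
	if (!probe)
		return 0;

	ret = syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
		      probe, IORING_OP_LAST);
	if (!ret && op <= probe->last_op)
		supported = probe->ops[op].flags & IO_URING_OP_SUPPORTED;

	free(probe);
	return supported;
}
#endif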
  12617
  12618static int io_register_personality(struct io_ring_ctx *ctx)
  12619{
  12620	const struct cred *creds;
  12621	u32 id;
  12622	int ret;
  12623
  12624	creds = get_current_cred();
  12625
  12626	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
  12627			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
  12628	if (ret < 0) {
  12629		put_cred(creds);
  12630		return ret;
  12631	}
  12632	return id;
  12633}
  12634
  12635static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
  12636					   void __user *arg, unsigned int nr_args)
  12637{
  12638	struct io_uring_restriction *res;
  12639	size_t size;
  12640	int i, ret;
  12641
  12642	/* Restrictions allowed only if rings started disabled */
  12643	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
  12644		return -EBADFD;
  12645
  12646	/* We allow only a single restrictions registration */
  12647	if (ctx->restrictions.registered)
  12648		return -EBUSY;
  12649
  12650	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
  12651		return -EINVAL;
  12652
  12653	size = array_size(nr_args, sizeof(*res));
  12654	if (size == SIZE_MAX)
  12655		return -EOVERFLOW;
  12656
  12657	res = memdup_user(arg, size);
  12658	if (IS_ERR(res))
  12659		return PTR_ERR(res);
  12660
  12661	ret = 0;
  12662
  12663	for (i = 0; i < nr_args; i++) {
  12664		switch (res[i].opcode) {
  12665		case IORING_RESTRICTION_REGISTER_OP:
  12666			if (res[i].register_op >= IORING_REGISTER_LAST) {
  12667				ret = -EINVAL;
  12668				goto out;
  12669			}
  12670
  12671			__set_bit(res[i].register_op,
  12672				  ctx->restrictions.register_op);
  12673			break;
  12674		case IORING_RESTRICTION_SQE_OP:
  12675			if (res[i].sqe_op >= IORING_OP_LAST) {
  12676				ret = -EINVAL;
  12677				goto out;
  12678			}
  12679
  12680			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
  12681			break;
  12682		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
  12683			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
  12684			break;
  12685		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
  12686			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
  12687			break;
  12688		default:
  12689			ret = -EINVAL;
  12690			goto out;
  12691		}
  12692	}
  12693
  12694out:
  12695	/* Reset all restrictions if an error happened */
  12696	if (ret != 0)
  12697		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
  12698	else
  12699		ctx->restrictions.registered = true;
  12700
  12701	kfree(res);
  12702	return ret;
  12703}
  12704
  12705static int io_register_enable_rings(struct io_ring_ctx *ctx)
  12706{
  12707	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
  12708		return -EBADFD;
  12709
  12710	if (ctx->restrictions.registered)
  12711		ctx->restricted = 1;
  12712
  12713	ctx->flags &= ~IORING_SETUP_R_DISABLED;
  12714	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
  12715		wake_up(&ctx->sq_data->wait);
  12716	return 0;
  12717}
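
/*
 * Userspace sketch (illustrative only; a hypothetical helper, guarded with
 * #if 0 so it never enters the kernel build): a ring created with
 * IORING_SETUP_R_DISABLED can be limited to a whitelist of register and SQE
 * opcodes before being enabled, mirroring io_register_restrictions() and
 * io_register_enable_rings() above. Error handling is trimmed.
 */
#if 0
#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int restrict_and_enable(int ring_fd)
{
	struct io_uring_restriction res[2];
	int ret;

	memset(res, 0, sizeof(res));
	/* only allow registering buffers ... */
	res[0].opcode = IORING_RESTRICTION_REGISTER_OP;
	res[0].register_op = IORING_REGISTER_BUFFERS;
	/* ... and only allow IORING_OP_READ SQEs */
	res[1].opcode = IORING_RESTRICTION_SQE_OP;
	res[1].sqe_op = IORING_OP_READ;

	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_RESTRICTIONS, res, 2);
	if (ret)
		return ret;

	/* clears IORING_SETUP_R_DISABLED; the restrictions now apply */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ENABLE_RINGS, NULL, 0);
}
#endif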
  12718
  12719static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
  12720				     struct io_uring_rsrc_update2 *up,
  12721				     unsigned nr_args)
  12722{
  12723	__u32 tmp;
  12724	int err;
  12725
  12726	if (check_add_overflow(up->offset, nr_args, &tmp))
  12727		return -EOVERFLOW;
  12728	err = io_rsrc_node_switch_start(ctx);
  12729	if (err)
  12730		return err;
  12731
  12732	switch (type) {
  12733	case IORING_RSRC_FILE:
  12734		return __io_sqe_files_update(ctx, up, nr_args);
  12735	case IORING_RSRC_BUFFER:
  12736		return __io_sqe_buffers_update(ctx, up, nr_args);
  12737	}
  12738	return -EINVAL;
  12739}
  12740
  12741static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
  12742				    unsigned nr_args)
  12743{
  12744	struct io_uring_rsrc_update2 up;
  12745
  12746	if (!nr_args)
  12747		return -EINVAL;
  12748	memset(&up, 0, sizeof(up));
  12749	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
  12750		return -EFAULT;
  12751	if (up.resv || up.resv2)
  12752		return -EINVAL;
  12753	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
  12754}
  12755
  12756static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
  12757				   unsigned size, unsigned type)
  12758{
  12759	struct io_uring_rsrc_update2 up;
  12760
  12761	if (size != sizeof(up))
  12762		return -EINVAL;
  12763	if (copy_from_user(&up, arg, sizeof(up)))
  12764		return -EFAULT;
  12765	if (!up.nr || up.resv || up.resv2)
  12766		return -EINVAL;
  12767	return __io_register_rsrc_update(ctx, type, &up, up.nr);
  12768}
  12769
  12770static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
  12771			    unsigned int size, unsigned int type)
  12772{
  12773	struct io_uring_rsrc_register rr;
  12774
  12775	/* keep it extendible */
  12776	if (size != sizeof(rr))
  12777		return -EINVAL;
  12778
  12779	memset(&rr, 0, sizeof(rr));
  12780	if (copy_from_user(&rr, arg, size))
  12781		return -EFAULT;
  12782	if (!rr.nr || rr.resv2)
  12783		return -EINVAL;
  12784	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
  12785		return -EINVAL;
  12786
  12787	switch (type) {
  12788	case IORING_RSRC_FILE:
  12789		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
  12790			break;
  12791		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
  12792					     rr.nr, u64_to_user_ptr(rr.tags));
  12793	case IORING_RSRC_BUFFER:
  12794		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
  12795			break;
  12796		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
  12797					       rr.nr, u64_to_user_ptr(rr.tags));
  12798	}
  12799	return -EINVAL;
  12800}
  12801
  12802static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
  12803				       void __user *arg, unsigned len)
  12804{
  12805	struct io_uring_task *tctx = current->io_uring;
  12806	cpumask_var_t new_mask;
  12807	int ret;
  12808
  12809	if (!tctx || !tctx->io_wq)
  12810		return -EINVAL;
  12811
  12812	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
  12813		return -ENOMEM;
  12814
  12815	cpumask_clear(new_mask);
  12816	if (len > cpumask_size())
  12817		len = cpumask_size();
  12818
  12819	if (in_compat_syscall()) {
  12820		ret = compat_get_bitmap(cpumask_bits(new_mask),
  12821					(const compat_ulong_t __user *)arg,
  12822					len * 8 /* CHAR_BIT */);
  12823	} else {
  12824		ret = copy_from_user(new_mask, arg, len);
  12825	}
  12826
  12827	if (ret) {
  12828		free_cpumask_var(new_mask);
  12829		return -EFAULT;
  12830	}
  12831
  12832	ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
  12833	free_cpumask_var(new_mask);
  12834	return ret;
  12835}
  12836
  12837static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
  12838{
  12839	struct io_uring_task *tctx = current->io_uring;
  12840
  12841	if (!tctx || !tctx->io_wq)
  12842		return -EINVAL;
  12843
  12844	return io_wq_cpu_affinity(tctx->io_wq, NULL);
  12845}
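
/*
 * Userspace sketch (illustrative only; a hypothetical helper, guarded with
 * #if 0 so it never enters the kernel build): pinning this task's io-wq
 * workers to CPU 0 via IORING_REGISTER_IOWQ_AFF, using the byte-length
 * interface handled by io_register_iowq_aff() above. Error handling is
 * trimmed.
 */
#if 0
#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <sched.h>
#include <sys/syscall.h>
#include <unistd.h>

static int pin_iowq_to_cpu0(int ring_fd)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(0, &mask);
	/* nr_args is the length of the CPU mask in bytes */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
}
#endif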
  12846
  12847static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
  12848					       void __user *arg)
  12849	__must_hold(&ctx->uring_lock)
  12850{
  12851	struct io_tctx_node *node;
  12852	struct io_uring_task *tctx = NULL;
  12853	struct io_sq_data *sqd = NULL;
  12854	__u32 new_count[2];
  12855	int i, ret;
  12856
  12857	if (copy_from_user(new_count, arg, sizeof(new_count)))
  12858		return -EFAULT;
  12859	for (i = 0; i < ARRAY_SIZE(new_count); i++)
  12860		if (new_count[i] > INT_MAX)
  12861			return -EINVAL;
  12862
  12863	if (ctx->flags & IORING_SETUP_SQPOLL) {
  12864		sqd = ctx->sq_data;
  12865		if (sqd) {
  12866			/*
  12867			 * Observe the correct sqd->lock -> ctx->uring_lock
  12868			 * ordering. Fine to drop uring_lock here, we hold
  12869			 * a ref to the ctx.
  12870			 */
  12871			refcount_inc(&sqd->refs);
  12872			mutex_unlock(&ctx->uring_lock);
  12873			mutex_lock(&sqd->lock);
  12874			mutex_lock(&ctx->uring_lock);
  12875			if (sqd->thread)
  12876				tctx = sqd->thread->io_uring;
  12877		}
  12878	} else {
  12879		tctx = current->io_uring;
  12880	}
  12881
  12882	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
  12883
  12884	for (i = 0; i < ARRAY_SIZE(new_count); i++)
  12885		if (new_count[i])
  12886			ctx->iowq_limits[i] = new_count[i];
  12887	ctx->iowq_limits_set = true;
  12888
  12889	if (tctx && tctx->io_wq) {
  12890		ret = io_wq_max_workers(tctx->io_wq, new_count);
  12891		if (ret)
  12892			goto err;
  12893	} else {
  12894		memset(new_count, 0, sizeof(new_count));
  12895	}
  12896
  12897	if (sqd) {
  12898		mutex_unlock(&sqd->lock);
  12899		io_put_sq_data(sqd);
  12900	}
  12901
  12902	if (copy_to_user(arg, new_count, sizeof(new_count)))
  12903		return -EFAULT;
  12904
  12905	/* that's it for SQPOLL, only the SQPOLL task creates requests */
  12906	if (sqd)
  12907		return 0;
  12908
  12909	/* now propagate the restriction to all registered users */
  12910	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
  12911		struct io_uring_task *tctx = node->task->io_uring;
  12912
  12913		if (WARN_ON_ONCE(!tctx->io_wq))
  12914			continue;
  12915
  12916		for (i = 0; i < ARRAY_SIZE(new_count); i++)
  12917			new_count[i] = ctx->iowq_limits[i];
  12918		/* ignore errors, it always returns zero anyway */
  12919		(void)io_wq_max_workers(tctx->io_wq, new_count);
  12920	}
  12921	return 0;
  12922err:
  12923	if (sqd) {
  12924		mutex_unlock(&sqd->lock);
  12925		io_put_sq_data(sqd);
  12926	}
  12927	return ret;
  12928}
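
/*
 * Userspace sketch (illustrative only; a hypothetical helper, guarded with
 * #if 0 so it never enters the kernel build): capping the io-wq worker pools
 * through IORING_REGISTER_IOWQ_MAX_WORKERS, which takes a two-element
 * (bounded, unbounded) array and writes the previous limits back, as
 * implemented above. Error handling is trimmed.
 */
#if 0
#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static int cap_iowq_workers(int ring_fd)
{
	/* [0] = bounded (e.g. regular file I/O) workers, [1] = unbounded */
	__u32 counts[2] = { 4, 16 };
	int ret;

	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
	/* on success, counts[] now holds the limits in effect beforehand */
	return ret;
}
#endif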
  12929
  12930static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
  12931{
  12932	struct io_uring_buf_ring *br;
  12933	struct io_uring_buf_reg reg;
  12934	struct io_buffer_list *bl;
  12935	struct page **pages;
  12936	int nr_pages;
  12937
  12938	if (copy_from_user(&reg, arg, sizeof(reg)))
  12939		return -EFAULT;
  12940
  12941	if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
  12942		return -EINVAL;
  12943	if (!reg.ring_addr)
  12944		return -EFAULT;
  12945	if (reg.ring_addr & ~PAGE_MASK)
  12946		return -EINVAL;
  12947	if (!is_power_of_2(reg.ring_entries))
  12948		return -EINVAL;
  12949
  12950	/* cannot disambiguate full vs empty due to head/tail size */
  12951	if (reg.ring_entries >= 65536)
  12952		return -EINVAL;
  12953
  12954	if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
  12955		int ret = io_init_bl_list(ctx);
  12956		if (ret)
  12957			return ret;
  12958	}
  12959
  12960	bl = io_buffer_get_list(ctx, reg.bgid);
  12961	if (bl) {
  12962		/* if mapped buffer ring OR classic exists, don't allow */
  12963		if (bl->buf_nr_pages || !list_empty(&bl->buf_list))
  12964			return -EEXIST;
  12965	} else {
  12966		bl = kzalloc(sizeof(*bl), GFP_KERNEL);
  12967		if (!bl)
  12968			return -ENOMEM;
  12969	}
  12970
  12971	pages = io_pin_pages(reg.ring_addr,
  12972			     struct_size(br, bufs, reg.ring_entries),
  12973			     &nr_pages);
  12974	if (IS_ERR(pages)) {
  12975		kfree(bl);
  12976		return PTR_ERR(pages);
  12977	}
  12978
  12979	br = page_address(pages[0]);
  12980	bl->buf_pages = pages;
  12981	bl->buf_nr_pages = nr_pages;
  12982	bl->nr_entries = reg.ring_entries;
  12983	bl->buf_ring = br;
  12984	bl->mask = reg.ring_entries - 1;
  12985	io_buffer_add_list(ctx, bl, reg.bgid);
  12986	return 0;
  12987}
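
/*
 * Userspace sketch (illustrative only; a hypothetical helper, guarded with
 * #if 0 so it never enters the kernel build): registering a provided-buffer
 * ring for buffer group 0, following the page-alignment and power-of-two
 * checks enforced by io_register_pbuf_ring() above. Error handling is
 * trimmed.
 */
#if 0
#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static struct io_uring_buf_ring *register_buf_ring(int ring_fd,
						   unsigned int entries)
{
	struct io_uring_buf_reg reg;
	struct io_uring_buf_ring *br;

	/* ring_addr must be page aligned, entries a power of two */
	if (posix_memalign((void **)&br, 4096,
			   entries * sizeof(struct io_uring_buf)))
		return NULL;

	memset(&reg, 0, sizeof(reg));
	reg.ring_addr = (unsigned long)br;
	reg.ring_entries = entries;
	reg.bgid = 0;

	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PBUF_RING,
		    &reg, 1)) {
		free(br);
		return NULL;
	}
	/* buffers are added by filling br->bufs[] and then advancing br->tail */
	return br;
}
#endif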
  12988
  12989static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
  12990{
  12991	struct io_uring_buf_reg reg;
  12992	struct io_buffer_list *bl;
  12993
  12994	if (copy_from_user(&reg, arg, sizeof(reg)))
  12995		return -EFAULT;
  12996	if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
  12997		return -EINVAL;
  12998
  12999	bl = io_buffer_get_list(ctx, reg.bgid);
  13000	if (!bl)
  13001		return -ENOENT;
  13002	if (!bl->buf_nr_pages)
  13003		return -EINVAL;
  13004
  13005	__io_remove_buffers(ctx, bl, -1U);
  13006	if (bl->bgid >= BGID_ARRAY) {
  13007		xa_erase(&ctx->io_bl_xa, bl->bgid);
  13008		kfree(bl);
  13009	}
  13010	return 0;
  13011}
  13012
  13013static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
  13014			       void __user *arg, unsigned nr_args)
  13015	__releases(ctx->uring_lock)
  13016	__acquires(ctx->uring_lock)
  13017{
  13018	int ret;
  13019
  13020	/*
  13021	 * We're inside the ring mutex, if the ref is already dying, then
  13022	 * someone else killed the ctx or is already going through
  13023	 * io_uring_register().
  13024	 */
  13025	if (percpu_ref_is_dying(&ctx->refs))
  13026		return -ENXIO;
  13027
  13028	if (ctx->restricted) {
  13029		if (opcode >= IORING_REGISTER_LAST)
  13030			return -EINVAL;
  13031		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
  13032		if (!test_bit(opcode, ctx->restrictions.register_op))
  13033			return -EACCES;
  13034	}
  13035
  13036	switch (opcode) {
  13037	case IORING_REGISTER_BUFFERS:
  13038		ret = -EFAULT;
  13039		if (!arg)
  13040			break;
  13041		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
  13042		break;
  13043	case IORING_UNREGISTER_BUFFERS:
  13044		ret = -EINVAL;
  13045		if (arg || nr_args)
  13046			break;
  13047		ret = io_sqe_buffers_unregister(ctx);
  13048		break;
  13049	case IORING_REGISTER_FILES:
  13050		ret = -EFAULT;
  13051		if (!arg)
  13052			break;
  13053		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
  13054		break;
  13055	case IORING_UNREGISTER_FILES:
  13056		ret = -EINVAL;
  13057		if (arg || nr_args)
  13058			break;
  13059		ret = io_sqe_files_unregister(ctx);
  13060		break;
  13061	case IORING_REGISTER_FILES_UPDATE:
  13062		ret = io_register_files_update(ctx, arg, nr_args);
  13063		break;
  13064	case IORING_REGISTER_EVENTFD:
  13065		ret = -EINVAL;
  13066		if (nr_args != 1)
  13067			break;
  13068		ret = io_eventfd_register(ctx, arg, 0);
  13069		break;
  13070	case IORING_REGISTER_EVENTFD_ASYNC:
  13071		ret = -EINVAL;
  13072		if (nr_args != 1)
  13073			break;
  13074		ret = io_eventfd_register(ctx, arg, 1);
  13075		break;
  13076	case IORING_UNREGISTER_EVENTFD:
  13077		ret = -EINVAL;
  13078		if (arg || nr_args)
  13079			break;
  13080		ret = io_eventfd_unregister(ctx);
  13081		break;
  13082	case IORING_REGISTER_PROBE:
  13083		ret = -EINVAL;
  13084		if (!arg || nr_args > 256)
  13085			break;
  13086		ret = io_probe(ctx, arg, nr_args);
  13087		break;
  13088	case IORING_REGISTER_PERSONALITY:
  13089		ret = -EINVAL;
  13090		if (arg || nr_args)
  13091			break;
  13092		ret = io_register_personality(ctx);
  13093		break;
  13094	case IORING_UNREGISTER_PERSONALITY:
  13095		ret = -EINVAL;
  13096		if (arg)
  13097			break;
  13098		ret = io_unregister_personality(ctx, nr_args);
  13099		break;
  13100	case IORING_REGISTER_ENABLE_RINGS:
  13101		ret = -EINVAL;
  13102		if (arg || nr_args)
  13103			break;
  13104		ret = io_register_enable_rings(ctx);
  13105		break;
  13106	case IORING_REGISTER_RESTRICTIONS:
  13107		ret = io_register_restrictions(ctx, arg, nr_args);
  13108		break;
  13109	case IORING_REGISTER_FILES2:
  13110		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
  13111		break;
  13112	case IORING_REGISTER_FILES_UPDATE2:
  13113		ret = io_register_rsrc_update(ctx, arg, nr_args,
  13114					      IORING_RSRC_FILE);
  13115		break;
  13116	case IORING_REGISTER_BUFFERS2:
  13117		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
  13118		break;
  13119	case IORING_REGISTER_BUFFERS_UPDATE:
  13120		ret = io_register_rsrc_update(ctx, arg, nr_args,
  13121					      IORING_RSRC_BUFFER);
  13122		break;
  13123	case IORING_REGISTER_IOWQ_AFF:
  13124		ret = -EINVAL;
  13125		if (!arg || !nr_args)
  13126			break;
  13127		ret = io_register_iowq_aff(ctx, arg, nr_args);
  13128		break;
  13129	case IORING_UNREGISTER_IOWQ_AFF:
  13130		ret = -EINVAL;
  13131		if (arg || nr_args)
  13132			break;
  13133		ret = io_unregister_iowq_aff(ctx);
  13134		break;
  13135	case IORING_REGISTER_IOWQ_MAX_WORKERS:
  13136		ret = -EINVAL;
  13137		if (!arg || nr_args != 2)
  13138			break;
  13139		ret = io_register_iowq_max_workers(ctx, arg);
  13140		break;
  13141	case IORING_REGISTER_RING_FDS:
  13142		ret = io_ringfd_register(ctx, arg, nr_args);
  13143		break;
  13144	case IORING_UNREGISTER_RING_FDS:
  13145		ret = io_ringfd_unregister(ctx, arg, nr_args);
  13146		break;
  13147	case IORING_REGISTER_PBUF_RING:
  13148		ret = -EINVAL;
  13149		if (!arg || nr_args != 1)
  13150			break;
  13151		ret = io_register_pbuf_ring(ctx, arg);
  13152		break;
  13153	case IORING_UNREGISTER_PBUF_RING:
  13154		ret = -EINVAL;
  13155		if (!arg || nr_args != 1)
  13156			break;
  13157		ret = io_unregister_pbuf_ring(ctx, arg);
  13158		break;
  13159	default:
  13160		ret = -EINVAL;
  13161		break;
  13162	}
  13163
  13164	return ret;
  13165}
  13166
  13167SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
  13168		void __user *, arg, unsigned int, nr_args)
  13169{
  13170	struct io_ring_ctx *ctx;
  13171	long ret = -EBADF;
  13172	struct fd f;
  13173
  13174	f = fdget(fd);
  13175	if (!f.file)
  13176		return -EBADF;
  13177
  13178	ret = -EOPNOTSUPP;
  13179	if (f.file->f_op != &io_uring_fops)
  13180		goto out_fput;
  13181
  13182	ctx = f.file->private_data;
  13183
  13184	io_run_task_work();
  13185
  13186	mutex_lock(&ctx->uring_lock);
  13187	ret = __io_uring_register(ctx, opcode, arg, nr_args);
  13188	mutex_unlock(&ctx->uring_lock);
  13189	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
  13190out_fput:
  13191	fdput(f);
  13192	return ret;
  13193}
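
/*
 * Userspace sketch (illustrative only; a hypothetical helper, guarded with
 * #if 0 so it never enters the kernel build): the classic io_uring_register(2)
 * call, registering a fixed file table so later SQEs can reference files by
 * index rather than by fd. Error handling is trimmed.
 */
#if 0
#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static int register_fixed_files(int ring_fd, const int *fds, unsigned int nr)
{
	/* arg points at an array of nr descriptors; -1 marks a sparse slot */
	return syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_FILES,
		       fds, nr);
}
#endif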
  13194
  13195static int __init io_uring_init(void)
  13196{
  13197#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
  13198	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
  13199	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
  13200} while (0)
  13201
  13202#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
  13203	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
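	/*
	 * For example, BUILD_BUG_SQE_ELEM(0, __u8, opcode) expands to compile
	 * time checks that offsetof(struct io_uring_sqe, opcode) == 0 and that
	 * sizeof(__u8) matches sizeof_field(struct io_uring_sqe, opcode).
	 */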
  13204	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
  13205	BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
  13206	BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
  13207	BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
  13208	BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
  13209	BUILD_BUG_SQE_ELEM(8,  __u64,  off);
  13210	BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
  13211	BUILD_BUG_SQE_ELEM(16, __u64,  addr);
  13212	BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
  13213	BUILD_BUG_SQE_ELEM(24, __u32,  len);
  13214	BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
  13215	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
  13216	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
  13217	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
  13218	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
  13219	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
  13220	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
  13221	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
  13222	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
  13223	BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
  13224	BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
  13225	BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
  13226	BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
  13227	BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
  13228	BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
  13229	BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
  13230	BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
  13231	BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
  13232	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
  13233	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
  13234	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
  13235	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
  13236
  13237	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
  13238		     sizeof(struct io_uring_rsrc_update));
  13239	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
  13240		     sizeof(struct io_uring_rsrc_update2));
  13241
  13242	/* ->buf_index is u16 */
  13243	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
  13244	BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE);
  13245	BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
  13246	BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
  13247		     offsetof(struct io_uring_buf_ring, tail));
  13248
  13249	/* should fit into one byte */
  13250	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
  13251	BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
  13252	BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
  13253
  13254	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
  13255	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
  13256
  13257	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
  13258
  13259	BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64);
  13260
  13261	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
  13262				SLAB_ACCOUNT);
  13263	return 0;
   13264}
  13265__initcall(io_uring_init);