cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

file.c (80559B)


      1/*
      2  FUSE: Filesystem in Userspace
      3  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
      4
      5  This program can be distributed under the terms of the GNU GPL.
      6  See the file COPYING.
      7*/
      8
      9#include "fuse_i.h"
     10
     11#include <linux/pagemap.h>
     12#include <linux/slab.h>
     13#include <linux/kernel.h>
     14#include <linux/sched.h>
     15#include <linux/sched/signal.h>
     16#include <linux/module.h>
     17#include <linux/swap.h>
     18#include <linux/falloc.h>
     19#include <linux/uio.h>
     20#include <linux/fs.h>
     21
     22static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
     23			  unsigned int open_flags, int opcode,
     24			  struct fuse_open_out *outargp)
     25{
     26	struct fuse_open_in inarg;
     27	FUSE_ARGS(args);
     28
     29	memset(&inarg, 0, sizeof(inarg));
     30	inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
     31	if (!fm->fc->atomic_o_trunc)
     32		inarg.flags &= ~O_TRUNC;
     33
     34	if (fm->fc->handle_killpriv_v2 &&
     35	    (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) {
     36		inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
     37	}
     38
     39	args.opcode = opcode;
     40	args.nodeid = nodeid;
     41	args.in_numargs = 1;
     42	args.in_args[0].size = sizeof(inarg);
     43	args.in_args[0].value = &inarg;
     44	args.out_numargs = 1;
     45	args.out_args[0].size = sizeof(*outargp);
     46	args.out_args[0].value = outargp;
     47
     48	return fuse_simple_request(fm, &args);
     49}
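
The FUSE_OPEN request built here is answered by the userspace filesystem daemon; the file handle and open flags it returns are copied into ff->fh and ff->open_flags in fuse_file_open() below. A minimal sketch of that userspace side, assuming libfuse's high-level API (example_open and its passthrough behaviour are hypothetical):

	#define FUSE_USE_VERSION 31
	#include <fuse.h>
	#include <fcntl.h>
	#include <errno.h>

	/* Hypothetical handler answering FUSE_OPEN: the returned fi->fh becomes
	 * fuse_open_out.fh, and keep_cache asks the kernel to keep the page
	 * cache for this file (FOPEN_KEEP_CACHE in fuse_open_out.open_flags). */
	static int example_open(const char *path, struct fuse_file_info *fi)
	{
		int fd = open(path, fi->flags);	/* fi->flags mirrors fuse_open_in.flags */

		if (fd < 0)
			return -errno;
		fi->fh = fd;
		fi->keep_cache = 1;
		return 0;
	}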
     50
     51struct fuse_release_args {
     52	struct fuse_args args;
     53	struct fuse_release_in inarg;
     54	struct inode *inode;
     55};
     56
     57struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
     58{
     59	struct fuse_file *ff;
     60
     61	ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
     62	if (unlikely(!ff))
     63		return NULL;
     64
     65	ff->fm = fm;
     66	ff->release_args = kzalloc(sizeof(*ff->release_args),
     67				   GFP_KERNEL_ACCOUNT);
     68	if (!ff->release_args) {
     69		kfree(ff);
     70		return NULL;
     71	}
     72
     73	INIT_LIST_HEAD(&ff->write_entry);
     74	mutex_init(&ff->readdir.lock);
     75	refcount_set(&ff->count, 1);
     76	RB_CLEAR_NODE(&ff->polled_node);
     77	init_waitqueue_head(&ff->poll_wait);
     78
     79	ff->kh = atomic64_inc_return(&fm->fc->khctr);
     80
     81	return ff;
     82}
     83
     84void fuse_file_free(struct fuse_file *ff)
     85{
     86	kfree(ff->release_args);
     87	mutex_destroy(&ff->readdir.lock);
     88	kfree(ff);
     89}
     90
     91static struct fuse_file *fuse_file_get(struct fuse_file *ff)
     92{
     93	refcount_inc(&ff->count);
     94	return ff;
     95}
     96
     97static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
     98			     int error)
     99{
    100	struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
    101
    102	iput(ra->inode);
    103	kfree(ra);
    104}
    105
    106static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
    107{
    108	if (refcount_dec_and_test(&ff->count)) {
    109		struct fuse_args *args = &ff->release_args->args;
    110
    111		if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) {
    112			/* Do nothing when client does not implement 'open' */
    113			fuse_release_end(ff->fm, args, 0);
    114		} else if (sync) {
    115			fuse_simple_request(ff->fm, args);
    116			fuse_release_end(ff->fm, args, 0);
    117		} else {
    118			args->end = fuse_release_end;
    119			if (fuse_simple_background(ff->fm, args,
    120						   GFP_KERNEL | __GFP_NOFAIL))
    121				fuse_release_end(ff->fm, args, -ENOTCONN);
    122		}
    123		kfree(ff);
    124	}
    125}
    126
    127struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
    128				 unsigned int open_flags, bool isdir)
    129{
    130	struct fuse_conn *fc = fm->fc;
    131	struct fuse_file *ff;
    132	int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
    133
    134	ff = fuse_file_alloc(fm);
    135	if (!ff)
    136		return ERR_PTR(-ENOMEM);
    137
    138	ff->fh = 0;
    139	/* Default for no-open */
    140	ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
    141	if (isdir ? !fc->no_opendir : !fc->no_open) {
    142		struct fuse_open_out outarg;
    143		int err;
    144
    145		err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg);
    146		if (!err) {
    147			ff->fh = outarg.fh;
    148			ff->open_flags = outarg.open_flags;
    149
    150		} else if (err != -ENOSYS) {
    151			fuse_file_free(ff);
    152			return ERR_PTR(err);
    153		} else {
    154			if (isdir)
    155				fc->no_opendir = 1;
    156			else
    157				fc->no_open = 1;
    158		}
    159	}
    160
    161	if (isdir)
    162		ff->open_flags &= ~FOPEN_DIRECT_IO;
    163
    164	ff->nodeid = nodeid;
    165
    166	return ff;
    167}
    168
    169int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
    170		 bool isdir)
    171{
    172	struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir);
    173
    174	if (!IS_ERR(ff))
    175		file->private_data = ff;
    176
    177	return PTR_ERR_OR_ZERO(ff);
    178}
    179EXPORT_SYMBOL_GPL(fuse_do_open);
    180
    181static void fuse_link_write_file(struct file *file)
    182{
    183	struct inode *inode = file_inode(file);
    184	struct fuse_inode *fi = get_fuse_inode(inode);
    185	struct fuse_file *ff = file->private_data;
    186	/*
    187	 * file may be written through mmap, so chain it onto the
     188	 * inode's write_files list
    189	 */
    190	spin_lock(&fi->lock);
    191	if (list_empty(&ff->write_entry))
    192		list_add(&ff->write_entry, &fi->write_files);
    193	spin_unlock(&fi->lock);
    194}
    195
    196void fuse_finish_open(struct inode *inode, struct file *file)
    197{
    198	struct fuse_file *ff = file->private_data;
    199	struct fuse_conn *fc = get_fuse_conn(inode);
    200
    201	if (ff->open_flags & FOPEN_STREAM)
    202		stream_open(inode, file);
    203	else if (ff->open_flags & FOPEN_NONSEEKABLE)
    204		nonseekable_open(inode, file);
    205
    206	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
    207		struct fuse_inode *fi = get_fuse_inode(inode);
    208
    209		spin_lock(&fi->lock);
    210		fi->attr_version = atomic64_inc_return(&fc->attr_version);
    211		i_size_write(inode, 0);
    212		spin_unlock(&fi->lock);
    213		truncate_pagecache(inode, 0);
    214		file_update_time(file);
    215		fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
    216	} else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
    217		invalidate_inode_pages2(inode->i_mapping);
    218	}
    219
    220	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
    221		fuse_link_write_file(file);
    222}
    223
    224int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
    225{
    226	struct fuse_mount *fm = get_fuse_mount(inode);
    227	struct fuse_conn *fc = fm->fc;
    228	int err;
    229	bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
    230			  fc->atomic_o_trunc &&
    231			  fc->writeback_cache;
    232	bool dax_truncate = (file->f_flags & O_TRUNC) &&
    233			  fc->atomic_o_trunc && FUSE_IS_DAX(inode);
    234
    235	if (fuse_is_bad(inode))
    236		return -EIO;
    237
    238	err = generic_file_open(inode, file);
    239	if (err)
    240		return err;
    241
    242	if (is_wb_truncate || dax_truncate) {
    243		inode_lock(inode);
    244		fuse_set_nowrite(inode);
    245	}
    246
    247	if (dax_truncate) {
    248		filemap_invalidate_lock(inode->i_mapping);
    249		err = fuse_dax_break_layouts(inode, 0, 0);
    250		if (err)
    251			goto out;
    252	}
    253
    254	err = fuse_do_open(fm, get_node_id(inode), file, isdir);
    255	if (!err)
    256		fuse_finish_open(inode, file);
    257
    258out:
    259	if (dax_truncate)
    260		filemap_invalidate_unlock(inode->i_mapping);
    261
    262	if (is_wb_truncate | dax_truncate) {
    263		fuse_release_nowrite(inode);
    264		inode_unlock(inode);
    265	}
    266
    267	return err;
    268}
    269
    270static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
    271				 unsigned int flags, int opcode)
    272{
    273	struct fuse_conn *fc = ff->fm->fc;
    274	struct fuse_release_args *ra = ff->release_args;
    275
    276	/* Inode is NULL on error path of fuse_create_open() */
    277	if (likely(fi)) {
    278		spin_lock(&fi->lock);
    279		list_del(&ff->write_entry);
    280		spin_unlock(&fi->lock);
    281	}
    282	spin_lock(&fc->lock);
    283	if (!RB_EMPTY_NODE(&ff->polled_node))
    284		rb_erase(&ff->polled_node, &fc->polled_files);
    285	spin_unlock(&fc->lock);
    286
    287	wake_up_interruptible_all(&ff->poll_wait);
    288
    289	ra->inarg.fh = ff->fh;
    290	ra->inarg.flags = flags;
    291	ra->args.in_numargs = 1;
    292	ra->args.in_args[0].size = sizeof(struct fuse_release_in);
    293	ra->args.in_args[0].value = &ra->inarg;
    294	ra->args.opcode = opcode;
    295	ra->args.nodeid = ff->nodeid;
    296	ra->args.force = true;
    297	ra->args.nocreds = true;
    298}
    299
    300void fuse_file_release(struct inode *inode, struct fuse_file *ff,
    301		       unsigned int open_flags, fl_owner_t id, bool isdir)
    302{
    303	struct fuse_inode *fi = get_fuse_inode(inode);
    304	struct fuse_release_args *ra = ff->release_args;
    305	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
    306
    307	fuse_prepare_release(fi, ff, open_flags, opcode);
    308
    309	if (ff->flock) {
    310		ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
    311		ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
    312	}
    313	/* Hold inode until release is finished */
    314	ra->inode = igrab(inode);
    315
    316	/*
    317	 * Normally this will send the RELEASE request, however if
    318	 * some asynchronous READ or WRITE requests are outstanding,
    319	 * the sending will be delayed.
    320	 *
     321	 * Make the release synchronous if this is a fuseblk mount;
    322	 * synchronous RELEASE is allowed (and desirable) in this case
    323	 * because the server can be trusted not to screw up.
    324	 */
    325	fuse_file_put(ff, ff->fm->fc->destroy, isdir);
    326}
    327
    328void fuse_release_common(struct file *file, bool isdir)
    329{
    330	fuse_file_release(file_inode(file), file->private_data, file->f_flags,
    331			  (fl_owner_t) file, isdir);
    332}
    333
    334static int fuse_open(struct inode *inode, struct file *file)
    335{
    336	return fuse_open_common(inode, file, false);
    337}
    338
    339static int fuse_release(struct inode *inode, struct file *file)
    340{
    341	fuse_release_common(file, false);
    342
    343	/* return value is ignored by VFS */
    344	return 0;
    345}
    346
    347void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
    348		       unsigned int flags)
    349{
    350	WARN_ON(refcount_read(&ff->count) > 1);
    351	fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
    352	/*
    353	 * iput(NULL) is a no-op and since the refcount is 1 and everything's
     354	 * synchronous, we are fine with not doing igrab() here
    355	 */
    356	fuse_file_put(ff, true, false);
    357}
    358EXPORT_SYMBOL_GPL(fuse_sync_release);
    359
    360/*
    361 * Scramble the ID space with XTEA, so that the value of the files_struct
    362 * pointer is not exposed to userspace.
    363 */
    364u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
    365{
    366	u32 *k = fc->scramble_key;
    367	u64 v = (unsigned long) id;
    368	u32 v0 = v;
    369	u32 v1 = v >> 32;
    370	u32 sum = 0;
    371	int i;
    372
    373	for (i = 0; i < 32; i++) {
    374		v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
    375		sum += 0x9E3779B9;
    376		v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
    377	}
    378
    379	return (u64) v0 + ((u64) v1 << 32);
    380}
    381
    382struct fuse_writepage_args {
    383	struct fuse_io_args ia;
    384	struct rb_node writepages_entry;
    385	struct list_head queue_entry;
    386	struct fuse_writepage_args *next;
    387	struct inode *inode;
    388	struct fuse_sync_bucket *bucket;
    389};
    390
    391static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
    392					    pgoff_t idx_from, pgoff_t idx_to)
    393{
    394	struct rb_node *n;
    395
    396	n = fi->writepages.rb_node;
    397
    398	while (n) {
    399		struct fuse_writepage_args *wpa;
    400		pgoff_t curr_index;
    401
    402		wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
    403		WARN_ON(get_fuse_inode(wpa->inode) != fi);
    404		curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
    405		if (idx_from >= curr_index + wpa->ia.ap.num_pages)
    406			n = n->rb_right;
    407		else if (idx_to < curr_index)
    408			n = n->rb_left;
    409		else
    410			return wpa;
    411	}
    412	return NULL;
    413}
    414
    415/*
    416 * Check if any page in a range is under writeback
    417 *
    418 * This is currently done by walking the list of writepage requests
    419 * for the inode, which can be pretty inefficient.
    420 */
    421static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
    422				   pgoff_t idx_to)
    423{
    424	struct fuse_inode *fi = get_fuse_inode(inode);
    425	bool found;
    426
    427	spin_lock(&fi->lock);
    428	found = fuse_find_writeback(fi, idx_from, idx_to);
    429	spin_unlock(&fi->lock);
    430
    431	return found;
    432}
    433
    434static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
    435{
    436	return fuse_range_is_writeback(inode, index, index);
    437}
    438
    439/*
    440 * Wait for page writeback to be completed.
    441 *
    442 * Since fuse doesn't rely on the VM writeback tracking, this has to
    443 * use some other means.
    444 */
    445static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
    446{
    447	struct fuse_inode *fi = get_fuse_inode(inode);
    448
    449	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
    450}
    451
    452/*
    453 * Wait for all pending writepages on the inode to finish.
    454 *
    455 * This is currently done by blocking further writes with FUSE_NOWRITE
    456 * and waiting for all sent writes to complete.
    457 *
    458 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
    459 * could conflict with truncation.
    460 */
    461static void fuse_sync_writes(struct inode *inode)
    462{
    463	fuse_set_nowrite(inode);
    464	fuse_release_nowrite(inode);
    465}
    466
    467static int fuse_flush(struct file *file, fl_owner_t id)
    468{
    469	struct inode *inode = file_inode(file);
    470	struct fuse_mount *fm = get_fuse_mount(inode);
    471	struct fuse_file *ff = file->private_data;
    472	struct fuse_flush_in inarg;
    473	FUSE_ARGS(args);
    474	int err;
    475
    476	if (fuse_is_bad(inode))
    477		return -EIO;
    478
    479	if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
    480		return 0;
    481
    482	err = write_inode_now(inode, 1);
    483	if (err)
    484		return err;
    485
    486	inode_lock(inode);
    487	fuse_sync_writes(inode);
    488	inode_unlock(inode);
    489
    490	err = filemap_check_errors(file->f_mapping);
    491	if (err)
    492		return err;
    493
    494	err = 0;
    495	if (fm->fc->no_flush)
    496		goto inval_attr_out;
    497
    498	memset(&inarg, 0, sizeof(inarg));
    499	inarg.fh = ff->fh;
    500	inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
    501	args.opcode = FUSE_FLUSH;
    502	args.nodeid = get_node_id(inode);
    503	args.in_numargs = 1;
    504	args.in_args[0].size = sizeof(inarg);
    505	args.in_args[0].value = &inarg;
    506	args.force = true;
    507
    508	err = fuse_simple_request(fm, &args);
    509	if (err == -ENOSYS) {
    510		fm->fc->no_flush = 1;
    511		err = 0;
    512	}
    513
    514inval_attr_out:
    515	/*
    516	 * In memory i_blocks is not maintained by fuse, if writeback cache is
     517	 * In-memory i_blocks is not maintained by fuse; if writeback cache is
    518	 */
    519	if (!err && fm->fc->writeback_cache)
    520		fuse_invalidate_attr_mask(inode, STATX_BLOCKS);
    521	return err;
    522}
    523
    524int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
    525		      int datasync, int opcode)
    526{
    527	struct inode *inode = file->f_mapping->host;
    528	struct fuse_mount *fm = get_fuse_mount(inode);
    529	struct fuse_file *ff = file->private_data;
    530	FUSE_ARGS(args);
    531	struct fuse_fsync_in inarg;
    532
    533	memset(&inarg, 0, sizeof(inarg));
    534	inarg.fh = ff->fh;
    535	inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
    536	args.opcode = opcode;
    537	args.nodeid = get_node_id(inode);
    538	args.in_numargs = 1;
    539	args.in_args[0].size = sizeof(inarg);
    540	args.in_args[0].value = &inarg;
    541	return fuse_simple_request(fm, &args);
    542}
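
On the daemon side, the FUSE_FSYNC_FDATASYNC bit set above arrives as the isdatasync argument of the fsync callback. A minimal sketch, assuming libfuse's high-level API (example_fsync is hypothetical):

	#define FUSE_USE_VERSION 31
	#include <fuse.h>
	#include <unistd.h>
	#include <errno.h>

	/* Hypothetical handler for FUSE_FSYNC: isdatasync is non-zero exactly
	 * when fsync_flags contained FUSE_FSYNC_FDATASYNC. */
	static int example_fsync(const char *path, int isdatasync,
				 struct fuse_file_info *fi)
	{
		int res = isdatasync ? fdatasync((int)fi->fh) : fsync((int)fi->fh);

		(void)path;
		return res < 0 ? -errno : 0;
	}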
    543
    544static int fuse_fsync(struct file *file, loff_t start, loff_t end,
    545		      int datasync)
    546{
    547	struct inode *inode = file->f_mapping->host;
    548	struct fuse_conn *fc = get_fuse_conn(inode);
    549	int err;
    550
    551	if (fuse_is_bad(inode))
    552		return -EIO;
    553
    554	inode_lock(inode);
    555
    556	/*
    557	 * Start writeback against all dirty pages of the inode, then
    558	 * wait for all outstanding writes, before sending the FSYNC
    559	 * request.
    560	 */
    561	err = file_write_and_wait_range(file, start, end);
    562	if (err)
    563		goto out;
    564
    565	fuse_sync_writes(inode);
    566
    567	/*
    568	 * Due to implementation of fuse writeback
    569	 * file_write_and_wait_range() does not catch errors.
    570	 * We have to do this directly after fuse_sync_writes()
    571	 */
    572	err = file_check_and_advance_wb_err(file);
    573	if (err)
    574		goto out;
    575
    576	err = sync_inode_metadata(inode, 1);
    577	if (err)
    578		goto out;
    579
    580	if (fc->no_fsync)
    581		goto out;
    582
    583	err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
    584	if (err == -ENOSYS) {
    585		fc->no_fsync = 1;
    586		err = 0;
    587	}
    588out:
    589	inode_unlock(inode);
    590
    591	return err;
    592}
    593
    594void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
    595			 size_t count, int opcode)
    596{
    597	struct fuse_file *ff = file->private_data;
    598	struct fuse_args *args = &ia->ap.args;
    599
    600	ia->read.in.fh = ff->fh;
    601	ia->read.in.offset = pos;
    602	ia->read.in.size = count;
    603	ia->read.in.flags = file->f_flags;
    604	args->opcode = opcode;
    605	args->nodeid = ff->nodeid;
    606	args->in_numargs = 1;
    607	args->in_args[0].size = sizeof(ia->read.in);
    608	args->in_args[0].value = &ia->read.in;
    609	args->out_argvar = true;
    610	args->out_numargs = 1;
    611	args->out_args[0].size = count;
    612}
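
Since out_argvar is set, the daemon may reply with fewer bytes than requested; that shorter out_args[0].size is what fuse_short_read() and fuse_readpages_end() below treat as EOF. A minimal sketch of the serving side, assuming libfuse's high-level API (example_read is hypothetical):

	#define FUSE_USE_VERSION 31
	#include <fuse.h>
	#include <unistd.h>
	#include <errno.h>

	/* Hypothetical handler for FUSE_READ: the return value becomes
	 * out_args[0].size; returning less than 'size' signals a short read. */
	static int example_read(const char *path, char *buf, size_t size,
				off_t off, struct fuse_file_info *fi)
	{
		ssize_t n = pread((int)fi->fh, buf, size, off);

		(void)path;
		return n < 0 ? -errno : (int)n;
	}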
    613
    614static void fuse_release_user_pages(struct fuse_args_pages *ap,
    615				    bool should_dirty)
    616{
    617	unsigned int i;
    618
    619	for (i = 0; i < ap->num_pages; i++) {
    620		if (should_dirty)
    621			set_page_dirty_lock(ap->pages[i]);
    622		put_page(ap->pages[i]);
    623	}
    624}
    625
    626static void fuse_io_release(struct kref *kref)
    627{
    628	kfree(container_of(kref, struct fuse_io_priv, refcnt));
    629}
    630
    631static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
    632{
    633	if (io->err)
    634		return io->err;
    635
    636	if (io->bytes >= 0 && io->write)
    637		return -EIO;
    638
    639	return io->bytes < 0 ? io->size : io->bytes;
    640}
    641
    642/**
     643 * In case of a short read, the caller sets 'pos' to the position of the
     644 * actual end of the fuse request within the IO request. Otherwise, if bytes_requested
    645 * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
    646 *
    647 * An example:
    648 * User requested DIO read of 64K. It was split into two 32K fuse requests,
    649 * both submitted asynchronously. The first of them was ACKed by userspace as
    650 * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
    651 * second request was ACKed as short, e.g. only 1K was read, resulting in
    652 * pos == 33K.
    653 *
    654 * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
    655 * will be equal to the length of the longest contiguous fragment of
    656 * transferred data starting from the beginning of IO request.
    657 */
    658static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
    659{
    660	int left;
    661
    662	spin_lock(&io->lock);
    663	if (err)
    664		io->err = io->err ? : err;
    665	else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
    666		io->bytes = pos;
    667
    668	left = --io->reqs;
    669	if (!left && io->blocking)
    670		complete(io->done);
    671	spin_unlock(&io->lock);
    672
    673	if (!left && !io->blocking) {
    674		ssize_t res = fuse_get_res_by_io(io);
    675
    676		if (res >= 0) {
    677			struct inode *inode = file_inode(io->iocb->ki_filp);
    678			struct fuse_conn *fc = get_fuse_conn(inode);
    679			struct fuse_inode *fi = get_fuse_inode(inode);
    680
    681			spin_lock(&fi->lock);
    682			fi->attr_version = atomic64_inc_return(&fc->attr_version);
    683			spin_unlock(&fi->lock);
    684		}
    685
    686		io->iocb->ki_complete(io->iocb, res);
    687	}
    688
    689	kref_put(&io->refcnt, fuse_io_release);
    690}
    691
    692static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
    693					  unsigned int npages)
    694{
    695	struct fuse_io_args *ia;
    696
    697	ia = kzalloc(sizeof(*ia), GFP_KERNEL);
    698	if (ia) {
    699		ia->io = io;
    700		ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL,
    701						&ia->ap.descs);
    702		if (!ia->ap.pages) {
    703			kfree(ia);
    704			ia = NULL;
    705		}
    706	}
    707	return ia;
    708}
    709
    710static void fuse_io_free(struct fuse_io_args *ia)
    711{
    712	kfree(ia->ap.pages);
    713	kfree(ia);
    714}
    715
    716static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
    717				  int err)
    718{
    719	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
    720	struct fuse_io_priv *io = ia->io;
    721	ssize_t pos = -1;
    722
    723	fuse_release_user_pages(&ia->ap, io->should_dirty);
    724
    725	if (err) {
    726		/* Nothing */
    727	} else if (io->write) {
    728		if (ia->write.out.size > ia->write.in.size) {
    729			err = -EIO;
    730		} else if (ia->write.in.size != ia->write.out.size) {
    731			pos = ia->write.in.offset - io->offset +
    732				ia->write.out.size;
    733		}
    734	} else {
    735		u32 outsize = args->out_args[0].size;
    736
    737		if (ia->read.in.size != outsize)
    738			pos = ia->read.in.offset - io->offset + outsize;
    739	}
    740
    741	fuse_aio_complete(io, err, pos);
    742	fuse_io_free(ia);
    743}
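
As a worked instance of the example given above fuse_aio_complete() (a 64K DIO read split into two 32K requests, the second answered short with 1K), the 'pos' computed here for the short request comes out to 33K. A standalone sketch of that arithmetic:

	#include <assert.h>

	int main(void)
	{
		long long io_offset = 0;		/* io->offset: start of the whole DIO request */
		long long req_offset = 32 << 10;	/* ia->read.in.offset of the second request */
		long long outsize = 1 << 10;		/* bytes the daemon actually returned */
		long long pos = req_offset - io_offset + outsize;

		assert(pos == 33 << 10);	/* length of the contiguous transferred prefix */
		return 0;
	}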
    744
    745static ssize_t fuse_async_req_send(struct fuse_mount *fm,
    746				   struct fuse_io_args *ia, size_t num_bytes)
    747{
    748	ssize_t err;
    749	struct fuse_io_priv *io = ia->io;
    750
    751	spin_lock(&io->lock);
    752	kref_get(&io->refcnt);
    753	io->size += num_bytes;
    754	io->reqs++;
    755	spin_unlock(&io->lock);
    756
    757	ia->ap.args.end = fuse_aio_complete_req;
    758	ia->ap.args.may_block = io->should_dirty;
    759	err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
    760	if (err)
    761		fuse_aio_complete_req(fm, &ia->ap.args, err);
    762
    763	return num_bytes;
    764}
    765
    766static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
    767			      fl_owner_t owner)
    768{
    769	struct file *file = ia->io->iocb->ki_filp;
    770	struct fuse_file *ff = file->private_data;
    771	struct fuse_mount *fm = ff->fm;
    772
    773	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
    774	if (owner != NULL) {
    775		ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
    776		ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner);
    777	}
    778
    779	if (ia->io->async)
    780		return fuse_async_req_send(fm, ia, count);
    781
    782	return fuse_simple_request(fm, &ia->ap.args);
    783}
    784
    785static void fuse_read_update_size(struct inode *inode, loff_t size,
    786				  u64 attr_ver)
    787{
    788	struct fuse_conn *fc = get_fuse_conn(inode);
    789	struct fuse_inode *fi = get_fuse_inode(inode);
    790
    791	spin_lock(&fi->lock);
    792	if (attr_ver >= fi->attr_version && size < inode->i_size &&
    793	    !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
    794		fi->attr_version = atomic64_inc_return(&fc->attr_version);
    795		i_size_write(inode, size);
    796	}
    797	spin_unlock(&fi->lock);
    798}
    799
    800static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
    801			    struct fuse_args_pages *ap)
    802{
    803	struct fuse_conn *fc = get_fuse_conn(inode);
    804
    805	/*
    806	 * If writeback_cache is enabled, a short read means there's a hole in
    807	 * the file.  Some data after the hole is in page cache, but has not
    808	 * reached the client fs yet.  So the hole is not present there.
    809	 */
    810	if (!fc->writeback_cache) {
    811		loff_t pos = page_offset(ap->pages[0]) + num_read;
    812		fuse_read_update_size(inode, pos, attr_ver);
    813	}
    814}
    815
    816static int fuse_do_readpage(struct file *file, struct page *page)
    817{
    818	struct inode *inode = page->mapping->host;
    819	struct fuse_mount *fm = get_fuse_mount(inode);
    820	loff_t pos = page_offset(page);
    821	struct fuse_page_desc desc = { .length = PAGE_SIZE };
    822	struct fuse_io_args ia = {
    823		.ap.args.page_zeroing = true,
    824		.ap.args.out_pages = true,
    825		.ap.num_pages = 1,
    826		.ap.pages = &page,
    827		.ap.descs = &desc,
    828	};
    829	ssize_t res;
    830	u64 attr_ver;
    831
    832	/*
    833	 * Page writeback can extend beyond the lifetime of the
    834	 * page-cache page, so make sure we read a properly synced
    835	 * page.
    836	 */
    837	fuse_wait_on_page_writeback(inode, page->index);
    838
    839	attr_ver = fuse_get_attr_version(fm->fc);
    840
    841	/* Don't overflow end offset */
    842	if (pos + (desc.length - 1) == LLONG_MAX)
    843		desc.length--;
    844
    845	fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
    846	res = fuse_simple_request(fm, &ia.ap.args);
    847	if (res < 0)
    848		return res;
    849	/*
    850	 * Short read means EOF.  If file size is larger, truncate it
    851	 */
    852	if (res < desc.length)
    853		fuse_short_read(inode, attr_ver, res, &ia.ap);
    854
    855	SetPageUptodate(page);
    856
    857	return 0;
    858}
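
The end-offset guard above shortens the read by one byte only when the page's last byte sits exactly at LLONG_MAX, so pos plus the transfer length cannot wrap a signed loff_t. A standalone sketch of that boundary case, assuming 4K pages:

	#include <assert.h>
	#include <limits.h>

	int main(void)
	{
		long long pos = LLONG_MAX - 4095;	/* offset of the last possible 4K page */
		unsigned int length = 4096;

		assert(pos + (long long)(length - 1) == LLONG_MAX);	/* guard condition triggers */
		length--;
		assert(pos + (long long)length == LLONG_MAX);	/* end offset no longer overflows */
		return 0;
	}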
    859
    860static int fuse_read_folio(struct file *file, struct folio *folio)
    861{
    862	struct page *page = &folio->page;
    863	struct inode *inode = page->mapping->host;
    864	int err;
    865
    866	err = -EIO;
    867	if (fuse_is_bad(inode))
    868		goto out;
    869
    870	err = fuse_do_readpage(file, page);
    871	fuse_invalidate_atime(inode);
    872 out:
    873	unlock_page(page);
    874	return err;
    875}
    876
    877static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
    878			       int err)
    879{
    880	int i;
    881	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
    882	struct fuse_args_pages *ap = &ia->ap;
    883	size_t count = ia->read.in.size;
    884	size_t num_read = args->out_args[0].size;
    885	struct address_space *mapping = NULL;
    886
    887	for (i = 0; mapping == NULL && i < ap->num_pages; i++)
    888		mapping = ap->pages[i]->mapping;
    889
    890	if (mapping) {
    891		struct inode *inode = mapping->host;
    892
    893		/*
    894		 * Short read means EOF. If file size is larger, truncate it
    895		 */
    896		if (!err && num_read < count)
    897			fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
    898
    899		fuse_invalidate_atime(inode);
    900	}
    901
    902	for (i = 0; i < ap->num_pages; i++) {
    903		struct page *page = ap->pages[i];
    904
    905		if (!err)
    906			SetPageUptodate(page);
    907		else
    908			SetPageError(page);
    909		unlock_page(page);
    910		put_page(page);
    911	}
    912	if (ia->ff)
    913		fuse_file_put(ia->ff, false, false);
    914
    915	fuse_io_free(ia);
    916}
    917
    918static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
    919{
    920	struct fuse_file *ff = file->private_data;
    921	struct fuse_mount *fm = ff->fm;
    922	struct fuse_args_pages *ap = &ia->ap;
    923	loff_t pos = page_offset(ap->pages[0]);
    924	size_t count = ap->num_pages << PAGE_SHIFT;
    925	ssize_t res;
    926	int err;
    927
    928	ap->args.out_pages = true;
    929	ap->args.page_zeroing = true;
    930	ap->args.page_replace = true;
    931
    932	/* Don't overflow end offset */
    933	if (pos + (count - 1) == LLONG_MAX) {
    934		count--;
    935		ap->descs[ap->num_pages - 1].length--;
    936	}
    937	WARN_ON((loff_t) (pos + count) < 0);
    938
    939	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
    940	ia->read.attr_ver = fuse_get_attr_version(fm->fc);
    941	if (fm->fc->async_read) {
    942		ia->ff = fuse_file_get(ff);
    943		ap->args.end = fuse_readpages_end;
    944		err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
    945		if (!err)
    946			return;
    947	} else {
    948		res = fuse_simple_request(fm, &ap->args);
    949		err = res < 0 ? res : 0;
    950	}
    951	fuse_readpages_end(fm, &ap->args, err);
    952}
    953
    954static void fuse_readahead(struct readahead_control *rac)
    955{
    956	struct inode *inode = rac->mapping->host;
    957	struct fuse_conn *fc = get_fuse_conn(inode);
    958	unsigned int i, max_pages, nr_pages = 0;
    959
    960	if (fuse_is_bad(inode))
    961		return;
    962
    963	max_pages = min_t(unsigned int, fc->max_pages,
    964			fc->max_read / PAGE_SIZE);
    965
    966	for (;;) {
    967		struct fuse_io_args *ia;
    968		struct fuse_args_pages *ap;
    969
    970		if (fc->num_background >= fc->congestion_threshold &&
    971		    rac->ra->async_size >= readahead_count(rac))
    972			/*
    973			 * Congested and only async pages left, so skip the
    974			 * rest.
    975			 */
    976			break;
    977
    978		nr_pages = readahead_count(rac) - nr_pages;
    979		if (nr_pages > max_pages)
    980			nr_pages = max_pages;
    981		if (nr_pages == 0)
    982			break;
    983		ia = fuse_io_alloc(NULL, nr_pages);
    984		if (!ia)
    985			return;
    986		ap = &ia->ap;
    987		nr_pages = __readahead_batch(rac, ap->pages, nr_pages);
    988		for (i = 0; i < nr_pages; i++) {
    989			fuse_wait_on_page_writeback(inode,
    990						    readahead_index(rac) + i);
    991			ap->descs[i].length = PAGE_SIZE;
    992		}
    993		ap->num_pages = nr_pages;
    994		fuse_send_readpages(ia, rac->file);
    995	}
    996}
    997
    998static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
    999{
   1000	struct inode *inode = iocb->ki_filp->f_mapping->host;
   1001	struct fuse_conn *fc = get_fuse_conn(inode);
   1002
   1003	/*
   1004	 * In auto invalidate mode, always update attributes on read.
   1005	 * Otherwise, only update if we attempt to read past EOF (to ensure
   1006	 * i_size is up to date).
   1007	 */
   1008	if (fc->auto_inval_data ||
   1009	    (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
   1010		int err;
   1011		err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE);
   1012		if (err)
   1013			return err;
   1014	}
   1015
   1016	return generic_file_read_iter(iocb, to);
   1017}
   1018
   1019static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
   1020				 loff_t pos, size_t count)
   1021{
   1022	struct fuse_args *args = &ia->ap.args;
   1023
   1024	ia->write.in.fh = ff->fh;
   1025	ia->write.in.offset = pos;
   1026	ia->write.in.size = count;
   1027	args->opcode = FUSE_WRITE;
   1028	args->nodeid = ff->nodeid;
   1029	args->in_numargs = 2;
   1030	if (ff->fm->fc->minor < 9)
   1031		args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
   1032	else
   1033		args->in_args[0].size = sizeof(ia->write.in);
   1034	args->in_args[0].value = &ia->write.in;
   1035	args->in_args[1].size = count;
   1036	args->out_numargs = 1;
   1037	args->out_args[0].size = sizeof(ia->write.out);
   1038	args->out_args[0].value = &ia->write.out;
   1039}
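
The daemon answers FUSE_WRITE with fuse_write_out.size, the number of bytes it accepted; fuse_send_write() and fuse_send_write_pages() below treat a larger value as -EIO and a smaller one as a short write. A minimal sketch of the serving side, assuming libfuse's high-level API (example_write is hypothetical):

	#define FUSE_USE_VERSION 31
	#include <fuse.h>
	#include <unistd.h>
	#include <errno.h>

	/* Hypothetical handler for FUSE_WRITE: the return value becomes
	 * fuse_write_out.size and must not exceed 'size'. */
	static int example_write(const char *path, const char *buf, size_t size,
				 off_t off, struct fuse_file_info *fi)
	{
		ssize_t n = pwrite((int)fi->fh, buf, size, off);

		(void)path;
		return n < 0 ? -errno : (int)n;
	}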
   1040
   1041static unsigned int fuse_write_flags(struct kiocb *iocb)
   1042{
   1043	unsigned int flags = iocb->ki_filp->f_flags;
   1044
   1045	if (iocb->ki_flags & IOCB_DSYNC)
   1046		flags |= O_DSYNC;
   1047	if (iocb->ki_flags & IOCB_SYNC)
   1048		flags |= O_SYNC;
   1049
   1050	return flags;
   1051}
   1052
   1053static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
   1054			       size_t count, fl_owner_t owner)
   1055{
   1056	struct kiocb *iocb = ia->io->iocb;
   1057	struct file *file = iocb->ki_filp;
   1058	struct fuse_file *ff = file->private_data;
   1059	struct fuse_mount *fm = ff->fm;
   1060	struct fuse_write_in *inarg = &ia->write.in;
   1061	ssize_t err;
   1062
   1063	fuse_write_args_fill(ia, ff, pos, count);
   1064	inarg->flags = fuse_write_flags(iocb);
   1065	if (owner != NULL) {
   1066		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
   1067		inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner);
   1068	}
   1069
   1070	if (ia->io->async)
   1071		return fuse_async_req_send(fm, ia, count);
   1072
   1073	err = fuse_simple_request(fm, &ia->ap.args);
   1074	if (!err && ia->write.out.size > count)
   1075		err = -EIO;
   1076
   1077	return err ?: ia->write.out.size;
   1078}
   1079
   1080bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written)
   1081{
   1082	struct fuse_conn *fc = get_fuse_conn(inode);
   1083	struct fuse_inode *fi = get_fuse_inode(inode);
   1084	bool ret = false;
   1085
   1086	spin_lock(&fi->lock);
   1087	fi->attr_version = atomic64_inc_return(&fc->attr_version);
   1088	if (written > 0 && pos > inode->i_size) {
   1089		i_size_write(inode, pos);
   1090		ret = true;
   1091	}
   1092	spin_unlock(&fi->lock);
   1093
   1094	fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
   1095
   1096	return ret;
   1097}
   1098
   1099static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
   1100				     struct kiocb *iocb, struct inode *inode,
   1101				     loff_t pos, size_t count)
   1102{
   1103	struct fuse_args_pages *ap = &ia->ap;
   1104	struct file *file = iocb->ki_filp;
   1105	struct fuse_file *ff = file->private_data;
   1106	struct fuse_mount *fm = ff->fm;
   1107	unsigned int offset, i;
   1108	bool short_write;
   1109	int err;
   1110
   1111	for (i = 0; i < ap->num_pages; i++)
   1112		fuse_wait_on_page_writeback(inode, ap->pages[i]->index);
   1113
   1114	fuse_write_args_fill(ia, ff, pos, count);
   1115	ia->write.in.flags = fuse_write_flags(iocb);
   1116	if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID))
   1117		ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;
   1118
   1119	err = fuse_simple_request(fm, &ap->args);
   1120	if (!err && ia->write.out.size > count)
   1121		err = -EIO;
   1122
   1123	short_write = ia->write.out.size < count;
   1124	offset = ap->descs[0].offset;
   1125	count = ia->write.out.size;
   1126	for (i = 0; i < ap->num_pages; i++) {
   1127		struct page *page = ap->pages[i];
   1128
   1129		if (err) {
   1130			ClearPageUptodate(page);
   1131		} else {
   1132			if (count >= PAGE_SIZE - offset)
   1133				count -= PAGE_SIZE - offset;
   1134			else {
   1135				if (short_write)
   1136					ClearPageUptodate(page);
   1137				count = 0;
   1138			}
   1139			offset = 0;
   1140		}
   1141		if (ia->write.page_locked && (i == ap->num_pages - 1))
   1142			unlock_page(page);
   1143		put_page(page);
   1144	}
   1145
   1146	return err;
   1147}
   1148
   1149static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
   1150				     struct address_space *mapping,
   1151				     struct iov_iter *ii, loff_t pos,
   1152				     unsigned int max_pages)
   1153{
   1154	struct fuse_args_pages *ap = &ia->ap;
   1155	struct fuse_conn *fc = get_fuse_conn(mapping->host);
   1156	unsigned offset = pos & (PAGE_SIZE - 1);
   1157	size_t count = 0;
   1158	int err;
   1159
   1160	ap->args.in_pages = true;
   1161	ap->descs[0].offset = offset;
   1162
   1163	do {
   1164		size_t tmp;
   1165		struct page *page;
   1166		pgoff_t index = pos >> PAGE_SHIFT;
   1167		size_t bytes = min_t(size_t, PAGE_SIZE - offset,
   1168				     iov_iter_count(ii));
   1169
   1170		bytes = min_t(size_t, bytes, fc->max_write - count);
   1171
   1172 again:
   1173		err = -EFAULT;
   1174		if (fault_in_iov_iter_readable(ii, bytes))
   1175			break;
   1176
   1177		err = -ENOMEM;
   1178		page = grab_cache_page_write_begin(mapping, index);
   1179		if (!page)
   1180			break;
   1181
   1182		if (mapping_writably_mapped(mapping))
   1183			flush_dcache_page(page);
   1184
   1185		tmp = copy_page_from_iter_atomic(page, offset, bytes, ii);
   1186		flush_dcache_page(page);
   1187
   1188		if (!tmp) {
   1189			unlock_page(page);
   1190			put_page(page);
   1191			goto again;
   1192		}
   1193
   1194		err = 0;
   1195		ap->pages[ap->num_pages] = page;
   1196		ap->descs[ap->num_pages].length = tmp;
   1197		ap->num_pages++;
   1198
   1199		count += tmp;
   1200		pos += tmp;
   1201		offset += tmp;
   1202		if (offset == PAGE_SIZE)
   1203			offset = 0;
   1204
   1205		/* If we copied full page, mark it uptodate */
   1206		if (tmp == PAGE_SIZE)
   1207			SetPageUptodate(page);
   1208
   1209		if (PageUptodate(page)) {
   1210			unlock_page(page);
   1211		} else {
   1212			ia->write.page_locked = true;
   1213			break;
   1214		}
   1215		if (!fc->big_writes)
   1216			break;
   1217	} while (iov_iter_count(ii) && count < fc->max_write &&
   1218		 ap->num_pages < max_pages && offset == 0);
   1219
   1220	return count > 0 ? count : err;
   1221}
   1222
   1223static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
   1224				     unsigned int max_pages)
   1225{
   1226	return min_t(unsigned int,
   1227		     ((pos + len - 1) >> PAGE_SHIFT) -
   1228		     (pos >> PAGE_SHIFT) + 1,
   1229		     max_pages);
   1230}
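
fuse_wr_pages() counts the page-cache pages spanned by [pos, pos + len), capped at max_pages; a 2-byte write starting at offset 4095, for example, straddles a page boundary and needs two pages. A standalone sketch of that arithmetic, assuming 4K pages:

	#include <assert.h>

	int main(void)
	{
		unsigned int shift = 12;	/* PAGE_SHIFT with 4K pages */
		long long pos = 4095, len = 2;	/* write straddling the first page boundary */
		unsigned int pages = (unsigned int)(((pos + len - 1) >> shift) -
						    (pos >> shift) + 1);

		assert(pages == 2);	/* one byte in page 0, one byte in page 1 */
		return 0;
	}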
   1231
   1232static ssize_t fuse_perform_write(struct kiocb *iocb,
   1233				  struct address_space *mapping,
   1234				  struct iov_iter *ii, loff_t pos)
   1235{
   1236	struct inode *inode = mapping->host;
   1237	struct fuse_conn *fc = get_fuse_conn(inode);
   1238	struct fuse_inode *fi = get_fuse_inode(inode);
   1239	int err = 0;
   1240	ssize_t res = 0;
   1241
   1242	if (inode->i_size < pos + iov_iter_count(ii))
   1243		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
   1244
   1245	do {
   1246		ssize_t count;
   1247		struct fuse_io_args ia = {};
   1248		struct fuse_args_pages *ap = &ia.ap;
   1249		unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
   1250						      fc->max_pages);
   1251
   1252		ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs);
   1253		if (!ap->pages) {
   1254			err = -ENOMEM;
   1255			break;
   1256		}
   1257
   1258		count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
   1259		if (count <= 0) {
   1260			err = count;
   1261		} else {
   1262			err = fuse_send_write_pages(&ia, iocb, inode,
   1263						    pos, count);
   1264			if (!err) {
   1265				size_t num_written = ia.write.out.size;
   1266
   1267				res += num_written;
   1268				pos += num_written;
   1269
   1270				/* break out of the loop on short write */
   1271				if (num_written != count)
   1272					err = -EIO;
   1273			}
   1274		}
   1275		kfree(ap->pages);
   1276	} while (!err && iov_iter_count(ii));
   1277
   1278	fuse_write_update_attr(inode, pos, res);
   1279	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
   1280
   1281	return res > 0 ? res : err;
   1282}
   1283
   1284static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
   1285{
   1286	struct file *file = iocb->ki_filp;
   1287	struct address_space *mapping = file->f_mapping;
   1288	ssize_t written = 0;
   1289	ssize_t written_buffered = 0;
   1290	struct inode *inode = mapping->host;
   1291	ssize_t err;
   1292	struct fuse_conn *fc = get_fuse_conn(inode);
   1293	loff_t endbyte = 0;
   1294
   1295	if (fc->writeback_cache) {
   1296		/* Update size (EOF optimization) and mode (SUID clearing) */
   1297		err = fuse_update_attributes(mapping->host, file,
   1298					     STATX_SIZE | STATX_MODE);
   1299		if (err)
   1300			return err;
   1301
   1302		if (fc->handle_killpriv_v2 &&
   1303		    should_remove_suid(file_dentry(file))) {
   1304			goto writethrough;
   1305		}
   1306
   1307		return generic_file_write_iter(iocb, from);
   1308	}
   1309
   1310writethrough:
   1311	inode_lock(inode);
   1312
   1313	/* We can write back this queue in page reclaim */
   1314	current->backing_dev_info = inode_to_bdi(inode);
   1315
   1316	err = generic_write_checks(iocb, from);
   1317	if (err <= 0)
   1318		goto out;
   1319
   1320	err = file_remove_privs(file);
   1321	if (err)
   1322		goto out;
   1323
   1324	err = file_update_time(file);
   1325	if (err)
   1326		goto out;
   1327
   1328	if (iocb->ki_flags & IOCB_DIRECT) {
   1329		loff_t pos = iocb->ki_pos;
   1330		written = generic_file_direct_write(iocb, from);
   1331		if (written < 0 || !iov_iter_count(from))
   1332			goto out;
   1333
   1334		pos += written;
   1335
   1336		written_buffered = fuse_perform_write(iocb, mapping, from, pos);
   1337		if (written_buffered < 0) {
   1338			err = written_buffered;
   1339			goto out;
   1340		}
   1341		endbyte = pos + written_buffered - 1;
   1342
   1343		err = filemap_write_and_wait_range(file->f_mapping, pos,
   1344						   endbyte);
   1345		if (err)
   1346			goto out;
   1347
   1348		invalidate_mapping_pages(file->f_mapping,
   1349					 pos >> PAGE_SHIFT,
   1350					 endbyte >> PAGE_SHIFT);
   1351
   1352		written += written_buffered;
   1353		iocb->ki_pos = pos + written_buffered;
   1354	} else {
   1355		written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos);
   1356		if (written >= 0)
   1357			iocb->ki_pos += written;
   1358	}
   1359out:
   1360	current->backing_dev_info = NULL;
   1361	inode_unlock(inode);
   1362	if (written > 0)
   1363		written = generic_write_sync(iocb, written);
   1364
   1365	return written ? written : err;
   1366}
   1367
   1368static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
   1369{
   1370	return (unsigned long)ii->iov->iov_base + ii->iov_offset;
   1371}
   1372
   1373static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
   1374					size_t max_size)
   1375{
   1376	return min(iov_iter_single_seg_count(ii), max_size);
   1377}
   1378
   1379static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
   1380			       size_t *nbytesp, int write,
   1381			       unsigned int max_pages)
   1382{
   1383	size_t nbytes = 0;  /* # bytes already packed in req */
   1384	ssize_t ret = 0;
   1385
   1386	/* Special case for kernel I/O: can copy directly into the buffer */
   1387	if (iov_iter_is_kvec(ii)) {
   1388		unsigned long user_addr = fuse_get_user_addr(ii);
   1389		size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
   1390
   1391		if (write)
   1392			ap->args.in_args[1].value = (void *) user_addr;
   1393		else
   1394			ap->args.out_args[0].value = (void *) user_addr;
   1395
   1396		iov_iter_advance(ii, frag_size);
   1397		*nbytesp = frag_size;
   1398		return 0;
   1399	}
   1400
   1401	while (nbytes < *nbytesp && ap->num_pages < max_pages) {
   1402		unsigned npages;
   1403		size_t start;
   1404		ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages],
   1405					*nbytesp - nbytes,
   1406					max_pages - ap->num_pages,
   1407					&start);
   1408		if (ret < 0)
   1409			break;
   1410
   1411		iov_iter_advance(ii, ret);
   1412		nbytes += ret;
   1413
   1414		ret += start;
   1415		npages = DIV_ROUND_UP(ret, PAGE_SIZE);
   1416
   1417		ap->descs[ap->num_pages].offset = start;
   1418		fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);
   1419
   1420		ap->num_pages += npages;
   1421		ap->descs[ap->num_pages - 1].length -=
   1422			(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
   1423	}
   1424
   1425	ap->args.user_pages = true;
   1426	if (write)
   1427		ap->args.in_pages = true;
   1428	else
   1429		ap->args.out_pages = true;
   1430
   1431	*nbytesp = nbytes;
   1432
   1433	return ret < 0 ? ret : 0;
   1434}
   1435
   1436ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
   1437		       loff_t *ppos, int flags)
   1438{
   1439	int write = flags & FUSE_DIO_WRITE;
   1440	int cuse = flags & FUSE_DIO_CUSE;
   1441	struct file *file = io->iocb->ki_filp;
   1442	struct inode *inode = file->f_mapping->host;
   1443	struct fuse_file *ff = file->private_data;
   1444	struct fuse_conn *fc = ff->fm->fc;
   1445	size_t nmax = write ? fc->max_write : fc->max_read;
   1446	loff_t pos = *ppos;
   1447	size_t count = iov_iter_count(iter);
   1448	pgoff_t idx_from = pos >> PAGE_SHIFT;
   1449	pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
   1450	ssize_t res = 0;
   1451	int err = 0;
   1452	struct fuse_io_args *ia;
   1453	unsigned int max_pages;
   1454
   1455	max_pages = iov_iter_npages(iter, fc->max_pages);
   1456	ia = fuse_io_alloc(io, max_pages);
   1457	if (!ia)
   1458		return -ENOMEM;
   1459
   1460	if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
   1461		if (!write)
   1462			inode_lock(inode);
   1463		fuse_sync_writes(inode);
   1464		if (!write)
   1465			inode_unlock(inode);
   1466	}
   1467
   1468	io->should_dirty = !write && iter_is_iovec(iter);
   1469	while (count) {
   1470		ssize_t nres;
   1471		fl_owner_t owner = current->files;
   1472		size_t nbytes = min(count, nmax);
   1473
   1474		err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
   1475					  max_pages);
   1476		if (err && !nbytes)
   1477			break;
   1478
   1479		if (write) {
   1480			if (!capable(CAP_FSETID))
   1481				ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;
   1482
   1483			nres = fuse_send_write(ia, pos, nbytes, owner);
   1484		} else {
   1485			nres = fuse_send_read(ia, pos, nbytes, owner);
   1486		}
   1487
   1488		if (!io->async || nres < 0) {
   1489			fuse_release_user_pages(&ia->ap, io->should_dirty);
   1490			fuse_io_free(ia);
   1491		}
   1492		ia = NULL;
   1493		if (nres < 0) {
   1494			iov_iter_revert(iter, nbytes);
   1495			err = nres;
   1496			break;
   1497		}
   1498		WARN_ON(nres > nbytes);
   1499
   1500		count -= nres;
   1501		res += nres;
   1502		pos += nres;
   1503		if (nres != nbytes) {
   1504			iov_iter_revert(iter, nbytes - nres);
   1505			break;
   1506		}
   1507		if (count) {
   1508			max_pages = iov_iter_npages(iter, fc->max_pages);
   1509			ia = fuse_io_alloc(io, max_pages);
   1510			if (!ia)
   1511				break;
   1512		}
   1513	}
   1514	if (ia)
   1515		fuse_io_free(ia);
   1516	if (res > 0)
   1517		*ppos = pos;
   1518
   1519	return res > 0 ? res : err;
   1520}
   1521EXPORT_SYMBOL_GPL(fuse_direct_io);
   1522
   1523static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
   1524				  struct iov_iter *iter,
   1525				  loff_t *ppos)
   1526{
   1527	ssize_t res;
   1528	struct inode *inode = file_inode(io->iocb->ki_filp);
   1529
   1530	res = fuse_direct_io(io, iter, ppos, 0);
   1531
   1532	fuse_invalidate_atime(inode);
   1533
   1534	return res;
   1535}
   1536
   1537static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
   1538
   1539static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
   1540{
   1541	ssize_t res;
   1542
   1543	if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
   1544		res = fuse_direct_IO(iocb, to);
   1545	} else {
   1546		struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
   1547
   1548		res = __fuse_direct_read(&io, to, &iocb->ki_pos);
   1549	}
   1550
   1551	return res;
   1552}
   1553
   1554static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
   1555{
   1556	struct inode *inode = file_inode(iocb->ki_filp);
   1557	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
   1558	ssize_t res;
   1559
   1560	/* Don't allow parallel writes to the same file */
   1561	inode_lock(inode);
   1562	res = generic_write_checks(iocb, from);
   1563	if (res > 0) {
   1564		if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
   1565			res = fuse_direct_IO(iocb, from);
   1566		} else {
   1567			res = fuse_direct_io(&io, from, &iocb->ki_pos,
   1568					     FUSE_DIO_WRITE);
   1569			fuse_write_update_attr(inode, iocb->ki_pos, res);
   1570		}
   1571	}
   1572	inode_unlock(inode);
   1573
   1574	return res;
   1575}
   1576
   1577static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
   1578{
   1579	struct file *file = iocb->ki_filp;
   1580	struct fuse_file *ff = file->private_data;
   1581	struct inode *inode = file_inode(file);
   1582
   1583	if (fuse_is_bad(inode))
   1584		return -EIO;
   1585
   1586	if (FUSE_IS_DAX(inode))
   1587		return fuse_dax_read_iter(iocb, to);
   1588
   1589	if (!(ff->open_flags & FOPEN_DIRECT_IO))
   1590		return fuse_cache_read_iter(iocb, to);
   1591	else
   1592		return fuse_direct_read_iter(iocb, to);
   1593}
   1594
   1595static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
   1596{
   1597	struct file *file = iocb->ki_filp;
   1598	struct fuse_file *ff = file->private_data;
   1599	struct inode *inode = file_inode(file);
   1600
   1601	if (fuse_is_bad(inode))
   1602		return -EIO;
   1603
   1604	if (FUSE_IS_DAX(inode))
   1605		return fuse_dax_write_iter(iocb, from);
   1606
   1607	if (!(ff->open_flags & FOPEN_DIRECT_IO))
   1608		return fuse_cache_write_iter(iocb, from);
   1609	else
   1610		return fuse_direct_write_iter(iocb, from);
   1611}
   1612
   1613static void fuse_writepage_free(struct fuse_writepage_args *wpa)
   1614{
   1615	struct fuse_args_pages *ap = &wpa->ia.ap;
   1616	int i;
   1617
   1618	if (wpa->bucket)
   1619		fuse_sync_bucket_dec(wpa->bucket);
   1620
   1621	for (i = 0; i < ap->num_pages; i++)
   1622		__free_page(ap->pages[i]);
   1623
   1624	if (wpa->ia.ff)
   1625		fuse_file_put(wpa->ia.ff, false, false);
   1626
   1627	kfree(ap->pages);
   1628	kfree(wpa);
   1629}
   1630
   1631static void fuse_writepage_finish(struct fuse_mount *fm,
   1632				  struct fuse_writepage_args *wpa)
   1633{
   1634	struct fuse_args_pages *ap = &wpa->ia.ap;
   1635	struct inode *inode = wpa->inode;
   1636	struct fuse_inode *fi = get_fuse_inode(inode);
   1637	struct backing_dev_info *bdi = inode_to_bdi(inode);
   1638	int i;
   1639
   1640	for (i = 0; i < ap->num_pages; i++) {
   1641		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
   1642		dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
   1643		wb_writeout_inc(&bdi->wb);
   1644	}
   1645	wake_up(&fi->page_waitq);
   1646}
   1647
   1648/* Called under fi->lock, may release and reacquire it */
   1649static void fuse_send_writepage(struct fuse_mount *fm,
   1650				struct fuse_writepage_args *wpa, loff_t size)
   1651__releases(fi->lock)
   1652__acquires(fi->lock)
   1653{
   1654	struct fuse_writepage_args *aux, *next;
   1655	struct fuse_inode *fi = get_fuse_inode(wpa->inode);
   1656	struct fuse_write_in *inarg = &wpa->ia.write.in;
   1657	struct fuse_args *args = &wpa->ia.ap.args;
   1658	__u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE;
   1659	int err;
   1660
   1661	fi->writectr++;
   1662	if (inarg->offset + data_size <= size) {
   1663		inarg->size = data_size;
   1664	} else if (inarg->offset < size) {
   1665		inarg->size = size - inarg->offset;
   1666	} else {
   1667		/* Got truncated off completely */
   1668		goto out_free;
   1669	}
   1670
   1671	args->in_args[1].size = inarg->size;
   1672	args->force = true;
   1673	args->nocreds = true;
   1674
   1675	err = fuse_simple_background(fm, args, GFP_ATOMIC);
   1676	if (err == -ENOMEM) {
   1677		spin_unlock(&fi->lock);
   1678		err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL);
   1679		spin_lock(&fi->lock);
   1680	}
   1681
   1682	/* Fails on broken connection only */
   1683	if (unlikely(err))
   1684		goto out_free;
   1685
   1686	return;
   1687
   1688 out_free:
   1689	fi->writectr--;
   1690	rb_erase(&wpa->writepages_entry, &fi->writepages);
   1691	fuse_writepage_finish(fm, wpa);
   1692	spin_unlock(&fi->lock);
   1693
   1694	/* After fuse_writepage_finish() aux request list is private */
   1695	for (aux = wpa->next; aux; aux = next) {
   1696		next = aux->next;
   1697		aux->next = NULL;
   1698		fuse_writepage_free(aux);
   1699	}
   1700
   1701	fuse_writepage_free(wpa);
   1702	spin_lock(&fi->lock);
   1703}
   1704
   1705/*
   1706 * If fi->writectr is positive (no truncate or fsync going on) send
   1707 * all queued writepage requests.
   1708 *
   1709 * Called with fi->lock
   1710 */
   1711void fuse_flush_writepages(struct inode *inode)
   1712__releases(fi->lock)
   1713__acquires(fi->lock)
   1714{
   1715	struct fuse_mount *fm = get_fuse_mount(inode);
   1716	struct fuse_inode *fi = get_fuse_inode(inode);
   1717	loff_t crop = i_size_read(inode);
   1718	struct fuse_writepage_args *wpa;
   1719
   1720	while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
   1721		wpa = list_entry(fi->queued_writes.next,
   1722				 struct fuse_writepage_args, queue_entry);
   1723		list_del_init(&wpa->queue_entry);
   1724		fuse_send_writepage(fm, wpa, crop);
   1725	}
   1726}
   1727
   1728static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
   1729						struct fuse_writepage_args *wpa)
   1730{
   1731	pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
   1732	pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1;
   1733	struct rb_node **p = &root->rb_node;
   1734	struct rb_node  *parent = NULL;
   1735
   1736	WARN_ON(!wpa->ia.ap.num_pages);
   1737	while (*p) {
   1738		struct fuse_writepage_args *curr;
   1739		pgoff_t curr_index;
   1740
   1741		parent = *p;
   1742		curr = rb_entry(parent, struct fuse_writepage_args,
   1743				writepages_entry);
   1744		WARN_ON(curr->inode != wpa->inode);
   1745		curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;
   1746
   1747		if (idx_from >= curr_index + curr->ia.ap.num_pages)
   1748			p = &(*p)->rb_right;
   1749		else if (idx_to < curr_index)
   1750			p = &(*p)->rb_left;
   1751		else
   1752			return curr;
   1753	}
   1754
   1755	rb_link_node(&wpa->writepages_entry, parent, p);
   1756	rb_insert_color(&wpa->writepages_entry, root);
   1757	return NULL;
   1758}
   1759
   1760static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
   1761{
   1762	WARN_ON(fuse_insert_writeback(root, wpa));
   1763}
   1764
   1765static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
   1766			       int error)
   1767{
   1768	struct fuse_writepage_args *wpa =
   1769		container_of(args, typeof(*wpa), ia.ap.args);
   1770	struct inode *inode = wpa->inode;
   1771	struct fuse_inode *fi = get_fuse_inode(inode);
   1772	struct fuse_conn *fc = get_fuse_conn(inode);
   1773
   1774	mapping_set_error(inode->i_mapping, error);
   1775	/*
   1776	 * A writeback finished and this might have updated mtime/ctime on
   1777	 * server making local mtime/ctime stale.  Hence invalidate attrs.
   1778	 * Do this only if writeback_cache is not enabled.  If writeback_cache
   1779	 * is enabled, we trust local ctime/mtime.
   1780	 */
   1781	if (!fc->writeback_cache)
   1782		fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
   1783	spin_lock(&fi->lock);
   1784	rb_erase(&wpa->writepages_entry, &fi->writepages);
   1785	while (wpa->next) {
   1786		struct fuse_mount *fm = get_fuse_mount(inode);
   1787		struct fuse_write_in *inarg = &wpa->ia.write.in;
   1788		struct fuse_writepage_args *next = wpa->next;
   1789
   1790		wpa->next = next->next;
   1791		next->next = NULL;
   1792		next->ia.ff = fuse_file_get(wpa->ia.ff);
   1793		tree_insert(&fi->writepages, next);
   1794
   1795		/*
   1796		 * Skip fuse_flush_writepages() to make it easy to crop requests
   1797		 * based on primary request size.
   1798		 *
   1799		 * 1st case (trivial): there are no concurrent activities using
   1800		 * fuse_set/release_nowrite.  Then we're on safe side because
   1801		 * fuse_flush_writepages() would call fuse_send_writepage()
   1802		 * anyway.
   1803		 *
   1804		 * 2nd case: someone called fuse_set_nowrite and it is waiting
   1805		 * now for completion of all in-flight requests.  This happens
   1806		 * rarely and no more than once per page, so this should be
   1807		 * okay.
   1808		 *
   1809		 * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
   1810		 * of fuse_set_nowrite..fuse_release_nowrite section.  The fact
   1811		 * that fuse_set_nowrite returned implies that all in-flight
   1812		 * requests were completed along with all of their secondary
   1813		 * requests.  Further primary requests are blocked by negative
   1814		 * writectr.  Hence there cannot be any in-flight requests and
   1815		 * no invocations of fuse_writepage_end() while we're in
   1816		 * fuse_set_nowrite..fuse_release_nowrite section.
   1817		 */
   1818		fuse_send_writepage(fm, next, inarg->offset + inarg->size);
   1819	}
   1820	fi->writectr--;
   1821	fuse_writepage_finish(fm, wpa);
   1822	spin_unlock(&fi->lock);
   1823	fuse_writepage_free(wpa);
   1824}
   1825
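/*
 * Pick a file from fi->write_files that can be used for writeback requests
 * and grab a reference on it; returns NULL if the list is empty.
 */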
   1826static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi)
   1827{
   1828	struct fuse_file *ff;
   1829
   1830	spin_lock(&fi->lock);
   1831	ff = list_first_entry_or_null(&fi->write_files, struct fuse_file,
   1832				      write_entry);
   1833	if (ff)
   1834		fuse_file_get(ff);
   1835	spin_unlock(&fi->lock);
   1836
   1837	return ff;
   1838}
   1839
   1840static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi)
   1841{
   1842	struct fuse_file *ff = __fuse_write_file_get(fi);
   1843	WARN_ON(!ff);
   1844	return ff;
   1845}
   1846
   1847int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
   1848{
   1849	struct fuse_inode *fi = get_fuse_inode(inode);
   1850	struct fuse_file *ff;
   1851	int err;
   1852
   1853	/*
   1854	 * Inode is always written before the last reference is dropped and
   1855	 * hence this should not be reached from reclaim.
   1856	 *
   1857	 * Writing back the inode from reclaim can deadlock if the request
   1858	 * processing itself needs an allocation.  Allocations triggering
   1859	 * reclaim while serving a request can't be prevented, because it can
   1860	 * involve any number of unrelated userspace processes.
   1861	 */
   1862	WARN_ON(wbc->for_reclaim);
   1863
   1864	ff = __fuse_write_file_get(fi);
   1865	err = fuse_flush_times(inode, ff);
   1866	if (ff)
   1867		fuse_file_put(ff, false, false);
   1868
   1869	return err;
   1870}
   1871
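/* Allocate a writepage request with room for a single page descriptor */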
   1872static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
   1873{
   1874	struct fuse_writepage_args *wpa;
   1875	struct fuse_args_pages *ap;
   1876
   1877	wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
   1878	if (wpa) {
   1879		ap = &wpa->ia.ap;
   1880		ap->num_pages = 0;
   1881		ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs);
   1882		if (!ap->pages) {
   1883			kfree(wpa);
   1884			wpa = NULL;
   1885		}
   1886	}
   1887	return wpa;
   1888
   1889}
   1890
   1891static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
   1892					 struct fuse_writepage_args *wpa)
   1893{
   1894	if (!fc->sync_fs)
   1895		return;
   1896
   1897	rcu_read_lock();
   1898	/* Prevent resurrection of dead bucket in unlikely race with syncfs */
   1899	do {
   1900		wpa->bucket = rcu_dereference(fc->curr_bucket);
   1901	} while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
   1902	rcu_read_unlock();
   1903}
   1904
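/*
 * Write back a single locked page.  The page contents are copied into a
 * freshly allocated temporary page so the original page can complete
 * writeback immediately; the request itself is queued on fi->queued_writes
 * and sent by fuse_flush_writepages() when fi->writectr allows it.
 */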
   1905static int fuse_writepage_locked(struct page *page)
   1906{
   1907	struct address_space *mapping = page->mapping;
   1908	struct inode *inode = mapping->host;
   1909	struct fuse_conn *fc = get_fuse_conn(inode);
   1910	struct fuse_inode *fi = get_fuse_inode(inode);
   1911	struct fuse_writepage_args *wpa;
   1912	struct fuse_args_pages *ap;
   1913	struct page *tmp_page;
   1914	int error = -ENOMEM;
   1915
   1916	set_page_writeback(page);
   1917
   1918	wpa = fuse_writepage_args_alloc();
   1919	if (!wpa)
   1920		goto err;
   1921	ap = &wpa->ia.ap;
   1922
   1923	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
   1924	if (!tmp_page)
   1925		goto err_free;
   1926
   1927	error = -EIO;
   1928	wpa->ia.ff = fuse_write_file_get(fi);
   1929	if (!wpa->ia.ff)
   1930		goto err_nofile;
   1931
   1932	fuse_writepage_add_to_bucket(fc, wpa);
   1933	fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
   1934
   1935	copy_highpage(tmp_page, page);
   1936	wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
   1937	wpa->next = NULL;
   1938	ap->args.in_pages = true;
   1939	ap->num_pages = 1;
   1940	ap->pages[0] = tmp_page;
   1941	ap->descs[0].offset = 0;
   1942	ap->descs[0].length = PAGE_SIZE;
   1943	ap->args.end = fuse_writepage_end;
   1944	wpa->inode = inode;
   1945
   1946	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
   1947	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
   1948
   1949	spin_lock(&fi->lock);
   1950	tree_insert(&fi->writepages, wpa);
   1951	list_add_tail(&wpa->queue_entry, &fi->queued_writes);
   1952	fuse_flush_writepages(inode);
   1953	spin_unlock(&fi->lock);
   1954
   1955	end_page_writeback(page);
   1956
   1957	return 0;
   1958
   1959err_nofile:
   1960	__free_page(tmp_page);
   1961err_free:
   1962	kfree(wpa);
   1963err:
   1964	mapping_set_error(page->mapping, error);
   1965	end_page_writeback(page);
   1966	return error;
   1967}
   1968
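/*
 * ->writepage() callback.  A page already under fuse writeback is redirtied
 * and skipped (this only happens from direct reclaim); when background
 * requests are congested, WB_SYNC_NONE writeout is deferred.  Otherwise the
 * page is sent via fuse_writepage_locked().
 */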
   1969static int fuse_writepage(struct page *page, struct writeback_control *wbc)
   1970{
   1971	struct fuse_conn *fc = get_fuse_conn(page->mapping->host);
   1972	int err;
   1973
   1974	if (fuse_page_is_writeback(page->mapping->host, page->index)) {
   1975		/*
   1976		 * ->writepages() should be called for sync() and friends.  We
   1977		 * should only get here on direct reclaim and then we are
   1978		 * allowed to skip a page which is already in flight
   1979		 */
   1980		WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
   1981
   1982		redirty_page_for_writepage(wbc, page);
   1983		unlock_page(page);
   1984		return 0;
   1985	}
   1986
   1987	if (wbc->sync_mode == WB_SYNC_NONE &&
   1988	    fc->num_background >= fc->congestion_threshold)
   1989		return AOP_WRITEPAGE_ACTIVATE;
   1990
   1991	err = fuse_writepage_locked(page);
   1992	unlock_page(page);
   1993
   1994	return err;
   1995}
   1996
   1997struct fuse_fill_wb_data {
   1998	struct fuse_writepage_args *wpa;
   1999	struct fuse_file *ff;
   2000	struct inode *inode;
   2001	struct page **orig_pages;
   2002	unsigned int max_pages;
   2003};
   2004
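/*
 * Grow the page array of the request currently being assembled.  The new size
 * is double the old one (but at least FUSE_DEFAULT_MAX_PAGES_PER_REQ and at
 * most fc->max_pages).  Returns false if the larger array could not be
 * allocated.
 */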
   2005static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
   2006{
   2007	struct fuse_args_pages *ap = &data->wpa->ia.ap;
   2008	struct fuse_conn *fc = get_fuse_conn(data->inode);
   2009	struct page **pages;
   2010	struct fuse_page_desc *descs;
   2011	unsigned int npages = min_t(unsigned int,
   2012				    max_t(unsigned int, data->max_pages * 2,
   2013					  FUSE_DEFAULT_MAX_PAGES_PER_REQ),
   2014				    fc->max_pages);
   2015	WARN_ON(npages <= data->max_pages);
   2016
   2017	pages = fuse_pages_alloc(npages, GFP_NOFS, &descs);
   2018	if (!pages)
   2019		return false;
   2020
   2021	memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages);
   2022	memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages);
   2023	kfree(ap->pages);
   2024	ap->pages = pages;
   2025	ap->descs = descs;
   2026	data->max_pages = npages;
   2027
   2028	return true;
   2029}
   2030
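/*
 * Queue the request assembled so far and end writeback on the original pages.
 * The temporary page copies keep the data alive until userspace has replied
 * to the WRITE request.
 */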
   2031static void fuse_writepages_send(struct fuse_fill_wb_data *data)
   2032{
   2033	struct fuse_writepage_args *wpa = data->wpa;
   2034	struct inode *inode = data->inode;
   2035	struct fuse_inode *fi = get_fuse_inode(inode);
   2036	int num_pages = wpa->ia.ap.num_pages;
   2037	int i;
   2038
   2039	wpa->ia.ff = fuse_file_get(data->ff);
   2040	spin_lock(&fi->lock);
   2041	list_add_tail(&wpa->queue_entry, &fi->queued_writes);
   2042	fuse_flush_writepages(inode);
   2043	spin_unlock(&fi->lock);
   2044
   2045	for (i = 0; i < num_pages; i++)
   2046		end_page_writeback(data->orig_pages[i]);
   2047}
   2048
   2049/*
   2050 * Check under fi->lock if the page is under writeback, and insert it onto the
   2051 * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's
   2052 * one already added for a page at this offset.  If there's none, then insert
   2053 * this new request onto the auxiliary list, otherwise reuse the existing one by
   2054 * swapping the new temp page with the old one.
   2055 */
   2056static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
   2057			       struct page *page)
   2058{
   2059	struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
   2060	struct fuse_writepage_args *tmp;
   2061	struct fuse_writepage_args *old_wpa;
   2062	struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
   2063
   2064	WARN_ON(new_ap->num_pages != 0);
   2065	new_ap->num_pages = 1;
   2066
   2067	spin_lock(&fi->lock);
   2068	old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa);
   2069	if (!old_wpa) {
   2070		spin_unlock(&fi->lock);
   2071		return true;
   2072	}
   2073
   2074	for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
   2075		pgoff_t curr_index;
   2076
   2077		WARN_ON(tmp->inode != new_wpa->inode);
   2078		curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
   2079		if (curr_index == page->index) {
   2080			WARN_ON(tmp->ia.ap.num_pages != 1);
   2081			swap(tmp->ia.ap.pages[0], new_ap->pages[0]);
   2082			break;
   2083		}
   2084	}
   2085
   2086	if (!tmp) {
   2087		new_wpa->next = old_wpa->next;
   2088		old_wpa->next = new_wpa;
   2089	}
   2090
   2091	spin_unlock(&fi->lock);
   2092
   2093	if (tmp) {
   2094		struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode);
   2095
   2096		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
   2097		dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP);
   2098		wb_writeout_inc(&bdi->wb);
   2099		fuse_writepage_free(new_wpa);
   2100	}
   2101
   2102	return false;
   2103}
   2104
   2105static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
   2106				     struct fuse_args_pages *ap,
   2107				     struct fuse_fill_wb_data *data)
   2108{
   2109	WARN_ON(!ap->num_pages);
   2110
   2111	/*
    2112	 * Being under writeback is unlikely but possible.  For example, a direct
    2113	 * read to an mmapped fuse file will set the page dirty twice; once when
    2114	 * the pages are faulted with get_user_pages(), and then again after the
    2115	 * read has completed.
   2116	 */
   2117	if (fuse_page_is_writeback(data->inode, page->index))
   2118		return true;
   2119
   2120	/* Reached max pages */
   2121	if (ap->num_pages == fc->max_pages)
   2122		return true;
   2123
   2124	/* Reached max write bytes */
   2125	if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write)
   2126		return true;
   2127
   2128	/* Discontinuity */
   2129	if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)
   2130		return true;
   2131
   2132	/* Need to grow the pages array?  If so, did the expansion fail? */
   2133	if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data))
   2134		return true;
   2135
   2136	return false;
   2137}
   2138
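/*
 * write_cache_pages() callback: copy one dirty page into a temporary page and
 * append it to the writepage request being assembled in @_data, starting a
 * new request whenever fuse_writepage_need_send() says the current one must
 * be sent first.
 */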
   2139static int fuse_writepages_fill(struct page *page,
   2140		struct writeback_control *wbc, void *_data)
   2141{
   2142	struct fuse_fill_wb_data *data = _data;
   2143	struct fuse_writepage_args *wpa = data->wpa;
   2144	struct fuse_args_pages *ap = &wpa->ia.ap;
   2145	struct inode *inode = data->inode;
   2146	struct fuse_inode *fi = get_fuse_inode(inode);
   2147	struct fuse_conn *fc = get_fuse_conn(inode);
   2148	struct page *tmp_page;
   2149	int err;
   2150
   2151	if (!data->ff) {
   2152		err = -EIO;
   2153		data->ff = fuse_write_file_get(fi);
   2154		if (!data->ff)
   2155			goto out_unlock;
   2156	}
   2157
   2158	if (wpa && fuse_writepage_need_send(fc, page, ap, data)) {
   2159		fuse_writepages_send(data);
   2160		data->wpa = NULL;
   2161	}
   2162
   2163	err = -ENOMEM;
   2164	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
   2165	if (!tmp_page)
   2166		goto out_unlock;
   2167
   2168	/*
   2169	 * The page must not be redirtied until the writeout is completed
   2170	 * (i.e. userspace has sent a reply to the write request).  Otherwise
   2171	 * there could be more than one temporary page instance for each real
   2172	 * page.
   2173	 *
   2174	 * This is ensured by holding the page lock in page_mkwrite() while
   2175	 * checking fuse_page_is_writeback().  We already hold the page lock
   2176	 * since clear_page_dirty_for_io() and keep it held until we add the
   2177	 * request to the fi->writepages list and increment ap->num_pages.
   2178	 * After this fuse_page_is_writeback() will indicate that the page is
   2179	 * under writeback, so we can release the page lock.
   2180	 */
   2181	if (data->wpa == NULL) {
   2182		err = -ENOMEM;
   2183		wpa = fuse_writepage_args_alloc();
   2184		if (!wpa) {
   2185			__free_page(tmp_page);
   2186			goto out_unlock;
   2187		}
   2188		fuse_writepage_add_to_bucket(fc, wpa);
   2189
   2190		data->max_pages = 1;
   2191
   2192		ap = &wpa->ia.ap;
   2193		fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0);
   2194		wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
   2195		wpa->next = NULL;
   2196		ap->args.in_pages = true;
   2197		ap->args.end = fuse_writepage_end;
   2198		ap->num_pages = 0;
   2199		wpa->inode = inode;
   2200	}
   2201	set_page_writeback(page);
   2202
   2203	copy_highpage(tmp_page, page);
   2204	ap->pages[ap->num_pages] = tmp_page;
   2205	ap->descs[ap->num_pages].offset = 0;
   2206	ap->descs[ap->num_pages].length = PAGE_SIZE;
   2207	data->orig_pages[ap->num_pages] = page;
   2208
   2209	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
   2210	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
   2211
   2212	err = 0;
   2213	if (data->wpa) {
   2214		/*
   2215		 * Protected by fi->lock against concurrent access by
   2216		 * fuse_page_is_writeback().
   2217		 */
   2218		spin_lock(&fi->lock);
   2219		ap->num_pages++;
   2220		spin_unlock(&fi->lock);
   2221	} else if (fuse_writepage_add(wpa, page)) {
   2222		data->wpa = wpa;
   2223	} else {
   2224		end_page_writeback(page);
   2225	}
   2226out_unlock:
   2227	unlock_page(page);
   2228
   2229	return err;
   2230}
   2231
   2232static int fuse_writepages(struct address_space *mapping,
   2233			   struct writeback_control *wbc)
   2234{
   2235	struct inode *inode = mapping->host;
   2236	struct fuse_conn *fc = get_fuse_conn(inode);
   2237	struct fuse_fill_wb_data data;
   2238	int err;
   2239
   2240	err = -EIO;
   2241	if (fuse_is_bad(inode))
   2242		goto out;
   2243
   2244	if (wbc->sync_mode == WB_SYNC_NONE &&
   2245	    fc->num_background >= fc->congestion_threshold)
   2246		return 0;
   2247
   2248	data.inode = inode;
   2249	data.wpa = NULL;
   2250	data.ff = NULL;
   2251
   2252	err = -ENOMEM;
   2253	data.orig_pages = kcalloc(fc->max_pages,
   2254				  sizeof(struct page *),
   2255				  GFP_NOFS);
   2256	if (!data.orig_pages)
   2257		goto out;
   2258
   2259	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
   2260	if (data.wpa) {
   2261		WARN_ON(!data.wpa->ia.ap.num_pages);
   2262		fuse_writepages_send(&data);
   2263	}
   2264	if (data.ff)
   2265		fuse_file_put(data.ff, false, false);
   2266
   2267	kfree(data.orig_pages);
   2268out:
   2269	return err;
   2270}
   2271
   2272/*
    2273 * It would be worthwhile to reserve space on disk for the write, but how
    2274 * to implement that without killing performance needs more thinking.
   2275 */
   2276static int fuse_write_begin(struct file *file, struct address_space *mapping,
   2277		loff_t pos, unsigned len, struct page **pagep, void **fsdata)
   2278{
   2279	pgoff_t index = pos >> PAGE_SHIFT;
   2280	struct fuse_conn *fc = get_fuse_conn(file_inode(file));
   2281	struct page *page;
   2282	loff_t fsize;
   2283	int err = -ENOMEM;
   2284
   2285	WARN_ON(!fc->writeback_cache);
   2286
   2287	page = grab_cache_page_write_begin(mapping, index);
   2288	if (!page)
   2289		goto error;
   2290
   2291	fuse_wait_on_page_writeback(mapping->host, page->index);
   2292
   2293	if (PageUptodate(page) || len == PAGE_SIZE)
   2294		goto success;
   2295	/*
    2296	 * Check if the start of this page comes after the end of file, in which
   2297	 * case the readpage can be optimized away.
   2298	 */
   2299	fsize = i_size_read(mapping->host);
   2300	if (fsize <= (pos & PAGE_MASK)) {
   2301		size_t off = pos & ~PAGE_MASK;
   2302		if (off)
   2303			zero_user_segment(page, 0, off);
   2304		goto success;
   2305	}
   2306	err = fuse_do_readpage(file, page);
   2307	if (err)
   2308		goto cleanup;
   2309success:
   2310	*pagep = page;
   2311	return 0;
   2312
   2313cleanup:
   2314	unlock_page(page);
   2315	put_page(page);
   2316error:
   2317	return err;
   2318}
   2319
   2320static int fuse_write_end(struct file *file, struct address_space *mapping,
   2321		loff_t pos, unsigned len, unsigned copied,
   2322		struct page *page, void *fsdata)
   2323{
   2324	struct inode *inode = page->mapping->host;
   2325
   2326	/* Haven't copied anything?  Skip zeroing, size extending, dirtying. */
   2327	if (!copied)
   2328		goto unlock;
   2329
   2330	pos += copied;
   2331	if (!PageUptodate(page)) {
   2332		/* Zero any unwritten bytes at the end of the page */
   2333		size_t endoff = pos & ~PAGE_MASK;
   2334		if (endoff)
   2335			zero_user_segment(page, endoff, PAGE_SIZE);
   2336		SetPageUptodate(page);
   2337	}
   2338
   2339	if (pos > inode->i_size)
   2340		i_size_write(inode, pos);
   2341
   2342	set_page_dirty(page);
   2343
   2344unlock:
   2345	unlock_page(page);
   2346	put_page(page);
   2347
   2348	return copied;
   2349}
   2350
   2351static int fuse_launder_folio(struct folio *folio)
   2352{
   2353	int err = 0;
   2354	if (folio_clear_dirty_for_io(folio)) {
   2355		struct inode *inode = folio->mapping->host;
   2356
   2357		/* Serialize with pending writeback for the same page */
   2358		fuse_wait_on_page_writeback(inode, folio->index);
   2359		err = fuse_writepage_locked(&folio->page);
   2360		if (!err)
   2361			fuse_wait_on_page_writeback(inode, folio->index);
   2362	}
   2363	return err;
   2364}
   2365
   2366/*
   2367 * Write back dirty data/metadata now (there may not be any suitable
   2368 * open files later for data)
   2369 */
   2370static void fuse_vma_close(struct vm_area_struct *vma)
   2371{
   2372	int err;
   2373
   2374	err = write_inode_now(vma->vm_file->f_mapping->host, 1);
   2375	mapping_set_error(vma->vm_file->f_mapping, err);
   2376}
   2377
   2378/*
   2379 * Wait for writeback against this page to complete before allowing it
   2380 * to be marked dirty again, and hence written back again, possibly
   2381 * before the previous writepage completed.
   2382 *
   2383 * Block here, instead of in ->writepage(), so that the userspace fs
   2384 * can only block processes actually operating on the filesystem.
   2385 *
    2386 * Otherwise an unprivileged userspace fs would be able to block
   2387 * unrelated:
   2388 *
   2389 * - page migration
   2390 * - sync(2)
   2391 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
   2392 */
   2393static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
   2394{
   2395	struct page *page = vmf->page;
   2396	struct inode *inode = file_inode(vmf->vma->vm_file);
   2397
   2398	file_update_time(vmf->vma->vm_file);
   2399	lock_page(page);
   2400	if (page->mapping != inode->i_mapping) {
   2401		unlock_page(page);
   2402		return VM_FAULT_NOPAGE;
   2403	}
   2404
   2405	fuse_wait_on_page_writeback(inode, page->index);
   2406	return VM_FAULT_LOCKED;
   2407}
   2408
   2409static const struct vm_operations_struct fuse_file_vm_ops = {
   2410	.close		= fuse_vma_close,
   2411	.fault		= filemap_fault,
   2412	.map_pages	= filemap_map_pages,
   2413	.page_mkwrite	= fuse_page_mkwrite,
   2414};
   2415
   2416static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
   2417{
   2418	struct fuse_file *ff = file->private_data;
   2419
   2420	/* DAX mmap is superior to direct_io mmap */
   2421	if (FUSE_IS_DAX(file_inode(file)))
   2422		return fuse_dax_mmap(file, vma);
   2423
   2424	if (ff->open_flags & FOPEN_DIRECT_IO) {
   2425		/* Can't provide the coherency needed for MAP_SHARED */
   2426		if (vma->vm_flags & VM_MAYSHARE)
   2427			return -ENODEV;
   2428
   2429		invalidate_inode_pages2(file->f_mapping);
   2430
   2431		return generic_file_mmap(file, vma);
   2432	}
   2433
   2434	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
   2435		fuse_link_write_file(file);
   2436
   2437	file_accessed(file);
   2438	vma->vm_ops = &fuse_file_vm_ops;
   2439	return 0;
   2440}
   2441
   2442static int convert_fuse_file_lock(struct fuse_conn *fc,
   2443				  const struct fuse_file_lock *ffl,
   2444				  struct file_lock *fl)
   2445{
   2446	switch (ffl->type) {
   2447	case F_UNLCK:
   2448		break;
   2449
   2450	case F_RDLCK:
   2451	case F_WRLCK:
   2452		if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
   2453		    ffl->end < ffl->start)
   2454			return -EIO;
   2455
   2456		fl->fl_start = ffl->start;
   2457		fl->fl_end = ffl->end;
   2458
   2459		/*
   2460		 * Convert pid into init's pid namespace.  The locks API will
   2461		 * translate it into the caller's pid namespace.
   2462		 */
   2463		rcu_read_lock();
   2464		fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
   2465		rcu_read_unlock();
   2466		break;
   2467
   2468	default:
   2469		return -EIO;
   2470	}
   2471	fl->fl_type = ffl->type;
   2472	return 0;
   2473}
   2474
   2475static void fuse_lk_fill(struct fuse_args *args, struct file *file,
   2476			 const struct file_lock *fl, int opcode, pid_t pid,
   2477			 int flock, struct fuse_lk_in *inarg)
   2478{
   2479	struct inode *inode = file_inode(file);
   2480	struct fuse_conn *fc = get_fuse_conn(inode);
   2481	struct fuse_file *ff = file->private_data;
   2482
   2483	memset(inarg, 0, sizeof(*inarg));
   2484	inarg->fh = ff->fh;
   2485	inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
   2486	inarg->lk.start = fl->fl_start;
   2487	inarg->lk.end = fl->fl_end;
   2488	inarg->lk.type = fl->fl_type;
   2489	inarg->lk.pid = pid;
   2490	if (flock)
   2491		inarg->lk_flags |= FUSE_LK_FLOCK;
   2492	args->opcode = opcode;
   2493	args->nodeid = get_node_id(inode);
   2494	args->in_numargs = 1;
   2495	args->in_args[0].size = sizeof(*inarg);
   2496	args->in_args[0].value = inarg;
   2497}
   2498
   2499static int fuse_getlk(struct file *file, struct file_lock *fl)
   2500{
   2501	struct inode *inode = file_inode(file);
   2502	struct fuse_mount *fm = get_fuse_mount(inode);
   2503	FUSE_ARGS(args);
   2504	struct fuse_lk_in inarg;
   2505	struct fuse_lk_out outarg;
   2506	int err;
   2507
   2508	fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
   2509	args.out_numargs = 1;
   2510	args.out_args[0].size = sizeof(outarg);
   2511	args.out_args[0].value = &outarg;
   2512	err = fuse_simple_request(fm, &args);
   2513	if (!err)
   2514		err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl);
   2515
   2516	return err;
   2517}
   2518
   2519static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
   2520{
   2521	struct inode *inode = file_inode(file);
   2522	struct fuse_mount *fm = get_fuse_mount(inode);
   2523	FUSE_ARGS(args);
   2524	struct fuse_lk_in inarg;
   2525	int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
   2526	struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
   2527	pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
   2528	int err;
   2529
   2530	if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
   2531		/* NLM needs asynchronous locks, which we don't support yet */
   2532		return -ENOLCK;
   2533	}
   2534
   2535	/* Unlock on close is handled by the flush method */
   2536	if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
   2537		return 0;
   2538
   2539	fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
   2540	err = fuse_simple_request(fm, &args);
   2541
   2542	/* locking is restartable */
   2543	if (err == -EINTR)
   2544		err = -ERESTARTSYS;
   2545
   2546	return err;
   2547}
   2548
   2549static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
   2550{
   2551	struct inode *inode = file_inode(file);
   2552	struct fuse_conn *fc = get_fuse_conn(inode);
   2553	int err;
   2554
   2555	if (cmd == F_CANCELLK) {
   2556		err = 0;
   2557	} else if (cmd == F_GETLK) {
   2558		if (fc->no_lock) {
   2559			posix_test_lock(file, fl);
   2560			err = 0;
   2561		} else
   2562			err = fuse_getlk(file, fl);
   2563	} else {
   2564		if (fc->no_lock)
   2565			err = posix_lock_file(file, fl, NULL);
   2566		else
   2567			err = fuse_setlk(file, fl, 0);
   2568	}
   2569	return err;
   2570}
   2571
   2572static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
   2573{
   2574	struct inode *inode = file_inode(file);
   2575	struct fuse_conn *fc = get_fuse_conn(inode);
   2576	int err;
   2577
   2578	if (fc->no_flock) {
   2579		err = locks_lock_file_wait(file, fl);
   2580	} else {
   2581		struct fuse_file *ff = file->private_data;
   2582
   2583		/* emulate flock with POSIX locks */
   2584		ff->flock = true;
   2585		err = fuse_setlk(file, fl, 1);
   2586	}
   2587
   2588	return err;
   2589}
   2590
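/*
 * FIBMAP support, implemented with a FUSE_BMAP request.  Returns 0 (meaning
 * "no mapping") if the filesystem is not backed by a block device or the
 * server does not implement the operation.
 */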
   2591static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
   2592{
   2593	struct inode *inode = mapping->host;
   2594	struct fuse_mount *fm = get_fuse_mount(inode);
   2595	FUSE_ARGS(args);
   2596	struct fuse_bmap_in inarg;
   2597	struct fuse_bmap_out outarg;
   2598	int err;
   2599
   2600	if (!inode->i_sb->s_bdev || fm->fc->no_bmap)
   2601		return 0;
   2602
   2603	memset(&inarg, 0, sizeof(inarg));
   2604	inarg.block = block;
   2605	inarg.blocksize = inode->i_sb->s_blocksize;
   2606	args.opcode = FUSE_BMAP;
   2607	args.nodeid = get_node_id(inode);
   2608	args.in_numargs = 1;
   2609	args.in_args[0].size = sizeof(inarg);
   2610	args.in_args[0].value = &inarg;
   2611	args.out_numargs = 1;
   2612	args.out_args[0].size = sizeof(outarg);
   2613	args.out_args[0].value = &outarg;
   2614	err = fuse_simple_request(fm, &args);
   2615	if (err == -ENOSYS)
   2616		fm->fc->no_bmap = 1;
   2617
   2618	return err ? 0 : outarg.block;
   2619}
   2620
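/*
 * SEEK_HOLE/SEEK_DATA, implemented with a FUSE_LSEEK request.  If the server
 * does not support it, fall back to refreshing the size attribute and doing a
 * generic llseek.
 */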
   2621static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
   2622{
   2623	struct inode *inode = file->f_mapping->host;
   2624	struct fuse_mount *fm = get_fuse_mount(inode);
   2625	struct fuse_file *ff = file->private_data;
   2626	FUSE_ARGS(args);
   2627	struct fuse_lseek_in inarg = {
   2628		.fh = ff->fh,
   2629		.offset = offset,
   2630		.whence = whence
   2631	};
   2632	struct fuse_lseek_out outarg;
   2633	int err;
   2634
   2635	if (fm->fc->no_lseek)
   2636		goto fallback;
   2637
   2638	args.opcode = FUSE_LSEEK;
   2639	args.nodeid = ff->nodeid;
   2640	args.in_numargs = 1;
   2641	args.in_args[0].size = sizeof(inarg);
   2642	args.in_args[0].value = &inarg;
   2643	args.out_numargs = 1;
   2644	args.out_args[0].size = sizeof(outarg);
   2645	args.out_args[0].value = &outarg;
   2646	err = fuse_simple_request(fm, &args);
   2647	if (err) {
   2648		if (err == -ENOSYS) {
   2649			fm->fc->no_lseek = 1;
   2650			goto fallback;
   2651		}
   2652		return err;
   2653	}
   2654
   2655	return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
   2656
   2657fallback:
   2658	err = fuse_update_attributes(inode, file, STATX_SIZE);
   2659	if (!err)
   2660		return generic_file_llseek(file, offset, whence);
   2661	else
   2662		return err;
   2663}
   2664
   2665static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
   2666{
   2667	loff_t retval;
   2668	struct inode *inode = file_inode(file);
   2669
   2670	switch (whence) {
   2671	case SEEK_SET:
   2672	case SEEK_CUR:
   2673		 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
   2674		retval = generic_file_llseek(file, offset, whence);
   2675		break;
   2676	case SEEK_END:
   2677		inode_lock(inode);
   2678		retval = fuse_update_attributes(inode, file, STATX_SIZE);
   2679		if (!retval)
   2680			retval = generic_file_llseek(file, offset, whence);
   2681		inode_unlock(inode);
   2682		break;
   2683	case SEEK_HOLE:
   2684	case SEEK_DATA:
   2685		inode_lock(inode);
   2686		retval = fuse_lseek(file, offset, whence);
   2687		inode_unlock(inode);
   2688		break;
   2689	default:
   2690		retval = -EINVAL;
   2691	}
   2692
   2693	return retval;
   2694}
   2695
   2696/*
   2697 * All files which have been polled are linked to RB tree
   2698 * fuse_conn->polled_files which is indexed by kh.  Walk the tree and
   2699 * find the matching one.
   2700 */
   2701static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
   2702					      struct rb_node **parent_out)
   2703{
   2704	struct rb_node **link = &fc->polled_files.rb_node;
   2705	struct rb_node *last = NULL;
   2706
   2707	while (*link) {
   2708		struct fuse_file *ff;
   2709
   2710		last = *link;
   2711		ff = rb_entry(last, struct fuse_file, polled_node);
   2712
   2713		if (kh < ff->kh)
   2714			link = &last->rb_left;
   2715		else if (kh > ff->kh)
   2716			link = &last->rb_right;
   2717		else
   2718			return link;
   2719	}
   2720
   2721	if (parent_out)
   2722		*parent_out = last;
   2723	return link;
   2724}
   2725
   2726/*
   2727 * The file is about to be polled.  Make sure it's on the polled_files
   2728 * RB tree.  Note that files once added to the polled_files tree are
   2729 * not removed before the file is released.  This is because a file
   2730 * polled once is likely to be polled again.
   2731 */
   2732static void fuse_register_polled_file(struct fuse_conn *fc,
   2733				      struct fuse_file *ff)
   2734{
   2735	spin_lock(&fc->lock);
   2736	if (RB_EMPTY_NODE(&ff->polled_node)) {
   2737		struct rb_node **link, *parent;
   2738
   2739		link = fuse_find_polled_node(fc, ff->kh, &parent);
   2740		BUG_ON(*link);
   2741		rb_link_node(&ff->polled_node, parent, link);
   2742		rb_insert_color(&ff->polled_node, &fc->polled_files);
   2743	}
   2744	spin_unlock(&fc->lock);
   2745}
   2746
   2747__poll_t fuse_file_poll(struct file *file, poll_table *wait)
   2748{
   2749	struct fuse_file *ff = file->private_data;
   2750	struct fuse_mount *fm = ff->fm;
   2751	struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
   2752	struct fuse_poll_out outarg;
   2753	FUSE_ARGS(args);
   2754	int err;
   2755
   2756	if (fm->fc->no_poll)
   2757		return DEFAULT_POLLMASK;
   2758
   2759	poll_wait(file, &ff->poll_wait, wait);
   2760	inarg.events = mangle_poll(poll_requested_events(wait));
   2761
   2762	/*
   2763	 * Ask for notification iff there's someone waiting for it.
   2764	 * The client may ignore the flag and always notify.
   2765	 */
   2766	if (waitqueue_active(&ff->poll_wait)) {
   2767		inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
   2768		fuse_register_polled_file(fm->fc, ff);
   2769	}
   2770
   2771	args.opcode = FUSE_POLL;
   2772	args.nodeid = ff->nodeid;
   2773	args.in_numargs = 1;
   2774	args.in_args[0].size = sizeof(inarg);
   2775	args.in_args[0].value = &inarg;
   2776	args.out_numargs = 1;
   2777	args.out_args[0].size = sizeof(outarg);
   2778	args.out_args[0].value = &outarg;
   2779	err = fuse_simple_request(fm, &args);
   2780
   2781	if (!err)
   2782		return demangle_poll(outarg.revents);
   2783	if (err == -ENOSYS) {
   2784		fm->fc->no_poll = 1;
   2785		return DEFAULT_POLLMASK;
   2786	}
   2787	return EPOLLERR;
   2788}
   2789EXPORT_SYMBOL_GPL(fuse_file_poll);
   2790
   2791/*
   2792 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
   2793 * wakes up the poll waiters.
   2794 */
   2795int fuse_notify_poll_wakeup(struct fuse_conn *fc,
   2796			    struct fuse_notify_poll_wakeup_out *outarg)
   2797{
   2798	u64 kh = outarg->kh;
   2799	struct rb_node **link;
   2800
   2801	spin_lock(&fc->lock);
   2802
   2803	link = fuse_find_polled_node(fc, kh, NULL);
   2804	if (*link) {
   2805		struct fuse_file *ff;
   2806
   2807		ff = rb_entry(*link, struct fuse_file, polled_node);
   2808		wake_up_interruptible_sync(&ff->poll_wait);
   2809	}
   2810
   2811	spin_unlock(&fc->lock);
   2812	return 0;
   2813}
   2814
   2815static void fuse_do_truncate(struct file *file)
   2816{
   2817	struct inode *inode = file->f_mapping->host;
   2818	struct iattr attr;
   2819
   2820	attr.ia_valid = ATTR_SIZE;
   2821	attr.ia_size = i_size_read(inode);
   2822
   2823	attr.ia_file = file;
   2824	attr.ia_valid |= ATTR_FILE;
   2825
   2826	fuse_do_setattr(file_dentry(file), &attr, file);
   2827}
   2828
   2829static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
   2830{
   2831	return round_up(off, fc->max_pages << PAGE_SHIFT);
   2832}
   2833
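/*
 * O_DIRECT read/write path.  Requests are submitted asynchronously when the
 * connection supports async_dio; reads reaching beyond EOF are shortened up
 * front, and size-extending writes are forced to complete synchronously.
 */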
   2834static ssize_t
   2835fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
   2836{
   2837	DECLARE_COMPLETION_ONSTACK(wait);
   2838	ssize_t ret = 0;
   2839	struct file *file = iocb->ki_filp;
   2840	struct fuse_file *ff = file->private_data;
   2841	loff_t pos = 0;
   2842	struct inode *inode;
   2843	loff_t i_size;
   2844	size_t count = iov_iter_count(iter), shortened = 0;
   2845	loff_t offset = iocb->ki_pos;
   2846	struct fuse_io_priv *io;
   2847
   2848	pos = offset;
   2849	inode = file->f_mapping->host;
   2850	i_size = i_size_read(inode);
   2851
   2852	if ((iov_iter_rw(iter) == READ) && (offset >= i_size))
   2853		return 0;
   2854
   2855	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
   2856	if (!io)
   2857		return -ENOMEM;
   2858	spin_lock_init(&io->lock);
   2859	kref_init(&io->refcnt);
   2860	io->reqs = 1;
   2861	io->bytes = -1;
   2862	io->size = 0;
   2863	io->offset = offset;
   2864	io->write = (iov_iter_rw(iter) == WRITE);
   2865	io->err = 0;
   2866	/*
   2867	 * By default, we want to optimize all I/Os with async request
   2868	 * submission to the client filesystem if supported.
   2869	 */
   2870	io->async = ff->fm->fc->async_dio;
   2871	io->iocb = iocb;
   2872	io->blocking = is_sync_kiocb(iocb);
   2873
   2874	/* optimization for short read */
   2875	if (io->async && !io->write && offset + count > i_size) {
   2876		iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset));
   2877		shortened = count - iov_iter_count(iter);
   2878		count -= shortened;
   2879	}
   2880
   2881	/*
   2882	 * We cannot asynchronously extend the size of a file.
    2883	 * In that case the aio will behave exactly like sync io.
   2884	 */
   2885	if ((offset + count > i_size) && io->write)
   2886		io->blocking = true;
   2887
   2888	if (io->async && io->blocking) {
   2889		/*
   2890		 * Additional reference to keep io around after
   2891		 * calling fuse_aio_complete()
   2892		 */
   2893		kref_get(&io->refcnt);
   2894		io->done = &wait;
   2895	}
   2896
   2897	if (iov_iter_rw(iter) == WRITE) {
   2898		ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
   2899		fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
   2900	} else {
   2901		ret = __fuse_direct_read(io, iter, &pos);
   2902	}
   2903	iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);
   2904
   2905	if (io->async) {
   2906		bool blocking = io->blocking;
   2907
   2908		fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
   2909
   2910		/* we have a non-extending, async request, so return */
   2911		if (!blocking)
   2912			return -EIOCBQUEUED;
   2913
   2914		wait_for_completion(&wait);
   2915		ret = fuse_get_res_by_io(io);
   2916	}
   2917
   2918	kref_put(&io->refcnt, fuse_io_release);
   2919
   2920	if (iov_iter_rw(iter) == WRITE) {
   2921		fuse_write_update_attr(inode, pos, ret);
   2922		if (ret < 0 && offset + count > i_size)
   2923			fuse_do_truncate(file);
   2924	}
   2925
   2926	return ret;
   2927}
   2928
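/*
 * Write back and wait on dirty pages from @start to the end of the mapping
 * (not just @end), then wait for all in-flight fuse writeback on the inode
 * to finish.
 */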
   2929static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
   2930{
   2931	int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX);
   2932
   2933	if (!err)
   2934		fuse_sync_writes(inode);
   2935
   2936	return err;
   2937}
   2938
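/*
 * fallocate(2), implemented with a FUSE_FALLOCATE request.  Only KEEP_SIZE,
 * PUNCH_HOLE and ZERO_RANGE are supported.  Hole punching and zeroing write
 * back and then drop the affected page cache range, and size-changing modes
 * mark the inode size unstable while the request is in flight.
 */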
   2939static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
   2940				loff_t length)
   2941{
   2942	struct fuse_file *ff = file->private_data;
   2943	struct inode *inode = file_inode(file);
   2944	struct fuse_inode *fi = get_fuse_inode(inode);
   2945	struct fuse_mount *fm = ff->fm;
   2946	FUSE_ARGS(args);
   2947	struct fuse_fallocate_in inarg = {
   2948		.fh = ff->fh,
   2949		.offset = offset,
   2950		.length = length,
   2951		.mode = mode
   2952	};
   2953	int err;
   2954	bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
   2955			   (mode & (FALLOC_FL_PUNCH_HOLE |
   2956				    FALLOC_FL_ZERO_RANGE));
   2957
   2958	bool block_faults = FUSE_IS_DAX(inode) && lock_inode;
   2959
   2960	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
   2961		     FALLOC_FL_ZERO_RANGE))
   2962		return -EOPNOTSUPP;
   2963
   2964	if (fm->fc->no_fallocate)
   2965		return -EOPNOTSUPP;
   2966
   2967	if (lock_inode) {
   2968		inode_lock(inode);
   2969		if (block_faults) {
   2970			filemap_invalidate_lock(inode->i_mapping);
   2971			err = fuse_dax_break_layouts(inode, 0, 0);
   2972			if (err)
   2973				goto out;
   2974		}
   2975
   2976		if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) {
   2977			loff_t endbyte = offset + length - 1;
   2978
   2979			err = fuse_writeback_range(inode, offset, endbyte);
   2980			if (err)
   2981				goto out;
   2982		}
   2983	}
   2984
   2985	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
   2986	    offset + length > i_size_read(inode)) {
   2987		err = inode_newsize_ok(inode, offset + length);
   2988		if (err)
   2989			goto out;
   2990	}
   2991
   2992	if (!(mode & FALLOC_FL_KEEP_SIZE))
   2993		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
   2994
   2995	args.opcode = FUSE_FALLOCATE;
   2996	args.nodeid = ff->nodeid;
   2997	args.in_numargs = 1;
   2998	args.in_args[0].size = sizeof(inarg);
   2999	args.in_args[0].value = &inarg;
   3000	err = fuse_simple_request(fm, &args);
   3001	if (err == -ENOSYS) {
   3002		fm->fc->no_fallocate = 1;
   3003		err = -EOPNOTSUPP;
   3004	}
   3005	if (err)
   3006		goto out;
   3007
   3008	/* we could have extended the file */
   3009	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
   3010		if (fuse_write_update_attr(inode, offset + length, length))
   3011			file_update_time(file);
   3012	}
   3013
   3014	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
   3015		truncate_pagecache_range(inode, offset, offset + length - 1);
   3016
   3017	fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
   3018
   3019out:
   3020	if (!(mode & FALLOC_FL_KEEP_SIZE))
   3021		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
   3022
   3023	if (block_faults)
   3024		filemap_invalidate_unlock(inode->i_mapping);
   3025
   3026	if (lock_inode)
   3027		inode_unlock(inode);
   3028
   3029	fuse_flush_time_update(inode);
   3030
   3031	return err;
   3032}
   3033
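/*
 * copy_file_range(2), implemented with a FUSE_COPY_FILE_RANGE request.  Both
 * files must be on the same fuse filesystem; otherwise -EXDEV is returned and
 * the caller falls back to a generic copy.
 */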
   3034static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
   3035				      struct file *file_out, loff_t pos_out,
   3036				      size_t len, unsigned int flags)
   3037{
   3038	struct fuse_file *ff_in = file_in->private_data;
   3039	struct fuse_file *ff_out = file_out->private_data;
   3040	struct inode *inode_in = file_inode(file_in);
   3041	struct inode *inode_out = file_inode(file_out);
   3042	struct fuse_inode *fi_out = get_fuse_inode(inode_out);
   3043	struct fuse_mount *fm = ff_in->fm;
   3044	struct fuse_conn *fc = fm->fc;
   3045	FUSE_ARGS(args);
   3046	struct fuse_copy_file_range_in inarg = {
   3047		.fh_in = ff_in->fh,
   3048		.off_in = pos_in,
   3049		.nodeid_out = ff_out->nodeid,
   3050		.fh_out = ff_out->fh,
   3051		.off_out = pos_out,
   3052		.len = len,
   3053		.flags = flags
   3054	};
   3055	struct fuse_write_out outarg;
   3056	ssize_t err;
   3057	/* mark unstable when write-back is not used, and file_out gets
   3058	 * extended */
   3059	bool is_unstable = (!fc->writeback_cache) &&
   3060			   ((pos_out + len) > inode_out->i_size);
   3061
   3062	if (fc->no_copy_file_range)
   3063		return -EOPNOTSUPP;
   3064
   3065	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
   3066		return -EXDEV;
   3067
   3068	inode_lock(inode_in);
   3069	err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
   3070	inode_unlock(inode_in);
   3071	if (err)
   3072		return err;
   3073
   3074	inode_lock(inode_out);
   3075
   3076	err = file_modified(file_out);
   3077	if (err)
   3078		goto out;
   3079
   3080	/*
   3081	 * Write out dirty pages in the destination file before sending the COPY
   3082	 * request to userspace.  After the request is completed, truncate off
   3083	 * pages (including partial ones) from the cache that have been copied,
   3084	 * since these contain stale data at that point.
   3085	 *
   3086	 * This should be mostly correct, but if the COPY writes to partial
   3087	 * pages (at the start or end) and the parts not covered by the COPY are
   3088	 * written through a memory map after calling fuse_writeback_range(),
   3089	 * then these partial page modifications will be lost on truncation.
   3090	 *
   3091	 * It is unlikely that someone would rely on such mixed style
    3092	 * modifications.  Yet this does give fewer guarantees than if the
    3093	 * copying were performed with write(2).
   3094	 *
   3095	 * To fix this a mapping->invalidate_lock could be used to prevent new
   3096	 * faults while the copy is ongoing.
   3097	 */
   3098	err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
   3099	if (err)
   3100		goto out;
   3101
   3102	if (is_unstable)
   3103		set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
   3104
   3105	args.opcode = FUSE_COPY_FILE_RANGE;
   3106	args.nodeid = ff_in->nodeid;
   3107	args.in_numargs = 1;
   3108	args.in_args[0].size = sizeof(inarg);
   3109	args.in_args[0].value = &inarg;
   3110	args.out_numargs = 1;
   3111	args.out_args[0].size = sizeof(outarg);
   3112	args.out_args[0].value = &outarg;
   3113	err = fuse_simple_request(fm, &args);
   3114	if (err == -ENOSYS) {
   3115		fc->no_copy_file_range = 1;
   3116		err = -EOPNOTSUPP;
   3117	}
   3118	if (err)
   3119		goto out;
   3120
   3121	truncate_inode_pages_range(inode_out->i_mapping,
   3122				   ALIGN_DOWN(pos_out, PAGE_SIZE),
   3123				   ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
   3124
   3125	file_update_time(file_out);
   3126	fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size);
   3127
   3128	err = outarg.size;
   3129out:
   3130	if (is_unstable)
   3131		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
   3132
   3133	inode_unlock(inode_out);
   3134	file_accessed(file_in);
   3135
   3136	fuse_flush_time_update(inode_out);
   3137
   3138	return err;
   3139}
   3140
   3141static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
   3142				    struct file *dst_file, loff_t dst_off,
   3143				    size_t len, unsigned int flags)
   3144{
   3145	ssize_t ret;
   3146
   3147	ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
   3148				     len, flags);
   3149
   3150	if (ret == -EOPNOTSUPP || ret == -EXDEV)
   3151		ret = generic_copy_file_range(src_file, src_off, dst_file,
   3152					      dst_off, len, flags);
   3153	return ret;
   3154}
   3155
   3156static const struct file_operations fuse_file_operations = {
   3157	.llseek		= fuse_file_llseek,
   3158	.read_iter	= fuse_file_read_iter,
   3159	.write_iter	= fuse_file_write_iter,
   3160	.mmap		= fuse_file_mmap,
   3161	.open		= fuse_open,
   3162	.flush		= fuse_flush,
   3163	.release	= fuse_release,
   3164	.fsync		= fuse_fsync,
   3165	.lock		= fuse_file_lock,
   3166	.get_unmapped_area = thp_get_unmapped_area,
   3167	.flock		= fuse_file_flock,
   3168	.splice_read	= generic_file_splice_read,
   3169	.splice_write	= iter_file_splice_write,
   3170	.unlocked_ioctl	= fuse_file_ioctl,
   3171	.compat_ioctl	= fuse_file_compat_ioctl,
   3172	.poll		= fuse_file_poll,
   3173	.fallocate	= fuse_file_fallocate,
   3174	.copy_file_range = fuse_copy_file_range,
   3175};
   3176
   3177static const struct address_space_operations fuse_file_aops  = {
   3178	.read_folio	= fuse_read_folio,
   3179	.readahead	= fuse_readahead,
   3180	.writepage	= fuse_writepage,
   3181	.writepages	= fuse_writepages,
   3182	.launder_folio	= fuse_launder_folio,
   3183	.dirty_folio	= filemap_dirty_folio,
   3184	.bmap		= fuse_bmap,
   3185	.direct_IO	= fuse_direct_IO,
   3186	.write_begin	= fuse_write_begin,
   3187	.write_end	= fuse_write_end,
   3188};
   3189
   3190void fuse_init_file_inode(struct inode *inode, unsigned int flags)
   3191{
   3192	struct fuse_inode *fi = get_fuse_inode(inode);
   3193
   3194	inode->i_fop = &fuse_file_operations;
   3195	inode->i_data.a_ops = &fuse_file_aops;
   3196
   3197	INIT_LIST_HEAD(&fi->write_files);
   3198	INIT_LIST_HEAD(&fi->queued_writes);
   3199	fi->writectr = 0;
   3200	init_waitqueue_head(&fi->page_waitq);
   3201	fi->writepages = RB_ROOT;
   3202
   3203	if (IS_ENABLED(CONFIG_FUSE_DAX))
   3204		fuse_dax_inode_init(inode, flags);
   3205}