cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

read_write.c (41135B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 *  linux/fs/read_write.c
      4 *
      5 *  Copyright (C) 1991, 1992  Linus Torvalds
      6 */
      7
      8#include <linux/slab.h>
      9#include <linux/stat.h>
     10#include <linux/sched/xacct.h>
     11#include <linux/fcntl.h>
     12#include <linux/file.h>
     13#include <linux/uio.h>
     14#include <linux/fsnotify.h>
     15#include <linux/security.h>
     16#include <linux/export.h>
     17#include <linux/syscalls.h>
     18#include <linux/pagemap.h>
     19#include <linux/splice.h>
     20#include <linux/compat.h>
     21#include <linux/mount.h>
     22#include <linux/fs.h>
     23#include "internal.h"
     24
     25#include <linux/uaccess.h>
     26#include <asm/unistd.h>
     27
     28const struct file_operations generic_ro_fops = {
     29	.llseek		= generic_file_llseek,
     30	.read_iter	= generic_file_read_iter,
     31	.mmap		= generic_file_readonly_mmap,
     32	.splice_read	= generic_file_splice_read,
     33};
     34
     35EXPORT_SYMBOL(generic_ro_fops);
     36
     37static inline bool unsigned_offsets(struct file *file)
     38{
     39	return file->f_mode & FMODE_UNSIGNED_OFFSET;
     40}
     41
     42/**
     43 * vfs_setpos - update the file offset for lseek
     44 * @file:	file structure in question
     45 * @offset:	file offset to seek to
     46 * @maxsize:	maximum file size
     47 *
     48 * This is a low-level filesystem helper for updating the file offset to
     49 * the value specified by @offset if the given offset is valid and it is
     50 * not equal to the current file offset.
     51 *
     52 * Return the specified offset on success and -EINVAL on invalid offset.
     53 */
     54loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
     55{
     56	if (offset < 0 && !unsigned_offsets(file))
     57		return -EINVAL;
     58	if (offset > maxsize)
     59		return -EINVAL;
     60
     61	if (offset != file->f_pos) {
     62		file->f_pos = offset;
     63		file->f_version = 0;
     64	}
     65	return offset;
     66}
     67EXPORT_SYMBOL(vfs_setpos);
     68
     69/**
     70 * generic_file_llseek_size - generic llseek implementation for regular files
     71 * @file:	file structure to seek on
     72 * @offset:	file offset to seek to
     73 * @whence:	type of seek
     74 * @size:	max size of this file in file system
     75 * @eof:	offset used for SEEK_END position
     76 *
     77 * This is a variant of generic_file_llseek that allows passing in a custom
     78 * maximum file size and a custom EOF position, for e.g. hashed directories
     79 *
     80 * Synchronization:
     81 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
     82 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
     83 * read/writes behave like SEEK_SET against seeks.
     84 */
     85loff_t
     86generic_file_llseek_size(struct file *file, loff_t offset, int whence,
     87		loff_t maxsize, loff_t eof)
     88{
     89	switch (whence) {
     90	case SEEK_END:
     91		offset += eof;
     92		break;
     93	case SEEK_CUR:
     94		/*
     95		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
     96		 * position-querying operation.  Avoid rewriting the "same"
     97		 * f_pos value back to the file because a concurrent read(),
     98		 * write() or lseek() might have altered it
     99		 */
    100		if (offset == 0)
    101			return file->f_pos;
    102		/*
    103		 * f_lock protects against read/modify/write race with other
    104		 * SEEK_CURs. Note that parallel writes and reads behave
    105		 * like SEEK_SET.
    106		 */
    107		spin_lock(&file->f_lock);
    108		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
    109		spin_unlock(&file->f_lock);
    110		return offset;
    111	case SEEK_DATA:
    112		/*
    113		 * In the generic case the entire file is data, so as long as
    114		 * offset isn't at the end of the file then the offset is data.
    115		 */
    116		if ((unsigned long long)offset >= eof)
    117			return -ENXIO;
    118		break;
    119	case SEEK_HOLE:
    120		/*
    121		 * There is a virtual hole at the end of the file, so as long as
    122		 * offset isn't i_size or larger, return i_size.
    123		 */
    124		if ((unsigned long long)offset >= eof)
    125			return -ENXIO;
    126		offset = eof;
    127		break;
    128	}
    129
    130	return vfs_setpos(file, offset, maxsize);
    131}
    132EXPORT_SYMBOL(generic_file_llseek_size);
    133
    134/**
    135 * generic_file_llseek - generic llseek implementation for regular files
    136 * @file:	file structure to seek on
    137 * @offset:	file offset to seek to
    138 * @whence:	type of seek
    139 *
    140 * This is a generic implemenation of ->llseek useable for all normal local
    141 * filesystems.  It just updates the file offset to the value specified by
    142 * @offset and @whence.
    143 */
    144loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
    145{
    146	struct inode *inode = file->f_mapping->host;
    147
    148	return generic_file_llseek_size(file, offset, whence,
    149					inode->i_sb->s_maxbytes,
    150					i_size_read(inode));
    151}
    152EXPORT_SYMBOL(generic_file_llseek);
    153
    154/**
    155 * fixed_size_llseek - llseek implementation for fixed-sized devices
    156 * @file:	file structure to seek on
    157 * @offset:	file offset to seek to
    158 * @whence:	type of seek
    159 * @size:	size of the file
    160 *
    161 */
    162loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
    163{
    164	switch (whence) {
    165	case SEEK_SET: case SEEK_CUR: case SEEK_END:
    166		return generic_file_llseek_size(file, offset, whence,
    167						size, size);
    168	default:
    169		return -EINVAL;
    170	}
    171}
    172EXPORT_SYMBOL(fixed_size_llseek);
    173
    174/**
    175 * no_seek_end_llseek - llseek implementation for fixed-sized devices
    176 * @file:	file structure to seek on
    177 * @offset:	file offset to seek to
    178 * @whence:	type of seek
    179 *
    180 */
    181loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
    182{
    183	switch (whence) {
    184	case SEEK_SET: case SEEK_CUR:
    185		return generic_file_llseek_size(file, offset, whence,
    186						OFFSET_MAX, 0);
    187	default:
    188		return -EINVAL;
    189	}
    190}
    191EXPORT_SYMBOL(no_seek_end_llseek);
    192
    193/**
    194 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
    195 * @file:	file structure to seek on
    196 * @offset:	file offset to seek to
    197 * @whence:	type of seek
    198 * @size:	maximal offset allowed
    199 *
    200 */
    201loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
    202{
    203	switch (whence) {
    204	case SEEK_SET: case SEEK_CUR:
    205		return generic_file_llseek_size(file, offset, whence,
    206						size, 0);
    207	default:
    208		return -EINVAL;
    209	}
    210}
    211EXPORT_SYMBOL(no_seek_end_llseek_size);
    212
/**
 * noop_llseek - No Operation Performed llseek implementation
 * @file:	file structure to seek on
 * @offset:	file offset to seek to (ignored)
 * @whence:	type of seek (ignored)
 *
 * This is an implementation of ->llseek useable for the rare special case when
 * userspace expects the seek to succeed but the (device) file is actually not
 * able to perform the seek. In this case you use noop_llseek() instead of
 * falling back to the default implementation of ->llseek.
 *
 * Return: the current value of @file->f_pos; the position is not modified.
 */
loff_t noop_llseek(struct file *file, loff_t offset, int whence)
{
	return file->f_pos;
}
EXPORT_SYMBOL(noop_llseek);
    229
/*
 * llseek implementation that always fails: every argument is ignored and
 * -ESPIPE is returned.
 */
loff_t no_llseek(struct file *file, loff_t offset, int whence)
{
	return -ESPIPE;
}
EXPORT_SYMBOL(no_llseek);
    235
    236loff_t default_llseek(struct file *file, loff_t offset, int whence)
    237{
    238	struct inode *inode = file_inode(file);
    239	loff_t retval;
    240
    241	inode_lock(inode);
    242	switch (whence) {
    243		case SEEK_END:
    244			offset += i_size_read(inode);
    245			break;
    246		case SEEK_CUR:
    247			if (offset == 0) {
    248				retval = file->f_pos;
    249				goto out;
    250			}
    251			offset += file->f_pos;
    252			break;
    253		case SEEK_DATA:
    254			/*
    255			 * In the generic case the entire file is data, so as
    256			 * long as offset isn't at the end of the file then the
    257			 * offset is data.
    258			 */
    259			if (offset >= inode->i_size) {
    260				retval = -ENXIO;
    261				goto out;
    262			}
    263			break;
    264		case SEEK_HOLE:
    265			/*
    266			 * There is a virtual hole at the end of the file, so
    267			 * as long as offset isn't i_size or larger, return
    268			 * i_size.
    269			 */
    270			if (offset >= inode->i_size) {
    271				retval = -ENXIO;
    272				goto out;
    273			}
    274			offset = inode->i_size;
    275			break;
    276	}
    277	retval = -EINVAL;
    278	if (offset >= 0 || unsigned_offsets(file)) {
    279		if (offset != file->f_pos) {
    280			file->f_pos = offset;
    281			file->f_version = 0;
    282		}
    283		retval = offset;
    284	}
    285out:
    286	inode_unlock(inode);
    287	return retval;
    288}
    289EXPORT_SYMBOL(default_llseek);
    290
    291loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
    292{
    293	loff_t (*fn)(struct file *, loff_t, int);
    294
    295	fn = no_llseek;
    296	if (file->f_mode & FMODE_LSEEK) {
    297		if (file->f_op->llseek)
    298			fn = file->f_op->llseek;
    299	}
    300	return fn(file, offset, whence);
    301}
    302EXPORT_SYMBOL(vfs_llseek);
    303
    304static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
    305{
    306	off_t retval;
    307	struct fd f = fdget_pos(fd);
    308	if (!f.file)
    309		return -EBADF;
    310
    311	retval = -EINVAL;
    312	if (whence <= SEEK_MAX) {
    313		loff_t res = vfs_llseek(f.file, offset, whence);
    314		retval = res;
    315		if (res != (loff_t)retval)
    316			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
    317	}
    318	fdput_pos(f);
    319	return retval;
    320}
    321
/* lseek system call: forwards its arguments unchanged to ksys_lseek(). */
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}
    326
#ifdef CONFIG_COMPAT
/*
 * Compat flavour of lseek: same as the native entry point except that the
 * offset arrives as a compat_off_t before being handed to ksys_lseek().
 */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}
#endif
    333
    334#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
    335	defined(__ARCH_WANT_SYS_LLSEEK)
    336SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
    337		unsigned long, offset_low, loff_t __user *, result,
    338		unsigned int, whence)
    339{
    340	int retval;
    341	struct fd f = fdget_pos(fd);
    342	loff_t offset;
    343
    344	if (!f.file)
    345		return -EBADF;
    346
    347	retval = -EINVAL;
    348	if (whence > SEEK_MAX)
    349		goto out_putf;
    350
    351	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
    352			whence);
    353
    354	retval = (int)offset;
    355	if (offset >= 0) {
    356		retval = -EFAULT;
    357		if (!copy_to_user(result, &offset, sizeof(offset)))
    358			retval = 0;
    359	}
    360out_putf:
    361	fdput_pos(f);
    362	return retval;
    363}
    364#endif
    365
/*
 * Sanity-check a (position, count) pair before a read or write and ask the
 * security layer whether the access is permitted.
 *
 * @read_write: READ selects MAY_READ, anything else selects MAY_WRITE.
 * @ppos may be NULL, in which case only @count is checked.
 *
 * Returns -EINVAL or -EOVERFLOW for a rejected range, otherwise whatever
 * security_file_permission() returns.
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	/* A count that is negative when viewed as ssize_t is rejected. */
	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	if (ppos) {
		loff_t pos = *ppos;

		if (unlikely(pos < 0)) {
			/* Negative positions need FMODE_UNSIGNED_OFFSET. */
			if (!unsigned_offsets(file))
				return -EINVAL;
			if (count >= -pos) /* both values are in 0..LLONG_MAX */
				return -EOVERFLOW;
		} else if (unlikely((loff_t) (pos + count) < 0)) {
			/*
			 * pos + count turned negative as a loff_t; only files
			 * with unsigned offsets may proceed.
			 */
			if (!unsigned_offsets(file))
				return -EINVAL;
		}
	}

	return security_file_permission(file,
				read_write == READ ? MAY_READ : MAY_WRITE);
}
EXPORT_SYMBOL(rw_verify_area);
    389
    390static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
    391{
    392	struct iovec iov = { .iov_base = buf, .iov_len = len };
    393	struct kiocb kiocb;
    394	struct iov_iter iter;
    395	ssize_t ret;
    396
    397	init_sync_kiocb(&kiocb, filp);
    398	kiocb.ki_pos = (ppos ? *ppos : 0);
    399	iov_iter_init(&iter, READ, &iov, 1, len);
    400
    401	ret = call_read_iter(filp, &kiocb, &iter);
    402	BUG_ON(ret == -EIOCBQUEUED);
    403	if (ppos)
    404		*ppos = kiocb.ki_pos;
    405	return ret;
    406}
    407
/*
 * Emit a rate-limited warning that the in-kernel operation named by @op is
 * not supported for @file, including the current task's pid and comm.
 * Always returns -EINVAL so callers can return the result directly.
 */
static int warn_unsupported(struct file *file, const char *op)
{
	pr_warn_ratelimited(
		"kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
		op, file, current->pid, current->comm);
	return -EINVAL;
}
    415
    416ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
    417{
    418	struct kvec iov = {
    419		.iov_base	= buf,
    420		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
    421	};
    422	struct kiocb kiocb;
    423	struct iov_iter iter;
    424	ssize_t ret;
    425
    426	if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
    427		return -EINVAL;
    428	if (!(file->f_mode & FMODE_CAN_READ))
    429		return -EINVAL;
    430	/*
    431	 * Also fail if ->read_iter and ->read are both wired up as that
    432	 * implies very convoluted semantics.
    433	 */
    434	if (unlikely(!file->f_op->read_iter || file->f_op->read))
    435		return warn_unsupported(file, "read");
    436
    437	init_sync_kiocb(&kiocb, file);
    438	kiocb.ki_pos = pos ? *pos : 0;
    439	iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len);
    440	ret = file->f_op->read_iter(&kiocb, &iter);
    441	if (ret > 0) {
    442		if (pos)
    443			*pos = kiocb.ki_pos;
    444		fsnotify_access(file);
    445		add_rchar(current, ret);
    446	}
    447	inc_syscr(current);
    448	return ret;
    449}
    450
    451ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
    452{
    453	ssize_t ret;
    454
    455	ret = rw_verify_area(READ, file, pos, count);
    456	if (ret)
    457		return ret;
    458	return __kernel_read(file, buf, count, pos);
    459}
    460EXPORT_SYMBOL(kernel_read);
    461
    462ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
    463{
    464	ssize_t ret;
    465
    466	if (!(file->f_mode & FMODE_READ))
    467		return -EBADF;
    468	if (!(file->f_mode & FMODE_CAN_READ))
    469		return -EINVAL;
    470	if (unlikely(!access_ok(buf, count)))
    471		return -EFAULT;
    472
    473	ret = rw_verify_area(READ, file, pos, count);
    474	if (ret)
    475		return ret;
    476	if (count > MAX_RW_COUNT)
    477		count =  MAX_RW_COUNT;
    478
    479	if (file->f_op->read)
    480		ret = file->f_op->read(file, buf, count, pos);
    481	else if (file->f_op->read_iter)
    482		ret = new_sync_read(file, buf, count, pos);
    483	else
    484		ret = -EINVAL;
    485	if (ret > 0) {
    486		fsnotify_access(file);
    487		add_rchar(current, ret);
    488	}
    489	inc_syscr(current);
    490	return ret;
    491}
    492
    493static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
    494{
    495	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
    496	struct kiocb kiocb;
    497	struct iov_iter iter;
    498	ssize_t ret;
    499
    500	init_sync_kiocb(&kiocb, filp);
    501	kiocb.ki_pos = (ppos ? *ppos : 0);
    502	iov_iter_init(&iter, WRITE, &iov, 1, len);
    503
    504	ret = call_write_iter(filp, &kiocb, &iter);
    505	BUG_ON(ret == -EIOCBQUEUED);
    506	if (ret > 0 && ppos)
    507		*ppos = kiocb.ki_pos;
    508	return ret;
    509}
    510
    511/* caller is responsible for file_start_write/file_end_write */
    512ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
    513{
    514	struct kvec iov = {
    515		.iov_base	= (void *)buf,
    516		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
    517	};
    518	struct kiocb kiocb;
    519	struct iov_iter iter;
    520	ssize_t ret;
    521
    522	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
    523		return -EBADF;
    524	if (!(file->f_mode & FMODE_CAN_WRITE))
    525		return -EINVAL;
    526	/*
    527	 * Also fail if ->write_iter and ->write are both wired up as that
    528	 * implies very convoluted semantics.
    529	 */
    530	if (unlikely(!file->f_op->write_iter || file->f_op->write))
    531		return warn_unsupported(file, "write");
    532
    533	init_sync_kiocb(&kiocb, file);
    534	kiocb.ki_pos = pos ? *pos : 0;
    535	iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
    536	ret = file->f_op->write_iter(&kiocb, &iter);
    537	if (ret > 0) {
    538		if (pos)
    539			*pos = kiocb.ki_pos;
    540		fsnotify_modify(file);
    541		add_wchar(current, ret);
    542	}
    543	inc_syscw(current);
    544	return ret;
    545}
    546/*
    547 * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
    548 * but autofs is one of the few internal kernel users that actually
    549 * wants this _and_ can be built as a module. So we need to export
    550 * this symbol for autofs, even though it really isn't appropriate
    551 * for any other kernel modules.
    552 */
    553EXPORT_SYMBOL_GPL(__kernel_write);
    554
    555ssize_t kernel_write(struct file *file, const void *buf, size_t count,
    556			    loff_t *pos)
    557{
    558	ssize_t ret;
    559
    560	ret = rw_verify_area(WRITE, file, pos, count);
    561	if (ret)
    562		return ret;
    563
    564	file_start_write(file);
    565	ret =  __kernel_write(file, buf, count, pos);
    566	file_end_write(file);
    567	return ret;
    568}
    569EXPORT_SYMBOL(kernel_write);
    570
    571ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
    572{
    573	ssize_t ret;
    574
    575	if (!(file->f_mode & FMODE_WRITE))
    576		return -EBADF;
    577	if (!(file->f_mode & FMODE_CAN_WRITE))
    578		return -EINVAL;
    579	if (unlikely(!access_ok(buf, count)))
    580		return -EFAULT;
    581
    582	ret = rw_verify_area(WRITE, file, pos, count);
    583	if (ret)
    584		return ret;
    585	if (count > MAX_RW_COUNT)
    586		count =  MAX_RW_COUNT;
    587	file_start_write(file);
    588	if (file->f_op->write)
    589		ret = file->f_op->write(file, buf, count, pos);
    590	else if (file->f_op->write_iter)
    591		ret = new_sync_write(file, buf, count, pos);
    592	else
    593		ret = -EINVAL;
    594	if (ret > 0) {
    595		fsnotify_modify(file);
    596		add_wchar(current, ret);
    597	}
    598	inc_syscw(current);
    599	file_end_write(file);
    600	return ret;
    601}
    602
    603/* file_ppos returns &file->f_pos or NULL if file is stream */
    604static inline loff_t *file_ppos(struct file *file)
    605{
    606	return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
    607}
    608
    609ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
    610{
    611	struct fd f = fdget_pos(fd);
    612	ssize_t ret = -EBADF;
    613
    614	if (f.file) {
    615		loff_t pos, *ppos = file_ppos(f.file);
    616		if (ppos) {
    617			pos = *ppos;
    618			ppos = &pos;
    619		}
    620		ret = vfs_read(f.file, buf, count, ppos);
    621		if (ret >= 0 && ppos)
    622			f.file->f_pos = pos;
    623		fdput_pos(f);
    624	}
    625	return ret;
    626}
    627
/* read system call: forwards its arguments unchanged to ksys_read(). */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	return ksys_read(fd, buf, count);
}
    632
    633ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
    634{
    635	struct fd f = fdget_pos(fd);
    636	ssize_t ret = -EBADF;
    637
    638	if (f.file) {
    639		loff_t pos, *ppos = file_ppos(f.file);
    640		if (ppos) {
    641			pos = *ppos;
    642			ppos = &pos;
    643		}
    644		ret = vfs_write(f.file, buf, count, ppos);
    645		if (ret >= 0 && ppos)
    646			f.file->f_pos = pos;
    647		fdput_pos(f);
    648	}
    649
    650	return ret;
    651}
    652
/* write system call: forwards its arguments unchanged to ksys_write(). */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	return ksys_write(fd, buf, count);
}
    658
    659ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
    660		     loff_t pos)
    661{
    662	struct fd f;
    663	ssize_t ret = -EBADF;
    664
    665	if (pos < 0)
    666		return -EINVAL;
    667
    668	f = fdget(fd);
    669	if (f.file) {
    670		ret = -ESPIPE;
    671		if (f.file->f_mode & FMODE_PREAD)
    672			ret = vfs_read(f.file, buf, count, &pos);
    673		fdput(f);
    674	}
    675
    676	return ret;
    677}
    678
/* pread64 system call: forwards its arguments unchanged to ksys_pread64(). */
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
			size_t, count, loff_t, pos)
{
	return ksys_pread64(fd, buf, count, pos);
}
    684
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64)
/*
 * Compat flavour of pread64.  The position is declared through
 * compat_arg_u64_dual() and recombined with compat_arg_u64_glue() before
 * calling ksys_pread64().
 * NOTE(review): presumably the 64-bit position arrives as two 32-bit
 * halves - confirm against the macro definitions.
 */
COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
		       size_t, count, compat_arg_u64_dual(pos))
{
	return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif
    692
    693ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
    694		      size_t count, loff_t pos)
    695{
    696	struct fd f;
    697	ssize_t ret = -EBADF;
    698
    699	if (pos < 0)
    700		return -EINVAL;
    701
    702	f = fdget(fd);
    703	if (f.file) {
    704		ret = -ESPIPE;
    705		if (f.file->f_mode & FMODE_PWRITE)  
    706			ret = vfs_write(f.file, buf, count, &pos);
    707		fdput(f);
    708	}
    709
    710	return ret;
    711}
    712
/* pwrite64 system call: forwards its arguments unchanged to ksys_pwrite64(). */
SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
			 size_t, count, loff_t, pos)
{
	return ksys_pwrite64(fd, buf, count, pos);
}
    718
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64)
/*
 * Compat flavour of pwrite64.  The position is declared through
 * compat_arg_u64_dual() and recombined with compat_arg_u64_glue() before
 * calling ksys_pwrite64().
 * NOTE(review): presumably the 64-bit position arrives as two 32-bit
 * halves - confirm against the macro definitions.
 */
COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf,
		       size_t, count, compat_arg_u64_dual(pos))
{
	return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif
    726
    727static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
    728		loff_t *ppos, int type, rwf_t flags)
    729{
    730	struct kiocb kiocb;
    731	ssize_t ret;
    732
    733	init_sync_kiocb(&kiocb, filp);
    734	ret = kiocb_set_rw_flags(&kiocb, flags);
    735	if (ret)
    736		return ret;
    737	kiocb.ki_pos = (ppos ? *ppos : 0);
    738
    739	if (type == READ)
    740		ret = call_read_iter(filp, &kiocb, iter);
    741	else
    742		ret = call_write_iter(filp, &kiocb, iter);
    743	BUG_ON(ret == -EIOCBQUEUED);
    744	if (ppos)
    745		*ppos = kiocb.ki_pos;
    746	return ret;
    747}
    748
/*
 * Do it by hand, with file-ops.
 *
 * Walks @iter one segment at a time and calls ->read (for READ) or ->write
 * (otherwise) on each segment.  Any flag other than RWF_HIPRI is refused
 * with -EOPNOTSUPP.
 *
 * Returns the total number of bytes transferred.  An error from the file
 * operation is only returned when nothing was transferred before it.
 */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
		loff_t *ppos, int type, rwf_t flags)
{
	ssize_t ret = 0;

	if (flags & ~RWF_HIPRI)
		return -EOPNOTSUPP;

	while (iov_iter_count(iter)) {
		/* Current segment of the iterator. */
		struct iovec iovec = iov_iter_iovec(iter);
		ssize_t nr;

		if (type == READ) {
			nr = filp->f_op->read(filp, iovec.iov_base,
					      iovec.iov_len, ppos);
		} else {
			nr = filp->f_op->write(filp, iovec.iov_base,
					       iovec.iov_len, ppos);
		}

		if (nr < 0) {
			/* Report the error only if no data moved so far. */
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		/* A short transfer ends the loop with the partial total. */
		if (nr != iovec.iov_len)
			break;
		iov_iter_advance(iter, nr);
	}

	return ret;
}
    783
    784static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
    785		loff_t *pos, rwf_t flags)
    786{
    787	size_t tot_len;
    788	ssize_t ret = 0;
    789
    790	if (!(file->f_mode & FMODE_READ))
    791		return -EBADF;
    792	if (!(file->f_mode & FMODE_CAN_READ))
    793		return -EINVAL;
    794
    795	tot_len = iov_iter_count(iter);
    796	if (!tot_len)
    797		goto out;
    798	ret = rw_verify_area(READ, file, pos, tot_len);
    799	if (ret < 0)
    800		return ret;
    801
    802	if (file->f_op->read_iter)
    803		ret = do_iter_readv_writev(file, iter, pos, READ, flags);
    804	else
    805		ret = do_loop_readv_writev(file, iter, pos, READ, flags);
    806out:
    807	if (ret >= 0)
    808		fsnotify_access(file);
    809	return ret;
    810}
    811
    812ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
    813			   struct iov_iter *iter)
    814{
    815	size_t tot_len;
    816	ssize_t ret = 0;
    817
    818	if (!file->f_op->read_iter)
    819		return -EINVAL;
    820	if (!(file->f_mode & FMODE_READ))
    821		return -EBADF;
    822	if (!(file->f_mode & FMODE_CAN_READ))
    823		return -EINVAL;
    824
    825	tot_len = iov_iter_count(iter);
    826	if (!tot_len)
    827		goto out;
    828	ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
    829	if (ret < 0)
    830		return ret;
    831
    832	ret = call_read_iter(file, iocb, iter);
    833out:
    834	if (ret >= 0)
    835		fsnotify_access(file);
    836	return ret;
    837}
    838EXPORT_SYMBOL(vfs_iocb_iter_read);
    839
    840ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
    841		rwf_t flags)
    842{
    843	if (!file->f_op->read_iter)
    844		return -EINVAL;
    845	return do_iter_read(file, iter, ppos, flags);
    846}
    847EXPORT_SYMBOL(vfs_iter_read);
    848
    849static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
    850		loff_t *pos, rwf_t flags)
    851{
    852	size_t tot_len;
    853	ssize_t ret = 0;
    854
    855	if (!(file->f_mode & FMODE_WRITE))
    856		return -EBADF;
    857	if (!(file->f_mode & FMODE_CAN_WRITE))
    858		return -EINVAL;
    859
    860	tot_len = iov_iter_count(iter);
    861	if (!tot_len)
    862		return 0;
    863	ret = rw_verify_area(WRITE, file, pos, tot_len);
    864	if (ret < 0)
    865		return ret;
    866
    867	if (file->f_op->write_iter)
    868		ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
    869	else
    870		ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
    871	if (ret > 0)
    872		fsnotify_modify(file);
    873	return ret;
    874}
    875
    876ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
    877			    struct iov_iter *iter)
    878{
    879	size_t tot_len;
    880	ssize_t ret = 0;
    881
    882	if (!file->f_op->write_iter)
    883		return -EINVAL;
    884	if (!(file->f_mode & FMODE_WRITE))
    885		return -EBADF;
    886	if (!(file->f_mode & FMODE_CAN_WRITE))
    887		return -EINVAL;
    888
    889	tot_len = iov_iter_count(iter);
    890	if (!tot_len)
    891		return 0;
    892	ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
    893	if (ret < 0)
    894		return ret;
    895
    896	ret = call_write_iter(file, iocb, iter);
    897	if (ret > 0)
    898		fsnotify_modify(file);
    899
    900	return ret;
    901}
    902EXPORT_SYMBOL(vfs_iocb_iter_write);
    903
    904ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
    905		rwf_t flags)
    906{
    907	if (!file->f_op->write_iter)
    908		return -EINVAL;
    909	return do_iter_write(file, iter, ppos, flags);
    910}
    911EXPORT_SYMBOL(vfs_iter_write);
    912
    913static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
    914		  unsigned long vlen, loff_t *pos, rwf_t flags)
    915{
    916	struct iovec iovstack[UIO_FASTIOV];
    917	struct iovec *iov = iovstack;
    918	struct iov_iter iter;
    919	ssize_t ret;
    920
    921	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
    922	if (ret >= 0) {
    923		ret = do_iter_read(file, &iter, pos, flags);
    924		kfree(iov);
    925	}
    926
    927	return ret;
    928}
    929
    930static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
    931		   unsigned long vlen, loff_t *pos, rwf_t flags)
    932{
    933	struct iovec iovstack[UIO_FASTIOV];
    934	struct iovec *iov = iovstack;
    935	struct iov_iter iter;
    936	ssize_t ret;
    937
    938	ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
    939	if (ret >= 0) {
    940		file_start_write(file);
    941		ret = do_iter_write(file, &iter, pos, flags);
    942		file_end_write(file);
    943		kfree(iov);
    944	}
    945	return ret;
    946}
    947
    948static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
    949			unsigned long vlen, rwf_t flags)
    950{
    951	struct fd f = fdget_pos(fd);
    952	ssize_t ret = -EBADF;
    953
    954	if (f.file) {
    955		loff_t pos, *ppos = file_ppos(f.file);
    956		if (ppos) {
    957			pos = *ppos;
    958			ppos = &pos;
    959		}
    960		ret = vfs_readv(f.file, vec, vlen, ppos, flags);
    961		if (ret >= 0 && ppos)
    962			f.file->f_pos = pos;
    963		fdput_pos(f);
    964	}
    965
    966	if (ret > 0)
    967		add_rchar(current, ret);
    968	inc_syscr(current);
    969	return ret;
    970}
    971
    972static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
    973			 unsigned long vlen, rwf_t flags)
    974{
    975	struct fd f = fdget_pos(fd);
    976	ssize_t ret = -EBADF;
    977
    978	if (f.file) {
    979		loff_t pos, *ppos = file_ppos(f.file);
    980		if (ppos) {
    981			pos = *ppos;
    982			ppos = &pos;
    983		}
    984		ret = vfs_writev(f.file, vec, vlen, ppos, flags);
    985		if (ret >= 0 && ppos)
    986			f.file->f_pos = pos;
    987		fdput_pos(f);
    988	}
    989
    990	if (ret > 0)
    991		add_wchar(current, ret);
    992	inc_syscw(current);
    993	return ret;
    994}
    995
/*
 * Build a loff_t position from the two unsigned long halves passed by the
 * preadv/pwritev family: @high is shifted left by BITS_PER_LONG in total
 * and OR-ed with @low.
 *
 * NOTE(review): the shift is done in two half-width steps, presumably so
 * that no single shift count reaches the width of loff_t when
 * BITS_PER_LONG is 64 - confirm.  HALF_LONG_BITS stays defined for the
 * rest of the file after this point.
 */
static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
{
#define HALF_LONG_BITS (BITS_PER_LONG / 2)
	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}
   1001
   1002static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
   1003			 unsigned long vlen, loff_t pos, rwf_t flags)
   1004{
   1005	struct fd f;
   1006	ssize_t ret = -EBADF;
   1007
   1008	if (pos < 0)
   1009		return -EINVAL;
   1010
   1011	f = fdget(fd);
   1012	if (f.file) {
   1013		ret = -ESPIPE;
   1014		if (f.file->f_mode & FMODE_PREAD)
   1015			ret = vfs_readv(f.file, vec, vlen, &pos, flags);
   1016		fdput(f);
   1017	}
   1018
   1019	if (ret > 0)
   1020		add_rchar(current, ret);
   1021	inc_syscr(current);
   1022	return ret;
   1023}
   1024
   1025static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
   1026			  unsigned long vlen, loff_t pos, rwf_t flags)
   1027{
   1028	struct fd f;
   1029	ssize_t ret = -EBADF;
   1030
   1031	if (pos < 0)
   1032		return -EINVAL;
   1033
   1034	f = fdget(fd);
   1035	if (f.file) {
   1036		ret = -ESPIPE;
   1037		if (f.file->f_mode & FMODE_PWRITE)
   1038			ret = vfs_writev(f.file, vec, vlen, &pos, flags);
   1039		fdput(f);
   1040	}
   1041
   1042	if (ret > 0)
   1043		add_wchar(current, ret);
   1044	inc_syscw(current);
   1045	return ret;
   1046}
   1047
/* readv system call: do_readv() with no per-call flags. */
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	return do_readv(fd, vec, vlen, 0);
}
   1053
   1054SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
   1055		unsigned long, vlen)
   1056{
   1057	return do_writev(fd, vec, vlen, 0);
   1058}
   1059
   1060SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
   1061		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
   1062{
   1063	loff_t pos = pos_from_hilo(pos_h, pos_l);
   1064
   1065	return do_preadv(fd, vec, vlen, pos, 0);
   1066}
   1067
   1068SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
   1069		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
   1070		rwf_t, flags)
   1071{
   1072	loff_t pos = pos_from_hilo(pos_h, pos_l);
   1073
   1074	if (pos == -1)
   1075		return do_readv(fd, vec, vlen, flags);
   1076
   1077	return do_preadv(fd, vec, vlen, pos, flags);
   1078}
   1079
   1080SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
   1081		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
   1082{
   1083	loff_t pos = pos_from_hilo(pos_h, pos_l);
   1084
   1085	return do_pwritev(fd, vec, vlen, pos, 0);
   1086}
   1087
   1088SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
   1089		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
   1090		rwf_t, flags)
   1091{
   1092	loff_t pos = pos_from_hilo(pos_h, pos_l);
   1093
   1094	if (pos == -1)
   1095		return do_writev(fd, vec, vlen, flags);
   1096
   1097	return do_pwritev(fd, vec, vlen, pos, flags);
   1098}
   1099
   1100/*
   1101 * Various compat syscalls.  Note that they all pretend to take a native
   1102 * iovec - import_iovec will properly treat those as compat_iovecs based on
   1103 * in_compat_syscall().
   1104 */
   1105#ifdef CONFIG_COMPAT
   1106#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
   1107COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
   1108		const struct iovec __user *, vec,
   1109		unsigned long, vlen, loff_t, pos)
   1110{
   1111	return do_preadv(fd, vec, vlen, pos, 0);
   1112}
   1113#endif
   1114
   1115COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
   1116		const struct iovec __user *, vec,
   1117		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
   1118{
   1119	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
   1120
   1121	return do_preadv(fd, vec, vlen, pos, 0);
   1122}
   1123
   1124#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
   1125COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
   1126		const struct iovec __user *, vec,
   1127		unsigned long, vlen, loff_t, pos, rwf_t, flags)
   1128{
   1129	if (pos == -1)
   1130		return do_readv(fd, vec, vlen, flags);
   1131	return do_preadv(fd, vec, vlen, pos, flags);
   1132}
   1133#endif
   1134
   1135COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
   1136		const struct iovec __user *, vec,
   1137		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
   1138		rwf_t, flags)
   1139{
   1140	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
   1141
   1142	if (pos == -1)
   1143		return do_readv(fd, vec, vlen, flags);
   1144	return do_preadv(fd, vec, vlen, pos, flags);
   1145}
   1146
   1147#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
   1148COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
   1149		const struct iovec __user *, vec,
   1150		unsigned long, vlen, loff_t, pos)
   1151{
   1152	return do_pwritev(fd, vec, vlen, pos, 0);
   1153}
   1154#endif
   1155
   1156COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
   1157		const struct iovec __user *,vec,
   1158		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
   1159{
   1160	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
   1161
   1162	return do_pwritev(fd, vec, vlen, pos, 0);
   1163}
   1164
   1165#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
   1166COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
   1167		const struct iovec __user *, vec,
   1168		unsigned long, vlen, loff_t, pos, rwf_t, flags)
   1169{
   1170	if (pos == -1)
   1171		return do_writev(fd, vec, vlen, flags);
   1172	return do_pwritev(fd, vec, vlen, pos, flags);
   1173}
   1174#endif
   1175
   1176COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
   1177		const struct iovec __user *,vec,
   1178		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
   1179{
   1180	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
   1181
   1182	if (pos == -1)
   1183		return do_writev(fd, vec, vlen, flags);
   1184	return do_pwritev(fd, vec, vlen, pos, flags);
   1185}
   1186#endif /* CONFIG_COMPAT */
   1187
/*
 * do_sendfile - common implementation behind the sendfile syscalls
 * @out_fd:	descriptor to write to; must be open with FMODE_WRITE
 * @in_fd:	descriptor to read from; must be open with FMODE_READ
 * @ppos:	optional explicit input offset.  When NULL the input file's
 *		f_pos is used and updated; otherwise *@ppos is used and
 *		updated, and the input file must have FMODE_PREAD.
 * @count:	number of bytes requested; clamped to MAX_RW_COUNT
 * @max:	highest allowed input offset; 0 means "use the smaller
 *		s_maxbytes of the two superblocks"
 *
 * Returns the value produced by the splice helper (bytes transferred when
 * positive) or a negative errno: -EBADF for a bad or wrongly-opened
 * descriptor, -ESPIPE when @ppos is given but the input lacks FMODE_PREAD,
 * -EOVERFLOW when the offset is at or beyond @max, or whatever
 * rw_verify_area() reports.
 */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
		  	   size_t count, loff_t max)
{
	struct fd in, out;
	struct inode *in_inode, *out_inode;
	struct pipe_inode_info *opipe;
	loff_t pos;
	loff_t out_pos;
	ssize_t retval;
	int fl;

	/*
	 * Get input file, and verify that it is ok.
	 * retval is pre-loaded with the error each following check reports.
	 */
	retval = -EBADF;
	in = fdget(in_fd);
	if (!in.file)
		goto out;
	if (!(in.file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -ESPIPE;
	if (!ppos) {
		pos = in.file->f_pos;
	} else {
		pos = *ppos;
		if (!(in.file->f_mode & FMODE_PREAD))
			goto fput_in;
	}
	retval = rw_verify_area(READ, in.file, &pos, count);
	if (retval < 0)
		goto fput_in;
	if (count > MAX_RW_COUNT)
		count =  MAX_RW_COUNT;

	/*
	 * Get output file, and verify that it is ok.
	 */
	retval = -EBADF;
	out = fdget(out_fd);
	if (!out.file)
		goto fput_in;
	if (!(out.file->f_mode & FMODE_WRITE))
		goto fput_out;
	in_inode = file_inode(in.file);
	out_inode = file_inode(out.file);
	/* The output always uses (and later updates) its own file position. */
	out_pos = out.file->f_pos;

	/* No explicit limit from the caller: derive it from both filesystems. */
	if (!max)
		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

	/*
	 * Request runs past @max: fail if the start is already at or beyond
	 * the limit, otherwise shorten the transfer so it ends at the limit.
	 */
	if (unlikely(pos + count > max)) {
		retval = -EOVERFLOW;
		if (pos >= max)
			goto fput_out;
		count = max - pos;
	}

	fl = 0;
#if 0
	/*
	 * We need to debate whether we can enable this or not. The
	 * man page documents EAGAIN return for the output at least,
	 * and the application is arguably buggy if it doesn't expect
	 * EAGAIN on a non-blocking file descriptor.
	 */
	if (in.file->f_flags & O_NONBLOCK)
		fl = SPLICE_F_NONBLOCK;
#endif
	/* A non-NULL opipe selects the file-to-pipe splice path below. */
	opipe = get_pipe_info(out.file, true);
	if (!opipe) {
		retval = rw_verify_area(WRITE, out.file, &out_pos, count);
		if (retval < 0)
			goto fput_out;
		file_start_write(out.file);
		retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
					  count, fl);
		file_end_write(out.file);
	} else {
		retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
	}

	/*
	 * On progress: account the I/O, send fsnotify events and publish the
	 * new offsets (input offset goes to *ppos or the file's f_pos).
	 */
	if (retval > 0) {
		add_rchar(current, retval);
		add_wchar(current, retval);
		fsnotify_access(in.file);
		fsnotify_modify(out.file);
		out.file->f_pos = out_pos;
		if (ppos)
			*ppos = pos;
		else
			in.file->f_pos = pos;
	}

	inc_syscr(current);
	inc_syscw(current);
	/* An input offset beyond the limit overrides the result. */
	if (pos > max)
		retval = -EOVERFLOW;

	/* Release the descriptors in reverse order of acquisition. */
fput_out:
	fdput(out);
fput_in:
	fdput(in);
out:
	return retval;
}
   1293
   1294SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
   1295{
   1296	loff_t pos;
   1297	off_t off;
   1298	ssize_t ret;
   1299
   1300	if (offset) {
   1301		if (unlikely(get_user(off, offset)))
   1302			return -EFAULT;
   1303		pos = off;
   1304		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
   1305		if (unlikely(put_user(pos, offset)))
   1306			return -EFAULT;
   1307		return ret;
   1308	}
   1309
   1310	return do_sendfile(out_fd, in_fd, NULL, count, 0);
   1311}
   1312
   1313SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
   1314{
   1315	loff_t pos;
   1316	ssize_t ret;
   1317
   1318	if (offset) {
   1319		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
   1320			return -EFAULT;
   1321		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
   1322		if (unlikely(put_user(pos, offset)))
   1323			return -EFAULT;
   1324		return ret;
   1325	}
   1326
   1327	return do_sendfile(out_fd, in_fd, NULL, count, 0);
   1328}
   1329
   1330#ifdef CONFIG_COMPAT
   1331COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
   1332		compat_off_t __user *, offset, compat_size_t, count)
   1333{
   1334	loff_t pos;
   1335	off_t off;
   1336	ssize_t ret;
   1337
   1338	if (offset) {
   1339		if (unlikely(get_user(off, offset)))
   1340			return -EFAULT;
   1341		pos = off;
   1342		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
   1343		if (unlikely(put_user(pos, offset)))
   1344			return -EFAULT;
   1345		return ret;
   1346	}
   1347
   1348	return do_sendfile(out_fd, in_fd, NULL, count, 0);
   1349}
   1350
   1351COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
   1352		compat_loff_t __user *, offset, compat_size_t, count)
   1353{
   1354	loff_t pos;
   1355	ssize_t ret;
   1356
   1357	if (offset) {
   1358		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
   1359			return -EFAULT;
   1360		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
   1361		if (unlikely(put_user(pos, offset)))
   1362			return -EFAULT;
   1363		return ret;
   1364	}
   1365
   1366	return do_sendfile(out_fd, in_fd, NULL, count, 0);
   1367}
   1368#endif
   1369
   1370/**
   1371 * generic_copy_file_range - copy data between two files
   1372 * @file_in:	file structure to read from
   1373 * @pos_in:	file offset to read from
   1374 * @file_out:	file structure to write data to
   1375 * @pos_out:	file offset to write data to
   1376 * @len:	amount of data to copy
   1377 * @flags:	copy flags
   1378 *
   1379 * This is a generic filesystem helper to copy data from one file to another.
   1380 * It has no constraints on the source or destination file owners - the files
   1381 * can belong to different superblocks and different filesystem types. Short
   1382 * copies are allowed.
   1383 *
   1384 * This should be called from the @file_out filesystem, as per the
   1385 * ->copy_file_range() method.
   1386 *
   1387 * Returns the number of bytes copied or a negative error indicating the
   1388 * failure.
   1389 */
   1390
   1391ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
   1392				struct file *file_out, loff_t pos_out,
   1393				size_t len, unsigned int flags)
   1394{
   1395	return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
   1396				len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
   1397}
   1398EXPORT_SYMBOL(generic_copy_file_range);
   1399
   1400/*
   1401 * Performs necessary checks before doing a file copy
   1402 *
   1403 * Can adjust amount of bytes to copy via @req_count argument.
   1404 * Returns appropriate error code that caller should return or
   1405 * zero in case the copy should be allowed.
   1406 */
   1407static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
   1408				    struct file *file_out, loff_t pos_out,
   1409				    size_t *req_count, unsigned int flags)
   1410{
   1411	struct inode *inode_in = file_inode(file_in);
   1412	struct inode *inode_out = file_inode(file_out);
   1413	uint64_t count = *req_count;
   1414	loff_t size_in;
   1415	int ret;
   1416
   1417	ret = generic_file_rw_checks(file_in, file_out);
   1418	if (ret)
   1419		return ret;
   1420
   1421	/*
   1422	 * We allow some filesystems to handle cross sb copy, but passing
   1423	 * a file of the wrong filesystem type to filesystem driver can result
   1424	 * in an attempt to dereference the wrong type of ->private_data, so
   1425	 * avoid doing that until we really have a good reason.
   1426	 *
   1427	 * nfs and cifs define several different file_system_type structures
   1428	 * and several different sets of file_operations, but they all end up
   1429	 * using the same ->copy_file_range() function pointer.
   1430	 */
   1431	if (file_out->f_op->copy_file_range) {
   1432		if (file_in->f_op->copy_file_range !=
   1433		    file_out->f_op->copy_file_range)
   1434			return -EXDEV;
   1435	} else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) {
   1436		return -EXDEV;
   1437	}
   1438
   1439	/* Don't touch certain kinds of inodes */
   1440	if (IS_IMMUTABLE(inode_out))
   1441		return -EPERM;
   1442
   1443	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
   1444		return -ETXTBSY;
   1445
   1446	/* Ensure offsets don't wrap. */
   1447	if (pos_in + count < pos_in || pos_out + count < pos_out)
   1448		return -EOVERFLOW;
   1449
   1450	/* Shorten the copy to EOF */
   1451	size_in = i_size_read(inode_in);
   1452	if (pos_in >= size_in)
   1453		count = 0;
   1454	else
   1455		count = min(count, size_in - (uint64_t)pos_in);
   1456
   1457	ret = generic_write_check_limits(file_out, pos_out, &count);
   1458	if (ret)
   1459		return ret;
   1460
   1461	/* Don't allow overlapped copying within the same file. */
   1462	if (inode_in == inode_out &&
   1463	    pos_out + count > pos_in &&
   1464	    pos_out < pos_in + count)
   1465		return -EINVAL;
   1466
   1467	*req_count = count;
   1468	return 0;
   1469}
   1470
/*
 * copy_file_range() differs from regular file read and write in that it
 * specifically allows return partial success.  When it does so is up to
 * the copy_file_range method.
 *
 * @file_in/@pos_in:	source file and offset
 * @file_out/@pos_out:	destination file and offset
 * @len:		requested byte count; may be shortened by
 *			generic_copy_file_checks()
 * @flags:		must be 0, anything else yields -EINVAL
 *
 * Returns the result of whichever copy method ran (bytes copied when
 * positive), 0 when nothing is left to copy after the checks, or a
 * negative errno from the checks or from the copy method.
 */
ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
			    struct file *file_out, loff_t pos_out,
			    size_t len, unsigned int flags)
{
	ssize_t ret;

	if (flags != 0)
		return -EINVAL;

	/* May reduce len; any non-zero result is returned to the caller. */
	ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
				       flags);
	if (unlikely(ret))
		return ret;

	ret = rw_verify_area(READ, file_in, &pos_in, len);
	if (unlikely(ret))
		return ret;

	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
	if (unlikely(ret))
		return ret;

	/* Nothing to do; returns before any accounting below is reached. */
	if (len == 0)
		return 0;

	/* Paired with file_end_write() after the done: label. */
	file_start_write(file_out);

	/*
	 * Cloning is supported by more file systems, so we implement copy on
	 * same sb using clone, but for filesystems where both clone and copy
	 * are supported (e.g. nfs,cifs), we only call the copy method.
	 */
	if (file_out->f_op->copy_file_range) {
		ret = file_out->f_op->copy_file_range(file_in, pos_in,
						      file_out, pos_out,
						      len, flags);
		goto done;
	}

	/*
	 * Same-sb clone attempt.  Only a positive result ends the operation;
	 * zero or an error falls through to the generic copy below.
	 */
	if (file_in->f_op->remap_file_range &&
	    file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
		ret = file_in->f_op->remap_file_range(file_in, pos_in,
				file_out, pos_out,
				min_t(loff_t, MAX_RW_COUNT, len),
				REMAP_FILE_CAN_SHORTEN);
		if (ret > 0)
			goto done;
	}

	/*
	 * We can get here for same sb copy of filesystems that do not implement
	 * ->copy_file_range() in case filesystem does not support clone or in
	 * case filesystem supports clone but rejected the clone request (e.g.
	 * because it was not block aligned).
	 *
	 * In both cases, fall back to kernel copy so we are able to maintain a
	 * consistent story about which filesystems support copy_file_range()
	 * and which filesystems do not, that will allow userspace tools to
	 * make consistent decisions w.r.t using copy_file_range().
	 */
	ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
				      flags);

done:
	/* Notify and charge I/O only when bytes were actually copied. */
	if (ret > 0) {
		fsnotify_access(file_in);
		add_rchar(current, ret);
		fsnotify_modify(file_out);
		add_wchar(current, ret);
	}

	/* The syscall counters are bumped regardless of the outcome. */
	inc_syscr(current);
	inc_syscw(current);

	file_end_write(file_out);

	return ret;
}
EXPORT_SYMBOL(vfs_copy_file_range);
   1555
   1556SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
   1557		int, fd_out, loff_t __user *, off_out,
   1558		size_t, len, unsigned int, flags)
   1559{
   1560	loff_t pos_in;
   1561	loff_t pos_out;
   1562	struct fd f_in;
   1563	struct fd f_out;
   1564	ssize_t ret = -EBADF;
   1565
   1566	f_in = fdget(fd_in);
   1567	if (!f_in.file)
   1568		goto out2;
   1569
   1570	f_out = fdget(fd_out);
   1571	if (!f_out.file)
   1572		goto out1;
   1573
   1574	ret = -EFAULT;
   1575	if (off_in) {
   1576		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
   1577			goto out;
   1578	} else {
   1579		pos_in = f_in.file->f_pos;
   1580	}
   1581
   1582	if (off_out) {
   1583		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
   1584			goto out;
   1585	} else {
   1586		pos_out = f_out.file->f_pos;
   1587	}
   1588
   1589	ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
   1590				  flags);
   1591	if (ret > 0) {
   1592		pos_in += ret;
   1593		pos_out += ret;
   1594
   1595		if (off_in) {
   1596			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
   1597				ret = -EFAULT;
   1598		} else {
   1599			f_in.file->f_pos = pos_in;
   1600		}
   1601
   1602		if (off_out) {
   1603			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
   1604				ret = -EFAULT;
   1605		} else {
   1606			f_out.file->f_pos = pos_out;
   1607		}
   1608	}
   1609
   1610out:
   1611	fdput(f_out);
   1612out1:
   1613	fdput(f_in);
   1614out2:
   1615	return ret;
   1616}
   1617
   1618/*
   1619 * Don't operate on ranges the page cache doesn't support, and don't exceed the
   1620 * LFS limits.  If pos is under the limit it becomes a short access.  If it
   1621 * exceeds the limit we return -EFBIG.
   1622 */
   1623int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
   1624{
   1625	struct inode *inode = file->f_mapping->host;
   1626	loff_t max_size = inode->i_sb->s_maxbytes;
   1627	loff_t limit = rlimit(RLIMIT_FSIZE);
   1628
   1629	if (limit != RLIM_INFINITY) {
   1630		if (pos >= limit) {
   1631			send_sig(SIGXFSZ, current, 0);
   1632			return -EFBIG;
   1633		}
   1634		*count = min(*count, limit - pos);
   1635	}
   1636
   1637	if (!(file->f_flags & O_LARGEFILE))
   1638		max_size = MAX_NON_LFS;
   1639
   1640	if (unlikely(pos >= max_size))
   1641		return -EFBIG;
   1642
   1643	*count = min(*count, max_size - pos);
   1644
   1645	return 0;
   1646}
   1647
   1648/* Like generic_write_checks(), but takes size of write instead of iter. */
   1649int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
   1650{
   1651	struct file *file = iocb->ki_filp;
   1652	struct inode *inode = file->f_mapping->host;
   1653
   1654	if (IS_SWAPFILE(inode))
   1655		return -ETXTBSY;
   1656
   1657	if (!*count)
   1658		return 0;
   1659
   1660	if (iocb->ki_flags & IOCB_APPEND)
   1661		iocb->ki_pos = i_size_read(inode);
   1662
   1663	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
   1664		return -EINVAL;
   1665
   1666	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
   1667}
   1668EXPORT_SYMBOL(generic_write_checks_count);
   1669
   1670/*
   1671 * Performs necessary checks before doing a write
   1672 *
   1673 * Can adjust writing position or amount of bytes to write.
   1674 * Returns appropriate error code that caller should return or
   1675 * zero in case that write should be allowed.
   1676 */
   1677ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
   1678{
   1679	loff_t count = iov_iter_count(from);
   1680	int ret;
   1681
   1682	ret = generic_write_checks_count(iocb, &count);
   1683	if (ret)
   1684		return ret;
   1685
   1686	iov_iter_truncate(from, count);
   1687	return iov_iter_count(from);
   1688}
   1689EXPORT_SYMBOL(generic_write_checks);
   1690
   1691/*
   1692 * Performs common checks before doing a file copy/clone
   1693 * from @file_in to @file_out.
   1694 */
   1695int generic_file_rw_checks(struct file *file_in, struct file *file_out)
   1696{
   1697	struct inode *inode_in = file_inode(file_in);
   1698	struct inode *inode_out = file_inode(file_out);
   1699
   1700	/* Don't copy dirs, pipes, sockets... */
   1701	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
   1702		return -EISDIR;
   1703	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
   1704		return -EINVAL;
   1705
   1706	if (!(file_in->f_mode & FMODE_READ) ||
   1707	    !(file_out->f_mode & FMODE_WRITE) ||
   1708	    (file_out->f_flags & O_APPEND))
   1709		return -EBADF;
   1710
   1711	return 0;
   1712}