cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

file.c (15067B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * (C) 2001 Clemson University and The University of Chicago
      4 * Copyright 2018 Omnibond Systems, L.L.C.
      5 *
      6 * See COPYING in top-level directory.
      7 */
      8
      9/*
     10 *  Linux VFS file operations.
     11 */
     12
     13#include "protocol.h"
     14#include "orangefs-kernel.h"
     15#include "orangefs-bufmap.h"
     16#include <linux/fs.h>
     17#include <linux/pagemap.h>
     18
     19static int flush_racache(struct inode *inode)
     20{
     21	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
     22	struct orangefs_kernel_op_s *new_op;
     23	int ret;
     24
     25	gossip_debug(GOSSIP_UTILS_DEBUG,
     26	    "%s: %pU: Handle is %pU | fs_id %d\n", __func__,
     27	    get_khandle_from_ino(inode), &orangefs_inode->refn.khandle,
     28	    orangefs_inode->refn.fs_id);
     29
     30	new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH);
     31	if (!new_op)
     32		return -ENOMEM;
     33	new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn;
     34
     35	ret = service_operation(new_op, "orangefs_flush_racache",
     36	    get_interruptible_flag(inode));
     37
     38	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n",
     39	    __func__, ret);
     40
     41	op_release(new_op);
     42	return ret;
     43}
     44
     45/*
     46 * Post and wait for the I/O upcall to finish
     47 */
     48ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
     49	loff_t *offset, struct iov_iter *iter, size_t total_size,
     50	loff_t readahead_size, struct orangefs_write_range *wr,
     51	int *index_return, struct file *file)
     52{
     53	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
     54	struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
     55	struct orangefs_kernel_op_s *new_op = NULL;
     56	int buffer_index;
     57	ssize_t ret;
     58	size_t copy_amount;
     59	int open_for_read;
     60	int open_for_write;
     61
     62	new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
     63	if (!new_op)
     64		return -ENOMEM;
     65
     66	/* synchronous I/O */
     67	new_op->upcall.req.io.readahead_size = readahead_size;
     68	new_op->upcall.req.io.io_type = type;
     69	new_op->upcall.req.io.refn = orangefs_inode->refn;
     70
     71populate_shared_memory:
     72	/* get a shared buffer index */
     73	buffer_index = orangefs_bufmap_get();
     74	if (buffer_index < 0) {
     75		ret = buffer_index;
     76		gossip_debug(GOSSIP_FILE_DEBUG,
     77			     "%s: orangefs_bufmap_get failure (%zd)\n",
     78			     __func__, ret);
     79		goto out;
     80	}
     81	gossip_debug(GOSSIP_FILE_DEBUG,
     82		     "%s(%pU): GET op %p -> buffer_index %d\n",
     83		     __func__,
     84		     handle,
     85		     new_op,
     86		     buffer_index);
     87
     88	new_op->uses_shared_memory = 1;
     89	new_op->upcall.req.io.buf_index = buffer_index;
     90	new_op->upcall.req.io.count = total_size;
     91	new_op->upcall.req.io.offset = *offset;
     92	if (type == ORANGEFS_IO_WRITE && wr) {
     93		new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid);
     94		new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid);
     95	}
     96	/*
     97	 * Orangefs has no open, and orangefs checks file permissions
     98	 * on each file access. Posix requires that file permissions
     99	 * be checked on open and nowhere else. Orangefs-through-the-kernel
    100	 * needs to seem posix compliant.
    101	 *
    102	 * The VFS opens files, even if the filesystem provides no
    103	 * method. We can see if a file was successfully opened for
    104	 * read and or for write by looking at file->f_mode.
    105	 *
    106	 * When writes are flowing from the page cache, file is no
    107	 * longer available. We can trust the VFS to have checked
    108	 * file->f_mode before writing to the page cache.
    109	 *
    110	 * The mode of a file might change between when it is opened
    111	 * and IO commences, or it might be created with an arbitrary mode.
    112	 *
    113	 * We'll make sure we don't hit EACCES during the IO stage by
    114	 * using UID 0. Some of the time we have access without changing
    115	 * to UID 0 - how to check?
    116	 */
    117	if (file) {
    118		open_for_write = file->f_mode & FMODE_WRITE;
    119		open_for_read = file->f_mode & FMODE_READ;
    120	} else {
    121		open_for_write = 1;
    122		open_for_read = 0; /* not relevant? */
    123	}
    124	if ((type == ORANGEFS_IO_WRITE) && open_for_write)
    125		new_op->upcall.uid = 0;
    126	if ((type == ORANGEFS_IO_READ) && open_for_read)
    127		new_op->upcall.uid = 0;
    128
    129	gossip_debug(GOSSIP_FILE_DEBUG,
    130		     "%s(%pU): offset: %llu total_size: %zd\n",
    131		     __func__,
    132		     handle,
    133		     llu(*offset),
    134		     total_size);
    135	/*
    136	 * Stage 1: copy the buffers into client-core's address space
    137	 */
    138	if (type == ORANGEFS_IO_WRITE && total_size) {
    139		ret = orangefs_bufmap_copy_from_iovec(iter, buffer_index,
    140		    total_size);
    141		if (ret < 0) {
    142			gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
    143			    __func__, (long)ret);
    144			goto out;
    145		}
    146	}
    147
    148	gossip_debug(GOSSIP_FILE_DEBUG,
    149		     "%s(%pU): Calling post_io_request with tag (%llu)\n",
    150		     __func__,
    151		     handle,
    152		     llu(new_op->tag));
    153
    154	/* Stage 2: Service the I/O operation */
    155	ret = service_operation(new_op,
    156				type == ORANGEFS_IO_WRITE ?
    157					"file_write" :
    158					"file_read",
    159				get_interruptible_flag(inode));
    160
    161	/*
    162	 * If service_operation() returns -EAGAIN #and# the operation was
    163	 * purged from orangefs_request_list or htable_ops_in_progress, then
    164	 * we know that the client was restarted, causing the shared memory
    165	 * area to be wiped clean.  To restart a  write operation in this
    166	 * case, we must re-copy the data from the user's iovec to a NEW
    167	 * shared memory location. To restart a read operation, we must get
    168	 * a new shared memory location.
    169	 */
    170	if (ret == -EAGAIN && op_state_purged(new_op)) {
    171		orangefs_bufmap_put(buffer_index);
    172		if (type == ORANGEFS_IO_WRITE)
    173			iov_iter_revert(iter, total_size);
    174		gossip_debug(GOSSIP_FILE_DEBUG,
    175			     "%s:going to repopulate_shared_memory.\n",
    176			     __func__);
    177		goto populate_shared_memory;
    178	}
    179
    180	if (ret < 0) {
    181		if (ret == -EINTR) {
    182			/*
    183			 * We can't return EINTR if any data was written,
    184			 * it's not POSIX. It is minimally acceptable
    185			 * to give a partial write, the way NFS does.
    186			 *
    187			 * It would be optimal to return all or nothing,
    188			 * but if a userspace write is bigger than
    189			 * an IO buffer, and the interrupt occurs
    190			 * between buffer writes, that would not be
    191			 * possible.
    192			 */
    193			switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) {
    194			/*
    195			 * If the op was waiting when the interrupt
    196			 * occurred, then the client-core did not
    197			 * trigger the write.
    198			 */
    199			case OP_VFS_STATE_WAITING:
    200				if (*offset == 0)
    201					ret = -EINTR;
    202				else
    203					ret = 0;
    204				break;
    205			/*
    206			 * If the op was in progress when the interrupt
    207			 * occurred, then the client-core was able to
    208			 * trigger the write.
    209			 */
    210			case OP_VFS_STATE_INPROGR:
    211				if (type == ORANGEFS_IO_READ)
    212					ret = -EINTR;
    213				else
    214					ret = total_size;
    215				break;
    216			default:
    217				gossip_err("%s: unexpected op state :%d:.\n",
    218					   __func__,
    219					   new_op->op_state);
    220				ret = 0;
    221				break;
    222			}
    223			gossip_debug(GOSSIP_FILE_DEBUG,
    224				     "%s: got EINTR, state:%d: %p\n",
    225				     __func__,
    226				     new_op->op_state,
    227				     new_op);
    228		} else {
    229			gossip_err("%s: error in %s handle %pU, returning %zd\n",
    230				__func__,
    231				type == ORANGEFS_IO_READ ?
    232					"read from" : "write to",
    233				handle, ret);
    234		}
    235		if (orangefs_cancel_op_in_progress(new_op))
    236			return ret;
    237
    238		goto out;
    239	}
    240
    241	/*
    242	 * Stage 3: Post copy buffers from client-core's address space
    243	 */
    244	if (type == ORANGEFS_IO_READ && new_op->downcall.resp.io.amt_complete) {
    245		/*
    246		 * NOTE: the iovector can either contain addresses which
    247		 *       can futher be kernel-space or user-space addresses.
    248		 *       or it can pointers to struct page's
    249		 */
    250
    251		copy_amount = new_op->downcall.resp.io.amt_complete;
    252
    253		ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index,
    254			copy_amount);
    255		if (ret < 0) {
    256			gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
    257			    __func__, (long)ret);
    258			goto out;
    259		}
    260	}
    261	gossip_debug(GOSSIP_FILE_DEBUG,
    262	    "%s(%pU): Amount %s, returned by the sys-io call:%d\n",
    263	    __func__,
    264	    handle,
    265	    type == ORANGEFS_IO_READ ?  "read" : "written",
    266	    (int)new_op->downcall.resp.io.amt_complete);
    267
    268	ret = new_op->downcall.resp.io.amt_complete;
    269
    270out:
    271	if (buffer_index >= 0) {
    272		orangefs_bufmap_put(buffer_index);
    273		gossip_debug(GOSSIP_FILE_DEBUG,
    274			"%s(%pU): PUT buffer_index %d\n",
    275			__func__, handle, buffer_index);
    276		buffer_index = -1;
    277	}
    278	op_release(new_op);
    279	return ret;
    280}
    281
    282int orangefs_revalidate_mapping(struct inode *inode)
    283{
    284	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
    285	struct address_space *mapping = inode->i_mapping;
    286	unsigned long *bitlock = &orangefs_inode->bitlock;
    287	int ret;
    288
    289	while (1) {
    290		ret = wait_on_bit(bitlock, 1, TASK_KILLABLE);
    291		if (ret)
    292			return ret;
    293		spin_lock(&inode->i_lock);
    294		if (test_bit(1, bitlock)) {
    295			spin_unlock(&inode->i_lock);
    296			continue;
    297		}
    298		if (!time_before(jiffies, orangefs_inode->mapping_time))
    299			break;
    300		spin_unlock(&inode->i_lock);
    301		return 0;
    302	}
    303
    304	set_bit(1, bitlock);
    305	smp_wmb();
    306	spin_unlock(&inode->i_lock);
    307
    308	unmap_mapping_range(mapping, 0, 0, 0);
    309	ret = filemap_write_and_wait(mapping);
    310	if (!ret)
    311		ret = invalidate_inode_pages2(mapping);
    312
    313	orangefs_inode->mapping_time = jiffies +
    314	    orangefs_cache_timeout_msecs*HZ/1000;
    315
    316	clear_bit(1, bitlock);
    317	smp_mb__after_atomic();
    318	wake_up_bit(bitlock, 1);
    319
    320	return ret;
    321}
    322
    323static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
    324    struct iov_iter *iter)
    325{
    326	int ret;
    327	orangefs_stats.reads++;
    328
    329	down_read(&file_inode(iocb->ki_filp)->i_rwsem);
    330	ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
    331	if (ret)
    332		goto out;
    333
    334	ret = generic_file_read_iter(iocb, iter);
    335out:
    336	up_read(&file_inode(iocb->ki_filp)->i_rwsem);
    337	return ret;
    338}
    339
    340static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
    341    struct iov_iter *iter)
    342{
    343	int ret;
    344	orangefs_stats.writes++;
    345
    346	if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) {
    347		ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
    348		if (ret)
    349			return ret;
    350	}
    351
    352	ret = generic_file_write_iter(iocb, iter);
    353	return ret;
    354}
    355
    356static vm_fault_t orangefs_fault(struct vm_fault *vmf)
    357{
    358	struct file *file = vmf->vma->vm_file;
    359	int ret;
    360	ret = orangefs_inode_getattr(file->f_mapping->host,
    361	    ORANGEFS_GETATTR_SIZE);
    362	if (ret == -ESTALE)
    363		ret = -EIO;
    364	if (ret) {
    365		gossip_err("%s: orangefs_inode_getattr failed, "
    366		    "ret:%d:.\n", __func__, ret);
    367		return VM_FAULT_SIGBUS;
    368	}
    369	return filemap_fault(vmf);
    370}
    371
    372static const struct vm_operations_struct orangefs_file_vm_ops = {
    373	.fault = orangefs_fault,
    374	.map_pages = filemap_map_pages,
    375	.page_mkwrite = orangefs_page_mkwrite,
    376};
    377
    378/*
    379 * Memory map a region of a file.
    380 */
    381static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
    382{
    383	int ret;
    384
    385	ret = orangefs_revalidate_mapping(file_inode(file));
    386	if (ret)
    387		return ret;
    388
    389	gossip_debug(GOSSIP_FILE_DEBUG,
    390		     "orangefs_file_mmap: called on %pD\n", file);
    391
    392	/* set the sequential readahead hint */
    393	vma->vm_flags |= VM_SEQ_READ;
    394	vma->vm_flags &= ~VM_RAND_READ;
    395
    396	file_accessed(file);
    397	vma->vm_ops = &orangefs_file_vm_ops;
    398	return 0;
    399}
    400
    401#define mapping_nrpages(idata) ((idata)->nrpages)
    402
    403/*
    404 * Called to notify the module that there are no more references to
    405 * this file (i.e. no processes have it open).
    406 *
    407 * \note Not called when each file is closed.
    408 */
    409static int orangefs_file_release(struct inode *inode, struct file *file)
    410{
    411	gossip_debug(GOSSIP_FILE_DEBUG,
    412		     "orangefs_file_release: called on %pD\n",
    413		     file);
    414
    415	/*
    416	 * remove all associated inode pages from the page cache and
    417	 * readahead cache (if any); this forces an expensive refresh of
    418	 * data for the next caller of mmap (or 'get_block' accesses)
    419	 */
    420	if (file_inode(file) &&
    421	    file_inode(file)->i_mapping &&
    422	    mapping_nrpages(&file_inode(file)->i_data)) {
    423		if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) {
    424			gossip_debug(GOSSIP_INODE_DEBUG,
    425			    "calling flush_racache on %pU\n",
    426			    get_khandle_from_ino(inode));
    427			flush_racache(inode);
    428			gossip_debug(GOSSIP_INODE_DEBUG,
    429			    "flush_racache finished\n");
    430		}
    431
    432	}
    433	return 0;
    434}
    435
    436/*
    437 * Push all data for a specific file onto permanent storage.
    438 */
    439static int orangefs_fsync(struct file *file,
    440		       loff_t start,
    441		       loff_t end,
    442		       int datasync)
    443{
    444	int ret;
    445	struct orangefs_inode_s *orangefs_inode =
    446		ORANGEFS_I(file_inode(file));
    447	struct orangefs_kernel_op_s *new_op = NULL;
    448
    449	ret = filemap_write_and_wait_range(file_inode(file)->i_mapping,
    450	    start, end);
    451	if (ret < 0)
    452		return ret;
    453
    454	new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC);
    455	if (!new_op)
    456		return -ENOMEM;
    457	new_op->upcall.req.fsync.refn = orangefs_inode->refn;
    458
    459	ret = service_operation(new_op,
    460			"orangefs_fsync",
    461			get_interruptible_flag(file_inode(file)));
    462
    463	gossip_debug(GOSSIP_FILE_DEBUG,
    464		     "orangefs_fsync got return value of %d\n",
    465		     ret);
    466
    467	op_release(new_op);
    468	return ret;
    469}
    470
    471/*
    472 * Change the file pointer position for an instance of an open file.
    473 *
    474 * \note If .llseek is overriden, we must acquire lock as described in
    475 *       Documentation/filesystems/locking.rst.
    476 *
    477 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
    478 * require much changes to the FS
    479 */
    480static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
    481{
    482	int ret = -EINVAL;
    483	struct inode *inode = file_inode(file);
    484
    485	if (origin == SEEK_END) {
    486		/*
    487		 * revalidate the inode's file size.
    488		 * NOTE: We are only interested in file size here,
    489		 * so we set mask accordingly.
    490		 */
    491		ret = orangefs_inode_getattr(file->f_mapping->host,
    492		    ORANGEFS_GETATTR_SIZE);
    493		if (ret == -ESTALE)
    494			ret = -EIO;
    495		if (ret) {
    496			gossip_debug(GOSSIP_FILE_DEBUG,
    497				     "%s:%s:%d calling make bad inode\n",
    498				     __FILE__,
    499				     __func__,
    500				     __LINE__);
    501			return ret;
    502		}
    503	}
    504
    505	gossip_debug(GOSSIP_FILE_DEBUG,
    506		     "orangefs_file_llseek: offset is %ld | origin is %d"
    507		     " | inode size is %lu\n",
    508		     (long)offset,
    509		     origin,
    510		     (unsigned long)i_size_read(inode));
    511
    512	return generic_file_llseek(file, offset, origin);
    513}
    514
    515/*
    516 * Support local locks (locks that only this kernel knows about)
    517 * if Orangefs was mounted -o local_lock.
    518 */
    519static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
    520{
    521	int rc = -EINVAL;
    522
    523	if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) {
    524		if (cmd == F_GETLK) {
    525			rc = 0;
    526			posix_test_lock(filp, fl);
    527		} else {
    528			rc = posix_lock_file(filp, fl, NULL);
    529		}
    530	}
    531
    532	return rc;
    533}
    534
    535static int orangefs_flush(struct file *file, fl_owner_t id)
    536{
    537	/*
    538	 * This is vfs_fsync_range(file, 0, LLONG_MAX, 0) without the
    539	 * service_operation in orangefs_fsync.
    540	 *
    541	 * Do not send fsync to OrangeFS server on a close.  Do send fsync
    542	 * on an explicit fsync call.  This duplicates historical OrangeFS
    543	 * behavior.
    544	 */
    545	int r;
    546
    547	r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX);
    548	if (r > 0)
    549		return 0;
    550	else
    551		return r;
    552}
    553
    554/** ORANGEFS implementation of VFS file operations */
    555const struct file_operations orangefs_file_operations = {
    556	.llseek		= orangefs_file_llseek,
    557	.read_iter	= orangefs_file_read_iter,
    558	.write_iter	= orangefs_file_write_iter,
    559	.lock		= orangefs_lock,
    560	.mmap		= orangefs_file_mmap,
    561	.open		= generic_file_open,
    562	.splice_read    = generic_file_splice_read,
    563	.splice_write   = iter_file_splice_write,
    564	.flush		= orangefs_flush,
    565	.release	= orangefs_file_release,
    566	.fsync		= orangefs_fsync,
    567};