cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

file.c (60945B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
      4 *
      5 * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
      6 */
      7
      8#include <linux/blkdev.h>
      9#include <linux/backing-dev.h>
     10#include <linux/buffer_head.h>
     11#include <linux/gfp.h>
     12#include <linux/pagemap.h>
     13#include <linux/pagevec.h>
     14#include <linux/sched/signal.h>
     15#include <linux/swap.h>
     16#include <linux/uio.h>
     17#include <linux/writeback.h>
     18
     19#include <asm/page.h>
     20#include <linux/uaccess.h>
     21
     22#include "attrib.h"
     23#include "bitmap.h"
     24#include "inode.h"
     25#include "debug.h"
     26#include "lcnalloc.h"
     27#include "malloc.h"
     28#include "mft.h"
     29#include "ntfs.h"
     30
     31/**
     32 * ntfs_file_open - called when an inode is about to be opened
     33 * @vi:		inode to be opened
     34 * @filp:	file structure describing the inode
     35 *
     36 * Limit file size to the page cache limit on architectures where unsigned long
     37 * is 32-bits. This is the most we can do for now without overflowing the page
     38 * cache page index. Doing it this way means we don't run into problems with
     39 * existing files that are too large. It would be better to allow the user to read
     40 * the beginning of the file but I doubt very much anyone is going to hit this
     41 * check on a 32-bit architecture, so there is no point in adding the extra
     42 * complexity required to support this.
     43 *
     44 * On 64-bit architectures, the check is hopefully optimized away by the
     45 * compiler.
     46 *
     47 * After the check passes, just call generic_file_open() to do its work.
     48 */
     49static int ntfs_file_open(struct inode *vi, struct file *filp)
     50{
     51	if (sizeof(unsigned long) < 8) {
     52		if (i_size_read(vi) > MAX_LFS_FILESIZE)
     53			return -EOVERFLOW;
     54	}
     55	return generic_file_open(vi, filp);
     56}
     57
     58#ifdef NTFS_RW
     59
     60/**
     61 * ntfs_attr_extend_initialized - extend the initialized size of an attribute
     62 * @ni:			ntfs inode of the attribute to extend
     63 * @new_init_size:	requested new initialized size in bytes
     64 *
     65 * Extend the initialized size of an attribute described by the ntfs inode @ni
     66 * to @new_init_size bytes.  This involves zeroing any non-sparse space between
     67 * the old initialized size and @new_init_size both in the page cache and on
     68 * disk (if relevant complete pages are already uptodate in the page cache then
     69 * these are simply marked dirty).
     70 *
     71 * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
     72 * in the resident attribute case, it is tied to the initialized size and, in
     73 * the non-resident attribute case, it may not fall below the initialized size.
     74 *
     75 * Note that if the attribute is resident, we do not need to touch the page
     76 * cache at all.  This is because if the page cache page is not uptodate we
     77 * bring it uptodate later, when doing the write to the mft record since we
     78 * then already have the page mapped.  And if the page is uptodate, the
     79 * non-initialized region will already have been zeroed when the page was
     80 * brought uptodate and the region may in fact already have been overwritten
     81 * with new data via mmap() based writes, so we cannot just zero it.  And since
     82 * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
     83 * is unspecified, we choose not to do zeroing and thus we do not need to touch
     84 * the page at all.  For a more detailed explanation see ntfs_truncate() in
     85 * fs/ntfs/inode.c.
     86 *
     87 * Return 0 on success and -errno on error.  In the case that an error is
     88 * encountered it is possible that the initialized size will already have been
     89 * incremented some way towards @new_init_size but it is guaranteed that if
     90 * this is the case, the necessary zeroing will also have happened and that all
     91 * metadata is self-consistent.
     92 *
     93 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be
     94 *	    held by the caller.
     95 */
     96static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
     97{
     98	s64 old_init_size;
     99	loff_t old_i_size;
    100	pgoff_t index, end_index;
    101	unsigned long flags;
    102	struct inode *vi = VFS_I(ni);
    103	ntfs_inode *base_ni;
    104	MFT_RECORD *m = NULL;
    105	ATTR_RECORD *a;
    106	ntfs_attr_search_ctx *ctx = NULL;
    107	struct address_space *mapping;
    108	struct page *page = NULL;
    109	u8 *kattr;
    110	int err;
    111	u32 attr_len;
    112
    113	read_lock_irqsave(&ni->size_lock, flags);
    114	old_init_size = ni->initialized_size;
    115	old_i_size = i_size_read(vi);
    116	BUG_ON(new_init_size > ni->allocated_size);
    117	read_unlock_irqrestore(&ni->size_lock, flags);
    118	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
    119			"old_initialized_size 0x%llx, "
    120			"new_initialized_size 0x%llx, i_size 0x%llx.",
    121			vi->i_ino, (unsigned)le32_to_cpu(ni->type),
    122			(unsigned long long)old_init_size,
    123			(unsigned long long)new_init_size, old_i_size);
    124	if (!NInoAttr(ni))
    125		base_ni = ni;
    126	else
    127		base_ni = ni->ext.base_ntfs_ino;
    128	/* Use goto to reduce indentation and we need the label below anyway. */
    129	if (NInoNonResident(ni))
    130		goto do_non_resident_extend;
    131	BUG_ON(old_init_size != old_i_size);
    132	m = map_mft_record(base_ni);
    133	if (IS_ERR(m)) {
    134		err = PTR_ERR(m);
    135		m = NULL;
    136		goto err_out;
    137	}
    138	ctx = ntfs_attr_get_search_ctx(base_ni, m);
    139	if (unlikely(!ctx)) {
    140		err = -ENOMEM;
    141		goto err_out;
    142	}
    143	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
    144			CASE_SENSITIVE, 0, NULL, 0, ctx);
    145	if (unlikely(err)) {
    146		if (err == -ENOENT)
    147			err = -EIO;
    148		goto err_out;
    149	}
    150	m = ctx->mrec;
    151	a = ctx->attr;
    152	BUG_ON(a->non_resident);
    153	/* The total length of the attribute value. */
    154	attr_len = le32_to_cpu(a->data.resident.value_length);
    155	BUG_ON(old_i_size != (loff_t)attr_len);
    156	/*
    157	 * Do the zeroing in the mft record and update the attribute size in
    158	 * the mft record.
    159	 */
    160	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
    161	memset(kattr + attr_len, 0, new_init_size - attr_len);
    162	a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
    163	/* Finally, update the sizes in the vfs and ntfs inodes. */
    164	write_lock_irqsave(&ni->size_lock, flags);
    165	i_size_write(vi, new_init_size);
    166	ni->initialized_size = new_init_size;
    167	write_unlock_irqrestore(&ni->size_lock, flags);
    168	goto done;
    169do_non_resident_extend:
    170	/*
    171	 * If the new initialized size @new_init_size exceeds the current file
    172	 * size (vfs inode->i_size), we need to extend the file size to the
    173	 * new initialized size.
    174	 */
    175	if (new_init_size > old_i_size) {
    176		m = map_mft_record(base_ni);
    177		if (IS_ERR(m)) {
    178			err = PTR_ERR(m);
    179			m = NULL;
    180			goto err_out;
    181		}
    182		ctx = ntfs_attr_get_search_ctx(base_ni, m);
    183		if (unlikely(!ctx)) {
    184			err = -ENOMEM;
    185			goto err_out;
    186		}
    187		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
    188				CASE_SENSITIVE, 0, NULL, 0, ctx);
    189		if (unlikely(err)) {
    190			if (err == -ENOENT)
    191				err = -EIO;
    192			goto err_out;
    193		}
    194		m = ctx->mrec;
    195		a = ctx->attr;
    196		BUG_ON(!a->non_resident);
    197		BUG_ON(old_i_size != (loff_t)
    198				sle64_to_cpu(a->data.non_resident.data_size));
    199		a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
    200		flush_dcache_mft_record_page(ctx->ntfs_ino);
    201		mark_mft_record_dirty(ctx->ntfs_ino);
    202		/* Update the file size in the vfs inode. */
    203		i_size_write(vi, new_init_size);
    204		ntfs_attr_put_search_ctx(ctx);
    205		ctx = NULL;
    206		unmap_mft_record(base_ni);
    207		m = NULL;
    208	}
    209	mapping = vi->i_mapping;
    210	index = old_init_size >> PAGE_SHIFT;
    211	end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
    212	do {
    213		/*
    214		 * Read the page.  If the page is not present, this will zero
    215		 * the uninitialized regions for us.
    216		 */
    217		page = read_mapping_page(mapping, index, NULL);
    218		if (IS_ERR(page)) {
    219			err = PTR_ERR(page);
    220			goto init_err_out;
    221		}
    222		if (unlikely(PageError(page))) {
    223			put_page(page);
    224			err = -EIO;
    225			goto init_err_out;
    226		}
    227		/*
    228		 * Update the initialized size in the ntfs inode.  This is
    229		 * enough to make ntfs_writepage() work.
    230		 */
    231		write_lock_irqsave(&ni->size_lock, flags);
    232		ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT;
    233		if (ni->initialized_size > new_init_size)
    234			ni->initialized_size = new_init_size;
    235		write_unlock_irqrestore(&ni->size_lock, flags);
    236		/* Set the page dirty so it gets written out. */
    237		set_page_dirty(page);
    238		put_page(page);
    239		/*
    240		 * Play nice with the vm and the rest of the system.  This is
    241		 * very much needed as we can potentially be modifying the
    242		 * initialised size from a very small value to a really huge
    243		 * value, e.g.
    244		 *	f = open(somefile, O_TRUNC);
    245		 *	truncate(f, 10GiB);
    246		 *	seek(f, 10GiB);
    247		 *	write(f, 1);
    248		 * And this would mean we would be marking dirty hundreds of
    249		 * thousands of pages or as in the above example more than
    250		 * two and a half million pages!
    251		 *
    252		 * TODO: For sparse pages we could optimize this workload by using
    253		 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit.  This
    254		 * would be set in read_folio for sparse pages and here we would
    255		 * not need to mark dirty any pages which have this bit set.
    256		 * The only caveat is that we have to clear the bit everywhere
    257		 * where we allocate any clusters that lie in the page or that
    258		 * contain the page.
    259		 *
    260		 * TODO: An even greater optimization would be for us to only
    261		 * call read_folio() on pages which are not in sparse regions as
    262		 * determined from the runlist.  This would greatly reduce the
    263		 * number of pages we read and make dirty in the case of sparse
    264		 * files.
    265		 */
    266		balance_dirty_pages_ratelimited(mapping);
    267		cond_resched();
    268	} while (++index < end_index);
    269	read_lock_irqsave(&ni->size_lock, flags);
    270	BUG_ON(ni->initialized_size != new_init_size);
    271	read_unlock_irqrestore(&ni->size_lock, flags);
    272	/* Now bring in sync the initialized_size in the mft record. */
    273	m = map_mft_record(base_ni);
    274	if (IS_ERR(m)) {
    275		err = PTR_ERR(m);
    276		m = NULL;
    277		goto init_err_out;
    278	}
    279	ctx = ntfs_attr_get_search_ctx(base_ni, m);
    280	if (unlikely(!ctx)) {
    281		err = -ENOMEM;
    282		goto init_err_out;
    283	}
    284	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
    285			CASE_SENSITIVE, 0, NULL, 0, ctx);
    286	if (unlikely(err)) {
    287		if (err == -ENOENT)
    288			err = -EIO;
    289		goto init_err_out;
    290	}
    291	m = ctx->mrec;
    292	a = ctx->attr;
    293	BUG_ON(!a->non_resident);
    294	a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
    295done:
    296	flush_dcache_mft_record_page(ctx->ntfs_ino);
    297	mark_mft_record_dirty(ctx->ntfs_ino);
    298	if (ctx)
    299		ntfs_attr_put_search_ctx(ctx);
    300	if (m)
    301		unmap_mft_record(base_ni);
    302	ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
    303			(unsigned long long)new_init_size, i_size_read(vi));
    304	return 0;
    305init_err_out:
    306	write_lock_irqsave(&ni->size_lock, flags);
    307	ni->initialized_size = old_init_size;
    308	write_unlock_irqrestore(&ni->size_lock, flags);
    309err_out:
    310	if (ctx)
    311		ntfs_attr_put_search_ctx(ctx);
    312	if (m)
    313		unmap_mft_record(base_ni);
    314	ntfs_debug("Failed.  Returning error code %i.", err);
    315	return err;
    316}
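
/*
 * Editorial illustration, not part of the original file: a minimal sketch of
 * the page-index arithmetic used by the non-resident path above.  The loop
 * reads and dirties every page from the one containing the old initialized
 * size up to, but not including, the index obtained by rounding the new
 * initialized size up to a page boundary.  The helper name is hypothetical.
 *
 * Example, assuming PAGE_SIZE == 4096 (PAGE_SHIFT == 12): old_init_size ==
 * 6000 and new_init_size == 20000 give index == 1 and end_index == 5, so
 * pages 1 through 4 are read, possibly zeroed, and marked dirty.
 */
static inline void ntfs_example_init_extend_range(s64 old_init_size,
		s64 new_init_size, pgoff_t *index, pgoff_t *end_index)
{
	/* First page still containing uninitialized bytes. */
	*index = old_init_size >> PAGE_SHIFT;
	/* One past the last page touched by the new initialized size. */
	*end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
}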
    317
    318static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb,
    319		struct iov_iter *from)
    320{
    321	loff_t pos;
    322	s64 end, ll;
    323	ssize_t err;
    324	unsigned long flags;
    325	struct file *file = iocb->ki_filp;
    326	struct inode *vi = file_inode(file);
    327	ntfs_inode *ni = NTFS_I(vi);
    328	ntfs_volume *vol = ni->vol;
    329
    330	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
    331			"0x%llx, count 0x%zx.", vi->i_ino,
    332			(unsigned)le32_to_cpu(ni->type),
    333			(unsigned long long)iocb->ki_pos,
    334			iov_iter_count(from));
    335	err = generic_write_checks(iocb, from);
    336	if (unlikely(err <= 0))
    337		goto out;
    338	/*
    339	 * All checks have passed.  Before we start doing any writing we want
    340	 * to abort any totally illegal writes.
    341	 */
    342	BUG_ON(NInoMstProtected(ni));
    343	BUG_ON(ni->type != AT_DATA);
    344	/* If file is encrypted, deny access, just like NT4. */
    345	if (NInoEncrypted(ni)) {
    346		/* Only $DATA attributes can be encrypted. */
    347		/*
    348		 * Reminder for later: Encrypted files are _always_
    349		 * non-resident so that the content can always be encrypted.
    350		 */
    351		ntfs_debug("Denying write access to encrypted file.");
    352		err = -EACCES;
    353		goto out;
    354	}
    355	if (NInoCompressed(ni)) {
    356		/* Only unnamed $DATA attribute can be compressed. */
    357		BUG_ON(ni->name_len);
    358		/*
    359		 * Reminder for later: If resident, the data is not actually
    360		 * compressed.  Only on the switch to non-resident does
    361		 * compression kick in.  This is in contrast to encrypted files
    362		 * (see above).
    363		 */
    364		ntfs_error(vi->i_sb, "Writing to compressed files is not "
    365				"implemented yet.  Sorry.");
    366		err = -EOPNOTSUPP;
    367		goto out;
    368	}
    369	err = file_remove_privs(file);
    370	if (unlikely(err))
    371		goto out;
    372	/*
    373	 * Our ->update_time method always succeeds thus file_update_time()
    374	 * cannot fail either so there is no need to check the return code.
    375	 */
    376	file_update_time(file);
    377	pos = iocb->ki_pos;
    378	/* The first byte after the last cluster being written to. */
    379	end = (pos + iov_iter_count(from) + vol->cluster_size_mask) &
    380			~(u64)vol->cluster_size_mask;
    381	/*
    382	 * If the write goes beyond the allocated size, extend the allocation
    383	 * to cover the whole of the write, rounded up to the nearest cluster.
    384	 */
    385	read_lock_irqsave(&ni->size_lock, flags);
    386	ll = ni->allocated_size;
    387	read_unlock_irqrestore(&ni->size_lock, flags);
    388	if (end > ll) {
    389		/*
    390		 * Extend the allocation without changing the data size.
    391		 *
    392		 * Note we ensure the allocation is big enough to at least
    393		 * write some data but we do not require the allocation to be
    394		 * complete, i.e. it may be partial.
    395		 */
    396		ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
    397		if (likely(ll >= 0)) {
    398			BUG_ON(pos >= ll);
    399			/* If the extension was partial truncate the write. */
    400			if (end > ll) {
    401				ntfs_debug("Truncating write to inode 0x%lx, "
    402						"attribute type 0x%x, because "
    403						"the allocation was only "
    404						"partially extended.",
    405						vi->i_ino, (unsigned)
    406						le32_to_cpu(ni->type));
    407				iov_iter_truncate(from, ll - pos);
    408			}
    409		} else {
    410			err = ll;
    411			read_lock_irqsave(&ni->size_lock, flags);
    412			ll = ni->allocated_size;
    413			read_unlock_irqrestore(&ni->size_lock, flags);
    414			/* Perform a partial write if possible or fail. */
    415			if (pos < ll) {
    416				ntfs_debug("Truncating write to inode 0x%lx "
    417						"attribute type 0x%x, because "
    418						"extending the allocation "
    419						"failed (error %d).",
    420						vi->i_ino, (unsigned)
    421						le32_to_cpu(ni->type),
    422						(int)-err);
    423				iov_iter_truncate(from, ll - pos);
    424			} else {
    425				if (err != -ENOSPC)
    426					ntfs_error(vi->i_sb, "Cannot perform "
    427							"write to inode "
    428							"0x%lx, attribute "
    429							"type 0x%x, because "
    430							"extending the "
    431							"allocation failed "
    432							"(error %ld).",
    433							vi->i_ino, (unsigned)
    434							le32_to_cpu(ni->type),
    435							(long)-err);
    436				else
    437					ntfs_debug("Cannot perform write to "
    438							"inode 0x%lx, "
    439							"attribute type 0x%x, "
    440							"because there is not "
    441							"space left.",
    442							vi->i_ino, (unsigned)
    443							le32_to_cpu(ni->type));
    444				goto out;
    445			}
    446		}
    447	}
    448	/*
    449	 * If the write starts beyond the initialized size, extend it up to the
    450	 * beginning of the write and initialize all non-sparse space between
    451	 * the old initialized size and the new one.  This automatically also
    452	 * increments the vfs inode->i_size to keep it above or equal to the
    453	 * initialized_size.
    454	 */
    455	read_lock_irqsave(&ni->size_lock, flags);
    456	ll = ni->initialized_size;
    457	read_unlock_irqrestore(&ni->size_lock, flags);
    458	if (pos > ll) {
    459		/*
    460		 * Wait for ongoing direct i/o to complete before proceeding.
    461		 * New direct i/o cannot start as we hold i_mutex.
    462		 */
    463		inode_dio_wait(vi);
    464		err = ntfs_attr_extend_initialized(ni, pos);
    465		if (unlikely(err < 0))
    466			ntfs_error(vi->i_sb, "Cannot perform write to inode "
    467					"0x%lx, attribute type 0x%x, because "
    468					"extending the initialized size "
    469					"failed (error %d).", vi->i_ino,
    470					(unsigned)le32_to_cpu(ni->type),
    471					(int)-err);
    472	}
    473out:
    474	return err;
    475}
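
/*
 * Editorial illustration, not part of the original file: the cluster round-up
 * performed in ntfs_prepare_file_for_write() above, written out as a helper.
 * It assumes the cluster size is a power of two, so cluster_size_mask ==
 * cluster_size - 1.  The helper name is hypothetical.
 *
 * Example, assuming a 4096-byte cluster (mask 0xfff): pos == 5000 and
 * count == 100 end at byte 5100, which rounds up to 8192, i.e. the first
 * byte of the next cluster.
 */
static inline s64 ntfs_example_round_up_to_cluster(loff_t pos, size_t count,
		u64 cluster_size_mask)
{
	/* First byte after the last cluster being written to. */
	return (pos + count + cluster_size_mask) & ~cluster_size_mask;
}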
    476
    477/**
    478 * __ntfs_grab_cache_pages - obtain a number of locked pages
    479 * @mapping:	address space mapping from which to obtain page cache pages
    480 * @index:	starting index in @mapping at which to begin obtaining pages
    481 * @nr_pages:	number of page cache pages to obtain
    482 * @pages:	array of pages in which to return the obtained page cache pages
    483 * @cached_page: allocated but as yet unused page
    484 *
    485 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
    486 * starting at index @index.
    487 *
    488 * If a page is newly created, add it to the LRU list.
    489 *
    490 * Note, the page locks are obtained in ascending page index order.
    491 */
    492static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
    493		pgoff_t index, const unsigned nr_pages, struct page **pages,
    494		struct page **cached_page)
    495{
    496	int err, nr;
    497
    498	BUG_ON(!nr_pages);
    499	err = nr = 0;
    500	do {
    501		pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
    502				FGP_ACCESSED);
    503		if (!pages[nr]) {
    504			if (!*cached_page) {
    505				*cached_page = page_cache_alloc(mapping);
    506				if (unlikely(!*cached_page)) {
    507					err = -ENOMEM;
    508					goto err_out;
    509				}
    510			}
    511			err = add_to_page_cache_lru(*cached_page, mapping,
    512				   index,
    513				   mapping_gfp_constraint(mapping, GFP_KERNEL));
    514			if (unlikely(err)) {
    515				if (err == -EEXIST)
    516					continue;
    517				goto err_out;
    518			}
    519			pages[nr] = *cached_page;
    520			*cached_page = NULL;
    521		}
    522		index++;
    523		nr++;
    524	} while (nr < nr_pages);
    525out:
    526	return err;
    527err_out:
    528	while (nr > 0) {
    529		unlock_page(pages[--nr]);
    530		put_page(pages[nr]);
    531	}
    532	goto out;
    533}
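
/*
 * Editorial usage sketch, not part of the original file and assumed rather
 * than copied from a real caller: the caller supplies the pages array and a
 * spare page pointer, and is responsible for unlocking and releasing every
 * page it was handed as well as any unused cached page.  The helper name and
 * the fixed array size are hypothetical.
 */
static int ntfs_example_grab_and_release(struct address_space *mapping,
		pgoff_t index, unsigned nr_pages)
{
	struct page *pages[8];		/* assume nr_pages <= 8 here */
	struct page *cached_page = NULL;
	unsigned u;
	int err;

	err = __ntfs_grab_cache_pages(mapping, index, nr_pages, pages,
			&cached_page);
	if (!err) {
		/* ... work on the locked pages here ... */
		for (u = 0; u < nr_pages; u++) {
			unlock_page(pages[u]);
			put_page(pages[u]);
		}
	}
	if (cached_page)
		put_page(cached_page);
	return err;
}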
    534
    535static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
    536{
    537	lock_buffer(bh);
    538	get_bh(bh);
    539	bh->b_end_io = end_buffer_read_sync;
    540	return submit_bh(REQ_OP_READ, 0, bh);
    541}
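
/*
 * Editorial note, not part of the original file: the submitted buffer heads
 * are collected and waited on later, as in
 * ntfs_prepare_pages_for_non_resident_write() below:
 *
 *	ntfs_submit_bh_for_read(bh);
 *	*wait_bh++ = bh;
 *	...
 *	while (wait_bh > wait) {
 *		bh = *--wait_bh;
 *		wait_on_buffer(bh);
 *		if (!buffer_uptodate(bh))
 *			err = -EIO;
 *	}
 */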
    542
    543/**
    544 * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
    545 * @pages:	array of destination pages
    546 * @nr_pages:	number of pages in @pages
    547 * @pos:	byte position in file at which the write begins
    548 * @bytes:	number of bytes to be written
    549 *
    550 * This is called for non-resident attributes from ntfs_file_buffered_write()
    551 * with i_mutex held on the inode (@pages[0]->mapping->host).  There are
    552 * @nr_pages pages in @pages which are locked but not kmap()ped.  The source
    553 * data has not yet been copied into the @pages.
    554 * 
    555 * Need to fill any holes with actual clusters, allocate buffers if necessary,
    556 * ensure all the buffers are mapped, and bring uptodate any buffers that are
    557 * only partially being written to.
    558 *
    559 * If @nr_pages is greater than one, we are guaranteed that the cluster size is
    560 * greater than PAGE_SIZE, that all pages in @pages are entirely inside
    561 * the same cluster and that they are the entirety of that cluster, and that
    562 * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
    563 *
    564 * i_size is not to be modified yet.
    565 *
    566 * Return 0 on success or -errno on error.
    567 */
    568static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
    569		unsigned nr_pages, s64 pos, size_t bytes)
    570{
    571	VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
    572	LCN lcn;
    573	s64 bh_pos, vcn_len, end, initialized_size;
    574	sector_t lcn_block;
    575	struct page *page;
    576	struct inode *vi;
    577	ntfs_inode *ni, *base_ni = NULL;
    578	ntfs_volume *vol;
    579	runlist_element *rl, *rl2;
    580	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
    581	ntfs_attr_search_ctx *ctx = NULL;
    582	MFT_RECORD *m = NULL;
    583	ATTR_RECORD *a = NULL;
    584	unsigned long flags;
    585	u32 attr_rec_len = 0;
    586	unsigned blocksize, u;
    587	int err, mp_size;
    588	bool rl_write_locked, was_hole, is_retry;
    589	unsigned char blocksize_bits;
    590	struct {
    591		u8 runlist_merged:1;
    592		u8 mft_attr_mapped:1;
    593		u8 mp_rebuilt:1;
    594		u8 attr_switched:1;
    595	} status = { 0, 0, 0, 0 };
    596
    597	BUG_ON(!nr_pages);
    598	BUG_ON(!pages);
    599	BUG_ON(!*pages);
    600	vi = pages[0]->mapping->host;
    601	ni = NTFS_I(vi);
    602	vol = ni->vol;
    603	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
    604			"index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
    605			vi->i_ino, ni->type, pages[0]->index, nr_pages,
    606			(long long)pos, bytes);
    607	blocksize = vol->sb->s_blocksize;
    608	blocksize_bits = vol->sb->s_blocksize_bits;
    609	u = 0;
    610	do {
    611		page = pages[u];
    612		BUG_ON(!page);
    613		/*
    614		 * create_empty_buffers() will create uptodate/dirty buffers if
    615		 * the page is uptodate/dirty.
    616		 */
    617		if (!page_has_buffers(page)) {
    618			create_empty_buffers(page, blocksize, 0);
    619			if (unlikely(!page_has_buffers(page)))
    620				return -ENOMEM;
    621		}
    622	} while (++u < nr_pages);
    623	rl_write_locked = false;
    624	rl = NULL;
    625	err = 0;
    626	vcn = lcn = -1;
    627	vcn_len = 0;
    628	lcn_block = -1;
    629	was_hole = false;
    630	cpos = pos >> vol->cluster_size_bits;
    631	end = pos + bytes;
    632	cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
    633	/*
    634	 * Loop over each page and for each page over each buffer.  Use goto to
    635	 * reduce indentation.
    636	 */
    637	u = 0;
    638do_next_page:
    639	page = pages[u];
    640	bh_pos = (s64)page->index << PAGE_SHIFT;
    641	bh = head = page_buffers(page);
    642	do {
    643		VCN cdelta;
    644		s64 bh_end;
    645		unsigned bh_cofs;
    646
    647		/* Clear buffer_new on all buffers to reinitialise state. */
    648		if (buffer_new(bh))
    649			clear_buffer_new(bh);
    650		bh_end = bh_pos + blocksize;
    651		bh_cpos = bh_pos >> vol->cluster_size_bits;
    652		bh_cofs = bh_pos & vol->cluster_size_mask;
    653		if (buffer_mapped(bh)) {
    654			/*
    655			 * The buffer is already mapped.  If it is uptodate,
    656			 * ignore it.
    657			 */
    658			if (buffer_uptodate(bh))
    659				continue;
    660			/*
    661			 * The buffer is not uptodate.  If the page is uptodate
    662			 * set the buffer uptodate and otherwise ignore it.
    663			 */
    664			if (PageUptodate(page)) {
    665				set_buffer_uptodate(bh);
    666				continue;
    667			}
    668			/*
    669			 * Neither the page nor the buffer are uptodate.  If
    670			 * the buffer is only partially being written to, we
    671			 * need to read it in before the write, i.e. now.
    672			 */
    673			if ((bh_pos < pos && bh_end > pos) ||
    674					(bh_pos < end && bh_end > end)) {
    675				/*
    676				 * If the buffer is fully or partially within
    677				 * the initialized size, do an actual read.
    678				 * Otherwise, simply zero the buffer.
    679				 */
    680				read_lock_irqsave(&ni->size_lock, flags);
    681				initialized_size = ni->initialized_size;
    682				read_unlock_irqrestore(&ni->size_lock, flags);
    683				if (bh_pos < initialized_size) {
    684					ntfs_submit_bh_for_read(bh);
    685					*wait_bh++ = bh;
    686				} else {
    687					zero_user(page, bh_offset(bh),
    688							blocksize);
    689					set_buffer_uptodate(bh);
    690				}
    691			}
    692			continue;
    693		}
    694		/* Unmapped buffer.  Need to map it. */
    695		bh->b_bdev = vol->sb->s_bdev;
    696		/*
    697		 * If the current buffer is in the same clusters as the map
    698		 * cache, there is no need to check the runlist again.  The
    699		 * map cache is made up of @vcn, which is the first cached file
    700		 * cluster, @vcn_len which is the number of cached file
    701		 * clusters, @lcn is the device cluster corresponding to @vcn,
    702		 * and @lcn_block is the block number corresponding to @lcn.
    703		 */
    704		cdelta = bh_cpos - vcn;
    705		if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
    706map_buffer_cached:
    707			BUG_ON(lcn < 0);
    708			bh->b_blocknr = lcn_block +
    709					(cdelta << (vol->cluster_size_bits -
    710					blocksize_bits)) +
    711					(bh_cofs >> blocksize_bits);
    712			set_buffer_mapped(bh);
    713			/*
    714			 * If the page is uptodate so is the buffer.  If the
    715			 * buffer is fully outside the write, we ignore it if
    716			 * it was already allocated and we mark it dirty so it
    717			 * gets written out if we allocated it.  On the other
    718			 * hand, if we allocated the buffer but we are not
    719			 * marking it dirty we set buffer_new so we can do
    720			 * error recovery.
    721			 */
    722			if (PageUptodate(page)) {
    723				if (!buffer_uptodate(bh))
    724					set_buffer_uptodate(bh);
    725				if (unlikely(was_hole)) {
    726					/* We allocated the buffer. */
    727					clean_bdev_bh_alias(bh);
    728					if (bh_end <= pos || bh_pos >= end)
    729						mark_buffer_dirty(bh);
    730					else
    731						set_buffer_new(bh);
    732				}
    733				continue;
    734			}
    735			/* Page is _not_ uptodate. */
    736			if (likely(!was_hole)) {
    737				/*
    738				 * Buffer was already allocated.  If it is not
    739				 * uptodate and is only partially being written
    740				 * to, we need to read it in before the write,
    741				 * i.e. now.
    742				 */
    743				if (!buffer_uptodate(bh) && bh_pos < end &&
    744						bh_end > pos &&
    745						(bh_pos < pos ||
    746						bh_end > end)) {
    747					/*
    748					 * If the buffer is fully or partially
    749					 * within the initialized size, do an
    750					 * actual read.  Otherwise, simply zero
    751					 * the buffer.
    752					 */
    753					read_lock_irqsave(&ni->size_lock,
    754							flags);
    755					initialized_size = ni->initialized_size;
    756					read_unlock_irqrestore(&ni->size_lock,
    757							flags);
    758					if (bh_pos < initialized_size) {
    759						ntfs_submit_bh_for_read(bh);
    760						*wait_bh++ = bh;
    761					} else {
    762						zero_user(page, bh_offset(bh),
    763								blocksize);
    764						set_buffer_uptodate(bh);
    765					}
    766				}
    767				continue;
    768			}
    769			/* We allocated the buffer. */
    770			clean_bdev_bh_alias(bh);
    771			/*
    772			 * If the buffer is fully outside the write, zero it,
    773			 * set it uptodate, and mark it dirty so it gets
    774			 * written out.  If it is partially being written to,
    775			 * zero region surrounding the write but leave it to
    776			 * commit write to do anything else.  Finally, if the
    777			 * buffer is fully being overwritten, do nothing.
    778			 */
    779			if (bh_end <= pos || bh_pos >= end) {
    780				if (!buffer_uptodate(bh)) {
    781					zero_user(page, bh_offset(bh),
    782							blocksize);
    783					set_buffer_uptodate(bh);
    784				}
    785				mark_buffer_dirty(bh);
    786				continue;
    787			}
    788			set_buffer_new(bh);
    789			if (!buffer_uptodate(bh) &&
    790					(bh_pos < pos || bh_end > end)) {
    791				u8 *kaddr;
    792				unsigned pofs;
    793					
    794				kaddr = kmap_atomic(page);
    795				if (bh_pos < pos) {
    796					pofs = bh_pos & ~PAGE_MASK;
    797					memset(kaddr + pofs, 0, pos - bh_pos);
    798				}
    799				if (bh_end > end) {
    800					pofs = end & ~PAGE_MASK;
    801					memset(kaddr + pofs, 0, bh_end - end);
    802				}
    803				kunmap_atomic(kaddr);
    804				flush_dcache_page(page);
    805			}
    806			continue;
    807		}
    808		/*
    809		 * Slow path: this is the first buffer in the cluster.  If it
    810		 * is outside allocated size and is not uptodate, zero it and
    811		 * set it uptodate.
    812		 */
    813		read_lock_irqsave(&ni->size_lock, flags);
    814		initialized_size = ni->allocated_size;
    815		read_unlock_irqrestore(&ni->size_lock, flags);
    816		if (bh_pos > initialized_size) {
    817			if (PageUptodate(page)) {
    818				if (!buffer_uptodate(bh))
    819					set_buffer_uptodate(bh);
    820			} else if (!buffer_uptodate(bh)) {
    821				zero_user(page, bh_offset(bh), blocksize);
    822				set_buffer_uptodate(bh);
    823			}
    824			continue;
    825		}
    826		is_retry = false;
    827		if (!rl) {
    828			down_read(&ni->runlist.lock);
    829retry_remap:
    830			rl = ni->runlist.rl;
    831		}
    832		if (likely(rl != NULL)) {
    833			/* Seek to element containing target cluster. */
    834			while (rl->length && rl[1].vcn <= bh_cpos)
    835				rl++;
    836			lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
    837			if (likely(lcn >= 0)) {
    838				/*
    839				 * Successful remap, setup the map cache and
    840				 * use that to deal with the buffer.
    841				 */
    842				was_hole = false;
    843				vcn = bh_cpos;
    844				vcn_len = rl[1].vcn - vcn;
    845				lcn_block = lcn << (vol->cluster_size_bits -
    846						blocksize_bits);
    847				cdelta = 0;
    848				/*
    849				 * If the number of remaining clusters touched
    850				 * by the write is smaller or equal to the
    851				 * number of cached clusters, unlock the
    852				 * runlist as the map cache will be used from
    853				 * now on.
    854				 */
    855				if (likely(vcn + vcn_len >= cend)) {
    856					if (rl_write_locked) {
    857						up_write(&ni->runlist.lock);
    858						rl_write_locked = false;
    859					} else
    860						up_read(&ni->runlist.lock);
    861					rl = NULL;
    862				}
    863				goto map_buffer_cached;
    864			}
    865		} else
    866			lcn = LCN_RL_NOT_MAPPED;
    867		/*
    868		 * If it is not a hole and not out of bounds, the runlist is
    869		 * probably unmapped so try to map it now.
    870		 */
    871		if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
    872			if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
    873				/* Attempt to map runlist. */
    874				if (!rl_write_locked) {
    875					/*
    876					 * We need the runlist locked for
    877					 * writing, so if it is locked for
    878					 * reading relock it now and retry in
    879					 * case it changed whilst we dropped
    880					 * the lock.
    881					 */
    882					up_read(&ni->runlist.lock);
    883					down_write(&ni->runlist.lock);
    884					rl_write_locked = true;
    885					goto retry_remap;
    886				}
    887				err = ntfs_map_runlist_nolock(ni, bh_cpos,
    888						NULL);
    889				if (likely(!err)) {
    890					is_retry = true;
    891					goto retry_remap;
    892				}
    893				/*
    894				 * If @vcn is out of bounds, pretend @lcn is
    895				 * LCN_ENOENT.  As long as the buffer is out
    896				 * of bounds this will work fine.
    897				 */
    898				if (err == -ENOENT) {
    899					lcn = LCN_ENOENT;
    900					err = 0;
    901					goto rl_not_mapped_enoent;
    902				}
    903			} else
    904				err = -EIO;
    905			/* Failed to map the buffer, even after retrying. */
    906			bh->b_blocknr = -1;
    907			ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
    908					"attribute type 0x%x, vcn 0x%llx, "
    909					"vcn offset 0x%x, because its "
    910					"location on disk could not be "
    911					"determined%s (error code %i).",
    912					ni->mft_no, ni->type,
    913					(unsigned long long)bh_cpos,
    914					(unsigned)bh_pos &
    915					vol->cluster_size_mask,
    916					is_retry ? " even after retrying" : "",
    917					err);
    918			break;
    919		}
    920rl_not_mapped_enoent:
    921		/*
    922		 * The buffer is in a hole or out of bounds.  We need to fill
    923		 * the hole, unless the buffer is in a cluster which is not
    924		 * touched by the write, in which case we just leave the buffer
    925		 * unmapped.  This can only happen when the cluster size is
    926		 * less than the page cache size.
    927		 */
    928		if (unlikely(vol->cluster_size < PAGE_SIZE)) {
    929			bh_cend = (bh_end + vol->cluster_size - 1) >>
    930					vol->cluster_size_bits;
    931			if ((bh_cend <= cpos || bh_cpos >= cend)) {
    932				bh->b_blocknr = -1;
    933				/*
    934				 * If the buffer is uptodate we skip it.  If it
    935				 * is not but the page is uptodate, we can set
    936				 * the buffer uptodate.  If the page is not
    937				 * uptodate, we can clear the buffer and set it
    938				 * uptodate.  Whether this is worthwhile is
    939				 * debatable and this could be removed.
    940				 */
    941				if (PageUptodate(page)) {
    942					if (!buffer_uptodate(bh))
    943						set_buffer_uptodate(bh);
    944				} else if (!buffer_uptodate(bh)) {
    945					zero_user(page, bh_offset(bh),
    946						blocksize);
    947					set_buffer_uptodate(bh);
    948				}
    949				continue;
    950			}
    951		}
    952		/*
    953		 * Out of bounds buffer is invalid if it was not really out of
    954		 * bounds.
    955		 */
    956		BUG_ON(lcn != LCN_HOLE);
    957		/*
    958		 * We need the runlist locked for writing, so if it is locked
    959		 * for reading relock it now and retry in case it changed
    960		 * whilst we dropped the lock.
    961		 */
    962		BUG_ON(!rl);
    963		if (!rl_write_locked) {
    964			up_read(&ni->runlist.lock);
    965			down_write(&ni->runlist.lock);
    966			rl_write_locked = true;
    967			goto retry_remap;
    968		}
    969		/* Find the previous last allocated cluster. */
    970		BUG_ON(rl->lcn != LCN_HOLE);
    971		lcn = -1;
    972		rl2 = rl;
    973		while (--rl2 >= ni->runlist.rl) {
    974			if (rl2->lcn >= 0) {
    975				lcn = rl2->lcn + rl2->length;
    976				break;
    977			}
    978		}
    979		rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
    980				false);
    981		if (IS_ERR(rl2)) {
    982			err = PTR_ERR(rl2);
    983			ntfs_debug("Failed to allocate cluster, error code %i.",
    984					err);
    985			break;
    986		}
    987		lcn = rl2->lcn;
    988		rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
    989		if (IS_ERR(rl)) {
    990			err = PTR_ERR(rl);
    991			if (err != -ENOMEM)
    992				err = -EIO;
    993			if (ntfs_cluster_free_from_rl(vol, rl2)) {
    994				ntfs_error(vol->sb, "Failed to release "
    995						"allocated cluster in error "
    996						"code path.  Run chkdsk to "
    997						"recover the lost cluster.");
    998				NVolSetErrors(vol);
    999			}
   1000			ntfs_free(rl2);
   1001			break;
   1002		}
   1003		ni->runlist.rl = rl;
   1004		status.runlist_merged = 1;
   1005		ntfs_debug("Allocated cluster, lcn 0x%llx.",
   1006				(unsigned long long)lcn);
   1007		/* Map and lock the mft record and get the attribute record. */
   1008		if (!NInoAttr(ni))
   1009			base_ni = ni;
   1010		else
   1011			base_ni = ni->ext.base_ntfs_ino;
   1012		m = map_mft_record(base_ni);
   1013		if (IS_ERR(m)) {
   1014			err = PTR_ERR(m);
   1015			break;
   1016		}
   1017		ctx = ntfs_attr_get_search_ctx(base_ni, m);
   1018		if (unlikely(!ctx)) {
   1019			err = -ENOMEM;
   1020			unmap_mft_record(base_ni);
   1021			break;
   1022		}
   1023		status.mft_attr_mapped = 1;
   1024		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
   1025				CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
   1026		if (unlikely(err)) {
   1027			if (err == -ENOENT)
   1028				err = -EIO;
   1029			break;
   1030		}
   1031		m = ctx->mrec;
   1032		a = ctx->attr;
   1033		/*
   1034		 * Find the runlist element with which the attribute extent
   1035		 * starts.  Note, we cannot use the _attr_ version because we
   1036		 * have mapped the mft record.  That is ok because we know the
   1037		 * runlist fragment must be mapped already to have ever gotten
   1038		 * here, so we can just use the _rl_ version.
   1039		 */
   1040		vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
   1041		rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
   1042		BUG_ON(!rl2);
   1043		BUG_ON(!rl2->length);
   1044		BUG_ON(rl2->lcn < LCN_HOLE);
   1045		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
   1046		/*
   1047		 * If @highest_vcn is zero, calculate the real highest_vcn
   1048		 * (which can really be zero).
   1049		 */
   1050		if (!highest_vcn)
   1051			highest_vcn = (sle64_to_cpu(
   1052					a->data.non_resident.allocated_size) >>
   1053					vol->cluster_size_bits) - 1;
   1054		/*
   1055		 * Determine the size of the mapping pairs array for the new
   1056		 * extent, i.e. the old extent with the hole filled.
   1057		 */
   1058		mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
   1059				highest_vcn);
   1060		if (unlikely(mp_size <= 0)) {
   1061			if (!(err = mp_size))
   1062				err = -EIO;
   1063			ntfs_debug("Failed to get size for mapping pairs "
   1064					"array, error code %i.", err);
   1065			break;
   1066		}
   1067		/*
   1068		 * Resize the attribute record to fit the new mapping pairs
   1069		 * array.
   1070		 */
   1071		attr_rec_len = le32_to_cpu(a->length);
   1072		err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
   1073				a->data.non_resident.mapping_pairs_offset));
   1074		if (unlikely(err)) {
   1075			BUG_ON(err != -ENOSPC);
   1076			// TODO: Deal with this by using the current attribute
   1077			// and fill it with as much of the mapping pairs
   1078			// array as possible.  Then loop over each attribute
   1079			// extent rewriting the mapping pairs arrays as we go
   1080			// along and if when we reach the end we have not
   1081			// enough space, try to resize the last attribute
   1082			// extent and if even that fails, add a new attribute
   1083			// extent.
   1084			// We could also try to resize at each step in the hope
   1085			// that we will not need to rewrite every single extent.
   1086			// Note, we may need to decompress some extents to fill
   1087			// the runlist as we are walking the extents...
   1088			ntfs_error(vol->sb, "Not enough space in the mft "
   1089					"record for the extended attribute "
   1090					"record.  This case is not "
   1091					"implemented yet.");
   1092			err = -EOPNOTSUPP;
   1093			break;
   1094		}
   1095		status.mp_rebuilt = 1;
   1096		/*
   1097		 * Generate the mapping pairs array directly into the attribute
   1098		 * record.
   1099		 */
   1100		err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
   1101				a->data.non_resident.mapping_pairs_offset),
   1102				mp_size, rl2, vcn, highest_vcn, NULL);
   1103		if (unlikely(err)) {
   1104			ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
   1105					"attribute type 0x%x, because building "
   1106					"the mapping pairs failed with error "
   1107					"code %i.", vi->i_ino,
   1108					(unsigned)le32_to_cpu(ni->type), err);
   1109			err = -EIO;
   1110			break;
   1111		}
   1112		/* Update the highest_vcn but only if it was not set. */
   1113		if (unlikely(!a->data.non_resident.highest_vcn))
   1114			a->data.non_resident.highest_vcn =
   1115					cpu_to_sle64(highest_vcn);
   1116		/*
   1117		 * If the attribute is sparse/compressed, update the compressed
   1118		 * size in the ntfs_inode structure and the attribute record.
   1119		 */
   1120		if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
   1121			/*
   1122			 * If we are not in the first attribute extent, switch
   1123			 * to it, but first ensure the changes will make it to
   1124			 * disk later.
   1125			 */
   1126			if (a->data.non_resident.lowest_vcn) {
   1127				flush_dcache_mft_record_page(ctx->ntfs_ino);
   1128				mark_mft_record_dirty(ctx->ntfs_ino);
   1129				ntfs_attr_reinit_search_ctx(ctx);
   1130				err = ntfs_attr_lookup(ni->type, ni->name,
   1131						ni->name_len, CASE_SENSITIVE,
   1132						0, NULL, 0, ctx);
   1133				if (unlikely(err)) {
   1134					status.attr_switched = 1;
   1135					break;
   1136				}
   1137				/* @m is not used any more so do not set it. */
   1138				a = ctx->attr;
   1139			}
   1140			write_lock_irqsave(&ni->size_lock, flags);
   1141			ni->itype.compressed.size += vol->cluster_size;
   1142			a->data.non_resident.compressed_size =
   1143					cpu_to_sle64(ni->itype.compressed.size);
   1144			write_unlock_irqrestore(&ni->size_lock, flags);
   1145		}
   1146		/* Ensure the changes make it to disk. */
   1147		flush_dcache_mft_record_page(ctx->ntfs_ino);
   1148		mark_mft_record_dirty(ctx->ntfs_ino);
   1149		ntfs_attr_put_search_ctx(ctx);
   1150		unmap_mft_record(base_ni);
   1151		/* Successfully filled the hole. */
   1152		status.runlist_merged = 0;
   1153		status.mft_attr_mapped = 0;
   1154		status.mp_rebuilt = 0;
   1155		/* Setup the map cache and use that to deal with the buffer. */
   1156		was_hole = true;
   1157		vcn = bh_cpos;
   1158		vcn_len = 1;
   1159		lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
   1160		cdelta = 0;
   1161		/*
   1162		 * If the number of remaining clusters in the @pages is smaller
   1163		 * or equal to the number of cached clusters, unlock the
   1164		 * runlist as the map cache will be used from now on.
   1165		 */
   1166		if (likely(vcn + vcn_len >= cend)) {
   1167			up_write(&ni->runlist.lock);
   1168			rl_write_locked = false;
   1169			rl = NULL;
   1170		}
   1171		goto map_buffer_cached;
   1172	} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
   1173	/* If there are no errors, do the next page. */
   1174	if (likely(!err && ++u < nr_pages))
   1175		goto do_next_page;
   1176	/* If there are no errors, release the runlist lock if we took it. */
   1177	if (likely(!err)) {
   1178		if (unlikely(rl_write_locked)) {
   1179			up_write(&ni->runlist.lock);
   1180			rl_write_locked = false;
   1181		} else if (unlikely(rl))
   1182			up_read(&ni->runlist.lock);
   1183		rl = NULL;
   1184	}
   1185	/* If we issued read requests, let them complete. */
   1186	read_lock_irqsave(&ni->size_lock, flags);
   1187	initialized_size = ni->initialized_size;
   1188	read_unlock_irqrestore(&ni->size_lock, flags);
   1189	while (wait_bh > wait) {
   1190		bh = *--wait_bh;
   1191		wait_on_buffer(bh);
   1192		if (likely(buffer_uptodate(bh))) {
   1193			page = bh->b_page;
   1194			bh_pos = ((s64)page->index << PAGE_SHIFT) +
   1195					bh_offset(bh);
   1196			/*
   1197			 * If the buffer overflows the initialized size, need
   1198			 * to zero the overflowing region.
   1199			 */
   1200			if (unlikely(bh_pos + blocksize > initialized_size)) {
   1201				int ofs = 0;
   1202
   1203				if (likely(bh_pos < initialized_size))
   1204					ofs = initialized_size - bh_pos;
   1205				zero_user_segment(page, bh_offset(bh) + ofs,
   1206						blocksize);
   1207			}
   1208		} else /* if (unlikely(!buffer_uptodate(bh))) */
   1209			err = -EIO;
   1210	}
   1211	if (likely(!err)) {
   1212		/* Clear buffer_new on all buffers. */
   1213		u = 0;
   1214		do {
   1215			bh = head = page_buffers(pages[u]);
   1216			do {
   1217				if (buffer_new(bh))
   1218					clear_buffer_new(bh);
   1219			} while ((bh = bh->b_this_page) != head);
   1220		} while (++u < nr_pages);
   1221		ntfs_debug("Done.");
   1222		return err;
   1223	}
   1224	if (status.attr_switched) {
   1225		/* Get back to the attribute extent we modified. */
   1226		ntfs_attr_reinit_search_ctx(ctx);
   1227		if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
   1228				CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
   1229			ntfs_error(vol->sb, "Failed to find required "
   1230					"attribute extent of attribute in "
   1231					"error code path.  Run chkdsk to "
   1232					"recover.");
   1233			write_lock_irqsave(&ni->size_lock, flags);
   1234			ni->itype.compressed.size += vol->cluster_size;
   1235			write_unlock_irqrestore(&ni->size_lock, flags);
   1236			flush_dcache_mft_record_page(ctx->ntfs_ino);
   1237			mark_mft_record_dirty(ctx->ntfs_ino);
   1238			/*
   1239			 * The only thing that is now wrong is the compressed
   1240			 * size of the base attribute extent which chkdsk
   1241			 * should be able to fix.
   1242			 */
   1243			NVolSetErrors(vol);
   1244		} else {
   1245			m = ctx->mrec;
   1246			a = ctx->attr;
   1247			status.attr_switched = 0;
   1248		}
   1249	}
   1250	/*
   1251	 * If the runlist has been modified, need to restore it by punching a
   1252	 * hole into it and we then need to deallocate the on-disk cluster as
   1253	 * well.  Note, we only modify the runlist if we are able to generate a
   1254	 * new mapping pairs array, i.e. only when the mapped attribute extent
   1255	 * is not switched.
   1256	 */
   1257	if (status.runlist_merged && !status.attr_switched) {
   1258		BUG_ON(!rl_write_locked);
   1259		/* Make the file cluster we allocated sparse in the runlist. */
   1260		if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
   1261			ntfs_error(vol->sb, "Failed to punch hole into "
   1262					"attribute runlist in error code "
   1263					"path.  Run chkdsk to recover the "
   1264					"lost cluster.");
   1265			NVolSetErrors(vol);
   1266		} else /* if (success) */ {
   1267			status.runlist_merged = 0;
   1268			/*
   1269			 * Deallocate the on-disk cluster we allocated but only
   1270			 * if we succeeded in punching its vcn out of the
   1271			 * runlist.
   1272			 */
   1273			down_write(&vol->lcnbmp_lock);
   1274			if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
   1275				ntfs_error(vol->sb, "Failed to release "
   1276						"allocated cluster in error "
   1277						"code path.  Run chkdsk to "
   1278						"recover the lost cluster.");
   1279				NVolSetErrors(vol);
   1280			}
   1281			up_write(&vol->lcnbmp_lock);
   1282		}
   1283	}
   1284	/*
   1285	 * Resize the attribute record to its old size and rebuild the mapping
   1286	 * pairs array.  Note, we only can do this if the runlist has been
   1287	 * restored to its old state which also implies that the mapped
   1288	 * attribute extent is not switched.
   1289	 */
   1290	if (status.mp_rebuilt && !status.runlist_merged) {
   1291		if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
   1292			ntfs_error(vol->sb, "Failed to restore attribute "
   1293					"record in error code path.  Run "
   1294					"chkdsk to recover.");
   1295			NVolSetErrors(vol);
   1296		} else /* if (success) */ {
   1297			if (ntfs_mapping_pairs_build(vol, (u8*)a +
   1298					le16_to_cpu(a->data.non_resident.
   1299					mapping_pairs_offset), attr_rec_len -
   1300					le16_to_cpu(a->data.non_resident.
   1301					mapping_pairs_offset), ni->runlist.rl,
   1302					vcn, highest_vcn, NULL)) {
   1303				ntfs_error(vol->sb, "Failed to restore "
   1304						"mapping pairs array in error "
   1305						"code path.  Run chkdsk to "
   1306						"recover.");
   1307				NVolSetErrors(vol);
   1308			}
   1309			flush_dcache_mft_record_page(ctx->ntfs_ino);
   1310			mark_mft_record_dirty(ctx->ntfs_ino);
   1311		}
   1312	}
   1313	/* Release the mft record and the attribute. */
   1314	if (status.mft_attr_mapped) {
   1315		ntfs_attr_put_search_ctx(ctx);
   1316		unmap_mft_record(base_ni);
   1317	}
   1318	/* Release the runlist lock. */
   1319	if (rl_write_locked)
   1320		up_write(&ni->runlist.lock);
   1321	else if (rl)
   1322		up_read(&ni->runlist.lock);
   1323	/*
   1324	 * Zero out any newly allocated blocks to avoid exposing stale data.
   1325	 * If BH_New is set, we know that the block was newly allocated above
   1326	 * and that it has not been fully zeroed and marked dirty yet.
   1327	 */
   1328	nr_pages = u;
   1329	u = 0;
   1330	end = bh_cpos << vol->cluster_size_bits;
   1331	do {
   1332		page = pages[u];
   1333		bh = head = page_buffers(page);
   1334		do {
   1335			if (u == nr_pages &&
   1336					((s64)page->index << PAGE_SHIFT) +
   1337					bh_offset(bh) >= end)
   1338				break;
   1339			if (!buffer_new(bh))
   1340				continue;
   1341			clear_buffer_new(bh);
   1342			if (!buffer_uptodate(bh)) {
   1343				if (PageUptodate(page))
   1344					set_buffer_uptodate(bh);
   1345				else {
   1346					zero_user(page, bh_offset(bh),
   1347							blocksize);
   1348					set_buffer_uptodate(bh);
   1349				}
   1350			}
   1351			mark_buffer_dirty(bh);
   1352		} while ((bh = bh->b_this_page) != head);
   1353	} while (++u <= nr_pages);
   1354	ntfs_error(vol->sb, "Failed.  Returning error code %i.", err);
   1355	return err;
   1356}
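
/*
 * Editorial illustration, not part of the original file: the cluster range
 * [cpos, cend) computed near the top of the function above, with a worked
 * example.  The helper name is hypothetical.
 *
 * Example, assuming a 64 KiB cluster (cluster_size_bits == 16): pos ==
 * 0x12345 and bytes == 0x1000 give end == 0x13345, hence cpos == 1 and
 * cend == 2, i.e. the write touches cluster 1 only.
 */
static inline void ntfs_example_write_cluster_range(s64 pos, size_t bytes,
		u8 cluster_size_bits, u32 cluster_size, VCN *cpos, VCN *cend)
{
	/* First cluster touched by the write. */
	*cpos = pos >> cluster_size_bits;
	/* One past the last cluster touched by the write. */
	*cend = (pos + bytes + cluster_size - 1) >> cluster_size_bits;
}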
   1357
   1358static inline void ntfs_flush_dcache_pages(struct page **pages,
   1359		unsigned nr_pages)
   1360{
   1361	BUG_ON(!nr_pages);
   1362	/*
   1363	 * Warning: Do not do the decrement at the same time as the call to
   1364	 * flush_dcache_page() because it is a NULL macro on i386 and hence the
   1365	 * decrement never happens so the loop never terminates.
   1366	 */
   1367	do {
   1368		--nr_pages;
   1369		flush_dcache_page(pages[nr_pages]);
   1370	} while (nr_pages > 0);
   1371}
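
/*
 * Editorial illustration, not part of the original file: the pattern the
 * warning above guards against would be
 *
 *	do {
 *		flush_dcache_page(pages[--nr_pages]);
 *	} while (nr_pages > 0);
 *
 * Where flush_dcache_page() is a no-op macro the argument, and with it the
 * decrement, is discarded, so nr_pages never reaches zero and the loop never
 * terminates.  Keeping the decrement as a separate statement avoids this.
 */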
   1372
   1373/**
   1374 * ntfs_commit_pages_after_non_resident_write - commit the received data
   1375 * @pages:	array of destination pages
   1376 * @nr_pages:	number of pages in @pages
   1377 * @pos:	byte position in file at which the write begins
   1378 * @bytes:	number of bytes to be written
   1379 *
   1380 * See description of ntfs_commit_pages_after_write(), below.
   1381 */
   1382static inline int ntfs_commit_pages_after_non_resident_write(
   1383		struct page **pages, const unsigned nr_pages,
   1384		s64 pos, size_t bytes)
   1385{
   1386	s64 end, initialized_size;
   1387	struct inode *vi;
   1388	ntfs_inode *ni, *base_ni;
   1389	struct buffer_head *bh, *head;
   1390	ntfs_attr_search_ctx *ctx;
   1391	MFT_RECORD *m;
   1392	ATTR_RECORD *a;
   1393	unsigned long flags;
   1394	unsigned blocksize, u;
   1395	int err;
   1396
   1397	vi = pages[0]->mapping->host;
   1398	ni = NTFS_I(vi);
   1399	blocksize = vi->i_sb->s_blocksize;
   1400	end = pos + bytes;
   1401	u = 0;
   1402	do {
   1403		s64 bh_pos;
   1404		struct page *page;
   1405		bool partial;
   1406
   1407		page = pages[u];
   1408		bh_pos = (s64)page->index << PAGE_SHIFT;
   1409		bh = head = page_buffers(page);
   1410		partial = false;
   1411		do {
   1412			s64 bh_end;
   1413
   1414			bh_end = bh_pos + blocksize;
   1415			if (bh_end <= pos || bh_pos >= end) {
   1416				if (!buffer_uptodate(bh))
   1417					partial = true;
   1418			} else {
   1419				set_buffer_uptodate(bh);
   1420				mark_buffer_dirty(bh);
   1421			}
   1422		} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
   1423		/*
   1424		 * If all buffers are now uptodate but the page is not, set the
   1425		 * page uptodate.
   1426		 */
   1427		if (!partial && !PageUptodate(page))
   1428			SetPageUptodate(page);
   1429	} while (++u < nr_pages);
   1430	/*
   1431	 * Finally, if we do not need to update initialized_size or i_size we
   1432	 * are finished.
   1433	 */
   1434	read_lock_irqsave(&ni->size_lock, flags);
   1435	initialized_size = ni->initialized_size;
   1436	read_unlock_irqrestore(&ni->size_lock, flags);
   1437	if (end <= initialized_size) {
   1438		ntfs_debug("Done.");
   1439		return 0;
   1440	}
   1441	/*
   1442	 * Update initialized_size/i_size as appropriate, both in the inode and
   1443	 * the mft record.
   1444	 */
   1445	if (!NInoAttr(ni))
   1446		base_ni = ni;
   1447	else
   1448		base_ni = ni->ext.base_ntfs_ino;
   1449	/* Map, pin, and lock the mft record. */
   1450	m = map_mft_record(base_ni);
   1451	if (IS_ERR(m)) {
   1452		err = PTR_ERR(m);
   1453		m = NULL;
   1454		ctx = NULL;
   1455		goto err_out;
   1456	}
   1457	BUG_ON(!NInoNonResident(ni));
   1458	ctx = ntfs_attr_get_search_ctx(base_ni, m);
   1459	if (unlikely(!ctx)) {
   1460		err = -ENOMEM;
   1461		goto err_out;
   1462	}
   1463	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
   1464			CASE_SENSITIVE, 0, NULL, 0, ctx);
   1465	if (unlikely(err)) {
   1466		if (err == -ENOENT)
   1467			err = -EIO;
   1468		goto err_out;
   1469	}
   1470	a = ctx->attr;
   1471	BUG_ON(!a->non_resident);
   1472	write_lock_irqsave(&ni->size_lock, flags);
   1473	BUG_ON(end > ni->allocated_size);
   1474	ni->initialized_size = end;
   1475	a->data.non_resident.initialized_size = cpu_to_sle64(end);
   1476	if (end > i_size_read(vi)) {
   1477		i_size_write(vi, end);
   1478		a->data.non_resident.data_size =
   1479				a->data.non_resident.initialized_size;
   1480	}
   1481	write_unlock_irqrestore(&ni->size_lock, flags);
   1482	/* Mark the mft record dirty, so it gets written back. */
   1483	flush_dcache_mft_record_page(ctx->ntfs_ino);
   1484	mark_mft_record_dirty(ctx->ntfs_ino);
   1485	ntfs_attr_put_search_ctx(ctx);
   1486	unmap_mft_record(base_ni);
   1487	ntfs_debug("Done.");
   1488	return 0;
   1489err_out:
   1490	if (ctx)
   1491		ntfs_attr_put_search_ctx(ctx);
   1492	if (m)
   1493		unmap_mft_record(base_ni);
   1494	ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
   1495			"code %i).", err);
   1496	if (err != -ENOMEM)
   1497		NVolSetErrors(ni->vol);
   1498	return err;
   1499}
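
/*
 * Editorial illustration, not part of the original file: the interval test
 * used in the commit loop above.  A buffer covering [bh_pos, bh_pos +
 * blocksize) lies entirely outside the written range [pos, end) exactly when
 * it ends at or before pos or starts at or after end; only buffers that
 * overlap the write are marked uptodate and dirty.  The helper name is
 * hypothetical.
 */
static inline bool ntfs_example_bh_outside_write(s64 bh_pos,
		unsigned blocksize, s64 pos, s64 end)
{
	return bh_pos + blocksize <= pos || bh_pos >= end;
}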
   1500
   1501/**
   1502 * ntfs_commit_pages_after_write - commit the received data
   1503 * @pages:	array of destination pages
   1504 * @nr_pages:	number of pages in @pages
   1505 * @pos:	byte position in file at which the write begins
   1506 * @bytes:	number of bytes to be written
   1507 *
   1508 * This is called from ntfs_perform_write() with the inode lock held on the
   1509 * inode (@pages[0]->mapping->host).  There are @nr_pages pages in @pages which
   1510 * are locked but not kmap()ped.  The source data has already been copied into
   1511 * @pages.  ntfs_prepare_pages_for_non_resident_write() has been called before
   1512 * the data was copied (for non-resident attributes only) and it returned
   1513 * success.
   1514 *
   1515 * We need to mark all buffers within the boundary of the write uptodate and
   1516 * dirty.  If all buffers in a page are uptodate we set the page uptodate, too.
   1517 *
   1518 * Setting the buffers dirty ensures that they get written out later when
   1519 * ntfs_writepage() is invoked by the VM.
   1520 *
   1521 * Finally, we need to update i_size and initialized_size as appropriate both
   1522 * in the inode and the mft record.
   1523 *
   1524 * This is modelled after fs/buffer.c::generic_commit_write(), which marks
   1525 * buffers uptodate and dirty, sets the page uptodate if all buffers in the
   1526 * page are uptodate, and updates i_size if the end of I/O is beyond i_size.  In
   1527 * that case, it also marks the inode dirty.
   1528 *
   1529 * If things have gone as outlined in
   1530 * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
   1531 * content modifications here for non-resident attributes.  For resident
   1532 * attributes we bring the page uptodate here, combining it with the copy
   1533 * into the mft record, which saves one atomic kmap.
   1534 *
   1535 * Return 0 on success or -errno on error.
   1536 */
   1537static int ntfs_commit_pages_after_write(struct page **pages,
   1538		const unsigned nr_pages, s64 pos, size_t bytes)
   1539{
   1540	s64 end, initialized_size;
   1541	loff_t i_size;
   1542	struct inode *vi;
   1543	ntfs_inode *ni, *base_ni;
   1544	struct page *page;
   1545	ntfs_attr_search_ctx *ctx;
   1546	MFT_RECORD *m;
   1547	ATTR_RECORD *a;
   1548	char *kattr, *kaddr;
   1549	unsigned long flags;
   1550	u32 attr_len;
   1551	int err;
   1552
   1553	BUG_ON(!nr_pages);
   1554	BUG_ON(!pages);
   1555	page = pages[0];
   1556	BUG_ON(!page);
   1557	vi = page->mapping->host;
   1558	ni = NTFS_I(vi);
   1559	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
   1560			"index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
   1561			vi->i_ino, ni->type, page->index, nr_pages,
   1562			(long long)pos, bytes);
   1563	if (NInoNonResident(ni))
   1564		return ntfs_commit_pages_after_non_resident_write(pages,
   1565				nr_pages, pos, bytes);
   1566	BUG_ON(nr_pages > 1);
   1567	/*
   1568	 * Attribute is resident, implying it is not compressed, encrypted, or
   1569	 * sparse.
   1570	 */
   1571	if (!NInoAttr(ni))
   1572		base_ni = ni;
   1573	else
   1574		base_ni = ni->ext.base_ntfs_ino;
   1575	BUG_ON(NInoNonResident(ni));
   1576	/* Map, pin, and lock the mft record. */
   1577	m = map_mft_record(base_ni);
   1578	if (IS_ERR(m)) {
   1579		err = PTR_ERR(m);
   1580		m = NULL;
   1581		ctx = NULL;
   1582		goto err_out;
   1583	}
   1584	ctx = ntfs_attr_get_search_ctx(base_ni, m);
   1585	if (unlikely(!ctx)) {
   1586		err = -ENOMEM;
   1587		goto err_out;
   1588	}
   1589	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
   1590			CASE_SENSITIVE, 0, NULL, 0, ctx);
   1591	if (unlikely(err)) {
   1592		if (err == -ENOENT)
   1593			err = -EIO;
   1594		goto err_out;
   1595	}
   1596	a = ctx->attr;
   1597	BUG_ON(a->non_resident);
   1598	/* The total length of the attribute value. */
   1599	attr_len = le32_to_cpu(a->data.resident.value_length);
   1600	i_size = i_size_read(vi);
   1601	BUG_ON(attr_len != i_size);
   1602	BUG_ON(pos > attr_len);
   1603	end = pos + bytes;
   1604	BUG_ON(end > le32_to_cpu(a->length) -
   1605			le16_to_cpu(a->data.resident.value_offset));
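       	/*
       	 * The write must fit inside the space reserved for the value
       	 * within the attribute record; the earlier prepare/extend path
       	 * is expected to have made room before the data was copied.
       	 */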
   1606	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
   1607	kaddr = kmap_atomic(page);
   1608	/* Copy the received data from the page to the mft record. */
   1609	memcpy(kattr + pos, kaddr + pos, bytes);
   1610	/* Update the attribute length if necessary. */
   1611	if (end > attr_len) {
   1612		attr_len = end;
   1613		a->data.resident.value_length = cpu_to_le32(attr_len);
   1614	}
   1615	/*
   1616	 * If the page is not uptodate, bring the out-of-bounds area(s)
   1617	 * uptodate by copying data from the mft record to the page.
   1618	 */
   1619	if (!PageUptodate(page)) {
   1620		if (pos > 0)
   1621			memcpy(kaddr, kattr, pos);
   1622		if (end < attr_len)
   1623			memcpy(kaddr + end, kattr + end, attr_len - end);
   1624		/* Zero the region outside the end of the attribute value. */
   1625		memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len);
   1626		flush_dcache_page(page);
   1627		SetPageUptodate(page);
   1628	}
   1629	kunmap_atomic(kaddr);
   1630	/* Update initialized_size/i_size if necessary. */
   1631	read_lock_irqsave(&ni->size_lock, flags);
   1632	initialized_size = ni->initialized_size;
   1633	BUG_ON(end > ni->allocated_size);
   1634	read_unlock_irqrestore(&ni->size_lock, flags);
   1635	BUG_ON(initialized_size != i_size);
   1636	if (end > initialized_size) {
   1637		write_lock_irqsave(&ni->size_lock, flags);
   1638		ni->initialized_size = end;
   1639		i_size_write(vi, end);
   1640		write_unlock_irqrestore(&ni->size_lock, flags);
   1641	}
   1642	/* Mark the mft record dirty, so it gets written back. */
   1643	flush_dcache_mft_record_page(ctx->ntfs_ino);
   1644	mark_mft_record_dirty(ctx->ntfs_ino);
   1645	ntfs_attr_put_search_ctx(ctx);
   1646	unmap_mft_record(base_ni);
   1647	ntfs_debug("Done.");
   1648	return 0;
   1649err_out:
   1650	if (err == -ENOMEM) {
   1651		ntfs_warning(vi->i_sb, "Error allocating memory required to "
   1652				"commit the write.");
   1653		if (PageUptodate(page)) {
   1654			ntfs_warning(vi->i_sb, "Page is uptodate, setting "
   1655					"dirty so the write will be retried "
   1656					"later on by the VM.");
   1657			/*
   1658			 * Put the page on mapping->dirty_pages, but leave its
   1659			 * buffers' dirty state as-is.
   1660			 */
   1661			__set_page_dirty_nobuffers(page);
   1662			err = 0;
   1663		} else
   1664			ntfs_error(vi->i_sb, "Page is not uptodate.  Written "
   1665					"data has been lost.");
   1666	} else {
   1667		ntfs_error(vi->i_sb, "Resident attribute commit write failed "
   1668				"with error %i.", err);
   1669		NVolSetErrors(ni->vol);
   1670	}
   1671	if (ctx)
   1672		ntfs_attr_put_search_ctx(ctx);
   1673	if (m)
   1674		unmap_mft_record(base_ni);
   1675	return err;
   1676}
   1677
   1678/*
   1679 * Copy as much as we can into the pages and return the number of bytes which
   1680 * were successfully copied.  If a fault is encountered then clear the pages
   1681 * out to (ofs + bytes) and return the number of bytes which were copied.
   1682 */
   1683static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
   1684		unsigned ofs, struct iov_iter *i, size_t bytes)
   1685{
   1686	struct page **last_page = pages + nr_pages;
   1687	size_t total = 0;
   1688	unsigned len, copied;
   1689
   1690	do {
   1691		len = PAGE_SIZE - ofs;
   1692		if (len > bytes)
   1693			len = bytes;
   1694		copied = copy_page_from_iter_atomic(*pages, ofs, len, i);
   1695		total += copied;
   1696		bytes -= copied;
   1697		if (!bytes)
   1698			break;
   1699		if (copied < len)
   1700			goto err;
   1701		ofs = 0;
   1702	} while (++pages < last_page);
   1703out:
   1704	return total;
   1705err:
   1706	/* Zero the rest of the target like __copy_from_user(). */
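       	/*
       	 * The faulting page is zeroed from offset @copied onwards, any
       	 * following pages from offset 0, until the remaining @bytes of
       	 * the request have been cleared.
       	 */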
   1707	len = PAGE_SIZE - copied;
   1708	do {
   1709		if (len > bytes)
   1710			len = bytes;
   1711		zero_user(*pages, copied, len);
   1712		bytes -= len;
   1713		copied = 0;
   1714		len = PAGE_SIZE;
   1715	} while (++pages < last_page);
   1716	goto out;
   1717}
   1718
   1719/**
   1720 * ntfs_perform_write - perform buffered write to a file
   1721 * @file:	file to write to
   1722 * @i:		iov_iter with data to write
   1723 * @pos:	byte offset in the file at which to begin writing
   1724 */
   1725static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
   1726		loff_t pos)
   1727{
   1728	struct address_space *mapping = file->f_mapping;
   1729	struct inode *vi = mapping->host;
   1730	ntfs_inode *ni = NTFS_I(vi);
   1731	ntfs_volume *vol = ni->vol;
   1732	struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
   1733	struct page *cached_page = NULL;
   1734	VCN last_vcn;
   1735	LCN lcn;
   1736	size_t bytes;
   1737	ssize_t status, written = 0;
   1738	unsigned nr_pages;
   1739
   1740	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
   1741			"0x%llx, count 0x%lx.", vi->i_ino,
   1742			(unsigned)le32_to_cpu(ni->type),
   1743			(unsigned long long)pos,
   1744			(unsigned long)iov_iter_count(i));
   1745	/*
   1746	 * If a previous ntfs_truncate() failed, repeat it and abort if it
   1747	 * fails again.
   1748	 */
   1749	if (unlikely(NInoTruncateFailed(ni))) {
   1750		int err;
   1751
   1752		inode_dio_wait(vi);
   1753		err = ntfs_truncate(vi);
   1754		if (err || NInoTruncateFailed(ni)) {
   1755			if (!err)
   1756				err = -EIO;
   1757			ntfs_error(vol->sb, "Cannot perform write to inode "
   1758					"0x%lx, attribute type 0x%x, because "
   1759					"ntfs_truncate() failed (error code "
   1760					"%i).", vi->i_ino,
   1761					(unsigned)le32_to_cpu(ni->type), err);
   1762			return err;
   1763		}
   1764	}
   1765	/*
   1766	 * Determine the number of pages per cluster for non-resident
   1767	 * attributes.
   1768	 */
   1769	nr_pages = 1;
   1770	if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni))
   1771		nr_pages = vol->cluster_size >> PAGE_SHIFT;
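       	/*
       	 * Example: with 4 KiB pages and a 64 KiB cluster, nr_pages is
       	 * 16, so a write landing in a sparse (hole) cluster below must
       	 * grab and prepare every page covering that cluster.
       	 */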
   1772	last_vcn = -1;
   1773	do {
   1774		VCN vcn;
   1775		pgoff_t start_idx;
   1776		unsigned ofs, do_pages, u;
   1777		size_t copied;
   1778
   1779		start_idx = pos >> PAGE_SHIFT;
   1780		ofs = pos & ~PAGE_MASK;
   1781		bytes = PAGE_SIZE - ofs;
   1782		do_pages = 1;
   1783		if (nr_pages > 1) {
   1784			vcn = pos >> vol->cluster_size_bits;
   1785			if (vcn != last_vcn) {
   1786				last_vcn = vcn;
   1787				/*
   1788				 * Get the lcn of the vcn the write is in.  If
   1789				 * it is a hole, we need to lock down all pages
   1790				 * in the cluster.
   1791				 */
   1792				down_read(&ni->runlist.lock);
   1793				lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
   1794						vol->cluster_size_bits, false);
   1795				up_read(&ni->runlist.lock);
   1796				if (unlikely(lcn < LCN_HOLE)) {
   1797					if (lcn == LCN_ENOMEM)
   1798						status = -ENOMEM;
   1799					else {
   1800						status = -EIO;
   1801						ntfs_error(vol->sb, "Cannot "
   1802							"perform write to "
   1803							"inode 0x%lx, "
   1804							"attribute type 0x%x, "
   1805							"because the attribute "
   1806							"is corrupt.",
   1807							vi->i_ino, (unsigned)
   1808							le32_to_cpu(ni->type));
   1809					}
   1810					break;
   1811				}
   1812				if (lcn == LCN_HOLE) {
   1813					start_idx = (pos & ~(s64)
   1814							vol->cluster_size_mask)
   1815							>> PAGE_SHIFT;
   1816					bytes = vol->cluster_size - (pos &
   1817							vol->cluster_size_mask);
   1818					do_pages = nr_pages;
   1819				}
   1820			}
   1821		}
   1822		if (bytes > iov_iter_count(i))
   1823			bytes = iov_iter_count(i);
   1824again:
   1825		/*
   1826		 * Bring in the user page(s) that we will copy from _first_.
   1827		 * Otherwise there is a nasty deadlock on copying from the same
   1828		 * page(s) as we are writing to, without it/them being marked
   1829		 * up-to-date.  Note, at present there is nothing to stop the
   1830		 * pages being swapped out between us bringing them into memory
   1831		 * and doing the actual copying.
   1832		 */
   1833		if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
   1834			status = -EFAULT;
   1835			break;
   1836		}
   1837		/* Get and lock @do_pages starting at index @start_idx. */
   1838		status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
   1839				pages, &cached_page);
   1840		if (unlikely(status))
   1841			break;
   1842		/*
   1843		 * For non-resident attributes, we need to fill any holes with
   1844		 * actual clusters and ensure all buffers are mapped.  We also
   1845		 * need to bring uptodate any buffers that are only partially
   1846		 * being written to.
   1847		 */
   1848		if (NInoNonResident(ni)) {
   1849			status = ntfs_prepare_pages_for_non_resident_write(
   1850					pages, do_pages, pos, bytes);
   1851			if (unlikely(status)) {
   1852				do {
   1853					unlock_page(pages[--do_pages]);
   1854					put_page(pages[do_pages]);
   1855				} while (do_pages);
   1856				break;
   1857			}
   1858		}
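       		/*
       		 * When a whole cluster was grabbed to fill a hole, @pos may
       		 * lie in a later page of @pages; @u indexes the first page
       		 * actually being written to.
       		 */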
   1859		u = (pos >> PAGE_SHIFT) - pages[0]->index;
   1860		copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
   1861					i, bytes);
   1862		ntfs_flush_dcache_pages(pages + u, do_pages - u);
   1863		status = 0;
   1864		if (likely(copied == bytes)) {
   1865			status = ntfs_commit_pages_after_write(pages, do_pages,
   1866					pos, bytes);
   1867		}
   1868		do {
   1869			unlock_page(pages[--do_pages]);
   1870			put_page(pages[do_pages]);
   1871		} while (do_pages);
   1872		if (unlikely(status < 0)) {
   1873			iov_iter_revert(i, copied);
   1874			break;
   1875		}
   1876		cond_resched();
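       		/*
       		 * A short copy means the source faulted part way: rewind
       		 * the iterator and retry.  If some data was copied, retry
       		 * just that much; if nothing was, fall back to at most one
       		 * page so the next fault-in attempt is more likely to
       		 * succeed.
       		 */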
   1877		if (unlikely(copied < bytes)) {
   1878			iov_iter_revert(i, copied);
   1879			if (copied)
   1880				bytes = copied;
   1881			else if (bytes > PAGE_SIZE - ofs)
   1882				bytes = PAGE_SIZE - ofs;
   1883			goto again;
   1884		}
   1885		pos += copied;
   1886		written += copied;
   1887		balance_dirty_pages_ratelimited(mapping);
   1888		if (fatal_signal_pending(current)) {
   1889			status = -EINTR;
   1890			break;
   1891		}
   1892	} while (iov_iter_count(i));
   1893	if (cached_page)
   1894		put_page(cached_page);
   1895	ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
   1896			written ? "written" : "status", (unsigned long)written,
   1897			(long)status);
   1898	return written ? written : status;
   1899}
   1900
   1901/**
   1902 * ntfs_file_write_iter - write data to an open file via the page cache
   1903 * @iocb:	IO state structure
   1904 * @from:	iov_iter with data to write
   1905 *
   1906 * Basically the same as generic_file_write_iter() except that it ends up
   1907 * calling ntfs_perform_write() instead of generic_perform_write() and that
   1908 * O_DIRECT is not implemented.
   1909 */
   1910static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
   1911{
   1912	struct file *file = iocb->ki_filp;
   1913	struct inode *vi = file_inode(file);
   1914	ssize_t written = 0;
   1915	ssize_t err;
   1916
   1917	inode_lock(vi);
   1918	/* We can write back this queue in page reclaim. */
   1919	current->backing_dev_info = inode_to_bdi(vi);
   1920	err = ntfs_prepare_file_for_write(iocb, from);
   1921	if (iov_iter_count(from) && !err)
   1922		written = ntfs_perform_write(file, from, iocb->ki_pos);
   1923	current->backing_dev_info = NULL;
   1924	inode_unlock(vi);
   1925	iocb->ki_pos += written;
   1926	if (likely(written > 0))
   1927		written = generic_write_sync(iocb, written);
   1928	return written ? written : err;
   1929}
   1930
   1931/**
   1932 * ntfs_file_fsync - sync a file to disk
   1933 * @filp:	file to be synced
   1934 * @datasync:	if non-zero only flush user data and not metadata
   1935 *
   1936 * Data integrity sync of a file to disk.  Used for fsync, fdatasync, and msync
   1937 * system calls.  This function is inspired by fs/buffer.c::file_fsync().
   1938 *
   1939 * If @datasync is false, write the mft record and all associated extent mft
   1940 * records as well as the $DATA attribute and then sync the block device.
   1941 *
   1942 * If @datasync is true and the attribute is non-resident, we skip the writing
   1943 * of the mft record and all associated extent mft records (this might still
   1944 * happen due to the write_inode_now() call).
   1945 *
   1946 * Also, if @datasync is true, we do not wait on the inode to be written out
   1947 * but we always wait on the page cache pages to be written out.
   1948 *
   1949 * Locking: The function itself takes and releases the inode lock (i_rwsem).
   1950 *
   1951 * TODO: We should probably also write all attribute/index inodes associated
   1952 * with this inode but since we have no simple way of getting to them we ignore
   1953 * this problem for now.
   1954 */
   1955static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
   1956			   int datasync)
   1957{
   1958	struct inode *vi = filp->f_mapping->host;
   1959	int err, ret = 0;
   1960
   1961	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
   1962
   1963	err = file_write_and_wait_range(filp, start, end);
   1964	if (err)
   1965		return err;
   1966	inode_lock(vi);
   1967
   1968	BUG_ON(S_ISDIR(vi->i_mode));
   1969	if (!datasync || !NInoNonResident(NTFS_I(vi)))
   1970		ret = __ntfs_write_inode(vi, 1);
   1971	write_inode_now(vi, !datasync);
   1972	/*
   1973	 * NOTE: If we were to use mapping->private_list (see ext2 and
   1974	 * fs/buffer.c) for dirty blocks then we could optimize the below to be
   1975	 * sync_mapping_buffers(vi->i_mapping).
   1976	 */
   1977	err = sync_blockdev(vi->i_sb->s_bdev);
   1978	if (unlikely(err && !ret))
   1979		ret = err;
   1980	if (likely(!ret))
   1981		ntfs_debug("Done.");
   1982	else
   1983		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
   1984				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
   1985	inode_unlock(vi);
   1986	return ret;
   1987}
   1988
   1989#endif /* NTFS_RW */
   1990
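       /*
        * File operations for regular NTFS files.  Reads, llseek, mmap and
        * splice use the generic page-cache helpers; write and fsync are only
        * available when the driver is built with read-write support (NTFS_RW).
        */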
   1991const struct file_operations ntfs_file_ops = {
   1992	.llseek		= generic_file_llseek,
   1993	.read_iter	= generic_file_read_iter,
   1994#ifdef NTFS_RW
   1995	.write_iter	= ntfs_file_write_iter,
   1996	.fsync		= ntfs_file_fsync,
   1997#endif /* NTFS_RW */
   1998	.mmap		= generic_file_mmap,
   1999	.open		= ntfs_file_open,
   2000	.splice_read	= generic_file_splice_read,
   2001};
   2002
   2003const struct inode_operations ntfs_file_inode_ops = {
   2004#ifdef NTFS_RW
   2005	.setattr	= ntfs_setattr,
   2006#endif /* NTFS_RW */
   2007};
   2008
   2009const struct file_operations ntfs_empty_file_ops = {};
   2010
   2011const struct inode_operations ntfs_empty_inode_ops = {};