cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

filemap.c (114344B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 *	linux/mm/filemap.c
      4 *
      5 * Copyright (C) 1994-1999  Linus Torvalds
      6 */
      7
      8/*
      9 * This file handles the generic file mmap semantics used by
     10 * most "normal" filesystems (but you don't /have/ to use this:
     11 * the NFS filesystem used to do this differently, for example)
     12 */
     13#include <linux/export.h>
     14#include <linux/compiler.h>
     15#include <linux/dax.h>
     16#include <linux/fs.h>
     17#include <linux/sched/signal.h>
     18#include <linux/uaccess.h>
     19#include <linux/capability.h>
     20#include <linux/kernel_stat.h>
     21#include <linux/gfp.h>
     22#include <linux/mm.h>
     23#include <linux/swap.h>
     24#include <linux/swapops.h>
     25#include <linux/mman.h>
     26#include <linux/pagemap.h>
     27#include <linux/file.h>
     28#include <linux/uio.h>
     29#include <linux/error-injection.h>
     30#include <linux/hash.h>
     31#include <linux/writeback.h>
     32#include <linux/backing-dev.h>
     33#include <linux/pagevec.h>
     34#include <linux/security.h>
     35#include <linux/cpuset.h>
     36#include <linux/hugetlb.h>
     37#include <linux/memcontrol.h>
     38#include <linux/shmem_fs.h>
     39#include <linux/rmap.h>
     40#include <linux/delayacct.h>
     41#include <linux/psi.h>
     42#include <linux/ramfs.h>
     43#include <linux/page_idle.h>
     44#include <linux/migrate.h>
     45#include <asm/pgalloc.h>
     46#include <asm/tlbflush.h>
     47#include "internal.h"
     48
     49#define CREATE_TRACE_POINTS
     50#include <trace/events/filemap.h>
     51
     52/*
     53 * FIXME: remove all knowledge of the buffer layer from the core VM
     54 */
     55#include <linux/buffer_head.h> /* for try_to_free_buffers */
     56
     57#include <asm/mman.h>
     58
     59/*
     60 * Shared mappings implemented 30.11.1994. It's not fully working yet,
     61 * though.
     62 *
     63 * Shared mappings now work. 15.8.1995  Bruno.
     64 *
     65 * finished 'unifying' the page and buffer cache and SMP-threaded the
     66 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
     67 *
     68 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
     69 */
     70
     71/*
     72 * Lock ordering:
     73 *
     74 *  ->i_mmap_rwsem		(truncate_pagecache)
     75 *    ->private_lock		(__free_pte->block_dirty_folio)
     76 *      ->swap_lock		(exclusive_swap_page, others)
     77 *        ->i_pages lock
     78 *
     79 *  ->i_rwsem
     80 *    ->invalidate_lock		(acquired by fs in truncate path)
     81 *      ->i_mmap_rwsem		(truncate->unmap_mapping_range)
     82 *
     83 *  ->mmap_lock
     84 *    ->i_mmap_rwsem
     85 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
     86 *        ->i_pages lock	(arch-dependent flush_dcache_mmap_lock)
     87 *
     88 *  ->mmap_lock
     89 *    ->invalidate_lock		(filemap_fault)
     90 *      ->lock_page		(filemap_fault, access_process_vm)
     91 *
     92 *  ->i_rwsem			(generic_perform_write)
     93 *    ->mmap_lock		(fault_in_readable->do_page_fault)
     94 *
     95 *  bdi->wb.list_lock
     96 *    sb_lock			(fs/fs-writeback.c)
     97 *    ->i_pages lock		(__sync_single_inode)
     98 *
     99 *  ->i_mmap_rwsem
    100 *    ->anon_vma.lock		(vma_adjust)
    101 *
    102 *  ->anon_vma.lock
    103 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
    104 *
    105 *  ->page_table_lock or pte_lock
    106 *    ->swap_lock		(try_to_unmap_one)
    107 *    ->private_lock		(try_to_unmap_one)
    108 *    ->i_pages lock		(try_to_unmap_one)
    109 *    ->lruvec->lru_lock	(follow_page->mark_page_accessed)
    110 *    ->lruvec->lru_lock	(check_pte_range->isolate_lru_page)
    111 *    ->private_lock		(page_remove_rmap->set_page_dirty)
    112 *    ->i_pages lock		(page_remove_rmap->set_page_dirty)
    113 *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
    114 *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
    115 *    ->memcg->move_lock	(page_remove_rmap->lock_page_memcg)
    116 *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
    117 *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
    118 *    ->private_lock		(zap_pte_range->block_dirty_folio)
    119 *
    120 * ->i_mmap_rwsem
    121 *   ->tasklist_lock            (memory_failure, collect_procs_ao)
    122 */
    123
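/*
 * Illustrative, hedged sketch of the truncate/hole-punch ordering documented
 * above: ->i_rwsem first, then ->invalidate_lock, with ->i_mmap_rwsem and the
 * i_pages lock taken further down inside the truncation helpers.  The
 * function name example_punch_hole_locking is hypothetical, not kernel API.
 */
static __maybe_unused void example_punch_hole_locking(struct inode *inode,
						      loff_t start, loff_t end)
{
	struct address_space *mapping = inode->i_mapping;

	inode_lock(inode);			/* ->i_rwsem */
	filemap_invalidate_lock(mapping);	/* ->invalidate_lock */
	/* Unmapping and truncation nest ->i_mmap_rwsem and the i_pages lock. */
	truncate_pagecache_range(inode, start, end);
	filemap_invalidate_unlock(mapping);
	inode_unlock(inode);
}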
    124static void page_cache_delete(struct address_space *mapping,
    125				   struct folio *folio, void *shadow)
    126{
    127	XA_STATE(xas, &mapping->i_pages, folio->index);
    128	long nr = 1;
    129
    130	mapping_set_update(&xas, mapping);
    131
    132	/* hugetlb pages are represented by a single entry in the xarray */
    133	if (!folio_test_hugetlb(folio)) {
    134		xas_set_order(&xas, folio->index, folio_order(folio));
    135		nr = folio_nr_pages(folio);
    136	}
    137
    138	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
    139
    140	xas_store(&xas, shadow);
    141	xas_init_marks(&xas);
    142
    143	folio->mapping = NULL;
    144	/* Leave page->index set: truncation lookup relies upon it */
    145	mapping->nrpages -= nr;
    146}
    147
    148static void filemap_unaccount_folio(struct address_space *mapping,
    149		struct folio *folio)
    150{
    151	long nr;
    152
    153	VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
    154	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
    155		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
    156			 current->comm, folio_pfn(folio));
    157		dump_page(&folio->page, "still mapped when deleted");
    158		dump_stack();
    159		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
    160
    161		if (mapping_exiting(mapping) && !folio_test_large(folio)) {
    162			int mapcount = page_mapcount(&folio->page);
    163
    164			if (folio_ref_count(folio) >= mapcount + 2) {
    165				/*
    166				 * All vmas have already been torn down, so it's
    167				 * a good bet that actually the page is unmapped
    168				 * and we'd rather not leak it: if we're wrong,
    169				 * another bad page check should catch it later.
    170				 */
    171				page_mapcount_reset(&folio->page);
    172				folio_ref_sub(folio, mapcount);
    173			}
    174		}
    175	}
    176
    177	/* hugetlb folios do not participate in page cache accounting. */
    178	if (folio_test_hugetlb(folio))
    179		return;
    180
    181	nr = folio_nr_pages(folio);
    182
    183	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
    184	if (folio_test_swapbacked(folio)) {
    185		__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
    186		if (folio_test_pmd_mappable(folio))
    187			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
    188	} else if (folio_test_pmd_mappable(folio)) {
    189		__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
    190		filemap_nr_thps_dec(mapping);
    191	}
    192
    193	/*
    194	 * At this point folio must be either written or cleaned by
    195	 * truncate.  Dirty folio here signals a bug and loss of
    196	 * unwritten data - on ordinary filesystems.
    197	 *
    198	 * But it's harmless on in-memory filesystems like tmpfs; and can
    199	 * occur when a driver which did get_user_pages() sets page dirty
    200	 * before putting it, while the inode is being finally evicted.
    201	 *
    202	 * Below fixes dirty accounting after removing the folio entirely
    203	 * but leaves the dirty flag set: it has no effect for truncated
    204	 * folio and anyway will be cleared before returning folio to
    205	 * buddy allocator.
    206	 */
    207	if (WARN_ON_ONCE(folio_test_dirty(folio) &&
    208			 mapping_can_writeback(mapping)))
    209		folio_account_cleaned(folio, inode_to_wb(mapping->host));
    210}
    211
    212/*
    213 * Delete a page from the page cache and free it. Caller has to make
    214 * sure the page is locked and that nobody else uses it - or that usage
    215 * is safe.  The caller must hold the i_pages lock.
    216 */
    217void __filemap_remove_folio(struct folio *folio, void *shadow)
    218{
    219	struct address_space *mapping = folio->mapping;
    220
    221	trace_mm_filemap_delete_from_page_cache(folio);
    222	filemap_unaccount_folio(mapping, folio);
    223	page_cache_delete(mapping, folio, shadow);
    224}
    225
    226void filemap_free_folio(struct address_space *mapping, struct folio *folio)
    227{
    228	void (*free_folio)(struct folio *);
    229	int refs = 1;
    230
    231	free_folio = mapping->a_ops->free_folio;
    232	if (free_folio)
    233		free_folio(folio);
    234
    235	if (folio_test_large(folio) && !folio_test_hugetlb(folio))
    236		refs = folio_nr_pages(folio);
    237	folio_put_refs(folio, refs);
    238}
    239
    240/**
    241 * filemap_remove_folio - Remove folio from page cache.
    242 * @folio: The folio.
    243 *
    244 * This must be called only on folios that are locked and have been
    245 * verified to be in the page cache.  It will never put the folio into
    246 * the free list because the caller has a reference on the page.
    247 */
    248void filemap_remove_folio(struct folio *folio)
    249{
    250	struct address_space *mapping = folio->mapping;
    251
    252	BUG_ON(!folio_test_locked(folio));
    253	spin_lock(&mapping->host->i_lock);
    254	xa_lock_irq(&mapping->i_pages);
    255	__filemap_remove_folio(folio, NULL);
    256	xa_unlock_irq(&mapping->i_pages);
    257	if (mapping_shrinkable(mapping))
    258		inode_add_lru(mapping->host);
    259	spin_unlock(&mapping->host->i_lock);
    260
    261	filemap_free_folio(mapping, folio);
    262}
    263
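/*
 * Hedged usage sketch for filemap_remove_folio(): the caller must hold its
 * own reference, lock the folio and re-check that it still belongs to the
 * mapping (truncation may have raced).  example_remove_if_cached is a
 * hypothetical name, not part of this file.
 */
static __maybe_unused void example_remove_if_cached(struct address_space *mapping,
						    struct folio *folio)
{
	folio_lock(folio);
	if (folio->mapping == mapping)
		filemap_remove_folio(folio);
	folio_unlock(folio);
}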
    264/*
    265 * page_cache_delete_batch - delete several folios from page cache
    266 * @mapping: the mapping to which folios belong
    267 * @fbatch: batch of folios to delete
    268 *
    269 * The function walks over mapping->i_pages and removes folios passed in
    270 * @fbatch from the mapping. The function expects @fbatch to be sorted
    271 * by page index and is optimised for it to be dense.
    272 * It tolerates holes in @fbatch (mapping entries at those indices are not
    273 * modified).
    274 *
    275 * The function expects the i_pages lock to be held.
    276 */
    277static void page_cache_delete_batch(struct address_space *mapping,
    278			     struct folio_batch *fbatch)
    279{
    280	XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
    281	long total_pages = 0;
    282	int i = 0;
    283	struct folio *folio;
    284
    285	mapping_set_update(&xas, mapping);
    286	xas_for_each(&xas, folio, ULONG_MAX) {
    287		if (i >= folio_batch_count(fbatch))
    288			break;
    289
    290		/* A swap/dax/shadow entry got inserted? Skip it. */
    291		if (xa_is_value(folio))
    292			continue;
    293		/*
    294		 * A page got inserted in our range? Skip it. We have our
    295		 * pages locked so they are protected from being removed.
    296		 * If we see a page whose index is higher than ours, it
    297		 * means our page has been removed, which shouldn't be
    298		 * possible because we're holding the PageLock.
    299		 */
    300		if (folio != fbatch->folios[i]) {
    301			VM_BUG_ON_FOLIO(folio->index >
    302					fbatch->folios[i]->index, folio);
    303			continue;
    304		}
    305
    306		WARN_ON_ONCE(!folio_test_locked(folio));
    307
    308		folio->mapping = NULL;
    309		/* Leave folio->index set: truncation lookup relies on it */
    310
    311		i++;
    312		xas_store(&xas, NULL);
    313		total_pages += folio_nr_pages(folio);
    314	}
    315	mapping->nrpages -= total_pages;
    316}
    317
    318void delete_from_page_cache_batch(struct address_space *mapping,
    319				  struct folio_batch *fbatch)
    320{
    321	int i;
    322
    323	if (!folio_batch_count(fbatch))
    324		return;
    325
    326	spin_lock(&mapping->host->i_lock);
    327	xa_lock_irq(&mapping->i_pages);
    328	for (i = 0; i < folio_batch_count(fbatch); i++) {
    329		struct folio *folio = fbatch->folios[i];
    330
    331		trace_mm_filemap_delete_from_page_cache(folio);
    332		filemap_unaccount_folio(mapping, folio);
    333	}
    334	page_cache_delete_batch(mapping, fbatch);
    335	xa_unlock_irq(&mapping->i_pages);
    336	if (mapping_shrinkable(mapping))
    337		inode_add_lru(mapping->host);
    338	spin_unlock(&mapping->host->i_lock);
    339
    340	for (i = 0; i < folio_batch_count(fbatch); i++)
    341		filemap_free_folio(mapping, fbatch->folios[i]);
    342}
    343
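/*
 * Hedged sketch of a truncate-style caller of delete_from_page_cache_batch():
 * it assumes @fbatch already holds locked folios belonging to @mapping,
 * sorted by index, each pinned by the batch lookup.  The helper name is
 * hypothetical; folio_batch_release() drops the lookup references.
 */
static __maybe_unused void example_drop_locked_batch(struct address_space *mapping,
						     struct folio_batch *fbatch)
{
	unsigned int i;

	delete_from_page_cache_batch(mapping, fbatch);
	for (i = 0; i < folio_batch_count(fbatch); i++)
		folio_unlock(fbatch->folios[i]);
	folio_batch_release(fbatch);
}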
    344int filemap_check_errors(struct address_space *mapping)
    345{
    346	int ret = 0;
    347	/* Check for outstanding write errors */
    348	if (test_bit(AS_ENOSPC, &mapping->flags) &&
    349	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
    350		ret = -ENOSPC;
    351	if (test_bit(AS_EIO, &mapping->flags) &&
    352	    test_and_clear_bit(AS_EIO, &mapping->flags))
    353		ret = -EIO;
    354	return ret;
    355}
    356EXPORT_SYMBOL(filemap_check_errors);
    357
    358static int filemap_check_and_keep_errors(struct address_space *mapping)
    359{
    360	/* Check for outstanding write errors */
    361	if (test_bit(AS_EIO, &mapping->flags))
    362		return -EIO;
    363	if (test_bit(AS_ENOSPC, &mapping->flags))
    364		return -ENOSPC;
    365	return 0;
    366}
    367
    368/**
    369 * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
    370 * @mapping:	address space structure to write
    371 * @wbc:	the writeback_control controlling the writeout
    372 *
    373 * Call writepages on the mapping using the provided wbc to control the
    374 * writeout.
    375 *
    376 * Return: %0 on success, negative error code otherwise.
    377 */
    378int filemap_fdatawrite_wbc(struct address_space *mapping,
    379			   struct writeback_control *wbc)
    380{
    381	int ret;
    382
    383	if (!mapping_can_writeback(mapping) ||
    384	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
    385		return 0;
    386
    387	wbc_attach_fdatawrite_inode(wbc, mapping->host);
    388	ret = do_writepages(mapping, wbc);
    389	wbc_detach_inode(wbc);
    390	return ret;
    391}
    392EXPORT_SYMBOL(filemap_fdatawrite_wbc);
    393
    394/**
    395 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
    396 * @mapping:	address space structure to write
    397 * @start:	offset in bytes where the range starts
    398 * @end:	offset in bytes where the range ends (inclusive)
    399 * @sync_mode:	enable synchronous operation
    400 *
    401 * Start writeback against all of a mapping's dirty pages that lie
    402 * within the byte offsets <start, end> inclusive.
    403 *
    404 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
    405 * opposed to a regular memory cleansing writeback.  The difference between
    406 * these two operations is that if a dirty page/buffer is encountered, it must
    407 * be waited upon, and not just skipped over.
    408 *
    409 * Return: %0 on success, negative error code otherwise.
    410 */
    411int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
    412				loff_t end, int sync_mode)
    413{
    414	struct writeback_control wbc = {
    415		.sync_mode = sync_mode,
    416		.nr_to_write = LONG_MAX,
    417		.range_start = start,
    418		.range_end = end,
    419	};
    420
    421	return filemap_fdatawrite_wbc(mapping, &wbc);
    422}
    423
    424static inline int __filemap_fdatawrite(struct address_space *mapping,
    425	int sync_mode)
    426{
    427	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
    428}
    429
    430int filemap_fdatawrite(struct address_space *mapping)
    431{
    432	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
    433}
    434EXPORT_SYMBOL(filemap_fdatawrite);
    435
    436int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
    437				loff_t end)
    438{
    439	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
    440}
    441EXPORT_SYMBOL(filemap_fdatawrite_range);
    442
    443/**
    444 * filemap_flush - mostly a non-blocking flush
    445 * @mapping:	target address_space
    446 *
    447 * This is a mostly non-blocking flush.  Not suitable for data-integrity
    448 * purposes - I/O may not be started against all dirty pages.
    449 *
    450 * Return: %0 on success, negative error code otherwise.
    451 */
    452int filemap_flush(struct address_space *mapping)
    453{
    454	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
    455}
    456EXPORT_SYMBOL(filemap_flush);
    457
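/*
 * Hedged sketch contrasting the two sync modes described above: WB_SYNC_ALL
 * (data integrity, no dirty page in the range may be skipped) versus
 * WB_SYNC_NONE (cleansing writeback, as used by filemap_flush()).  The
 * function name is hypothetical.
 */
static __maybe_unused int example_start_writeback(struct address_space *mapping,
						  loff_t start, loff_t end,
						  bool data_integrity)
{
	return __filemap_fdatawrite_range(mapping, start, end,
					  data_integrity ? WB_SYNC_ALL
							 : WB_SYNC_NONE);
}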
    458/**
    459 * filemap_range_has_page - check if a page exists in range.
    460 * @mapping:           address space within which to check
    461 * @start_byte:        offset in bytes where the range starts
    462 * @end_byte:          offset in bytes where the range ends (inclusive)
    463 *
    464 * Find at least one page in the range supplied, usually used to check if
    465 * direct writing in this range will trigger a writeback.
    466 *
    467 * Return: %true if at least one page exists in the specified range,
    468 * %false otherwise.
    469 */
    470bool filemap_range_has_page(struct address_space *mapping,
    471			   loff_t start_byte, loff_t end_byte)
    472{
    473	struct page *page;
    474	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
    475	pgoff_t max = end_byte >> PAGE_SHIFT;
    476
    477	if (end_byte < start_byte)
    478		return false;
    479
    480	rcu_read_lock();
    481	for (;;) {
    482		page = xas_find(&xas, max);
    483		if (xas_retry(&xas, page))
    484			continue;
    485		/* Shadow entries don't count */
    486		if (xa_is_value(page))
    487			continue;
    488		/*
    489		 * We don't need to try to pin this page; we're about to
    490		 * release the RCU lock anyway.  It is enough to know that
    491		 * there was a page here recently.
    492		 */
    493		break;
    494	}
    495	rcu_read_unlock();
    496
    497	return page != NULL;
    498}
    499EXPORT_SYMBOL(filemap_range_has_page);
    500
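/*
 * Hedged sketch of the use case named in the kernel-doc above: before a
 * direct write, flush and wait only if cached pages overlap the range, so
 * the cached and on-disk views cannot diverge.  example_dio_write_prep is a
 * hypothetical helper, not part of this file.
 */
static __maybe_unused int example_dio_write_prep(struct file *file, loff_t pos,
						 size_t count)
{
	struct address_space *mapping = file->f_mapping;
	loff_t end = pos + count - 1;

	if (!filemap_range_has_page(mapping, pos, end))
		return 0;
	return filemap_write_and_wait_range(mapping, pos, end);
}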
    501static void __filemap_fdatawait_range(struct address_space *mapping,
    502				     loff_t start_byte, loff_t end_byte)
    503{
    504	pgoff_t index = start_byte >> PAGE_SHIFT;
    505	pgoff_t end = end_byte >> PAGE_SHIFT;
    506	struct pagevec pvec;
    507	int nr_pages;
    508
    509	if (end_byte < start_byte)
    510		return;
    511
    512	pagevec_init(&pvec);
    513	while (index <= end) {
    514		unsigned i;
    515
    516		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
    517				end, PAGECACHE_TAG_WRITEBACK);
    518		if (!nr_pages)
    519			break;
    520
    521		for (i = 0; i < nr_pages; i++) {
    522			struct page *page = pvec.pages[i];
    523
    524			wait_on_page_writeback(page);
    525			ClearPageError(page);
    526		}
    527		pagevec_release(&pvec);
    528		cond_resched();
    529	}
    530}
    531
    532/**
    533 * filemap_fdatawait_range - wait for writeback to complete
    534 * @mapping:		address space structure to wait for
    535 * @start_byte:		offset in bytes where the range starts
    536 * @end_byte:		offset in bytes where the range ends (inclusive)
    537 *
    538 * Walk the list of under-writeback pages of the given address space
    539 * in the given range and wait for all of them.  Check error status of
    540 * the address space and return it.
    541 *
    542 * Since the error status of the address space is cleared by this function,
    543 * callers are responsible for checking the return value and handling and/or
    544 * reporting the error.
    545 *
    546 * Return: error status of the address space.
    547 */
    548int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
    549			    loff_t end_byte)
    550{
    551	__filemap_fdatawait_range(mapping, start_byte, end_byte);
    552	return filemap_check_errors(mapping);
    553}
    554EXPORT_SYMBOL(filemap_fdatawait_range);
    555
    556/**
    557 * filemap_fdatawait_range_keep_errors - wait for writeback to complete
    558 * @mapping:		address space structure to wait for
    559 * @start_byte:		offset in bytes where the range starts
    560 * @end_byte:		offset in bytes where the range ends (inclusive)
    561 *
    562 * Walk the list of under-writeback pages of the given address space in the
    563 * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
    564 * this function does not clear error status of the address space.
    565 *
    566 * Use this function if callers don't handle errors themselves.  Expected
    567 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
    568 * fsfreeze(8)
    569 */
    570int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
    571		loff_t start_byte, loff_t end_byte)
    572{
    573	__filemap_fdatawait_range(mapping, start_byte, end_byte);
    574	return filemap_check_and_keep_errors(mapping);
    575}
    576EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
    577
    578/**
    579 * file_fdatawait_range - wait for writeback to complete
    580 * @file:		file pointing to address space structure to wait for
    581 * @start_byte:		offset in bytes where the range starts
    582 * @end_byte:		offset in bytes where the range ends (inclusive)
    583 *
    584 * Walk the list of under-writeback pages of the address space that file
    585 * refers to, in the given range and wait for all of them.  Check error
    586 * status of the address space vs. the file->f_wb_err cursor and return it.
    587 *
    588 * Since the error status of the file is advanced by this function,
    589 * callers are responsible for checking the return value and handling and/or
    590 * reporting the error.
    591 *
    592 * Return: error status of the address space vs. the file->f_wb_err cursor.
    593 */
    594int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
    595{
    596	struct address_space *mapping = file->f_mapping;
    597
    598	__filemap_fdatawait_range(mapping, start_byte, end_byte);
    599	return file_check_and_advance_wb_err(file);
    600}
    601EXPORT_SYMBOL(file_fdatawait_range);
    602
    603/**
    604 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
    605 * @mapping: address space structure to wait for
    606 *
    607 * Walk the list of under-writeback pages of the given address space
    608 * and wait for all of them.  Unlike filemap_fdatawait(), this function
    609 * does not clear error status of the address space.
    610 *
    611 * Use this function if callers don't handle errors themselves.  Expected
    612 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
    613 * fsfreeze(8)
    614 *
    615 * Return: error status of the address space.
    616 */
    617int filemap_fdatawait_keep_errors(struct address_space *mapping)
    618{
    619	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
    620	return filemap_check_and_keep_errors(mapping);
    621}
    622EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
    623
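/*
 * Hedged sketch of a sync(2)/fsfreeze(8)-style flusher: it starts integrity
 * writeback and waits, but uses the _keep_errors variant so AS_EIO/AS_ENOSPC
 * are left for a later fsync() on the file to report.  The function name is
 * hypothetical.
 */
static __maybe_unused int example_flusher_sync_mapping(struct address_space *mapping)
{
	int ret = filemap_fdatawrite(mapping);
	int ret2 = filemap_fdatawait_keep_errors(mapping);

	return ret ? ret : ret2;
}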
    624/* Returns true if writeback might be needed or already in progress. */
    625static bool mapping_needs_writeback(struct address_space *mapping)
    626{
    627	return mapping->nrpages;
    628}
    629
    630bool filemap_range_has_writeback(struct address_space *mapping,
    631				 loff_t start_byte, loff_t end_byte)
    632{
    633	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
    634	pgoff_t max = end_byte >> PAGE_SHIFT;
    635	struct page *page;
    636
    637	if (end_byte < start_byte)
    638		return false;
    639
    640	rcu_read_lock();
    641	xas_for_each(&xas, page, max) {
    642		if (xas_retry(&xas, page))
    643			continue;
    644		if (xa_is_value(page))
    645			continue;
    646		if (PageDirty(page) || PageLocked(page) || PageWriteback(page))
    647			break;
    648	}
    649	rcu_read_unlock();
    650	return page != NULL;
    651}
    652EXPORT_SYMBOL_GPL(filemap_range_has_writeback);
    653
    654/**
    655 * filemap_write_and_wait_range - write out & wait on a file range
    656 * @mapping:	the address_space for the pages
    657 * @lstart:	offset in bytes where the range starts
    658 * @lend:	offset in bytes where the range ends (inclusive)
    659 *
    660 * Write out and wait upon file offsets lstart->lend, inclusive.
    661 *
    662 * Note that @lend is inclusive (describes the last byte to be written) so
    663 * that this function can be used to write to the very end-of-file (end = -1).
    664 *
    665 * Return: error status of the address space.
    666 */
    667int filemap_write_and_wait_range(struct address_space *mapping,
    668				 loff_t lstart, loff_t lend)
    669{
    670	int err = 0;
    671
    672	if (mapping_needs_writeback(mapping)) {
    673		err = __filemap_fdatawrite_range(mapping, lstart, lend,
    674						 WB_SYNC_ALL);
    675		/*
    676		 * Even if the above returned error, the pages may be
    677		 * written partially (e.g. -ENOSPC), so we wait for it.
    678		 * But the -EIO is special case, it may indicate the worst
    679		 * thing (e.g. bug) happened, so we avoid waiting for it.
    680		 */
    681		if (err != -EIO) {
    682			int err2 = filemap_fdatawait_range(mapping,
    683						lstart, lend);
    684			if (!err)
    685				err = err2;
    686		} else {
    687			/* Clear any previously stored errors */
    688			filemap_check_errors(mapping);
    689		}
    690	} else {
    691		err = filemap_check_errors(mapping);
    692	}
    693	return err;
    694}
    695EXPORT_SYMBOL(filemap_write_and_wait_range);
    696
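/*
 * Hedged sketch: the common "sync this byte range of an inode" pattern built
 * on filemap_write_and_wait_range(); note @lend is inclusive, hence the -1.
 * example_sync_inode_range is a hypothetical name.
 */
static __maybe_unused int example_sync_inode_range(struct inode *inode,
						   loff_t pos, loff_t count)
{
	return filemap_write_and_wait_range(inode->i_mapping, pos,
					    pos + count - 1);
}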
    697void __filemap_set_wb_err(struct address_space *mapping, int err)
    698{
    699	errseq_t eseq = errseq_set(&mapping->wb_err, err);
    700
    701	trace_filemap_set_wb_err(mapping, eseq);
    702}
    703EXPORT_SYMBOL(__filemap_set_wb_err);
    704
    705/**
    706 * file_check_and_advance_wb_err - report wb error (if any) that was previously
     707 * 				   seen and advance wb_err to the current one
    708 * @file: struct file on which the error is being reported
    709 *
    710 * When userland calls fsync (or something like nfsd does the equivalent), we
    711 * want to report any writeback errors that occurred since the last fsync (or
    712 * since the file was opened if there haven't been any).
    713 *
    714 * Grab the wb_err from the mapping. If it matches what we have in the file,
    715 * then just quickly return 0. The file is all caught up.
    716 *
    717 * If it doesn't match, then take the mapping value, set the "seen" flag in
    718 * it and try to swap it into place. If it works, or another task beat us
    719 * to it with the new value, then update the f_wb_err and return the error
    720 * portion. The error at this point must be reported via proper channels
    721 * (a'la fsync, or NFS COMMIT operation, etc.).
    722 *
    723 * While we handle mapping->wb_err with atomic operations, the f_wb_err
    724 * value is protected by the f_lock since we must ensure that it reflects
    725 * the latest value swapped in for this file descriptor.
    726 *
    727 * Return: %0 on success, negative error code otherwise.
    728 */
    729int file_check_and_advance_wb_err(struct file *file)
    730{
    731	int err = 0;
    732	errseq_t old = READ_ONCE(file->f_wb_err);
    733	struct address_space *mapping = file->f_mapping;
    734
    735	/* Locklessly handle the common case where nothing has changed */
    736	if (errseq_check(&mapping->wb_err, old)) {
    737		/* Something changed, must use slow path */
    738		spin_lock(&file->f_lock);
    739		old = file->f_wb_err;
    740		err = errseq_check_and_advance(&mapping->wb_err,
    741						&file->f_wb_err);
    742		trace_file_check_and_advance_wb_err(file, old);
    743		spin_unlock(&file->f_lock);
    744	}
    745
    746	/*
     747	 * We're mostly using this function as a drop-in replacement for
    748	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
    749	 * that the legacy code would have had on these flags.
    750	 */
    751	clear_bit(AS_EIO, &mapping->flags);
    752	clear_bit(AS_ENOSPC, &mapping->flags);
    753	return err;
    754}
    755EXPORT_SYMBOL(file_check_and_advance_wb_err);
    756
    757/**
    758 * file_write_and_wait_range - write out & wait on a file range
    759 * @file:	file pointing to address_space with pages
    760 * @lstart:	offset in bytes where the range starts
    761 * @lend:	offset in bytes where the range ends (inclusive)
    762 *
    763 * Write out and wait upon file offsets lstart->lend, inclusive.
    764 *
    765 * Note that @lend is inclusive (describes the last byte to be written) so
    766 * that this function can be used to write to the very end-of-file (end = -1).
    767 *
    768 * After writing out and waiting on the data, we check and advance the
    769 * f_wb_err cursor to the latest value, and return any errors detected there.
    770 *
    771 * Return: %0 on success, negative error code otherwise.
    772 */
    773int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
    774{
    775	int err = 0, err2;
    776	struct address_space *mapping = file->f_mapping;
    777
    778	if (mapping_needs_writeback(mapping)) {
    779		err = __filemap_fdatawrite_range(mapping, lstart, lend,
    780						 WB_SYNC_ALL);
    781		/* See comment of filemap_write_and_wait() */
    782		if (err != -EIO)
    783			__filemap_fdatawait_range(mapping, lstart, lend);
    784	}
    785	err2 = file_check_and_advance_wb_err(file);
    786	if (!err)
    787		err = err2;
    788	return err;
    789}
    790EXPORT_SYMBOL(file_write_and_wait_range);
    791
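/*
 * Hedged sketch of a minimal ->fsync-style implementation for a filesystem
 * with no metadata of its own: write out and wait on the range, then report
 * any writeback error recorded since this file last checked.  The function
 * name is hypothetical; real filesystems also sync their metadata here.
 */
static __maybe_unused int example_simple_fsync(struct file *file, loff_t start,
					       loff_t end, int datasync)
{
	return file_write_and_wait_range(file, start, end);
}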
    792/**
    793 * replace_page_cache_page - replace a pagecache page with a new one
    794 * @old:	page to be replaced
    795 * @new:	page to replace with
    796 *
    797 * This function replaces a page in the pagecache with a new one.  On
    798 * success it acquires the pagecache reference for the new page and
    799 * drops it for the old page.  Both the old and new pages must be
    800 * locked.  This function does not add the new page to the LRU, the
    801 * caller must do that.
    802 *
    803 * The remove + add is atomic.  This function cannot fail.
    804 */
    805void replace_page_cache_page(struct page *old, struct page *new)
    806{
    807	struct folio *fold = page_folio(old);
    808	struct folio *fnew = page_folio(new);
    809	struct address_space *mapping = old->mapping;
    810	void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
    811	pgoff_t offset = old->index;
    812	XA_STATE(xas, &mapping->i_pages, offset);
    813
    814	VM_BUG_ON_PAGE(!PageLocked(old), old);
    815	VM_BUG_ON_PAGE(!PageLocked(new), new);
    816	VM_BUG_ON_PAGE(new->mapping, new);
    817
    818	get_page(new);
    819	new->mapping = mapping;
    820	new->index = offset;
    821
    822	mem_cgroup_migrate(fold, fnew);
    823
    824	xas_lock_irq(&xas);
    825	xas_store(&xas, new);
    826
    827	old->mapping = NULL;
    828	/* hugetlb pages do not participate in page cache accounting. */
    829	if (!PageHuge(old))
    830		__dec_lruvec_page_state(old, NR_FILE_PAGES);
    831	if (!PageHuge(new))
    832		__inc_lruvec_page_state(new, NR_FILE_PAGES);
    833	if (PageSwapBacked(old))
    834		__dec_lruvec_page_state(old, NR_SHMEM);
    835	if (PageSwapBacked(new))
    836		__inc_lruvec_page_state(new, NR_SHMEM);
    837	xas_unlock_irq(&xas);
    838	if (free_folio)
    839		free_folio(fold);
    840	folio_put(fold);
    841}
    842EXPORT_SYMBOL_GPL(replace_page_cache_page);
    843
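/*
 * Hedged usage sketch for replace_page_cache_page(): both pages locked, the
 * replacement not yet in any mapping, and the caller still responsible for
 * putting @new on the LRU afterwards.  example_swap_cached_page is a
 * hypothetical helper.
 */
static __maybe_unused void example_swap_cached_page(struct page *old,
						    struct page *new)
{
	VM_BUG_ON_PAGE(!PageLocked(old), old);
	VM_BUG_ON_PAGE(!PageLocked(new), new);
	replace_page_cache_page(old, new);
	/* The caller must now add @new to the LRU itself. */
}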
    844noinline int __filemap_add_folio(struct address_space *mapping,
    845		struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
    846{
    847	XA_STATE(xas, &mapping->i_pages, index);
    848	int huge = folio_test_hugetlb(folio);
    849	bool charged = false;
    850	long nr = 1;
    851
    852	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
    853	VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
    854	mapping_set_update(&xas, mapping);
    855
    856	if (!huge) {
    857		int error = mem_cgroup_charge(folio, NULL, gfp);
    858		VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
    859		if (error)
    860			return error;
    861		charged = true;
    862		xas_set_order(&xas, index, folio_order(folio));
    863		nr = folio_nr_pages(folio);
    864	}
    865
    866	gfp &= GFP_RECLAIM_MASK;
    867	folio_ref_add(folio, nr);
    868	folio->mapping = mapping;
    869	folio->index = xas.xa_index;
    870
    871	do {
    872		unsigned int order = xa_get_order(xas.xa, xas.xa_index);
    873		void *entry, *old = NULL;
    874
    875		if (order > folio_order(folio))
    876			xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
    877					order, gfp);
    878		xas_lock_irq(&xas);
    879		xas_for_each_conflict(&xas, entry) {
    880			old = entry;
    881			if (!xa_is_value(entry)) {
    882				xas_set_err(&xas, -EEXIST);
    883				goto unlock;
    884			}
    885		}
    886
    887		if (old) {
    888			if (shadowp)
    889				*shadowp = old;
    890			/* entry may have been split before we acquired lock */
    891			order = xa_get_order(xas.xa, xas.xa_index);
    892			if (order > folio_order(folio)) {
    893				/* How to handle large swap entries? */
    894				BUG_ON(shmem_mapping(mapping));
    895				xas_split(&xas, old, order);
    896				xas_reset(&xas);
    897			}
    898		}
    899
    900		xas_store(&xas, folio);
    901		if (xas_error(&xas))
    902			goto unlock;
    903
    904		mapping->nrpages += nr;
    905
    906		/* hugetlb pages do not participate in page cache accounting */
    907		if (!huge) {
    908			__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
    909			if (folio_test_pmd_mappable(folio))
    910				__lruvec_stat_mod_folio(folio,
    911						NR_FILE_THPS, nr);
    912		}
    913unlock:
    914		xas_unlock_irq(&xas);
    915	} while (xas_nomem(&xas, gfp));
    916
    917	if (xas_error(&xas))
    918		goto error;
    919
    920	trace_mm_filemap_add_to_page_cache(folio);
    921	return 0;
    922error:
    923	if (charged)
    924		mem_cgroup_uncharge(folio);
    925	folio->mapping = NULL;
    926	/* Leave page->index set: truncation relies upon it */
    927	folio_put_refs(folio, nr);
    928	return xas_error(&xas);
    929}
    930ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
    931
    932/**
    933 * add_to_page_cache_locked - add a locked page to the pagecache
    934 * @page:	page to add
    935 * @mapping:	the page's address_space
    936 * @offset:	page index
    937 * @gfp_mask:	page allocation mode
    938 *
    939 * This function is used to add a page to the pagecache. It must be locked.
    940 * This function does not add the page to the LRU.  The caller must do that.
    941 *
    942 * Return: %0 on success, negative error code otherwise.
    943 */
    944int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
    945		pgoff_t offset, gfp_t gfp_mask)
    946{
    947	return __filemap_add_folio(mapping, page_folio(page), offset,
    948					  gfp_mask, NULL);
    949}
    950EXPORT_SYMBOL(add_to_page_cache_locked);
    951
    952int filemap_add_folio(struct address_space *mapping, struct folio *folio,
    953				pgoff_t index, gfp_t gfp)
    954{
    955	void *shadow = NULL;
    956	int ret;
    957
    958	__folio_set_locked(folio);
    959	ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
    960	if (unlikely(ret))
    961		__folio_clear_locked(folio);
    962	else {
    963		/*
    964		 * The folio might have been evicted from cache only
    965		 * recently, in which case it should be activated like
    966		 * any other repeatedly accessed folio.
    967		 * The exception is folios getting rewritten; evicting other
    968		 * data from the working set, only to cache data that will
    969		 * get overwritten with something else, is a waste of memory.
    970		 */
    971		WARN_ON_ONCE(folio_test_active(folio));
    972		if (!(gfp & __GFP_WRITE) && shadow)
    973			workingset_refault(folio, shadow);
    974		folio_add_lru(folio);
    975	}
    976	return ret;
    977}
    978EXPORT_SYMBOL_GPL(filemap_add_folio);
    979
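/*
 * Hedged sketch of the usual "allocate, insert, fill" pattern around
 * filemap_add_folio(): on success the folio is locked, holds its page cache
 * reference and is on the LRU; the caller fills it, marks it uptodate and
 * unlocks it.  example_add_new_folio is a hypothetical helper.
 */
static __maybe_unused struct folio *example_add_new_folio(struct address_space *mapping,
							  pgoff_t index, gfp_t gfp)
{
	struct folio *folio = filemap_alloc_folio(gfp, 0);

	if (!folio)
		return NULL;
	if (filemap_add_folio(mapping, folio, index, gfp)) {
		folio_put(folio);
		return NULL;
	}
	return folio;		/* returned locked */
}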
    980#ifdef CONFIG_NUMA
    981struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
    982{
    983	int n;
    984	struct folio *folio;
    985
    986	if (cpuset_do_page_mem_spread()) {
    987		unsigned int cpuset_mems_cookie;
    988		do {
    989			cpuset_mems_cookie = read_mems_allowed_begin();
    990			n = cpuset_mem_spread_node();
    991			folio = __folio_alloc_node(gfp, order, n);
    992		} while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));
    993
    994		return folio;
    995	}
    996	return folio_alloc(gfp, order);
    997}
    998EXPORT_SYMBOL(filemap_alloc_folio);
    999#endif
   1000
   1001/*
   1002 * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
   1003 *
   1004 * Lock exclusively invalidate_lock of any passed mapping that is not NULL.
   1005 *
   1006 * @mapping1: the first mapping to lock
   1007 * @mapping2: the second mapping to lock
   1008 */
   1009void filemap_invalidate_lock_two(struct address_space *mapping1,
   1010				 struct address_space *mapping2)
   1011{
   1012	if (mapping1 > mapping2)
   1013		swap(mapping1, mapping2);
   1014	if (mapping1)
   1015		down_write(&mapping1->invalidate_lock);
   1016	if (mapping2 && mapping1 != mapping2)
   1017		down_write_nested(&mapping2->invalidate_lock, 1);
   1018}
   1019EXPORT_SYMBOL(filemap_invalidate_lock_two);
   1020
   1021/*
   1022 * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
   1023 *
   1024 * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
   1025 *
   1026 * @mapping1: the first mapping to unlock
   1027 * @mapping2: the second mapping to unlock
   1028 */
   1029void filemap_invalidate_unlock_two(struct address_space *mapping1,
   1030				   struct address_space *mapping2)
   1031{
   1032	if (mapping1)
   1033		up_write(&mapping1->invalidate_lock);
   1034	if (mapping2 && mapping1 != mapping2)
   1035		up_write(&mapping2->invalidate_lock);
   1036}
   1037EXPORT_SYMBOL(filemap_invalidate_unlock_two);
   1038
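/*
 * Hedged sketch of a remap/dedupe-style caller that must hold the
 * invalidate_lock of two files at once; the helpers above sort by address so
 * the lock order is stable and tolerate src == dst.  The function name is
 * hypothetical.
 */
static __maybe_unused void example_lock_both_mappings(struct file *src,
						      struct file *dst)
{
	filemap_invalidate_lock_two(src->f_mapping, dst->f_mapping);
	/* ... operate on both page caches ... */
	filemap_invalidate_unlock_two(src->f_mapping, dst->f_mapping);
}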
   1039/*
   1040 * In order to wait for pages to become available there must be
   1041 * waitqueues associated with pages. By using a hash table of
   1042 * waitqueues where the bucket discipline is to maintain all
   1043 * waiters on the same queue and wake all when any of the pages
   1044 * become available, and for the woken contexts to check to be
   1045 * sure the appropriate page became available, this saves space
   1046 * at a cost of "thundering herd" phenomena during rare hash
   1047 * collisions.
   1048 */
   1049#define PAGE_WAIT_TABLE_BITS 8
   1050#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
   1051static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
   1052
   1053static wait_queue_head_t *folio_waitqueue(struct folio *folio)
   1054{
   1055	return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
   1056}
   1057
   1058void __init pagecache_init(void)
   1059{
   1060	int i;
   1061
   1062	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
   1063		init_waitqueue_head(&folio_wait_table[i]);
   1064
   1065	page_writeback_init();
   1066}
   1067
   1068/*
   1069 * The page wait code treats the "wait->flags" somewhat unusually, because
   1070 * we have multiple different kinds of waits, not just the usual "exclusive"
   1071 * one.
   1072 *
   1073 * We have:
   1074 *
   1075 *  (a) no special bits set:
   1076 *
   1077 *	We're just waiting for the bit to be released, and when a waker
   1078 *	calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
   1079 *	and remove it from the wait queue.
   1080 *
   1081 *	Simple and straightforward.
   1082 *
   1083 *  (b) WQ_FLAG_EXCLUSIVE:
   1084 *
   1085 *	The waiter is waiting to get the lock, and only one waiter should
   1086 *	be woken up to avoid any thundering herd behavior. We'll set the
   1087 *	WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
   1088 *
   1089 *	This is the traditional exclusive wait.
   1090 *
   1091 *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
   1092 *
   1093 *	The waiter is waiting to get the bit, and additionally wants the
   1094 *	lock to be transferred to it for fair lock behavior. If the lock
   1095 *	cannot be taken, we stop walking the wait queue without waking
   1096 *	the waiter.
   1097 *
   1098 *	This is the "fair lock handoff" case, and in addition to setting
   1099 *	WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
   1100 *	that it now has the lock.
   1101 */
   1102static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
   1103{
   1104	unsigned int flags;
   1105	struct wait_page_key *key = arg;
   1106	struct wait_page_queue *wait_page
   1107		= container_of(wait, struct wait_page_queue, wait);
   1108
   1109	if (!wake_page_match(wait_page, key))
   1110		return 0;
   1111
   1112	/*
   1113	 * If it's a lock handoff wait, we get the bit for it, and
   1114	 * stop walking (and do not wake it up) if we can't.
   1115	 */
   1116	flags = wait->flags;
   1117	if (flags & WQ_FLAG_EXCLUSIVE) {
   1118		if (test_bit(key->bit_nr, &key->folio->flags))
   1119			return -1;
   1120		if (flags & WQ_FLAG_CUSTOM) {
   1121			if (test_and_set_bit(key->bit_nr, &key->folio->flags))
   1122				return -1;
   1123			flags |= WQ_FLAG_DONE;
   1124		}
   1125	}
   1126
   1127	/*
   1128	 * We are holding the wait-queue lock, but the waiter that
   1129	 * is waiting for this will be checking the flags without
   1130	 * any locking.
   1131	 *
   1132	 * So update the flags atomically, and wake up the waiter
   1133	 * afterwards to avoid any races. This store-release pairs
   1134	 * with the load-acquire in folio_wait_bit_common().
   1135	 */
   1136	smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
   1137	wake_up_state(wait->private, mode);
   1138
   1139	/*
   1140	 * Ok, we have successfully done what we're waiting for,
   1141	 * and we can unconditionally remove the wait entry.
   1142	 *
   1143	 * Note that this pairs with the "finish_wait()" in the
   1144	 * waiter, and has to be the absolute last thing we do.
   1145	 * After this list_del_init(&wait->entry) the wait entry
   1146	 * might be de-allocated and the process might even have
   1147	 * exited.
   1148	 */
   1149	list_del_init_careful(&wait->entry);
   1150	return (flags & WQ_FLAG_EXCLUSIVE) != 0;
   1151}
   1152
   1153static void folio_wake_bit(struct folio *folio, int bit_nr)
   1154{
   1155	wait_queue_head_t *q = folio_waitqueue(folio);
   1156	struct wait_page_key key;
   1157	unsigned long flags;
   1158	wait_queue_entry_t bookmark;
   1159
   1160	key.folio = folio;
   1161	key.bit_nr = bit_nr;
   1162	key.page_match = 0;
   1163
   1164	bookmark.flags = 0;
   1165	bookmark.private = NULL;
   1166	bookmark.func = NULL;
   1167	INIT_LIST_HEAD(&bookmark.entry);
   1168
   1169	spin_lock_irqsave(&q->lock, flags);
   1170	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
   1171
   1172	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
   1173		/*
   1174		 * Take a breather from holding the lock,
    1175		 * allow waiters that finish waking up asynchronously
    1176		 * to acquire the lock and remove themselves
    1177		 * from the wait queue
   1178		 */
   1179		spin_unlock_irqrestore(&q->lock, flags);
   1180		cpu_relax();
   1181		spin_lock_irqsave(&q->lock, flags);
   1182		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
   1183	}
   1184
   1185	/*
   1186	 * It's possible to miss clearing waiters here, when we woke our page
   1187	 * waiters, but the hashed waitqueue has waiters for other pages on it.
   1188	 * That's okay, it's a rare case. The next waker will clear it.
   1189	 *
   1190	 * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
   1191	 * other), the flag may be cleared in the course of freeing the page;
   1192	 * but that is not required for correctness.
   1193	 */
   1194	if (!waitqueue_active(q) || !key.page_match)
   1195		folio_clear_waiters(folio);
   1196
   1197	spin_unlock_irqrestore(&q->lock, flags);
   1198}
   1199
   1200static void folio_wake(struct folio *folio, int bit)
   1201{
   1202	if (!folio_test_waiters(folio))
   1203		return;
   1204	folio_wake_bit(folio, bit);
   1205}
   1206
   1207/*
   1208 * A choice of three behaviors for folio_wait_bit_common():
   1209 */
   1210enum behavior {
   1211	EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like
   1212			 * __folio_lock() waiting on then setting PG_locked.
   1213			 */
   1214	SHARED,		/* Hold ref to page and check the bit when woken, like
   1215			 * folio_wait_writeback() waiting on PG_writeback.
   1216			 */
   1217	DROP,		/* Drop ref to page before wait, no check when woken,
   1218			 * like folio_put_wait_locked() on PG_locked.
   1219			 */
   1220};
   1221
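/*
 * Hedged sketch of the SHARED behaviour: hold a reference, sleep until the
 * bit clears, and re-check after waking (folio_wait_writeback() wraps this
 * in a loop).  example_wait_for_writeback_bit is a hypothetical helper.
 */
static __maybe_unused void example_wait_for_writeback_bit(struct folio *folio)
{
	if (folio_test_writeback(folio))
		folio_wait_bit(folio, PG_writeback);
}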
   1222/*
   1223 * Attempt to check (or get) the folio flag, and mark us done
   1224 * if successful.
   1225 */
   1226static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
   1227					struct wait_queue_entry *wait)
   1228{
   1229	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
   1230		if (test_and_set_bit(bit_nr, &folio->flags))
   1231			return false;
   1232	} else if (test_bit(bit_nr, &folio->flags))
   1233		return false;
   1234
   1235	wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
   1236	return true;
   1237}
   1238
   1239/* How many times do we accept lock stealing from under a waiter? */
   1240int sysctl_page_lock_unfairness = 5;
   1241
   1242static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
   1243		int state, enum behavior behavior)
   1244{
   1245	wait_queue_head_t *q = folio_waitqueue(folio);
   1246	int unfairness = sysctl_page_lock_unfairness;
   1247	struct wait_page_queue wait_page;
   1248	wait_queue_entry_t *wait = &wait_page.wait;
   1249	bool thrashing = false;
   1250	bool delayacct = false;
   1251	unsigned long pflags;
   1252
   1253	if (bit_nr == PG_locked &&
   1254	    !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
   1255		if (!folio_test_swapbacked(folio)) {
   1256			delayacct_thrashing_start();
   1257			delayacct = true;
   1258		}
   1259		psi_memstall_enter(&pflags);
   1260		thrashing = true;
   1261	}
   1262
   1263	init_wait(wait);
   1264	wait->func = wake_page_function;
   1265	wait_page.folio = folio;
   1266	wait_page.bit_nr = bit_nr;
   1267
   1268repeat:
   1269	wait->flags = 0;
   1270	if (behavior == EXCLUSIVE) {
   1271		wait->flags = WQ_FLAG_EXCLUSIVE;
   1272		if (--unfairness < 0)
   1273			wait->flags |= WQ_FLAG_CUSTOM;
   1274	}
   1275
   1276	/*
   1277	 * Do one last check whether we can get the
   1278	 * page bit synchronously.
   1279	 *
   1280	 * Do the folio_set_waiters() marking before that
   1281	 * to let any waker we _just_ missed know they
   1282	 * need to wake us up (otherwise they'll never
   1283	 * even go to the slow case that looks at the
   1284	 * page queue), and add ourselves to the wait
   1285	 * queue if we need to sleep.
   1286	 *
   1287	 * This part needs to be done under the queue
   1288	 * lock to avoid races.
   1289	 */
   1290	spin_lock_irq(&q->lock);
   1291	folio_set_waiters(folio);
   1292	if (!folio_trylock_flag(folio, bit_nr, wait))
   1293		__add_wait_queue_entry_tail(q, wait);
   1294	spin_unlock_irq(&q->lock);
   1295
   1296	/*
   1297	 * From now on, all the logic will be based on
   1298	 * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
   1299	 * see whether the page bit testing has already
   1300	 * been done by the wake function.
   1301	 *
   1302	 * We can drop our reference to the folio.
   1303	 */
   1304	if (behavior == DROP)
   1305		folio_put(folio);
   1306
   1307	/*
   1308	 * Note that until the "finish_wait()", or until
   1309	 * we see the WQ_FLAG_WOKEN flag, we need to
   1310	 * be very careful with the 'wait->flags', because
   1311	 * we may race with a waker that sets them.
   1312	 */
   1313	for (;;) {
   1314		unsigned int flags;
   1315
   1316		set_current_state(state);
   1317
   1318		/* Loop until we've been woken or interrupted */
   1319		flags = smp_load_acquire(&wait->flags);
   1320		if (!(flags & WQ_FLAG_WOKEN)) {
   1321			if (signal_pending_state(state, current))
   1322				break;
   1323
   1324			io_schedule();
   1325			continue;
   1326		}
   1327
   1328		/* If we were non-exclusive, we're done */
   1329		if (behavior != EXCLUSIVE)
   1330			break;
   1331
   1332		/* If the waker got the lock for us, we're done */
   1333		if (flags & WQ_FLAG_DONE)
   1334			break;
   1335
   1336		/*
   1337		 * Otherwise, if we're getting the lock, we need to
   1338		 * try to get it ourselves.
   1339		 *
   1340		 * And if that fails, we'll have to retry this all.
   1341		 */
   1342		if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
   1343			goto repeat;
   1344
   1345		wait->flags |= WQ_FLAG_DONE;
   1346		break;
   1347	}
   1348
   1349	/*
   1350	 * If a signal happened, this 'finish_wait()' may remove the last
   1351	 * waiter from the wait-queues, but the folio waiters bit will remain
   1352	 * set. That's ok. The next wakeup will take care of it, and trying
   1353	 * to do it here would be difficult and prone to races.
   1354	 */
   1355	finish_wait(q, wait);
   1356
   1357	if (thrashing) {
   1358		if (delayacct)
   1359			delayacct_thrashing_end();
   1360		psi_memstall_leave(&pflags);
   1361	}
   1362
   1363	/*
   1364	 * NOTE! The wait->flags weren't stable until we've done the
   1365	 * 'finish_wait()', and we could have exited the loop above due
   1366	 * to a signal, and had a wakeup event happen after the signal
   1367	 * test but before the 'finish_wait()'.
   1368	 *
   1369	 * So only after the finish_wait() can we reliably determine
   1370	 * if we got woken up or not, so we can now figure out the final
   1371	 * return value based on that state without races.
   1372	 *
   1373	 * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
   1374	 * waiter, but an exclusive one requires WQ_FLAG_DONE.
   1375	 */
   1376	if (behavior == EXCLUSIVE)
   1377		return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
   1378
   1379	return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
   1380}
   1381
   1382#ifdef CONFIG_MIGRATION
   1383/**
   1384 * migration_entry_wait_on_locked - Wait for a migration entry to be removed
   1385 * @entry: migration swap entry.
   1386 * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required
   1387 *        for pte entries, pass NULL for pmd entries.
   1388 * @ptl: already locked ptl. This function will drop the lock.
   1389 *
   1390 * Wait for a migration entry referencing the given page to be removed. This is
   1391 * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
   1392 * this can be called without taking a reference on the page. Instead this
   1393 * should be called while holding the ptl for the migration entry referencing
   1394 * the page.
   1395 *
   1396 * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock().
   1397 *
   1398 * This follows the same logic as folio_wait_bit_common() so see the comments
   1399 * there.
   1400 */
   1401void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
   1402				spinlock_t *ptl)
   1403{
   1404	struct wait_page_queue wait_page;
   1405	wait_queue_entry_t *wait = &wait_page.wait;
   1406	bool thrashing = false;
   1407	bool delayacct = false;
   1408	unsigned long pflags;
   1409	wait_queue_head_t *q;
   1410	struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
   1411
   1412	q = folio_waitqueue(folio);
   1413	if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
   1414		if (!folio_test_swapbacked(folio)) {
   1415			delayacct_thrashing_start();
   1416			delayacct = true;
   1417		}
   1418		psi_memstall_enter(&pflags);
   1419		thrashing = true;
   1420	}
   1421
   1422	init_wait(wait);
   1423	wait->func = wake_page_function;
   1424	wait_page.folio = folio;
   1425	wait_page.bit_nr = PG_locked;
   1426	wait->flags = 0;
   1427
   1428	spin_lock_irq(&q->lock);
   1429	folio_set_waiters(folio);
   1430	if (!folio_trylock_flag(folio, PG_locked, wait))
   1431		__add_wait_queue_entry_tail(q, wait);
   1432	spin_unlock_irq(&q->lock);
   1433
   1434	/*
   1435	 * If a migration entry exists for the page the migration path must hold
   1436	 * a valid reference to the page, and it must take the ptl to remove the
   1437	 * migration entry. So the page is valid until the ptl is dropped.
   1438	 */
   1439	if (ptep)
   1440		pte_unmap_unlock(ptep, ptl);
   1441	else
   1442		spin_unlock(ptl);
   1443
   1444	for (;;) {
   1445		unsigned int flags;
   1446
   1447		set_current_state(TASK_UNINTERRUPTIBLE);
   1448
   1449		/* Loop until we've been woken or interrupted */
   1450		flags = smp_load_acquire(&wait->flags);
   1451		if (!(flags & WQ_FLAG_WOKEN)) {
   1452			if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
   1453				break;
   1454
   1455			io_schedule();
   1456			continue;
   1457		}
   1458		break;
   1459	}
   1460
   1461	finish_wait(q, wait);
   1462
   1463	if (thrashing) {
   1464		if (delayacct)
   1465			delayacct_thrashing_end();
   1466		psi_memstall_leave(&pflags);
   1467	}
   1468}
   1469#endif
   1470
   1471void folio_wait_bit(struct folio *folio, int bit_nr)
   1472{
   1473	folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
   1474}
   1475EXPORT_SYMBOL(folio_wait_bit);
   1476
   1477int folio_wait_bit_killable(struct folio *folio, int bit_nr)
   1478{
   1479	return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
   1480}
   1481EXPORT_SYMBOL(folio_wait_bit_killable);
   1482
   1483/**
   1484 * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
   1485 * @folio: The folio to wait for.
   1486 * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
   1487 *
   1488 * The caller should hold a reference on @folio.  They expect the page to
   1489 * become unlocked relatively soon, but do not wish to hold up migration
   1490 * (for example) by holding the reference while waiting for the folio to
   1491 * come unlocked.  After this function returns, the caller should not
   1492 * dereference @folio.
   1493 *
   1494 * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
   1495 */
   1496int folio_put_wait_locked(struct folio *folio, int state)
   1497{
   1498	return folio_wait_bit_common(folio, PG_locked, state, DROP);
   1499}
   1500
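/*
 * Hedged sketch for folio_put_wait_locked(): the caller's reference is
 * handed over and dropped before sleeping, so the wait does not pin the
 * folio against migration or reclaim; the folio must not be touched again
 * afterwards.  The function name is hypothetical.
 */
static __maybe_unused int example_wait_unlocked_without_pin(struct folio *folio)
{
	return folio_put_wait_locked(folio, TASK_KILLABLE);
}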
   1501/**
   1502 * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
   1503 * @folio: Folio defining the wait queue of interest
   1504 * @waiter: Waiter to add to the queue
   1505 *
   1506 * Add an arbitrary @waiter to the wait queue for the nominated @folio.
   1507 */
   1508void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
   1509{
   1510	wait_queue_head_t *q = folio_waitqueue(folio);
   1511	unsigned long flags;
   1512
   1513	spin_lock_irqsave(&q->lock, flags);
   1514	__add_wait_queue_entry_tail(q, waiter);
   1515	folio_set_waiters(folio);
   1516	spin_unlock_irqrestore(&q->lock, flags);
   1517}
   1518EXPORT_SYMBOL_GPL(folio_add_wait_queue);
   1519
   1520#ifndef clear_bit_unlock_is_negative_byte
   1521
   1522/*
   1523 * PG_waiters is the high bit in the same byte as PG_lock.
   1524 *
   1525 * On x86 (and on many other architectures), we can clear PG_lock and
   1526 * test the sign bit at the same time. But if the architecture does
   1527 * not support that special operation, we just do this all by hand
   1528 * instead.
   1529 *
   1530 * The read of PG_waiters has to be after (or concurrently with) PG_locked
   1531 * being cleared, but a memory barrier should be unnecessary since it is
   1532 * in the same byte as PG_locked.
   1533 */
   1534static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
   1535{
   1536	clear_bit_unlock(nr, mem);
   1537	/* smp_mb__after_atomic(); */
   1538	return test_bit(PG_waiters, mem);
   1539}
   1540
   1541#endif
   1542
   1543/**
   1544 * folio_unlock - Unlock a locked folio.
   1545 * @folio: The folio.
   1546 *
   1547 * Unlocks the folio and wakes up any thread sleeping on the page lock.
   1548 *
   1549 * Context: May be called from interrupt or process context.  May not be
   1550 * called from NMI context.
   1551 */
   1552void folio_unlock(struct folio *folio)
   1553{
   1554	/* Bit 7 allows x86 to check the byte's sign bit */
   1555	BUILD_BUG_ON(PG_waiters != 7);
   1556	BUILD_BUG_ON(PG_locked > 7);
   1557	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
   1558	if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0)))
   1559		folio_wake_bit(folio, PG_locked);
   1560}
   1561EXPORT_SYMBOL(folio_unlock);
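
/*
 * Illustrative lock/recheck/unlock sketch (not part of filemap.c): lock a
 * folio, verify it was not truncated while we slept, then unlock it.  This
 * mirrors the FGP_LOCK handling in __filemap_get_folio() further down; the
 * helper name and the caller-supplied @mapping are hypothetical.
 */
static bool example_lock_and_check(struct folio *folio,
				   struct address_space *mapping)
{
	folio_lock(folio);
	if (unlikely(folio->mapping != mapping)) {
		/* Truncated while we waited for the lock. */
		folio_unlock(folio);
		return false;
	}
	folio_unlock(folio);
	return true;
}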
   1562
   1563/**
   1564 * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
   1565 * @folio: The folio.
   1566 *
   1567 * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
   1568 * it.  The folio reference held for PG_private_2 being set is released.
   1569 *
   1570 * This is, for example, used when a netfs folio is being written to a local
   1571 * disk cache, thereby allowing writes to the cache for the same folio to be
   1572 * serialised.
   1573 */
   1574void folio_end_private_2(struct folio *folio)
   1575{
   1576	VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
   1577	clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
   1578	folio_wake_bit(folio, PG_private_2);
   1579	folio_put(folio);
   1580}
   1581EXPORT_SYMBOL(folio_end_private_2);
   1582
   1583/**
   1584 * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
   1585 * @folio: The folio to wait on.
   1586 *
   1587 * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio.
   1588 */
   1589void folio_wait_private_2(struct folio *folio)
   1590{
   1591	while (folio_test_private_2(folio))
   1592		folio_wait_bit(folio, PG_private_2);
   1593}
   1594EXPORT_SYMBOL(folio_wait_private_2);
   1595
   1596/**
   1597 * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
   1598 * @folio: The folio to wait on.
   1599 *
   1600 * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a
   1601 * fatal signal is received by the calling task.
   1602 *
   1603 * Return:
   1604 * - 0 if successful.
   1605 * - -EINTR if a fatal signal was encountered.
   1606 */
   1607int folio_wait_private_2_killable(struct folio *folio)
   1608{
   1609	int ret = 0;
   1610
   1611	while (folio_test_private_2(folio)) {
   1612		ret = folio_wait_bit_killable(folio, PG_private_2);
   1613		if (ret < 0)
   1614			break;
   1615	}
   1616
   1617	return ret;
   1618}
   1619EXPORT_SYMBOL(folio_wait_private_2_killable);
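
/*
 * Sketch of a netfs-style caller (illustrative, not part of filemap.c):
 * before reusing a folio that may still be being written to a local cache,
 * wait for PG_private_2 to clear, giving up if a fatal signal arrives.  The
 * helper name is hypothetical.
 */
static int example_wait_for_fscache(struct folio *folio)
{
	int err = folio_wait_private_2_killable(folio);

	if (err)	/* -EINTR: fatal signal received */
		return err;
	/* PG_private_2 is now clear; the write to the cache has finished. */
	return 0;
}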
   1620
   1621/**
   1622 * folio_end_writeback - End writeback against a folio.
   1623 * @folio: The folio.
   1624 */
   1625void folio_end_writeback(struct folio *folio)
   1626{
   1627	/*
   1628	 * folio_test_clear_reclaim() could be used here but it is an
   1629	 * atomic operation and overkill in this particular case. Failing
   1630	 * to shuffle a folio marked for immediate reclaim is too mild
   1631	 * a gain to justify taking an atomic operation penalty at the
   1632	 * end of every folio writeback.
   1633	 */
   1634	if (folio_test_reclaim(folio)) {
   1635		folio_clear_reclaim(folio);
   1636		folio_rotate_reclaimable(folio);
   1637	}
   1638
   1639	/*
   1640	 * Writeback does not hold a folio reference of its own, relying
   1641	 * on truncation to wait for the clearing of PG_writeback.
   1642	 * But here we must make sure that the folio is not freed and
   1643	 * reused before the folio_wake().
   1644	 */
   1645	folio_get(folio);
   1646	if (!__folio_end_writeback(folio))
   1647		BUG();
   1648
   1649	smp_mb__after_atomic();
   1650	folio_wake(folio, PG_writeback);
   1651	acct_reclaim_writeback(folio);
   1652	folio_put(folio);
   1653}
   1654EXPORT_SYMBOL(folio_end_writeback);
   1655
   1656/*
   1657 * After completing I/O on a page, call this routine to update the page
   1658 * flags appropriately
   1659 */
   1660void page_endio(struct page *page, bool is_write, int err)
   1661{
   1662	if (!is_write) {
   1663		if (!err) {
   1664			SetPageUptodate(page);
   1665		} else {
   1666			ClearPageUptodate(page);
   1667			SetPageError(page);
   1668		}
   1669		unlock_page(page);
   1670	} else {
   1671		if (err) {
   1672			struct address_space *mapping;
   1673
   1674			SetPageError(page);
   1675			mapping = page_mapping(page);
   1676			if (mapping)
   1677				mapping_set_error(mapping, err);
   1678		}
   1679		end_page_writeback(page);
   1680	}
   1681}
   1682EXPORT_SYMBOL_GPL(page_endio);
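
/*
 * Sketch of a block-I/O completion handler built on page_endio()
 * (illustrative, not part of filemap.c).  It assumes <linux/bio.h> and a
 * submitter that put one whole page in each bio_vec; the helper name and
 * that layout are assumptions, not requirements of page_endio().
 */
static void example_end_io(struct bio *bio)
{
	int err = blk_status_to_errno(bio->bi_status);
	struct bio_vec *bv;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bv, bio, iter_all)
		page_endio(bv->bv_page, op_is_write(bio_op(bio)), err);
	bio_put(bio);
}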
   1683
   1684/**
   1685 * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
   1686 * @folio: The folio to lock
   1687 */
   1688void __folio_lock(struct folio *folio)
   1689{
   1690	folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
   1691				EXCLUSIVE);
   1692}
   1693EXPORT_SYMBOL(__folio_lock);
   1694
   1695int __folio_lock_killable(struct folio *folio)
   1696{
   1697	return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
   1698					EXCLUSIVE);
   1699}
   1700EXPORT_SYMBOL_GPL(__folio_lock_killable);
   1701
   1702static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
   1703{
   1704	struct wait_queue_head *q = folio_waitqueue(folio);
   1705	int ret = 0;
   1706
   1707	wait->folio = folio;
   1708	wait->bit_nr = PG_locked;
   1709
   1710	spin_lock_irq(&q->lock);
   1711	__add_wait_queue_entry_tail(q, &wait->wait);
   1712	folio_set_waiters(folio);
   1713	ret = !folio_trylock(folio);
   1714	/*
   1715	 * If we were successful now, we know we're still on the
   1716	 * waitqueue as we're still under the lock. This means it's
    1717		 * safe to remove and return success; we know the callback
   1718	 * isn't going to trigger.
   1719	 */
   1720	if (!ret)
   1721		__remove_wait_queue(q, &wait->wait);
   1722	else
   1723		ret = -EIOCBQUEUED;
   1724	spin_unlock_irq(&q->lock);
   1725	return ret;
   1726}
   1727
   1728/*
   1729 * Return values:
   1730 * true - folio is locked; mmap_lock is still held.
   1731 * false - folio is not locked.
    1732	 *     mmap_lock has been released (mmap_read_unlock()), unless flags had both
    1733	 *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
    1734	 *     which case mmap_lock is still held.
    1735	 *
    1736	 * If neither ALLOW_RETRY nor KILLABLE is set, this will always return true
   1737 * with the folio locked and the mmap_lock unperturbed.
   1738 */
   1739bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
   1740			 unsigned int flags)
   1741{
   1742	if (fault_flag_allow_retry_first(flags)) {
   1743		/*
   1744		 * CAUTION! In this case, mmap_lock is not released
    1745		 * even though we return false.
   1746		 */
   1747		if (flags & FAULT_FLAG_RETRY_NOWAIT)
   1748			return false;
   1749
   1750		mmap_read_unlock(mm);
   1751		if (flags & FAULT_FLAG_KILLABLE)
   1752			folio_wait_locked_killable(folio);
   1753		else
   1754			folio_wait_locked(folio);
   1755		return false;
   1756	}
   1757	if (flags & FAULT_FLAG_KILLABLE) {
   1758		bool ret;
   1759
   1760		ret = __folio_lock_killable(folio);
   1761		if (ret) {
   1762			mmap_read_unlock(mm);
   1763			return false;
   1764		}
   1765	} else {
   1766		__folio_lock(folio);
   1767	}
   1768
   1769	return true;
   1770}
   1771
   1772/**
   1773 * page_cache_next_miss() - Find the next gap in the page cache.
   1774 * @mapping: Mapping.
   1775 * @index: Index.
   1776 * @max_scan: Maximum range to search.
   1777 *
   1778 * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
   1779 * gap with the lowest index.
   1780 *
   1781 * This function may be called under the rcu_read_lock.  However, this will
   1782 * not atomically search a snapshot of the cache at a single point in time.
   1783 * For example, if a gap is created at index 5, then subsequently a gap is
   1784 * created at index 10, page_cache_next_miss covering both indices may
   1785 * return 10 if called under the rcu_read_lock.
   1786 *
   1787 * Return: The index of the gap if found, otherwise an index outside the
   1788 * range specified (in which case 'return - index >= max_scan' will be true).
   1789 * In the rare case of index wrap-around, 0 will be returned.
   1790 */
   1791pgoff_t page_cache_next_miss(struct address_space *mapping,
   1792			     pgoff_t index, unsigned long max_scan)
   1793{
   1794	XA_STATE(xas, &mapping->i_pages, index);
   1795
   1796	while (max_scan--) {
   1797		void *entry = xas_next(&xas);
   1798		if (!entry || xa_is_value(entry))
   1799			break;
   1800		if (xas.xa_index == 0)
   1801			break;
   1802	}
   1803
   1804	return xas.xa_index;
   1805}
   1806EXPORT_SYMBOL(page_cache_next_miss);
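
/*
 * Illustrative sketch (not part of filemap.c): use the "next miss" to measure
 * how many consecutive pages starting at @index are already cached, the kind
 * of question readahead sizing asks.  The helper name is hypothetical and the
 * rare wrap-around case is ignored for brevity.
 */
static unsigned long example_cached_run_length(struct address_space *mapping,
					       pgoff_t index,
					       unsigned long max_scan)
{
	pgoff_t gap = page_cache_next_miss(mapping, index, max_scan);

	/* No gap within max_scan means the run is at least max_scan long. */
	return min_t(unsigned long, gap - index, max_scan);
}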
   1807
   1808/**
   1809 * page_cache_prev_miss() - Find the previous gap in the page cache.
   1810 * @mapping: Mapping.
   1811 * @index: Index.
   1812 * @max_scan: Maximum range to search.
   1813 *
   1814 * Search the range [max(index - max_scan + 1, 0), index] for the
   1815 * gap with the highest index.
   1816 *
   1817 * This function may be called under the rcu_read_lock.  However, this will
   1818 * not atomically search a snapshot of the cache at a single point in time.
   1819 * For example, if a gap is created at index 10, then subsequently a gap is
   1820 * created at index 5, page_cache_prev_miss() covering both indices may
   1821 * return 5 if called under the rcu_read_lock.
   1822 *
   1823 * Return: The index of the gap if found, otherwise an index outside the
   1824 * range specified (in which case 'index - return >= max_scan' will be true).
   1825 * In the rare case of wrap-around, ULONG_MAX will be returned.
   1826 */
   1827pgoff_t page_cache_prev_miss(struct address_space *mapping,
   1828			     pgoff_t index, unsigned long max_scan)
   1829{
   1830	XA_STATE(xas, &mapping->i_pages, index);
   1831
   1832	while (max_scan--) {
   1833		void *entry = xas_prev(&xas);
   1834		if (!entry || xa_is_value(entry))
   1835			break;
   1836		if (xas.xa_index == ULONG_MAX)
   1837			break;
   1838	}
   1839
   1840	return xas.xa_index;
   1841}
   1842EXPORT_SYMBOL(page_cache_prev_miss);
   1843
   1844/*
   1845 * Lockless page cache protocol:
   1846 * On the lookup side:
   1847 * 1. Load the folio from i_pages
   1848 * 2. Increment the refcount if it's not zero
   1849 * 3. If the folio is not found by xas_reload(), put the refcount and retry
   1850 *
   1851 * On the removal side:
   1852 * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
   1853 * B. Remove the page from i_pages
   1854 * C. Return the page to the page allocator
   1855 *
   1856 * This means that any page may have its reference count temporarily
   1857 * increased by a speculative page cache (or fast GUP) lookup as it can
   1858 * be allocated by another user before the RCU grace period expires.
   1859 * Because the refcount temporarily acquired here may end up being the
   1860 * last refcount on the page, any page allocation must be freeable by
   1861 * folio_put().
   1862 */
   1863
   1864/*
   1865 * mapping_get_entry - Get a page cache entry.
   1866 * @mapping: the address_space to search
   1867 * @index: The page cache index.
   1868 *
   1869 * Looks up the page cache entry at @mapping & @index.  If it is a folio,
   1870 * it is returned with an increased refcount.  If it is a shadow entry
   1871 * of a previously evicted folio, or a swap entry from shmem/tmpfs,
   1872 * it is returned without further action.
   1873 *
   1874 * Return: The folio, swap or shadow entry, %NULL if nothing is found.
   1875 */
   1876static void *mapping_get_entry(struct address_space *mapping, pgoff_t index)
   1877{
   1878	XA_STATE(xas, &mapping->i_pages, index);
   1879	struct folio *folio;
   1880
   1881	rcu_read_lock();
   1882repeat:
   1883	xas_reset(&xas);
   1884	folio = xas_load(&xas);
   1885	if (xas_retry(&xas, folio))
   1886		goto repeat;
   1887	/*
   1888	 * A shadow entry of a recently evicted page, or a swap entry from
   1889	 * shmem/tmpfs.  Return it without attempting to raise page count.
   1890	 */
   1891	if (!folio || xa_is_value(folio))
   1892		goto out;
   1893
   1894	if (!folio_try_get_rcu(folio))
   1895		goto repeat;
   1896
   1897	if (unlikely(folio != xas_reload(&xas))) {
   1898		folio_put(folio);
   1899		goto repeat;
   1900	}
   1901out:
   1902	rcu_read_unlock();
   1903
   1904	return folio;
   1905}
   1906
   1907/**
   1908 * __filemap_get_folio - Find and get a reference to a folio.
   1909 * @mapping: The address_space to search.
   1910 * @index: The page index.
   1911 * @fgp_flags: %FGP flags modify how the folio is returned.
   1912 * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
   1913 *
   1914 * Looks up the page cache entry at @mapping & @index.
   1915 *
   1916 * @fgp_flags can be zero or more of these flags:
   1917 *
   1918 * * %FGP_ACCESSED - The folio will be marked accessed.
   1919 * * %FGP_LOCK - The folio is returned locked.
   1920 * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
   1921 *   instead of allocating a new folio to replace it.
   1922 * * %FGP_CREAT - If no page is present then a new page is allocated using
   1923 *   @gfp and added to the page cache and the VM's LRU list.
   1924 *   The page is returned locked and with an increased refcount.
   1925 * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
   1926 *   page is already in cache.  If the page was allocated, unlock it before
   1927 *   returning so the caller can do the same dance.
   1928 * * %FGP_WRITE - The page will be written to by the caller.
   1929 * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
   1930 * * %FGP_NOWAIT - Don't get blocked by page lock.
   1931 * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
   1932 *
   1933 * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
   1934 * if the %GFP flags specified for %FGP_CREAT are atomic.
   1935 *
   1936 * If there is a page cache page, it is returned with an increased refcount.
   1937 *
   1938 * Return: The found folio or %NULL otherwise.
   1939 */
   1940struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
   1941		int fgp_flags, gfp_t gfp)
   1942{
   1943	struct folio *folio;
   1944
   1945repeat:
   1946	folio = mapping_get_entry(mapping, index);
   1947	if (xa_is_value(folio)) {
   1948		if (fgp_flags & FGP_ENTRY)
   1949			return folio;
   1950		folio = NULL;
   1951	}
   1952	if (!folio)
   1953		goto no_page;
   1954
   1955	if (fgp_flags & FGP_LOCK) {
   1956		if (fgp_flags & FGP_NOWAIT) {
   1957			if (!folio_trylock(folio)) {
   1958				folio_put(folio);
   1959				return NULL;
   1960			}
   1961		} else {
   1962			folio_lock(folio);
   1963		}
   1964
   1965		/* Has the page been truncated? */
   1966		if (unlikely(folio->mapping != mapping)) {
   1967			folio_unlock(folio);
   1968			folio_put(folio);
   1969			goto repeat;
   1970		}
   1971		VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
   1972	}
   1973
   1974	if (fgp_flags & FGP_ACCESSED)
   1975		folio_mark_accessed(folio);
   1976	else if (fgp_flags & FGP_WRITE) {
   1977		/* Clear idle flag for buffer write */
   1978		if (folio_test_idle(folio))
   1979			folio_clear_idle(folio);
   1980	}
   1981
   1982	if (fgp_flags & FGP_STABLE)
   1983		folio_wait_stable(folio);
   1984no_page:
   1985	if (!folio && (fgp_flags & FGP_CREAT)) {
   1986		int err;
   1987		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
   1988			gfp |= __GFP_WRITE;
   1989		if (fgp_flags & FGP_NOFS)
   1990			gfp &= ~__GFP_FS;
   1991
   1992		folio = filemap_alloc_folio(gfp, 0);
   1993		if (!folio)
   1994			return NULL;
   1995
   1996		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
   1997			fgp_flags |= FGP_LOCK;
   1998
    1999		/* Init accessed so we avoid the atomic mark_page_accessed() later */
   2000		if (fgp_flags & FGP_ACCESSED)
   2001			__folio_set_referenced(folio);
   2002
   2003		err = filemap_add_folio(mapping, folio, index, gfp);
   2004		if (unlikely(err)) {
   2005			folio_put(folio);
   2006			folio = NULL;
   2007			if (err == -EEXIST)
   2008				goto repeat;
   2009		}
   2010
   2011		/*
   2012		 * filemap_add_folio locks the page, and for mmap
   2013		 * we expect an unlocked page.
   2014		 */
   2015		if (folio && (fgp_flags & FGP_FOR_MMAP))
   2016			folio_unlock(folio);
   2017	}
   2018
   2019	return folio;
   2020}
   2021EXPORT_SYMBOL(__filemap_get_folio);
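
/*
 * Typical find-or-create sketch (illustrative, not part of filemap.c): look
 * up the folio at @index, allocating, inserting and locking a new one if it
 * is absent.  On success the folio is returned locked with a reference held;
 * the helper name is hypothetical.
 */
static struct folio *example_get_locked_folio(struct address_space *mapping,
					      pgoff_t index)
{
	struct folio *folio;

	folio = __filemap_get_folio(mapping, index,
				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
				    mapping_gfp_mask(mapping));
	if (!folio)
		return NULL;	/* allocation or insertion failed */
	/* Caller must folio_unlock() and folio_put() when done. */
	return folio;
}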
   2022
   2023static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
   2024		xa_mark_t mark)
   2025{
   2026	struct folio *folio;
   2027
   2028retry:
   2029	if (mark == XA_PRESENT)
   2030		folio = xas_find(xas, max);
   2031	else
   2032		folio = xas_find_marked(xas, max, mark);
   2033
   2034	if (xas_retry(xas, folio))
   2035		goto retry;
   2036	/*
   2037	 * A shadow entry of a recently evicted page, a swap
   2038	 * entry from shmem/tmpfs or a DAX entry.  Return it
   2039	 * without attempting to raise page count.
   2040	 */
   2041	if (!folio || xa_is_value(folio))
   2042		return folio;
   2043
   2044	if (!folio_try_get_rcu(folio))
   2045		goto reset;
   2046
   2047	if (unlikely(folio != xas_reload(xas))) {
   2048		folio_put(folio);
   2049		goto reset;
   2050	}
   2051
   2052	return folio;
   2053reset:
   2054	xas_reset(xas);
   2055	goto retry;
   2056}
   2057
   2058/**
   2059 * find_get_entries - gang pagecache lookup
   2060 * @mapping:	The address_space to search
   2061 * @start:	The starting page cache index
   2062 * @end:	The final page index (inclusive).
   2063 * @fbatch:	Where the resulting entries are placed.
   2064 * @indices:	The cache indices corresponding to the entries in @entries
   2065 *
   2066 * find_get_entries() will search for and return a batch of entries in
   2067 * the mapping.  The entries are placed in @fbatch.  find_get_entries()
   2068 * takes a reference on any actual folios it returns.
   2069 *
   2070 * The entries have ascending indexes.  The indices may not be consecutive
   2071 * due to not-present entries or large folios.
   2072 *
   2073 * Any shadow entries of evicted folios, or swap entries from
   2074 * shmem/tmpfs, are included in the returned array.
   2075 *
   2076 * Return: The number of entries which were found.
   2077 */
   2078unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
   2079		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
   2080{
   2081	XA_STATE(xas, &mapping->i_pages, start);
   2082	struct folio *folio;
   2083
   2084	rcu_read_lock();
   2085	while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
   2086		indices[fbatch->nr] = xas.xa_index;
   2087		if (!folio_batch_add(fbatch, folio))
   2088			break;
   2089	}
   2090	rcu_read_unlock();
   2091
   2092	return folio_batch_count(fbatch);
   2093}
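
/*
 * Sketch of a batched entry walk (illustrative, not part of filemap.c), in
 * the style of the truncate/invalidate callers of find_get_entries(): value
 * entries (shadow/swap) carry no reference and are skipped, real folios are
 * put after use.  The per-folio work is left as a placeholder and the helper
 * name is hypothetical.
 */
static void example_walk_entries(struct address_space *mapping,
				 pgoff_t start, pgoff_t end)
{
	struct folio_batch fbatch;
	pgoff_t indices[PAGEVEC_SIZE];
	unsigned int i, nr;

	folio_batch_init(&fbatch);
	while ((nr = find_get_entries(mapping, start, end, &fbatch, indices))) {
		for (i = 0; i < nr; i++) {
			struct folio *folio = fbatch.folios[i];

			if (xa_is_value(folio))
				continue;	/* shadow/swap entry, no ref */
			/* ... examine the folio at indices[i] ... */
			folio_put(folio);
		}
		start = indices[nr - 1] + 1;
		folio_batch_init(&fbatch);
		cond_resched();
	}
}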
   2094
   2095/**
   2096 * find_lock_entries - Find a batch of pagecache entries.
   2097 * @mapping:	The address_space to search.
   2098 * @start:	The starting page cache index.
   2099 * @end:	The final page index (inclusive).
   2100 * @fbatch:	Where the resulting entries are placed.
   2101 * @indices:	The cache indices of the entries in @fbatch.
   2102 *
   2103 * find_lock_entries() will return a batch of entries from @mapping.
   2104 * Swap, shadow and DAX entries are included.  Folios are returned
   2105 * locked and with an incremented refcount.  Folios which are locked
   2106 * by somebody else or under writeback are skipped.  Folios which are
   2107 * partially outside the range are not returned.
   2108 *
   2109 * The entries have ascending indexes.  The indices may not be consecutive
   2110 * due to not-present entries, large folios, folios which could not be
   2111 * locked or folios under writeback.
   2112 *
   2113 * Return: The number of entries which were found.
   2114 */
   2115unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
   2116		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
   2117{
   2118	XA_STATE(xas, &mapping->i_pages, start);
   2119	struct folio *folio;
   2120
   2121	rcu_read_lock();
   2122	while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
   2123		if (!xa_is_value(folio)) {
   2124			if (folio->index < start)
   2125				goto put;
   2126			if (folio->index + folio_nr_pages(folio) - 1 > end)
   2127				goto put;
   2128			if (!folio_trylock(folio))
   2129				goto put;
   2130			if (folio->mapping != mapping ||
   2131			    folio_test_writeback(folio))
   2132				goto unlock;
   2133			VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
   2134					folio);
   2135		}
   2136		indices[fbatch->nr] = xas.xa_index;
   2137		if (!folio_batch_add(fbatch, folio))
   2138			break;
   2139		continue;
   2140unlock:
   2141		folio_unlock(folio);
   2142put:
   2143		folio_put(folio);
   2144	}
   2145	rcu_read_unlock();
   2146
   2147	return folio_batch_count(fbatch);
   2148}
   2149
   2150static inline
   2151bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
   2152{
   2153	if (!folio_test_large(folio) || folio_test_hugetlb(folio))
   2154		return false;
   2155	if (index >= max)
   2156		return false;
   2157	return index < folio->index + folio_nr_pages(folio) - 1;
   2158}
   2159
   2160/**
   2161 * find_get_pages_range - gang pagecache lookup
   2162 * @mapping:	The address_space to search
   2163 * @start:	The starting page index
   2164 * @end:	The final page index (inclusive)
   2165 * @nr_pages:	The maximum number of pages
   2166 * @pages:	Where the resulting pages are placed
   2167 *
   2168 * find_get_pages_range() will search for and return a group of up to @nr_pages
   2169 * pages in the mapping starting at index @start and up to index @end
   2170 * (inclusive).  The pages are placed at @pages.  find_get_pages_range() takes
   2171 * a reference against the returned pages.
   2172 *
   2173 * The search returns a group of mapping-contiguous pages with ascending
   2174 * indexes.  There may be holes in the indices due to not-present pages.
   2175 * We also update @start to index the next page for the traversal.
   2176 *
   2177 * Return: the number of pages which were found. If this number is
   2178 * smaller than @nr_pages, the end of specified range has been
   2179 * reached.
   2180 */
   2181unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
   2182			      pgoff_t end, unsigned int nr_pages,
   2183			      struct page **pages)
   2184{
   2185	XA_STATE(xas, &mapping->i_pages, *start);
   2186	struct folio *folio;
   2187	unsigned ret = 0;
   2188
   2189	if (unlikely(!nr_pages))
   2190		return 0;
   2191
   2192	rcu_read_lock();
   2193	while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
   2194		/* Skip over shadow, swap and DAX entries */
   2195		if (xa_is_value(folio))
   2196			continue;
   2197
   2198again:
   2199		pages[ret] = folio_file_page(folio, xas.xa_index);
   2200		if (++ret == nr_pages) {
   2201			*start = xas.xa_index + 1;
   2202			goto out;
   2203		}
   2204		if (folio_more_pages(folio, xas.xa_index, end)) {
   2205			xas.xa_index++;
   2206			folio_ref_inc(folio);
   2207			goto again;
   2208		}
   2209	}
   2210
   2211	/*
   2212	 * We come here when there is no page beyond @end. We take care to not
   2213	 * overflow the index @start as it confuses some of the callers. This
   2214	 * breaks the iteration when there is a page at index -1 but that is
   2215	 * already broken anyway.
   2216	 */
   2217	if (end == (pgoff_t)-1)
   2218		*start = (pgoff_t)-1;
   2219	else
   2220		*start = end + 1;
   2221out:
   2222	rcu_read_unlock();
   2223
   2224	return ret;
   2225}
   2226
   2227/**
   2228 * find_get_pages_contig - gang contiguous pagecache lookup
   2229 * @mapping:	The address_space to search
   2230 * @index:	The starting page index
   2231 * @nr_pages:	The maximum number of pages
   2232 * @pages:	Where the resulting pages are placed
   2233 *
   2234 * find_get_pages_contig() works exactly like find_get_pages_range(),
   2235 * except that the returned number of pages are guaranteed to be
   2236 * contiguous.
   2237 *
   2238 * Return: the number of pages which were found.
   2239 */
   2240unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
   2241			       unsigned int nr_pages, struct page **pages)
   2242{
   2243	XA_STATE(xas, &mapping->i_pages, index);
   2244	struct folio *folio;
   2245	unsigned int ret = 0;
   2246
   2247	if (unlikely(!nr_pages))
   2248		return 0;
   2249
   2250	rcu_read_lock();
   2251	for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
   2252		if (xas_retry(&xas, folio))
   2253			continue;
   2254		/*
   2255		 * If the entry has been swapped out, we can stop looking.
   2256		 * No current caller is looking for DAX entries.
   2257		 */
   2258		if (xa_is_value(folio))
   2259			break;
   2260
   2261		if (!folio_try_get_rcu(folio))
   2262			goto retry;
   2263
   2264		if (unlikely(folio != xas_reload(&xas)))
   2265			goto put_page;
   2266
   2267again:
   2268		pages[ret] = folio_file_page(folio, xas.xa_index);
   2269		if (++ret == nr_pages)
   2270			break;
   2271		if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) {
   2272			xas.xa_index++;
   2273			folio_ref_inc(folio);
   2274			goto again;
   2275		}
   2276		continue;
   2277put_page:
   2278		folio_put(folio);
   2279retry:
   2280		xas_reset(&xas);
   2281	}
   2282	rcu_read_unlock();
   2283	return ret;
   2284}
   2285EXPORT_SYMBOL(find_get_pages_contig);
   2286
   2287/**
   2288 * find_get_pages_range_tag - Find and return head pages matching @tag.
   2289 * @mapping:	the address_space to search
   2290 * @index:	the starting page index
   2291 * @end:	The final page index (inclusive)
   2292 * @tag:	the tag index
   2293 * @nr_pages:	the maximum number of pages
   2294 * @pages:	where the resulting pages are placed
   2295 *
   2296 * Like find_get_pages_range(), except we only return head pages which are
   2297 * tagged with @tag.  @index is updated to the index immediately after the
   2298 * last page we return, ready for the next iteration.
   2299 *
   2300 * Return: the number of pages which were found.
   2301 */
   2302unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
   2303			pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
   2304			struct page **pages)
   2305{
   2306	XA_STATE(xas, &mapping->i_pages, *index);
   2307	struct folio *folio;
   2308	unsigned ret = 0;
   2309
   2310	if (unlikely(!nr_pages))
   2311		return 0;
   2312
   2313	rcu_read_lock();
   2314	while ((folio = find_get_entry(&xas, end, tag))) {
   2315		/*
   2316		 * Shadow entries should never be tagged, but this iteration
   2317		 * is lockless so there is a window for page reclaim to evict
   2318		 * a page we saw tagged.  Skip over it.
   2319		 */
   2320		if (xa_is_value(folio))
   2321			continue;
   2322
   2323		pages[ret] = &folio->page;
   2324		if (++ret == nr_pages) {
   2325			*index = folio->index + folio_nr_pages(folio);
   2326			goto out;
   2327		}
   2328	}
   2329
   2330	/*
   2331	 * We come here when we got to @end. We take care to not overflow the
   2332	 * index @index as it confuses some of the callers. This breaks the
   2333	 * iteration when there is a page at index -1 but that is already
   2334	 * broken anyway.
   2335	 */
   2336	if (end == (pgoff_t)-1)
   2337		*index = (pgoff_t)-1;
   2338	else
   2339		*index = end + 1;
   2340out:
   2341	rcu_read_unlock();
   2342
   2343	return ret;
   2344}
   2345EXPORT_SYMBOL(find_get_pages_range_tag);
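
/*
 * Sketch of a writeback-style walk over dirty pages (illustrative, not part
 * of filemap.c): repeatedly fetch batches of pages tagged dirty and process
 * them.  The per-page work is a placeholder, the batch size of 16 is
 * arbitrary, and the helper name is hypothetical.
 */
static void example_walk_dirty_pages(struct address_space *mapping,
				     pgoff_t start, pgoff_t end)
{
	struct page *pages[16];
	pgoff_t index = start;
	unsigned int i, nr;

	while ((nr = find_get_pages_range_tag(mapping, &index, end,
					      PAGECACHE_TAG_DIRTY,
					      ARRAY_SIZE(pages), pages))) {
		for (i = 0; i < nr; i++) {
			/* ... write back pages[i] ... */
			put_page(pages[i]);
		}
		cond_resched();
	}
}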
   2346
   2347/*
   2348 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
   2349 * a _large_ part of the i/o request. Imagine the worst scenario:
   2350 *
   2351 *      ---R__________________________________________B__________
   2352 *         ^ reading here                             ^ bad block(assume 4k)
   2353 *
   2354 * read(R) => miss => readahead(R...B) => media error => frustrating retries
   2355 * => failing the whole request => read(R) => read(R+1) =>
   2356 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
   2357 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
   2358 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
   2359 *
   2360 * It is going insane. Fix it by quickly scaling down the readahead size.
   2361 */
   2362static void shrink_readahead_size_eio(struct file_ra_state *ra)
   2363{
   2364	ra->ra_pages /= 4;
   2365}
   2366
   2367/*
   2368 * filemap_get_read_batch - Get a batch of folios for read
   2369 *
   2370 * Get a batch of folios which represent a contiguous range of bytes in
   2371 * the file.  No exceptional entries will be returned.  If @index is in
   2372 * the middle of a folio, the entire folio will be returned.  The last
   2373 * folio in the batch may have the readahead flag set or the uptodate flag
   2374 * clear so that the caller can take the appropriate action.
   2375 */
   2376static void filemap_get_read_batch(struct address_space *mapping,
   2377		pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
   2378{
   2379	XA_STATE(xas, &mapping->i_pages, index);
   2380	struct folio *folio;
   2381
   2382	rcu_read_lock();
   2383	for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
   2384		if (xas_retry(&xas, folio))
   2385			continue;
   2386		if (xas.xa_index > max || xa_is_value(folio))
   2387			break;
   2388		if (xa_is_sibling(folio))
   2389			break;
   2390		if (!folio_try_get_rcu(folio))
   2391			goto retry;
   2392
   2393		if (unlikely(folio != xas_reload(&xas)))
   2394			goto put_folio;
   2395
   2396		if (!folio_batch_add(fbatch, folio))
   2397			break;
   2398		if (!folio_test_uptodate(folio))
   2399			break;
   2400		if (folio_test_readahead(folio))
   2401			break;
   2402		xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1);
   2403		continue;
   2404put_folio:
   2405		folio_put(folio);
   2406retry:
   2407		xas_reset(&xas);
   2408	}
   2409	rcu_read_unlock();
   2410}
   2411
   2412static int filemap_read_folio(struct file *file, struct address_space *mapping,
   2413		struct folio *folio)
   2414{
   2415	int error;
   2416
   2417	/*
   2418	 * A previous I/O error may have been due to temporary failures,
   2419	 * eg. multipath errors.  PG_error will be set again if read_folio
   2420	 * fails.
   2421	 */
   2422	folio_clear_error(folio);
   2423	/* Start the actual read. The read will unlock the page. */
   2424	error = mapping->a_ops->read_folio(file, folio);
   2425	if (error)
   2426		return error;
   2427
   2428	error = folio_wait_locked_killable(folio);
   2429	if (error)
   2430		return error;
   2431	if (folio_test_uptodate(folio))
   2432		return 0;
   2433	shrink_readahead_size_eio(&file->f_ra);
   2434	return -EIO;
   2435}
   2436
   2437static bool filemap_range_uptodate(struct address_space *mapping,
   2438		loff_t pos, struct iov_iter *iter, struct folio *folio)
   2439{
   2440	int count;
   2441
   2442	if (folio_test_uptodate(folio))
   2443		return true;
   2444	/* pipes can't handle partially uptodate pages */
   2445	if (iov_iter_is_pipe(iter))
   2446		return false;
   2447	if (!mapping->a_ops->is_partially_uptodate)
   2448		return false;
   2449	if (mapping->host->i_blkbits >= folio_shift(folio))
   2450		return false;
   2451
   2452	count = iter->count;
   2453	if (folio_pos(folio) > pos) {
   2454		count -= folio_pos(folio) - pos;
   2455		pos = 0;
   2456	} else {
   2457		pos -= folio_pos(folio);
   2458	}
   2459
   2460	return mapping->a_ops->is_partially_uptodate(folio, pos, count);
   2461}
   2462
   2463static int filemap_update_page(struct kiocb *iocb,
   2464		struct address_space *mapping, struct iov_iter *iter,
   2465		struct folio *folio)
   2466{
   2467	int error;
   2468
   2469	if (iocb->ki_flags & IOCB_NOWAIT) {
   2470		if (!filemap_invalidate_trylock_shared(mapping))
   2471			return -EAGAIN;
   2472	} else {
   2473		filemap_invalidate_lock_shared(mapping);
   2474	}
   2475
   2476	if (!folio_trylock(folio)) {
   2477		error = -EAGAIN;
   2478		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
   2479			goto unlock_mapping;
   2480		if (!(iocb->ki_flags & IOCB_WAITQ)) {
   2481			filemap_invalidate_unlock_shared(mapping);
   2482			/*
   2483			 * This is where we usually end up waiting for a
   2484			 * previously submitted readahead to finish.
   2485			 */
   2486			folio_put_wait_locked(folio, TASK_KILLABLE);
   2487			return AOP_TRUNCATED_PAGE;
   2488		}
   2489		error = __folio_lock_async(folio, iocb->ki_waitq);
   2490		if (error)
   2491			goto unlock_mapping;
   2492	}
   2493
   2494	error = AOP_TRUNCATED_PAGE;
   2495	if (!folio->mapping)
   2496		goto unlock;
   2497
   2498	error = 0;
   2499	if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio))
   2500		goto unlock;
   2501
   2502	error = -EAGAIN;
   2503	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
   2504		goto unlock;
   2505
   2506	error = filemap_read_folio(iocb->ki_filp, mapping, folio);
   2507	goto unlock_mapping;
   2508unlock:
   2509	folio_unlock(folio);
   2510unlock_mapping:
   2511	filemap_invalidate_unlock_shared(mapping);
   2512	if (error == AOP_TRUNCATED_PAGE)
   2513		folio_put(folio);
   2514	return error;
   2515}
   2516
   2517static int filemap_create_folio(struct file *file,
   2518		struct address_space *mapping, pgoff_t index,
   2519		struct folio_batch *fbatch)
   2520{
   2521	struct folio *folio;
   2522	int error;
   2523
   2524	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
   2525	if (!folio)
   2526		return -ENOMEM;
   2527
   2528	/*
   2529	 * Protect against truncate / hole punch. Grabbing invalidate_lock
    2530	 * here ensures we cannot instantiate and bring uptodate new
   2531	 * pagecache folios after evicting page cache during truncate
   2532	 * and before actually freeing blocks.	Note that we could
   2533	 * release invalidate_lock after inserting the folio into
   2534	 * the page cache as the locked folio would then be enough to
   2535	 * synchronize with hole punching. But there are code paths
   2536	 * such as filemap_update_page() filling in partially uptodate
   2537	 * pages or ->readahead() that need to hold invalidate_lock
   2538	 * while mapping blocks for IO so let's hold the lock here as
   2539	 * well to keep locking rules simple.
   2540	 */
   2541	filemap_invalidate_lock_shared(mapping);
   2542	error = filemap_add_folio(mapping, folio, index,
   2543			mapping_gfp_constraint(mapping, GFP_KERNEL));
   2544	if (error == -EEXIST)
   2545		error = AOP_TRUNCATED_PAGE;
   2546	if (error)
   2547		goto error;
   2548
   2549	error = filemap_read_folio(file, mapping, folio);
   2550	if (error)
   2551		goto error;
   2552
   2553	filemap_invalidate_unlock_shared(mapping);
   2554	folio_batch_add(fbatch, folio);
   2555	return 0;
   2556error:
   2557	filemap_invalidate_unlock_shared(mapping);
   2558	folio_put(folio);
   2559	return error;
   2560}
   2561
   2562static int filemap_readahead(struct kiocb *iocb, struct file *file,
   2563		struct address_space *mapping, struct folio *folio,
   2564		pgoff_t last_index)
   2565{
   2566	DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);
   2567
   2568	if (iocb->ki_flags & IOCB_NOIO)
   2569		return -EAGAIN;
   2570	page_cache_async_ra(&ractl, folio, last_index - folio->index);
   2571	return 0;
   2572}
   2573
   2574static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
   2575		struct folio_batch *fbatch)
   2576{
   2577	struct file *filp = iocb->ki_filp;
   2578	struct address_space *mapping = filp->f_mapping;
   2579	struct file_ra_state *ra = &filp->f_ra;
   2580	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
   2581	pgoff_t last_index;
   2582	struct folio *folio;
   2583	int err = 0;
   2584
   2585	last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
   2586retry:
   2587	if (fatal_signal_pending(current))
   2588		return -EINTR;
   2589
   2590	filemap_get_read_batch(mapping, index, last_index, fbatch);
   2591	if (!folio_batch_count(fbatch)) {
   2592		if (iocb->ki_flags & IOCB_NOIO)
   2593			return -EAGAIN;
   2594		page_cache_sync_readahead(mapping, ra, filp, index,
   2595				last_index - index);
   2596		filemap_get_read_batch(mapping, index, last_index, fbatch);
   2597	}
   2598	if (!folio_batch_count(fbatch)) {
   2599		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
   2600			return -EAGAIN;
   2601		err = filemap_create_folio(filp, mapping,
   2602				iocb->ki_pos >> PAGE_SHIFT, fbatch);
   2603		if (err == AOP_TRUNCATED_PAGE)
   2604			goto retry;
   2605		return err;
   2606	}
   2607
   2608	folio = fbatch->folios[folio_batch_count(fbatch) - 1];
   2609	if (folio_test_readahead(folio)) {
   2610		err = filemap_readahead(iocb, filp, mapping, folio, last_index);
   2611		if (err)
   2612			goto err;
   2613	}
   2614	if (!folio_test_uptodate(folio)) {
   2615		if ((iocb->ki_flags & IOCB_WAITQ) &&
   2616		    folio_batch_count(fbatch) > 1)
   2617			iocb->ki_flags |= IOCB_NOWAIT;
   2618		err = filemap_update_page(iocb, mapping, iter, folio);
   2619		if (err)
   2620			goto err;
   2621	}
   2622
   2623	return 0;
   2624err:
   2625	if (err < 0)
   2626		folio_put(folio);
   2627	if (likely(--fbatch->nr))
   2628		return 0;
   2629	if (err == AOP_TRUNCATED_PAGE)
   2630		goto retry;
   2631	return err;
   2632}
   2633
   2634static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
   2635{
   2636	unsigned int shift = folio_shift(folio);
   2637
   2638	return (pos1 >> shift == pos2 >> shift);
   2639}
   2640
   2641/**
   2642 * filemap_read - Read data from the page cache.
   2643 * @iocb: The iocb to read.
   2644 * @iter: Destination for the data.
   2645 * @already_read: Number of bytes already read by the caller.
   2646 *
   2647 * Copies data from the page cache.  If the data is not currently present,
   2648 * uses the readahead and read_folio address_space operations to fetch it.
   2649 *
   2650 * Return: Total number of bytes copied, including those already read by
   2651 * the caller.  If an error happens before any bytes are copied, returns
   2652 * a negative error number.
   2653 */
   2654ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
   2655		ssize_t already_read)
   2656{
   2657	struct file *filp = iocb->ki_filp;
   2658	struct file_ra_state *ra = &filp->f_ra;
   2659	struct address_space *mapping = filp->f_mapping;
   2660	struct inode *inode = mapping->host;
   2661	struct folio_batch fbatch;
   2662	int i, error = 0;
   2663	bool writably_mapped;
   2664	loff_t isize, end_offset;
   2665
   2666	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
   2667		return 0;
   2668	if (unlikely(!iov_iter_count(iter)))
   2669		return 0;
   2670
   2671	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
   2672	folio_batch_init(&fbatch);
   2673
   2674	do {
   2675		cond_resched();
   2676
   2677		/*
   2678		 * If we've already successfully copied some data, then we
   2679		 * can no longer safely return -EIOCBQUEUED. Hence mark
   2680		 * an async read NOWAIT at that point.
   2681		 */
   2682		if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
   2683			iocb->ki_flags |= IOCB_NOWAIT;
   2684
   2685		if (unlikely(iocb->ki_pos >= i_size_read(inode)))
   2686			break;
   2687
   2688		error = filemap_get_pages(iocb, iter, &fbatch);
   2689		if (error < 0)
   2690			break;
   2691
   2692		/*
   2693		 * i_size must be checked after we know the pages are Uptodate.
   2694		 *
    2695		 * Checking i_size only after the uptodate check allows us to calculate
   2696		 * the correct value for "nr", which means the zero-filled
   2697		 * part of the page is not copied back to userspace (unless
   2698		 * another truncate extends the file - this is desired though).
   2699		 */
   2700		isize = i_size_read(inode);
   2701		if (unlikely(iocb->ki_pos >= isize))
   2702			goto put_folios;
   2703		end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
   2704
   2705		/*
   2706		 * Once we start copying data, we don't want to be touching any
   2707		 * cachelines that might be contended:
   2708		 */
   2709		writably_mapped = mapping_writably_mapped(mapping);
   2710
   2711		/*
   2712		 * When a read accesses the same folio several times, only
   2713		 * mark it as accessed the first time.
   2714		 */
   2715		if (!pos_same_folio(iocb->ki_pos, ra->prev_pos - 1,
   2716							fbatch.folios[0]))
   2717			folio_mark_accessed(fbatch.folios[0]);
   2718
   2719		for (i = 0; i < folio_batch_count(&fbatch); i++) {
   2720			struct folio *folio = fbatch.folios[i];
   2721			size_t fsize = folio_size(folio);
   2722			size_t offset = iocb->ki_pos & (fsize - 1);
   2723			size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
   2724					     fsize - offset);
   2725			size_t copied;
   2726
   2727			if (end_offset < folio_pos(folio))
   2728				break;
   2729			if (i > 0)
   2730				folio_mark_accessed(folio);
   2731			/*
   2732			 * If users can be writing to this folio using arbitrary
   2733			 * virtual addresses, take care of potential aliasing
   2734			 * before reading the folio on the kernel side.
   2735			 */
   2736			if (writably_mapped)
   2737				flush_dcache_folio(folio);
   2738
   2739			copied = copy_folio_to_iter(folio, offset, bytes, iter);
   2740
   2741			already_read += copied;
   2742			iocb->ki_pos += copied;
   2743			ra->prev_pos = iocb->ki_pos;
   2744
   2745			if (copied < bytes) {
   2746				error = -EFAULT;
   2747				break;
   2748			}
   2749		}
   2750put_folios:
   2751		for (i = 0; i < folio_batch_count(&fbatch); i++)
   2752			folio_put(fbatch.folios[i]);
   2753		folio_batch_init(&fbatch);
   2754	} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
   2755
   2756	file_accessed(filp);
   2757
   2758	return already_read ? already_read : error;
   2759}
   2760EXPORT_SYMBOL_GPL(filemap_read);
   2761
   2762/**
   2763 * generic_file_read_iter - generic filesystem read routine
   2764 * @iocb:	kernel I/O control block
   2765 * @iter:	destination for the data read
   2766 *
   2767 * This is the "read_iter()" routine for all filesystems
   2768 * that can use the page cache directly.
   2769 *
   2770 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
   2771 * be returned when no data can be read without waiting for I/O requests
   2772 * to complete; it doesn't prevent readahead.
   2773 *
   2774 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
   2775 * requests shall be made for the read or for readahead.  When no data
   2776 * can be read, -EAGAIN shall be returned.  When readahead would be
   2777 * triggered, a partial, possibly empty read shall be returned.
   2778 *
   2779 * Return:
   2780 * * number of bytes copied, even for partial reads
   2781 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
   2782 */
   2783ssize_t
   2784generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
   2785{
   2786	size_t count = iov_iter_count(iter);
   2787	ssize_t retval = 0;
   2788
   2789	if (!count)
   2790		return 0; /* skip atime */
   2791
   2792	if (iocb->ki_flags & IOCB_DIRECT) {
   2793		struct file *file = iocb->ki_filp;
   2794		struct address_space *mapping = file->f_mapping;
   2795		struct inode *inode = mapping->host;
   2796
   2797		if (iocb->ki_flags & IOCB_NOWAIT) {
   2798			if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
   2799						iocb->ki_pos + count - 1))
   2800				return -EAGAIN;
   2801		} else {
   2802			retval = filemap_write_and_wait_range(mapping,
   2803						iocb->ki_pos,
   2804					        iocb->ki_pos + count - 1);
   2805			if (retval < 0)
   2806				return retval;
   2807		}
   2808
   2809		file_accessed(file);
   2810
   2811		retval = mapping->a_ops->direct_IO(iocb, iter);
   2812		if (retval >= 0) {
   2813			iocb->ki_pos += retval;
   2814			count -= retval;
   2815		}
   2816		if (retval != -EIOCBQUEUED)
   2817			iov_iter_revert(iter, count - iov_iter_count(iter));
   2818
   2819		/*
   2820		 * Btrfs can have a short DIO read if we encounter
   2821		 * compressed extents, so if there was an error, or if
   2822		 * we've already read everything we wanted to, or if
   2823		 * there was a short read because we hit EOF, go ahead
   2824		 * and return.  Otherwise fallthrough to buffered io for
   2825		 * the rest of the read.  Buffered reads will not work for
   2826		 * DAX files, so don't bother trying.
   2827		 */
   2828		if (retval < 0 || !count || IS_DAX(inode))
   2829			return retval;
   2830		if (iocb->ki_pos >= i_size_read(inode))
   2831			return retval;
   2832	}
   2833
   2834	return filemap_read(iocb, iter, retval);
   2835}
   2836EXPORT_SYMBOL(generic_file_read_iter);
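
/*
 * Sketch of how a page-cache-backed filesystem typically wires up its read
 * path (illustrative, not part of filemap.c): read_iter goes straight to
 * generic_file_read_iter(); the other entries shown are the usual generic
 * helpers and are assumptions about such a filesystem, not requirements.
 */
static const struct file_operations example_file_operations = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.mmap		= generic_file_mmap,
};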
   2837
   2838static inline loff_t folio_seek_hole_data(struct xa_state *xas,
   2839		struct address_space *mapping, struct folio *folio,
   2840		loff_t start, loff_t end, bool seek_data)
   2841{
   2842	const struct address_space_operations *ops = mapping->a_ops;
   2843	size_t offset, bsz = i_blocksize(mapping->host);
   2844
   2845	if (xa_is_value(folio) || folio_test_uptodate(folio))
   2846		return seek_data ? start : end;
   2847	if (!ops->is_partially_uptodate)
   2848		return seek_data ? end : start;
   2849
   2850	xas_pause(xas);
   2851	rcu_read_unlock();
   2852	folio_lock(folio);
   2853	if (unlikely(folio->mapping != mapping))
   2854		goto unlock;
   2855
   2856	offset = offset_in_folio(folio, start) & ~(bsz - 1);
   2857
   2858	do {
   2859		if (ops->is_partially_uptodate(folio, offset, bsz) ==
   2860							seek_data)
   2861			break;
   2862		start = (start + bsz) & ~(bsz - 1);
   2863		offset += bsz;
   2864	} while (offset < folio_size(folio));
   2865unlock:
   2866	folio_unlock(folio);
   2867	rcu_read_lock();
   2868	return start;
   2869}
   2870
   2871static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
   2872{
   2873	if (xa_is_value(folio))
   2874		return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
   2875	return folio_size(folio);
   2876}
   2877
   2878/**
   2879 * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
   2880 * @mapping: Address space to search.
   2881 * @start: First byte to consider.
   2882 * @end: Limit of search (exclusive).
   2883 * @whence: Either SEEK_HOLE or SEEK_DATA.
   2884 *
   2885 * If the page cache knows which blocks contain holes and which blocks
   2886 * contain data, your filesystem can use this function to implement
   2887 * SEEK_HOLE and SEEK_DATA.  This is useful for filesystems which are
   2888 * entirely memory-based such as tmpfs, and filesystems which support
   2889 * unwritten extents.
   2890 *
   2891 * Return: The requested offset on success, or -ENXIO if @whence specifies
   2892 * SEEK_DATA and there is no data after @start.  There is an implicit hole
   2893 * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
   2894 * and @end contain data.
   2895 */
   2896loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
   2897		loff_t end, int whence)
   2898{
   2899	XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
   2900	pgoff_t max = (end - 1) >> PAGE_SHIFT;
   2901	bool seek_data = (whence == SEEK_DATA);
   2902	struct folio *folio;
   2903
   2904	if (end <= start)
   2905		return -ENXIO;
   2906
   2907	rcu_read_lock();
   2908	while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {
   2909		loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
   2910		size_t seek_size;
   2911
   2912		if (start < pos) {
   2913			if (!seek_data)
   2914				goto unlock;
   2915			start = pos;
   2916		}
   2917
   2918		seek_size = seek_folio_size(&xas, folio);
   2919		pos = round_up((u64)pos + 1, seek_size);
   2920		start = folio_seek_hole_data(&xas, mapping, folio, start, pos,
   2921				seek_data);
   2922		if (start < pos)
   2923			goto unlock;
   2924		if (start >= end)
   2925			break;
   2926		if (seek_size > PAGE_SIZE)
   2927			xas_set(&xas, pos >> PAGE_SHIFT);
   2928		if (!xa_is_value(folio))
   2929			folio_put(folio);
   2930	}
   2931	if (seek_data)
   2932		start = -ENXIO;
   2933unlock:
   2934	rcu_read_unlock();
   2935	if (folio && !xa_is_value(folio))
   2936		folio_put(folio);
   2937	if (start > end)
   2938		return end;
   2939	return start;
   2940}
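
/*
 * Sketch of an ->llseek implementation built on mapping_seek_hole_data()
 * (illustrative, not part of filemap.c; modelled on the tmpfs-style pattern).
 * Plain SEEK_SET/SEEK_CUR/SEEK_END are delegated to generic_file_llseek();
 * the function name is hypothetical.
 */
static loff_t example_llseek(struct file *file, loff_t offset, int whence)
{
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;

	if (whence != SEEK_DATA && whence != SEEK_HOLE)
		return generic_file_llseek(file, offset, whence);

	inode_lock(inode);
	offset = mapping_seek_hole_data(mapping, offset, i_size_read(inode),
					whence);
	inode_unlock(inode);

	if (offset >= 0)	/* otherwise -ENXIO from the search */
		offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
	return offset;
}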
   2941
   2942#ifdef CONFIG_MMU
   2943#define MMAP_LOTSAMISS  (100)
   2944/*
    2945	 * lock_folio_maybe_drop_mmap - lock the folio, possibly dropping the mmap_lock
    2946	 * @vmf - the vm_fault for this fault.
    2947	 * @folio - the folio to lock.
    2948	 * @fpin - the pointer to the file we may pin (or is already pinned).
    2949	 *
    2950	 * This works similarly to __folio_lock_or_retry() in that it can drop the
    2951	 * mmap_lock.  It differs in that it actually returns the folio locked
    2952	 * if it returns 1, and 0 if it couldn't lock the folio.  If we did have
   2953 * to drop the mmap_lock then fpin will point to the pinned file and
   2954 * needs to be fput()'ed at a later point.
   2955 */
   2956static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
   2957				     struct file **fpin)
   2958{
   2959	if (folio_trylock(folio))
   2960		return 1;
   2961
   2962	/*
   2963	 * NOTE! This will make us return with VM_FAULT_RETRY, but with
   2964	 * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
   2965	 * is supposed to work. We have way too many special cases..
   2966	 */
   2967	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
   2968		return 0;
   2969
   2970	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
   2971	if (vmf->flags & FAULT_FLAG_KILLABLE) {
   2972		if (__folio_lock_killable(folio)) {
   2973			/*
   2974			 * We didn't have the right flags to drop the mmap_lock,
   2975			 * but all fault_handlers only check for fatal signals
   2976			 * if we return VM_FAULT_RETRY, so we need to drop the
   2977			 * mmap_lock here and return 0 if we don't have a fpin.
   2978			 */
   2979			if (*fpin == NULL)
   2980				mmap_read_unlock(vmf->vma->vm_mm);
   2981			return 0;
   2982		}
   2983	} else
   2984		__folio_lock(folio);
   2985
   2986	return 1;
   2987}
   2988
   2989/*
   2990 * Synchronous readahead happens when we don't even find a page in the page
    2991	 * cache at all.  We don't want to perform IO under the mmap_lock, so if we
    2992	 * have to drop it we return the file that was pinned in order to do that.
    2993	 * If we didn't pin a file then we return NULL.  The file that is
   2994 * returned needs to be fput()'ed when we're done with it.
   2995 */
   2996static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
   2997{
   2998	struct file *file = vmf->vma->vm_file;
   2999	struct file_ra_state *ra = &file->f_ra;
   3000	struct address_space *mapping = file->f_mapping;
   3001	DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
   3002	struct file *fpin = NULL;
   3003	unsigned long vm_flags = vmf->vma->vm_flags;
   3004	unsigned int mmap_miss;
   3005
   3006#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   3007	/* Use the readahead code, even if readahead is disabled */
   3008	if (vm_flags & VM_HUGEPAGE) {
   3009		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
   3010		ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
   3011		ra->size = HPAGE_PMD_NR;
   3012		/*
   3013		 * Fetch two PMD folios, so we get the chance to actually
   3014		 * readahead, unless we've been told not to.
   3015		 */
   3016		if (!(vm_flags & VM_RAND_READ))
   3017			ra->size *= 2;
   3018		ra->async_size = HPAGE_PMD_NR;
   3019		page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
   3020		return fpin;
   3021	}
   3022#endif
   3023
   3024	/* If we don't want any read-ahead, don't bother */
   3025	if (vm_flags & VM_RAND_READ)
   3026		return fpin;
   3027	if (!ra->ra_pages)
   3028		return fpin;
   3029
   3030	if (vm_flags & VM_SEQ_READ) {
   3031		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
   3032		page_cache_sync_ra(&ractl, ra->ra_pages);
   3033		return fpin;
   3034	}
   3035
   3036	/* Avoid banging the cache line if not needed */
   3037	mmap_miss = READ_ONCE(ra->mmap_miss);
   3038	if (mmap_miss < MMAP_LOTSAMISS * 10)
   3039		WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
   3040
   3041	/*
   3042	 * Do we miss much more than hit in this file? If so,
   3043	 * stop bothering with read-ahead. It will only hurt.
   3044	 */
   3045	if (mmap_miss > MMAP_LOTSAMISS)
   3046		return fpin;
   3047
   3048	/*
   3049	 * mmap read-around
   3050	 */
   3051	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
   3052	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
   3053	ra->size = ra->ra_pages;
   3054	ra->async_size = ra->ra_pages / 4;
   3055	ractl._index = ra->start;
   3056	page_cache_ra_order(&ractl, ra, 0);
   3057	return fpin;
   3058}
   3059
   3060/*
    3061	 * Asynchronous readahead happens when we find the page with PG_readahead set,
   3062 * so we want to possibly extend the readahead further.  We return the file that
   3063 * was pinned if we have to drop the mmap_lock in order to do IO.
   3064 */
   3065static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
   3066					    struct folio *folio)
   3067{
   3068	struct file *file = vmf->vma->vm_file;
   3069	struct file_ra_state *ra = &file->f_ra;
   3070	DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
   3071	struct file *fpin = NULL;
   3072	unsigned int mmap_miss;
   3073
   3074	/* If we don't want any read-ahead, don't bother */
   3075	if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
   3076		return fpin;
   3077
   3078	mmap_miss = READ_ONCE(ra->mmap_miss);
   3079	if (mmap_miss)
   3080		WRITE_ONCE(ra->mmap_miss, --mmap_miss);
   3081
   3082	if (folio_test_readahead(folio)) {
   3083		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
   3084		page_cache_async_ra(&ractl, folio, ra->ra_pages);
   3085	}
   3086	return fpin;
   3087}
   3088
   3089/**
   3090 * filemap_fault - read in file data for page fault handling
   3091 * @vmf:	struct vm_fault containing details of the fault
   3092 *
   3093 * filemap_fault() is invoked via the vma operations vector for a
   3094 * mapped memory region to read in file data during a page fault.
   3095 *
   3096 * The goto's are kind of ugly, but this streamlines the normal case of having
   3097 * it in the page cache, and handles the special cases reasonably without
   3098 * having a lot of duplicated code.
   3099 *
   3100 * vma->vm_mm->mmap_lock must be held on entry.
   3101 *
   3102 * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
   3103 * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().
   3104 *
   3105 * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
   3106 * has not been released.
   3107 *
   3108 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
   3109 *
   3110 * Return: bitwise-OR of %VM_FAULT_ codes.
   3111 */
   3112vm_fault_t filemap_fault(struct vm_fault *vmf)
   3113{
   3114	int error;
   3115	struct file *file = vmf->vma->vm_file;
   3116	struct file *fpin = NULL;
   3117	struct address_space *mapping = file->f_mapping;
   3118	struct inode *inode = mapping->host;
   3119	pgoff_t max_idx, index = vmf->pgoff;
   3120	struct folio *folio;
   3121	vm_fault_t ret = 0;
   3122	bool mapping_locked = false;
   3123
   3124	max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
   3125	if (unlikely(index >= max_idx))
   3126		return VM_FAULT_SIGBUS;
   3127
   3128	/*
   3129	 * Do we have something in the page cache already?
   3130	 */
   3131	folio = filemap_get_folio(mapping, index);
   3132	if (likely(folio)) {
   3133		/*
   3134		 * We found the page, so try async readahead before waiting for
   3135		 * the lock.
   3136		 */
   3137		if (!(vmf->flags & FAULT_FLAG_TRIED))
   3138			fpin = do_async_mmap_readahead(vmf, folio);
   3139		if (unlikely(!folio_test_uptodate(folio))) {
   3140			filemap_invalidate_lock_shared(mapping);
   3141			mapping_locked = true;
   3142		}
   3143	} else {
   3144		/* No page in the page cache at all */
   3145		count_vm_event(PGMAJFAULT);
   3146		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
   3147		ret = VM_FAULT_MAJOR;
   3148		fpin = do_sync_mmap_readahead(vmf);
   3149retry_find:
   3150		/*
   3151		 * See the comment in filemap_create_folio() for why we need
   3152		 * the invalidate_lock.
   3153		 */
   3154		if (!mapping_locked) {
   3155			filemap_invalidate_lock_shared(mapping);
   3156			mapping_locked = true;
   3157		}
   3158		folio = __filemap_get_folio(mapping, index,
   3159					  FGP_CREAT|FGP_FOR_MMAP,
   3160					  vmf->gfp_mask);
   3161		if (!folio) {
   3162			if (fpin)
   3163				goto out_retry;
   3164			filemap_invalidate_unlock_shared(mapping);
   3165			return VM_FAULT_OOM;
   3166		}
   3167	}
   3168
   3169	if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
   3170		goto out_retry;
   3171
   3172	/* Did it get truncated? */
   3173	if (unlikely(folio->mapping != mapping)) {
   3174		folio_unlock(folio);
   3175		folio_put(folio);
   3176		goto retry_find;
   3177	}
   3178	VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
   3179
   3180	/*
   3181	 * We have a locked page in the page cache; now we need to check
   3182	 * that it's up-to-date. If not, it is going to be due to an error.
   3183	 */
   3184	if (unlikely(!folio_test_uptodate(folio))) {
   3185		/*
   3186		 * The page was in cache and uptodate and now it is not.
   3187		 * Strange but possible since we didn't hold the page lock all
   3188		 * the time. Let's drop everything, get the invalidate lock and
   3189		 * try again.
   3190		 */
   3191		if (!mapping_locked) {
   3192			folio_unlock(folio);
   3193			folio_put(folio);
   3194			goto retry_find;
   3195		}
   3196		goto page_not_uptodate;
   3197	}
   3198
   3199	/*
   3200	 * We've made it this far and we had to drop our mmap_lock, so now is the
   3201	 * time to return to the upper layer and have it re-find the vma and
   3202	 * redo the fault.
   3203	 */
   3204	if (fpin) {
   3205		folio_unlock(folio);
   3206		goto out_retry;
   3207	}
   3208	if (mapping_locked)
   3209		filemap_invalidate_unlock_shared(mapping);
   3210
   3211	/*
   3212	 * Found the page and have a reference on it.
   3213	 * We must recheck i_size under page lock.
   3214	 */
   3215	max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
   3216	if (unlikely(index >= max_idx)) {
   3217		folio_unlock(folio);
   3218		folio_put(folio);
   3219		return VM_FAULT_SIGBUS;
   3220	}
   3221
   3222	vmf->page = folio_file_page(folio, index);
   3223	return ret | VM_FAULT_LOCKED;
   3224
   3225page_not_uptodate:
   3226	/*
   3227	 * Umm, take care of errors if the page isn't up-to-date.
   3228	 * Try to re-read it _once_. We do this synchronously,
   3229	 * because there really aren't any performance issues here
   3230	 * and we need to check for errors.
   3231	 */
   3232	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
   3233	error = filemap_read_folio(file, mapping, folio);
   3234	if (fpin)
   3235		goto out_retry;
   3236	folio_put(folio);
   3237
   3238	if (!error || error == AOP_TRUNCATED_PAGE)
   3239		goto retry_find;
   3240	filemap_invalidate_unlock_shared(mapping);
   3241
   3242	return VM_FAULT_SIGBUS;
   3243
   3244out_retry:
   3245	/*
   3246	 * We dropped the mmap_lock, so we need to return to the fault handler to
   3247	 * re-find the vma and come back and find our hopefully still populated
   3248	 * page.
   3249	 */
   3250	if (folio)
   3251		folio_put(folio);
   3252	if (mapping_locked)
   3253		filemap_invalidate_unlock_shared(mapping);
   3254	if (fpin)
   3255		fput(fpin);
   3256	return ret | VM_FAULT_RETRY;
   3257}
   3258EXPORT_SYMBOL(filemap_fault);
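
/*
 * Illustrative sketch, not part of filemap.c: most filesystems reuse
 * filemap_fault() from their own vm_operations_struct, optionally wrapping it
 * to take filesystem-private locks first (generic_file_vm_ops further down is
 * the in-tree default).  "examplefs" is a hypothetical name; the usual
 * <linux/fs.h>, <linux/mm.h> and <linux/pagemap.h> includes are assumed.
 */
static vm_fault_t examplefs_fault(struct vm_fault *vmf)
{
	/*
	 * A real filesystem might take a private lock here to order page
	 * faults against truncate or hole punching before deferring to the
	 * generic page-cache fault path.
	 */
	return filemap_fault(vmf);
}

static const struct vm_operations_struct examplefs_file_vm_ops = {
	.fault		= examplefs_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};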
   3259
   3260static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
   3261{
   3262	struct mm_struct *mm = vmf->vma->vm_mm;
   3263
   3264	/* Huge page is mapped? No need to proceed. */
   3265	if (pmd_trans_huge(*vmf->pmd)) {
   3266		unlock_page(page);
   3267		put_page(page);
   3268		return true;
   3269	}
   3270
   3271	if (pmd_none(*vmf->pmd) && PageTransHuge(page)) {
   3272		vm_fault_t ret = do_set_pmd(vmf, page);
   3273		if (!ret) {
   3274			/* The page is mapped successfully, reference consumed. */
   3275			unlock_page(page);
   3276			return true;
   3277		}
   3278	}
   3279
   3280	if (pmd_none(*vmf->pmd))
   3281		pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
   3282
   3283	/* See comment in handle_pte_fault() */
   3284	if (pmd_devmap_trans_unstable(vmf->pmd)) {
   3285		unlock_page(page);
   3286		put_page(page);
   3287		return true;
   3288	}
   3289
   3290	return false;
   3291}
   3292
   3293static struct folio *next_uptodate_page(struct folio *folio,
   3294				       struct address_space *mapping,
   3295				       struct xa_state *xas, pgoff_t end_pgoff)
   3296{
   3297	unsigned long max_idx;
   3298
   3299	do {
   3300		if (!folio)
   3301			return NULL;
   3302		if (xas_retry(xas, folio))
   3303			continue;
   3304		if (xa_is_value(folio))
   3305			continue;
   3306		if (folio_test_locked(folio))
   3307			continue;
   3308		if (!folio_try_get_rcu(folio))
   3309			continue;
   3310		/* Has the page moved or been split? */
   3311		if (unlikely(folio != xas_reload(xas)))
   3312			goto skip;
   3313		if (!folio_test_uptodate(folio) || folio_test_readahead(folio))
   3314			goto skip;
   3315		if (!folio_trylock(folio))
   3316			goto skip;
   3317		if (folio->mapping != mapping)
   3318			goto unlock;
   3319		if (!folio_test_uptodate(folio))
   3320			goto unlock;
   3321		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
   3322		if (xas->xa_index >= max_idx)
   3323			goto unlock;
   3324		return folio;
   3325unlock:
   3326		folio_unlock(folio);
   3327skip:
   3328		folio_put(folio);
   3329	} while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);
   3330
   3331	return NULL;
   3332}
   3333
   3334static inline struct folio *first_map_page(struct address_space *mapping,
   3335					  struct xa_state *xas,
   3336					  pgoff_t end_pgoff)
   3337{
   3338	return next_uptodate_page(xas_find(xas, end_pgoff),
   3339				  mapping, xas, end_pgoff);
   3340}
   3341
   3342static inline struct folio *next_map_page(struct address_space *mapping,
   3343					 struct xa_state *xas,
   3344					 pgoff_t end_pgoff)
   3345{
   3346	return next_uptodate_page(xas_next_entry(xas, end_pgoff),
   3347				  mapping, xas, end_pgoff);
   3348}
   3349
   3350vm_fault_t filemap_map_pages(struct vm_fault *vmf,
   3351			     pgoff_t start_pgoff, pgoff_t end_pgoff)
   3352{
   3353	struct vm_area_struct *vma = vmf->vma;
   3354	struct file *file = vma->vm_file;
   3355	struct address_space *mapping = file->f_mapping;
   3356	pgoff_t last_pgoff = start_pgoff;
   3357	unsigned long addr;
   3358	XA_STATE(xas, &mapping->i_pages, start_pgoff);
   3359	struct folio *folio;
   3360	struct page *page;
   3361	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
   3362	vm_fault_t ret = 0;
   3363
   3364	rcu_read_lock();
   3365	folio = first_map_page(mapping, &xas, end_pgoff);
   3366	if (!folio)
   3367		goto out;
   3368
   3369	if (filemap_map_pmd(vmf, &folio->page)) {
   3370		ret = VM_FAULT_NOPAGE;
   3371		goto out;
   3372	}
   3373
   3374	addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
   3375	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
   3376	do {
   3377again:
   3378		page = folio_file_page(folio, xas.xa_index);
   3379		if (PageHWPoison(page))
   3380			goto unlock;
   3381
   3382		if (mmap_miss > 0)
   3383			mmap_miss--;
   3384
   3385		addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
   3386		vmf->pte += xas.xa_index - last_pgoff;
   3387		last_pgoff = xas.xa_index;
   3388
   3389		/*
   3390		 * NOTE: If there are PTE markers, we'll leave them to be
   3391		 * handled in the specific fault path, and their presence will
   3392		 * prohibit the fault-around logic.
   3393		 */
   3394		if (!pte_none(*vmf->pte))
   3395			goto unlock;
   3396
   3397		/* We're about to handle the fault */
   3398		if (vmf->address == addr)
   3399			ret = VM_FAULT_NOPAGE;
   3400
   3401		do_set_pte(vmf, page, addr);
   3402		/* no need to invalidate: a not-present page won't be cached */
   3403		update_mmu_cache(vma, addr, vmf->pte);
   3404		if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
   3405			xas.xa_index++;
   3406			folio_ref_inc(folio);
   3407			goto again;
   3408		}
   3409		folio_unlock(folio);
   3410		continue;
   3411unlock:
   3412		if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
   3413			xas.xa_index++;
   3414			goto again;
   3415		}
   3416		folio_unlock(folio);
   3417		folio_put(folio);
   3418	} while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL);
   3419	pte_unmap_unlock(vmf->pte, vmf->ptl);
   3420out:
   3421	rcu_read_unlock();
   3422	WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
   3423	return ret;
   3424}
   3425EXPORT_SYMBOL(filemap_map_pages);
   3426
   3427vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
   3428{
   3429	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
   3430	struct folio *folio = page_folio(vmf->page);
   3431	vm_fault_t ret = VM_FAULT_LOCKED;
   3432
   3433	sb_start_pagefault(mapping->host->i_sb);
   3434	file_update_time(vmf->vma->vm_file);
   3435	folio_lock(folio);
   3436	if (folio->mapping != mapping) {
   3437		folio_unlock(folio);
   3438		ret = VM_FAULT_NOPAGE;
   3439		goto out;
   3440	}
   3441	/*
   3442	 * We mark the folio dirty already here so that when freeze is in
   3443	 * progress, we are guaranteed that writeback during freezing will
   3444	 * see the dirty folio and writeprotect it again.
   3445	 */
   3446	folio_mark_dirty(folio);
   3447	folio_wait_stable(folio);
   3448out:
   3449	sb_end_pagefault(mapping->host->i_sb);
   3450	return ret;
   3451}
   3452
   3453const struct vm_operations_struct generic_file_vm_ops = {
   3454	.fault		= filemap_fault,
   3455	.map_pages	= filemap_map_pages,
   3456	.page_mkwrite	= filemap_page_mkwrite,
   3457};
   3458
   3459/* This is used for a general mmap of a disk file */
   3460
   3461int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
   3462{
   3463	struct address_space *mapping = file->f_mapping;
   3464
   3465	if (!mapping->a_ops->read_folio)
   3466		return -ENOEXEC;
   3467	file_accessed(file);
   3468	vma->vm_ops = &generic_file_vm_ops;
   3469	return 0;
   3470}
   3471
   3472/*
   3473 * This is for filesystems which do not implement ->writepage.
   3474 */
   3475int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
   3476{
   3477	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
   3478		return -EINVAL;
   3479	return generic_file_mmap(file, vma);
   3480}
   3481#else
   3482vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
   3483{
   3484	return VM_FAULT_SIGBUS;
   3485}
   3486int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
   3487{
   3488	return -ENOSYS;
   3489}
   3490int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
   3491{
   3492	return -ENOSYS;
   3493}
   3494#endif /* CONFIG_MMU */
   3495
   3496EXPORT_SYMBOL(filemap_page_mkwrite);
   3497EXPORT_SYMBOL(generic_file_mmap);
   3498EXPORT_SYMBOL(generic_file_readonly_mmap);
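
/*
 * Illustrative sketch with a hypothetical "examplefs" (assuming <linux/fs.h>):
 * a filesystem without ->writepage points ->mmap at
 * generic_file_readonly_mmap(), so shared writable mappings are refused with
 * -EINVAL while read-only and private mappings still use generic_file_vm_ops.
 */
static const struct file_operations examplefs_ro_file_operations = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.mmap		= generic_file_readonly_mmap,
};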
   3499
   3500static struct folio *do_read_cache_folio(struct address_space *mapping,
   3501		pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
   3502{
   3503	struct folio *folio;
   3504	int err;
   3505
   3506	if (!filler)
   3507		filler = mapping->a_ops->read_folio;
   3508repeat:
   3509	folio = filemap_get_folio(mapping, index);
   3510	if (!folio) {
   3511		folio = filemap_alloc_folio(gfp, 0);
   3512		if (!folio)
   3513			return ERR_PTR(-ENOMEM);
   3514		err = filemap_add_folio(mapping, folio, index, gfp);
   3515		if (unlikely(err)) {
   3516			folio_put(folio);
   3517			if (err == -EEXIST)
   3518				goto repeat;
   3519			/* Presumably ENOMEM for xarray node */
   3520			return ERR_PTR(err);
   3521		}
   3522
   3523filler:
   3524		err = filler(file, folio);
   3525		if (err < 0) {
   3526			folio_put(folio);
   3527			return ERR_PTR(err);
   3528		}
   3529
   3530		folio_wait_locked(folio);
   3531		if (!folio_test_uptodate(folio)) {
   3532			folio_put(folio);
   3533			return ERR_PTR(-EIO);
   3534		}
   3535
   3536		goto out;
   3537	}
   3538	if (folio_test_uptodate(folio))
   3539		goto out;
   3540
   3541	if (!folio_trylock(folio)) {
   3542		folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
   3543		goto repeat;
   3544	}
   3545
   3546	/* Folio was truncated from mapping */
   3547	if (!folio->mapping) {
   3548		folio_unlock(folio);
   3549		folio_put(folio);
   3550		goto repeat;
   3551	}
   3552
   3553	/* Someone else locked and filled the page in a very small window */
   3554	if (folio_test_uptodate(folio)) {
   3555		folio_unlock(folio);
   3556		goto out;
   3557	}
   3558
   3559	/*
   3560	 * A previous I/O error may have been due to a temporary
   3561	 * failure.
   3562	 * Clear the page error before the actual read; PG_error will be
   3563	 * set again if the read fails.
   3564	 */
   3565	folio_clear_error(folio);
   3566	goto filler;
   3567
   3568out:
   3569	folio_mark_accessed(folio);
   3570	return folio;
   3571}
   3572
   3573/**
   3574 * read_cache_folio - Read into page cache, fill it if needed.
   3575 * @mapping: The address_space to read from.
   3576 * @index: The index to read.
   3577 * @filler: Function to perform the read, or NULL to use aops->read_folio().
   3578 * @file: Passed to filler function, may be NULL if not required.
   3579 *
   3580 * Read one page into the page cache.  If it succeeds, the folio returned
   3581 * will contain @index, but it may not be the first page of the folio.
   3582 *
   3583 * If the filler function returns an error, it will be returned to the
   3584 * caller.
   3585 *
   3586 * Context: May sleep.  Expects mapping->invalidate_lock to be held.
   3587 * Return: An uptodate folio on success, ERR_PTR() on failure.
   3588 */
   3589struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
   3590		filler_t filler, struct file *file)
   3591{
   3592	return do_read_cache_folio(mapping, index, filler, file,
   3593			mapping_gfp_mask(mapping));
   3594}
   3595EXPORT_SYMBOL(read_cache_folio);
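
/*
 * Illustrative sketch of a caller, using a hypothetical helper name: read the
 * folio covering @index, map the byte at that file position and drop the
 * reference when done.  Assumes <linux/pagemap.h> and <linux/highmem.h>, and
 * that the caller satisfies the locking context documented above.
 */
static int examplefs_peek_byte(struct address_space *mapping, pgoff_t index,
			       struct file *file, u8 *out)
{
	struct folio *folio;
	u8 *kaddr;

	/* NULL filler means ->read_folio brings the folio uptodate. */
	folio = read_cache_folio(mapping, index, NULL, file);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	/* The folio may be large, so map the page that contains @index. */
	kaddr = kmap_local_folio(folio,
			offset_in_folio(folio, (loff_t)index << PAGE_SHIFT));
	*out = *kaddr;
	kunmap_local(kaddr);

	folio_put(folio);	/* drop the reference the lookup returned */
	return 0;
}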
   3596
   3597static struct page *do_read_cache_page(struct address_space *mapping,
   3598		pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
   3599{
   3600	struct folio *folio;
   3601
   3602	folio = do_read_cache_folio(mapping, index, filler, file, gfp);
   3603	if (IS_ERR(folio))
   3604		return &folio->page;
   3605	return folio_file_page(folio, index);
   3606}
   3607
   3608struct page *read_cache_page(struct address_space *mapping,
   3609			pgoff_t index, filler_t *filler, struct file *file)
   3610{
   3611	return do_read_cache_page(mapping, index, filler, file,
   3612			mapping_gfp_mask(mapping));
   3613}
   3614EXPORT_SYMBOL(read_cache_page);
   3615
   3616/**
   3617 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
   3618 * @mapping:	the page's address_space
   3619 * @index:	the page index
   3620 * @gfp:	the page allocator flags to use if allocating
   3621 *
   3622 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
   3623 * any new page allocations done using the specified allocation flags.
   3624 *
   3625 * If the page does not get brought uptodate, return -EIO.
   3626 *
   3627 * The function expects mapping->invalidate_lock to be already held.
   3628 *
   3629 * Return: up to date page on success, ERR_PTR() on failure.
   3630 */
   3631struct page *read_cache_page_gfp(struct address_space *mapping,
   3632				pgoff_t index,
   3633				gfp_t gfp)
   3634{
   3635	return do_read_cache_page(mapping, index, NULL, NULL, gfp);
   3636}
   3637EXPORT_SYMBOL(read_cache_page_gfp);
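
/*
 * Illustrative sketch (hypothetical helper): filesystems that may already hold
 * fs locks while reading metadata typically pass GFP_NOFS so that a fresh page
 * allocation cannot recurse back into the filesystem.  The caller still has to
 * check IS_ERR() and put_page() the result when finished with it.
 */
static struct page *examplefs_get_meta_page(struct address_space *mapping,
					    pgoff_t index)
{
	return read_cache_page_gfp(mapping, index, GFP_NOFS);
}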
   3638
   3639/*
   3640 * Warn about a page cache invalidation failure during a direct I/O write.
   3641 */
   3642void dio_warn_stale_pagecache(struct file *filp)
   3643{
   3644	static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
   3645	char pathname[128];
   3646	char *path;
   3647
   3648	errseq_set(&filp->f_mapping->wb_err, -EIO);
   3649	if (__ratelimit(&_rs)) {
   3650		path = file_path(filp, pathname, sizeof(pathname));
   3651		if (IS_ERR(path))
   3652			path = "(unknown)";
   3653		pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
   3654		pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
   3655			current->comm);
   3656	}
   3657}
   3658
   3659ssize_t
   3660generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
   3661{
   3662	struct file	*file = iocb->ki_filp;
   3663	struct address_space *mapping = file->f_mapping;
   3664	struct inode	*inode = mapping->host;
   3665	loff_t		pos = iocb->ki_pos;
   3666	ssize_t		written;
   3667	size_t		write_len;
   3668	pgoff_t		end;
   3669
   3670	write_len = iov_iter_count(from);
   3671	end = (pos + write_len - 1) >> PAGE_SHIFT;
   3672
   3673	if (iocb->ki_flags & IOCB_NOWAIT) {
   3674		/* If there are pages to writeback, return */
   3675		if (filemap_range_has_page(file->f_mapping, pos,
   3676					   pos + write_len - 1))
   3677			return -EAGAIN;
   3678	} else {
   3679		written = filemap_write_and_wait_range(mapping, pos,
   3680							pos + write_len - 1);
   3681		if (written)
   3682			goto out;
   3683	}
   3684
   3685	/*
   3686	 * After a write we want buffered reads to be sure to go to disk to get
   3687	 * the new data.  We invalidate clean cached pages from the region we're
   3688	 * about to write.  We do this *before* the write so that we can return
   3689	 * without clobbering -EIOCBQUEUED from ->direct_IO().
   3690	 */
   3691	written = invalidate_inode_pages2_range(mapping,
   3692					pos >> PAGE_SHIFT, end);
   3693	/*
   3694	 * If a page cannot be invalidated, return 0 to fall back
   3695	 * to a buffered write.
   3696	 */
   3697	if (written) {
   3698		if (written == -EBUSY)
   3699			return 0;
   3700		goto out;
   3701	}
   3702
   3703	written = mapping->a_ops->direct_IO(iocb, from);
   3704
   3705	/*
   3706	 * Finally, try again to invalidate clean pages which might have been
   3707	 * cached by non-direct readahead, or faulted in by get_user_pages()
   3708	 * if the source of the write was an mmap'ed region of the file
   3709	 * we're writing.  Either one is a pretty crazy thing to do,
   3710	 * so we don't support it 100%.  If this invalidation
   3711	 * fails, tough, the write still worked...
   3712	 *
   3713	 * Most of the time we do not need this since dio_complete() will do
   3714	 * the invalidation for us. However there are some file systems that
   3715	 * do not end up with dio_complete() being called, so let's not break
   3716	 * them by removing it completely.
   3717	 *
   3718	 * A notable example is blkdev_direct_IO().
   3719	 *
   3720	 * Skip invalidation for async writes or if mapping has no pages.
   3721	 */
   3722	if (written > 0 && mapping->nrpages &&
   3723	    invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
   3724		dio_warn_stale_pagecache(file);
   3725
   3726	if (written > 0) {
   3727		pos += written;
   3728		write_len -= written;
   3729		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
   3730			i_size_write(inode, pos);
   3731			mark_inode_dirty(inode);
   3732		}
   3733		iocb->ki_pos = pos;
   3734	}
   3735	if (written != -EIOCBQUEUED)
   3736		iov_iter_revert(from, write_len - iov_iter_count(from));
   3737out:
   3738	return written;
   3739}
   3740EXPORT_SYMBOL(generic_file_direct_write);
   3741
   3742ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
   3743{
   3744	struct file *file = iocb->ki_filp;
   3745	loff_t pos = iocb->ki_pos;
   3746	struct address_space *mapping = file->f_mapping;
   3747	const struct address_space_operations *a_ops = mapping->a_ops;
   3748	long status = 0;
   3749	ssize_t written = 0;
   3750
   3751	do {
   3752		struct page *page;
   3753		unsigned long offset;	/* Offset into pagecache page */
   3754		unsigned long bytes;	/* Bytes to write to page */
   3755		size_t copied;		/* Bytes copied from user */
   3756		void *fsdata;
   3757
   3758		offset = (pos & (PAGE_SIZE - 1));
   3759		bytes = min_t(unsigned long, PAGE_SIZE - offset,
   3760						iov_iter_count(i));
   3761
   3762again:
   3763		/*
   3764		 * Bring in the user page that we will copy from _first_.
   3765		 * Otherwise there's a nasty deadlock on copying from the
   3766		 * same page as we're writing to, without it being marked
   3767		 * up-to-date.
   3768		 */
   3769		if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
   3770			status = -EFAULT;
   3771			break;
   3772		}
   3773
   3774		if (fatal_signal_pending(current)) {
   3775			status = -EINTR;
   3776			break;
   3777		}
   3778
   3779		status = a_ops->write_begin(file, mapping, pos, bytes,
   3780						&page, &fsdata);
   3781		if (unlikely(status < 0))
   3782			break;
   3783
   3784		if (mapping_writably_mapped(mapping))
   3785			flush_dcache_page(page);
   3786
   3787		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
   3788		flush_dcache_page(page);
   3789
   3790		status = a_ops->write_end(file, mapping, pos, bytes, copied,
   3791						page, fsdata);
   3792		if (unlikely(status != copied)) {
   3793			iov_iter_revert(i, copied - max(status, 0L));
   3794			if (unlikely(status < 0))
   3795				break;
   3796		}
   3797		cond_resched();
   3798
   3799		if (unlikely(status == 0)) {
   3800			/*
   3801			 * A short copy made ->write_end() reject the
   3802			 * thing entirely.  Might be memory poisoning
   3803			 * halfway through, might be a race with munmap,
   3804			 * might be severe memory pressure.
   3805			 */
   3806			if (copied)
   3807				bytes = copied;
   3808			goto again;
   3809		}
   3810		pos += status;
   3811		written += status;
   3812
   3813		balance_dirty_pages_ratelimited(mapping);
   3814	} while (iov_iter_count(i));
   3815
   3816	return written ? written : status;
   3817}
   3818EXPORT_SYMBOL(generic_perform_write);
   3819
   3820/**
   3821 * __generic_file_write_iter - write data to a file
   3822 * @iocb:	IO state structure (file, offset, etc.)
   3823 * @from:	iov_iter with data to write
   3824 *
   3825 * This function does all the work needed for actually writing data to a
   3826 * file. It does all the basic checks, removes SUID from the file, updates
   3827 * modification times and calls the proper subroutines depending on whether we
   3828 * do direct IO or a standard buffered write.
   3829 *
   3830 * It expects i_rwsem to be grabbed unless we work on a block device or similar
   3831 * object which does not need locking at all.
   3832 *
   3833 * This function does *not* take care of syncing data in case of O_SYNC write.
   3834 * A caller has to handle it. This is mainly due to the fact that we want to
   3835 * avoid syncing under i_rwsem.
   3836 *
   3837 * Return:
   3838 * * number of bytes written, even for truncated writes
   3839 * * negative error code if no data has been written at all
   3840 */
   3841ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
   3842{
   3843	struct file *file = iocb->ki_filp;
   3844	struct address_space *mapping = file->f_mapping;
   3845	struct inode 	*inode = mapping->host;
   3846	ssize_t		written = 0;
   3847	ssize_t		err;
   3848	ssize_t		status;
   3849
   3850	/* We can write back this queue in page reclaim */
   3851	current->backing_dev_info = inode_to_bdi(inode);
   3852	err = file_remove_privs(file);
   3853	if (err)
   3854		goto out;
   3855
   3856	err = file_update_time(file);
   3857	if (err)
   3858		goto out;
   3859
   3860	if (iocb->ki_flags & IOCB_DIRECT) {
   3861		loff_t pos, endbyte;
   3862
   3863		written = generic_file_direct_write(iocb, from);
   3864		/*
   3865		 * If the write stopped short of completing, fall back to
   3866		 * buffered writes.  Some filesystems do this for writes to
   3867		 * holes, for example.  For DAX files, a buffered write will
   3868		 * not succeed (even if it did, DAX does not handle dirty
   3869		 * page-cache pages correctly).
   3870		 */
   3871		if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
   3872			goto out;
   3873
   3874		pos = iocb->ki_pos;
   3875		status = generic_perform_write(iocb, from);
   3876		/*
   3877		 * If generic_perform_write() returned a synchronous error
   3878		 * then we want to return the number of bytes which were
   3879		 * direct-written, or the error code if that was zero.  Note
   3880		 * that this differs from normal direct-io semantics, which
   3881		 * will return -EFOO even if some bytes were written.
   3882		 */
   3883		if (unlikely(status < 0)) {
   3884			err = status;
   3885			goto out;
   3886		}
   3887		/*
   3888		 * We need to ensure that the page cache pages are written to
   3889		 * disk and invalidated to preserve the expected O_DIRECT
   3890		 * semantics.
   3891		 */
   3892		endbyte = pos + status - 1;
   3893		err = filemap_write_and_wait_range(mapping, pos, endbyte);
   3894		if (err == 0) {
   3895			iocb->ki_pos = endbyte + 1;
   3896			written += status;
   3897			invalidate_mapping_pages(mapping,
   3898						 pos >> PAGE_SHIFT,
   3899						 endbyte >> PAGE_SHIFT);
   3900		} else {
   3901			/*
   3902			 * We don't know how much we wrote, so just return
   3903			 * the number of bytes which were direct-written
   3904			 */
   3905		}
   3906	} else {
   3907		written = generic_perform_write(iocb, from);
   3908		if (likely(written > 0))
   3909			iocb->ki_pos += written;
   3910	}
   3911out:
   3912	current->backing_dev_info = NULL;
   3913	return written ? written : err;
   3914}
   3915EXPORT_SYMBOL(__generic_file_write_iter);
   3916
   3917/**
   3918 * generic_file_write_iter - write data to a file
   3919 * @iocb:	IO state structure
   3920 * @from:	iov_iter with data to write
   3921 *
   3922 * This is a wrapper around __generic_file_write_iter() to be used by most
   3923 * filesystems. It takes care of syncing the file in the case of an O_SYNC
   3924 * write and acquires i_rwsem as needed.
   3925 * Return:
   3926 * * negative error code if no data has been written at all or
   3927 *   vfs_fsync_range() failed for a synchronous write
   3928 * * number of bytes written, even for truncated writes
   3929 */
   3930ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
   3931{
   3932	struct file *file = iocb->ki_filp;
   3933	struct inode *inode = file->f_mapping->host;
   3934	ssize_t ret;
   3935
   3936	inode_lock(inode);
   3937	ret = generic_write_checks(iocb, from);
   3938	if (ret > 0)
   3939		ret = __generic_file_write_iter(iocb, from);
   3940	inode_unlock(inode);
   3941
   3942	if (ret > 0)
   3943		ret = generic_write_sync(iocb, ret);
   3944	return ret;
   3945}
   3946EXPORT_SYMBOL(generic_file_write_iter);
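
/*
 * Illustrative sketch with a hypothetical "examplefs": a simple filesystem
 * that is happy with i_rwsem and O_SYNC being handled by
 * generic_file_write_iter() can point its file_operations straight at the
 * generic helpers (assuming <linux/fs.h>).
 */
static const struct file_operations examplefs_file_operations = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.mmap		= generic_file_mmap,
	.fsync		= generic_file_fsync,
	.splice_read	= generic_file_splice_read,
};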
   3947
   3948/**
   3949 * filemap_release_folio() - Release fs-specific metadata on a folio.
   3950 * @folio: The folio which the kernel is trying to free.
   3951 * @gfp: Memory allocation flags (and I/O mode).
   3952 *
   3953 * The address_space is trying to release any data attached to a folio
   3954 * (presumably at folio->private).
   3955 *
   3956 * This will also be called if the private_2 flag is set on a page,
   3957 * indicating that the folio has other metadata associated with it.
   3958 *
   3959 * The @gfp argument specifies whether I/O may be performed to release
   3960 * this page (__GFP_IO), and whether the call may block
   3961 * (__GFP_RECLAIM & __GFP_FS).
   3962 *
   3963 * Return: %true if the release was successful, otherwise %false.
   3964 */
   3965bool filemap_release_folio(struct folio *folio, gfp_t gfp)
   3966{
   3967	struct address_space * const mapping = folio->mapping;
   3968
   3969	BUG_ON(!folio_test_locked(folio));
   3970	if (folio_test_writeback(folio))
   3971		return false;
   3972
   3973	if (mapping && mapping->a_ops->release_folio)
   3974		return mapping->a_ops->release_folio(folio, gfp);
   3975	return try_to_free_buffers(folio);
   3976}
   3977EXPORT_SYMBOL(filemap_release_folio);
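
/*
 * Illustrative sketch with a hypothetical "examplefs": a filesystem that hangs
 * its own allocation off folio->private supplies ->release_folio so that
 * filemap_release_folio() frees that metadata instead of falling back to
 * try_to_free_buffers().  Assumes the data was attached with
 * folio_attach_private() and needs only kfree() to tear down.
 */
static bool examplefs_release_folio(struct folio *folio, gfp_t gfp)
{
	/* filemap_release_folio() already ruled out folios under writeback. */
	if (folio_test_private(folio))
		kfree(folio_detach_private(folio));

	return true;	/* the folio may now be freed */
}

static const struct address_space_operations examplefs_aops = {
	.release_folio	= examplefs_release_folio,
	/* ->read_folio, ->writepages, etc. omitted from this sketch */
};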