cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

gup.c (90395B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2#include <linux/kernel.h>
      3#include <linux/errno.h>
      4#include <linux/err.h>
      5#include <linux/spinlock.h>
      6
      7#include <linux/mm.h>
      8#include <linux/memremap.h>
      9#include <linux/pagemap.h>
     10#include <linux/rmap.h>
     11#include <linux/swap.h>
     12#include <linux/swapops.h>
     13#include <linux/secretmem.h>
     14
     15#include <linux/sched/signal.h>
     16#include <linux/rwsem.h>
     17#include <linux/hugetlb.h>
     18#include <linux/migrate.h>
     19#include <linux/mm_inline.h>
     20#include <linux/sched/mm.h>
     21
     22#include <asm/mmu_context.h>
     23#include <asm/tlbflush.h>
     24
     25#include "internal.h"
     26
     27struct follow_page_context {
     28	struct dev_pagemap *pgmap;
     29	unsigned int page_mask;
     30};
     31
     32static inline void sanity_check_pinned_pages(struct page **pages,
     33					     unsigned long npages)
     34{
     35	if (!IS_ENABLED(CONFIG_DEBUG_VM))
     36		return;
     37
     38	/*
      39	 * We only pin anonymous pages if they are exclusive. Once pinned, they
      40	 * can no longer become possibly shared, and PageAnonExclusive() will
     41	 * stick around until the page is freed.
     42	 *
     43	 * We'd like to verify that our pinned anonymous pages are still mapped
     44	 * exclusively. The issue with anon THP is that we don't know how
     45	 * they are/were mapped when pinning them. However, for anon
     46	 * THP we can assume that either the given page (PTE-mapped THP) or
     47	 * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
     48	 * neither is the case, there is certainly something wrong.
     49	 */
     50	for (; npages; npages--, pages++) {
     51		struct page *page = *pages;
     52		struct folio *folio = page_folio(page);
     53
     54		if (!folio_test_anon(folio))
     55			continue;
     56		if (!folio_test_large(folio) || folio_test_hugetlb(folio))
     57			VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
     58		else
     59			/* Either a PTE-mapped or a PMD-mapped THP. */
     60			VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
     61				       !PageAnonExclusive(page), page);
     62	}
     63}
     64
     65/*
     66 * Return the folio with ref appropriately incremented,
     67 * or NULL if that failed.
     68 */
     69static inline struct folio *try_get_folio(struct page *page, int refs)
     70{
     71	struct folio *folio;
     72
     73retry:
     74	folio = page_folio(page);
     75	if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
     76		return NULL;
     77	if (unlikely(!folio_ref_try_add_rcu(folio, refs)))
     78		return NULL;
     79
     80	/*
     81	 * At this point we have a stable reference to the folio; but it
     82	 * could be that between calling page_folio() and the refcount
     83	 * increment, the folio was split, in which case we'd end up
     84	 * holding a reference on a folio that has nothing to do with the page
     85	 * we were given anymore.
     86	 * So now that the folio is stable, recheck that the page still
     87	 * belongs to this folio.
     88	 */
     89	if (unlikely(page_folio(page) != folio)) {
     90		folio_put_refs(folio, refs);
     91		goto retry;
     92	}
     93
     94	return folio;
     95}
     96
     97/**
     98 * try_grab_folio() - Attempt to get or pin a folio.
     99 * @page:  pointer to page to be grabbed
    100 * @refs:  the value to (effectively) add to the folio's refcount
    101 * @flags: gup flags: these are the FOLL_* flag values.
    102 *
     103 * "grab" names in this file mean "look at flags to decide whether to use
     104 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount".
    105 *
    106 * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
    107 * same time. (That's true throughout the get_user_pages*() and
    108 * pin_user_pages*() APIs.) Cases:
    109 *
    110 *    FOLL_GET: folio's refcount will be incremented by @refs.
    111 *
    112 *    FOLL_PIN on large folios: folio's refcount will be incremented by
    113 *    @refs, and its compound_pincount will be incremented by @refs.
    114 *
    115 *    FOLL_PIN on single-page folios: folio's refcount will be incremented by
    116 *    @refs * GUP_PIN_COUNTING_BIAS.
    117 *
    118 * Return: The folio containing @page (with refcount appropriately
    119 * incremented) for success, or NULL upon failure. If neither FOLL_GET
    120 * nor FOLL_PIN was set, that's considered failure, and furthermore,
    121 * a likely bug in the caller, so a warning is also emitted.
    122 */
    123struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
    124{
    125	if (flags & FOLL_GET)
    126		return try_get_folio(page, refs);
    127	else if (flags & FOLL_PIN) {
    128		struct folio *folio;
    129
    130		/*
    131		 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
    132		 * right zone, so fail and let the caller fall back to the slow
    133		 * path.
    134		 */
    135		if (unlikely((flags & FOLL_LONGTERM) &&
    136			     !is_pinnable_page(page)))
    137			return NULL;
    138
    139		/*
    140		 * CAUTION: Don't use compound_head() on the page before this
    141		 * point, the result won't be stable.
    142		 */
    143		folio = try_get_folio(page, refs);
    144		if (!folio)
    145			return NULL;
    146
    147		/*
    148		 * When pinning a large folio, use an exact count to track it.
    149		 *
    150		 * However, be sure to *also* increment the normal folio
    151		 * refcount field at least once, so that the folio really
    152		 * is pinned.  That's why the refcount from the earlier
    153		 * try_get_folio() is left intact.
    154		 */
    155		if (folio_test_large(folio))
    156			atomic_add(refs, folio_pincount_ptr(folio));
    157		else
    158			folio_ref_add(folio,
    159					refs * (GUP_PIN_COUNTING_BIAS - 1));
    160		node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
    161
    162		return folio;
    163	}
    164
    165	WARN_ON_ONCE(1);
    166	return NULL;
    167}
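/*
 * [Editor's illustrative sketch -- not part of the original mm/gup.c.]
 * The FOLL_PIN accounting above is what page_maybe_dma_pinned() keys off:
 * pinning a small folio adds GUP_PIN_COUNTING_BIAS to its refcount, while
 * large folios track pins exactly in compound_pincount. A minimal
 * caller-side sketch, assuming @uaddr is a valid, writable user address in
 * the current task; the helper name is an assumption:
 */
static __maybe_unused int gup_pin_accounting_sketch(unsigned long uaddr)
{
	struct page *page;
	int ret;

	ret = pin_user_pages_fast(uaddr, 1, FOLL_WRITE, &page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* The elevated refcount makes the pin detectable (heuristically). */
	WARN_ON_ONCE(!page_maybe_dma_pinned(page));

	unpin_user_page(page);		/* undoes the FOLL_PIN accounting */
	return 0;
}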
    168
    169static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
    170{
    171	if (flags & FOLL_PIN) {
    172		node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
    173		if (folio_test_large(folio))
    174			atomic_sub(refs, folio_pincount_ptr(folio));
    175		else
    176			refs *= GUP_PIN_COUNTING_BIAS;
    177	}
    178
    179	folio_put_refs(folio, refs);
    180}
    181
    182/**
    183 * try_grab_page() - elevate a page's refcount by a flag-dependent amount
    184 * @page:    pointer to page to be grabbed
    185 * @flags:   gup flags: these are the FOLL_* flag values.
    186 *
    187 * This might not do anything at all, depending on the flags argument.
    188 *
     189 * "grab" names in this file mean "look at flags to decide whether to use
     190 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount".
    191 *
    192 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
    193 * time. Cases: please see the try_grab_folio() documentation, with
    194 * "refs=1".
    195 *
    196 * Return: true for success, or if no action was required (if neither FOLL_PIN
    197 * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
    198 * FOLL_PIN was set, but the page could not be grabbed.
    199 */
    200bool __must_check try_grab_page(struct page *page, unsigned int flags)
    201{
    202	struct folio *folio = page_folio(page);
    203
    204	WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
    205	if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
    206		return false;
    207
    208	if (flags & FOLL_GET)
    209		folio_ref_inc(folio);
    210	else if (flags & FOLL_PIN) {
    211		/*
    212		 * Similar to try_grab_folio(): be sure to *also*
    213		 * increment the normal page refcount field at least once,
    214		 * so that the page really is pinned.
    215		 */
    216		if (folio_test_large(folio)) {
    217			folio_ref_add(folio, 1);
    218			atomic_add(1, folio_pincount_ptr(folio));
    219		} else {
    220			folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
    221		}
    222
    223		node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
    224	}
    225
    226	return true;
    227}
    228
    229/**
    230 * unpin_user_page() - release a dma-pinned page
    231 * @page:            pointer to page to be released
    232 *
    233 * Pages that were pinned via pin_user_pages*() must be released via either
    234 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
    235 * that such pages can be separately tracked and uniquely handled. In
    236 * particular, interactions with RDMA and filesystems need special handling.
    237 */
    238void unpin_user_page(struct page *page)
    239{
    240	sanity_check_pinned_pages(&page, 1);
    241	gup_put_folio(page_folio(page), 1, FOLL_PIN);
    242}
    243EXPORT_SYMBOL(unpin_user_page);
    244
    245static inline struct folio *gup_folio_range_next(struct page *start,
    246		unsigned long npages, unsigned long i, unsigned int *ntails)
    247{
    248	struct page *next = nth_page(start, i);
    249	struct folio *folio = page_folio(next);
    250	unsigned int nr = 1;
    251
    252	if (folio_test_large(folio))
    253		nr = min_t(unsigned int, npages - i,
    254			   folio_nr_pages(folio) - folio_page_idx(folio, next));
    255
    256	*ntails = nr;
    257	return folio;
    258}
    259
    260static inline struct folio *gup_folio_next(struct page **list,
    261		unsigned long npages, unsigned long i, unsigned int *ntails)
    262{
    263	struct folio *folio = page_folio(list[i]);
    264	unsigned int nr;
    265
    266	for (nr = i + 1; nr < npages; nr++) {
    267		if (page_folio(list[nr]) != folio)
    268			break;
    269	}
    270
    271	*ntails = nr - i;
    272	return folio;
    273}
    274
    275/**
    276 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
    277 * @pages:  array of pages to be maybe marked dirty, and definitely released.
    278 * @npages: number of pages in the @pages array.
    279 * @make_dirty: whether to mark the pages dirty
    280 *
     281 * "gup-pinned page" refers to a page that has had one of the pin_user_pages()
     282 * variants called on that page.
    283 *
    284 * For each page in the @pages array, make that page (or its head page, if a
    285 * compound page) dirty, if @make_dirty is true, and if the page was previously
    286 * listed as clean. In any case, releases all pages using unpin_user_page(),
    287 * possibly via unpin_user_pages(), for the non-dirty case.
    288 *
    289 * Please see the unpin_user_page() documentation for details.
    290 *
    291 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
    292 * required, then the caller should a) verify that this is really correct,
    293 * because _lock() is usually required, and b) hand code it:
    294 * set_page_dirty_lock(), unpin_user_page().
    295 *
    296 */
    297void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
    298				 bool make_dirty)
    299{
    300	unsigned long i;
    301	struct folio *folio;
    302	unsigned int nr;
    303
    304	if (!make_dirty) {
    305		unpin_user_pages(pages, npages);
    306		return;
    307	}
    308
    309	sanity_check_pinned_pages(pages, npages);
    310	for (i = 0; i < npages; i += nr) {
    311		folio = gup_folio_next(pages, npages, i, &nr);
    312		/*
    313		 * Checking PageDirty at this point may race with
    314		 * clear_page_dirty_for_io(), but that's OK. Two key
    315		 * cases:
    316		 *
    317		 * 1) This code sees the page as already dirty, so it
    318		 * skips the call to set_page_dirty(). That could happen
    319		 * because clear_page_dirty_for_io() called
    320		 * page_mkclean(), followed by set_page_dirty().
    321		 * However, now the page is going to get written back,
    322		 * which meets the original intention of setting it
    323		 * dirty, so all is well: clear_page_dirty_for_io() goes
    324		 * on to call TestClearPageDirty(), and write the page
    325		 * back.
    326		 *
    327		 * 2) This code sees the page as clean, so it calls
    328		 * set_page_dirty(). The page stays dirty, despite being
    329		 * written back, so it gets written back again in the
    330		 * next writeback cycle. This is harmless.
    331		 */
    332		if (!folio_test_dirty(folio)) {
    333			folio_lock(folio);
    334			folio_mark_dirty(folio);
    335			folio_unlock(folio);
    336		}
    337		gup_put_folio(folio, nr, FOLL_PIN);
    338	}
    339}
    340EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
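/*
 * [Editor's illustrative sketch -- not part of the original mm/gup.c.]
 * Typical direct-IO / RDMA pattern for the helper above: pin the user
 * buffer, let the device write into it, then drop the pins and mark the
 * pages dirty in a single call. The helper name and buffer parameters are
 * assumptions of the sketch.
 */
static __maybe_unused long pin_dma_then_release_sketch(unsigned long uaddr)
{
	struct page *pages[8];
	long got;

	got = pin_user_pages_fast(uaddr, ARRAY_SIZE(pages), FOLL_WRITE, pages);
	if (got <= 0)
		return got ? got : -EFAULT;

	/* ... program the device to DMA into pages[0..got-1] ... */

	/* Release the pins; make_dirty=true because the device wrote data. */
	unpin_user_pages_dirty_lock(pages, got, true);
	return got;
}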
    341
    342/**
    343 * unpin_user_page_range_dirty_lock() - release and optionally dirty
    344 * gup-pinned page range
    345 *
    346 * @page:  the starting page of a range maybe marked dirty, and definitely released.
    347 * @npages: number of consecutive pages to release.
    348 * @make_dirty: whether to mark the pages dirty
    349 *
    350 * "gup-pinned page range" refers to a range of pages that has had one of the
    351 * pin_user_pages() variants called on that page.
    352 *
    353 * For the page ranges defined by [page .. page+npages], make that range (or
    354 * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
    355 * page range was previously listed as clean.
    356 *
    357 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
    358 * required, then the caller should a) verify that this is really correct,
    359 * because _lock() is usually required, and b) hand code it:
    360 * set_page_dirty_lock(), unpin_user_page().
    361 *
    362 */
    363void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
    364				      bool make_dirty)
    365{
    366	unsigned long i;
    367	struct folio *folio;
    368	unsigned int nr;
    369
    370	for (i = 0; i < npages; i += nr) {
    371		folio = gup_folio_range_next(page, npages, i, &nr);
    372		if (make_dirty && !folio_test_dirty(folio)) {
    373			folio_lock(folio);
    374			folio_mark_dirty(folio);
    375			folio_unlock(folio);
    376		}
    377		gup_put_folio(folio, nr, FOLL_PIN);
    378	}
    379}
    380EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
    381
    382static void unpin_user_pages_lockless(struct page **pages, unsigned long npages)
    383{
    384	unsigned long i;
    385	struct folio *folio;
    386	unsigned int nr;
    387
    388	/*
    389	 * Don't perform any sanity checks because we might have raced with
    390	 * fork() and some anonymous pages might now actually be shared --
    391	 * which is why we're unpinning after all.
    392	 */
    393	for (i = 0; i < npages; i += nr) {
    394		folio = gup_folio_next(pages, npages, i, &nr);
    395		gup_put_folio(folio, nr, FOLL_PIN);
    396	}
    397}
    398
    399/**
    400 * unpin_user_pages() - release an array of gup-pinned pages.
     401 * @pages:  array of pages to be released.
    402 * @npages: number of pages in the @pages array.
    403 *
    404 * For each page in the @pages array, release the page using unpin_user_page().
    405 *
    406 * Please see the unpin_user_page() documentation for details.
    407 */
    408void unpin_user_pages(struct page **pages, unsigned long npages)
    409{
    410	unsigned long i;
    411	struct folio *folio;
    412	unsigned int nr;
    413
    414	/*
    415	 * If this WARN_ON() fires, then the system *might* be leaking pages (by
    416	 * leaving them pinned), but probably not. More likely, gup/pup returned
    417	 * a hard -ERRNO error to the caller, who erroneously passed it here.
    418	 */
    419	if (WARN_ON(IS_ERR_VALUE(npages)))
    420		return;
    421
    422	sanity_check_pinned_pages(pages, npages);
    423	for (i = 0; i < npages; i += nr) {
    424		folio = gup_folio_next(pages, npages, i, &nr);
    425		gup_put_folio(folio, nr, FOLL_PIN);
    426	}
    427}
    428EXPORT_SYMBOL(unpin_user_pages);
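/*
 * [Editor's illustrative sketch -- not part of the original mm/gup.c.]
 * When the pinned pages were only read from (e.g. a device reads the user
 * buffer), there is nothing to dirty and plain unpin_user_pages() is the
 * right release path. FOLL_LONGTERM is shown because long-lived pins must
 * use it so the pages get migrated out of ZONE_MOVABLE/CMA first. The
 * helper name and parameters are assumptions of the sketch.
 */
static __maybe_unused long longterm_readonly_pin_sketch(unsigned long uaddr,
							struct page **pages,
							int nr)
{
	long got;

	got = pin_user_pages_fast(uaddr, nr, FOLL_LONGTERM, pages);
	if (got <= 0)
		return got;

	/* ... device reads from pages[0..got-1] for a long time ... */

	unpin_user_pages(pages, got);
	return got;
}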
    429
    430/*
     431 * Set MMF_HAS_PINNED if it is not set yet; once set, it stays for the mm's
     432 * lifetime.  Avoid setting the bit unless necessary, or it might cause write
    433 * cache bouncing on large SMP machines for concurrent pinned gups.
    434 */
    435static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
    436{
    437	if (!test_bit(MMF_HAS_PINNED, mm_flags))
    438		set_bit(MMF_HAS_PINNED, mm_flags);
    439}
    440
    441#ifdef CONFIG_MMU
    442static struct page *no_page_table(struct vm_area_struct *vma,
    443		unsigned int flags)
    444{
    445	/*
    446	 * When core dumping an enormous anonymous area that nobody
    447	 * has touched so far, we don't want to allocate unnecessary pages or
    448	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
    449	 * then get_dump_page() will return NULL to leave a hole in the dump.
    450	 * But we can only make this optimization where a hole would surely
    451	 * be zero-filled if handle_mm_fault() actually did handle it.
    452	 */
    453	if ((flags & FOLL_DUMP) &&
    454			(vma_is_anonymous(vma) || !vma->vm_ops->fault))
    455		return ERR_PTR(-EFAULT);
    456	return NULL;
    457}
    458
    459static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
    460		pte_t *pte, unsigned int flags)
    461{
    462	if (flags & FOLL_TOUCH) {
    463		pte_t entry = *pte;
    464
    465		if (flags & FOLL_WRITE)
    466			entry = pte_mkdirty(entry);
    467		entry = pte_mkyoung(entry);
    468
    469		if (!pte_same(*pte, entry)) {
    470			set_pte_at(vma->vm_mm, address, pte, entry);
    471			update_mmu_cache(vma, address, pte);
    472		}
    473	}
    474
    475	/* Proper page table entry exists, but no corresponding struct page */
    476	return -EEXIST;
    477}
    478
    479/*
    480 * FOLL_FORCE can write to even unwritable pte's, but only
    481 * after we've gone through a COW cycle and they are dirty.
    482 */
    483static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
    484{
    485	return pte_write(pte) ||
    486		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
    487}
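/*
 * [Editor's illustrative sketch -- not part of the original mm/gup.c.]
 * This is the ptrace-style use of FOLL_FORCE that the comment above
 * describes: writing through a read-only private mapping first breaks COW
 * via the fault path, after which the PTE is dirty-but-unwritable and
 * can_follow_write_pte() lets the lookup through. The helper name and
 * parameters are assumptions of the sketch.
 */
static __maybe_unused int foll_force_poke_sketch(struct mm_struct *mm,
						 unsigned long addr, u8 val)
{
	/* access_remote_vm() routes to GUP with the given gup_flags. */
	return access_remote_vm(mm, addr, &val, sizeof(val),
				FOLL_FORCE | FOLL_WRITE);
}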
    488
    489static struct page *follow_page_pte(struct vm_area_struct *vma,
    490		unsigned long address, pmd_t *pmd, unsigned int flags,
    491		struct dev_pagemap **pgmap)
    492{
    493	struct mm_struct *mm = vma->vm_mm;
    494	struct page *page;
    495	spinlock_t *ptl;
    496	pte_t *ptep, pte;
    497	int ret;
    498
    499	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
    500	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
    501			 (FOLL_PIN | FOLL_GET)))
    502		return ERR_PTR(-EINVAL);
    503retry:
    504	if (unlikely(pmd_bad(*pmd)))
    505		return no_page_table(vma, flags);
    506
    507	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
    508	pte = *ptep;
    509	if (!pte_present(pte)) {
    510		swp_entry_t entry;
    511		/*
    512		 * KSM's break_ksm() relies upon recognizing a ksm page
    513		 * even while it is being migrated, so for that case we
    514		 * need migration_entry_wait().
    515		 */
    516		if (likely(!(flags & FOLL_MIGRATION)))
    517			goto no_page;
    518		if (pte_none(pte))
    519			goto no_page;
    520		entry = pte_to_swp_entry(pte);
    521		if (!is_migration_entry(entry))
    522			goto no_page;
    523		pte_unmap_unlock(ptep, ptl);
    524		migration_entry_wait(mm, pmd, address);
    525		goto retry;
    526	}
    527	if ((flags & FOLL_NUMA) && pte_protnone(pte))
    528		goto no_page;
    529	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
    530		pte_unmap_unlock(ptep, ptl);
    531		return NULL;
    532	}
    533
    534	page = vm_normal_page(vma, address, pte);
    535	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
    536		/*
    537		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
    538		 * case since they are only valid while holding the pgmap
    539		 * reference.
    540		 */
    541		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
    542		if (*pgmap)
    543			page = pte_page(pte);
    544		else
    545			goto no_page;
    546	} else if (unlikely(!page)) {
    547		if (flags & FOLL_DUMP) {
    548			/* Avoid special (like zero) pages in core dumps */
    549			page = ERR_PTR(-EFAULT);
    550			goto out;
    551		}
    552
    553		if (is_zero_pfn(pte_pfn(pte))) {
    554			page = pte_page(pte);
    555		} else {
    556			ret = follow_pfn_pte(vma, address, ptep, flags);
    557			page = ERR_PTR(ret);
    558			goto out;
    559		}
    560	}
    561
    562	if (!pte_write(pte) && gup_must_unshare(flags, page)) {
    563		page = ERR_PTR(-EMLINK);
    564		goto out;
    565	}
    566
    567	VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
    568		       !PageAnonExclusive(page), page);
    569
    570	/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
    571	if (unlikely(!try_grab_page(page, flags))) {
    572		page = ERR_PTR(-ENOMEM);
    573		goto out;
    574	}
    575	/*
    576	 * We need to make the page accessible if and only if we are going
    577	 * to access its content (the FOLL_PIN case).  Please see
    578	 * Documentation/core-api/pin_user_pages.rst for details.
    579	 */
    580	if (flags & FOLL_PIN) {
    581		ret = arch_make_page_accessible(page);
    582		if (ret) {
    583			unpin_user_page(page);
    584			page = ERR_PTR(ret);
    585			goto out;
    586		}
    587	}
    588	if (flags & FOLL_TOUCH) {
    589		if ((flags & FOLL_WRITE) &&
    590		    !pte_dirty(pte) && !PageDirty(page))
    591			set_page_dirty(page);
    592		/*
    593		 * pte_mkyoung() would be more correct here, but atomic care
    594		 * is needed to avoid losing the dirty bit: it is easier to use
    595		 * mark_page_accessed().
    596		 */
    597		mark_page_accessed(page);
    598	}
    599out:
    600	pte_unmap_unlock(ptep, ptl);
    601	return page;
    602no_page:
    603	pte_unmap_unlock(ptep, ptl);
    604	if (!pte_none(pte))
    605		return NULL;
    606	return no_page_table(vma, flags);
    607}
    608
    609static struct page *follow_pmd_mask(struct vm_area_struct *vma,
    610				    unsigned long address, pud_t *pudp,
    611				    unsigned int flags,
    612				    struct follow_page_context *ctx)
    613{
    614	pmd_t *pmd, pmdval;
    615	spinlock_t *ptl;
    616	struct page *page;
    617	struct mm_struct *mm = vma->vm_mm;
    618
    619	pmd = pmd_offset(pudp, address);
    620	/*
    621	 * The READ_ONCE() will stabilize the pmdval in a register or
    622	 * on the stack so that it will stop changing under the code.
    623	 */
    624	pmdval = READ_ONCE(*pmd);
    625	if (pmd_none(pmdval))
    626		return no_page_table(vma, flags);
    627	if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
    628		page = follow_huge_pmd(mm, address, pmd, flags);
    629		if (page)
    630			return page;
    631		return no_page_table(vma, flags);
    632	}
    633	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
    634		page = follow_huge_pd(vma, address,
    635				      __hugepd(pmd_val(pmdval)), flags,
    636				      PMD_SHIFT);
    637		if (page)
    638			return page;
    639		return no_page_table(vma, flags);
    640	}
    641retry:
    642	if (!pmd_present(pmdval)) {
    643		/*
    644		 * Should never reach here, if thp migration is not supported;
    645		 * Otherwise, it must be a thp migration entry.
    646		 */
    647		VM_BUG_ON(!thp_migration_supported() ||
    648				  !is_pmd_migration_entry(pmdval));
    649
    650		if (likely(!(flags & FOLL_MIGRATION)))
    651			return no_page_table(vma, flags);
    652
    653		pmd_migration_entry_wait(mm, pmd);
    654		pmdval = READ_ONCE(*pmd);
    655		/*
    656		 * MADV_DONTNEED may convert the pmd to null because
    657		 * mmap_lock is held in read mode
    658		 */
    659		if (pmd_none(pmdval))
    660			return no_page_table(vma, flags);
    661		goto retry;
    662	}
    663	if (pmd_devmap(pmdval)) {
    664		ptl = pmd_lock(mm, pmd);
    665		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
    666		spin_unlock(ptl);
    667		if (page)
    668			return page;
    669	}
    670	if (likely(!pmd_trans_huge(pmdval)))
    671		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
    672
    673	if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
    674		return no_page_table(vma, flags);
    675
    676retry_locked:
    677	ptl = pmd_lock(mm, pmd);
    678	if (unlikely(pmd_none(*pmd))) {
    679		spin_unlock(ptl);
    680		return no_page_table(vma, flags);
    681	}
    682	if (unlikely(!pmd_present(*pmd))) {
    683		spin_unlock(ptl);
    684		if (likely(!(flags & FOLL_MIGRATION)))
    685			return no_page_table(vma, flags);
    686		pmd_migration_entry_wait(mm, pmd);
    687		goto retry_locked;
    688	}
    689	if (unlikely(!pmd_trans_huge(*pmd))) {
    690		spin_unlock(ptl);
    691		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
    692	}
    693	if (flags & FOLL_SPLIT_PMD) {
    694		int ret;
    695		page = pmd_page(*pmd);
    696		if (is_huge_zero_page(page)) {
    697			spin_unlock(ptl);
    698			ret = 0;
    699			split_huge_pmd(vma, pmd, address);
    700			if (pmd_trans_unstable(pmd))
    701				ret = -EBUSY;
    702		} else {
    703			spin_unlock(ptl);
    704			split_huge_pmd(vma, pmd, address);
    705			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
    706		}
    707
    708		return ret ? ERR_PTR(ret) :
    709			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
    710	}
    711	page = follow_trans_huge_pmd(vma, address, pmd, flags);
    712	spin_unlock(ptl);
    713	ctx->page_mask = HPAGE_PMD_NR - 1;
    714	return page;
    715}
    716
    717static struct page *follow_pud_mask(struct vm_area_struct *vma,
    718				    unsigned long address, p4d_t *p4dp,
    719				    unsigned int flags,
    720				    struct follow_page_context *ctx)
    721{
    722	pud_t *pud;
    723	spinlock_t *ptl;
    724	struct page *page;
    725	struct mm_struct *mm = vma->vm_mm;
    726
    727	pud = pud_offset(p4dp, address);
    728	if (pud_none(*pud))
    729		return no_page_table(vma, flags);
    730	if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
    731		page = follow_huge_pud(mm, address, pud, flags);
    732		if (page)
    733			return page;
    734		return no_page_table(vma, flags);
    735	}
    736	if (is_hugepd(__hugepd(pud_val(*pud)))) {
    737		page = follow_huge_pd(vma, address,
    738				      __hugepd(pud_val(*pud)), flags,
    739				      PUD_SHIFT);
    740		if (page)
    741			return page;
    742		return no_page_table(vma, flags);
    743	}
    744	if (pud_devmap(*pud)) {
    745		ptl = pud_lock(mm, pud);
    746		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
    747		spin_unlock(ptl);
    748		if (page)
    749			return page;
    750	}
    751	if (unlikely(pud_bad(*pud)))
    752		return no_page_table(vma, flags);
    753
    754	return follow_pmd_mask(vma, address, pud, flags, ctx);
    755}
    756
    757static struct page *follow_p4d_mask(struct vm_area_struct *vma,
    758				    unsigned long address, pgd_t *pgdp,
    759				    unsigned int flags,
    760				    struct follow_page_context *ctx)
    761{
    762	p4d_t *p4d;
    763	struct page *page;
    764
    765	p4d = p4d_offset(pgdp, address);
    766	if (p4d_none(*p4d))
    767		return no_page_table(vma, flags);
    768	BUILD_BUG_ON(p4d_huge(*p4d));
    769	if (unlikely(p4d_bad(*p4d)))
    770		return no_page_table(vma, flags);
    771
    772	if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
    773		page = follow_huge_pd(vma, address,
    774				      __hugepd(p4d_val(*p4d)), flags,
    775				      P4D_SHIFT);
    776		if (page)
    777			return page;
    778		return no_page_table(vma, flags);
    779	}
    780	return follow_pud_mask(vma, address, p4d, flags, ctx);
    781}
    782
    783/**
    784 * follow_page_mask - look up a page descriptor from a user-virtual address
    785 * @vma: vm_area_struct mapping @address
    786 * @address: virtual address to look up
    787 * @flags: flags modifying lookup behaviour
    788 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
    789 *       pointer to output page_mask
    790 *
    791 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
    792 *
    793 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
    794 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
    795 *
    796 * When getting an anonymous page and the caller has to trigger unsharing
    797 * of a shared anonymous page first, -EMLINK is returned. The caller should
    798 * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
    799 * relevant with FOLL_PIN and !FOLL_WRITE.
    800 *
    801 * On output, the @ctx->page_mask is set according to the size of the page.
    802 *
    803 * Return: the mapped (struct page *), %NULL if no mapping exists, or
    804 * an error pointer if there is a mapping to something not represented
    805 * by a page descriptor (see also vm_normal_page()).
    806 */
    807static struct page *follow_page_mask(struct vm_area_struct *vma,
    808			      unsigned long address, unsigned int flags,
    809			      struct follow_page_context *ctx)
    810{
    811	pgd_t *pgd;
    812	struct page *page;
    813	struct mm_struct *mm = vma->vm_mm;
    814
    815	ctx->page_mask = 0;
    816
    817	/* make this handle hugepd */
    818	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
    819	if (!IS_ERR(page)) {
    820		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
    821		return page;
    822	}
    823
    824	pgd = pgd_offset(mm, address);
    825
    826	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
    827		return no_page_table(vma, flags);
    828
    829	if (pgd_huge(*pgd)) {
    830		page = follow_huge_pgd(mm, address, pgd, flags);
    831		if (page)
    832			return page;
    833		return no_page_table(vma, flags);
    834	}
    835	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
    836		page = follow_huge_pd(vma, address,
    837				      __hugepd(pgd_val(*pgd)), flags,
    838				      PGDIR_SHIFT);
    839		if (page)
    840			return page;
    841		return no_page_table(vma, flags);
    842	}
    843
    844	return follow_p4d_mask(vma, address, pgd, flags, ctx);
    845}
    846
    847struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
    848			 unsigned int foll_flags)
    849{
    850	struct follow_page_context ctx = { NULL };
    851	struct page *page;
    852
    853	if (vma_is_secretmem(vma))
    854		return NULL;
    855
    856	if (foll_flags & FOLL_PIN)
    857		return NULL;
    858
    859	page = follow_page_mask(vma, address, foll_flags, &ctx);
    860	if (ctx.pgmap)
    861		put_dev_pagemap(ctx.pgmap);
    862	return page;
    863}
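/*
 * [Editor's illustrative sketch -- not part of the original mm/gup.c.]
 * follow_page() is the single-page, mmap_lock-holding lookup used by
 * callers such as the migration and KSM code. A minimal sketch; the helper
 * name and parameters are assumptions:
 */
static __maybe_unused struct page *follow_page_sketch(struct mm_struct *mm,
						      unsigned long addr)
{
	struct vm_area_struct *vma;
	struct page *page = NULL;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (vma) {
		/* FOLL_GET takes a reference; FOLL_DUMP rejects zero pages. */
		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
		if (IS_ERR_OR_NULL(page))
			page = NULL;
	}
	mmap_read_unlock(mm);

	return page;	/* caller must put_page() a non-NULL result */
}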
    864
    865static int get_gate_page(struct mm_struct *mm, unsigned long address,
    866		unsigned int gup_flags, struct vm_area_struct **vma,
    867		struct page **page)
    868{
    869	pgd_t *pgd;
    870	p4d_t *p4d;
    871	pud_t *pud;
    872	pmd_t *pmd;
    873	pte_t *pte;
    874	int ret = -EFAULT;
    875
    876	/* user gate pages are read-only */
    877	if (gup_flags & FOLL_WRITE)
    878		return -EFAULT;
    879	if (address > TASK_SIZE)
    880		pgd = pgd_offset_k(address);
    881	else
    882		pgd = pgd_offset_gate(mm, address);
    883	if (pgd_none(*pgd))
    884		return -EFAULT;
    885	p4d = p4d_offset(pgd, address);
    886	if (p4d_none(*p4d))
    887		return -EFAULT;
    888	pud = pud_offset(p4d, address);
    889	if (pud_none(*pud))
    890		return -EFAULT;
    891	pmd = pmd_offset(pud, address);
    892	if (!pmd_present(*pmd))
    893		return -EFAULT;
    894	VM_BUG_ON(pmd_trans_huge(*pmd));
    895	pte = pte_offset_map(pmd, address);
    896	if (pte_none(*pte))
    897		goto unmap;
    898	*vma = get_gate_vma(mm);
    899	if (!page)
    900		goto out;
    901	*page = vm_normal_page(*vma, address, *pte);
    902	if (!*page) {
    903		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
    904			goto unmap;
    905		*page = pte_page(*pte);
    906	}
    907	if (unlikely(!try_grab_page(*page, gup_flags))) {
    908		ret = -ENOMEM;
    909		goto unmap;
    910	}
    911out:
    912	ret = 0;
    913unmap:
    914	pte_unmap(pte);
    915	return ret;
    916}
    917
    918/*
    919 * mmap_lock must be held on entry.  If @locked != NULL and *@flags
    920 * does not include FOLL_NOWAIT, the mmap_lock may be released.  If it
    921 * is, *@locked will be set to 0 and -EBUSY returned.
    922 */
    923static int faultin_page(struct vm_area_struct *vma,
    924		unsigned long address, unsigned int *flags, bool unshare,
    925		int *locked)
    926{
    927	unsigned int fault_flags = 0;
    928	vm_fault_t ret;
    929
    930	if (*flags & FOLL_NOFAULT)
    931		return -EFAULT;
    932	if (*flags & FOLL_WRITE)
    933		fault_flags |= FAULT_FLAG_WRITE;
    934	if (*flags & FOLL_REMOTE)
    935		fault_flags |= FAULT_FLAG_REMOTE;
    936	if (locked)
    937		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
    938	if (*flags & FOLL_NOWAIT)
    939		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
    940	if (*flags & FOLL_TRIED) {
    941		/*
    942		 * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
    943		 * can co-exist
    944		 */
    945		fault_flags |= FAULT_FLAG_TRIED;
    946	}
    947	if (unshare) {
    948		fault_flags |= FAULT_FLAG_UNSHARE;
    949		/* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
    950		VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
    951	}
    952
    953	ret = handle_mm_fault(vma, address, fault_flags, NULL);
    954	if (ret & VM_FAULT_ERROR) {
    955		int err = vm_fault_to_errno(ret, *flags);
    956
    957		if (err)
    958			return err;
    959		BUG();
    960	}
    961
    962	if (ret & VM_FAULT_RETRY) {
    963		if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
    964			*locked = 0;
    965		return -EBUSY;
    966	}
    967
    968	/*
    969	 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
    970	 * necessary, even if maybe_mkwrite decided not to set pte_write. We
    971	 * can thus safely do subsequent page lookups as if they were reads.
    972	 * But only do so when looping for pte_write is futile: in some cases
    973	 * userspace may also be wanting to write to the gotten user page,
    974	 * which a read fault here might prevent (a readonly page might get
    975	 * reCOWed by userspace write).
    976	 */
    977	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
    978		*flags |= FOLL_COW;
    979	return 0;
    980}
    981
    982static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
    983{
    984	vm_flags_t vm_flags = vma->vm_flags;
    985	int write = (gup_flags & FOLL_WRITE);
    986	int foreign = (gup_flags & FOLL_REMOTE);
    987
    988	if (vm_flags & (VM_IO | VM_PFNMAP))
    989		return -EFAULT;
    990
    991	if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
    992		return -EFAULT;
    993
    994	if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
    995		return -EOPNOTSUPP;
    996
    997	if (vma_is_secretmem(vma))
    998		return -EFAULT;
    999
   1000	if (write) {
   1001		if (!(vm_flags & VM_WRITE)) {
   1002			if (!(gup_flags & FOLL_FORCE))
   1003				return -EFAULT;
   1004			/*
   1005			 * We used to let the write,force case do COW in a
   1006			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
   1007			 * set a breakpoint in a read-only mapping of an
   1008			 * executable, without corrupting the file (yet only
   1009			 * when that file had been opened for writing!).
   1010			 * Anon pages in shared mappings are surprising: now
   1011			 * just reject it.
   1012			 */
   1013			if (!is_cow_mapping(vm_flags))
   1014				return -EFAULT;
   1015		}
   1016	} else if (!(vm_flags & VM_READ)) {
   1017		if (!(gup_flags & FOLL_FORCE))
   1018			return -EFAULT;
   1019		/*
   1020		 * Is there actually any vma we can reach here which does not
   1021		 * have VM_MAYREAD set?
   1022		 */
   1023		if (!(vm_flags & VM_MAYREAD))
   1024			return -EFAULT;
   1025	}
   1026	/*
   1027	 * gups are always data accesses, not instruction
   1028	 * fetches, so execute=false here
   1029	 */
   1030	if (!arch_vma_access_permitted(vma, write, false, foreign))
   1031		return -EFAULT;
   1032	return 0;
   1033}
   1034
   1035/**
   1036 * __get_user_pages() - pin user pages in memory
   1037 * @mm:		mm_struct of target mm
   1038 * @start:	starting user address
   1039 * @nr_pages:	number of pages from start to pin
   1040 * @gup_flags:	flags modifying pin behaviour
   1041 * @pages:	array that receives pointers to the pages pinned.
   1042 *		Should be at least nr_pages long. Or NULL, if caller
   1043 *		only intends to ensure the pages are faulted in.
   1044 * @vmas:	array of pointers to vmas corresponding to each page.
   1045 *		Or NULL if the caller does not require them.
   1046 * @locked:     whether we're still with the mmap_lock held
   1047 *
   1048 * Returns either number of pages pinned (which may be less than the
   1049 * number requested), or an error. Details about the return value:
   1050 *
   1051 * -- If nr_pages is 0, returns 0.
   1052 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
   1053 * -- If nr_pages is >0, and some pages were pinned, returns the number of
   1054 *    pages pinned. Again, this may be less than nr_pages.
   1055 * -- 0 return value is possible when the fault would need to be retried.
   1056 *
   1057 * The caller is responsible for releasing returned @pages, via put_page().
   1058 *
   1059 * @vmas are valid only as long as mmap_lock is held.
   1060 *
   1061 * Must be called with mmap_lock held.  It may be released.  See below.
   1062 *
   1063 * __get_user_pages walks a process's page tables and takes a reference to
   1064 * each struct page that each user address corresponds to at a given
   1065 * instant. That is, it takes the page that would be accessed if a user
   1066 * thread accesses the given user virtual address at that instant.
   1067 *
   1068 * This does not guarantee that the page exists in the user mappings when
   1069 * __get_user_pages returns, and there may even be a completely different
   1070 * page there in some cases (eg. if mmapped pagecache has been invalidated
    1071 * and subsequently re-faulted). However, it does guarantee that the page
   1072 * won't be freed completely. And mostly callers simply care that the page
   1073 * contains data that was valid *at some point in time*. Typically, an IO
   1074 * or similar operation cannot guarantee anything stronger anyway because
   1075 * locks can't be held over the syscall boundary.
   1076 *
   1077 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
   1078 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
   1079 * appropriate) must be called after the page is finished with, and
   1080 * before put_page is called.
   1081 *
   1082 * If @locked != NULL, *@locked will be set to 0 when mmap_lock is
   1083 * released by an up_read().  That can happen if @gup_flags does not
   1084 * have FOLL_NOWAIT.
   1085 *
   1086 * A caller using such a combination of @locked and @gup_flags
   1087 * must therefore hold the mmap_lock for reading only, and recognize
   1088 * when it's been released.  Otherwise, it must be held for either
   1089 * reading or writing and will not be released.
   1090 *
   1091 * In most cases, get_user_pages or get_user_pages_fast should be used
   1092 * instead of __get_user_pages. __get_user_pages should be used only if
   1093 * you need some special @gup_flags.
   1094 */
   1095static long __get_user_pages(struct mm_struct *mm,
   1096		unsigned long start, unsigned long nr_pages,
   1097		unsigned int gup_flags, struct page **pages,
   1098		struct vm_area_struct **vmas, int *locked)
   1099{
   1100	long ret = 0, i = 0;
   1101	struct vm_area_struct *vma = NULL;
   1102	struct follow_page_context ctx = { NULL };
   1103
   1104	if (!nr_pages)
   1105		return 0;
   1106
   1107	start = untagged_addr(start);
   1108
   1109	VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
   1110
   1111	/*
   1112	 * If FOLL_FORCE is set then do not force a full fault as the hinting
   1113	 * fault information is unrelated to the reference behaviour of a task
   1114	 * using the address space
   1115	 */
   1116	if (!(gup_flags & FOLL_FORCE))
   1117		gup_flags |= FOLL_NUMA;
   1118
   1119	do {
   1120		struct page *page;
   1121		unsigned int foll_flags = gup_flags;
   1122		unsigned int page_increm;
   1123
    1124		/* first iteration or crossing a vma boundary */
   1125		if (!vma || start >= vma->vm_end) {
   1126			vma = find_extend_vma(mm, start);
   1127			if (!vma && in_gate_area(mm, start)) {
   1128				ret = get_gate_page(mm, start & PAGE_MASK,
   1129						gup_flags, &vma,
   1130						pages ? &pages[i] : NULL);
   1131				if (ret)
   1132					goto out;
   1133				ctx.page_mask = 0;
   1134				goto next_page;
   1135			}
   1136
   1137			if (!vma) {
   1138				ret = -EFAULT;
   1139				goto out;
   1140			}
   1141			ret = check_vma_flags(vma, gup_flags);
   1142			if (ret)
   1143				goto out;
   1144
   1145			if (is_vm_hugetlb_page(vma)) {
   1146				i = follow_hugetlb_page(mm, vma, pages, vmas,
   1147						&start, &nr_pages, i,
   1148						gup_flags, locked);
   1149				if (locked && *locked == 0) {
   1150					/*
   1151					 * We've got a VM_FAULT_RETRY
   1152					 * and we've lost mmap_lock.
   1153					 * We must stop here.
   1154					 */
   1155					BUG_ON(gup_flags & FOLL_NOWAIT);
   1156					goto out;
   1157				}
   1158				continue;
   1159			}
   1160		}
   1161retry:
   1162		/*
   1163		 * If we have a pending SIGKILL, don't keep faulting pages and
   1164		 * potentially allocating memory.
   1165		 */
   1166		if (fatal_signal_pending(current)) {
   1167			ret = -EINTR;
   1168			goto out;
   1169		}
   1170		cond_resched();
   1171
   1172		page = follow_page_mask(vma, start, foll_flags, &ctx);
   1173		if (!page || PTR_ERR(page) == -EMLINK) {
   1174			ret = faultin_page(vma, start, &foll_flags,
   1175					   PTR_ERR(page) == -EMLINK, locked);
   1176			switch (ret) {
   1177			case 0:
   1178				goto retry;
   1179			case -EBUSY:
   1180				ret = 0;
   1181				fallthrough;
   1182			case -EFAULT:
   1183			case -ENOMEM:
   1184			case -EHWPOISON:
   1185				goto out;
   1186			}
   1187			BUG();
   1188		} else if (PTR_ERR(page) == -EEXIST) {
   1189			/*
   1190			 * Proper page table entry exists, but no corresponding
   1191			 * struct page. If the caller expects **pages to be
   1192			 * filled in, bail out now, because that can't be done
   1193			 * for this page.
   1194			 */
   1195			if (pages) {
   1196				ret = PTR_ERR(page);
   1197				goto out;
   1198			}
   1199
   1200			goto next_page;
   1201		} else if (IS_ERR(page)) {
   1202			ret = PTR_ERR(page);
   1203			goto out;
   1204		}
   1205		if (pages) {
   1206			pages[i] = page;
   1207			flush_anon_page(vma, page, start);
   1208			flush_dcache_page(page);
   1209			ctx.page_mask = 0;
   1210		}
   1211next_page:
   1212		if (vmas) {
   1213			vmas[i] = vma;
   1214			ctx.page_mask = 0;
   1215		}
   1216		page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
   1217		if (page_increm > nr_pages)
   1218			page_increm = nr_pages;
   1219		i += page_increm;
   1220		start += page_increm * PAGE_SIZE;
   1221		nr_pages -= page_increm;
   1222	} while (nr_pages);
   1223out:
   1224	if (ctx.pgmap)
   1225		put_dev_pagemap(ctx.pgmap);
   1226	return i ? i : ret;
   1227}
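/*
 * [Editor's illustrative sketch -- not part of the original mm/gup.c.]
 * The return-value contract documented above (possibly fewer pages than
 * requested, or -errno when nothing was pinned) is what callers of the
 * public wrappers have to handle. A sketch using get_user_pages(); the
 * helper name and parameters are assumptions:
 */
static __maybe_unused long gup_partial_result_sketch(unsigned long start,
						     unsigned long nr_pages,
						     struct page **pages)
{
	long got, i;

	mmap_read_lock(current->mm);
	got = get_user_pages(start, nr_pages, FOLL_WRITE, pages, NULL);
	mmap_read_unlock(current->mm);
	if (got <= 0)
		return got;		/* 0 or -errno: nothing to release */

	/* ... use pages[0..got-1]; fewer than nr_pages is not an error ... */

	for (i = 0; i < got; i++)
		put_page(pages[i]);	/* release the FOLL_GET references */
	return got;
}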
   1228
   1229static bool vma_permits_fault(struct vm_area_struct *vma,
   1230			      unsigned int fault_flags)
   1231{
   1232	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
   1233	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
   1234	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
   1235
   1236	if (!(vm_flags & vma->vm_flags))
   1237		return false;
   1238
   1239	/*
   1240	 * The architecture might have a hardware protection
   1241	 * mechanism other than read/write that can deny access.
   1242	 *
   1243	 * gup always represents data access, not instruction
   1244	 * fetches, so execute=false here:
   1245	 */
   1246	if (!arch_vma_access_permitted(vma, write, false, foreign))
   1247		return false;
   1248
   1249	return true;
   1250}
   1251
   1252/**
   1253 * fixup_user_fault() - manually resolve a user page fault
   1254 * @mm:		mm_struct of target mm
   1255 * @address:	user address
    1256 * @fault_flags: flags to pass down to handle_mm_fault()
   1257 * @unlocked:	did we unlock the mmap_lock while retrying, maybe NULL if caller
   1258 *		does not allow retry. If NULL, the caller must guarantee
   1259 *		that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
   1260 *
   1261 * This is meant to be called in the specific scenario where for locking reasons
   1262 * we try to access user memory in atomic context (within a pagefault_disable()
   1263 * section), this returns -EFAULT, and we want to resolve the user fault before
   1264 * trying again.
   1265 *
   1266 * Typically this is meant to be used by the futex code.
   1267 *
   1268 * The main difference with get_user_pages() is that this function will
   1269 * unconditionally call handle_mm_fault() which will in turn perform all the
   1270 * necessary SW fixup of the dirty and young bits in the PTE, while
   1271 * get_user_pages() only guarantees to update these in the struct page.
   1272 *
   1273 * This is important for some architectures where those bits also gate the
   1274 * access permission to the page because they are maintained in software.  On
   1275 * such architectures, gup() will not be enough to make a subsequent access
   1276 * succeed.
   1277 *
    1278 * This function will not return with an unlocked mmap_lock. So it does not
    1279 * have the same semantics wrt the @mm->mmap_lock as filemap_fault() does.
   1280 */
   1281int fixup_user_fault(struct mm_struct *mm,
   1282		     unsigned long address, unsigned int fault_flags,
   1283		     bool *unlocked)
   1284{
   1285	struct vm_area_struct *vma;
   1286	vm_fault_t ret;
   1287
   1288	address = untagged_addr(address);
   1289
   1290	if (unlocked)
   1291		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
   1292
   1293retry:
   1294	vma = find_extend_vma(mm, address);
   1295	if (!vma || address < vma->vm_start)
   1296		return -EFAULT;
   1297
   1298	if (!vma_permits_fault(vma, fault_flags))
   1299		return -EFAULT;
   1300
   1301	if ((fault_flags & FAULT_FLAG_KILLABLE) &&
   1302	    fatal_signal_pending(current))
   1303		return -EINTR;
   1304
   1305	ret = handle_mm_fault(vma, address, fault_flags, NULL);
   1306	if (ret & VM_FAULT_ERROR) {
   1307		int err = vm_fault_to_errno(ret, 0);
   1308
   1309		if (err)
   1310			return err;
   1311		BUG();
   1312	}
   1313
   1314	if (ret & VM_FAULT_RETRY) {
   1315		mmap_read_lock(mm);
   1316		*unlocked = true;
   1317		fault_flags |= FAULT_FLAG_TRIED;
   1318		goto retry;
   1319	}
   1320
   1321	return 0;
   1322}
   1323EXPORT_SYMBOL_GPL(fixup_user_fault);
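/*
 * [Editor's illustrative sketch -- not part of the original mm/gup.c.]
 * Futex-style use of fixup_user_fault(): an access done under
 * pagefault_disable() has just failed with -EFAULT, so resolve the fault
 * with the mmap_lock held and then retry the access. The helper name and
 * parameters are assumptions of the sketch.
 */
static __maybe_unused int fixup_user_fault_sketch(struct mm_struct *mm,
						  unsigned long address)
{
	bool unlocked = false;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, address, FAULT_FLAG_WRITE, &unlocked);
	/* fixup_user_fault() never returns with the mmap_lock dropped. */
	mmap_read_unlock(mm);

	/* On success (ret == 0), the caller retries the original access. */
	return ret;
}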
   1324
   1325/*
    1326 * Please note that this function, unlike __get_user_pages(), will not
    1327 * return 0 for nr_pages > 0 without FOLL_NOWAIT.
   1328 */
   1329static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
   1330						unsigned long start,
   1331						unsigned long nr_pages,
   1332						struct page **pages,
   1333						struct vm_area_struct **vmas,
   1334						int *locked,
   1335						unsigned int flags)
   1336{
   1337	long ret, pages_done;
   1338	bool lock_dropped;
   1339
   1340	if (locked) {
   1341		/* if VM_FAULT_RETRY can be returned, vmas become invalid */
   1342		BUG_ON(vmas);
   1343		/* check caller initialized locked */
   1344		BUG_ON(*locked != 1);
   1345	}
   1346
   1347	if (flags & FOLL_PIN)
   1348		mm_set_has_pinned_flag(&mm->flags);
   1349
   1350	/*
   1351	 * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
   1352	 * is to set FOLL_GET if the caller wants pages[] filled in (but has
   1353	 * carelessly failed to specify FOLL_GET), so keep doing that, but only
   1354	 * for FOLL_GET, not for the newer FOLL_PIN.
   1355	 *
   1356	 * FOLL_PIN always expects pages to be non-null, but no need to assert
   1357	 * that here, as any failures will be obvious enough.
   1358	 */
   1359	if (pages && !(flags & FOLL_PIN))
   1360		flags |= FOLL_GET;
   1361
   1362	pages_done = 0;
   1363	lock_dropped = false;
   1364	for (;;) {
   1365		ret = __get_user_pages(mm, start, nr_pages, flags, pages,
   1366				       vmas, locked);
   1367		if (!locked)
   1368			/* VM_FAULT_RETRY couldn't trigger, bypass */
   1369			return ret;
   1370
   1371		/* VM_FAULT_RETRY cannot return errors */
   1372		if (!*locked) {
   1373			BUG_ON(ret < 0);
   1374			BUG_ON(ret >= nr_pages);
   1375		}
   1376
   1377		if (ret > 0) {
   1378			nr_pages -= ret;
   1379			pages_done += ret;
   1380			if (!nr_pages)
   1381				break;
   1382		}
   1383		if (*locked) {
   1384			/*
   1385			 * VM_FAULT_RETRY didn't trigger or it was a
   1386			 * FOLL_NOWAIT.
   1387			 */
   1388			if (!pages_done)
   1389				pages_done = ret;
   1390			break;
   1391		}
   1392		/*
   1393		 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
   1394		 * For the prefault case (!pages) we only update counts.
   1395		 */
   1396		if (likely(pages))
   1397			pages += ret;
   1398		start += ret << PAGE_SHIFT;
   1399		lock_dropped = true;
   1400
   1401retry:
   1402		/*
   1403		 * Repeat on the address that fired VM_FAULT_RETRY
   1404		 * with both FAULT_FLAG_ALLOW_RETRY and
   1405		 * FAULT_FLAG_TRIED.  Note that GUP can be interrupted
   1406		 * by fatal signals, so we need to check it before we
   1407		 * start trying again otherwise it can loop forever.
   1408		 */
   1409
   1410		if (fatal_signal_pending(current)) {
   1411			if (!pages_done)
   1412				pages_done = -EINTR;
   1413			break;
   1414		}
   1415
   1416		ret = mmap_read_lock_killable(mm);
   1417		if (ret) {
   1418			BUG_ON(ret > 0);
   1419			if (!pages_done)
   1420				pages_done = ret;
   1421			break;
   1422		}
   1423
   1424		*locked = 1;
   1425		ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
   1426				       pages, NULL, locked);
   1427		if (!*locked) {
    1428			/* Continue to retry until we succeed */
   1429			BUG_ON(ret != 0);
   1430			goto retry;
   1431		}
   1432		if (ret != 1) {
   1433			BUG_ON(ret > 1);
   1434			if (!pages_done)
   1435				pages_done = ret;
   1436			break;
   1437		}
   1438		nr_pages--;
   1439		pages_done++;
   1440		if (!nr_pages)
   1441			break;
   1442		if (likely(pages))
   1443			pages++;
   1444		start += PAGE_SIZE;
   1445	}
   1446	if (lock_dropped && *locked) {
   1447		/*
   1448		 * We must let the caller know we temporarily dropped the lock
   1449		 * and so the critical section protected by it was lost.
   1450		 */
   1451		mmap_read_unlock(mm);
   1452		*locked = 0;
   1453	}
   1454	return pages_done;
   1455}
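/*
 * [Editor's illustrative sketch -- not part of the original mm/gup.c.]
 * The "*locked" contract implemented above, seen from a caller of the
 * public get_user_pages_remote(): if the fault path had to drop the
 * mmap_lock, *locked is cleared and the caller must not unlock again.
 * The helper name and parameters are assumptions of the sketch.
 */
static __maybe_unused long gup_remote_locked_sketch(struct mm_struct *mm,
						    unsigned long start,
						    unsigned long nr_pages,
						    struct page **pages)
{
	int locked = 1;
	long got;

	mmap_read_lock(mm);
	got = get_user_pages_remote(mm, start, nr_pages, FOLL_WRITE,
				    pages, NULL, &locked);
	if (locked)
		mmap_read_unlock(mm);
	/* If !locked, the lock was already dropped on our behalf. */

	return got;
}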
   1456
   1457/**
   1458 * populate_vma_page_range() -  populate a range of pages in the vma.
   1459 * @vma:   target vma
   1460 * @start: start address
   1461 * @end:   end address
   1462 * @locked: whether the mmap_lock is still held
   1463 *
   1464 * This takes care of mlocking the pages too if VM_LOCKED is set.
   1465 *
   1466 * Return either number of pages pinned in the vma, or a negative error
   1467 * code on error.
   1468 *
   1469 * vma->vm_mm->mmap_lock must be held.
   1470 *
   1471 * If @locked is NULL, it may be held for read or write and will
   1472 * be unperturbed.
   1473 *
    1474 * If @locked is non-NULL, it must be held for read only and may be
   1475 * released.  If it's released, *@locked will be set to 0.
   1476 */
   1477long populate_vma_page_range(struct vm_area_struct *vma,
   1478		unsigned long start, unsigned long end, int *locked)
   1479{
   1480	struct mm_struct *mm = vma->vm_mm;
   1481	unsigned long nr_pages = (end - start) / PAGE_SIZE;
   1482	int gup_flags;
   1483	long ret;
   1484
   1485	VM_BUG_ON(!PAGE_ALIGNED(start));
   1486	VM_BUG_ON(!PAGE_ALIGNED(end));
   1487	VM_BUG_ON_VMA(start < vma->vm_start, vma);
   1488	VM_BUG_ON_VMA(end   > vma->vm_end, vma);
   1489	mmap_assert_locked(mm);
   1490
   1491	/*
   1492	 * Rightly or wrongly, the VM_LOCKONFAULT case has never used
   1493	 * faultin_page() to break COW, so it has no work to do here.
   1494	 */
   1495	if (vma->vm_flags & VM_LOCKONFAULT)
   1496		return nr_pages;
   1497
   1498	gup_flags = FOLL_TOUCH;
   1499	/*
   1500	 * We want to touch writable mappings with a write fault in order
   1501	 * to break COW, except for shared mappings because these don't COW
   1502	 * and we would not want to dirty them for nothing.
   1503	 */
   1504	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
   1505		gup_flags |= FOLL_WRITE;
   1506
   1507	/*
   1508	 * We want mlock to succeed for regions that have any permissions
   1509	 * other than PROT_NONE.
   1510	 */
   1511	if (vma_is_accessible(vma))
   1512		gup_flags |= FOLL_FORCE;
   1513
   1514	/*
   1515	 * We made sure addr is within a VMA, so the following will
   1516	 * not result in a stack expansion that recurses back here.
   1517	 */
   1518	ret = __get_user_pages(mm, start, nr_pages, gup_flags,
   1519				NULL, NULL, locked);
   1520	lru_add_drain();
   1521	return ret;
   1522}
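/*
 * [Editor's illustrative sketch -- not part of the original mm/gup.c.]
 * mlock()-style use of the helper above: with the mmap_lock held, fault in
 * (and, if VM_LOCKED, mlock) every page of a single VMA. Passing a NULL
 * @locked means the lock is never dropped. The helper name is an
 * assumption of the sketch.
 */
static __maybe_unused long populate_whole_vma_sketch(struct vm_area_struct *vma)
{
	return populate_vma_page_range(vma, vma->vm_start, vma->vm_end, NULL);
}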
   1523
   1524/*
   1525 * faultin_vma_page_range() - populate (prefault) page tables inside the
   1526 *			      given VMA range readable/writable
   1527 *
   1528 * This takes care of mlocking the pages, too, if VM_LOCKED is set.
   1529 *
   1530 * @vma: target vma
   1531 * @start: start address
   1532 * @end: end address
   1533 * @write: whether to prefault readable or writable
   1534 * @locked: whether the mmap_lock is still held
   1535 *
   1536 * Returns either number of processed pages in the vma, or a negative error
   1537 * code on error (see __get_user_pages()).
   1538 *
   1539 * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
   1540 * covered by the VMA.
   1541 *
   1542 * If @locked is NULL, it may be held for read or write and will be unperturbed.
   1543 *
    1544 * If @locked is non-NULL, it must be held for read only and may be released.  If
   1545 * it's released, *@locked will be set to 0.
   1546 */
   1547long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
   1548			    unsigned long end, bool write, int *locked)
   1549{
   1550	struct mm_struct *mm = vma->vm_mm;
   1551	unsigned long nr_pages = (end - start) / PAGE_SIZE;
   1552	int gup_flags;
   1553	long ret;
   1554
   1555	VM_BUG_ON(!PAGE_ALIGNED(start));
   1556	VM_BUG_ON(!PAGE_ALIGNED(end));
   1557	VM_BUG_ON_VMA(start < vma->vm_start, vma);
   1558	VM_BUG_ON_VMA(end > vma->vm_end, vma);
   1559	mmap_assert_locked(mm);
   1560
   1561	/*
   1562	 * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
   1563	 *	       the page dirty with FOLL_WRITE -- which doesn't make a
   1564	 *	       difference with !FOLL_FORCE, because the page is writable
   1565	 *	       in the page table.
   1566	 * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
   1567	 *		  a poisoned page.
   1568	 * !FOLL_FORCE: Require proper access permissions.
   1569	 */
   1570	gup_flags = FOLL_TOUCH | FOLL_HWPOISON;
   1571	if (write)
   1572		gup_flags |= FOLL_WRITE;
   1573
   1574	/*
   1575	 * We want to report -EINVAL instead of -EFAULT for any permission
   1576	 * problems or incompatible mappings.
   1577	 */
   1578	if (check_vma_flags(vma, gup_flags))
   1579		return -EINVAL;
   1580
   1581	ret = __get_user_pages(mm, start, nr_pages, gup_flags,
   1582				NULL, NULL, locked);
   1583	lru_add_drain();
   1584	return ret;
   1585}
   1586
   1587/*
   1588 * __mm_populate - populate and/or mlock pages within a range of address space.
   1589 *
   1590 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
   1591 * flags. VMAs must be already marked with the desired vm_flags, and
   1592 * mmap_lock must not be held.
   1593 */
   1594int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
   1595{
   1596	struct mm_struct *mm = current->mm;
   1597	unsigned long end, nstart, nend;
   1598	struct vm_area_struct *vma = NULL;
   1599	int locked = 0;
   1600	long ret = 0;
   1601
   1602	end = start + len;
   1603
   1604	for (nstart = start; nstart < end; nstart = nend) {
   1605		/*
   1606		 * We want to fault in pages for [nstart; end) address range.
   1607		 * Find first corresponding VMA.
   1608		 */
   1609		if (!locked) {
   1610			locked = 1;
   1611			mmap_read_lock(mm);
   1612			vma = find_vma(mm, nstart);
   1613		} else if (nstart >= vma->vm_end)
   1614			vma = vma->vm_next;
   1615		if (!vma || vma->vm_start >= end)
   1616			break;
   1617		/*
   1618		 * Set [nstart; nend) to intersection of desired address
   1619		 * range with the first VMA. Also, skip undesirable VMA types.
   1620		 */
   1621		nend = min(end, vma->vm_end);
   1622		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
   1623			continue;
   1624		if (nstart < vma->vm_start)
   1625			nstart = vma->vm_start;
   1626		/*
   1627		 * Now fault in a range of pages. populate_vma_page_range()
   1628		 * double checks the vma flags, so that it won't mlock pages
   1629		 * if the vma was already munlocked.
   1630		 */
   1631		ret = populate_vma_page_range(vma, nstart, nend, &locked);
   1632		if (ret < 0) {
   1633			if (ignore_errors) {
   1634				ret = 0;
   1635				continue;	/* continue at next VMA */
   1636			}
   1637			break;
   1638		}
   1639		nend = nstart + ret * PAGE_SIZE;
   1640		ret = 0;
   1641	}
   1642	if (locked)
   1643		mmap_read_unlock(mm);
   1644	return ret;	/* 0 or negative error code */
   1645}
   1646#else /* CONFIG_MMU */
   1647static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
   1648		unsigned long nr_pages, struct page **pages,
   1649		struct vm_area_struct **vmas, int *locked,
   1650		unsigned int foll_flags)
   1651{
   1652	struct vm_area_struct *vma;
   1653	unsigned long vm_flags;
   1654	long i;
   1655
   1656	/* calculate required read or write permissions.
   1657	 * If FOLL_FORCE is set, we only require the "MAY" flags.
   1658	 */
   1659	vm_flags  = (foll_flags & FOLL_WRITE) ?
   1660			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
   1661	vm_flags &= (foll_flags & FOLL_FORCE) ?
   1662			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
   1663
   1664	for (i = 0; i < nr_pages; i++) {
   1665		vma = find_vma(mm, start);
   1666		if (!vma)
   1667			goto finish_or_fault;
   1668
   1669		/* protect what we can, including chardevs */
   1670		if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
   1671		    !(vm_flags & vma->vm_flags))
   1672			goto finish_or_fault;
   1673
   1674		if (pages) {
   1675			pages[i] = virt_to_page(start);
   1676			if (pages[i])
   1677				get_page(pages[i]);
   1678		}
   1679		if (vmas)
   1680			vmas[i] = vma;
   1681		start = (start + PAGE_SIZE) & PAGE_MASK;
   1682	}
   1683
   1684	return i;
   1685
   1686finish_or_fault:
   1687	return i ? : -EFAULT;
   1688}
   1689#endif /* !CONFIG_MMU */
   1690
   1691/**
   1692 * fault_in_writeable - fault in userspace address range for writing
   1693 * @uaddr: start of address range
   1694 * @size: size of address range
   1695 *
   1696 * Returns the number of bytes not faulted in (like copy_to_user() and
   1697 * copy_from_user()).
   1698 */
   1699size_t fault_in_writeable(char __user *uaddr, size_t size)
   1700{
   1701	char __user *start = uaddr, *end;
   1702
   1703	if (unlikely(size == 0))
   1704		return 0;
   1705	if (!user_write_access_begin(uaddr, size))
   1706		return size;
   1707	if (!PAGE_ALIGNED(uaddr)) {
   1708		unsafe_put_user(0, uaddr, out);
   1709		uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
   1710	}
   1711	end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
   1712	if (unlikely(end < start))
   1713		end = NULL;
   1714	while (uaddr != end) {
   1715		unsafe_put_user(0, uaddr, out);
   1716		uaddr += PAGE_SIZE;
   1717	}
   1718
   1719out:
   1720	user_write_access_end();
   1721	if (size > uaddr - start)
   1722		return size - (uaddr - start);
   1723	return 0;
   1724}
   1725EXPORT_SYMBOL(fault_in_writeable);
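
/*
 * Illustrative sketch only (not upstream code): a caller that writes into a
 * user buffer and, on a partial copy, faults the remainder in with
 * fault_in_writeable() before retrying.  The names
 * "example_copy_to_user_faultin", "dst", "src" and "len" are hypothetical.
 */
static __maybe_unused ssize_t example_copy_to_user_faultin(char __user *dst,
							    const void *src,
							    size_t len)
{
	size_t left;

	do {
		/* copy_to_user() returns the number of bytes NOT copied. */
		left = copy_to_user(dst, src, len);
		if (!left)
			return len;
		/* Fault in the uncopied tail; give up if nothing could be faulted in. */
	} while (fault_in_writeable(dst + (len - left), left) != left);

	return (len - left) ? (ssize_t)(len - left) : -EFAULT;
}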
   1726
   1727/**
   1728 * fault_in_subpage_writeable - fault in an address range for writing
   1729 * @uaddr: start of address range
   1730 * @size: size of address range
   1731 *
   1732 * Fault in a user address range for writing while checking for permissions at
   1733 * sub-page granularity (e.g. arm64 MTE). This function should be used when
   1734 * the caller cannot guarantee forward progress of a copy_to_user() loop.
   1735 *
   1736 * Returns the number of bytes not faulted in (like copy_to_user() and
   1737 * copy_from_user()).
   1738 */
   1739size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
   1740{
   1741	size_t faulted_in;
   1742
   1743	/*
   1744	 * Attempt faulting in at page granularity first for page table
   1745	 * permission checking. The arch-specific probe_subpage_writeable()
   1746	 * functions may not check for this.
   1747	 */
   1748	faulted_in = size - fault_in_writeable(uaddr, size);
   1749	if (faulted_in)
   1750		faulted_in -= probe_subpage_writeable(uaddr, faulted_in);
   1751
   1752	return size - faulted_in;
   1753}
   1754EXPORT_SYMBOL(fault_in_subpage_writeable);
   1755
   1756/*
   1757 * fault_in_safe_writeable - fault in an address range for writing
   1758 * @uaddr: start of address range
   1759 * @size: length of address range
   1760 *
   1761 * Faults in an address range for writing.  This is primarily useful when we
   1762 * already know that some or all of the pages in the address range aren't in
   1763 * memory.
   1764 *
   1765 * Unlike fault_in_writeable(), this function is non-destructive.
   1766 *
    1767 * Note that we don't pin or otherwise hold a reference on the pages we fault
   1768 * in.  There's no guarantee that they'll stay in memory for any duration of
   1769 * time.
   1770 *
   1771 * Returns the number of bytes not faulted in, like copy_to_user() and
   1772 * copy_from_user().
   1773 */
   1774size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
   1775{
   1776	unsigned long start = (unsigned long)uaddr, end;
   1777	struct mm_struct *mm = current->mm;
   1778	bool unlocked = false;
   1779
   1780	if (unlikely(size == 0))
   1781		return 0;
   1782	end = PAGE_ALIGN(start + size);
   1783	if (end < start)
   1784		end = 0;
   1785
   1786	mmap_read_lock(mm);
   1787	do {
   1788		if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
   1789			break;
   1790		start = (start + PAGE_SIZE) & PAGE_MASK;
   1791	} while (start != end);
   1792	mmap_read_unlock(mm);
   1793
   1794	if (size > (unsigned long)uaddr - start)
   1795		return size - ((unsigned long)uaddr - start);
   1796	return 0;
   1797}
   1798EXPORT_SYMBOL(fault_in_safe_writeable);
   1799
   1800/**
   1801 * fault_in_readable - fault in userspace address range for reading
   1802 * @uaddr: start of user address range
   1803 * @size: size of user address range
   1804 *
   1805 * Returns the number of bytes not faulted in (like copy_to_user() and
   1806 * copy_from_user()).
   1807 */
   1808size_t fault_in_readable(const char __user *uaddr, size_t size)
   1809{
   1810	const char __user *start = uaddr, *end;
   1811	volatile char c;
   1812
   1813	if (unlikely(size == 0))
   1814		return 0;
   1815	if (!user_read_access_begin(uaddr, size))
   1816		return size;
   1817	if (!PAGE_ALIGNED(uaddr)) {
   1818		unsafe_get_user(c, uaddr, out);
   1819		uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
   1820	}
   1821	end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
   1822	if (unlikely(end < start))
   1823		end = NULL;
   1824	while (uaddr != end) {
   1825		unsafe_get_user(c, uaddr, out);
   1826		uaddr += PAGE_SIZE;
   1827	}
   1828
   1829out:
   1830	user_read_access_end();
   1831	(void)c;
   1832	if (size > uaddr - start)
   1833		return size - (uaddr - start);
   1834	return 0;
   1835}
   1836EXPORT_SYMBOL(fault_in_readable);
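
/*
 * Illustrative sketch only (not upstream code): pre-fault a user source
 * buffer with fault_in_readable() so that a later copy_from_user() is
 * unlikely to block in the fault path, e.g. while filesystem locks are held.
 * The names "example_prefault_then_copy", "dst", "ubuf" and "count" are
 * hypothetical.
 */
static __maybe_unused int example_prefault_then_copy(void *dst,
						     const char __user *ubuf,
						     size_t count)
{
	/* fault_in_readable() returns the number of bytes NOT faulted in. */
	if (fault_in_readable(ubuf, count) == count)
		return -EFAULT;

	/* The prefault is only a hint; the copy itself can still fail. */
	return copy_from_user(dst, ubuf, count) ? -EFAULT : 0;
}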
   1837
   1838/**
   1839 * get_dump_page() - pin user page in memory while writing it to core dump
   1840 * @addr: user address
   1841 *
   1842 * Returns struct page pointer of user page pinned for dump,
   1843 * to be freed afterwards by put_page().
   1844 *
   1845 * Returns NULL on any kind of failure - a hole must then be inserted into
   1846 * the corefile, to preserve alignment with its headers; and also returns
   1847 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
   1848 * allowing a hole to be left in the corefile to save disk space.
   1849 *
   1850 * Called without mmap_lock (takes and releases the mmap_lock by itself).
   1851 */
   1852#ifdef CONFIG_ELF_CORE
   1853struct page *get_dump_page(unsigned long addr)
   1854{
   1855	struct mm_struct *mm = current->mm;
   1856	struct page *page;
   1857	int locked = 1;
   1858	int ret;
   1859
   1860	if (mmap_read_lock_killable(mm))
   1861		return NULL;
   1862	ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
   1863				      FOLL_FORCE | FOLL_DUMP | FOLL_GET);
   1864	if (locked)
   1865		mmap_read_unlock(mm);
   1866	return (ret == 1) ? page : NULL;
   1867}
   1868#endif /* CONFIG_ELF_CORE */
   1869
   1870#ifdef CONFIG_MIGRATION
   1871/*
    1872 * Check whether all pages are pinnable; if so, return the number of pages.  If
    1873 * some pages are not pinnable, migrate them and unpin all pages. Return zero if
    1874 * pages were migrated, or if some pages were not successfully isolated.
    1875 * Return a negative error if migration fails.
   1876 */
   1877static long check_and_migrate_movable_pages(unsigned long nr_pages,
   1878					    struct page **pages,
   1879					    unsigned int gup_flags)
   1880{
   1881	unsigned long isolation_error_count = 0, i;
   1882	struct folio *prev_folio = NULL;
   1883	LIST_HEAD(movable_page_list);
   1884	bool drain_allow = true;
   1885	int ret = 0;
   1886
   1887	for (i = 0; i < nr_pages; i++) {
   1888		struct folio *folio = page_folio(pages[i]);
   1889
   1890		if (folio == prev_folio)
   1891			continue;
   1892		prev_folio = folio;
   1893
   1894		if (folio_is_pinnable(folio))
   1895			continue;
   1896
   1897		/*
   1898		 * Try to move out any movable page before pinning the range.
   1899		 */
   1900		if (folio_test_hugetlb(folio)) {
   1901			if (!isolate_huge_page(&folio->page,
   1902						&movable_page_list))
   1903				isolation_error_count++;
   1904			continue;
   1905		}
   1906
   1907		if (!folio_test_lru(folio) && drain_allow) {
   1908			lru_add_drain_all();
   1909			drain_allow = false;
   1910		}
   1911
   1912		if (folio_isolate_lru(folio)) {
   1913			isolation_error_count++;
   1914			continue;
   1915		}
   1916		list_add_tail(&folio->lru, &movable_page_list);
   1917		node_stat_mod_folio(folio,
   1918				    NR_ISOLATED_ANON + folio_is_file_lru(folio),
   1919				    folio_nr_pages(folio));
   1920	}
   1921
   1922	if (!list_empty(&movable_page_list) || isolation_error_count)
   1923		goto unpin_pages;
   1924
   1925	/*
    1926	 * If the list is empty and there were no isolation errors, all pages are
    1927	 * in the correct zone.
   1928	 */
   1929	return nr_pages;
   1930
   1931unpin_pages:
   1932	if (gup_flags & FOLL_PIN) {
   1933		unpin_user_pages(pages, nr_pages);
   1934	} else {
   1935		for (i = 0; i < nr_pages; i++)
   1936			put_page(pages[i]);
   1937	}
   1938
   1939	if (!list_empty(&movable_page_list)) {
   1940		struct migration_target_control mtc = {
   1941			.nid = NUMA_NO_NODE,
   1942			.gfp_mask = GFP_USER | __GFP_NOWARN,
   1943		};
   1944
   1945		ret = migrate_pages(&movable_page_list, alloc_migration_target,
   1946				    NULL, (unsigned long)&mtc, MIGRATE_SYNC,
   1947				    MR_LONGTERM_PIN, NULL);
   1948		if (ret > 0) /* number of pages not migrated */
   1949			ret = -ENOMEM;
   1950	}
   1951
   1952	if (ret && !list_empty(&movable_page_list))
   1953		putback_movable_pages(&movable_page_list);
   1954	return ret;
   1955}
   1956#else
   1957static long check_and_migrate_movable_pages(unsigned long nr_pages,
   1958					    struct page **pages,
   1959					    unsigned int gup_flags)
   1960{
   1961	return nr_pages;
   1962}
   1963#endif /* CONFIG_MIGRATION */
   1964
   1965/*
   1966 * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
   1967 * allows us to process the FOLL_LONGTERM flag.
   1968 */
   1969static long __gup_longterm_locked(struct mm_struct *mm,
   1970				  unsigned long start,
   1971				  unsigned long nr_pages,
   1972				  struct page **pages,
   1973				  struct vm_area_struct **vmas,
   1974				  unsigned int gup_flags)
   1975{
   1976	unsigned int flags;
   1977	long rc;
   1978
   1979	if (!(gup_flags & FOLL_LONGTERM))
   1980		return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
   1981					       NULL, gup_flags);
   1982	flags = memalloc_pin_save();
   1983	do {
   1984		rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
   1985					     NULL, gup_flags);
   1986		if (rc <= 0)
   1987			break;
   1988		rc = check_and_migrate_movable_pages(rc, pages, gup_flags);
   1989	} while (!rc);
   1990	memalloc_pin_restore(flags);
   1991
   1992	return rc;
   1993}
   1994
   1995static bool is_valid_gup_flags(unsigned int gup_flags)
   1996{
   1997	/*
   1998	 * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
   1999	 * never directly by the caller, so enforce that with an assertion:
   2000	 */
   2001	if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
   2002		return false;
   2003	/*
   2004	 * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
    2005	 * that is, FOLL_LONGTERM is a specific, more restrictive case of
   2006	 * FOLL_PIN.
   2007	 */
   2008	if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
   2009		return false;
   2010
   2011	return true;
   2012}
   2013
   2014#ifdef CONFIG_MMU
   2015static long __get_user_pages_remote(struct mm_struct *mm,
   2016				    unsigned long start, unsigned long nr_pages,
   2017				    unsigned int gup_flags, struct page **pages,
   2018				    struct vm_area_struct **vmas, int *locked)
   2019{
   2020	/*
   2021	 * Parts of FOLL_LONGTERM behavior are incompatible with
   2022	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
   2023	 * vmas. However, this only comes up if locked is set, and there are
   2024	 * callers that do request FOLL_LONGTERM, but do not set locked. So,
   2025	 * allow what we can.
   2026	 */
   2027	if (gup_flags & FOLL_LONGTERM) {
   2028		if (WARN_ON_ONCE(locked))
   2029			return -EINVAL;
   2030		/*
   2031		 * This will check the vmas (even if our vmas arg is NULL)
   2032		 * and return -ENOTSUPP if DAX isn't allowed in this case:
   2033		 */
   2034		return __gup_longterm_locked(mm, start, nr_pages, pages,
   2035					     vmas, gup_flags | FOLL_TOUCH |
   2036					     FOLL_REMOTE);
   2037	}
   2038
   2039	return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
   2040				       locked,
   2041				       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
   2042}
   2043
   2044/**
   2045 * get_user_pages_remote() - pin user pages in memory
   2046 * @mm:		mm_struct of target mm
   2047 * @start:	starting user address
   2048 * @nr_pages:	number of pages from start to pin
   2049 * @gup_flags:	flags modifying lookup behaviour
   2050 * @pages:	array that receives pointers to the pages pinned.
   2051 *		Should be at least nr_pages long. Or NULL, if caller
   2052 *		only intends to ensure the pages are faulted in.
   2053 * @vmas:	array of pointers to vmas corresponding to each page.
   2054 *		Or NULL if the caller does not require them.
   2055 * @locked:	pointer to lock flag indicating whether lock is held and
   2056 *		subsequently whether VM_FAULT_RETRY functionality can be
   2057 *		utilised. Lock must initially be held.
   2058 *
    2059 * Returns either the number of pages pinned (which may be less than the
   2060 * number requested), or an error. Details about the return value:
   2061 *
   2062 * -- If nr_pages is 0, returns 0.
   2063 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
   2064 * -- If nr_pages is >0, and some pages were pinned, returns the number of
   2065 *    pages pinned. Again, this may be less than nr_pages.
   2066 *
   2067 * The caller is responsible for releasing returned @pages, via put_page().
   2068 *
   2069 * @vmas are valid only as long as mmap_lock is held.
   2070 *
   2071 * Must be called with mmap_lock held for read or write.
   2072 *
   2073 * get_user_pages_remote walks a process's page tables and takes a reference
   2074 * to each struct page that each user address corresponds to at a given
   2075 * instant. That is, it takes the page that would be accessed if a user
   2076 * thread accesses the given user virtual address at that instant.
   2077 *
   2078 * This does not guarantee that the page exists in the user mappings when
   2079 * get_user_pages_remote returns, and there may even be a completely different
    2080 * page there in some cases (e.g. if mmapped pagecache has been invalidated
    2081 * and subsequently re-faulted). However, it does guarantee that the page
   2082 * won't be freed completely. And mostly callers simply care that the page
   2083 * contains data that was valid *at some point in time*. Typically, an IO
   2084 * or similar operation cannot guarantee anything stronger anyway because
   2085 * locks can't be held over the syscall boundary.
   2086 *
   2087 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
   2088 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
   2089 * be called after the page is finished with, and before put_page is called.
   2090 *
   2091 * get_user_pages_remote is typically used for fewer-copy IO operations,
   2092 * to get a handle on the memory by some means other than accesses
   2093 * via the user virtual addresses. The pages may be submitted for
   2094 * DMA to devices or accessed via their kernel linear mapping (via the
   2095 * kmap APIs). Care should be taken to use the correct cache flushing APIs.
   2096 *
   2097 * See also get_user_pages_fast, for performance critical applications.
   2098 *
   2099 * get_user_pages_remote should be phased out in favor of
   2100 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
   2101 * should use get_user_pages_remote because it cannot pass
   2102 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
   2103 */
   2104long get_user_pages_remote(struct mm_struct *mm,
   2105		unsigned long start, unsigned long nr_pages,
   2106		unsigned int gup_flags, struct page **pages,
   2107		struct vm_area_struct **vmas, int *locked)
   2108{
   2109	if (!is_valid_gup_flags(gup_flags))
   2110		return -EINVAL;
   2111
   2112	return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
   2113				       pages, vmas, locked);
   2114}
   2115EXPORT_SYMBOL(get_user_pages_remote);
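
/*
 * Illustrative sketch only (not upstream code): take a reference on a single
 * page of another process's address space.  "mm" is assumed to have been
 * obtained and pinned by the caller (e.g. via get_task_mm()); the name
 * "example_grab_remote_page" is hypothetical.
 */
static __maybe_unused struct page *example_grab_remote_page(struct mm_struct *mm,
							     unsigned long addr)
{
	struct page *page;
	long got;

	mmap_read_lock(mm);
	got = get_user_pages_remote(mm, addr, 1, FOLL_WRITE, &page, NULL, NULL);
	mmap_read_unlock(mm);

	/*
	 * On success the caller owns one reference: mark the page dirty if it
	 * was written to, then drop the reference with put_page().
	 */
	return (got == 1) ? page : NULL;
}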
   2116
   2117#else /* CONFIG_MMU */
   2118long get_user_pages_remote(struct mm_struct *mm,
   2119			   unsigned long start, unsigned long nr_pages,
   2120			   unsigned int gup_flags, struct page **pages,
   2121			   struct vm_area_struct **vmas, int *locked)
   2122{
   2123	return 0;
   2124}
   2125
   2126static long __get_user_pages_remote(struct mm_struct *mm,
   2127				    unsigned long start, unsigned long nr_pages,
   2128				    unsigned int gup_flags, struct page **pages,
   2129				    struct vm_area_struct **vmas, int *locked)
   2130{
   2131	return 0;
   2132}
   2133#endif /* !CONFIG_MMU */
   2134
   2135/**
   2136 * get_user_pages() - pin user pages in memory
   2137 * @start:      starting user address
   2138 * @nr_pages:   number of pages from start to pin
   2139 * @gup_flags:  flags modifying lookup behaviour
   2140 * @pages:      array that receives pointers to the pages pinned.
   2141 *              Should be at least nr_pages long. Or NULL, if caller
   2142 *              only intends to ensure the pages are faulted in.
   2143 * @vmas:       array of pointers to vmas corresponding to each page.
   2144 *              Or NULL if the caller does not require them.
   2145 *
   2146 * This is the same as get_user_pages_remote(), just with a less-flexible
   2147 * calling convention where we assume that the mm being operated on belongs to
   2148 * the current task, and doesn't allow passing of a locked parameter.  We also
   2149 * obviously don't pass FOLL_REMOTE in here.
   2150 */
   2151long get_user_pages(unsigned long start, unsigned long nr_pages,
   2152		unsigned int gup_flags, struct page **pages,
   2153		struct vm_area_struct **vmas)
   2154{
   2155	if (!is_valid_gup_flags(gup_flags))
   2156		return -EINVAL;
   2157
   2158	return __gup_longterm_locked(current->mm, start, nr_pages,
   2159				     pages, vmas, gup_flags | FOLL_TOUCH);
   2160}
   2161EXPORT_SYMBOL(get_user_pages);
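
/*
 * Illustrative sketch only (not upstream code): take short-lived references
 * on a range of the current task's pages.  get_user_pages() expects
 * mmap_lock to be held, so the sketch takes it for read around the call.
 * The name "example_get_current_pages" is hypothetical.
 */
static __maybe_unused long example_get_current_pages(unsigned long uaddr,
						     struct page **pages,
						     unsigned long nr)
{
	long got;

	mmap_read_lock(current->mm);
	got = get_user_pages(uaddr, nr, FOLL_WRITE, pages, NULL);
	mmap_read_unlock(current->mm);

	/* Each returned page must eventually be released with put_page(). */
	return got;
}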
   2162
   2163/*
   2164 * get_user_pages_unlocked() is suitable to replace the form:
   2165 *
   2166 *      mmap_read_lock(mm);
   2167 *      get_user_pages(mm, ..., pages, NULL);
   2168 *      mmap_read_unlock(mm);
   2169 *
   2170 *  with:
   2171 *
   2172 *      get_user_pages_unlocked(mm, ..., pages);
   2173 *
   2174 * It is functionally equivalent to get_user_pages_fast so
   2175 * get_user_pages_fast should be used instead if specific gup_flags
   2176 * (e.g. FOLL_FORCE) are not required.
   2177 */
   2178long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
   2179			     struct page **pages, unsigned int gup_flags)
   2180{
   2181	struct mm_struct *mm = current->mm;
   2182	int locked = 1;
   2183	long ret;
   2184
   2185	/*
   2186	 * FIXME: Current FOLL_LONGTERM behavior is incompatible with
   2187	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
   2188	 * vmas.  As there are no users of this flag in this call we simply
   2189	 * disallow this option for now.
   2190	 */
   2191	if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
   2192		return -EINVAL;
   2193
   2194	mmap_read_lock(mm);
   2195	ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
   2196				      &locked, gup_flags | FOLL_TOUCH);
   2197	if (locked)
   2198		mmap_read_unlock(mm);
   2199	return ret;
   2200}
   2201EXPORT_SYMBOL(get_user_pages_unlocked);
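
/*
 * Illustrative sketch only (not upstream code): the same kind of short-lived
 * reference, but with get_user_pages_unlocked() managing mmap_lock
 * internally.  The name "example_get_pages_unlocked" is hypothetical.
 */
static __maybe_unused long example_get_pages_unlocked(unsigned long uaddr,
						      struct page **pages,
						      unsigned long nr)
{
	/* The returned pages are released by the caller with put_page(). */
	return get_user_pages_unlocked(uaddr, nr, pages, FOLL_WRITE);
}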
   2202
   2203/*
   2204 * Fast GUP
   2205 *
   2206 * get_user_pages_fast attempts to pin user pages by walking the page
   2207 * tables directly and avoids taking locks. Thus the walker needs to be
   2208 * protected from page table pages being freed from under it, and should
   2209 * block any THP splits.
   2210 *
   2211 * One way to achieve this is to have the walker disable interrupts, and
   2212 * rely on IPIs from the TLB flushing code blocking before the page table
   2213 * pages are freed. This is unsuitable for architectures that do not need
   2214 * to broadcast an IPI when invalidating TLBs.
   2215 *
    2216 * Another way to achieve this is to batch up the page-table pages belonging
    2217 * to more than one mm_user, then rcu_sched a callback to free those
   2218 * pages. Disabling interrupts will allow the fast_gup walker to both block
   2219 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
   2220 * (which is a relatively rare event). The code below adopts this strategy.
   2221 *
   2222 * Before activating this code, please be aware that the following assumptions
   2223 * are currently made:
   2224 *
    2225 *  *) Either MMU_GATHER_RCU_TABLE_FREE is enabled and tlb_remove_table() is used
    2226 *  to free pages containing page tables, or TLB flushing requires an IPI broadcast.
   2227 *
   2228 *  *) ptes can be read atomically by the architecture.
   2229 *
   2230 *  *) access_ok is sufficient to validate userspace address ranges.
   2231 *
   2232 * The last two assumptions can be relaxed by the addition of helper functions.
   2233 *
   2234 * This code is based heavily on the PowerPC implementation by Nick Piggin.
   2235 */
   2236#ifdef CONFIG_HAVE_FAST_GUP
   2237
   2238static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
   2239					    unsigned int flags,
   2240					    struct page **pages)
   2241{
   2242	while ((*nr) - nr_start) {
   2243		struct page *page = pages[--(*nr)];
   2244
   2245		ClearPageReferenced(page);
   2246		if (flags & FOLL_PIN)
   2247			unpin_user_page(page);
   2248		else
   2249			put_page(page);
   2250	}
   2251}
   2252
   2253#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
   2254static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
   2255			 unsigned int flags, struct page **pages, int *nr)
   2256{
   2257	struct dev_pagemap *pgmap = NULL;
   2258	int nr_start = *nr, ret = 0;
   2259	pte_t *ptep, *ptem;
   2260
   2261	ptem = ptep = pte_offset_map(&pmd, addr);
   2262	do {
   2263		pte_t pte = ptep_get_lockless(ptep);
   2264		struct page *page;
   2265		struct folio *folio;
   2266
   2267		/*
   2268		 * Similar to the PMD case below, NUMA hinting must take slow
   2269		 * path using the pte_protnone check.
   2270		 */
   2271		if (pte_protnone(pte))
   2272			goto pte_unmap;
   2273
   2274		if (!pte_access_permitted(pte, flags & FOLL_WRITE))
   2275			goto pte_unmap;
   2276
   2277		if (pte_devmap(pte)) {
   2278			if (unlikely(flags & FOLL_LONGTERM))
   2279				goto pte_unmap;
   2280
   2281			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
   2282			if (unlikely(!pgmap)) {
   2283				undo_dev_pagemap(nr, nr_start, flags, pages);
   2284				goto pte_unmap;
   2285			}
   2286		} else if (pte_special(pte))
   2287			goto pte_unmap;
   2288
   2289		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
   2290		page = pte_page(pte);
   2291
   2292		folio = try_grab_folio(page, 1, flags);
   2293		if (!folio)
   2294			goto pte_unmap;
   2295
   2296		if (unlikely(page_is_secretmem(page))) {
   2297			gup_put_folio(folio, 1, flags);
   2298			goto pte_unmap;
   2299		}
   2300
   2301		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
   2302			gup_put_folio(folio, 1, flags);
   2303			goto pte_unmap;
   2304		}
   2305
   2306		if (!pte_write(pte) && gup_must_unshare(flags, page)) {
   2307			gup_put_folio(folio, 1, flags);
   2308			goto pte_unmap;
   2309		}
   2310
   2311		/*
   2312		 * We need to make the page accessible if and only if we are
   2313		 * going to access its content (the FOLL_PIN case).  Please
   2314		 * see Documentation/core-api/pin_user_pages.rst for
   2315		 * details.
   2316		 */
   2317		if (flags & FOLL_PIN) {
   2318			ret = arch_make_page_accessible(page);
   2319			if (ret) {
   2320				gup_put_folio(folio, 1, flags);
   2321				goto pte_unmap;
   2322			}
   2323		}
   2324		folio_set_referenced(folio);
   2325		pages[*nr] = page;
   2326		(*nr)++;
   2327	} while (ptep++, addr += PAGE_SIZE, addr != end);
   2328
   2329	ret = 1;
   2330
   2331pte_unmap:
   2332	if (pgmap)
   2333		put_dev_pagemap(pgmap);
   2334	pte_unmap(ptem);
   2335	return ret;
   2336}
   2337#else
   2338
   2339/*
   2340 * If we can't determine whether or not a pte is special, then fail immediately
   2341 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
   2342 * to be special.
   2343 *
   2344 * For a futex to be placed on a THP tail page, get_futex_key requires a
   2345 * get_user_pages_fast_only implementation that can pin pages. Thus it's still
   2346 * useful to have gup_huge_pmd even if we can't operate on ptes.
   2347 */
   2348static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
   2349			 unsigned int flags, struct page **pages, int *nr)
   2350{
   2351	return 0;
   2352}
   2353#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
   2354
   2355#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
   2356static int __gup_device_huge(unsigned long pfn, unsigned long addr,
   2357			     unsigned long end, unsigned int flags,
   2358			     struct page **pages, int *nr)
   2359{
   2360	int nr_start = *nr;
   2361	struct dev_pagemap *pgmap = NULL;
   2362
   2363	do {
   2364		struct page *page = pfn_to_page(pfn);
   2365
   2366		pgmap = get_dev_pagemap(pfn, pgmap);
   2367		if (unlikely(!pgmap)) {
   2368			undo_dev_pagemap(nr, nr_start, flags, pages);
   2369			break;
   2370		}
   2371		SetPageReferenced(page);
   2372		pages[*nr] = page;
   2373		if (unlikely(!try_grab_page(page, flags))) {
   2374			undo_dev_pagemap(nr, nr_start, flags, pages);
   2375			break;
   2376		}
   2377		(*nr)++;
   2378		pfn++;
   2379	} while (addr += PAGE_SIZE, addr != end);
   2380
   2381	put_dev_pagemap(pgmap);
   2382	return addr == end;
   2383}
   2384
   2385static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
   2386				 unsigned long end, unsigned int flags,
   2387				 struct page **pages, int *nr)
   2388{
   2389	unsigned long fault_pfn;
   2390	int nr_start = *nr;
   2391
   2392	fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
   2393	if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
   2394		return 0;
   2395
   2396	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
   2397		undo_dev_pagemap(nr, nr_start, flags, pages);
   2398		return 0;
   2399	}
   2400	return 1;
   2401}
   2402
   2403static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
   2404				 unsigned long end, unsigned int flags,
   2405				 struct page **pages, int *nr)
   2406{
   2407	unsigned long fault_pfn;
   2408	int nr_start = *nr;
   2409
   2410	fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
   2411	if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
   2412		return 0;
   2413
   2414	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
   2415		undo_dev_pagemap(nr, nr_start, flags, pages);
   2416		return 0;
   2417	}
   2418	return 1;
   2419}
   2420#else
   2421static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
   2422				 unsigned long end, unsigned int flags,
   2423				 struct page **pages, int *nr)
   2424{
   2425	BUILD_BUG();
   2426	return 0;
   2427}
   2428
   2429static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
   2430				 unsigned long end, unsigned int flags,
   2431				 struct page **pages, int *nr)
   2432{
   2433	BUILD_BUG();
   2434	return 0;
   2435}
   2436#endif
   2437
   2438static int record_subpages(struct page *page, unsigned long addr,
   2439			   unsigned long end, struct page **pages)
   2440{
   2441	int nr;
   2442
   2443	for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
   2444		pages[nr] = nth_page(page, nr);
   2445
   2446	return nr;
   2447}
   2448
   2449#ifdef CONFIG_ARCH_HAS_HUGEPD
   2450static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
   2451				      unsigned long sz)
   2452{
   2453	unsigned long __boundary = (addr + sz) & ~(sz-1);
   2454	return (__boundary - 1 < end - 1) ? __boundary : end;
   2455}
   2456
   2457static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
   2458		       unsigned long end, unsigned int flags,
   2459		       struct page **pages, int *nr)
   2460{
   2461	unsigned long pte_end;
   2462	struct page *page;
   2463	struct folio *folio;
   2464	pte_t pte;
   2465	int refs;
   2466
   2467	pte_end = (addr + sz) & ~(sz-1);
   2468	if (pte_end < end)
   2469		end = pte_end;
   2470
   2471	pte = huge_ptep_get(ptep);
   2472
   2473	if (!pte_access_permitted(pte, flags & FOLL_WRITE))
   2474		return 0;
   2475
   2476	/* hugepages are never "special" */
   2477	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
   2478
   2479	page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT);
   2480	refs = record_subpages(page, addr, end, pages + *nr);
   2481
   2482	folio = try_grab_folio(page, refs, flags);
   2483	if (!folio)
   2484		return 0;
   2485
   2486	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
   2487		gup_put_folio(folio, refs, flags);
   2488		return 0;
   2489	}
   2490
   2491	if (!pte_write(pte) && gup_must_unshare(flags, &folio->page)) {
   2492		gup_put_folio(folio, refs, flags);
   2493		return 0;
   2494	}
   2495
   2496	*nr += refs;
   2497	folio_set_referenced(folio);
   2498	return 1;
   2499}
   2500
   2501static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
   2502		unsigned int pdshift, unsigned long end, unsigned int flags,
   2503		struct page **pages, int *nr)
   2504{
   2505	pte_t *ptep;
   2506	unsigned long sz = 1UL << hugepd_shift(hugepd);
   2507	unsigned long next;
   2508
   2509	ptep = hugepte_offset(hugepd, addr, pdshift);
   2510	do {
   2511		next = hugepte_addr_end(addr, end, sz);
   2512		if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
   2513			return 0;
   2514	} while (ptep++, addr = next, addr != end);
   2515
   2516	return 1;
   2517}
   2518#else
   2519static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
   2520		unsigned int pdshift, unsigned long end, unsigned int flags,
   2521		struct page **pages, int *nr)
   2522{
   2523	return 0;
   2524}
   2525#endif /* CONFIG_ARCH_HAS_HUGEPD */
   2526
   2527static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
   2528			unsigned long end, unsigned int flags,
   2529			struct page **pages, int *nr)
   2530{
   2531	struct page *page;
   2532	struct folio *folio;
   2533	int refs;
   2534
   2535	if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
   2536		return 0;
   2537
   2538	if (pmd_devmap(orig)) {
   2539		if (unlikely(flags & FOLL_LONGTERM))
   2540			return 0;
   2541		return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
   2542					     pages, nr);
   2543	}
   2544
   2545	page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
   2546	refs = record_subpages(page, addr, end, pages + *nr);
   2547
   2548	folio = try_grab_folio(page, refs, flags);
   2549	if (!folio)
   2550		return 0;
   2551
   2552	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
   2553		gup_put_folio(folio, refs, flags);
   2554		return 0;
   2555	}
   2556
   2557	if (!pmd_write(orig) && gup_must_unshare(flags, &folio->page)) {
   2558		gup_put_folio(folio, refs, flags);
   2559		return 0;
   2560	}
   2561
   2562	*nr += refs;
   2563	folio_set_referenced(folio);
   2564	return 1;
   2565}
   2566
   2567static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
   2568			unsigned long end, unsigned int flags,
   2569			struct page **pages, int *nr)
   2570{
   2571	struct page *page;
   2572	struct folio *folio;
   2573	int refs;
   2574
   2575	if (!pud_access_permitted(orig, flags & FOLL_WRITE))
   2576		return 0;
   2577
   2578	if (pud_devmap(orig)) {
   2579		if (unlikely(flags & FOLL_LONGTERM))
   2580			return 0;
   2581		return __gup_device_huge_pud(orig, pudp, addr, end, flags,
   2582					     pages, nr);
   2583	}
   2584
   2585	page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT);
   2586	refs = record_subpages(page, addr, end, pages + *nr);
   2587
   2588	folio = try_grab_folio(page, refs, flags);
   2589	if (!folio)
   2590		return 0;
   2591
   2592	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
   2593		gup_put_folio(folio, refs, flags);
   2594		return 0;
   2595	}
   2596
   2597	if (!pud_write(orig) && gup_must_unshare(flags, &folio->page)) {
   2598		gup_put_folio(folio, refs, flags);
   2599		return 0;
   2600	}
   2601
   2602	*nr += refs;
   2603	folio_set_referenced(folio);
   2604	return 1;
   2605}
   2606
   2607static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
   2608			unsigned long end, unsigned int flags,
   2609			struct page **pages, int *nr)
   2610{
   2611	int refs;
   2612	struct page *page;
   2613	struct folio *folio;
   2614
   2615	if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
   2616		return 0;
   2617
   2618	BUILD_BUG_ON(pgd_devmap(orig));
   2619
   2620	page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT);
   2621	refs = record_subpages(page, addr, end, pages + *nr);
   2622
   2623	folio = try_grab_folio(page, refs, flags);
   2624	if (!folio)
   2625		return 0;
   2626
   2627	if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
   2628		gup_put_folio(folio, refs, flags);
   2629		return 0;
   2630	}
   2631
   2632	*nr += refs;
   2633	folio_set_referenced(folio);
   2634	return 1;
   2635}
   2636
   2637static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
   2638		unsigned int flags, struct page **pages, int *nr)
   2639{
   2640	unsigned long next;
   2641	pmd_t *pmdp;
   2642
   2643	pmdp = pmd_offset_lockless(pudp, pud, addr);
   2644	do {
   2645		pmd_t pmd = READ_ONCE(*pmdp);
   2646
   2647		next = pmd_addr_end(addr, end);
   2648		if (!pmd_present(pmd))
   2649			return 0;
   2650
   2651		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
   2652			     pmd_devmap(pmd))) {
   2653			/*
   2654			 * NUMA hinting faults need to be handled in the GUP
   2655			 * slowpath for accounting purposes and so that they
   2656			 * can be serialised against THP migration.
   2657			 */
   2658			if (pmd_protnone(pmd))
   2659				return 0;
   2660
   2661			if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
   2662				pages, nr))
   2663				return 0;
   2664
   2665		} else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
   2666			/*
    2667			 * Architectures have different formats for the
    2668			 * hugetlbfs pmd and the THP pmd.
   2669			 */
   2670			if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
   2671					 PMD_SHIFT, next, flags, pages, nr))
   2672				return 0;
   2673		} else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
   2674			return 0;
   2675	} while (pmdp++, addr = next, addr != end);
   2676
   2677	return 1;
   2678}
   2679
   2680static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
   2681			 unsigned int flags, struct page **pages, int *nr)
   2682{
   2683	unsigned long next;
   2684	pud_t *pudp;
   2685
   2686	pudp = pud_offset_lockless(p4dp, p4d, addr);
   2687	do {
   2688		pud_t pud = READ_ONCE(*pudp);
   2689
   2690		next = pud_addr_end(addr, end);
   2691		if (unlikely(!pud_present(pud)))
   2692			return 0;
   2693		if (unlikely(pud_huge(pud))) {
   2694			if (!gup_huge_pud(pud, pudp, addr, next, flags,
   2695					  pages, nr))
   2696				return 0;
   2697		} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
   2698			if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
   2699					 PUD_SHIFT, next, flags, pages, nr))
   2700				return 0;
   2701		} else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
   2702			return 0;
   2703	} while (pudp++, addr = next, addr != end);
   2704
   2705	return 1;
   2706}
   2707
   2708static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
   2709			 unsigned int flags, struct page **pages, int *nr)
   2710{
   2711	unsigned long next;
   2712	p4d_t *p4dp;
   2713
   2714	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
   2715	do {
   2716		p4d_t p4d = READ_ONCE(*p4dp);
   2717
   2718		next = p4d_addr_end(addr, end);
   2719		if (p4d_none(p4d))
   2720			return 0;
   2721		BUILD_BUG_ON(p4d_huge(p4d));
   2722		if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
   2723			if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
   2724					 P4D_SHIFT, next, flags, pages, nr))
   2725				return 0;
   2726		} else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
   2727			return 0;
   2728	} while (p4dp++, addr = next, addr != end);
   2729
   2730	return 1;
   2731}
   2732
   2733static void gup_pgd_range(unsigned long addr, unsigned long end,
   2734		unsigned int flags, struct page **pages, int *nr)
   2735{
   2736	unsigned long next;
   2737	pgd_t *pgdp;
   2738
   2739	pgdp = pgd_offset(current->mm, addr);
   2740	do {
   2741		pgd_t pgd = READ_ONCE(*pgdp);
   2742
   2743		next = pgd_addr_end(addr, end);
   2744		if (pgd_none(pgd))
   2745			return;
   2746		if (unlikely(pgd_huge(pgd))) {
   2747			if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
   2748					  pages, nr))
   2749				return;
   2750		} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
   2751			if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
   2752					 PGDIR_SHIFT, next, flags, pages, nr))
   2753				return;
   2754		} else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
   2755			return;
   2756	} while (pgdp++, addr = next, addr != end);
   2757}
   2758#else
   2759static inline void gup_pgd_range(unsigned long addr, unsigned long end,
   2760		unsigned int flags, struct page **pages, int *nr)
   2761{
   2762}
   2763#endif /* CONFIG_HAVE_FAST_GUP */
   2764
   2765#ifndef gup_fast_permitted
   2766/*
   2767 * Check if it's allowed to use get_user_pages_fast_only() for the range, or
   2768 * we need to fall back to the slow version:
   2769 */
   2770static bool gup_fast_permitted(unsigned long start, unsigned long end)
   2771{
   2772	return true;
   2773}
   2774#endif
   2775
   2776static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
   2777				   unsigned int gup_flags, struct page **pages)
   2778{
   2779	int ret;
   2780
   2781	/*
   2782	 * FIXME: FOLL_LONGTERM does not work with
   2783	 * get_user_pages_unlocked() (see comments in that function)
   2784	 */
   2785	if (gup_flags & FOLL_LONGTERM) {
   2786		mmap_read_lock(current->mm);
   2787		ret = __gup_longterm_locked(current->mm,
   2788					    start, nr_pages,
   2789					    pages, NULL, gup_flags);
   2790		mmap_read_unlock(current->mm);
   2791	} else {
   2792		ret = get_user_pages_unlocked(start, nr_pages,
   2793					      pages, gup_flags);
   2794	}
   2795
   2796	return ret;
   2797}
   2798
   2799static unsigned long lockless_pages_from_mm(unsigned long start,
   2800					    unsigned long end,
   2801					    unsigned int gup_flags,
   2802					    struct page **pages)
   2803{
   2804	unsigned long flags;
   2805	int nr_pinned = 0;
   2806	unsigned seq;
   2807
   2808	if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
   2809	    !gup_fast_permitted(start, end))
   2810		return 0;
   2811
   2812	if (gup_flags & FOLL_PIN) {
   2813		seq = raw_read_seqcount(&current->mm->write_protect_seq);
   2814		if (seq & 1)
   2815			return 0;
   2816	}
   2817
   2818	/*
    2819	 * Disable interrupts. The nested form is used in order to allow full,
    2820	 * general-purpose use of this routine.
   2821	 *
   2822	 * With interrupts disabled, we block page table pages from being freed
   2823	 * from under us. See struct mmu_table_batch comments in
   2824	 * include/asm-generic/tlb.h for more details.
   2825	 *
   2826	 * We do not adopt an rcu_read_lock() here as we also want to block IPIs
   2827	 * that come from THPs splitting.
   2828	 */
   2829	local_irq_save(flags);
   2830	gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
   2831	local_irq_restore(flags);
   2832
   2833	/*
    2834	 * When pinning pages for DMA, there could be a concurrent write protect
    2835	 * from fork() via copy_page_range(); in this case, always fail fast GUP.
   2836	 */
   2837	if (gup_flags & FOLL_PIN) {
   2838		if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
   2839			unpin_user_pages_lockless(pages, nr_pinned);
   2840			return 0;
   2841		} else {
   2842			sanity_check_pinned_pages(pages, nr_pinned);
   2843		}
   2844	}
   2845	return nr_pinned;
   2846}
   2847
   2848static int internal_get_user_pages_fast(unsigned long start,
   2849					unsigned long nr_pages,
   2850					unsigned int gup_flags,
   2851					struct page **pages)
   2852{
   2853	unsigned long len, end;
   2854	unsigned long nr_pinned;
   2855	int ret;
   2856
   2857	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
   2858				       FOLL_FORCE | FOLL_PIN | FOLL_GET |
   2859				       FOLL_FAST_ONLY | FOLL_NOFAULT)))
   2860		return -EINVAL;
   2861
   2862	if (gup_flags & FOLL_PIN)
   2863		mm_set_has_pinned_flag(&current->mm->flags);
   2864
   2865	if (!(gup_flags & FOLL_FAST_ONLY))
   2866		might_lock_read(&current->mm->mmap_lock);
   2867
   2868	start = untagged_addr(start) & PAGE_MASK;
   2869	len = nr_pages << PAGE_SHIFT;
   2870	if (check_add_overflow(start, len, &end))
   2871		return 0;
   2872	if (unlikely(!access_ok((void __user *)start, len)))
   2873		return -EFAULT;
   2874
   2875	nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
   2876	if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
   2877		return nr_pinned;
   2878
   2879	/* Slow path: try to get the remaining pages with get_user_pages */
   2880	start += nr_pinned << PAGE_SHIFT;
   2881	pages += nr_pinned;
   2882	ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
   2883				      pages);
   2884	if (ret < 0) {
   2885		/*
   2886		 * The caller has to unpin the pages we already pinned so
   2887		 * returning -errno is not an option
   2888		 */
   2889		if (nr_pinned)
   2890			return nr_pinned;
   2891		return ret;
   2892	}
   2893	return ret + nr_pinned;
   2894}
   2895
   2896/**
   2897 * get_user_pages_fast_only() - pin user pages in memory
   2898 * @start:      starting user address
   2899 * @nr_pages:   number of pages from start to pin
   2900 * @gup_flags:  flags modifying pin behaviour
   2901 * @pages:      array that receives pointers to the pages pinned.
   2902 *              Should be at least nr_pages long.
   2903 *
   2904 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
   2905 * the regular GUP.
   2906 * Note a difference with get_user_pages_fast: this always returns the
    2907 * number of pages pinned, or 0 if no pages were pinned.
   2908 *
   2909 * If the architecture does not support this function, simply return with no
   2910 * pages pinned.
   2911 *
   2912 * Careful, careful! COW breaking can go either way, so a non-write
   2913 * access can get ambiguous page results. If you call this function without
   2914 * 'write' set, you'd better be sure that you're ok with that ambiguity.
   2915 */
   2916int get_user_pages_fast_only(unsigned long start, int nr_pages,
   2917			     unsigned int gup_flags, struct page **pages)
   2918{
   2919	int nr_pinned;
   2920	/*
   2921	 * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
   2922	 * because gup fast is always a "pin with a +1 page refcount" request.
   2923	 *
   2924	 * FOLL_FAST_ONLY is required in order to match the API description of
   2925	 * this routine: no fall back to regular ("slow") GUP.
   2926	 */
   2927	gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
   2928
   2929	nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
   2930						 pages);
   2931
   2932	/*
   2933	 * As specified in the API description above, this routine is not
   2934	 * allowed to return negative values. However, the common core
   2935	 * routine internal_get_user_pages_fast() *can* return -errno.
   2936	 * Therefore, correct for that here:
   2937	 */
   2938	if (nr_pinned < 0)
   2939		nr_pinned = 0;
   2940
   2941	return nr_pinned;
   2942}
   2943EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
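
/*
 * Illustrative sketch only (not upstream code): try the no-fallback fast
 * variant first and only then drop to a sleeping path, mirroring how
 * futex-style callers use get_user_pages_fast_only().  The name
 * "example_pin_one_page_fast_first" is hypothetical.
 */
static __maybe_unused int example_pin_one_page_fast_first(unsigned long uaddr,
							   struct page **page)
{
	/* Returns the number of pages pinned (0 or 1), never a negative value. */
	if (get_user_pages_fast_only(uaddr, 1, FOLL_WRITE, page) == 1)
		return 0;

	/* Slow path: may sleep and takes mmap_lock internally. */
	return get_user_pages_unlocked(uaddr, 1, page, FOLL_WRITE) == 1 ?
		0 : -EFAULT;
}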
   2944
   2945/**
   2946 * get_user_pages_fast() - pin user pages in memory
   2947 * @start:      starting user address
   2948 * @nr_pages:   number of pages from start to pin
   2949 * @gup_flags:  flags modifying pin behaviour
   2950 * @pages:      array that receives pointers to the pages pinned.
   2951 *              Should be at least nr_pages long.
   2952 *
   2953 * Attempt to pin user pages in memory without taking mm->mmap_lock.
   2954 * If not successful, it will fall back to taking the lock and
   2955 * calling get_user_pages().
   2956 *
   2957 * Returns number of pages pinned. This may be fewer than the number requested.
   2958 * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
   2959 * -errno.
   2960 */
   2961int get_user_pages_fast(unsigned long start, int nr_pages,
   2962			unsigned int gup_flags, struct page **pages)
   2963{
   2964	if (!is_valid_gup_flags(gup_flags))
   2965		return -EINVAL;
   2966
   2967	/*
   2968	 * The caller may or may not have explicitly set FOLL_GET; either way is
   2969	 * OK. However, internally (within mm/gup.c), gup fast variants must set
   2970	 * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
   2971	 * request.
   2972	 */
   2973	gup_flags |= FOLL_GET;
   2974	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
   2975}
   2976EXPORT_SYMBOL_GPL(get_user_pages_fast);
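
/*
 * Illustrative sketch only (not upstream code): pin a small batch of the
 * current task's pages via the fast path and release them again.  The names
 * "example_fast_pin_put" and "uaddr" are hypothetical.
 */
static __maybe_unused int example_fast_pin_put(unsigned long uaddr)
{
	struct page *pages[16];	/* arbitrary small batch size */
	int got, i;

	got = get_user_pages_fast(uaddr, ARRAY_SIZE(pages), FOLL_WRITE, pages);
	if (got <= 0)
		return got ? got : -EFAULT;

	/* ... access the pinned pages here ... */

	for (i = 0; i < got; i++)
		put_page(pages[i]);
	return 0;
}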
   2977
   2978/**
   2979 * pin_user_pages_fast() - pin user pages in memory without taking locks
   2980 *
   2981 * @start:      starting user address
   2982 * @nr_pages:   number of pages from start to pin
   2983 * @gup_flags:  flags modifying pin behaviour
   2984 * @pages:      array that receives pointers to the pages pinned.
   2985 *              Should be at least nr_pages long.
   2986 *
   2987 * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
   2988 * get_user_pages_fast() for documentation on the function arguments, because
   2989 * the arguments here are identical.
   2990 *
   2991 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
   2992 * see Documentation/core-api/pin_user_pages.rst for further details.
   2993 */
   2994int pin_user_pages_fast(unsigned long start, int nr_pages,
   2995			unsigned int gup_flags, struct page **pages)
   2996{
   2997	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
   2998	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
   2999		return -EINVAL;
   3000
   3001	if (WARN_ON_ONCE(!pages))
   3002		return -EINVAL;
   3003
   3004	gup_flags |= FOLL_PIN;
   3005	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
   3006}
   3007EXPORT_SYMBOL_GPL(pin_user_pages_fast);
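
/*
 * Illustrative sketch only (not upstream code): a DMA-style long-term pin
 * using FOLL_PIN via pin_user_pages_fast(), released with
 * unpin_user_pages_dirty_lock().  The names "example_dma_pin", "uaddr",
 * "pages" and "nr" are hypothetical.
 */
static __maybe_unused int example_dma_pin(unsigned long uaddr,
					  struct page **pages, int nr)
{
	int pinned;

	pinned = pin_user_pages_fast(uaddr, nr, FOLL_WRITE | FOLL_LONGTERM,
				     pages);
	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... program the DMA engine and wait for completion ... */

	/* Mark the pages dirty and drop the FOLL_PIN references. */
	unpin_user_pages_dirty_lock(pages, pinned, true);
	return 0;
}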
   3008
   3009/*
   3010 * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
   3011 * is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
   3012 *
   3013 * The API rules are the same, too: no negative values may be returned.
   3014 */
   3015int pin_user_pages_fast_only(unsigned long start, int nr_pages,
   3016			     unsigned int gup_flags, struct page **pages)
   3017{
   3018	int nr_pinned;
   3019
   3020	/*
   3021	 * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
   3022	 * rules require returning 0, rather than -errno:
   3023	 */
   3024	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
   3025		return 0;
   3026
   3027	if (WARN_ON_ONCE(!pages))
   3028		return 0;
   3029	/*
   3030	 * FOLL_FAST_ONLY is required in order to match the API description of
   3031	 * this routine: no fall back to regular ("slow") GUP.
   3032	 */
   3033	gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
   3034	nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
   3035						 pages);
   3036	/*
   3037	 * This routine is not allowed to return negative values. However,
   3038	 * internal_get_user_pages_fast() *can* return -errno. Therefore,
   3039	 * correct for that here:
   3040	 */
   3041	if (nr_pinned < 0)
   3042		nr_pinned = 0;
   3043
   3044	return nr_pinned;
   3045}
   3046EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
   3047
   3048/**
   3049 * pin_user_pages_remote() - pin pages of a remote process
   3050 *
   3051 * @mm:		mm_struct of target mm
   3052 * @start:	starting user address
   3053 * @nr_pages:	number of pages from start to pin
   3054 * @gup_flags:	flags modifying lookup behaviour
   3055 * @pages:	array that receives pointers to the pages pinned.
   3056 *		Should be at least nr_pages long.
   3057 * @vmas:	array of pointers to vmas corresponding to each page.
   3058 *		Or NULL if the caller does not require them.
   3059 * @locked:	pointer to lock flag indicating whether lock is held and
   3060 *		subsequently whether VM_FAULT_RETRY functionality can be
   3061 *		utilised. Lock must initially be held.
   3062 *
   3063 * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
   3064 * get_user_pages_remote() for documentation on the function arguments, because
   3065 * the arguments here are identical.
   3066 *
   3067 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
   3068 * see Documentation/core-api/pin_user_pages.rst for details.
   3069 */
   3070long pin_user_pages_remote(struct mm_struct *mm,
   3071			   unsigned long start, unsigned long nr_pages,
   3072			   unsigned int gup_flags, struct page **pages,
   3073			   struct vm_area_struct **vmas, int *locked)
   3074{
   3075	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
   3076	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
   3077		return -EINVAL;
   3078
   3079	if (WARN_ON_ONCE(!pages))
   3080		return -EINVAL;
   3081
   3082	gup_flags |= FOLL_PIN;
   3083	return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
   3084				       pages, vmas, locked);
   3085}
   3086EXPORT_SYMBOL(pin_user_pages_remote);
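
/*
 * Illustrative sketch only (not upstream code): pin pages of a remote mm
 * while following the @locked protocol described above (lock held on entry,
 * *@locked cleared if the lock was dropped).  The name
 * "example_pin_remote_range" is hypothetical.
 */
static __maybe_unused long example_pin_remote_range(struct mm_struct *mm,
						    unsigned long addr,
						    struct page **pages,
						    unsigned long nr)
{
	int locked = 1;
	long pinned;

	mmap_read_lock(mm);
	pinned = pin_user_pages_remote(mm, addr, nr, FOLL_WRITE, pages,
				       NULL, &locked);
	if (locked)
		mmap_read_unlock(mm);

	/* On success the caller releases the pins with unpin_user_pages(). */
	return pinned;
}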
   3087
   3088/**
   3089 * pin_user_pages() - pin user pages in memory for use by other devices
   3090 *
   3091 * @start:	starting user address
   3092 * @nr_pages:	number of pages from start to pin
   3093 * @gup_flags:	flags modifying lookup behaviour
   3094 * @pages:	array that receives pointers to the pages pinned.
   3095 *		Should be at least nr_pages long.
   3096 * @vmas:	array of pointers to vmas corresponding to each page.
   3097 *		Or NULL if the caller does not require them.
   3098 *
   3099 * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
   3100 * FOLL_PIN is set.
   3101 *
   3102 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
   3103 * see Documentation/core-api/pin_user_pages.rst for details.
   3104 */
   3105long pin_user_pages(unsigned long start, unsigned long nr_pages,
   3106		    unsigned int gup_flags, struct page **pages,
   3107		    struct vm_area_struct **vmas)
   3108{
   3109	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
   3110	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
   3111		return -EINVAL;
   3112
   3113	if (WARN_ON_ONCE(!pages))
   3114		return -EINVAL;
   3115
   3116	gup_flags |= FOLL_PIN;
   3117	return __gup_longterm_locked(current->mm, start, nr_pages,
   3118				     pages, vmas, gup_flags);
   3119}
   3120EXPORT_SYMBOL(pin_user_pages);
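
/*
 * Illustrative sketch only (not upstream code): a long-term pin of the
 * current task's pages with pin_user_pages(), which expects mmap_lock to be
 * held by the caller.  The name "example_longterm_pin_current" is
 * hypothetical.
 */
static __maybe_unused long example_longterm_pin_current(unsigned long uaddr,
							struct page **pages,
							unsigned long nr)
{
	long pinned;

	mmap_read_lock(current->mm);
	pinned = pin_user_pages(uaddr, nr, FOLL_WRITE | FOLL_LONGTERM,
				pages, NULL);
	mmap_read_unlock(current->mm);

	/* Release with unpin_user_pages() when the pin is no longer needed. */
	return pinned;
}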
   3121
   3122/*
   3123 * pin_user_pages_unlocked() is the FOLL_PIN variant of
   3124 * get_user_pages_unlocked(). Behavior is the same, except that this one sets
   3125 * FOLL_PIN and rejects FOLL_GET.
   3126 */
   3127long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
   3128			     struct page **pages, unsigned int gup_flags)
   3129{
   3130	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
   3131	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
   3132		return -EINVAL;
   3133
   3134	if (WARN_ON_ONCE(!pages))
   3135		return -EINVAL;
   3136
   3137	gup_flags |= FOLL_PIN;
   3138	return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
   3139}
   3140EXPORT_SYMBOL(pin_user_pages_unlocked);