cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

madvise.c (38525B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 *	linux/mm/madvise.c
      4 *
      5 * Copyright (C) 1999  Linus Torvalds
      6 * Copyright (C) 2002  Christoph Hellwig
      7 */
      8
      9#include <linux/mman.h>
     10#include <linux/pagemap.h>
     11#include <linux/syscalls.h>
     12#include <linux/mempolicy.h>
     13#include <linux/page-isolation.h>
     14#include <linux/page_idle.h>
     15#include <linux/userfaultfd_k.h>
     16#include <linux/hugetlb.h>
     17#include <linux/falloc.h>
     18#include <linux/fadvise.h>
     19#include <linux/sched.h>
     20#include <linux/sched/mm.h>
     21#include <linux/mm_inline.h>
     22#include <linux/string.h>
     23#include <linux/uio.h>
     24#include <linux/ksm.h>
     25#include <linux/fs.h>
     26#include <linux/file.h>
     27#include <linux/blkdev.h>
     28#include <linux/backing-dev.h>
     29#include <linux/pagewalk.h>
     30#include <linux/swap.h>
     31#include <linux/swapops.h>
     32#include <linux/shmem_fs.h>
     33#include <linux/mmu_notifier.h>
     34
     35#include <asm/tlb.h>
     36
     37#include "internal.h"
     38#include "swap.h"
     39
     40struct madvise_walk_private {
     41	struct mmu_gather *tlb;
     42	bool pageout;
     43};
     44
     45/*
     46 * Any behaviour which results in changes to the vma->vm_flags needs to
     47 * take mmap_lock for writing. Others, which simply traverse vmas, need
     48 * to only take it for reading.
     49 */
     50static int madvise_need_mmap_write(int behavior)
     51{
     52	switch (behavior) {
     53	case MADV_REMOVE:
     54	case MADV_WILLNEED:
     55	case MADV_DONTNEED:
     56	case MADV_DONTNEED_LOCKED:
     57	case MADV_COLD:
     58	case MADV_PAGEOUT:
     59	case MADV_FREE:
     60	case MADV_POPULATE_READ:
     61	case MADV_POPULATE_WRITE:
     62		return 0;
     63	default:
     64		/* be safe, default to 1. list exceptions explicitly */
     65		return 1;
     66	}
     67}
     68
     69#ifdef CONFIG_ANON_VMA_NAME
     70struct anon_vma_name *anon_vma_name_alloc(const char *name)
     71{
     72	struct anon_vma_name *anon_name;
     73	size_t count;
     74
     75	/* Add 1 for NUL terminator at the end of the anon_name->name */
     76	count = strlen(name) + 1;
     77	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
     78	if (anon_name) {
     79		kref_init(&anon_name->kref);
     80		memcpy(anon_name->name, name, count);
     81	}
     82
     83	return anon_name;
     84}
     85
     86void anon_vma_name_free(struct kref *kref)
     87{
     88	struct anon_vma_name *anon_name =
     89			container_of(kref, struct anon_vma_name, kref);
     90	kfree(anon_name);
     91}
     92
     93struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
     94{
     95	mmap_assert_locked(vma->vm_mm);
     96
     97	if (vma->vm_file)
     98		return NULL;
     99
    100	return vma->anon_name;
    101}
    102
    103/* mmap_lock should be write-locked */
    104static int replace_anon_vma_name(struct vm_area_struct *vma,
    105				 struct anon_vma_name *anon_name)
    106{
    107	struct anon_vma_name *orig_name = anon_vma_name(vma);
    108
    109	if (!anon_name) {
    110		vma->anon_name = NULL;
    111		anon_vma_name_put(orig_name);
    112		return 0;
    113	}
    114
    115	if (anon_vma_name_eq(orig_name, anon_name))
    116		return 0;
    117
    118	vma->anon_name = anon_vma_name_reuse(anon_name);
    119	anon_vma_name_put(orig_name);
    120
    121	return 0;
    122}
    123#else /* CONFIG_ANON_VMA_NAME */
    124static int replace_anon_vma_name(struct vm_area_struct *vma,
    125				 struct anon_vma_name *anon_name)
    126{
    127	if (anon_name)
    128		return -EINVAL;
    129
    130	return 0;
    131}
    132#endif /* CONFIG_ANON_VMA_NAME */
    133/*
     134 * Update the vm_flags on a region of a vma, splitting it or merging it as
     135 * necessary.  Must be called with mmap_lock held for writing;
     136 * the caller should ensure anon_name stability by raising its refcount even when
    137 * anon_name belongs to a valid vma because this function might free that vma.
    138 */
    139static int madvise_update_vma(struct vm_area_struct *vma,
    140			      struct vm_area_struct **prev, unsigned long start,
    141			      unsigned long end, unsigned long new_flags,
    142			      struct anon_vma_name *anon_name)
    143{
    144	struct mm_struct *mm = vma->vm_mm;
    145	int error;
    146	pgoff_t pgoff;
    147
    148	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
    149		*prev = vma;
    150		return 0;
    151	}
    152
    153	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
    154	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
    155			  vma->vm_file, pgoff, vma_policy(vma),
    156			  vma->vm_userfaultfd_ctx, anon_name);
    157	if (*prev) {
    158		vma = *prev;
    159		goto success;
    160	}
    161
    162	*prev = vma;
    163
    164	if (start != vma->vm_start) {
    165		if (unlikely(mm->map_count >= sysctl_max_map_count))
    166			return -ENOMEM;
    167		error = __split_vma(mm, vma, start, 1);
    168		if (error)
    169			return error;
    170	}
    171
    172	if (end != vma->vm_end) {
    173		if (unlikely(mm->map_count >= sysctl_max_map_count))
    174			return -ENOMEM;
    175		error = __split_vma(mm, vma, end, 0);
    176		if (error)
    177			return error;
    178	}
    179
    180success:
    181	/*
    182	 * vm_flags is protected by the mmap_lock held in write mode.
    183	 */
    184	vma->vm_flags = new_flags;
    185	if (!vma->vm_file) {
    186		error = replace_anon_vma_name(vma, anon_name);
    187		if (error)
    188			return error;
    189	}
    190
    191	return 0;
    192}
    193
    194#ifdef CONFIG_SWAP
    195static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
    196	unsigned long end, struct mm_walk *walk)
    197{
    198	pte_t *orig_pte;
    199	struct vm_area_struct *vma = walk->private;
    200	unsigned long index;
    201	struct swap_iocb *splug = NULL;
    202
    203	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
    204		return 0;
    205
    206	for (index = start; index != end; index += PAGE_SIZE) {
    207		pte_t pte;
    208		swp_entry_t entry;
    209		struct page *page;
    210		spinlock_t *ptl;
    211
    212		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
    213		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
    214		pte_unmap_unlock(orig_pte, ptl);
    215
    216		if (pte_present(pte) || pte_none(pte))
    217			continue;
    218		entry = pte_to_swp_entry(pte);
    219		if (unlikely(non_swap_entry(entry)))
    220			continue;
    221
    222		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
    223					     vma, index, false, &splug);
    224		if (page)
    225			put_page(page);
    226	}
    227	swap_read_unplug(splug);
    228
    229	return 0;
    230}
    231
    232static const struct mm_walk_ops swapin_walk_ops = {
    233	.pmd_entry		= swapin_walk_pmd_entry,
    234};
    235
    236static void force_shm_swapin_readahead(struct vm_area_struct *vma,
    237		unsigned long start, unsigned long end,
    238		struct address_space *mapping)
    239{
    240	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
    241	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
    242	struct page *page;
    243	struct swap_iocb *splug = NULL;
    244
    245	rcu_read_lock();
    246	xas_for_each(&xas, page, end_index) {
    247		swp_entry_t swap;
    248
    249		if (!xa_is_value(page))
    250			continue;
    251		swap = radix_to_swp_entry(page);
    252		/* There might be swapin error entries in shmem mapping. */
    253		if (non_swap_entry(swap))
    254			continue;
    255		xas_pause(&xas);
    256		rcu_read_unlock();
    257
    258		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
    259					     NULL, 0, false, &splug);
    260		if (page)
    261			put_page(page);
    262
    263		rcu_read_lock();
    264	}
    265	rcu_read_unlock();
    266	swap_read_unplug(splug);
    267
    268	lru_add_drain();	/* Push any new pages onto the LRU now */
    269}
    270#endif		/* CONFIG_SWAP */
    271
    272/*
    273 * Schedule all required I/O operations.  Do not wait for completion.
    274 */
    275static long madvise_willneed(struct vm_area_struct *vma,
    276			     struct vm_area_struct **prev,
    277			     unsigned long start, unsigned long end)
    278{
    279	struct mm_struct *mm = vma->vm_mm;
    280	struct file *file = vma->vm_file;
    281	loff_t offset;
    282
    283	*prev = vma;
    284#ifdef CONFIG_SWAP
    285	if (!file) {
    286		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
    287		lru_add_drain(); /* Push any new pages onto the LRU now */
    288		return 0;
    289	}
    290
    291	if (shmem_mapping(file->f_mapping)) {
    292		force_shm_swapin_readahead(vma, start, end,
    293					file->f_mapping);
    294		return 0;
    295	}
    296#else
    297	if (!file)
    298		return -EBADF;
    299#endif
    300
    301	if (IS_DAX(file_inode(file))) {
    302		/* no bad return value, but ignore advice */
    303		return 0;
    304	}
    305
    306	/*
    307	 * Filesystem's fadvise may need to take various locks.  We need to
    308	 * explicitly grab a reference because the vma (and hence the
    309	 * vma's reference to the file) can go away as soon as we drop
    310	 * mmap_lock.
    311	 */
    312	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
    313	get_file(file);
    314	offset = (loff_t)(start - vma->vm_start)
    315			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
    316	mmap_read_unlock(mm);
    317	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
    318	fput(file);
    319	mmap_read_lock(mm);
    320	return 0;
    321}
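
/*
 * Illustrative userspace sketch (not part of the kernel source) of the
 * MADV_WILLNEED path handled above. The file name is hypothetical and
 * error handling is omitted for brevity.
 *
 *	#include <fcntl.h>
 *	#include <sys/mman.h>
 *	#include <sys/stat.h>
 *
 *	int fd = open("data.bin", O_RDONLY);
 *	struct stat st;
 *	fstat(fd, &st);
 *	char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
 *	// Start readahead for the whole mapping; the call returns without
 *	// waiting for the I/O to complete.
 *	madvise(p, st.st_size, MADV_WILLNEED);
 */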
    322
    323static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
    324				unsigned long addr, unsigned long end,
    325				struct mm_walk *walk)
    326{
    327	struct madvise_walk_private *private = walk->private;
    328	struct mmu_gather *tlb = private->tlb;
    329	bool pageout = private->pageout;
    330	struct mm_struct *mm = tlb->mm;
    331	struct vm_area_struct *vma = walk->vma;
    332	pte_t *orig_pte, *pte, ptent;
    333	spinlock_t *ptl;
    334	struct page *page = NULL;
    335	LIST_HEAD(page_list);
    336
    337	if (fatal_signal_pending(current))
    338		return -EINTR;
    339
    340#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    341	if (pmd_trans_huge(*pmd)) {
    342		pmd_t orig_pmd;
    343		unsigned long next = pmd_addr_end(addr, end);
    344
    345		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
    346		ptl = pmd_trans_huge_lock(pmd, vma);
    347		if (!ptl)
    348			return 0;
    349
    350		orig_pmd = *pmd;
    351		if (is_huge_zero_pmd(orig_pmd))
    352			goto huge_unlock;
    353
    354		if (unlikely(!pmd_present(orig_pmd))) {
    355			VM_BUG_ON(thp_migration_supported() &&
    356					!is_pmd_migration_entry(orig_pmd));
    357			goto huge_unlock;
    358		}
    359
    360		page = pmd_page(orig_pmd);
    361
    362		/* Do not interfere with other mappings of this page */
    363		if (page_mapcount(page) != 1)
    364			goto huge_unlock;
    365
    366		if (next - addr != HPAGE_PMD_SIZE) {
    367			int err;
    368
    369			get_page(page);
    370			spin_unlock(ptl);
    371			lock_page(page);
    372			err = split_huge_page(page);
    373			unlock_page(page);
    374			put_page(page);
    375			if (!err)
    376				goto regular_page;
    377			return 0;
    378		}
    379
    380		if (pmd_young(orig_pmd)) {
    381			pmdp_invalidate(vma, addr, pmd);
    382			orig_pmd = pmd_mkold(orig_pmd);
    383
    384			set_pmd_at(mm, addr, pmd, orig_pmd);
    385			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
    386		}
    387
    388		ClearPageReferenced(page);
    389		test_and_clear_page_young(page);
    390		if (pageout) {
    391			if (!isolate_lru_page(page)) {
    392				if (PageUnevictable(page))
    393					putback_lru_page(page);
    394				else
    395					list_add(&page->lru, &page_list);
    396			}
    397		} else
    398			deactivate_page(page);
    399huge_unlock:
    400		spin_unlock(ptl);
    401		if (pageout)
    402			reclaim_pages(&page_list);
    403		return 0;
    404	}
    405
    406regular_page:
    407	if (pmd_trans_unstable(pmd))
    408		return 0;
    409#endif
    410	tlb_change_page_size(tlb, PAGE_SIZE);
    411	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
    412	flush_tlb_batched_pending(mm);
    413	arch_enter_lazy_mmu_mode();
    414	for (; addr < end; pte++, addr += PAGE_SIZE) {
    415		ptent = *pte;
    416
    417		if (pte_none(ptent))
    418			continue;
    419
    420		if (!pte_present(ptent))
    421			continue;
    422
    423		page = vm_normal_page(vma, addr, ptent);
    424		if (!page)
    425			continue;
    426
    427		/*
     428		 * Creating a THP page is expensive, so split it only if we
     429		 * are sure it's worth it. Split it if we are the only owner.
    430		 */
    431		if (PageTransCompound(page)) {
    432			if (page_mapcount(page) != 1)
    433				break;
    434			get_page(page);
    435			if (!trylock_page(page)) {
    436				put_page(page);
    437				break;
    438			}
    439			pte_unmap_unlock(orig_pte, ptl);
    440			if (split_huge_page(page)) {
    441				unlock_page(page);
    442				put_page(page);
    443				orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
    444				break;
    445			}
    446			unlock_page(page);
    447			put_page(page);
    448			orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
    449			pte--;
    450			addr -= PAGE_SIZE;
    451			continue;
    452		}
    453
    454		/* Do not interfere with other mappings of this page */
    455		if (page_mapcount(page) != 1)
    456			continue;
    457
    458		VM_BUG_ON_PAGE(PageTransCompound(page), page);
    459
    460		if (pte_young(ptent)) {
    461			ptent = ptep_get_and_clear_full(mm, addr, pte,
    462							tlb->fullmm);
    463			ptent = pte_mkold(ptent);
    464			set_pte_at(mm, addr, pte, ptent);
    465			tlb_remove_tlb_entry(tlb, pte, addr);
    466		}
    467
    468		/*
     469		 * We are deactivating a page to accelerate its reclaim.
     470		 * The VM cannot reclaim the page unless we clear PG_young.
     471		 * As a side effect, this confuses idle-page tracking,
     472		 * which will miss the recent reference history.
    473		 */
    474		ClearPageReferenced(page);
    475		test_and_clear_page_young(page);
    476		if (pageout) {
    477			if (!isolate_lru_page(page)) {
    478				if (PageUnevictable(page))
    479					putback_lru_page(page);
    480				else
    481					list_add(&page->lru, &page_list);
    482			}
    483		} else
    484			deactivate_page(page);
    485	}
    486
    487	arch_leave_lazy_mmu_mode();
    488	pte_unmap_unlock(orig_pte, ptl);
    489	if (pageout)
    490		reclaim_pages(&page_list);
    491	cond_resched();
    492
    493	return 0;
    494}
    495
    496static const struct mm_walk_ops cold_walk_ops = {
    497	.pmd_entry = madvise_cold_or_pageout_pte_range,
    498};
    499
    500static void madvise_cold_page_range(struct mmu_gather *tlb,
    501			     struct vm_area_struct *vma,
    502			     unsigned long addr, unsigned long end)
    503{
    504	struct madvise_walk_private walk_private = {
    505		.pageout = false,
    506		.tlb = tlb,
    507	};
    508
    509	tlb_start_vma(tlb, vma);
    510	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
    511	tlb_end_vma(tlb, vma);
    512}
    513
    514static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
    515{
    516	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
    517}
    518
    519static long madvise_cold(struct vm_area_struct *vma,
    520			struct vm_area_struct **prev,
    521			unsigned long start_addr, unsigned long end_addr)
    522{
    523	struct mm_struct *mm = vma->vm_mm;
    524	struct mmu_gather tlb;
    525
    526	*prev = vma;
    527	if (!can_madv_lru_vma(vma))
    528		return -EINVAL;
    529
    530	lru_add_drain();
    531	tlb_gather_mmu(&tlb, mm);
    532	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
    533	tlb_finish_mmu(&tlb);
    534
    535	return 0;
    536}
    537
    538static void madvise_pageout_page_range(struct mmu_gather *tlb,
    539			     struct vm_area_struct *vma,
    540			     unsigned long addr, unsigned long end)
    541{
    542	struct madvise_walk_private walk_private = {
    543		.pageout = true,
    544		.tlb = tlb,
    545	};
    546
    547	tlb_start_vma(tlb, vma);
    548	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
    549	tlb_end_vma(tlb, vma);
    550}
    551
    552static inline bool can_do_pageout(struct vm_area_struct *vma)
    553{
    554	if (vma_is_anonymous(vma))
    555		return true;
    556	if (!vma->vm_file)
    557		return false;
    558	/*
     559	 * Page out the pagecache only for non-anonymous mappings that correspond
     560	 * to files the calling process could (if it tried) open for writing;
    561	 * otherwise we'd be including shared non-exclusive mappings, which
    562	 * opens a side channel.
    563	 */
    564	return inode_owner_or_capable(&init_user_ns,
    565				      file_inode(vma->vm_file)) ||
    566	       file_permission(vma->vm_file, MAY_WRITE) == 0;
    567}
    568
    569static long madvise_pageout(struct vm_area_struct *vma,
    570			struct vm_area_struct **prev,
    571			unsigned long start_addr, unsigned long end_addr)
    572{
    573	struct mm_struct *mm = vma->vm_mm;
    574	struct mmu_gather tlb;
    575
    576	*prev = vma;
    577	if (!can_madv_lru_vma(vma))
    578		return -EINVAL;
    579
    580	if (!can_do_pageout(vma))
    581		return 0;
    582
    583	lru_add_drain();
    584	tlb_gather_mmu(&tlb, mm);
    585	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
    586	tlb_finish_mmu(&tlb);
    587
    588	return 0;
    589}
    590
    591static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
    592				unsigned long end, struct mm_walk *walk)
    593
    594{
    595	struct mmu_gather *tlb = walk->private;
    596	struct mm_struct *mm = tlb->mm;
    597	struct vm_area_struct *vma = walk->vma;
    598	spinlock_t *ptl;
    599	pte_t *orig_pte, *pte, ptent;
    600	struct page *page;
    601	int nr_swap = 0;
    602	unsigned long next;
    603
    604	next = pmd_addr_end(addr, end);
    605	if (pmd_trans_huge(*pmd))
    606		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
    607			goto next;
    608
    609	if (pmd_trans_unstable(pmd))
    610		return 0;
    611
    612	tlb_change_page_size(tlb, PAGE_SIZE);
    613	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
    614	flush_tlb_batched_pending(mm);
    615	arch_enter_lazy_mmu_mode();
    616	for (; addr != end; pte++, addr += PAGE_SIZE) {
    617		ptent = *pte;
    618
    619		if (pte_none(ptent))
    620			continue;
    621		/*
     622		 * If the pte holds a swap entry, just clear the page table
     623		 * entry to prevent a swap-in, which is more expensive than
    624		 * (page allocation + zeroing).
    625		 */
    626		if (!pte_present(ptent)) {
    627			swp_entry_t entry;
    628
    629			entry = pte_to_swp_entry(ptent);
    630			if (!non_swap_entry(entry)) {
    631				nr_swap--;
    632				free_swap_and_cache(entry);
    633				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
    634			} else if (is_hwpoison_entry(entry) ||
    635				   is_swapin_error_entry(entry)) {
    636				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
    637			}
    638			continue;
    639		}
    640
    641		page = vm_normal_page(vma, addr, ptent);
    642		if (!page)
    643			continue;
    644
    645		/*
     646		 * If the pmd isn't transhuge but the page is a THP
     647		 * owned only by this process, split it and
     648		 * deactivate all of its pages.
    649		 */
    650		if (PageTransCompound(page)) {
    651			if (page_mapcount(page) != 1)
    652				goto out;
    653			get_page(page);
    654			if (!trylock_page(page)) {
    655				put_page(page);
    656				goto out;
    657			}
    658			pte_unmap_unlock(orig_pte, ptl);
    659			if (split_huge_page(page)) {
    660				unlock_page(page);
    661				put_page(page);
    662				orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
    663				goto out;
    664			}
    665			unlock_page(page);
    666			put_page(page);
    667			orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
    668			pte--;
    669			addr -= PAGE_SIZE;
    670			continue;
    671		}
    672
    673		VM_BUG_ON_PAGE(PageTransCompound(page), page);
    674
    675		if (PageSwapCache(page) || PageDirty(page)) {
    676			if (!trylock_page(page))
    677				continue;
    678			/*
     679			 * If the page is shared with others, we cannot clear
     680			 * its PG_dirty bit.
    681			 */
    682			if (page_mapcount(page) != 1) {
    683				unlock_page(page);
    684				continue;
    685			}
    686
    687			if (PageSwapCache(page) && !try_to_free_swap(page)) {
    688				unlock_page(page);
    689				continue;
    690			}
    691
    692			ClearPageDirty(page);
    693			unlock_page(page);
    694		}
    695
    696		if (pte_young(ptent) || pte_dirty(ptent)) {
    697			/*
     698			 * Some architectures (e.g. PPC) don't update the TLB
     699			 * with set_pte_at and tlb_remove_tlb_entry, so for
     700			 * portability, remap the pte as old and clean
     701			 * after clearing it.
    702			 */
    703			ptent = ptep_get_and_clear_full(mm, addr, pte,
    704							tlb->fullmm);
    705
    706			ptent = pte_mkold(ptent);
    707			ptent = pte_mkclean(ptent);
    708			set_pte_at(mm, addr, pte, ptent);
    709			tlb_remove_tlb_entry(tlb, pte, addr);
    710		}
    711		mark_page_lazyfree(page);
    712	}
    713out:
    714	if (nr_swap) {
    715		if (current->mm == mm)
    716			sync_mm_rss(mm);
    717
    718		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
    719	}
    720	arch_leave_lazy_mmu_mode();
    721	pte_unmap_unlock(orig_pte, ptl);
    722	cond_resched();
    723next:
    724	return 0;
    725}
    726
    727static const struct mm_walk_ops madvise_free_walk_ops = {
    728	.pmd_entry		= madvise_free_pte_range,
    729};
    730
    731static int madvise_free_single_vma(struct vm_area_struct *vma,
    732			unsigned long start_addr, unsigned long end_addr)
    733{
    734	struct mm_struct *mm = vma->vm_mm;
    735	struct mmu_notifier_range range;
    736	struct mmu_gather tlb;
    737
    738	/* MADV_FREE works for only anon vma at the moment */
    739	if (!vma_is_anonymous(vma))
    740		return -EINVAL;
    741
    742	range.start = max(vma->vm_start, start_addr);
    743	if (range.start >= vma->vm_end)
    744		return -EINVAL;
    745	range.end = min(vma->vm_end, end_addr);
    746	if (range.end <= vma->vm_start)
    747		return -EINVAL;
    748	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
    749				range.start, range.end);
    750
    751	lru_add_drain();
    752	tlb_gather_mmu(&tlb, mm);
    753	update_hiwater_rss(mm);
    754
    755	mmu_notifier_invalidate_range_start(&range);
    756	tlb_start_vma(&tlb, vma);
    757	walk_page_range(vma->vm_mm, range.start, range.end,
    758			&madvise_free_walk_ops, &tlb);
    759	tlb_end_vma(&tlb, vma);
    760	mmu_notifier_invalidate_range_end(&range);
    761	tlb_finish_mmu(&tlb);
    762
    763	return 0;
    764}
    765
    766/*
    767 * Application no longer needs these pages.  If the pages are dirty,
    768 * it's OK to just throw them away.  The app will be more careful about
    769 * data it wants to keep.  Be sure to free swap resources too.  The
    770 * zap_page_range call sets things up for shrink_active_list to actually free
    771 * these pages later if no one else has touched them in the meantime,
    772 * although we could add these pages to a global reuse list for
    773 * shrink_active_list to pick up before reclaiming other pages.
    774 *
    775 * NB: This interface discards data rather than pushes it out to swap,
    776 * as some implementations do.  This has performance implications for
    777 * applications like large transactional databases which want to discard
    778 * pages in anonymous maps after committing to backing store the data
    779 * that was kept in them.  There is no reason to write this data out to
    780 * the swap area if the application is discarding it.
    781 *
    782 * An interface that causes the system to free clean pages and flush
    783 * dirty pages is already available as msync(MS_INVALIDATE).
    784 */
    785static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
    786					unsigned long start, unsigned long end)
    787{
    788	zap_page_range(vma, start, end - start);
    789	return 0;
    790}
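
/*
 * A minimal userspace sketch (not part of the kernel source) of the
 * MADV_DONTNEED behaviour implemented above, on an anonymous mapping of
 * arbitrary size.
 *
 *	#include <string.h>
 *	#include <sys/mman.h>
 *
 *	size_t len = 1 << 20;
 *	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	memset(buf, 0xaa, len);			// dirty the pages
 *	// Discard the contents immediately; nothing is written to swap and
 *	// the next access faults in fresh zero-filled pages.
 *	madvise(buf, len, MADV_DONTNEED);
 *	// With MADV_FREE the pages would instead be reclaimed lazily, and a
 *	// write before reclaim would preserve the data.
 */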
    791
    792static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
    793					    unsigned long start,
    794					    unsigned long *end,
    795					    int behavior)
    796{
    797	if (!is_vm_hugetlb_page(vma)) {
    798		unsigned int forbidden = VM_PFNMAP;
    799
    800		if (behavior != MADV_DONTNEED_LOCKED)
    801			forbidden |= VM_LOCKED;
    802
    803		return !(vma->vm_flags & forbidden);
    804	}
    805
    806	if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
    807		return false;
    808	if (start & ~huge_page_mask(hstate_vma(vma)))
    809		return false;
    810
    811	*end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
    812	return true;
    813}
    814
    815static long madvise_dontneed_free(struct vm_area_struct *vma,
    816				  struct vm_area_struct **prev,
    817				  unsigned long start, unsigned long end,
    818				  int behavior)
    819{
    820	struct mm_struct *mm = vma->vm_mm;
    821
    822	*prev = vma;
    823	if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
    824		return -EINVAL;
    825
    826	if (!userfaultfd_remove(vma, start, end)) {
    827		*prev = NULL; /* mmap_lock has been dropped, prev is stale */
    828
    829		mmap_read_lock(mm);
    830		vma = find_vma(mm, start);
    831		if (!vma)
    832			return -ENOMEM;
    833		if (start < vma->vm_start) {
    834			/*
    835			 * This "vma" under revalidation is the one
    836			 * with the lowest vma->vm_start where start
    837			 * is also < vma->vm_end. If start <
     838			 * vma->vm_start it means a hole materialized
    839			 * in the user address space within the
    840			 * virtual range passed to MADV_DONTNEED
    841			 * or MADV_FREE.
    842			 */
    843			return -ENOMEM;
    844		}
    845		/*
    846		 * Potential end adjustment for hugetlb vma is OK as
    847		 * the check below keeps end within vma.
    848		 */
    849		if (!madvise_dontneed_free_valid_vma(vma, start, &end,
    850						     behavior))
    851			return -EINVAL;
    852		if (end > vma->vm_end) {
    853			/*
    854			 * Don't fail if end > vma->vm_end. If the old
    855			 * vma was split while the mmap_lock was
    856			 * released the effect of the concurrent
    857			 * operation may not cause madvise() to
    858			 * have an undefined result. There may be an
    859			 * adjacent next vma that we'll walk
    860			 * next. userfaultfd_remove() will generate an
    861			 * UFFD_EVENT_REMOVE repetition on the
    862			 * end-vma->vm_end range, but the manager can
    863			 * handle a repetition fine.
    864			 */
    865			end = vma->vm_end;
    866		}
    867		VM_WARN_ON(start >= end);
    868	}
    869
    870	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
    871		return madvise_dontneed_single_vma(vma, start, end);
    872	else if (behavior == MADV_FREE)
    873		return madvise_free_single_vma(vma, start, end);
    874	else
    875		return -EINVAL;
    876}
    877
    878static long madvise_populate(struct vm_area_struct *vma,
    879			     struct vm_area_struct **prev,
    880			     unsigned long start, unsigned long end,
    881			     int behavior)
    882{
    883	const bool write = behavior == MADV_POPULATE_WRITE;
    884	struct mm_struct *mm = vma->vm_mm;
    885	unsigned long tmp_end;
    886	int locked = 1;
    887	long pages;
    888
    889	*prev = vma;
    890
    891	while (start < end) {
    892		/*
    893		 * We might have temporarily dropped the lock. For example,
    894		 * our VMA might have been split.
    895		 */
    896		if (!vma || start >= vma->vm_end) {
    897			vma = vma_lookup(mm, start);
    898			if (!vma)
    899				return -ENOMEM;
    900		}
    901
    902		tmp_end = min_t(unsigned long, end, vma->vm_end);
    903		/* Populate (prefault) page tables readable/writable. */
    904		pages = faultin_vma_page_range(vma, start, tmp_end, write,
    905					       &locked);
    906		if (!locked) {
    907			mmap_read_lock(mm);
    908			locked = 1;
    909			*prev = NULL;
    910			vma = NULL;
    911		}
    912		if (pages < 0) {
    913			switch (pages) {
    914			case -EINTR:
    915				return -EINTR;
    916			case -EINVAL: /* Incompatible mappings / permissions. */
    917				return -EINVAL;
    918			case -EHWPOISON:
    919				return -EHWPOISON;
    920			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
    921				return -EFAULT;
    922			default:
    923				pr_warn_once("%s: unhandled return value: %ld\n",
    924					     __func__, pages);
    925				fallthrough;
    926			case -ENOMEM:
    927				return -ENOMEM;
    928			}
    929		}
    930		start += pages * PAGE_SIZE;
    931	}
    932	return 0;
    933}
    934
    935/*
    936 * Application wants to free up the pages and associated backing store.
    937 * This is effectively punching a hole into the middle of a file.
    938 */
    939static long madvise_remove(struct vm_area_struct *vma,
    940				struct vm_area_struct **prev,
    941				unsigned long start, unsigned long end)
    942{
    943	loff_t offset;
    944	int error;
    945	struct file *f;
    946	struct mm_struct *mm = vma->vm_mm;
    947
    948	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
    949
    950	if (vma->vm_flags & VM_LOCKED)
    951		return -EINVAL;
    952
    953	f = vma->vm_file;
    954
    955	if (!f || !f->f_mapping || !f->f_mapping->host) {
     956		return -EINVAL;
    957	}
    958
    959	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
    960		return -EACCES;
    961
    962	offset = (loff_t)(start - vma->vm_start)
    963			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
    964
    965	/*
    966	 * Filesystem's fallocate may need to take i_rwsem.  We need to
    967	 * explicitly grab a reference because the vma (and hence the
    968	 * vma's reference to the file) can go away as soon as we drop
    969	 * mmap_lock.
    970	 */
    971	get_file(f);
    972	if (userfaultfd_remove(vma, start, end)) {
    973		/* mmap_lock was not released by userfaultfd_remove() */
    974		mmap_read_unlock(mm);
    975	}
    976	error = vfs_fallocate(f,
    977				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
    978				offset, end - start);
    979	fput(f);
    980	mmap_read_lock(mm);
    981	return error;
    982}
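
/*
 * Illustrative userspace sketch (not part of the kernel source) of the
 * MADV_REMOVE path above. The file name is hypothetical and a 4 KiB page
 * size is assumed.
 *
 *	#include <fcntl.h>
 *	#include <sys/mman.h>
 *
 *	int fd = open("cache.db", O_RDWR);
 *	char *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED, fd, 0);
 *	// Drop one page of data together with its backing store; later reads
 *	// of the range observe zeroes. The mapping must be MAP_SHARED and
 *	// writable, otherwise madvise() fails with EACCES.
 *	madvise(p + 4096, 4096, MADV_REMOVE);
 */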
    983
    984/*
    985 * Apply an madvise behavior to a region of a vma.  madvise_update_vma
    986 * will handle splitting a vm area into separate areas, each area with its own
    987 * behavior.
    988 */
    989static int madvise_vma_behavior(struct vm_area_struct *vma,
    990				struct vm_area_struct **prev,
    991				unsigned long start, unsigned long end,
    992				unsigned long behavior)
    993{
    994	int error;
    995	struct anon_vma_name *anon_name;
    996	unsigned long new_flags = vma->vm_flags;
    997
    998	switch (behavior) {
    999	case MADV_REMOVE:
   1000		return madvise_remove(vma, prev, start, end);
   1001	case MADV_WILLNEED:
   1002		return madvise_willneed(vma, prev, start, end);
   1003	case MADV_COLD:
   1004		return madvise_cold(vma, prev, start, end);
   1005	case MADV_PAGEOUT:
   1006		return madvise_pageout(vma, prev, start, end);
   1007	case MADV_FREE:
   1008	case MADV_DONTNEED:
   1009	case MADV_DONTNEED_LOCKED:
   1010		return madvise_dontneed_free(vma, prev, start, end, behavior);
   1011	case MADV_POPULATE_READ:
   1012	case MADV_POPULATE_WRITE:
   1013		return madvise_populate(vma, prev, start, end, behavior);
   1014	case MADV_NORMAL:
   1015		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
   1016		break;
   1017	case MADV_SEQUENTIAL:
   1018		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
   1019		break;
   1020	case MADV_RANDOM:
   1021		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
   1022		break;
   1023	case MADV_DONTFORK:
   1024		new_flags |= VM_DONTCOPY;
   1025		break;
   1026	case MADV_DOFORK:
   1027		if (vma->vm_flags & VM_IO)
   1028			return -EINVAL;
   1029		new_flags &= ~VM_DONTCOPY;
   1030		break;
   1031	case MADV_WIPEONFORK:
   1032		/* MADV_WIPEONFORK is only supported on anonymous memory. */
   1033		if (vma->vm_file || vma->vm_flags & VM_SHARED)
   1034			return -EINVAL;
   1035		new_flags |= VM_WIPEONFORK;
   1036		break;
   1037	case MADV_KEEPONFORK:
   1038		new_flags &= ~VM_WIPEONFORK;
   1039		break;
   1040	case MADV_DONTDUMP:
   1041		new_flags |= VM_DONTDUMP;
   1042		break;
   1043	case MADV_DODUMP:
   1044		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
   1045			return -EINVAL;
   1046		new_flags &= ~VM_DONTDUMP;
   1047		break;
   1048	case MADV_MERGEABLE:
   1049	case MADV_UNMERGEABLE:
   1050		error = ksm_madvise(vma, start, end, behavior, &new_flags);
   1051		if (error)
   1052			goto out;
   1053		break;
   1054	case MADV_HUGEPAGE:
   1055	case MADV_NOHUGEPAGE:
   1056		error = hugepage_madvise(vma, &new_flags, behavior);
   1057		if (error)
   1058			goto out;
   1059		break;
   1060	}
   1061
   1062	anon_name = anon_vma_name(vma);
   1063	anon_vma_name_get(anon_name);
   1064	error = madvise_update_vma(vma, prev, start, end, new_flags,
   1065				   anon_name);
   1066	anon_vma_name_put(anon_name);
   1067
   1068out:
   1069	/*
   1070	 * madvise() returns EAGAIN if kernel resources, such as
   1071	 * slab, are temporarily unavailable.
   1072	 */
   1073	if (error == -ENOMEM)
   1074		error = -EAGAIN;
   1075	return error;
   1076}
   1077
   1078#ifdef CONFIG_MEMORY_FAILURE
   1079/*
   1080 * Error injection support for memory error handling.
   1081 */
   1082static int madvise_inject_error(int behavior,
   1083		unsigned long start, unsigned long end)
   1084{
   1085	unsigned long size;
   1086
   1087	if (!capable(CAP_SYS_ADMIN))
   1088		return -EPERM;
   1089
   1090
   1091	for (; start < end; start += size) {
   1092		unsigned long pfn;
   1093		struct page *page;
   1094		int ret;
   1095
   1096		ret = get_user_pages_fast(start, 1, 0, &page);
   1097		if (ret != 1)
   1098			return ret;
   1099		pfn = page_to_pfn(page);
   1100
   1101		/*
   1102		 * When soft offlining hugepages, after migrating the page
   1103		 * we dissolve it, therefore in the second loop "page" will
   1104		 * no longer be a compound page.
   1105		 */
   1106		size = page_size(compound_head(page));
   1107
   1108		if (behavior == MADV_SOFT_OFFLINE) {
   1109			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
   1110				 pfn, start);
   1111			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
   1112		} else {
   1113			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
   1114				 pfn, start);
   1115			ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
   1116			if (ret == -EOPNOTSUPP)
   1117				ret = 0;
   1118		}
   1119
   1120		if (ret)
   1121			return ret;
   1122	}
   1123
   1124	return 0;
   1125}
   1126#endif
   1127
   1128static bool
   1129madvise_behavior_valid(int behavior)
   1130{
   1131	switch (behavior) {
   1132	case MADV_DOFORK:
   1133	case MADV_DONTFORK:
   1134	case MADV_NORMAL:
   1135	case MADV_SEQUENTIAL:
   1136	case MADV_RANDOM:
   1137	case MADV_REMOVE:
   1138	case MADV_WILLNEED:
   1139	case MADV_DONTNEED:
   1140	case MADV_DONTNEED_LOCKED:
   1141	case MADV_FREE:
   1142	case MADV_COLD:
   1143	case MADV_PAGEOUT:
   1144	case MADV_POPULATE_READ:
   1145	case MADV_POPULATE_WRITE:
   1146#ifdef CONFIG_KSM
   1147	case MADV_MERGEABLE:
   1148	case MADV_UNMERGEABLE:
   1149#endif
   1150#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   1151	case MADV_HUGEPAGE:
   1152	case MADV_NOHUGEPAGE:
   1153#endif
   1154	case MADV_DONTDUMP:
   1155	case MADV_DODUMP:
   1156	case MADV_WIPEONFORK:
   1157	case MADV_KEEPONFORK:
   1158#ifdef CONFIG_MEMORY_FAILURE
   1159	case MADV_SOFT_OFFLINE:
   1160	case MADV_HWPOISON:
   1161#endif
   1162		return true;
   1163
   1164	default:
   1165		return false;
   1166	}
   1167}
   1168
   1169static bool
   1170process_madvise_behavior_valid(int behavior)
   1171{
   1172	switch (behavior) {
   1173	case MADV_COLD:
   1174	case MADV_PAGEOUT:
   1175	case MADV_WILLNEED:
   1176		return true;
   1177	default:
   1178		return false;
   1179	}
   1180}
   1181
   1182/*
   1183 * Walk the vmas in range [start,end), and call the visit function on each one.
   1184 * The visit function will get start and end parameters that cover the overlap
   1185 * between the current vma and the original range.  Any unmapped regions in the
   1186 * original range will result in this function returning -ENOMEM while still
   1187 * calling the visit function on all of the existing vmas in the range.
   1188 * Must be called with the mmap_lock held for reading or writing.
   1189 */
   1190static
   1191int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
   1192		      unsigned long end, unsigned long arg,
   1193		      int (*visit)(struct vm_area_struct *vma,
   1194				   struct vm_area_struct **prev, unsigned long start,
   1195				   unsigned long end, unsigned long arg))
   1196{
   1197	struct vm_area_struct *vma;
   1198	struct vm_area_struct *prev;
   1199	unsigned long tmp;
   1200	int unmapped_error = 0;
   1201
   1202	/*
   1203	 * If the interval [start,end) covers some unmapped address
   1204	 * ranges, just ignore them, but return -ENOMEM at the end.
    1205	 * This differs from how mlock etc. handle such ranges.
   1206	 */
   1207	vma = find_vma_prev(mm, start, &prev);
   1208	if (vma && start > vma->vm_start)
   1209		prev = vma;
   1210
   1211	for (;;) {
   1212		int error;
   1213
   1214		/* Still start < end. */
   1215		if (!vma)
   1216			return -ENOMEM;
   1217
   1218		/* Here start < (end|vma->vm_end). */
   1219		if (start < vma->vm_start) {
   1220			unmapped_error = -ENOMEM;
   1221			start = vma->vm_start;
   1222			if (start >= end)
   1223				break;
   1224		}
   1225
   1226		/* Here vma->vm_start <= start < (end|vma->vm_end) */
   1227		tmp = vma->vm_end;
   1228		if (end < tmp)
   1229			tmp = end;
   1230
   1231		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
   1232		error = visit(vma, &prev, start, tmp, arg);
   1233		if (error)
   1234			return error;
   1235		start = tmp;
   1236		if (prev && start < prev->vm_end)
   1237			start = prev->vm_end;
   1238		if (start >= end)
   1239			break;
   1240		if (prev)
   1241			vma = prev->vm_next;
   1242		else	/* madvise_remove dropped mmap_lock */
   1243			vma = find_vma(mm, start);
   1244	}
   1245
   1246	return unmapped_error;
   1247}
   1248
   1249#ifdef CONFIG_ANON_VMA_NAME
   1250static int madvise_vma_anon_name(struct vm_area_struct *vma,
   1251				 struct vm_area_struct **prev,
   1252				 unsigned long start, unsigned long end,
   1253				 unsigned long anon_name)
   1254{
   1255	int error;
   1256
   1257	/* Only anonymous mappings can be named */
   1258	if (vma->vm_file)
   1259		return -EBADF;
   1260
   1261	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
   1262				   (struct anon_vma_name *)anon_name);
   1263
   1264	/*
   1265	 * madvise() returns EAGAIN if kernel resources, such as
   1266	 * slab, are temporarily unavailable.
   1267	 */
   1268	if (error == -ENOMEM)
   1269		error = -EAGAIN;
   1270	return error;
   1271}
   1272
   1273int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
   1274			  unsigned long len_in, struct anon_vma_name *anon_name)
   1275{
   1276	unsigned long end;
   1277	unsigned long len;
   1278
   1279	if (start & ~PAGE_MASK)
   1280		return -EINVAL;
   1281	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
   1282
   1283	/* Check to see whether len was rounded up from small -ve to zero */
   1284	if (len_in && !len)
   1285		return -EINVAL;
   1286
   1287	end = start + len;
   1288	if (end < start)
   1289		return -EINVAL;
   1290
   1291	if (end == start)
   1292		return 0;
   1293
   1294	return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
   1295				 madvise_vma_anon_name);
   1296}
   1297#endif /* CONFIG_ANON_VMA_NAME */
   1298/*
   1299 * The madvise(2) system call.
   1300 *
   1301 * Applications can use madvise() to advise the kernel how it should
   1302 * handle paging I/O in this VM area.  The idea is to help the kernel
   1303 * use appropriate read-ahead and caching techniques.  The information
   1304 * provided is advisory only, and can be safely disregarded by the
   1305 * kernel without affecting the correct operation of the application.
   1306 *
   1307 * behavior values:
   1308 *  MADV_NORMAL - the default behavior is to read clusters.  This
   1309 *		results in some read-ahead and read-behind.
   1310 *  MADV_RANDOM - the system should read the minimum amount of data
   1311 *		on any access, since it is unlikely that the appli-
   1312 *		cation will need more than what it asks for.
   1313 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
   1314 *		once, so they can be aggressively read ahead, and
   1315 *		can be freed soon after they are accessed.
   1316 *  MADV_WILLNEED - the application is notifying the system to read
   1317 *		some pages ahead.
   1318 *  MADV_DONTNEED - the application is finished with the given range,
   1319 *		so the kernel can free resources associated with it.
   1320 *  MADV_FREE - the application marks pages in the given range as lazy free,
   1321 *		where actual purges are postponed until memory pressure happens.
   1322 *  MADV_REMOVE - the application wants to free up the given range of
   1323 *		pages and associated backing store.
   1324 *  MADV_DONTFORK - omit this area from child's address space when forking:
   1325 *		typically, to avoid COWing pages pinned by get_user_pages().
   1326 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
   1327 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
   1328 *              range after a fork.
   1329 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
   1330 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
   1331 *		were corrupted by unrecoverable hardware memory failure.
   1332 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
   1333 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
   1334 *		this area with pages of identical content from other such areas.
    1335 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
   1336 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
   1337 *		huge pages in the future. Existing pages might be coalesced and
   1338 *		new pages might be allocated as THP.
   1339 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
   1340 *		transparent huge pages so the existing pages will not be
   1341 *		coalesced into THP and new pages will not be allocated as THP.
   1342 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
   1343 *		from being included in its core dump.
   1344 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
   1345 *  MADV_COLD - the application is not expected to use this memory soon,
   1346 *		deactivate pages in this range so that they can be reclaimed
   1347 *		easily if memory pressure happens.
   1348 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
   1349 *		page out the pages in this range immediately.
   1350 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
   1351 *		triggering read faults if required
   1352 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
   1353 *		triggering write faults if required
   1354 *
   1355 * return values:
   1356 *  zero    - success
   1357 *  -EINVAL - start + len < 0, start is not page-aligned,
   1358 *		"behavior" is not a valid value, or application
   1359 *		is attempting to release locked or shared pages,
   1360 *		or the specified address range includes file, Huge TLB,
    1361 *		MAP_SHARED or VM_PFNMAP range.
   1362 *  -ENOMEM - addresses in the specified range are not currently
   1363 *		mapped, or are outside the AS of the process.
   1364 *  -EIO    - an I/O error occurred while paging in data.
   1365 *  -EBADF  - map exists, but area maps something that isn't a file.
   1366 *  -EAGAIN - a kernel resource was temporarily unavailable.
   1367 */
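
/*
 * A small userspace sketch (not part of the kernel source) showing how the
 * advisory return values documented above are typically handled; "addr" and
 * "len" stand for an existing, page-aligned mapping.
 *
 *	#include <errno.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <sys/mman.h>
 *
 *	if (madvise(addr, len, MADV_SEQUENTIAL) == -1) {
 *		// EINVAL: bad range or behavior; ENOMEM: part of the range
 *		// is not mapped. The hint is only advisory, so callers
 *		// usually just log the failure and continue.
 *		fprintf(stderr, "madvise: %s\n", strerror(errno));
 *	}
 */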
   1368int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
   1369{
   1370	unsigned long end;
   1371	int error;
   1372	int write;
   1373	size_t len;
   1374	struct blk_plug plug;
   1375
   1376	start = untagged_addr(start);
   1377
   1378	if (!madvise_behavior_valid(behavior))
   1379		return -EINVAL;
   1380
   1381	if (!PAGE_ALIGNED(start))
   1382		return -EINVAL;
   1383	len = PAGE_ALIGN(len_in);
   1384
   1385	/* Check to see whether len was rounded up from small -ve to zero */
   1386	if (len_in && !len)
   1387		return -EINVAL;
   1388
   1389	end = start + len;
   1390	if (end < start)
   1391		return -EINVAL;
   1392
   1393	if (end == start)
   1394		return 0;
   1395
   1396#ifdef CONFIG_MEMORY_FAILURE
   1397	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
   1398		return madvise_inject_error(behavior, start, start + len_in);
   1399#endif
   1400
   1401	write = madvise_need_mmap_write(behavior);
   1402	if (write) {
   1403		if (mmap_write_lock_killable(mm))
   1404			return -EINTR;
   1405	} else {
   1406		mmap_read_lock(mm);
   1407	}
   1408
   1409	blk_start_plug(&plug);
   1410	error = madvise_walk_vmas(mm, start, end, behavior,
   1411			madvise_vma_behavior);
   1412	blk_finish_plug(&plug);
   1413	if (write)
   1414		mmap_write_unlock(mm);
   1415	else
   1416		mmap_read_unlock(mm);
   1417
   1418	return error;
   1419}
   1420
   1421SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
   1422{
   1423	return do_madvise(current->mm, start, len_in, behavior);
   1424}
   1425
   1426SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
   1427		size_t, vlen, int, behavior, unsigned int, flags)
   1428{
   1429	ssize_t ret;
   1430	struct iovec iovstack[UIO_FASTIOV], iovec;
   1431	struct iovec *iov = iovstack;
   1432	struct iov_iter iter;
   1433	struct task_struct *task;
   1434	struct mm_struct *mm;
   1435	size_t total_len;
   1436	unsigned int f_flags;
   1437
   1438	if (flags != 0) {
   1439		ret = -EINVAL;
   1440		goto out;
   1441	}
   1442
   1443	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
   1444	if (ret < 0)
   1445		goto out;
   1446
   1447	task = pidfd_get_task(pidfd, &f_flags);
   1448	if (IS_ERR(task)) {
   1449		ret = PTR_ERR(task);
   1450		goto free_iov;
   1451	}
   1452
   1453	if (!process_madvise_behavior_valid(behavior)) {
   1454		ret = -EINVAL;
   1455		goto release_task;
   1456	}
   1457
   1458	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
   1459	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
   1460	if (IS_ERR_OR_NULL(mm)) {
   1461		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
   1462		goto release_task;
   1463	}
   1464
   1465	/*
   1466	 * Require CAP_SYS_NICE for influencing process performance. Note that
   1467	 * only non-destructive hints are currently supported.
   1468	 */
   1469	if (!capable(CAP_SYS_NICE)) {
   1470		ret = -EPERM;
   1471		goto release_mm;
   1472	}
   1473
   1474	total_len = iov_iter_count(&iter);
   1475
   1476	while (iov_iter_count(&iter)) {
   1477		iovec = iov_iter_iovec(&iter);
   1478		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
   1479					iovec.iov_len, behavior);
   1480		if (ret < 0)
   1481			break;
   1482		iov_iter_advance(&iter, iovec.iov_len);
   1483	}
   1484
   1485	ret = (total_len - iov_iter_count(&iter)) ? : ret;
   1486
   1487release_mm:
   1488	mmput(mm);
   1489release_task:
   1490	put_task_struct(task);
   1491free_iov:
   1492	kfree(iov);
   1493out:
   1494	return ret;
   1495}
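
/*
 * Illustrative userspace sketch (not part of the kernel source) of driving
 * the process_madvise() syscall defined above via syscall(2), assuming
 * <sys/syscall.h> provides the SYS_* numbers. "target_pid", "target_addr"
 * and "target_len" are hypothetical; the caller needs CAP_SYS_NICE plus
 * PTRACE_MODE_READ access to the target.
 *
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <sys/uio.h>
 *	#include <unistd.h>
 *
 *	int pidfd = syscall(SYS_pidfd_open, target_pid, 0);
 *	struct iovec iov = {
 *		.iov_base = (void *)target_addr,
 *		.iov_len  = target_len,
 *	};
 *	// Ask the kernel to page out that range of the target's memory; only
 *	// the non-destructive hints listed in process_madvise_behavior_valid()
 *	// are accepted.
 *	ssize_t n = syscall(SYS_process_madvise, pidfd, &iov, 1,
 *			    MADV_PAGEOUT, 0);
 */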