cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

mremap.c (28785B)

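The file below implements the mremap(2) system call: expanding, shrinking, or moving an existing mapping, including the page-table level fast paths. For orientation only, here is a minimal userspace sketch of the call this code services; it is an illustrative addition (standard glibc/Linux API, error handling kept minimal) and not part of the kernel source:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t old_len = 4096, new_len = 8192;

	/* Anonymous mapping that we will ask the kernel to grow. */
	void *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;
	memset(p, 0x5a, old_len);

	/*
	 * MREMAP_MAYMOVE lets the kernel relocate the mapping if it
	 * cannot be expanded at its current address; that decision is
	 * made by the mremap implementation below.
	 */
	void *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
	if (q == MAP_FAILED)
		return 1;

	printf("new address %p (%s)\n", q, q == p ? "expanded in place" : "moved");
	munmap(q, new_len);
	return 0;
}

A request like this enters SYSCALL_DEFINE5(mremap, ...) near the end of the file and, when the vma cannot simply be grown in place, goes through move_vma() and move_page_tables().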

      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 *	mm/mremap.c
      4 *
      5 *	(C) Copyright 1996 Linus Torvalds
      6 *
      7 *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
      8 *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
      9 */
     10
     11#include <linux/mm.h>
     12#include <linux/hugetlb.h>
     13#include <linux/shm.h>
     14#include <linux/ksm.h>
     15#include <linux/mman.h>
     16#include <linux/swap.h>
     17#include <linux/capability.h>
     18#include <linux/fs.h>
     19#include <linux/swapops.h>
     20#include <linux/highmem.h>
     21#include <linux/security.h>
     22#include <linux/syscalls.h>
     23#include <linux/mmu_notifier.h>
     24#include <linux/uaccess.h>
     25#include <linux/userfaultfd_k.h>
     26
     27#include <asm/cacheflush.h>
     28#include <asm/tlb.h>
     29#include <asm/pgalloc.h>
     30
     31#include "internal.h"
     32
     33static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
     34{
     35	pgd_t *pgd;
     36	p4d_t *p4d;
     37	pud_t *pud;
     38
     39	pgd = pgd_offset(mm, addr);
     40	if (pgd_none_or_clear_bad(pgd))
     41		return NULL;
     42
     43	p4d = p4d_offset(pgd, addr);
     44	if (p4d_none_or_clear_bad(p4d))
     45		return NULL;
     46
     47	pud = pud_offset(p4d, addr);
     48	if (pud_none_or_clear_bad(pud))
     49		return NULL;
     50
     51	return pud;
     52}
     53
     54static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
     55{
     56	pud_t *pud;
     57	pmd_t *pmd;
     58
     59	pud = get_old_pud(mm, addr);
     60	if (!pud)
     61		return NULL;
     62
     63	pmd = pmd_offset(pud, addr);
     64	if (pmd_none(*pmd))
     65		return NULL;
     66
     67	return pmd;
     68}
     69
     70static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
     71			    unsigned long addr)
     72{
     73	pgd_t *pgd;
     74	p4d_t *p4d;
     75
     76	pgd = pgd_offset(mm, addr);
     77	p4d = p4d_alloc(mm, pgd, addr);
     78	if (!p4d)
     79		return NULL;
     80
     81	return pud_alloc(mm, p4d, addr);
     82}
     83
     84static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
     85			    unsigned long addr)
     86{
     87	pud_t *pud;
     88	pmd_t *pmd;
     89
     90	pud = alloc_new_pud(mm, vma, addr);
     91	if (!pud)
     92		return NULL;
     93
     94	pmd = pmd_alloc(mm, pud, addr);
     95	if (!pmd)
     96		return NULL;
     97
     98	VM_BUG_ON(pmd_trans_huge(*pmd));
     99
    100	return pmd;
    101}
    102
    103static void take_rmap_locks(struct vm_area_struct *vma)
    104{
    105	if (vma->vm_file)
    106		i_mmap_lock_write(vma->vm_file->f_mapping);
    107	if (vma->anon_vma)
    108		anon_vma_lock_write(vma->anon_vma);
    109}
    110
    111static void drop_rmap_locks(struct vm_area_struct *vma)
    112{
    113	if (vma->anon_vma)
    114		anon_vma_unlock_write(vma->anon_vma);
    115	if (vma->vm_file)
    116		i_mmap_unlock_write(vma->vm_file->f_mapping);
    117}
    118
    119static pte_t move_soft_dirty_pte(pte_t pte)
    120{
    121	/*
    122	 * Set soft dirty bit so we can notice
    123	 * in userspace the ptes were moved.
    124	 */
    125#ifdef CONFIG_MEM_SOFT_DIRTY
    126	if (pte_present(pte))
    127		pte = pte_mksoft_dirty(pte);
    128	else if (is_swap_pte(pte))
    129		pte = pte_swp_mksoft_dirty(pte);
    130#endif
    131	return pte;
    132}
    133
    134static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
    135		unsigned long old_addr, unsigned long old_end,
    136		struct vm_area_struct *new_vma, pmd_t *new_pmd,
    137		unsigned long new_addr, bool need_rmap_locks)
    138{
    139	struct mm_struct *mm = vma->vm_mm;
    140	pte_t *old_pte, *new_pte, pte;
    141	spinlock_t *old_ptl, *new_ptl;
    142	bool force_flush = false;
    143	unsigned long len = old_end - old_addr;
    144
    145	/*
    146	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
    147	 * locks to ensure that rmap will always observe either the old or the
    148	 * new ptes. This is the easiest way to avoid races with
    149	 * truncate_pagecache(), page migration, etc...
    150	 *
    151	 * When need_rmap_locks is false, we use other ways to avoid
    152	 * such races:
    153	 *
    154	 * - During exec() shift_arg_pages(), we use a specially tagged vma
    155	 *   which rmap call sites look for using vma_is_temporary_stack().
    156	 *
    157	 * - During mremap(), new_vma is often known to be placed after vma
    158	 *   in rmap traversal order. This ensures rmap will always observe
    159	 *   either the old pte, or the new pte, or both (the page table locks
    160	 *   serialize access to individual ptes, but only rmap traversal
    161	 *   order guarantees that we won't miss both the old and new ptes).
    162	 */
    163	if (need_rmap_locks)
    164		take_rmap_locks(vma);
    165
    166	/*
    167	 * We don't have to worry about the ordering of src and dst
    168	 * pte locks because exclusive mmap_lock prevents deadlock.
    169	 */
    170	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
    171	new_pte = pte_offset_map(new_pmd, new_addr);
    172	new_ptl = pte_lockptr(mm, new_pmd);
    173	if (new_ptl != old_ptl)
    174		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
    175	flush_tlb_batched_pending(vma->vm_mm);
    176	arch_enter_lazy_mmu_mode();
    177
    178	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
    179				   new_pte++, new_addr += PAGE_SIZE) {
    180		if (pte_none(*old_pte))
    181			continue;
    182
    183		pte = ptep_get_and_clear(mm, old_addr, old_pte);
    184		/*
    185		 * If we are remapping a valid PTE, make sure
    186		 * to flush TLB before we drop the PTL for the
    187		 * PTE.
    188		 *
    189		 * NOTE! Both old and new PTL matter: the old one
    190		 * for racing with page_mkclean(), the new one to
    191		 * make sure the physical page stays valid until
    192		 * the TLB entry for the old mapping has been
    193		 * flushed.
    194		 */
    195		if (pte_present(pte))
    196			force_flush = true;
    197		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
    198		pte = move_soft_dirty_pte(pte);
    199		set_pte_at(mm, new_addr, new_pte, pte);
    200	}
    201
    202	arch_leave_lazy_mmu_mode();
    203	if (force_flush)
    204		flush_tlb_range(vma, old_end - len, old_end);
    205	if (new_ptl != old_ptl)
    206		spin_unlock(new_ptl);
    207	pte_unmap(new_pte - 1);
    208	pte_unmap_unlock(old_pte - 1, old_ptl);
    209	if (need_rmap_locks)
    210		drop_rmap_locks(vma);
    211}
    212
    213#ifndef arch_supports_page_table_move
    214#define arch_supports_page_table_move arch_supports_page_table_move
    215static inline bool arch_supports_page_table_move(void)
    216{
    217	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
    218		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
    219}
    220#endif
    221
    222#ifdef CONFIG_HAVE_MOVE_PMD
    223static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
    224		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
    225{
    226	spinlock_t *old_ptl, *new_ptl;
    227	struct mm_struct *mm = vma->vm_mm;
    228	pmd_t pmd;
    229
    230	if (!arch_supports_page_table_move())
    231		return false;
    232	/*
    233	 * The destination pmd shouldn't be established, free_pgtables()
    234	 * should have released it.
    235	 *
    236	 * However, there's a case during execve() where we use mremap
    237	 * to move the initial stack, and in that case the target area
    238	 * may overlap the source area (always moving down).
    239	 *
    240	 * If everything is PMD-aligned, that works fine, as moving
    241	 * each pmd down will clear the source pmd. But if we first
    242	 * have a few 4kB-only pages that get moved down, and then
    243	 * hit the "now the rest is PMD-aligned, let's do everything
    244	 * one pmd at a time", we will still have the old (now empty
    245	 * of any 4kB pages, but still there) PMD in the page table
    246	 * tree.
    247	 *
    248	 * Warn on it once - because we really should try to figure
    249	 * out how to do this better - but then say "I won't move
    250	 * this pmd".
    251	 *
    252	 * One alternative might be to just unmap the target pmd at
    253	 * this point, and verify that it really is empty. We'll see.
    254	 */
    255	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
    256		return false;
    257
    258	/*
    259	 * We don't have to worry about the ordering of src and dst
    260	 * ptlocks because exclusive mmap_lock prevents deadlock.
    261	 */
    262	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
    263	new_ptl = pmd_lockptr(mm, new_pmd);
    264	if (new_ptl != old_ptl)
    265		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
    266
    267	/* Clear the pmd */
    268	pmd = *old_pmd;
    269	pmd_clear(old_pmd);
    270
    271	VM_BUG_ON(!pmd_none(*new_pmd));
    272
    273	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
    274	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
    275	if (new_ptl != old_ptl)
    276		spin_unlock(new_ptl);
    277	spin_unlock(old_ptl);
    278
    279	return true;
    280}
    281#else
    282static inline bool move_normal_pmd(struct vm_area_struct *vma,
    283		unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
    284		pmd_t *new_pmd)
    285{
    286	return false;
    287}
    288#endif
    289
    290#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
    291static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
    292		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
    293{
    294	spinlock_t *old_ptl, *new_ptl;
    295	struct mm_struct *mm = vma->vm_mm;
    296	pud_t pud;
    297
    298	if (!arch_supports_page_table_move())
    299		return false;
    300	/*
    301	 * The destination pud shouldn't be established, free_pgtables()
    302	 * should have released it.
    303	 */
    304	if (WARN_ON_ONCE(!pud_none(*new_pud)))
    305		return false;
    306
    307	/*
    308	 * We don't have to worry about the ordering of src and dst
    309	 * ptlocks because exclusive mmap_lock prevents deadlock.
    310	 */
    311	old_ptl = pud_lock(vma->vm_mm, old_pud);
    312	new_ptl = pud_lockptr(mm, new_pud);
    313	if (new_ptl != old_ptl)
    314		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
    315
    316	/* Clear the pud */
    317	pud = *old_pud;
    318	pud_clear(old_pud);
    319
    320	VM_BUG_ON(!pud_none(*new_pud));
    321
    322	pud_populate(mm, new_pud, pud_pgtable(pud));
    323	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
    324	if (new_ptl != old_ptl)
    325		spin_unlock(new_ptl);
    326	spin_unlock(old_ptl);
    327
    328	return true;
    329}
    330#else
    331static inline bool move_normal_pud(struct vm_area_struct *vma,
    332		unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
    333		pud_t *new_pud)
    334{
    335	return false;
    336}
    337#endif
    338
    339#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
    340static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
    341			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
    342{
    343	spinlock_t *old_ptl, *new_ptl;
    344	struct mm_struct *mm = vma->vm_mm;
    345	pud_t pud;
    346
    347	/*
    348	 * The destination pud shouldn't be established, free_pgtables()
    349	 * should have released it.
    350	 */
    351	if (WARN_ON_ONCE(!pud_none(*new_pud)))
    352		return false;
    353
    354	/*
    355	 * We don't have to worry about the ordering of src and dst
    356	 * ptlocks because exclusive mmap_lock prevents deadlock.
    357	 */
    358	old_ptl = pud_lock(vma->vm_mm, old_pud);
    359	new_ptl = pud_lockptr(mm, new_pud);
    360	if (new_ptl != old_ptl)
    361		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
    362
    363	/* Clear the pud */
    364	pud = *old_pud;
    365	pud_clear(old_pud);
    366
    367	VM_BUG_ON(!pud_none(*new_pud));
    368
    369	/* Set the new pud */
     370	/* mark soft_dirty when we add pud-level soft dirty support */
    371	set_pud_at(mm, new_addr, new_pud, pud);
    372	flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
    373	if (new_ptl != old_ptl)
    374		spin_unlock(new_ptl);
    375	spin_unlock(old_ptl);
    376
    377	return true;
    378}
    379#else
    380static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
    381			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
    382{
    383	WARN_ON_ONCE(1);
    384	return false;
    385
    386}
    387#endif
    388
    389enum pgt_entry {
    390	NORMAL_PMD,
    391	HPAGE_PMD,
    392	NORMAL_PUD,
    393	HPAGE_PUD,
    394};
    395
    396/*
    397 * Returns an extent of the corresponding size for the pgt_entry specified if
    398 * valid. Else returns a smaller extent bounded by the end of the source and
    399 * destination pgt_entry.
    400 */
    401static __always_inline unsigned long get_extent(enum pgt_entry entry,
    402			unsigned long old_addr, unsigned long old_end,
    403			unsigned long new_addr)
    404{
    405	unsigned long next, extent, mask, size;
    406
    407	switch (entry) {
    408	case HPAGE_PMD:
    409	case NORMAL_PMD:
    410		mask = PMD_MASK;
    411		size = PMD_SIZE;
    412		break;
    413	case HPAGE_PUD:
    414	case NORMAL_PUD:
    415		mask = PUD_MASK;
    416		size = PUD_SIZE;
    417		break;
    418	default:
    419		BUILD_BUG();
    420		break;
    421	}
    422
    423	next = (old_addr + size) & mask;
    424	/* even if next overflowed, extent below will be ok */
    425	extent = next - old_addr;
    426	if (extent > old_end - old_addr)
    427		extent = old_end - old_addr;
    428	next = (new_addr + size) & mask;
    429	if (extent > next - new_addr)
    430		extent = next - new_addr;
    431	return extent;
    432}
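/*
 * Worked example for get_extent() above (illustrative addition, assuming
 * x86-64 with 2 MiB PMDs, i.e. PMD_SIZE == 0x200000):
 *
 *   get_extent(NORMAL_PMD, old_addr = 0x2ff000, old_end = 0x600000,
 *              new_addr = 0x701000)
 *
 *   next   = (0x2ff000 + 0x200000) & PMD_MASK  = 0x400000
 *   extent = 0x400000 - 0x2ff000               = 0x101000
 *            (not clamped: old_end - old_addr == 0x301000 is larger)
 *   next   = (0x701000 + 0x200000) & PMD_MASK  = 0x800000
 *   next - new_addr = 0xff000 < extent, so extent = 0xff000
 *
 * i.e. the step is capped so that neither the source nor the destination
 * range crosses a PMD boundary.
 */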
    433
    434/*
     435 * Attempts to speed up the move by moving the entry at the level corresponding
    436 * pgt_entry. Returns true if the move was successful, else false.
    437 */
    438static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
    439			unsigned long old_addr, unsigned long new_addr,
    440			void *old_entry, void *new_entry, bool need_rmap_locks)
    441{
    442	bool moved = false;
    443
    444	/* See comment in move_ptes() */
    445	if (need_rmap_locks)
    446		take_rmap_locks(vma);
    447
    448	switch (entry) {
    449	case NORMAL_PMD:
    450		moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
    451					new_entry);
    452		break;
    453	case NORMAL_PUD:
    454		moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
    455					new_entry);
    456		break;
    457	case HPAGE_PMD:
    458		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
    459			move_huge_pmd(vma, old_addr, new_addr, old_entry,
    460				      new_entry);
    461		break;
    462	case HPAGE_PUD:
    463		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
    464			move_huge_pud(vma, old_addr, new_addr, old_entry,
    465				      new_entry);
    466		break;
    467
    468	default:
    469		WARN_ON_ONCE(1);
    470		break;
    471	}
    472
    473	if (need_rmap_locks)
    474		drop_rmap_locks(vma);
    475
    476	return moved;
    477}
    478
    479unsigned long move_page_tables(struct vm_area_struct *vma,
    480		unsigned long old_addr, struct vm_area_struct *new_vma,
    481		unsigned long new_addr, unsigned long len,
    482		bool need_rmap_locks)
    483{
    484	unsigned long extent, old_end;
    485	struct mmu_notifier_range range;
    486	pmd_t *old_pmd, *new_pmd;
    487	pud_t *old_pud, *new_pud;
    488
    489	if (!len)
    490		return 0;
    491
    492	old_end = old_addr + len;
    493
    494	if (is_vm_hugetlb_page(vma))
    495		return move_hugetlb_page_tables(vma, new_vma, old_addr,
    496						new_addr, len);
    497
    498	flush_cache_range(vma, old_addr, old_end);
    499	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
    500				old_addr, old_end);
    501	mmu_notifier_invalidate_range_start(&range);
    502
    503	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
    504		cond_resched();
    505		/*
    506		 * If extent is PUD-sized try to speed up the move by moving at the
    507		 * PUD level if possible.
    508		 */
    509		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
    510
    511		old_pud = get_old_pud(vma->vm_mm, old_addr);
    512		if (!old_pud)
    513			continue;
    514		new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
    515		if (!new_pud)
    516			break;
    517		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
    518			if (extent == HPAGE_PUD_SIZE) {
    519				move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
    520					       old_pud, new_pud, need_rmap_locks);
    521				/* We ignore and continue on error? */
    522				continue;
    523			}
    524		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
    525
    526			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
    527					   old_pud, new_pud, true))
    528				continue;
    529		}
    530
    531		extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
    532		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
    533		if (!old_pmd)
    534			continue;
    535		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
    536		if (!new_pmd)
    537			break;
    538		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
    539		    pmd_devmap(*old_pmd)) {
    540			if (extent == HPAGE_PMD_SIZE &&
    541			    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
    542					   old_pmd, new_pmd, need_rmap_locks))
    543				continue;
    544			split_huge_pmd(vma, old_pmd, old_addr);
    545			if (pmd_trans_unstable(old_pmd))
    546				continue;
    547		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
    548			   extent == PMD_SIZE) {
    549			/*
    550			 * If the extent is PMD-sized, try to speed the move by
    551			 * moving at the PMD level if possible.
    552			 */
    553			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
    554					   old_pmd, new_pmd, true))
    555				continue;
    556		}
    557
    558		if (pte_alloc(new_vma->vm_mm, new_pmd))
    559			break;
    560		move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
    561			  new_pmd, new_addr, need_rmap_locks);
    562	}
    563
    564	mmu_notifier_invalidate_range_end(&range);
    565
    566	return len + old_addr - old_end;	/* how much done */
    567}
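/*
 * Note on the return value of move_page_tables() (illustrative addition):
 * old_addr has been advanced by the loop, so len + old_addr - old_end is
 * len minus whatever could not be moved. For example, with len = 0x400000
 * and a break leaving old_end - old_addr == 0x100000 undone, the function
 * returns 0x300000; move_vma() compares this against old_len to detect a
 * partial move and roll back.
 */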
    568
    569static unsigned long move_vma(struct vm_area_struct *vma,
    570		unsigned long old_addr, unsigned long old_len,
    571		unsigned long new_len, unsigned long new_addr,
    572		bool *locked, unsigned long flags,
    573		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
    574{
    575	long to_account = new_len - old_len;
    576	struct mm_struct *mm = vma->vm_mm;
    577	struct vm_area_struct *new_vma;
    578	unsigned long vm_flags = vma->vm_flags;
    579	unsigned long new_pgoff;
    580	unsigned long moved_len;
    581	unsigned long excess = 0;
    582	unsigned long hiwater_vm;
    583	int split = 0;
    584	int err = 0;
    585	bool need_rmap_locks;
    586
    587	/*
     588	 * We'd prefer to avoid failure later on in do_munmap,
     589	 * which may split one vma into three before unmapping.
    590	 */
    591	if (mm->map_count >= sysctl_max_map_count - 3)
    592		return -ENOMEM;
    593
    594	if (unlikely(flags & MREMAP_DONTUNMAP))
    595		to_account = new_len;
    596
    597	if (vma->vm_ops && vma->vm_ops->may_split) {
    598		if (vma->vm_start != old_addr)
    599			err = vma->vm_ops->may_split(vma, old_addr);
    600		if (!err && vma->vm_end != old_addr + old_len)
    601			err = vma->vm_ops->may_split(vma, old_addr + old_len);
    602		if (err)
    603			return err;
    604	}
    605
    606	/*
    607	 * Advise KSM to break any KSM pages in the area to be moved:
    608	 * it would be confusing if they were to turn up at the new
    609	 * location, where they happen to coincide with different KSM
    610	 * pages recently unmapped.  But leave vma->vm_flags as it was,
    611	 * so KSM can come around to merge on vma and new_vma afterwards.
    612	 */
    613	err = ksm_madvise(vma, old_addr, old_addr + old_len,
    614						MADV_UNMERGEABLE, &vm_flags);
    615	if (err)
    616		return err;
    617
    618	if (vm_flags & VM_ACCOUNT) {
    619		if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
    620			return -ENOMEM;
    621	}
    622
    623	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
    624	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
    625			   &need_rmap_locks);
    626	if (!new_vma) {
    627		if (vm_flags & VM_ACCOUNT)
    628			vm_unacct_memory(to_account >> PAGE_SHIFT);
    629		return -ENOMEM;
    630	}
    631
    632	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
    633				     need_rmap_locks);
    634	if (moved_len < old_len) {
    635		err = -ENOMEM;
    636	} else if (vma->vm_ops && vma->vm_ops->mremap) {
    637		err = vma->vm_ops->mremap(new_vma);
    638	}
    639
    640	if (unlikely(err)) {
    641		/*
    642		 * On error, move entries back from new area to old,
     643		 * which will succeed since the page tables are still there,
    644		 * and then proceed to unmap new area instead of old.
    645		 */
    646		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
    647				 true);
    648		vma = new_vma;
    649		old_len = new_len;
    650		old_addr = new_addr;
    651		new_addr = err;
    652	} else {
    653		mremap_userfaultfd_prep(new_vma, uf);
    654	}
    655
    656	if (is_vm_hugetlb_page(vma)) {
    657		clear_vma_resv_huge_pages(vma);
    658	}
    659
    660	/* Conceal VM_ACCOUNT so old reservation is not undone */
    661	if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
    662		vma->vm_flags &= ~VM_ACCOUNT;
    663		excess = vma->vm_end - vma->vm_start - old_len;
    664		if (old_addr > vma->vm_start &&
    665		    old_addr + old_len < vma->vm_end)
    666			split = 1;
    667	}
    668
    669	/*
    670	 * If we failed to move page tables we still do total_vm increment
    671	 * since do_munmap() will decrement it by old_len == new_len.
    672	 *
    673	 * Since total_vm is about to be raised artificially high for a
    674	 * moment, we need to restore high watermark afterwards: if stats
    675	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
    676	 * If this were a serious issue, we'd add a flag to do_munmap().
    677	 */
    678	hiwater_vm = mm->hiwater_vm;
    679	vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
    680
     681	/* Tell the PFN tracking code that the pfnmap has moved from this vma */
    682	if (unlikely(vma->vm_flags & VM_PFNMAP))
    683		untrack_pfn_moved(vma);
    684
    685	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
    686		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
    687		vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
    688
    689		/*
     690		 * The anon_vma links of the old vma are no longer needed after its
     691		 * page tables have been moved.
    692		 */
    693		if (new_vma != vma && vma->vm_start == old_addr &&
    694			vma->vm_end == (old_addr + old_len))
    695			unlink_anon_vmas(vma);
    696
    697		/* Because we won't unmap we don't need to touch locked_vm */
    698		return new_addr;
    699	}
    700
    701	if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
    702		/* OOM: unable to split vma, just get accounts right */
    703		if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
    704			vm_acct_memory(old_len >> PAGE_SHIFT);
    705		excess = 0;
    706	}
    707
    708	if (vm_flags & VM_LOCKED) {
    709		mm->locked_vm += new_len >> PAGE_SHIFT;
    710		*locked = true;
    711	}
    712
    713	mm->hiwater_vm = hiwater_vm;
    714
    715	/* Restore VM_ACCOUNT if one or two pieces of vma left */
    716	if (excess) {
    717		vma->vm_flags |= VM_ACCOUNT;
    718		if (split)
    719			vma->vm_next->vm_flags |= VM_ACCOUNT;
    720	}
    721
    722	return new_addr;
    723}
    724
    725static struct vm_area_struct *vma_to_resize(unsigned long addr,
    726	unsigned long old_len, unsigned long new_len, unsigned long flags)
    727{
    728	struct mm_struct *mm = current->mm;
    729	struct vm_area_struct *vma;
    730	unsigned long pgoff;
    731
    732	vma = vma_lookup(mm, addr);
    733	if (!vma)
    734		return ERR_PTR(-EFAULT);
    735
    736	/*
    737	 * !old_len is a special case where an attempt is made to 'duplicate'
    738	 * a mapping.  This makes no sense for private mappings as it will
    739	 * instead create a fresh/new mapping unrelated to the original.  This
    740	 * is contrary to the basic idea of mremap which creates new mappings
    741	 * based on the original.  There are no known use cases for this
    742	 * behavior.  As a result, fail such attempts.
    743	 */
    744	if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
    745		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap.  This is not supported.\n", current->comm, current->pid);
    746		return ERR_PTR(-EINVAL);
    747	}
    748
    749	if ((flags & MREMAP_DONTUNMAP) &&
    750			(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
    751		return ERR_PTR(-EINVAL);
    752
    753	/* We can't remap across vm area boundaries */
    754	if (old_len > vma->vm_end - addr)
    755		return ERR_PTR(-EFAULT);
    756
    757	if (new_len == old_len)
    758		return vma;
    759
    760	/* Need to be careful about a growing mapping */
    761	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
    762	pgoff += vma->vm_pgoff;
    763	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
    764		return ERR_PTR(-EINVAL);
    765
    766	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
    767		return ERR_PTR(-EFAULT);
    768
    769	if (mlock_future_check(mm, vma->vm_flags, new_len - old_len))
    770		return ERR_PTR(-EAGAIN);
    771
    772	if (!may_expand_vm(mm, vma->vm_flags,
    773				(new_len - old_len) >> PAGE_SHIFT))
    774		return ERR_PTR(-ENOMEM);
    775
    776	return vma;
    777}
    778
    779static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
    780		unsigned long new_addr, unsigned long new_len, bool *locked,
    781		unsigned long flags, struct vm_userfaultfd_ctx *uf,
    782		struct list_head *uf_unmap_early,
    783		struct list_head *uf_unmap)
    784{
    785	struct mm_struct *mm = current->mm;
    786	struct vm_area_struct *vma;
    787	unsigned long ret = -EINVAL;
    788	unsigned long map_flags = 0;
    789
    790	if (offset_in_page(new_addr))
    791		goto out;
    792
    793	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
    794		goto out;
    795
    796	/* Ensure the old/new locations do not overlap */
    797	if (addr + old_len > new_addr && new_addr + new_len > addr)
    798		goto out;
    799
    800	/*
     801	 * move_vma() needs us to stay 4 maps below the threshold, otherwise
     802	 * it will bail out at the very beginning.
     803	 * That is a problem if we have already unmapped the regions here
     804	 * (new_addr and old_addr), because userspace will not know the
     805	 * state of the vmas after it gets -ENOMEM.
     806	 * So, to avoid such a scenario, we pre-compute whether the whole
     807	 * operation has a high chance of succeeding map-wise.
     808	 * The worst case is when both vmas (new_addr and old_addr) get
     809	 * split in three before unmapping them.
     810	 * That means 2 more maps (1 for each) on top of the ones we already hold.
     811	 * Check whether the current map count plus 2 still leaves us 4 maps below
     812	 * the threshold, otherwise return -ENOMEM here to be on the safe side.
    813	 */
    814	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
    815		return -ENOMEM;
    816
    817	if (flags & MREMAP_FIXED) {
    818		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
    819		if (ret)
    820			goto out;
    821	}
    822
    823	if (old_len > new_len) {
    824		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
    825		if (ret)
    826			goto out;
    827		old_len = new_len;
    828	}
    829
    830	vma = vma_to_resize(addr, old_len, new_len, flags);
    831	if (IS_ERR(vma)) {
    832		ret = PTR_ERR(vma);
    833		goto out;
    834	}
    835
    836	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
    837	if (flags & MREMAP_DONTUNMAP &&
    838		!may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
    839		ret = -ENOMEM;
    840		goto out;
    841	}
    842
    843	if (flags & MREMAP_FIXED)
    844		map_flags |= MAP_FIXED;
    845
    846	if (vma->vm_flags & VM_MAYSHARE)
    847		map_flags |= MAP_SHARED;
    848
    849	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
    850				((addr - vma->vm_start) >> PAGE_SHIFT),
    851				map_flags);
    852	if (IS_ERR_VALUE(ret))
    853		goto out;
    854
    855	/* We got a new mapping */
    856	if (!(flags & MREMAP_FIXED))
    857		new_addr = ret;
    858
    859	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
    860		       uf_unmap);
    861
    862out:
    863	return ret;
    864}
    865
    866static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
    867{
    868	unsigned long end = vma->vm_end + delta;
    869	if (end < vma->vm_end) /* overflow */
    870		return 0;
    871	if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
    872		return 0;
    873	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
    874			      0, MAP_FIXED) & ~PAGE_MASK)
    875		return 0;
    876	return 1;
    877}
    878
    879/*
    880 * Expand (or shrink) an existing mapping, potentially moving it at the
    881 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
    882 *
    883 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
    884 * This option implies MREMAP_MAYMOVE.
    885 */
    886SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
    887		unsigned long, new_len, unsigned long, flags,
    888		unsigned long, new_addr)
    889{
    890	struct mm_struct *mm = current->mm;
    891	struct vm_area_struct *vma;
    892	unsigned long ret = -EINVAL;
    893	bool locked = false;
    894	bool downgraded = false;
    895	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
    896	LIST_HEAD(uf_unmap_early);
    897	LIST_HEAD(uf_unmap);
    898
    899	/*
    900	 * There is a deliberate asymmetry here: we strip the pointer tag
    901	 * from the old address but leave the new address alone. This is
    902	 * for consistency with mmap(), where we prevent the creation of
    903	 * aliasing mappings in userspace by leaving the tag bits of the
    904	 * mapping address intact. A non-zero tag will cause the subsequent
    905	 * range checks to reject the address as invalid.
    906	 *
    907	 * See Documentation/arm64/tagged-address-abi.rst for more information.
    908	 */
    909	addr = untagged_addr(addr);
    910
    911	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
    912		return ret;
    913
    914	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
    915		return ret;
    916
    917	/*
    918	 * MREMAP_DONTUNMAP is always a move and it does not allow resizing
    919	 * in the process.
    920	 */
    921	if (flags & MREMAP_DONTUNMAP &&
    922			(!(flags & MREMAP_MAYMOVE) || old_len != new_len))
    923		return ret;
    924
    925
    926	if (offset_in_page(addr))
    927		return ret;
    928
    929	old_len = PAGE_ALIGN(old_len);
    930	new_len = PAGE_ALIGN(new_len);
    931
    932	/*
    933	 * We allow a zero old-len as a special case
     934	 * for the DOS-emu "duplicate shm area" thing. But
    935	 * a zero new-len is nonsensical.
    936	 */
    937	if (!new_len)
    938		return ret;
    939
    940	if (mmap_write_lock_killable(current->mm))
    941		return -EINTR;
    942	vma = vma_lookup(mm, addr);
    943	if (!vma) {
    944		ret = -EFAULT;
    945		goto out;
    946	}
    947
    948	if (is_vm_hugetlb_page(vma)) {
    949		struct hstate *h __maybe_unused = hstate_vma(vma);
    950
    951		old_len = ALIGN(old_len, huge_page_size(h));
    952		new_len = ALIGN(new_len, huge_page_size(h));
    953
    954		/* addrs must be huge page aligned */
    955		if (addr & ~huge_page_mask(h))
    956			goto out;
    957		if (new_addr & ~huge_page_mask(h))
    958			goto out;
    959
    960		/*
    961		 * Don't allow remap expansion, because the underlying hugetlb
     962		 * reservation is not yet capable of handling split reservations.
    963		 */
    964		if (new_len > old_len)
    965			goto out;
    966	}
    967
    968	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
    969		ret = mremap_to(addr, old_len, new_addr, new_len,
    970				&locked, flags, &uf, &uf_unmap_early,
    971				&uf_unmap);
    972		goto out;
    973	}
    974
    975	/*
    976	 * Always allow a shrinking remap: that just unmaps
    977	 * the unnecessary pages..
    978	 * __do_munmap does all the needed commit accounting, and
    979	 * downgrades mmap_lock to read if so directed.
    980	 */
    981	if (old_len >= new_len) {
    982		int retval;
    983
    984		retval = __do_munmap(mm, addr+new_len, old_len - new_len,
    985				  &uf_unmap, true);
    986		if (retval < 0 && old_len != new_len) {
    987			ret = retval;
    988			goto out;
    989		/* Returning 1 indicates mmap_lock is downgraded to read. */
    990		} else if (retval == 1)
    991			downgraded = true;
    992		ret = addr;
    993		goto out;
    994	}
    995
    996	/*
    997	 * Ok, we need to grow..
    998	 */
    999	vma = vma_to_resize(addr, old_len, new_len, flags);
   1000	if (IS_ERR(vma)) {
   1001		ret = PTR_ERR(vma);
   1002		goto out;
   1003	}
   1004
   1005	/* old_len exactly to the end of the area..
   1006	 */
   1007	if (old_len == vma->vm_end - addr) {
   1008		/* can we just expand the current mapping? */
   1009		if (vma_expandable(vma, new_len - old_len)) {
   1010			long pages = (new_len - old_len) >> PAGE_SHIFT;
   1011
   1012			if (vma->vm_flags & VM_ACCOUNT) {
   1013				if (security_vm_enough_memory_mm(mm, pages)) {
   1014					ret = -ENOMEM;
   1015					goto out;
   1016				}
   1017			}
   1018
   1019			if (vma_adjust(vma, vma->vm_start, addr + new_len,
   1020				       vma->vm_pgoff, NULL)) {
   1021				vm_unacct_memory(pages);
   1022				ret = -ENOMEM;
   1023				goto out;
   1024			}
   1025
   1026			vm_stat_account(mm, vma->vm_flags, pages);
   1027			if (vma->vm_flags & VM_LOCKED) {
   1028				mm->locked_vm += pages;
   1029				locked = true;
   1030				new_addr = addr;
   1031			}
   1032			ret = addr;
   1033			goto out;
   1034		}
   1035	}
   1036
   1037	/*
   1038	 * We weren't able to just expand or shrink the area,
   1039	 * we need to create a new one and move it..
   1040	 */
   1041	ret = -ENOMEM;
   1042	if (flags & MREMAP_MAYMOVE) {
   1043		unsigned long map_flags = 0;
   1044		if (vma->vm_flags & VM_MAYSHARE)
   1045			map_flags |= MAP_SHARED;
   1046
   1047		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
   1048					vma->vm_pgoff +
   1049					((addr - vma->vm_start) >> PAGE_SHIFT),
   1050					map_flags);
   1051		if (IS_ERR_VALUE(new_addr)) {
   1052			ret = new_addr;
   1053			goto out;
   1054		}
   1055
   1056		ret = move_vma(vma, addr, old_len, new_len, new_addr,
   1057			       &locked, flags, &uf, &uf_unmap);
   1058	}
   1059out:
   1060	if (offset_in_page(ret))
   1061		locked = false;
   1062	if (downgraded)
   1063		mmap_read_unlock(current->mm);
   1064	else
   1065		mmap_write_unlock(current->mm);
   1066	if (locked && new_len > old_len)
   1067		mm_populate(new_addr + old_len, new_len - old_len);
   1068	userfaultfd_unmap_complete(mm, &uf_unmap_early);
   1069	mremap_userfaultfd_complete(&uf, addr, ret, old_len);
   1070	userfaultfd_unmap_complete(mm, &uf_unmap);
   1071	return ret;
   1072}