cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

nommu.c (45452B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 *  linux/mm/nommu.c
      4 *
      5 *  Replacement code for mm functions to support CPUs that don't
      6 *  have any form of memory management unit (thus no virtual memory).
      7 *
      8 *  See Documentation/admin-guide/mm/nommu-mmap.rst
      9 *
     10 *  Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
     11 *  Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
     12 *  Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
     13 *  Copyright (c) 2002      Greg Ungerer <gerg@snapgear.com>
     14 *  Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
     15 */
     16
     17#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
     18
     19#include <linux/export.h>
     20#include <linux/mm.h>
     21#include <linux/sched/mm.h>
     22#include <linux/vmacache.h>
     23#include <linux/mman.h>
     24#include <linux/swap.h>
     25#include <linux/file.h>
     26#include <linux/highmem.h>
     27#include <linux/pagemap.h>
     28#include <linux/slab.h>
     29#include <linux/vmalloc.h>
     30#include <linux/backing-dev.h>
     31#include <linux/compiler.h>
     32#include <linux/mount.h>
     33#include <linux/personality.h>
     34#include <linux/security.h>
     35#include <linux/syscalls.h>
     36#include <linux/audit.h>
     37#include <linux/printk.h>
     38
     39#include <linux/uaccess.h>
     40#include <asm/tlb.h>
     41#include <asm/tlbflush.h>
     42#include <asm/mmu_context.h>
     43#include "internal.h"
     44
     45void *high_memory;
     46EXPORT_SYMBOL(high_memory);
     47struct page *mem_map;
     48unsigned long max_mapnr;
     49EXPORT_SYMBOL(max_mapnr);
     50unsigned long highest_memmap_pfn;
     51int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
     52int heap_stack_gap = 0;
     53
     54atomic_long_t mmap_pages_allocated;
     55
     56EXPORT_SYMBOL(mem_map);
     57
     58/* list of mapped, potentially shareable regions */
     59static struct kmem_cache *vm_region_jar;
     60struct rb_root nommu_region_tree = RB_ROOT;
     61DECLARE_RWSEM(nommu_region_sem);
     62
     63const struct vm_operations_struct generic_file_vm_ops = {
     64};
     65
     66/*
     67 * Return the total memory allocated for this pointer, not
     68 * just what the caller asked for.
     69 *
     70 * Doesn't have to be accurate, i.e. may have races.
     71 */
     72unsigned int kobjsize(const void *objp)
     73{
     74	struct page *page;
     75
     76	/*
     77	 * If the object we have should not have ksize performed on it,
     78	 * return size of 0
     79	 */
     80	if (!objp || !virt_addr_valid(objp))
     81		return 0;
     82
     83	page = virt_to_head_page(objp);
     84
     85	/*
     86	 * If the allocator sets PageSlab, we know the pointer came from
     87	 * kmalloc().
     88	 */
     89	if (PageSlab(page))
     90		return ksize(objp);
     91
     92	/*
     93	 * If it's not a compound page, see if we have a matching VMA
     94	 * region. This test is intentionally done in reverse order,
     95	 * so if there's no VMA, we still fall through and hand back
     96	 * PAGE_SIZE for 0-order pages.
     97	 */
     98	if (!PageCompound(page)) {
     99		struct vm_area_struct *vma;
    100
    101		vma = find_vma(current->mm, (unsigned long)objp);
    102		if (vma)
    103			return vma->vm_end - vma->vm_start;
    104	}
    105
    106	/*
    107	 * The ksize() function is only guaranteed to work for pointers
    108	 * returned by kmalloc(). So handle arbitrary pointers here.
    109	 */
    110	return page_size(page);
    111}
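
/*
 * Editorial sketch, not part of the original file: kobjsize() reports the
 * size of the whole backing object, which may be larger than what was
 * requested from kmalloc(). "example_kobjsize" is a made-up helper used
 * purely for illustration.
 */
static void example_kobjsize(void)
{
	void *p = kmalloc(100, GFP_KERNEL);

	if (p) {
		pr_debug("asked for 100 bytes, backing object is %u bytes\n",
			 kobjsize(p));
		kfree(p);
	}
}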
    112
    113/**
    114 * follow_pfn - look up PFN at a user virtual address
    115 * @vma: memory mapping
    116 * @address: user virtual address
    117 * @pfn: location to store found PFN
    118 *
    119 * Only IO mappings and raw PFN mappings are allowed.
    120 *
    121 * Returns zero and the pfn at @pfn on success, -ve otherwise.
    122 */
    123int follow_pfn(struct vm_area_struct *vma, unsigned long address,
    124	unsigned long *pfn)
    125{
    126	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
    127		return -EINVAL;
    128
    129	*pfn = address >> PAGE_SHIFT;
    130	return 0;
    131}
    132EXPORT_SYMBOL(follow_pfn);
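
/*
 * Editorial sketch, not part of the original file: how a caller that owns an
 * IO/PFN mapping might use follow_pfn(). "example_report_pfn" is a made-up
 * helper for illustration only.
 */
static int example_report_pfn(struct vm_area_struct *vma, unsigned long uaddr)
{
	unsigned long pfn;
	int ret;

	ret = follow_pfn(vma, uaddr, &pfn);
	if (ret)
		return ret;	/* not a VM_IO/VM_PFNMAP mapping */

	pr_debug("address %#lx is backed by pfn %#lx\n", uaddr, pfn);
	return 0;
}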
    133
    134LIST_HEAD(vmap_area_list);
    135
    136void vfree(const void *addr)
    137{
    138	kfree(addr);
    139}
    140EXPORT_SYMBOL(vfree);
    141
    142void *__vmalloc(unsigned long size, gfp_t gfp_mask)
    143{
    144	/*
    145	 *  You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
    146	 * returns only a logical address.
    147	 */
    148	return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
    149}
    150EXPORT_SYMBOL(__vmalloc);
    151
    152void *__vmalloc_node_range(unsigned long size, unsigned long align,
    153		unsigned long start, unsigned long end, gfp_t gfp_mask,
    154		pgprot_t prot, unsigned long vm_flags, int node,
    155		const void *caller)
    156{
    157	return __vmalloc(size, gfp_mask);
    158}
    159
    160void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
    161		int node, const void *caller)
    162{
    163	return __vmalloc(size, gfp_mask);
    164}
    165
    166static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
    167{
    168	void *ret;
    169
    170	ret = __vmalloc(size, flags);
    171	if (ret) {
    172		struct vm_area_struct *vma;
    173
    174		mmap_write_lock(current->mm);
    175		vma = find_vma(current->mm, (unsigned long)ret);
    176		if (vma)
    177			vma->vm_flags |= VM_USERMAP;
    178		mmap_write_unlock(current->mm);
    179	}
    180
    181	return ret;
    182}
    183
    184void *vmalloc_user(unsigned long size)
    185{
    186	return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO);
    187}
    188EXPORT_SYMBOL(vmalloc_user);
    189
    190struct page *vmalloc_to_page(const void *addr)
    191{
    192	return virt_to_page(addr);
    193}
    194EXPORT_SYMBOL(vmalloc_to_page);
    195
    196unsigned long vmalloc_to_pfn(const void *addr)
    197{
    198	return page_to_pfn(virt_to_page(addr));
    199}
    200EXPORT_SYMBOL(vmalloc_to_pfn);
    201
    202long vread(char *buf, char *addr, unsigned long count)
    203{
    204	/* Don't allow overflow */
    205	if ((unsigned long) buf + count < count)
    206		count = -(unsigned long) buf;
    207
    208	memcpy(buf, addr, count);
    209	return count;
    210}
    211
    212/*
    213 *	vmalloc  -  allocate virtually contiguous memory
    214 *
    215 *	@size:		allocation size
    216 *
    217 *	Allocate enough pages to cover @size from the page level
    218 *	allocator and map them into contiguous kernel virtual space.
    219 *
    220 *	For tight control over page level allocator and protection flags
    221 *	use __vmalloc() instead.
    222 */
    223void *vmalloc(unsigned long size)
    224{
    225	return __vmalloc(size, GFP_KERNEL);
    226}
    227EXPORT_SYMBOL(vmalloc);
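
/*
 * Editorial sketch, not part of the original file: the usual vmalloc()/vfree()
 * pairing still applies on !MMU even though both calls reduce to the slab
 * allocator here. "example_scratch_buffer" is a made-up helper.
 */
static int example_scratch_buffer(void)
{
	char *buf = vmalloc(PAGE_SIZE);

	if (!buf)
		return -ENOMEM;
	memset(buf, 0, PAGE_SIZE);
	vfree(buf);
	return 0;
}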
    228
    229void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc);
    230
    231/*
    232 *	vzalloc - allocate virtually contiguous memory with zero fill
    233 *
    234 *	@size:		allocation size
    235 *
    236 *	Allocate enough pages to cover @size from the page level
    237 *	allocator and map them into contiguous kernel virtual space.
    238 *	The memory allocated is set to zero.
    239 *
    240 *	For tight control over page level allocator and protection flags
    241 *	use __vmalloc() instead.
    242 */
    243void *vzalloc(unsigned long size)
    244{
    245	return __vmalloc(size, GFP_KERNEL | __GFP_ZERO);
    246}
    247EXPORT_SYMBOL(vzalloc);
    248
    249/**
    250 * vmalloc_node - allocate memory on a specific node
    251 * @size:	allocation size
    252 * @node:	numa node
    253 *
    254 * Allocate enough pages to cover @size from the page level
    255 * allocator and map them into contiguous kernel virtual space.
    256 *
    257 * For tight control over page level allocator and protection flags
    258 * use __vmalloc() instead.
    259 */
    260void *vmalloc_node(unsigned long size, int node)
    261{
    262	return vmalloc(size);
    263}
    264EXPORT_SYMBOL(vmalloc_node);
    265
    266/**
    267 * vzalloc_node - allocate memory on a specific node with zero fill
    268 * @size:	allocation size
    269 * @node:	numa node
    270 *
    271 * Allocate enough pages to cover @size from the page level
    272 * allocator and map them into contiguous kernel virtual space.
    273 * The memory allocated is set to zero.
    274 *
    275 * For tight control over page level allocator and protection flags
    276 * use __vmalloc() instead.
    277 */
    278void *vzalloc_node(unsigned long size, int node)
    279{
    280	return vzalloc(size);
    281}
    282EXPORT_SYMBOL(vzalloc_node);
    283
    284/**
    285 * vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
    286 *	@size:		allocation size
    287 *
    288 *	Allocate enough 32bit PA addressable pages to cover @size from the
    289 *	page level allocator and map them into contiguous kernel virtual space.
    290 */
    291void *vmalloc_32(unsigned long size)
    292{
    293	return __vmalloc(size, GFP_KERNEL);
    294}
    295EXPORT_SYMBOL(vmalloc_32);
    296
    297/**
    298 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
    299 *	@size:		allocation size
    300 *
    301 * The resulting memory area is 32bit addressable and zeroed so it can be
    302 * mapped to userspace without leaking data.
    303 *
    304 * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
    305 * remap_vmalloc_range() are permissible.
    306 */
    307void *vmalloc_32_user(unsigned long size)
    308{
    309	/*
    310	 * We'll have to sort out the ZONE_DMA bits for 64-bit,
    311	 * but for now this can simply use vmalloc_user() directly.
    312	 */
    313	return vmalloc_user(size);
    314}
    315EXPORT_SYMBOL(vmalloc_32_user);
    316
    317void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
    318{
    319	BUG();
    320	return NULL;
    321}
    322EXPORT_SYMBOL(vmap);
    323
    324void vunmap(const void *addr)
    325{
    326	BUG();
    327}
    328EXPORT_SYMBOL(vunmap);
    329
    330void *vm_map_ram(struct page **pages, unsigned int count, int node)
    331{
    332	BUG();
    333	return NULL;
    334}
    335EXPORT_SYMBOL(vm_map_ram);
    336
    337void vm_unmap_ram(const void *mem, unsigned int count)
    338{
    339	BUG();
    340}
    341EXPORT_SYMBOL(vm_unmap_ram);
    342
    343void vm_unmap_aliases(void)
    344{
    345}
    346EXPORT_SYMBOL_GPL(vm_unmap_aliases);
    347
    348void free_vm_area(struct vm_struct *area)
    349{
    350	BUG();
    351}
    352EXPORT_SYMBOL_GPL(free_vm_area);
    353
    354int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
    355		   struct page *page)
    356{
    357	return -EINVAL;
    358}
    359EXPORT_SYMBOL(vm_insert_page);
    360
    361int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
    362			unsigned long num)
    363{
    364	return -EINVAL;
    365}
    366EXPORT_SYMBOL(vm_map_pages);
    367
    368int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
    369				unsigned long num)
    370{
    371	return -EINVAL;
    372}
    373EXPORT_SYMBOL(vm_map_pages_zero);
    374
    375/*
    376 *  sys_brk() for the most part doesn't need the global kernel
    377 *  lock, except when an application is doing something nasty
    378 *  like trying to un-brk an area that has already been mapped
    379 *  to a regular file.  In this case, the unmapping will need
    380 *  to invoke file system routines that need the global lock.
    381 */
    382SYSCALL_DEFINE1(brk, unsigned long, brk)
    383{
    384	struct mm_struct *mm = current->mm;
    385
    386	if (brk < mm->start_brk || brk > mm->context.end_brk)
    387		return mm->brk;
    388
    389	if (mm->brk == brk)
    390		return mm->brk;
    391
    392	/*
    393	 * Always allow shrinking brk
    394	 */
    395	if (brk <= mm->brk) {
    396		mm->brk = brk;
    397		return brk;
    398	}
    399
    400	/*
    401	 * Ok, looks good - let it rip.
    402	 */
    403	flush_icache_user_range(mm->brk, brk);
    404	return mm->brk = brk;
    405}
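
/*
 * Editorial sketch, not part of the original file: a userspace view of the
 * semantics above, assuming a uClibc-style sbrk() wrapper around brk(2).
 * On !MMU the break may only move within the heap region reserved at exec
 * time (mm->start_brk .. mm->context.end_brk); a request beyond that region
 * leaves the old break in place and the wrapper reports failure.
 *
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		void *old = sbrk(0);
 *
 *		if (sbrk(64 * 1024) == (void *)-1)
 *			perror("sbrk");	// would leave the reserved region
 *		printf("break: %p -> %p\n", old, sbrk(0));
 *		return 0;
 *	}
 */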
    406
    407/*
    408 * initialise the percpu counter for VM and region record slabs
    409 */
    410void __init mmap_init(void)
    411{
    412	int ret;
    413
    414	ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
    415	VM_BUG_ON(ret);
    416	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
    417}
    418
    419/*
    420 * validate the region tree
    421 * - the caller must hold the region lock
    422 */
    423#ifdef CONFIG_DEBUG_NOMMU_REGIONS
    424static noinline void validate_nommu_regions(void)
    425{
    426	struct vm_region *region, *last;
    427	struct rb_node *p, *lastp;
    428
    429	lastp = rb_first(&nommu_region_tree);
    430	if (!lastp)
    431		return;
    432
    433	last = rb_entry(lastp, struct vm_region, vm_rb);
    434	BUG_ON(last->vm_end <= last->vm_start);
    435	BUG_ON(last->vm_top < last->vm_end);
    436
    437	while ((p = rb_next(lastp))) {
    438		region = rb_entry(p, struct vm_region, vm_rb);
    439		last = rb_entry(lastp, struct vm_region, vm_rb);
    440
    441		BUG_ON(region->vm_end <= region->vm_start);
    442		BUG_ON(region->vm_top < region->vm_end);
    443		BUG_ON(region->vm_start < last->vm_top);
    444
    445		lastp = p;
    446	}
    447}
    448#else
    449static void validate_nommu_regions(void)
    450{
    451}
    452#endif
    453
    454/*
    455 * add a region into the global tree
    456 */
    457static void add_nommu_region(struct vm_region *region)
    458{
    459	struct vm_region *pregion;
    460	struct rb_node **p, *parent;
    461
    462	validate_nommu_regions();
    463
    464	parent = NULL;
    465	p = &nommu_region_tree.rb_node;
    466	while (*p) {
    467		parent = *p;
    468		pregion = rb_entry(parent, struct vm_region, vm_rb);
    469		if (region->vm_start < pregion->vm_start)
    470			p = &(*p)->rb_left;
    471		else if (region->vm_start > pregion->vm_start)
    472			p = &(*p)->rb_right;
    473		else if (pregion == region)
    474			return;
    475		else
    476			BUG();
    477	}
    478
    479	rb_link_node(&region->vm_rb, parent, p);
    480	rb_insert_color(&region->vm_rb, &nommu_region_tree);
    481
    482	validate_nommu_regions();
    483}
    484
    485/*
    486 * delete a region from the global tree
    487 */
    488static void delete_nommu_region(struct vm_region *region)
    489{
    490	BUG_ON(!nommu_region_tree.rb_node);
    491
    492	validate_nommu_regions();
    493	rb_erase(&region->vm_rb, &nommu_region_tree);
    494	validate_nommu_regions();
    495}
    496
    497/*
    498 * free a contiguous series of pages
    499 */
    500static void free_page_series(unsigned long from, unsigned long to)
    501{
    502	for (; from < to; from += PAGE_SIZE) {
    503		struct page *page = virt_to_page(from);
    504
    505		atomic_long_dec(&mmap_pages_allocated);
    506		put_page(page);
    507	}
    508}
    509
    510/*
    511 * release a reference to a region
    512 * - the caller must hold the region semaphore for writing, which this releases
    513 * - the region may not have been added to the tree yet, in which case vm_top
    514 *   will equal vm_start
    515 */
    516static void __put_nommu_region(struct vm_region *region)
    517	__releases(nommu_region_sem)
    518{
    519	BUG_ON(!nommu_region_tree.rb_node);
    520
    521	if (--region->vm_usage == 0) {
    522		if (region->vm_top > region->vm_start)
    523			delete_nommu_region(region);
    524		up_write(&nommu_region_sem);
    525
    526		if (region->vm_file)
    527			fput(region->vm_file);
    528
    529		/* IO memory and memory shared directly out of the pagecache
    530		 * from ramfs/tmpfs mustn't be released here */
    531		if (region->vm_flags & VM_MAPPED_COPY)
    532			free_page_series(region->vm_start, region->vm_top);
    533		kmem_cache_free(vm_region_jar, region);
    534	} else {
    535		up_write(&nommu_region_sem);
    536	}
    537}
    538
    539/*
    540 * release a reference to a region
    541 */
    542static void put_nommu_region(struct vm_region *region)
    543{
    544	down_write(&nommu_region_sem);
    545	__put_nommu_region(region);
    546}
    547
    548/*
    549 * add a VMA into a process's mm_struct in the appropriate place in the list
    550 * and tree, and also add it to the address space's page tree if it is not
    551 * an anonymous page
    552 * - should be called with mm->mmap_lock held writelocked
    553 */
    554static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
    555{
    556	struct vm_area_struct *pvma, *prev;
    557	struct address_space *mapping;
    558	struct rb_node **p, *parent, *rb_prev;
    559
    560	BUG_ON(!vma->vm_region);
    561
    562	mm->map_count++;
    563	vma->vm_mm = mm;
    564
    565	/* add the VMA to the mapping */
    566	if (vma->vm_file) {
    567		mapping = vma->vm_file->f_mapping;
    568
    569		i_mmap_lock_write(mapping);
    570		flush_dcache_mmap_lock(mapping);
    571		vma_interval_tree_insert(vma, &mapping->i_mmap);
    572		flush_dcache_mmap_unlock(mapping);
    573		i_mmap_unlock_write(mapping);
    574	}
    575
    576	/* add the VMA to the tree */
    577	parent = rb_prev = NULL;
    578	p = &mm->mm_rb.rb_node;
    579	while (*p) {
    580		parent = *p;
    581		pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
    582
    583		/* sort by: start addr, end addr, VMA struct addr in that order
    584		 * (the latter is necessary as we may get identical VMAs) */
    585		if (vma->vm_start < pvma->vm_start)
    586			p = &(*p)->rb_left;
    587		else if (vma->vm_start > pvma->vm_start) {
    588			rb_prev = parent;
    589			p = &(*p)->rb_right;
    590		} else if (vma->vm_end < pvma->vm_end)
    591			p = &(*p)->rb_left;
    592		else if (vma->vm_end > pvma->vm_end) {
    593			rb_prev = parent;
    594			p = &(*p)->rb_right;
    595		} else if (vma < pvma)
    596			p = &(*p)->rb_left;
    597		else if (vma > pvma) {
    598			rb_prev = parent;
    599			p = &(*p)->rb_right;
    600		} else
    601			BUG();
    602	}
    603
    604	rb_link_node(&vma->vm_rb, parent, p);
    605	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
    606
    607	/* add VMA to the VMA list also */
    608	prev = NULL;
    609	if (rb_prev)
    610		prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
    611
    612	__vma_link_list(mm, vma, prev);
    613}
    614
    615/*
    616 * delete a VMA from its owning mm_struct and address space
    617 */
    618static void delete_vma_from_mm(struct vm_area_struct *vma)
    619{
    620	int i;
    621	struct address_space *mapping;
    622	struct mm_struct *mm = vma->vm_mm;
    623	struct task_struct *curr = current;
    624
    625	mm->map_count--;
    626	for (i = 0; i < VMACACHE_SIZE; i++) {
    627		/* if the vma is cached, invalidate the entire cache */
    628		if (curr->vmacache.vmas[i] == vma) {
    629			vmacache_invalidate(mm);
    630			break;
    631		}
    632	}
    633
    634	/* remove the VMA from the mapping */
    635	if (vma->vm_file) {
    636		mapping = vma->vm_file->f_mapping;
    637
    638		i_mmap_lock_write(mapping);
    639		flush_dcache_mmap_lock(mapping);
    640		vma_interval_tree_remove(vma, &mapping->i_mmap);
    641		flush_dcache_mmap_unlock(mapping);
    642		i_mmap_unlock_write(mapping);
    643	}
    644
    645	/* remove from the MM's tree and list */
    646	rb_erase(&vma->vm_rb, &mm->mm_rb);
    647
    648	__vma_unlink_list(mm, vma);
    649}
    650
    651/*
    652 * destroy a VMA record
    653 */
    654static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
    655{
    656	if (vma->vm_ops && vma->vm_ops->close)
    657		vma->vm_ops->close(vma);
    658	if (vma->vm_file)
    659		fput(vma->vm_file);
    660	put_nommu_region(vma->vm_region);
    661	vm_area_free(vma);
    662}
    663
    664/*
    665 * look up the first VMA in which addr resides, NULL if none
    666 * - should be called with mm->mmap_lock at least held readlocked
    667 */
    668struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
    669{
    670	struct vm_area_struct *vma;
    671
    672	/* check the cache first */
    673	vma = vmacache_find(mm, addr);
    674	if (likely(vma))
    675		return vma;
    676
    677	/* trawl the list (there may be multiple mappings in which addr
    678	 * resides) */
    679	for (vma = mm->mmap; vma; vma = vma->vm_next) {
    680		if (vma->vm_start > addr)
    681			return NULL;
    682		if (vma->vm_end > addr) {
    683			vmacache_update(addr, vma);
    684			return vma;
    685		}
    686	}
    687
    688	return NULL;
    689}
    690EXPORT_SYMBOL(find_vma);
    691
    692/*
    693 * find a VMA
    694 * - we don't extend stack VMAs under NOMMU conditions
    695 */
    696struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
    697{
    698	return find_vma(mm, addr);
    699}
    700
    701/*
    702 * expand a stack to a given address
    703 * - not supported under NOMMU conditions
    704 */
    705int expand_stack(struct vm_area_struct *vma, unsigned long address)
    706{
    707	return -ENOMEM;
    708}
    709
    710/*
    711 * look up the first VMA that exactly matches addr
    712 * - should be called with mm->mmap_lock at least held readlocked
    713 */
    714static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
    715					     unsigned long addr,
    716					     unsigned long len)
    717{
    718	struct vm_area_struct *vma;
    719	unsigned long end = addr + len;
    720
    721	/* check the cache first */
    722	vma = vmacache_find_exact(mm, addr, end);
    723	if (vma)
    724		return vma;
    725
    726	/* trawl the list (there may be multiple mappings in which addr
    727	 * resides) */
    728	for (vma = mm->mmap; vma; vma = vma->vm_next) {
    729		if (vma->vm_start < addr)
    730			continue;
    731		if (vma->vm_start > addr)
    732			return NULL;
    733		if (vma->vm_end == end) {
    734			vmacache_update(addr, vma);
    735			return vma;
    736		}
    737	}
    738
    739	return NULL;
    740}
    741
    742/*
    743 * determine whether a mapping should be permitted and, if so, what sort of
    744 * mapping we're capable of supporting
    745 */
    746static int validate_mmap_request(struct file *file,
    747				 unsigned long addr,
    748				 unsigned long len,
    749				 unsigned long prot,
    750				 unsigned long flags,
    751				 unsigned long pgoff,
    752				 unsigned long *_capabilities)
    753{
    754	unsigned long capabilities, rlen;
    755	int ret;
    756
    757	/* do the simple checks first */
    758	if (flags & MAP_FIXED)
    759		return -EINVAL;
    760
    761	if ((flags & MAP_TYPE) != MAP_PRIVATE &&
    762	    (flags & MAP_TYPE) != MAP_SHARED)
    763		return -EINVAL;
    764
    765	if (!len)
    766		return -EINVAL;
    767
    768	/* Careful about overflows.. */
    769	rlen = PAGE_ALIGN(len);
    770	if (!rlen || rlen > TASK_SIZE)
    771		return -ENOMEM;
    772
    773	/* offset overflow? */
    774	if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff)
    775		return -EOVERFLOW;
    776
    777	if (file) {
    778		/* files must support mmap */
    779		if (!file->f_op->mmap)
    780			return -ENODEV;
    781
    782		/* work out if what we've got could possibly be shared
    783		 * - we support chardevs that provide their own "memory"
    784		 * - we support files/blockdevs that are memory backed
    785		 */
    786		if (file->f_op->mmap_capabilities) {
    787			capabilities = file->f_op->mmap_capabilities(file);
    788		} else {
    789			/* no explicit capabilities set, so assume some
    790			 * defaults */
    791			switch (file_inode(file)->i_mode & S_IFMT) {
    792			case S_IFREG:
    793			case S_IFBLK:
    794				capabilities = NOMMU_MAP_COPY;
    795				break;
    796
    797			case S_IFCHR:
    798				capabilities =
    799					NOMMU_MAP_DIRECT |
    800					NOMMU_MAP_READ |
    801					NOMMU_MAP_WRITE;
    802				break;
    803
    804			default:
    805				return -EINVAL;
    806			}
    807		}
    808
    809		/* eliminate any capabilities that we can't support on this
    810		 * device */
    811		if (!file->f_op->get_unmapped_area)
    812			capabilities &= ~NOMMU_MAP_DIRECT;
    813		if (!(file->f_mode & FMODE_CAN_READ))
    814			capabilities &= ~NOMMU_MAP_COPY;
    815
    816		/* The file shall have been opened with read permission. */
    817		if (!(file->f_mode & FMODE_READ))
    818			return -EACCES;
    819
    820		if (flags & MAP_SHARED) {
    821			/* do checks for writing, appending and locking */
    822			if ((prot & PROT_WRITE) &&
    823			    !(file->f_mode & FMODE_WRITE))
    824				return -EACCES;
    825
    826			if (IS_APPEND(file_inode(file)) &&
    827			    (file->f_mode & FMODE_WRITE))
    828				return -EACCES;
    829
    830			if (!(capabilities & NOMMU_MAP_DIRECT))
    831				return -ENODEV;
    832
    833			/* we mustn't privatise shared mappings */
    834			capabilities &= ~NOMMU_MAP_COPY;
    835		} else {
    836			/* we're going to read the file into private memory we
    837			 * allocate */
    838			if (!(capabilities & NOMMU_MAP_COPY))
    839				return -ENODEV;
    840
    841			/* we don't permit a private writable mapping to be
    842			 * shared with the backing device */
    843			if (prot & PROT_WRITE)
    844				capabilities &= ~NOMMU_MAP_DIRECT;
    845		}
    846
    847		if (capabilities & NOMMU_MAP_DIRECT) {
    848			if (((prot & PROT_READ)  && !(capabilities & NOMMU_MAP_READ))  ||
    849			    ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) ||
    850			    ((prot & PROT_EXEC)  && !(capabilities & NOMMU_MAP_EXEC))
    851			    ) {
    852				capabilities &= ~NOMMU_MAP_DIRECT;
    853				if (flags & MAP_SHARED) {
    854					pr_warn("MAP_SHARED not completely supported on !MMU\n");
    855					return -EINVAL;
    856				}
    857			}
    858		}
    859
    860		/* handle executable mappings and implied executable
    861		 * mappings */
    862		if (path_noexec(&file->f_path)) {
    863			if (prot & PROT_EXEC)
    864				return -EPERM;
    865		} else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
    866			/* handle implication of PROT_EXEC by PROT_READ */
    867			if (current->personality & READ_IMPLIES_EXEC) {
    868				if (capabilities & NOMMU_MAP_EXEC)
    869					prot |= PROT_EXEC;
    870			}
    871		} else if ((prot & PROT_READ) &&
    872			 (prot & PROT_EXEC) &&
    873			 !(capabilities & NOMMU_MAP_EXEC)
    874			 ) {
    875			/* backing file is not executable, try to copy */
    876			capabilities &= ~NOMMU_MAP_DIRECT;
    877		}
    878	} else {
    879		/* anonymous mappings are always memory backed and can be
    880		 * privately mapped
    881		 */
    882		capabilities = NOMMU_MAP_COPY;
    883
    884		/* handle PROT_EXEC implication by PROT_READ */
    885		if ((prot & PROT_READ) &&
    886		    (current->personality & READ_IMPLIES_EXEC))
    887			prot |= PROT_EXEC;
    888	}
    889
    890	/* allow the security API to have its say */
    891	ret = security_mmap_addr(addr);
    892	if (ret < 0)
    893		return ret;
    894
    895	/* looks okay */
    896	*_capabilities = capabilities;
    897	return 0;
    898}
    899
    900/*
    901 * we've determined that we can make the mapping, now translate what we
    902 * now know into VMA flags
    903 */
    904static unsigned long determine_vm_flags(struct file *file,
    905					unsigned long prot,
    906					unsigned long flags,
    907					unsigned long capabilities)
    908{
    909	unsigned long vm_flags;
    910
    911	vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
    912	/* vm_flags |= mm->def_flags; */
    913
    914	if (!(capabilities & NOMMU_MAP_DIRECT)) {
    915		/* attempt to share read-only copies of mapped file chunks */
    916		vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
    917		if (file && !(prot & PROT_WRITE))
    918			vm_flags |= VM_MAYSHARE;
    919	} else {
    920		/* overlay a shareable mapping on the backing device or inode
    921		 * if possible - used for chardevs, ramfs/tmpfs/shmfs and
    922		 * romfs/cramfs */
    923		vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS);
    924		if (flags & MAP_SHARED)
    925			vm_flags |= VM_SHARED;
    926	}
    927
    928	/* refuse to let anyone share private mappings with this process if
    929	 * it's being traced - otherwise breakpoints set in it may interfere
    930	 * with another untraced process
    931	 */
    932	if ((flags & MAP_PRIVATE) && current->ptrace)
    933		vm_flags &= ~VM_MAYSHARE;
    934
    935	return vm_flags;
    936}
    937
    938/*
    939 * set up a shared mapping on a file (the driver or filesystem provides and
    940 * pins the storage)
    941 */
    942static int do_mmap_shared_file(struct vm_area_struct *vma)
    943{
    944	int ret;
    945
    946	ret = call_mmap(vma->vm_file, vma);
    947	if (ret == 0) {
    948		vma->vm_region->vm_top = vma->vm_region->vm_end;
    949		return 0;
    950	}
    951	if (ret != -ENOSYS)
    952		return ret;
    953
    954	/* getting -ENOSYS indicates that direct mmap isn't possible (as
    955	 * opposed to tried but failed) so we can only give a suitable error as
    956	 * it's not possible to make a private copy if MAP_SHARED was given */
    957	return -ENODEV;
    958}
    959
    960/*
    961 * set up a private mapping or an anonymous shared mapping
    962 */
    963static int do_mmap_private(struct vm_area_struct *vma,
    964			   struct vm_region *region,
    965			   unsigned long len,
    966			   unsigned long capabilities)
    967{
    968	unsigned long total, point;
    969	void *base;
    970	int ret, order;
    971
    972	/* invoke the file's mapping function so that it can keep track of
    973	 * shared mappings on devices or memory
    974	 * - VM_MAYSHARE will be set if it may attempt to share
    975	 */
    976	if (capabilities & NOMMU_MAP_DIRECT) {
    977		ret = call_mmap(vma->vm_file, vma);
    978		if (ret == 0) {
    979			/* shouldn't return success if we're not sharing */
    980			BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
    981			vma->vm_region->vm_top = vma->vm_region->vm_end;
    982			return 0;
    983		}
    984		if (ret != -ENOSYS)
    985			return ret;
    986
    987		/* getting an ENOSYS error indicates that direct mmap isn't
    988		 * possible (as opposed to tried but failed) so we'll try to
    989		 * make a private copy of the data and map that instead */
    990	}
    991
    992
    993	/* allocate some memory to hold the mapping
    994	 * - note that this may not return a page-aligned address if the object
    995	 *   we're allocating is smaller than a page
    996	 */
    997	order = get_order(len);
    998	total = 1 << order;
    999	point = len >> PAGE_SHIFT;
   1000
   1001	/* we don't want to allocate a power-of-2 sized page set */
   1002	if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages)
   1003		total = point;
   1004
   1005	base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL);
   1006	if (!base)
   1007		goto enomem;
   1008
   1009	atomic_long_add(total, &mmap_pages_allocated);
   1010
   1011	region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
   1012	region->vm_start = (unsigned long) base;
   1013	region->vm_end   = region->vm_start + len;
   1014	region->vm_top   = region->vm_start + (total << PAGE_SHIFT);
   1015
   1016	vma->vm_start = region->vm_start;
   1017	vma->vm_end   = region->vm_start + len;
   1018
   1019	if (vma->vm_file) {
   1020		/* read the contents of a file into the copy */
   1021		loff_t fpos;
   1022
   1023		fpos = vma->vm_pgoff;
   1024		fpos <<= PAGE_SHIFT;
   1025
   1026		ret = kernel_read(vma->vm_file, base, len, &fpos);
   1027		if (ret < 0)
   1028			goto error_free;
   1029
   1030		/* clear the last little bit */
   1031		if (ret < len)
   1032			memset(base + ret, 0, len - ret);
   1033
   1034	} else {
   1035		vma_set_anonymous(vma);
   1036	}
   1037
   1038	return 0;
   1039
   1040error_free:
   1041	free_page_series(region->vm_start, region->vm_top);
   1042	region->vm_start = vma->vm_start = 0;
   1043	region->vm_end   = vma->vm_end = 0;
   1044	region->vm_top   = 0;
   1045	return ret;
   1046
   1047enomem:
   1048	pr_err("Allocation of length %lu from process %d (%s) failed\n",
   1049	       len, current->pid, current->comm);
   1050	show_free_areas(0, NULL);
   1051	return -ENOMEM;
   1052}
   1053
   1054/*
   1055 * handle mapping creation for uClinux
   1056 */
   1057unsigned long do_mmap(struct file *file,
   1058			unsigned long addr,
   1059			unsigned long len,
   1060			unsigned long prot,
   1061			unsigned long flags,
   1062			unsigned long pgoff,
   1063			unsigned long *populate,
   1064			struct list_head *uf)
   1065{
   1066	struct vm_area_struct *vma;
   1067	struct vm_region *region;
   1068	struct rb_node *rb;
   1069	vm_flags_t vm_flags;
   1070	unsigned long capabilities, result;
   1071	int ret;
   1072
   1073	*populate = 0;
   1074
   1075	/* decide whether we should attempt the mapping, and if so what sort of
   1076	 * mapping */
   1077	ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
   1078				    &capabilities);
   1079	if (ret < 0)
   1080		return ret;
   1081
   1082	/* we ignore the address hint */
   1083	addr = 0;
   1084	len = PAGE_ALIGN(len);
   1085
   1086	/* we've determined that we can make the mapping, now translate what we
   1087	 * now know into VMA flags */
   1088	vm_flags = determine_vm_flags(file, prot, flags, capabilities);
   1089
   1090	/* we're going to need to record the mapping */
   1091	region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
   1092	if (!region)
   1093		goto error_getting_region;
   1094
   1095	vma = vm_area_alloc(current->mm);
   1096	if (!vma)
   1097		goto error_getting_vma;
   1098
   1099	region->vm_usage = 1;
   1100	region->vm_flags = vm_flags;
   1101	region->vm_pgoff = pgoff;
   1102
   1103	vma->vm_flags = vm_flags;
   1104	vma->vm_pgoff = pgoff;
   1105
   1106	if (file) {
   1107		region->vm_file = get_file(file);
   1108		vma->vm_file = get_file(file);
   1109	}
   1110
   1111	down_write(&nommu_region_sem);
   1112
   1113	/* if we want to share, we need to check for regions created by other
   1114	 * mmap() calls that overlap with our proposed mapping
   1115	 * - we can only share with a superset match on most regular files
   1116	 * - shared mappings on character devices and memory backed files are
   1117	 *   permitted to overlap inexactly as far as we are concerned for in
   1118	 *   these cases, sharing is handled in the driver or filesystem rather
   1119	 *   than here
   1120	 */
   1121	if (vm_flags & VM_MAYSHARE) {
   1122		struct vm_region *pregion;
   1123		unsigned long pglen, rpglen, pgend, rpgend, start;
   1124
   1125		pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
   1126		pgend = pgoff + pglen;
   1127
   1128		for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
   1129			pregion = rb_entry(rb, struct vm_region, vm_rb);
   1130
   1131			if (!(pregion->vm_flags & VM_MAYSHARE))
   1132				continue;
   1133
   1134			/* search for overlapping mappings on the same file */
   1135			if (file_inode(pregion->vm_file) !=
   1136			    file_inode(file))
   1137				continue;
   1138
   1139			if (pregion->vm_pgoff >= pgend)
   1140				continue;
   1141
   1142			rpglen = pregion->vm_end - pregion->vm_start;
   1143			rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
   1144			rpgend = pregion->vm_pgoff + rpglen;
   1145			if (pgoff >= rpgend)
   1146				continue;
   1147
   1148			/* handle inexactly overlapping matches between
   1149			 * mappings */
   1150			if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
   1151			    !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
   1152				/* new mapping is not a subset of the region */
   1153				if (!(capabilities & NOMMU_MAP_DIRECT))
   1154					goto sharing_violation;
   1155				continue;
   1156			}
   1157
   1158			/* we've found a region we can share */
   1159			pregion->vm_usage++;
   1160			vma->vm_region = pregion;
   1161			start = pregion->vm_start;
   1162			start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
   1163			vma->vm_start = start;
   1164			vma->vm_end = start + len;
   1165
   1166			if (pregion->vm_flags & VM_MAPPED_COPY)
   1167				vma->vm_flags |= VM_MAPPED_COPY;
   1168			else {
   1169				ret = do_mmap_shared_file(vma);
   1170				if (ret < 0) {
   1171					vma->vm_region = NULL;
   1172					vma->vm_start = 0;
   1173					vma->vm_end = 0;
   1174					pregion->vm_usage--;
   1175					pregion = NULL;
   1176					goto error_just_free;
   1177				}
   1178			}
   1179			fput(region->vm_file);
   1180			kmem_cache_free(vm_region_jar, region);
   1181			region = pregion;
   1182			result = start;
   1183			goto share;
   1184		}
   1185
   1186		/* obtain the address at which to make a shared mapping
   1187		 * - this is the hook for quasi-memory character devices to
   1188		 *   tell us the location of a shared mapping
   1189		 */
   1190		if (capabilities & NOMMU_MAP_DIRECT) {
   1191			addr = file->f_op->get_unmapped_area(file, addr, len,
   1192							     pgoff, flags);
   1193			if (IS_ERR_VALUE(addr)) {
   1194				ret = addr;
   1195				if (ret != -ENOSYS)
   1196					goto error_just_free;
   1197
   1198				/* the driver refused to tell us where to site
   1199				 * the mapping so we'll have to attempt to copy
   1200				 * it */
   1201				ret = -ENODEV;
   1202				if (!(capabilities & NOMMU_MAP_COPY))
   1203					goto error_just_free;
   1204
   1205				capabilities &= ~NOMMU_MAP_DIRECT;
   1206			} else {
   1207				vma->vm_start = region->vm_start = addr;
   1208				vma->vm_end = region->vm_end = addr + len;
   1209			}
   1210		}
   1211	}
   1212
   1213	vma->vm_region = region;
   1214
   1215	/* set up the mapping
   1216	 * - the region is filled in if NOMMU_MAP_DIRECT is still set
   1217	 */
   1218	if (file && vma->vm_flags & VM_SHARED)
   1219		ret = do_mmap_shared_file(vma);
   1220	else
   1221		ret = do_mmap_private(vma, region, len, capabilities);
   1222	if (ret < 0)
   1223		goto error_just_free;
   1224	add_nommu_region(region);
   1225
   1226	/* clear anonymous mappings that don't ask for uninitialized data */
   1227	if (!vma->vm_file &&
   1228	    (!IS_ENABLED(CONFIG_MMAP_ALLOW_UNINITIALIZED) ||
   1229	     !(flags & MAP_UNINITIALIZED)))
   1230		memset((void *)region->vm_start, 0,
   1231		       region->vm_end - region->vm_start);
   1232
   1233	/* okay... we have a mapping; now we have to register it */
   1234	result = vma->vm_start;
   1235
   1236	current->mm->total_vm += len >> PAGE_SHIFT;
   1237
   1238share:
   1239	add_vma_to_mm(current->mm, vma);
   1240
   1241	/* we flush the region from the icache only when the first executable
   1242	 * mapping of it is made  */
   1243	if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
   1244		flush_icache_user_range(region->vm_start, region->vm_end);
   1245		region->vm_icache_flushed = true;
   1246	}
   1247
   1248	up_write(&nommu_region_sem);
   1249
   1250	return result;
   1251
   1252error_just_free:
   1253	up_write(&nommu_region_sem);
   1254error:
   1255	if (region->vm_file)
   1256		fput(region->vm_file);
   1257	kmem_cache_free(vm_region_jar, region);
   1258	if (vma->vm_file)
   1259		fput(vma->vm_file);
   1260	vm_area_free(vma);
   1261	return ret;
   1262
   1263sharing_violation:
   1264	up_write(&nommu_region_sem);
   1265	pr_warn("Attempt to share mismatched mappings\n");
   1266	ret = -EINVAL;
   1267	goto error;
   1268
   1269error_getting_vma:
   1270	kmem_cache_free(vm_region_jar, region);
   1271	pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
   1272			len, current->pid);
   1273	show_free_areas(0, NULL);
   1274	return -ENOMEM;
   1275
   1276error_getting_region:
   1277	pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
   1278			len, current->pid);
   1279	show_free_areas(0, NULL);
   1280	return -ENOMEM;
   1281}
   1282
   1283unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
   1284			      unsigned long prot, unsigned long flags,
   1285			      unsigned long fd, unsigned long pgoff)
   1286{
   1287	struct file *file = NULL;
   1288	unsigned long retval = -EBADF;
   1289
   1290	audit_mmap_fd(fd, flags);
   1291	if (!(flags & MAP_ANONYMOUS)) {
   1292		file = fget(fd);
   1293		if (!file)
   1294			goto out;
   1295	}
   1296
   1297	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
   1298
   1299	if (file)
   1300		fput(file);
   1301out:
   1302	return retval;
   1303}
   1304
   1305SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
   1306		unsigned long, prot, unsigned long, flags,
   1307		unsigned long, fd, unsigned long, pgoff)
   1308{
   1309	return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
   1310}
   1311
   1312#ifdef __ARCH_WANT_SYS_OLD_MMAP
   1313struct mmap_arg_struct {
   1314	unsigned long addr;
   1315	unsigned long len;
   1316	unsigned long prot;
   1317	unsigned long flags;
   1318	unsigned long fd;
   1319	unsigned long offset;
   1320};
   1321
   1322SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
   1323{
   1324	struct mmap_arg_struct a;
   1325
   1326	if (copy_from_user(&a, arg, sizeof(a)))
   1327		return -EFAULT;
   1328	if (offset_in_page(a.offset))
   1329		return -EINVAL;
   1330
   1331	return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
   1332			       a.offset >> PAGE_SHIFT);
   1333}
   1334#endif /* __ARCH_WANT_SYS_OLD_MMAP */
   1335
   1336/*
   1337 * split a vma into two pieces at address 'addr', a new vma is allocated either
   1338 * for the first part or the tail.
   1339 */
   1340int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
   1341	      unsigned long addr, int new_below)
   1342{
   1343	struct vm_area_struct *new;
   1344	struct vm_region *region;
   1345	unsigned long npages;
   1346
   1347	/* we're only permitted to split anonymous regions (these should have
   1348	 * only a single usage on the region) */
   1349	if (vma->vm_file)
   1350		return -ENOMEM;
   1351
   1352	if (mm->map_count >= sysctl_max_map_count)
   1353		return -ENOMEM;
   1354
   1355	region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
   1356	if (!region)
   1357		return -ENOMEM;
   1358
   1359	new = vm_area_dup(vma);
   1360	if (!new) {
   1361		kmem_cache_free(vm_region_jar, region);
   1362		return -ENOMEM;
   1363	}
   1364
   1365	/* most fields are the same, copy all, and then fixup */
   1366	*region = *vma->vm_region;
   1367	new->vm_region = region;
   1368
   1369	npages = (addr - vma->vm_start) >> PAGE_SHIFT;
   1370
   1371	if (new_below) {
   1372		region->vm_top = region->vm_end = new->vm_end = addr;
   1373	} else {
   1374		region->vm_start = new->vm_start = addr;
   1375		region->vm_pgoff = new->vm_pgoff += npages;
   1376	}
   1377
   1378	if (new->vm_ops && new->vm_ops->open)
   1379		new->vm_ops->open(new);
   1380
   1381	delete_vma_from_mm(vma);
   1382	down_write(&nommu_region_sem);
   1383	delete_nommu_region(vma->vm_region);
   1384	if (new_below) {
   1385		vma->vm_region->vm_start = vma->vm_start = addr;
   1386		vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
   1387	} else {
   1388		vma->vm_region->vm_end = vma->vm_end = addr;
   1389		vma->vm_region->vm_top = addr;
   1390	}
   1391	add_nommu_region(vma->vm_region);
   1392	add_nommu_region(new->vm_region);
   1393	up_write(&nommu_region_sem);
   1394	add_vma_to_mm(mm, vma);
   1395	add_vma_to_mm(mm, new);
   1396	return 0;
   1397}
   1398
   1399/*
   1400 * shrink a VMA by removing the specified chunk from either the beginning or
   1401 * the end
   1402 */
   1403static int shrink_vma(struct mm_struct *mm,
   1404		      struct vm_area_struct *vma,
   1405		      unsigned long from, unsigned long to)
   1406{
   1407	struct vm_region *region;
   1408
   1409	/* adjust the VMA's pointers, which may reposition it in the MM's tree
   1410	 * and list */
   1411	delete_vma_from_mm(vma);
   1412	if (from > vma->vm_start)
   1413		vma->vm_end = from;
   1414	else
   1415		vma->vm_start = to;
   1416	add_vma_to_mm(mm, vma);
   1417
   1418	/* cut the backing region down to size */
   1419	region = vma->vm_region;
   1420	BUG_ON(region->vm_usage != 1);
   1421
   1422	down_write(&nommu_region_sem);
   1423	delete_nommu_region(region);
   1424	if (from > region->vm_start) {
   1425		to = region->vm_top;
   1426		region->vm_top = region->vm_end = from;
   1427	} else {
   1428		region->vm_start = to;
   1429	}
   1430	add_nommu_region(region);
   1431	up_write(&nommu_region_sem);
   1432
   1433	free_page_series(from, to);
   1434	return 0;
   1435}
   1436
   1437/*
   1438 * release a mapping
   1439 * - under NOMMU conditions the chunk to be unmapped must be backed by a single
   1440 *   VMA, though it need not cover the whole VMA
   1441 */
   1442int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
   1443{
   1444	struct vm_area_struct *vma;
   1445	unsigned long end;
   1446	int ret;
   1447
   1448	len = PAGE_ALIGN(len);
   1449	if (len == 0)
   1450		return -EINVAL;
   1451
   1452	end = start + len;
   1453
   1454	/* find the first potentially overlapping VMA */
   1455	vma = find_vma(mm, start);
   1456	if (!vma) {
   1457		static int limit;
   1458		if (limit < 5) {
   1459			pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n",
   1460					current->pid, current->comm,
   1461					start, start + len - 1);
   1462			limit++;
   1463		}
   1464		return -EINVAL;
   1465	}
   1466
   1467	/* we're allowed to split an anonymous VMA but not a file-backed one */
   1468	if (vma->vm_file) {
   1469		do {
   1470			if (start > vma->vm_start)
   1471				return -EINVAL;
   1472			if (end == vma->vm_end)
   1473				goto erase_whole_vma;
   1474			vma = vma->vm_next;
   1475		} while (vma);
   1476		return -EINVAL;
   1477	} else {
   1478		/* the chunk must be a subset of the VMA found */
   1479		if (start == vma->vm_start && end == vma->vm_end)
   1480			goto erase_whole_vma;
   1481		if (start < vma->vm_start || end > vma->vm_end)
   1482			return -EINVAL;
   1483		if (offset_in_page(start))
   1484			return -EINVAL;
   1485		if (end != vma->vm_end && offset_in_page(end))
   1486			return -EINVAL;
   1487		if (start != vma->vm_start && end != vma->vm_end) {
   1488			ret = split_vma(mm, vma, start, 1);
   1489			if (ret < 0)
   1490				return ret;
   1491		}
   1492		return shrink_vma(mm, vma, start, end);
   1493	}
   1494
   1495erase_whole_vma:
   1496	delete_vma_from_mm(vma);
   1497	delete_vma(mm, vma);
   1498	return 0;
   1499}
   1500
   1501int vm_munmap(unsigned long addr, size_t len)
   1502{
   1503	struct mm_struct *mm = current->mm;
   1504	int ret;
   1505
   1506	mmap_write_lock(mm);
   1507	ret = do_munmap(mm, addr, len, NULL);
   1508	mmap_write_unlock(mm);
   1509	return ret;
   1510}
   1511EXPORT_SYMBOL(vm_munmap);
   1512
   1513SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
   1514{
   1515	return vm_munmap(addr, len);
   1516}
   1517
   1518/*
   1519 * release all the mappings made in a process's VM space
   1520 */
   1521void exit_mmap(struct mm_struct *mm)
   1522{
   1523	struct vm_area_struct *vma;
   1524
   1525	if (!mm)
   1526		return;
   1527
   1528	mm->total_vm = 0;
   1529
   1530	while ((vma = mm->mmap)) {
   1531		mm->mmap = vma->vm_next;
   1532		delete_vma_from_mm(vma);
   1533		delete_vma(mm, vma);
   1534		cond_resched();
   1535	}
   1536}
   1537
   1538int vm_brk(unsigned long addr, unsigned long len)
   1539{
   1540	return -ENOMEM;
   1541}
   1542
   1543/*
   1544 * expand (or shrink) an existing mapping, potentially moving it at the same
   1545 * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
   1546 *
   1547 * under NOMMU conditions, we only permit changing a mapping's size, and only
   1548 * as long as it stays within the region allocated by do_mmap_private() and the
   1549 * block is not shareable
   1550 *
   1551 * MREMAP_FIXED is not supported under NOMMU conditions
   1552 */
   1553static unsigned long do_mremap(unsigned long addr,
   1554			unsigned long old_len, unsigned long new_len,
   1555			unsigned long flags, unsigned long new_addr)
   1556{
   1557	struct vm_area_struct *vma;
   1558
   1559	/* insanity checks first */
   1560	old_len = PAGE_ALIGN(old_len);
   1561	new_len = PAGE_ALIGN(new_len);
   1562	if (old_len == 0 || new_len == 0)
   1563		return (unsigned long) -EINVAL;
   1564
   1565	if (offset_in_page(addr))
   1566		return -EINVAL;
   1567
   1568	if (flags & MREMAP_FIXED && new_addr != addr)
   1569		return (unsigned long) -EINVAL;
   1570
   1571	vma = find_vma_exact(current->mm, addr, old_len);
   1572	if (!vma)
   1573		return (unsigned long) -EINVAL;
   1574
   1575	if (vma->vm_end != vma->vm_start + old_len)
   1576		return (unsigned long) -EFAULT;
   1577
   1578	if (vma->vm_flags & VM_MAYSHARE)
   1579		return (unsigned long) -EPERM;
   1580
   1581	if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
   1582		return (unsigned long) -ENOMEM;
   1583
   1584	/* all checks complete - do it */
   1585	vma->vm_end = vma->vm_start + new_len;
   1586	return vma->vm_start;
   1587}
   1588
   1589SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
   1590		unsigned long, new_len, unsigned long, flags,
   1591		unsigned long, new_addr)
   1592{
   1593	unsigned long ret;
   1594
   1595	mmap_write_lock(current->mm);
   1596	ret = do_mremap(addr, old_len, new_len, flags, new_addr);
   1597	mmap_write_unlock(current->mm);
   1598	return ret;
   1599}
   1600
   1601struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
   1602			 unsigned int foll_flags)
   1603{
   1604	return NULL;
   1605}
   1606
   1607int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
   1608		unsigned long pfn, unsigned long size, pgprot_t prot)
   1609{
   1610	if (addr != (pfn << PAGE_SHIFT))
   1611		return -EINVAL;
   1612
   1613	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
   1614	return 0;
   1615}
   1616EXPORT_SYMBOL(remap_pfn_range);
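
/*
 * Editorial sketch, not part of the original file: a typical driver ->mmap
 * handler exposing a physical region. EXAMPLE_PHYS_BASE and example_mmap are
 * made up; note that on !MMU the helper above only accepts mappings placed
 * directly at the physical address.
 */
#define EXAMPLE_PHYS_BASE	0x40000000UL

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;
	unsigned long pfn = (EXAMPLE_PHYS_BASE >> PAGE_SHIFT) + vma->vm_pgoff;

	return remap_pfn_range(vma, vma->vm_start, pfn, size,
			       vma->vm_page_prot);
}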
   1617
   1618int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
   1619{
   1620	unsigned long pfn = start >> PAGE_SHIFT;
   1621	unsigned long vm_len = vma->vm_end - vma->vm_start;
   1622
   1623	pfn += vma->vm_pgoff;
   1624	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
   1625}
   1626EXPORT_SYMBOL(vm_iomap_memory);
   1627
   1628int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
   1629			unsigned long pgoff)
   1630{
   1631	unsigned int size = vma->vm_end - vma->vm_start;
   1632
   1633	if (!(vma->vm_flags & VM_USERMAP))
   1634		return -EINVAL;
   1635
   1636	vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
   1637	vma->vm_end = vma->vm_start + size;
   1638
   1639	return 0;
   1640}
   1641EXPORT_SYMBOL(remap_vmalloc_range);
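
/*
 * Editorial sketch, not part of the original file: sharing a vmalloc_user()
 * buffer with userspace. vmalloc_user() marks the backing VMA VM_USERMAP,
 * which is exactly what remap_vmalloc_range() checks above. example_buf and
 * example_shared_mmap are made up for illustration.
 */
static void *example_buf;	/* assumed to come from vmalloc_user() */

static int example_shared_mmap(struct file *file, struct vm_area_struct *vma)
{
	return remap_vmalloc_range(vma, example_buf, vma->vm_pgoff);
}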
   1642
   1643vm_fault_t filemap_fault(struct vm_fault *vmf)
   1644{
   1645	BUG();
   1646	return 0;
   1647}
   1648EXPORT_SYMBOL(filemap_fault);
   1649
   1650vm_fault_t filemap_map_pages(struct vm_fault *vmf,
   1651		pgoff_t start_pgoff, pgoff_t end_pgoff)
   1652{
   1653	BUG();
   1654	return 0;
   1655}
   1656EXPORT_SYMBOL(filemap_map_pages);
   1657
   1658int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
   1659		       int len, unsigned int gup_flags)
   1660{
   1661	struct vm_area_struct *vma;
   1662	int write = gup_flags & FOLL_WRITE;
   1663
   1664	if (mmap_read_lock_killable(mm))
   1665		return 0;
   1666
   1667	/* the access must start within one of the target process's mappings */
   1668	vma = find_vma(mm, addr);
   1669	if (vma) {
   1670		/* don't overrun this mapping */
   1671		if (addr + len >= vma->vm_end)
   1672			len = vma->vm_end - addr;
   1673
   1674		/* only read or write mappings where it is permitted */
   1675		if (write && vma->vm_flags & VM_MAYWRITE)
   1676			copy_to_user_page(vma, NULL, addr,
   1677					 (void *) addr, buf, len);
   1678		else if (!write && vma->vm_flags & VM_MAYREAD)
   1679			copy_from_user_page(vma, NULL, addr,
   1680					    buf, (void *) addr, len);
   1681		else
   1682			len = 0;
   1683	} else {
   1684		len = 0;
   1685	}
   1686
   1687	mmap_read_unlock(mm);
   1688
   1689	return len;
   1690}
   1691
   1692/**
   1693 * access_remote_vm - access another process' address space
   1694 * @mm:		the mm_struct of the target address space
   1695 * @addr:	start address to access
   1696 * @buf:	source or destination buffer
   1697 * @len:	number of bytes to transfer
   1698 * @gup_flags:	flags modifying lookup behaviour
   1699 *
   1700 * The caller must hold a reference on @mm.
   1701 */
   1702int access_remote_vm(struct mm_struct *mm, unsigned long addr,
   1703		void *buf, int len, unsigned int gup_flags)
   1704{
   1705	return __access_remote_vm(mm, addr, buf, len, gup_flags);
   1706}
   1707
   1708/*
   1709 * Access another process' address space.
   1710 * - source/target buffer must be kernel space
   1711 */
   1712int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len,
   1713		unsigned int gup_flags)
   1714{
   1715	struct mm_struct *mm;
   1716
   1717	if (addr + len < addr)
   1718		return 0;
   1719
   1720	mm = get_task_mm(tsk);
   1721	if (!mm)
   1722		return 0;
   1723
   1724	len = __access_remote_vm(mm, addr, buf, len, gup_flags);
   1725
   1726	mmput(mm);
   1727	return len;
   1728}
   1729EXPORT_SYMBOL_GPL(access_process_vm);
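
/*
 * Editorial sketch, not part of the original file: peeking at another task's
 * memory the way ptrace-style callers do; the buffer must live in kernel
 * space. "example_peek" is a made-up helper.
 */
static int example_peek(struct task_struct *tsk, unsigned long uaddr)
{
	char buf[16];
	int copied;

	copied = access_process_vm(tsk, uaddr, buf, sizeof(buf), FOLL_FORCE);
	return copied == sizeof(buf) ? 0 : -EIO;
}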
   1730
   1731/**
   1732 * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
   1733 * @inode: The inode to check
   1734 * @size: The current filesize of the inode
   1735 * @newsize: The proposed filesize of the inode
   1736 *
   1737 * Check the shared mappings on an inode on behalf of a shrinking truncate to
   1738 * make sure that any outstanding VMAs aren't broken and then shrink the
   1739 * vm_regions that extend beyond the new size so that do_mmap() doesn't
   1740 * automatically grant mappings that are too large.
   1741 */
   1742int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
   1743				size_t newsize)
   1744{
   1745	struct vm_area_struct *vma;
   1746	struct vm_region *region;
   1747	pgoff_t low, high;
   1748	size_t r_size, r_top;
   1749
   1750	low = newsize >> PAGE_SHIFT;
   1751	high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
   1752
   1753	down_write(&nommu_region_sem);
   1754	i_mmap_lock_read(inode->i_mapping);
   1755
   1756	/* search for VMAs that fall within the dead zone */
   1757	vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
   1758		/* found one - only interested if it's shared out of the page
   1759		 * cache */
   1760		if (vma->vm_flags & VM_SHARED) {
   1761			i_mmap_unlock_read(inode->i_mapping);
   1762			up_write(&nommu_region_sem);
   1763			return -ETXTBSY; /* not quite true, but near enough */
   1764		}
   1765	}
   1766
   1767	/* reduce any regions that overlap the dead zone - if in existence,
   1768	 * these will be pointed to by VMAs that don't overlap the dead zone
   1769	 *
   1770	 * we don't check for any regions that start beyond the EOF as there
   1771	 * shouldn't be any
   1772	 */
   1773	vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) {
   1774		if (!(vma->vm_flags & VM_SHARED))
   1775			continue;
   1776
   1777		region = vma->vm_region;
   1778		r_size = region->vm_top - region->vm_start;
   1779		r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
   1780
   1781		if (r_top > newsize) {
   1782			region->vm_top -= r_top - newsize;
   1783			if (region->vm_end > region->vm_top)
   1784				region->vm_end = region->vm_top;
   1785		}
   1786	}
   1787
   1788	i_mmap_unlock_read(inode->i_mapping);
   1789	up_write(&nommu_region_sem);
   1790	return 0;
   1791}
   1792
   1793/*
   1794 * Initialise sysctl_user_reserve_kbytes.
   1795 *
   1796 * This is intended to prevent a user from starting a single memory hogging
   1797 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
   1798 * mode.
   1799 *
   1800 * The default value is min(3% of free memory, 128MB)
   1801 * 128MB is enough to recover with sshd/login, bash, and top/kill.
   1802 */
   1803static int __meminit init_user_reserve(void)
   1804{
   1805	unsigned long free_kbytes;
   1806
   1807	free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
   1808
   1809	sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
   1810	return 0;
   1811}
   1812subsys_initcall(init_user_reserve);
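
/*
 * Editorial note: free_kbytes / 32 is roughly 3% of free memory, and
 * 1UL << 17 kilobytes is 131072 KiB = 128 MiB, which matches the
 * "min(3% of free memory, 128MB)" default described above.
 */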
   1813
   1814/*
   1815 * Initialise sysctl_admin_reserve_kbytes.
   1816 *
   1817 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
   1818 * to log in and kill a memory hogging process.
   1819 *
   1820 * Systems with more than 256MB will reserve 8MB, enough to recover
   1821 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
   1822 * only reserve 3% of free pages by default.
   1823 */
   1824static int __meminit init_admin_reserve(void)
   1825{
   1826	unsigned long free_kbytes;
   1827
   1828	free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
   1829
   1830	sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
   1831	return 0;
   1832}
   1833subsys_initcall(init_admin_reserve);
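
/*
 * Editorial note: as above, free_kbytes / 32 is roughly 3% of free memory,
 * and 1UL << 13 kilobytes is 8192 KiB = 8 MiB, matching the reserve sizes
 * described in the comment.
 */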